1 {-# LANGUAGE OverloadedStrings #-} 2 {-# LANGUAGE FlexibleInstances #-} 3 {-# LANGUAGE MultiParamTypeClasses #-} 4 5 module Text.XmlHtml.TextParser where 6 7 import Control.Applicative 8 import Data.Char 9 import Data.Maybe 10 import Text.XmlHtml.Common 11 12 import Data.Text (Text) 13 import qualified Data.Text as T 14 15 import Text.Parsec (Parsec) 16 import qualified Text.Parsec as P 17 18 import Data.ByteString (ByteString) 19 import qualified Data.ByteString as B 20 21 22 ------------------------------------------------------------------------------ 23 -- | Get an initial guess at document encoding from the byte order mark. If 24 -- the mark doesn't exist, guess UTF-8. Otherwise, guess according to the 25 -- mark. 26 guessEncoding :: ByteString -> (Encoding, ByteString) 27 guessEncoding b 28 | B.take 3 b == B.pack [ 0xEF, 0xBB, 0xBF ] = (UTF8, B.drop 3 b) 29 | B.take 2 b == B.pack [ 0xFE, 0xFF ] = (UTF16BE, B.drop 2 b) 30 | B.take 2 b == B.pack [ 0xFF, 0xFE ] = (UTF16LE, B.drop 2 b) 31 | otherwise = (UTF8, b) 32 33 34 ------------------------------------------------------------------------------ 35 -- | Specialized type for the parsers we use here. 36 type Parser = Parsec Text () 37 38 39 ------------------------------------------------------------------------------ 40 -- An (orphaned) instance for parsing Text with Parsec. 41 instance (Monad m) => P.Stream T.Text m Char where 42 uncons = return . T.uncons 43 44 45 ------------------------------------------------------------------------------ 46 parse :: (Encoding -> Parser a) -> String -> ByteString -> Either String a 47 parse p src b = let (e, b') = guessEncoding b 48 t = decoder e b' 49 bad = T.find (not . isValidChar) t 50 in if isNothing bad 51 then parseText (p e <* P.eof) src t 52 else Left $ "Document contains invalid character:" 53 ++ " \\" ++ show (ord (fromJust bad)) 54 55 56 ------------------------------------------------------------------------------ 57 -- | Checks if a document contains invalid characters. 58 -- 59 isValidChar :: Char -> Bool 60 isValidChar c | c < '\x9' = False 61 | c > '\xA' && c < '\xD' = False 62 | c > '\xD' && c < '\x20' = False 63 | c > '\xD7FF' && c < '\xE000' = False 64 | c > '\xFFFD' && c < '\x10000' = False 65 | otherwise = True 66 67 68 ------------------------------------------------------------------------------ 69 -- | Parses a 'Text' value and gives back the result. The parser is expected 70 -- to match the entire string. 71 parseText :: Parser a -- ^ The parser to match 72 -> String -- ^ Name of the source file (can be @\"\"@) 73 -> Text -- ^ Text to parse 74 -> Either String a -- Either an error message or the result 75 parseText p src t = inLeft show (P.parse p src t) 76 where inLeft :: (a -> b) -> Either a c -> Either b c 77 inLeft f (Left x) = Left (f x) 78 inLeft _ (Right x) = Right x 79 80 81 ------------------------------------------------------------------------------ 82 -- | Consume input as long as the predicate returns 'True', and return the 83 -- consumed input. This parser does not fail. If it matches no input, it 84 -- will return an empty string. 85 takeWhile0 :: (Char -> Bool) -> Parser Text 86 takeWhile0 p = fmap T.pack $ P.many $ P.satisfy p 87 88 89 ------------------------------------------------------------------------------ 90 -- | Consume input as long as the predicate returns 'True', and return the 91 -- consumed input. This parser requires the predicate to succeed on at least 92 -- one character of input. It will fail if the first character fails the 93 -- predicate. 94 takeWhile1 :: (Char -> Bool) -> Parser Text 95 takeWhile1 p = fmap T.pack $ P.many1 $ P.satisfy p 96 97 98 ------------------------------------------------------------------------------ 99 -- | The equivalent of Parsec's string combinator, but for text. If there is 100 -- not a complete match, then no input is consumed. This matches the behavior 101 -- of @string@ from the attoparsec-text package. 102 text :: Text -> Parser Text 103 text t = P.try $ P.string (T.unpack t) *> return t 104 105 106 ------------------------------------------------------------------------------ 107 -- | Represents the state of a text scanner, for use with the 'scanText' 108 -- parser combinator. 109 data ScanState = ScanNext (Char -> ScanState) 110 | ScanFinish 111 | ScanFail String 112 113 114 ------------------------------------------------------------------------------ 115 -- | Scans text and progresses through a DFA, collecting the complete matching 116 -- text as it goes. 117 scanText :: (Char -> ScanState) -> Parser String 118 scanText f = do 119 P.try $ do 120 c <- P.anyChar 121 case f c of 122 ScanNext f' -> (c:) `fmap` scanText f' 123 ScanFinish -> return [c] 124 ScanFail err -> fail err 125