1 {-# LANGUAGE OverloadedStrings     #-}
    2 {-# LANGUAGE FlexibleInstances     #-}
    3 {-# LANGUAGE MultiParamTypeClasses #-}
    4 
    5 module Text.XmlHtml.TextParser where
    6 
    7 import           Control.Applicative
    8 import           Data.Char
    9 import           Data.Maybe
   10 import           Text.XmlHtml.Common
   11 
   12 import           Data.Text (Text)
   13 import qualified Data.Text as T
   14 
   15 import           Text.Parsec (Parsec)
   16 import qualified Text.Parsec as P
   17 
   18 import           Data.ByteString (ByteString)
   19 import qualified Data.ByteString as B
   20 
   21 
   22 ------------------------------------------------------------------------------
   23 -- | Get an initial guess at document encoding from the byte order mark.  If
   24 -- the mark doesn't exist, guess UTF-8.  Otherwise, guess according to the
   25 -- mark.
   26 guessEncoding :: ByteString -> (Encoding, ByteString)
   27 guessEncoding b
   28     | B.take 3 b == B.pack [ 0xEF, 0xBB, 0xBF ] = (UTF8,    B.drop 3 b)
   29     | B.take 2 b == B.pack [ 0xFE, 0xFF ]       = (UTF16BE, B.drop 2 b)
   30     | B.take 2 b == B.pack [ 0xFF, 0xFE ]       = (UTF16LE, B.drop 2 b)
   31     | otherwise                                 = (UTF8,    b)
   32 
   33 
   34 ------------------------------------------------------------------------------
   35 -- | Specialized type for the parsers we use here.
   36 type Parser = Parsec Text ()
   37 
   38 
   39 ------------------------------------------------------------------------------
   40 -- An (orphaned) instance for parsing Text with Parsec.
   41 instance (Monad m) => P.Stream T.Text m Char where
   42     uncons = return . T.uncons
   43 
   44 
   45 ------------------------------------------------------------------------------
   46 parse :: (Encoding -> Parser a) -> String -> ByteString -> Either String a
   47 parse p src b = let (e, b') = guessEncoding b
   48                     t       = decoder e b'
   49                     bad     = T.find (not . isValidChar) t
   50                 in  if isNothing bad
   51                         then parseText (p e <* P.eof) src t
   52                         else Left $ "Document contains invalid character:"
   53                                  ++ " \\" ++ show (ord (fromJust bad))
   54 
   55 
   56 ------------------------------------------------------------------------------
   57 -- | Checks if a document contains invalid characters.
   58 --
   59 isValidChar :: Char -> Bool
   60 isValidChar c | c < '\x9'                     = False
   61               | c > '\xA'    && c < '\xD'     = False
   62               | c > '\xD'    && c < '\x20'    = False
   63               | c > '\xD7FF' && c < '\xE000'  = False
   64               | c > '\xFFFD' && c < '\x10000' = False
   65               | otherwise                     = True
   66 
   67 
   68 ------------------------------------------------------------------------------
   69 -- | Parses a 'Text' value and gives back the result.  The parser is expected
   70 -- to match the entire string.
   71 parseText :: Parser a         -- ^ The parser to match
   72           -> String           -- ^ Name of the source file (can be @\"\"@)
   73           -> Text             -- ^ Text to parse
   74           -> Either String a  -- Either an error message or the result
   75 parseText p src t = inLeft show (P.parse p src t)
   76   where inLeft :: (a -> b) -> Either a c -> Either b c
   77         inLeft f (Left x)  = Left (f x)
   78         inLeft _ (Right x) = Right x
   79 
   80 
   81 ------------------------------------------------------------------------------
   82 -- | Consume input as long as the predicate returns 'True', and return the
   83 -- consumed input.  This parser does not fail.  If it matches no input, it
   84 -- will return an empty string.
   85 takeWhile0 :: (Char -> Bool) -> Parser Text
   86 takeWhile0 p = fmap T.pack $ P.many $ P.satisfy p
   87 
   88 
   89 ------------------------------------------------------------------------------
   90 -- | Consume input as long as the predicate returns 'True', and return the
   91 -- consumed input.  This parser requires the predicate to succeed on at least
   92 -- one character of input.  It will fail if the first character fails the
   93 -- predicate.
   94 takeWhile1 :: (Char -> Bool) -> Parser Text
   95 takeWhile1 p = fmap T.pack $ P.many1 $ P.satisfy p
   96 
   97 
   98 ------------------------------------------------------------------------------
   99 -- | The equivalent of Parsec's string combinator, but for text.  If there is
  100 -- not a complete match, then no input is consumed.  This matches the behavior
  101 -- of @string@ from the attoparsec-text package.
  102 text :: Text -> Parser Text
  103 text t = P.try $ P.string (T.unpack t) *> return t
  104 
  105 
  106 ------------------------------------------------------------------------------
  107 -- | Represents the state of a text scanner, for use with the 'scanText'
  108 -- parser combinator.
  109 data ScanState = ScanNext (Char -> ScanState)
  110                | ScanFinish
  111                | ScanFail String
  112 
  113 
  114 ------------------------------------------------------------------------------
  115 -- | Scans text and progresses through a DFA, collecting the complete matching
  116 -- text as it goes.
  117 scanText :: (Char -> ScanState) -> Parser String
  118 scanText f = do
  119     P.try $ do
  120         c <- P.anyChar
  121         case f c of
  122             ScanNext f'  -> (c:) `fmap` scanText f'
  123             ScanFinish   -> return [c]
  124             ScanFail err -> fail err
  125