1 {-# LANGUAGE OverloadedStrings #-}
    2 {-# LANGUAGE TupleSections     #-}
    3 
    4 module Text.XmlHtml.HTML.Parse where
    5 
    6 import           Control.Applicative
    7 import           Control.Monad
    8 import           Data.Char
    9 import           Data.List
   10 import           Data.Maybe
   11 import           Text.XmlHtml.Common
   12 import           Text.XmlHtml.HTML.Meta
   13 import           Text.XmlHtml.TextParser
   14 import qualified Text.XmlHtml.XML.Parse as XML
   15 
   16 import qualified Text.Parsec as P
   17 
   18 import qualified Data.Set as S
   19 import qualified Data.Map as M
   20 
   21 import           Data.Text (Text)
   22 import qualified Data.Text as T
   23 
   24 
   25 ------------------------------------------------------------------------------
   26 -- | HTML version of document fragment parsing rule  It differs only in that
   27 -- it parses the HTML version of 'content' and returns an 'HtmlDocument'.
   28 docFragment :: Encoding -> Parser Document
   29 docFragment e = do
   30     (dt, nodes1)      <- prolog
   31     (nodes2, Matched) <- content Nothing
   32     return $ HtmlDocument e dt (nodes1 ++ nodes2)
   33 
   34 
   35 ------------------------------------------------------------------------------
   36 -- Parsing code                                                             --
   37 ------------------------------------------------------------------------------
   38 
   39 {-
   40     The following are the differences between this code and the straight XML
   41     parsing code.
   42 
   43     1. HTML void tags (area, base, etc.) are always treated as empty tags,
   44        regardless of whether they have the empty-tag slash.
   45 
   46     2. HTML raw text tags (script and style) are parsed as straight text
   47        with neither markup nor references, except that they end at the first
   48        syntactically valid matching end tag.
   49 
   50     3. End tags need only match their corresponding start tags in a case
   51        insensitive comparison.  In case they are different, the start tag is
   52        used for the element tag name.
   53 
   54     4. Hexadecimal char references may use &#X...; (capital X)  -- DONE
   55 
   56     5. Attribute names are allowed to consist of any text except for control
   57        characters, space, '\"', '\'', '>', '/', or '='.
   58 
   59     6. Empty attribute syntax is allowed (an attribute not followed by an eq).
   60        In this case, the attribute value is considered to be the empty string.
   61 
   62     7. Quoted attribute syntax is relaxed to allow any character except for
   63        the matching quote.  References are allowed.
   64 
   65     8. Attribute values may be unquoted.  In this case, the attribute value
   66        may not contain space, single or double quotes, '=', '<', '>', or '`',
   67        and may not be the empty string.  It can still contain references.
   68 
   69     9. There are many more character references available.
   70 
   71     10. Only "ambiguous" ampersands are prohibited in character data.  This
   72         means ampersands that parse like character or entity references.
   73 
   74     11. Omittable end tags are inserted automatically.
   75 
   76     12. DOCTYPE tags matched with case insensitive keywords.
   77 -}
   78 
   79 
   80 ------------------------------------------------------------------------------
   81 prolog :: Parser (Maybe DocType, [Node])
   82 prolog = do
   83     _      <- optional XML.xmlDecl
   84     nodes1 <- many XML.misc
   85     rest   <- optional $ do
   86         dt     <- docTypeDecl
   87         nodes2 <- many XML.misc
   88         return (dt, nodes2)
   89     case rest of
   90         Nothing           -> return (Nothing, catMaybes nodes1)
   91         Just (dt, nodes2) -> return (Just dt, catMaybes (nodes1 ++ nodes2))
   92 
   93 
   94 ------------------------------------------------------------------------------
   95 -- | Internal subset is parsed, but ignored since we don't have data types to
   96 -- store it.
   97 docTypeDecl :: Parser DocType
   98 docTypeDecl = do
   99     P.try $ do
  100         _      <- text "<!"
  101         decl   <- XML.name
  102         when (T.toLower decl /= "doctype") $ fail "Expected DOCTYPE"
  103     XML.whiteSpace
  104     tag    <- XML.name
  105     _      <- optional XML.whiteSpace
  106     extid  <- externalID
  107     _      <- optional XML.whiteSpace
  108     intsub <- XML.internalDoctype
  109     _      <- P.char '>'
  110     return (DocType tag extid intsub)
  111 
  112 
  113 ------------------------------------------------------------------------------
  114 externalID :: Parser ExternalID
  115 externalID = do
  116     tok  <- optional $ T.toLower <$> XML.name
  117     case tok of
  118         Just "system" -> systemID
  119         Just "public" -> publicID
  120         Just _        -> fail "Expected SYSTEM or PUBLIC"
  121         Nothing       -> return NoExternalID
  122   where
  123     systemID = do
  124         XML.whiteSpace
  125         System <$> XML.systemLiteral
  126     publicID = do
  127         XML.whiteSpace
  128         pid <- XML.pubIdLiteral
  129         XML.whiteSpace
  130         sid <- XML.systemLiteral
  131         return (Public pid sid)
  132 
  133 
  134 ------------------------------------------------------------------------------
  135 -- | When parsing an element, three things can happen (besides failure):
  136 --
  137 -- (1) The end tag matches the start tag.  This is a Matched.
  138 --
  139 -- (2) The end tag does not match, but the element has an end tag that can be
  140 -- omitted when there is no more content in its parent.  This is an
  141 -- ImplicitLast.  In this case, we need to remember the tag name of the
  142 -- end tag that we did find, so as to match it later.
  143 --
  144 -- (3) A start tag is found such that it implicitly ends the current element.
  145 -- This is an ImplicitNext.  In this case, we parse and remember the
  146 -- entire element that comes next, so that it can be inserted after the
  147 -- element being parsed.
  148 data ElemResult = Matched
  149                 | ImplicitLast Text
  150                 | ImplicitNext Text Text [(Text, Text)] Bool
  151 
  152 
  153 ------------------------------------------------------------------------------
  154 finishElement :: Text -> Text -> [(Text, Text)] -> Bool
  155               -> Parser (Node, ElemResult)
  156 finishElement t tbase a b = do
  157     if b then return (Element t a [], Matched)
  158          else nonEmptyElem
  159   where
  160     nonEmptyElem
  161         | tbase `S.member` rawTextTags = do
  162             c <- XML.cdata  "<"  $ P.try (endTag t)
  163             return (Element t a [c], Matched)
  164         | tbase `S.member` endOmittableLast = tagContents optional
  165         | otherwise = tagContents (fmap Just)
  166     tagContents modifier = do
  167         (c,r1) <- content (Just tbase)
  168         case r1 of
  169             Matched -> do
  170                 r2 <- modifier (endTag t)
  171                 case r2 of
  172                     Nothing -> return (Element t a c, Matched)
  173                     Just rr -> return (Element t a c, rr)
  174             ImplicitLast tag | T.toCaseFold tag == T.toCaseFold t -> do
  175                 return (Element t a c, Matched)
  176             end -> do
  177                 return (Element t a c, end)
  178 
  179 
  180 ------------------------------------------------------------------------------
  181 emptyOrStartTag :: Parser (Text, Text, [(Text, Text)], Bool)
  182 emptyOrStartTag = do
  183     t <- P.try $ P.char '<' *> XML.name
  184     let tbase = T.toLower $ snd $ T.breakOnEnd ":" t
  185     a <- many $ P.try $ do
  186         XML.whiteSpace
  187         attribute
  188     when (hasDups a) $ fail "Duplicate attribute names in element"
  189     _ <- optional XML.whiteSpace
  190     e <- fmap isJust $ optional (P.char '/')
  191     let e' = e || (tbase `S.member` voidTags)
  192     _ <- P.char '>'
  193     return (t, tbase, a, e')
  194   where
  195     hasDups a = length (nub (map fst a)) < length a
  196 
  197 
  198 ------------------------------------------------------------------------------
  199 attrName :: Parser Text
  200 attrName = takeWhile1 isAttrName
  201   where isAttrName c | c `elem` "\0 \"\'>/=" = False
  202                      | isControlChar c       = False
  203                      | otherwise             = True
  204 
  205 
  206 ------------------------------------------------------------------------------
  207 -- | From 8.2.2.3 of the HTML 5 spec, omitting the very high control
  208 -- characters because they are unlikely to occur and I got tired of typing.
  209 isControlChar :: Char -> Bool
  210 isControlChar c | c >= '\x007F' && c <= '\x009F' = True
  211                 | c >= '\xFDD0' && c <= '\xFDEF' = True
  212                 | otherwise                      = False
  213 
  214 
  215 ------------------------------------------------------------------------------
  216 quotedAttrValue :: Parser Text
  217 quotedAttrValue = singleQuoted <|> doubleQuoted
  218   where
  219     singleQuoted = P.char '\'' *> refTill "&\'" <* P.char '\''
  220     doubleQuoted = P.char '\"' *> refTill "&\"" <* P.char '\"'
  221     refTill end = T.concat <$> many
  222         (takeWhile1 (not . (`elem` end)) <|> reference)
  223 
  224 
  225 ------------------------------------------------------------------------------
  226 unquotedAttrValue :: Parser Text
  227 unquotedAttrValue = refTill " \"\'=<>&`"
  228   where
  229     refTill end = T.concat <$> some
  230         (takeWhile1 (not . (`elem` end)) <|> reference)
  231 
  232 
  233 ------------------------------------------------------------------------------
  234 attrValue :: Parser Text
  235 attrValue = quotedAttrValue <|> unquotedAttrValue
  236 
  237 
  238 ------------------------------------------------------------------------------
  239 attribute :: Parser (Text, Text)
  240 attribute = do
  241     n <- attrName
  242     _ <- optional XML.whiteSpace
  243     v <- optional $ do
  244         _ <- P.char '='
  245         _ <- optional XML.whiteSpace
  246         attrValue
  247     return $ maybe (n,"") (n,) v
  248 
  249 
  250 ------------------------------------------------------------------------------
  251 endTag :: Text -> Parser ElemResult
  252 endTag s = do
  253     _ <- text "</"
  254     t <- XML.name
  255     let tbase = T.toLower $ snd $ T.breakOnEnd ":" t
  256     r <- if (T.toCaseFold s == T.toCaseFold t)
  257             then return Matched
  258             else if tbase `S.member` endOmittableLast
  259                 then return (ImplicitLast t)
  260                 else fail $ "mismatched tags: </" ++ T.unpack t ++
  261                             "> found inside <" ++ T.unpack s ++ "> tag"
  262     _ <- optional XML.whiteSpace
  263     _ <- text ">"
  264     return r
  265 
  266 
  267 ------------------------------------------------------------------------------
  268 content :: Maybe Text -> Parser ([Node], ElemResult)
  269 content parent = do
  270     (ns, end) <- readText
  271     return (coalesceText (catMaybes ns), end)
  272   where
  273     readText     = do
  274         s <- optional XML.charData
  275         t <- optional whileMatched
  276         case t of
  277             Nothing      -> return ([s], Matched)
  278             Just (tt, m) -> return (s:tt, m)
  279 
  280     whileMatched = do
  281         (n,end) <- (,Matched) <$> (:[]) <$> Just <$> TextNode <$> reference
  282                <|> (,Matched) <$> (:[]) <$> XML.cdSect
  283                <|> (,Matched) <$> (:[]) <$> XML.processingInstruction
  284                <|> (,Matched) <$> (:[]) <$> XML.comment
  285                <|> doElement
  286         case end of
  287             Matched -> do
  288                 (ns, end') <- readText
  289                 return (n ++ ns, end')
  290             _ -> do
  291                 return (n, end)
  292 
  293     doElement = do
  294         (t,tb, a,b) <- emptyOrStartTag
  295         handle t tb a b
  296 
  297     handle t tb a b = do
  298         if breaksTag tb parent
  299             then return ([Nothing], ImplicitNext t tb a b)
  300             else do
  301                 (n,end) <- finishElement t tb a b
  302                 case end of
  303                     ImplicitNext t' tb' a' b' -> do
  304                         (ns,end') <- handle t' tb' a' b'
  305                         return (Just n : ns, end')
  306                     _ -> return ([Just n], end)
  307 
  308     breaksTag _     Nothing       = False
  309     breaksTag child (Just tag) = case M.lookup tag endOmittableNext of
  310         Nothing -> False
  311         Just s  -> S.member child s
  312 
  313     coalesceText (TextNode s : TextNode t : ns)
  314         = coalesceText (TextNode (T.append s t) : ns)
  315     coalesceText (n:ns)
  316         = n : coalesceText ns
  317     coalesceText []
  318         = []
  319 
  320 
  321 ------------------------------------------------------------------------------
  322 reference :: Parser Text
  323 reference = do
  324     _    <- P.char '&'
  325     r    <- (Left  <$> P.try finishCharRef)
  326         <|> (Right <$> P.try finishEntityRef)
  327         <|> (Left  <$> return '&')
  328     case r of
  329         Left c   -> do
  330             when (not (isValidChar c)) $ fail $
  331                 "Reference is not a valid character"
  332             return (T.singleton c)
  333         Right nm -> case M.lookup nm predefinedRefs of
  334             Nothing -> fail $ "Unknown entity reference: " ++ T.unpack nm
  335             Just t  -> return t
  336 
  337 
  338 ------------------------------------------------------------------------------
  339 finishCharRef :: Parser Char
  340 finishCharRef = P.char '#' *> (hexCharRef <|> decCharRef)
  341   where
  342     decCharRef = do
  343         ds <- some digit
  344         _ <- P.char ';'
  345         let c = chr $ foldl' (\a b -> 10 * a + b) 0 ds
  346         return c
  347       where
  348         digit = do
  349             d <- P.satisfy (\c -> c >= '0' && c <= '9')
  350             return (ord d - ord '0')
  351     hexCharRef = do
  352         _ <- P.char 'x' <|> P.char 'X'
  353         ds <- some digit
  354         _ <- P.char ';'
  355         let c = chr $ foldl' (\a b -> 16 * a + b) 0 ds
  356         return c
  357       where
  358         digit = num <|> upper <|> lower
  359         num = do
  360             d <- P.satisfy (\c -> c >= '0' && c <= '9')
  361             return (ord d - ord '0')
  362         upper = do
  363             d <- P.satisfy (\c -> c >= 'A' && c <= 'F')
  364             return (10 + ord d - ord 'A')
  365         lower = do
  366             d <- P.satisfy (\c -> c >= 'a' && c <= 'f')
  367             return (10 + ord d - ord 'a')
  368 
  369 
  370 ------------------------------------------------------------------------------
  371 finishEntityRef :: Parser Text
  372 finishEntityRef = XML.name <* P.char ';'
  373