1 {-# LANGUAGE OverloadedStrings #-} 2 {-# LANGUAGE TupleSections #-} 3 4 module Text.XmlHtml.HTML.Parse where 5 6 import Control.Applicative 7 import Control.Monad 8 import Data.Char 9 import Data.List 10 import Data.Maybe 11 import Text.XmlHtml.Common 12 import Text.XmlHtml.HTML.Meta 13 import Text.XmlHtml.TextParser 14 import qualified Text.XmlHtml.XML.Parse as XML 15 16 import qualified Text.Parsec as P 17 18 import qualified Data.Set as S 19 import qualified Data.Map as M 20 21 import Data.Text (Text) 22 import qualified Data.Text as T 23 24 25 ------------------------------------------------------------------------------ 26 -- | HTML version of document fragment parsing rule It differs only in that 27 -- it parses the HTML version of 'content' and returns an 'HtmlDocument'. 28 docFragment :: Encoding -> Parser Document 29 docFragment e = do 30 (dt, nodes1) <- prolog 31 (nodes2, Matched) <- content Nothing 32 return $ HtmlDocument e dt (nodes1 ++ nodes2) 33 34 35 ------------------------------------------------------------------------------ 36 -- Parsing code -- 37 ------------------------------------------------------------------------------ 38 39 {- 40 The following are the differences between this code and the straight XML 41 parsing code. 42 43 1. HTML void tags (area, base, etc.) are always treated as empty tags, 44 regardless of whether they have the empty-tag slash. 45 46 2. HTML raw text tags (script and style) are parsed as straight text 47 with neither markup nor references, except that they end at the first 48 syntactically valid matching end tag. 49 50 3. End tags need only match their corresponding start tags in a case 51 insensitive comparison. In case they are different, the start tag is 52 used for the element tag name. 53 54 4. Hexadecimal char references may use &#X...; (capital X) -- DONE 55 56 5. Attribute names are allowed to consist of any text except for control 57 characters, space, '\"', '\'', '>', '/', or '='. 58 59 6. Empty attribute syntax is allowed (an attribute not followed by an eq). 60 In this case, the attribute value is considered to be the empty string. 61 62 7. Quoted attribute syntax is relaxed to allow any character except for 63 the matching quote. References are allowed. 64 65 8. Attribute values may be unquoted. In this case, the attribute value 66 may not contain space, single or double quotes, '=', '<', '>', or '`', 67 and may not be the empty string. It can still contain references. 68 69 9. There are many more character references available. 70 71 10. Only "ambiguous" ampersands are prohibited in character data. This 72 means ampersands that parse like character or entity references. 73 74 11. Omittable end tags are inserted automatically. 75 76 12. DOCTYPE tags matched with case insensitive keywords. 77 -} 78 79 80 ------------------------------------------------------------------------------ 81 prolog :: Parser (Maybe DocType, [Node]) 82 prolog = do 83 _ <- optional XML.xmlDecl 84 nodes1 <- many XML.misc 85 rest <- optional $ do 86 dt <- docTypeDecl 87 nodes2 <- many XML.misc 88 return (dt, nodes2) 89 case rest of 90 Nothing -> return (Nothing, catMaybes nodes1) 91 Just (dt, nodes2) -> return (Just dt, catMaybes (nodes1 ++ nodes2)) 92 93 94 ------------------------------------------------------------------------------ 95 -- | Internal subset is parsed, but ignored since we don't have data types to 96 -- store it. 97 docTypeDecl :: Parser DocType 98 docTypeDecl = do 99 P.try $ do 100 _ <- text "<!" 101 decl <- XML.name 102 when (T.toLower decl /= "doctype") $ fail "Expected DOCTYPE" 103 XML.whiteSpace 104 tag <- XML.name 105 _ <- optional XML.whiteSpace 106 extid <- externalID 107 _ <- optional XML.whiteSpace 108 intsub <- XML.internalDoctype 109 _ <- P.char '>' 110 return (DocType tag extid intsub) 111 112 113 ------------------------------------------------------------------------------ 114 externalID :: Parser ExternalID 115 externalID = do 116 tok <- optional $ T.toLower <$> XML.name 117 case tok of 118 Just "system" -> systemID 119 Just "public" -> publicID 120 Just _ -> fail "Expected SYSTEM or PUBLIC" 121 Nothing -> return NoExternalID 122 where 123 systemID = do 124 XML.whiteSpace 125 System <$> XML.systemLiteral 126 publicID = do 127 XML.whiteSpace 128 pid <- XML.pubIdLiteral 129 XML.whiteSpace 130 sid <- XML.systemLiteral 131 return (Public pid sid) 132 133 134 ------------------------------------------------------------------------------ 135 -- | When parsing an element, three things can happen (besides failure): 136 -- 137 -- (1) The end tag matches the start tag. This is a Matched. 138 -- 139 -- (2) The end tag does not match, but the element has an end tag that can be 140 -- omitted when there is no more content in its parent. This is an 141 -- ImplicitLast. In this case, we need to remember the tag name of the 142 -- end tag that we did find, so as to match it later. 143 -- 144 -- (3) A start tag is found such that it implicitly ends the current element. 145 -- This is an ImplicitNext. In this case, we parse and remember the 146 -- entire element that comes next, so that it can be inserted after the 147 -- element being parsed. 148 data ElemResult = Matched 149 | ImplicitLast Text 150 | ImplicitNext Text Text [(Text, Text)] Bool 151 152 153 ------------------------------------------------------------------------------ 154 finishElement :: Text -> Text -> [(Text, Text)] -> Bool 155 -> Parser (Node, ElemResult) 156 finishElement t tbase a b = do 157 if b then return (Element t a [], Matched) 158 else nonEmptyElem 159 where 160 nonEmptyElem 161 | tbase `S.member` rawTextTags = do 162 c <- XML.cdata "<" $ P.try (endTag t) 163 return (Element t a [c], Matched) 164 | tbase `S.member` endOmittableLast = tagContents optional 165 | otherwise = tagContents (fmap Just) 166 tagContents modifier = do 167 (c,r1) <- content (Just tbase) 168 case r1 of 169 Matched -> do 170 r2 <- modifier (endTag t) 171 case r2 of 172 Nothing -> return (Element t a c, Matched) 173 Just rr -> return (Element t a c, rr) 174 ImplicitLast tag | T.toCaseFold tag == T.toCaseFold t -> do 175 return (Element t a c, Matched) 176 end -> do 177 return (Element t a c, end) 178 179 180 ------------------------------------------------------------------------------ 181 emptyOrStartTag :: Parser (Text, Text, [(Text, Text)], Bool) 182 emptyOrStartTag = do 183 t <- P.try $ P.char '<' *> XML.name 184 let tbase = T.toLower $ snd $ T.breakOnEnd ":" t 185 a <- many $ P.try $ do 186 XML.whiteSpace 187 attribute 188 when (hasDups a) $ fail "Duplicate attribute names in element" 189 _ <- optional XML.whiteSpace 190 e <- fmap isJust $ optional (P.char '/') 191 let e' = e || (tbase `S.member` voidTags) 192 _ <- P.char '>' 193 return (t, tbase, a, e') 194 where 195 hasDups a = length (nub (map fst a)) < length a 196 197 198 ------------------------------------------------------------------------------ 199 attrName :: Parser Text 200 attrName = takeWhile1 isAttrName 201 where isAttrName c | c `elem` "\0 \"\'>/=" = False 202 | isControlChar c = False 203 | otherwise = True 204 205 206 ------------------------------------------------------------------------------ 207 -- | From 8.2.2.3 of the HTML 5 spec, omitting the very high control 208 -- characters because they are unlikely to occur and I got tired of typing. 209 isControlChar :: Char -> Bool 210 isControlChar c | c >= '\x007F' && c <= '\x009F' = True 211 | c >= '\xFDD0' && c <= '\xFDEF' = True 212 | otherwise = False 213 214 215 ------------------------------------------------------------------------------ 216 quotedAttrValue :: Parser Text 217 quotedAttrValue = singleQuoted <|> doubleQuoted 218 where 219 singleQuoted = P.char '\'' *> refTill "&\'" <* P.char '\'' 220 doubleQuoted = P.char '\"' *> refTill "&\"" <* P.char '\"' 221 refTill end = T.concat <$> many 222 (takeWhile1 (not . (`elem` end)) <|> reference) 223 224 225 ------------------------------------------------------------------------------ 226 unquotedAttrValue :: Parser Text 227 unquotedAttrValue = refTill " \"\'=<>&`" 228 where 229 refTill end = T.concat <$> some 230 (takeWhile1 (not . (`elem` end)) <|> reference) 231 232 233 ------------------------------------------------------------------------------ 234 attrValue :: Parser Text 235 attrValue = quotedAttrValue <|> unquotedAttrValue 236 237 238 ------------------------------------------------------------------------------ 239 attribute :: Parser (Text, Text) 240 attribute = do 241 n <- attrName 242 _ <- optional XML.whiteSpace 243 v <- optional $ do 244 _ <- P.char '=' 245 _ <- optional XML.whiteSpace 246 attrValue 247 return $ maybe (n,"") (n,) v 248 249 250 ------------------------------------------------------------------------------ 251 endTag :: Text -> Parser ElemResult 252 endTag s = do 253 _ <- text "</" 254 t <- XML.name 255 let tbase = T.toLower $ snd $ T.breakOnEnd ":" t 256 r <- if (T.toCaseFold s == T.toCaseFold t) 257 then return Matched 258 else if tbase `S.member` endOmittableLast 259 then return (ImplicitLast t) 260 else fail $ "mismatched tags: </" ++ T.unpack t ++ 261 "> found inside <" ++ T.unpack s ++ "> tag" 262 _ <- optional XML.whiteSpace 263 _ <- text ">" 264 return r 265 266 267 ------------------------------------------------------------------------------ 268 content :: Maybe Text -> Parser ([Node], ElemResult) 269 content parent = do 270 (ns, end) <- readText 271 return (coalesceText (catMaybes ns), end) 272 where 273 readText = do 274 s <- optional XML.charData 275 t <- optional whileMatched 276 case t of 277 Nothing -> return ([s], Matched) 278 Just (tt, m) -> return (s:tt, m) 279 280 whileMatched = do 281 (n,end) <- (,Matched) <$> (:[]) <$> Just <$> TextNode <$> reference 282 <|> (,Matched) <$> (:[]) <$> XML.cdSect 283 <|> (,Matched) <$> (:[]) <$> XML.processingInstruction 284 <|> (,Matched) <$> (:[]) <$> XML.comment 285 <|> doElement 286 case end of 287 Matched -> do 288 (ns, end') <- readText 289 return (n ++ ns, end') 290 _ -> do 291 return (n, end) 292 293 doElement = do 294 (t,tb, a,b) <- emptyOrStartTag 295 handle t tb a b 296 297 handle t tb a b = do 298 if breaksTag tb parent 299 then return ([Nothing], ImplicitNext t tb a b) 300 else do 301 (n,end) <- finishElement t tb a b 302 case end of 303 ImplicitNext t' tb' a' b' -> do 304 (ns,end') <- handle t' tb' a' b' 305 return (Just n : ns, end') 306 _ -> return ([Just n], end) 307 308 breaksTag _ Nothing = False 309 breaksTag child (Just tag) = case M.lookup tag endOmittableNext of 310 Nothing -> False 311 Just s -> S.member child s 312 313 coalesceText (TextNode s : TextNode t : ns) 314 = coalesceText (TextNode (T.append s t) : ns) 315 coalesceText (n:ns) 316 = n : coalesceText ns 317 coalesceText [] 318 = [] 319 320 321 ------------------------------------------------------------------------------ 322 reference :: Parser Text 323 reference = do 324 _ <- P.char '&' 325 r <- (Left <$> P.try finishCharRef) 326 <|> (Right <$> P.try finishEntityRef) 327 <|> (Left <$> return '&') 328 case r of 329 Left c -> do 330 when (not (isValidChar c)) $ fail $ 331 "Reference is not a valid character" 332 return (T.singleton c) 333 Right nm -> case M.lookup nm predefinedRefs of 334 Nothing -> fail $ "Unknown entity reference: " ++ T.unpack nm 335 Just t -> return t 336 337 338 ------------------------------------------------------------------------------ 339 finishCharRef :: Parser Char 340 finishCharRef = P.char '#' *> (hexCharRef <|> decCharRef) 341 where 342 decCharRef = do 343 ds <- some digit 344 _ <- P.char ';' 345 let c = chr $ foldl' (\a b -> 10 * a + b) 0 ds 346 return c 347 where 348 digit = do 349 d <- P.satisfy (\c -> c >= '0' && c <= '9') 350 return (ord d - ord '0') 351 hexCharRef = do 352 _ <- P.char 'x' <|> P.char 'X' 353 ds <- some digit 354 _ <- P.char ';' 355 let c = chr $ foldl' (\a b -> 16 * a + b) 0 ds 356 return c 357 where 358 digit = num <|> upper <|> lower 359 num = do 360 d <- P.satisfy (\c -> c >= '0' && c <= '9') 361 return (ord d - ord '0') 362 upper = do 363 d <- P.satisfy (\c -> c >= 'A' && c <= 'F') 364 return (10 + ord d - ord 'A') 365 lower = do 366 d <- P.satisfy (\c -> c >= 'a' && c <= 'f') 367 return (10 + ord d - ord 'a') 368 369 370 ------------------------------------------------------------------------------ 371 finishEntityRef :: Parser Text 372 finishEntityRef = XML.name <* P.char ';' 373