1 {-# LANGUAGE OverloadedStrings #-} 2 3 module Text.XmlHtml.XML.Parse where 4 5 import Control.Applicative 6 import Control.Monad 7 import Data.Char 8 import Data.List 9 import Data.Maybe 10 import Text.XmlHtml.Common 11 import Text.XmlHtml.TextParser 12 13 import qualified Text.Parsec as P 14 15 import Data.Map (Map) 16 import qualified Data.Map as M 17 18 import Data.Text (Text) 19 import qualified Data.Text as T 20 21 22 ------------------------------------------------------------------------------ 23 -- | This is my best guess as to the best rule for handling document fragments 24 -- for processing. It is essentially modeled after document, but allowing 25 -- multiple nodes. 26 docFragment :: Encoding -> Parser Document 27 docFragment e = do 28 (dt, nodes1) <- prolog 29 nodes2 <- content 30 return $ XmlDocument e dt (nodes1 ++ nodes2) 31 32 33 ------------------------------------------------------------------------------ 34 -- Everything from here forward is translated from the XML specification. -- 35 ------------------------------------------------------------------------------ 36 37 {- 38 Map from numbered productions in the XML specification to symbols here: 39 40 PROD SPEC NAME PARSER NAME NOTES 41 -----|------------------|---------------------|------- 42 [1] document document 43 [2] Char {2} 44 [3] S whiteSpace 45 [4] NameStartChar isNameStartChar {1} 46 [4a] NameChar isNameChar {1} 47 [5] Name name 48 [6] Names names 49 [7] Nmtoken nmtoken 50 [8] Nmtokens nmtokens 51 [9] EntityValue {4} 52 [10] AttValue attrValue 53 [11] SystemLiteral systemLiteral 54 [12] PubidLiteral pubIdLiteral 55 [13] PubidChar isPubIdChar {1} 56 [14] CharData charData 57 [15] Comment comment 58 [16] PI processingInstruction 59 [17] PITarget piTarget 60 [18] CDSect cdSect 61 [19] CDStart cdSect {3} 62 [20] CData cdSect {3} 63 [21] CDEnd cdSect {3} 64 [22] prolog prolog 65 [23] XMLDecl xmlDecl 66 [24] VersionInfo versionInfo 67 [25] Eq eq 68 [26] VersionNum versionInfo {3} 69 [27] Misc misc 70 [28] doctypedecl docTypeDecl 71 [28a] DeclSep {4} 72 [28b] intSubset {4} 73 [29] markupdecl {4} 74 [30] extSubset {4} 75 [31] extSubsetDecl {4} 76 [32] SDDecl sdDecl 77 [39] element element 78 [40] STag emptyOrStartTag 79 [41] Attribute attribute 80 [42] ETag endTag 81 [43] content content 82 [44] EmptyElemTag emptyOrStartTag 83 [45] elementDecl {4} 84 [46] contentSpec {4} 85 [47] children {4} 86 [48] cp {4} 87 [49] choice {4} 88 [50] seq {4} 89 [51] Mixed {4} 90 [52] AttlistDecl {4} 91 [53] AttDef {4} 92 [54] AttType {4} 93 [55] StringType {4} 94 [56] TokenizedType {4} 95 [57] EnumeratedType {4} 96 [58] NotationType {4} 97 [59] Enumeration {4} 98 [60] DefaultDecl {4} 99 [61] conditionalSect {4} 100 [62] includeSect {4} 101 [63] ignoreSect {4} 102 [64] ignoreSectContents {4} 103 [65] Ignore {4} 104 [66] CharRef charRef 105 [67] Reference reference 106 [68] EntityRef entityRef 107 [69] PEReference {4} 108 [70] EntityDecl {4} 109 [71] GEDecl {4} 110 [72] PEDecl {4} 111 [73] EntityDef {4} 112 [74] PEDef {4} 113 [75] ExternalID externalID 114 [76] NDataDecl {4} 115 [77] TextDecl textDecl 116 [78] extParsedEnt extParsedEnt 117 [80] EncodingDecl encodingDecl 118 [81] EncName encodingDecl {3} 119 [82] NotationDecl {4} 120 [83] PublicID {4} 121 [84] Letter {5} 122 [85] BaseChar {5} 123 [86] Ideographic {5} 124 [87] CombiningChar {5} 125 [88] Digit {5} 126 [89] Extender {5} 127 128 Notes: 129 {1} - These productions match single characters, and so are 130 implemented as predicates instead of parsers. 131 {3} - Denotes a production which is not exposed as a top-level symbol 132 because it is trivial and included in another definition. 133 {4} - This module does not contain a parser for the DTD subsets, so 134 grammar that occurs only in DTD subsets is not defined. 135 {5} - These are orphaned productions for character classes. 136 -} 137 138 139 ------------------------------------------------------------------------------ 140 whiteSpace :: Parser () 141 whiteSpace = some (P.satisfy (`elem` " \t\r\n")) *> return () 142 143 144 ------------------------------------------------------------------------------ 145 isNameStartChar :: Char -> Bool 146 isNameStartChar c | c == ':' = True 147 | c == '_' = True 148 | c >= 'a' && c <= 'z' = True 149 | c >= 'A' && c <= 'Z' = True 150 | c >= '\xc0' && c <= '\xd6' = True 151 | c >= '\xd8' && c <= '\xf6' = True 152 | c >= '\xf8' && c <= '\x2ff' = True 153 | c >= '\x370' && c <= '\x37d' = True 154 | c >= '\x37f' && c <= '\x1fff' = True 155 | c >= '\x200c' && c <= '\x200d' = True 156 | c >= '\x2070' && c <= '\x218f' = True 157 | c >= '\x2c00' && c <= '\x2fef' = True 158 | c >= '\x3001' && c <= '\xd7ff' = True 159 | c >= '\xf900' && c <= '\xfdcf' = True 160 | c >= '\xfdf0' && c <= '\xfffd' = True 161 | c >= '\x10000' && c <= '\xeffff' = True 162 | otherwise = False 163 164 165 ------------------------------------------------------------------------------ 166 isNameChar :: Char -> Bool 167 isNameChar c | isNameStartChar c = True 168 | c == '-' = True 169 | c == '.' = True 170 | c == '\xb7' = True 171 | c >= '0' && c <= '9' = True 172 | c >= '\x300' && c <= '\x36f' = True 173 | c >= '\x203f' && c <= '\x2040' = True 174 | otherwise = False 175 176 177 ------------------------------------------------------------------------------ 178 name :: Parser Text 179 name = do 180 c <- P.satisfy isNameStartChar 181 r <- takeWhile0 isNameChar 182 return $ T.cons c r 183 184 185 ------------------------------------------------------------------------------ 186 attrValue :: Parser Text 187 attrValue = fmap T.concat (singleQuoted <|> doubleQuoted) 188 where 189 singleQuoted = P.char '\'' *> refTill "<&\'" <* P.char '\'' 190 doubleQuoted = P.char '\"' *> refTill "<&\"" <* P.char '\"' 191 refTill end = many (takeWhile1 (not . (`elem` end)) <|> reference) 192 193 194 ------------------------------------------------------------------------------ 195 systemLiteral :: Parser Text 196 systemLiteral = singleQuoted <|> doubleQuoted 197 where 198 singleQuoted = do 199 _ <- P.char '\'' 200 x <- takeWhile0 (not . (== '\'')) 201 _ <- P.char '\'' 202 return x 203 doubleQuoted = do 204 _ <- P.char '\"' 205 x <- takeWhile0 (not . (== '\"')) 206 _ <- P.char '\"' 207 return x 208 209 210 ------------------------------------------------------------------------------ 211 pubIdLiteral :: Parser Text 212 pubIdLiteral = singleQuoted <|> doubleQuoted 213 where 214 singleQuoted = do 215 _ <- P.char '\'' 216 x <- takeWhile0 (\c -> isPubIdChar c && c /= '\'') 217 _ <- P.char '\'' 218 return x 219 doubleQuoted = do 220 _ <- P.char '\"' 221 x <- takeWhile0 isPubIdChar 222 _ <- P.char '\"' 223 return x 224 225 226 ------------------------------------------------------------------------------ 227 isPubIdChar :: Char -> Bool 228 isPubIdChar c | c >= 'a' && c <= 'z' = True 229 | c >= 'A' && c <= 'Z' = True 230 | c >= '0' && c <= '9' = True 231 | c `elem` " \r\n-\'()+,./:=?;!*#@$_%" = True 232 | otherwise = False 233 234 235 ------------------------------------------------------------------------------ 236 -- | The requirement to not contain "]]>" is for SGML compatibility. We 237 -- deliberately choose to not enforce it. This makes the parser accept 238 -- strictly more documents than a standards-compliant parser. 239 charData :: Parser Node 240 charData = TextNode <$> takeWhile1 (not . (`elem` "<&")) 241 242 243 ------------------------------------------------------------------------------ 244 comment :: Parser (Maybe Node) 245 comment = text "<!--" *> (Just <$> Comment <$> commentText) <* text "-->" 246 where 247 commentText = fmap T.concat $ many $ 248 nonDash <|> P.try (T.cons <$> P.char '-' <*> nonDash) 249 nonDash = takeWhile1 (not . (== '-')) 250 251 252 ------------------------------------------------------------------------------ 253 -- | Always returns Nothing since there's no representation for a PI in the 254 -- document tree. 255 processingInstruction :: Parser (Maybe Node) 256 processingInstruction = do 257 _ <- text "<?" 258 _ <- piTarget 259 _ <- emptyEnd <|> contentEnd 260 return Nothing 261 where 262 emptyEnd = P.try (P.string "?>") 263 contentEnd = P.try $ do 264 _ <- whiteSpace 265 P.manyTill P.anyChar (P.try $ text "?>") 266 267 ------------------------------------------------------------------------------ 268 piTarget :: Parser () 269 piTarget = do 270 n <- name 271 when (T.map toLower n == "xml") $ fail "xml declaration can't occur here" 272 273 274 ------------------------------------------------------------------------------ 275 cdata :: [Char] -> Parser a -> Parser Node 276 cdata cs end = TextNode <$> T.concat <$> P.manyTill part end 277 where part = takeWhile1 (not . (`elem` cs)) 278 <|> T.singleton <$> P.anyChar 279 280 281 ------------------------------------------------------------------------------ 282 cdSect :: Parser (Maybe Node) 283 cdSect = Just <$> do 284 _ <- text "<![CDATA[" 285 cdata "]" (text "]]>") 286 287 288 ------------------------------------------------------------------------------ 289 prolog :: Parser (Maybe DocType, [Node]) 290 prolog = do 291 _ <- optional xmlDecl 292 nodes1 <- many misc 293 rest <- optional $ do 294 dt <- docTypeDecl 295 nodes2 <- many misc 296 return (dt, nodes2) 297 case rest of 298 Nothing -> return (Nothing, catMaybes nodes1) 299 Just (dt, nodes2) -> return (Just dt, catMaybes (nodes1 ++ nodes2)) 300 301 302 ------------------------------------------------------------------------------ 303 -- | Return value is the encoding, if present. 304 xmlDecl :: Parser (Maybe Text) 305 xmlDecl = do 306 _ <- text "<?xml" 307 _ <- versionInfo 308 e <- optional encodingDecl 309 _ <- optional sdDecl 310 _ <- optional whiteSpace 311 _ <- text "?>" 312 return e 313 314 315 ------------------------------------------------------------------------------ 316 versionInfo :: Parser () 317 versionInfo = do 318 whiteSpace *> text "version" *> eq *> (singleQuoted <|> doubleQuoted) 319 where 320 singleQuoted = P.char '\'' *> versionNum <* P.char '\'' 321 doubleQuoted = P.char '\"' *> versionNum <* P.char '\"' 322 versionNum = do 323 _ <- text "1." 324 _ <- some (P.satisfy (\c -> c >= '0' && c <= '9')) 325 return () 326 327 328 ------------------------------------------------------------------------------ 329 eq :: Parser () 330 eq = optional whiteSpace *> P.char '=' *> optional whiteSpace *> return () 331 332 333 ------------------------------------------------------------------------------ 334 misc :: Parser (Maybe Node) 335 misc = comment <|> processingInstruction <|> (whiteSpace *> return Nothing) 336 337 338 ------------------------------------------------------------------------------ 339 -- | Internal subset is parsed, but ignored since we don't have data types to 340 -- store it. 341 docTypeDecl :: Parser DocType 342 docTypeDecl = do 343 _ <- text "<!DOCTYPE" 344 whiteSpace 345 tag <- name 346 _ <- optional whiteSpace 347 extid <- externalID 348 _ <- optional whiteSpace 349 intsub <- internalDoctype 350 _ <- P.char '>' 351 return (DocType tag extid intsub) 352 353 354 ------------------------------------------------------------------------------ 355 -- | States for the DOCTYPE internal subset state machine. 356 data InternalDoctypeState = IDSStart 357 | IDSScanning Int 358 | IDSInQuote Int Char 359 | IDSCommentS1 Int 360 | IDSCommentS2 Int 361 | IDSCommentS3 Int 362 | IDSComment Int 363 | IDSCommentD1 Int 364 | IDSCommentE1 Int 365 366 367 ------------------------------------------------------------------------------ 368 -- | Internal DOCTYPE subset. We don't actually parse this; just scan through 369 -- and look for the end, and store it in a block of text. 370 internalDoctype :: Parser InternalSubset 371 internalDoctype = InternalText <$> T.pack <$> scanText (dfa IDSStart) 372 <|> return NoInternalSubset 373 where dfa IDSStart '[' = ScanNext (dfa (IDSScanning 0)) 374 dfa IDSStart _ = ScanFail "Not a DOCTYPE internal subset" 375 dfa (IDSInQuote n c) d 376 | c == d = ScanNext (dfa (IDSScanning n)) 377 | otherwise = ScanNext (dfa (IDSInQuote n c)) 378 dfa (IDSScanning n) '[' = ScanNext (dfa (IDSScanning (n+1))) 379 dfa (IDSScanning 0) ']' = ScanFinish 380 dfa (IDSScanning n) ']' = ScanNext (dfa (IDSScanning (n-1))) 381 dfa (IDSScanning n) '\'' = ScanNext (dfa (IDSInQuote n '\'')) 382 dfa (IDSScanning n) '\"' = ScanNext (dfa (IDSInQuote n '\"')) 383 dfa (IDSScanning n) '<' = ScanNext (dfa (IDSCommentS1 n)) 384 dfa (IDSScanning n) _ = ScanNext (dfa (IDSScanning n)) 385 dfa (IDSCommentS1 n) '[' = ScanNext (dfa (IDSScanning (n+1))) 386 dfa (IDSCommentS1 0) ']' = ScanFinish 387 dfa (IDSCommentS1 n) ']' = ScanNext (dfa (IDSScanning (n-1))) 388 dfa (IDSCommentS1 n) '\'' = ScanNext (dfa (IDSInQuote n '\'')) 389 dfa (IDSCommentS1 n) '\"' = ScanNext (dfa (IDSInQuote n '\"')) 390 dfa (IDSCommentS1 n) '!' = ScanNext (dfa (IDSCommentS2 n)) 391 dfa (IDSCommentS1 n) _ = ScanNext (dfa (IDSScanning n)) 392 dfa (IDSCommentS2 n) '[' = ScanNext (dfa (IDSScanning (n+1))) 393 dfa (IDSCommentS2 0) ']' = ScanFinish 394 dfa (IDSCommentS2 n) ']' = ScanNext (dfa (IDSScanning (n-1))) 395 dfa (IDSCommentS2 n) '\'' = ScanNext (dfa (IDSInQuote n '\'')) 396 dfa (IDSCommentS2 n) '\"' = ScanNext (dfa (IDSInQuote n '\"')) 397 dfa (IDSCommentS2 n) '-' = ScanNext (dfa (IDSCommentS3 n)) 398 dfa (IDSCommentS2 n) _ = ScanNext (dfa (IDSScanning n)) 399 dfa (IDSCommentS3 n) '[' = ScanNext (dfa (IDSScanning (n+1))) 400 dfa (IDSCommentS3 0) ']' = ScanFinish 401 dfa (IDSCommentS3 n) ']' = ScanNext (dfa (IDSScanning (n-1))) 402 dfa (IDSCommentS3 n) '\'' = ScanNext (dfa (IDSInQuote n '\'')) 403 dfa (IDSCommentS3 n) '\"' = ScanNext (dfa (IDSInQuote n '\"')) 404 dfa (IDSCommentS3 n) '-' = ScanNext (dfa (IDSComment n)) 405 dfa (IDSCommentS3 n) _ = ScanNext (dfa (IDSScanning n)) 406 dfa (IDSComment n) '-' = ScanNext (dfa (IDSCommentD1 n)) 407 dfa (IDSComment n) _ = ScanNext (dfa (IDSComment n)) 408 dfa (IDSCommentD1 n) '-' = ScanNext (dfa (IDSCommentE1 n)) 409 dfa (IDSCommentD1 n) _ = ScanNext (dfa (IDSComment n)) 410 dfa (IDSCommentE1 n) '>' = ScanNext (dfa (IDSScanning n)) 411 dfa (IDSCommentE1 _) _ = ScanFail "Poorly formatted comment" 412 413 414 ------------------------------------------------------------------------------ 415 sdDecl :: Parser () 416 sdDecl = do 417 _ <- P.try $ whiteSpace *> text "standalone" 418 eq 419 _ <- single <|> double 420 return () 421 where 422 single = P.char '\'' *> yesno <* P.char '\'' 423 double = P.char '\"' *> yesno <* P.char '\"' 424 yesno = text "yes" <|> text "no" 425 426 427 ------------------------------------------------------------------------------ 428 element :: Parser Node 429 element = do 430 (t,a,b) <- emptyOrStartTag 431 if b then return (Element t a []) 432 else nonEmptyElem t a 433 where 434 nonEmptyElem t a = do 435 c <- content 436 endTag t 437 return (Element t a c) 438 439 440 ------------------------------------------------------------------------------ 441 -- | Results are (tag name, attributes, isEmpty) 442 emptyOrStartTag :: Parser (Text, [(Text, Text)], Bool) 443 emptyOrStartTag = do 444 t <- P.try $ P.char '<' *> name 445 a <- many $ P.try $ do 446 whiteSpace 447 attribute 448 when (hasDups a) $ fail "Duplicate attribute names in element" 449 _ <- optional whiteSpace 450 e <- optional (P.char '/') 451 _ <- P.char '>' 452 return (t, a, isJust e) 453 where 454 hasDups a = length (nub (map fst a)) < length a 455 456 457 ------------------------------------------------------------------------------ 458 attribute :: Parser (Text, Text) 459 attribute = do 460 n <- name 461 eq 462 v <- attrValue 463 return (n,v) 464 465 466 ------------------------------------------------------------------------------ 467 endTag :: Text -> Parser () 468 endTag s = do 469 _ <- text "</" 470 t <- name 471 when (s /= t) $ fail $ "mismatched tags: </" ++ T.unpack t ++ 472 "> found inside <" ++ T.unpack s ++ "> tag" 473 _ <- optional whiteSpace 474 _ <- text ">" 475 return () 476 477 478 ------------------------------------------------------------------------------ 479 content :: Parser [Node] 480 content = do 481 n <- optional charData 482 ns <- fmap concat $ many $ do 483 s <- ((Just <$> TextNode <$> reference) 484 <|> cdSect 485 <|> processingInstruction 486 <|> comment 487 <|> fmap Just element) 488 t <- optional charData 489 return [s,t] 490 return $ coalesceText $ catMaybes (n:ns) 491 where 492 coalesceText (TextNode s : TextNode t : ns) 493 = coalesceText (TextNode (T.append s t) : ns) 494 coalesceText (n:ns) 495 = n : coalesceText ns 496 coalesceText [] 497 = [] 498 499 500 ------------------------------------------------------------------------------ 501 charRef :: Parser Text 502 charRef = hexCharRef <|> decCharRef 503 where 504 decCharRef = do 505 _ <- text "&#" 506 ds <- some digit 507 _ <- P.char ';' 508 let c = chr $ foldl' (\a b -> 10 * a + b) 0 ds 509 when (not (isValidChar c)) $ fail $ 510 "Reference is not a valid character" 511 return $ T.singleton c 512 where 513 digit = do 514 d <- P.satisfy (\c -> c >= '0' && c <= '9') 515 return (ord d - ord '0') 516 hexCharRef = do 517 _ <- text "&#x" 518 ds <- some digit 519 _ <- P.char ';' 520 let c = chr $ foldl' (\a b -> 16 * a + b) 0 ds 521 when (not (isValidChar c)) $ fail $ 522 "Reference is not a valid character" 523 return $ T.singleton c 524 where 525 digit = num <|> upper <|> lower 526 num = do 527 d <- P.satisfy (\c -> c >= '0' && c <= '9') 528 return (ord d - ord '0') 529 upper = do 530 d <- P.satisfy (\c -> c >= 'A' && c <= 'F') 531 return (10 + ord d - ord 'A') 532 lower = do 533 d <- P.satisfy (\c -> c >= 'a' && c <= 'f') 534 return (10 + ord d - ord 'a') 535 536 537 ------------------------------------------------------------------------------ 538 reference :: Parser Text 539 reference = charRef <|> entityRef 540 541 542 ------------------------------------------------------------------------------ 543 entityRef :: Parser Text 544 entityRef = do 545 _ <- P.char '&' 546 n <- name 547 _ <- P.char ';' 548 case M.lookup n entityRefLookup of 549 Nothing -> fail $ "Unknown entity reference: " ++ T.unpack n 550 Just t -> return t 551 where 552 entityRefLookup :: Map Text Text 553 entityRefLookup = M.fromList [ 554 ("amp", "&"), 555 ("lt", "<"), 556 ("gt", ">"), 557 ("apos", "\'"), 558 ("quot", "\"") 559 ] 560 561 562 ------------------------------------------------------------------------------ 563 externalID :: Parser ExternalID 564 externalID = systemID <|> publicID <|> return NoExternalID 565 where 566 systemID = do 567 _ <- text "SYSTEM" 568 whiteSpace 569 fmap System systemLiteral 570 publicID = do 571 _ <- text "PUBLIC" 572 whiteSpace 573 pid <- pubIdLiteral 574 whiteSpace 575 sid <- systemLiteral 576 return (Public pid sid) 577 578 579 ------------------------------------------------------------------------------ 580 encodingDecl :: Parser Text 581 encodingDecl = do 582 _ <- P.try $ whiteSpace *> text "encoding" 583 _ <- eq 584 singleQuoted <|> doubleQuoted 585 where 586 singleQuoted = P.char '\'' *> encName <* P.char '\'' 587 doubleQuoted = P.char '\"' *> encName <* P.char '\"' 588 encName = do 589 c <- P.satisfy isEncStart 590 cs <- takeWhile0 isEnc 591 return (T.cons c cs) 592 isEncStart c | c >= 'A' && c <= 'Z' = True 593 | c >= 'a' && c <= 'z' = True 594 | otherwise = False 595 isEnc c | c >= 'A' && c <= 'Z' = True 596 | c >= 'a' && c <= 'z' = True 597 | c >= '0' && c <= '9' = True 598 | c `elem` "._-" = True 599 | otherwise = False 600