1 {-# LANGUAGE OverloadedStrings #-} 2 {-# LANGUAGE FlexibleInstances #-} 3 {-# LANGUAGE MultiParamTypeClasses #-} 4 5 module Text.XmlHtml.Common where 6 7 import Blaze.ByteString.Builder 8 import Data.Maybe 9 10 import Data.Text (Text) 11 import qualified Data.Text as T 12 import qualified Data.Text.Encoding as T 13 import qualified Data.Text.Encoding.Error as TE 14 15 import Data.ByteString (ByteString) 16 17 18 ------------------------------------------------------------------------------ 19 -- | Represents a document fragment, including the format, encoding, and 20 -- document type declaration as well as its content. 21 data Document = XmlDocument { 22 docEncoding :: !Encoding, 23 docType :: !(Maybe DocType), 24 docContent :: ![Node] 25 } 26 | HtmlDocument { 27 docEncoding :: !Encoding, 28 docType :: !(Maybe DocType), 29 docContent :: ![Node] 30 } 31 deriving (Eq, Show) 32 33 34 ------------------------------------------------------------------------------ 35 -- | A node of a document structure. A node can be text, a comment, or an 36 -- element. XML processing instructions are intentionally omitted as a 37 -- simplification, and CDATA and plain text are both text nodes, since they 38 -- ought to be semantically interchangeable. 39 data Node = TextNode !Text 40 | Comment !Text 41 | Element { 42 elementTag :: !Text, 43 elementAttrs :: ![(Text, Text)], 44 elementChildren :: ![Node] 45 } 46 deriving (Eq, Show) 47 48 49 ------------------------------------------------------------------------------ 50 -- | Determines whether the node is text or not. 51 isTextNode :: Node -> Bool 52 isTextNode (TextNode _) = True 53 isTextNode _ = False 54 55 56 ------------------------------------------------------------------------------ 57 -- | Determines whether the node is a comment or not. 58 isComment :: Node -> Bool 59 isComment (Comment _) = True 60 isComment _ = False 61 62 63 ------------------------------------------------------------------------------ 64 -- | Determines whether the node is an element or not. 65 isElement :: Node -> Bool 66 isElement (Element _ _ _) = True 67 isElement _ = False 68 69 70 ------------------------------------------------------------------------------ 71 -- | Gives the tag name of an element, or 'Nothing' if the node isn't an 72 -- element. 73 tagName :: Node -> Maybe Text 74 tagName (Element t _ _) = Just t 75 tagName _ = Nothing 76 77 78 ------------------------------------------------------------------------------ 79 -- | Retrieves the attribute with the given name. If the 'Node' is not an 80 -- element, the result is always 'Nothing' 81 getAttribute :: Text -> Node -> Maybe Text 82 getAttribute name (Element _ attrs _) = lookup name attrs 83 getAttribute _ _ = Nothing 84 85 86 ------------------------------------------------------------------------------ 87 -- | Checks if a given attribute exists in a 'Node'. 88 hasAttribute :: Text -> Node -> Bool 89 hasAttribute name = isJust . getAttribute name 90 91 92 ------------------------------------------------------------------------------ 93 -- | Sets the attribute name to the given value. If the 'Node' is not an 94 -- element, this is the identity. 95 setAttribute :: Text -> Text -> Node -> Node 96 setAttribute name val (Element t a c) = Element t newAttrs c 97 where newAttrs = (name, val) : filter ((/= name) . fst) a 98 setAttribute _ _ n = n 99 100 101 ------------------------------------------------------------------------------ 102 -- | Gives the entire text content of a node, ignoring markup. 103 nodeText :: Node -> Text 104 nodeText (TextNode t) = t 105 nodeText (Comment _) = "" 106 nodeText (Element _ _ c) = T.concat (map nodeText c) 107 108 109 ------------------------------------------------------------------------------ 110 -- | Gives the child nodes of the given node. Only elements have child nodes. 111 childNodes :: Node -> [Node] 112 childNodes (Element _ _ c) = c 113 childNodes _ = [] 114 115 116 ------------------------------------------------------------------------------ 117 -- | Gives the child elements of the given node. 118 childElements :: Node -> [Node] 119 childElements = filter isElement . childNodes 120 121 122 ------------------------------------------------------------------------------ 123 -- | Gives all of the child elements of the node with the given tag 124 -- name. 125 childElementsTag :: Text -> Node -> [Node] 126 childElementsTag tag = filter ((== Just tag) . tagName) . childNodes 127 128 129 ------------------------------------------------------------------------------ 130 -- | Gives the first child element of the node with the given tag name, 131 -- or 'Nothing' if there is no such child element. 132 childElementTag :: Text -> Node -> Maybe Node 133 childElementTag tag = listToMaybe . childElementsTag tag 134 135 136 ------------------------------------------------------------------------------ 137 -- | Gives the descendants of the given node in the order that they begin in 138 -- the document. 139 descendantNodes :: Node -> [Node] 140 descendantNodes = concatMap (\n -> n : descendantNodes n) . childNodes 141 142 ------------------------------------------------------------------------------ 143 -- | Gives the descendant elements of the given node, in the order that their 144 -- start tags appear in the document. 145 descendantElements :: Node -> [Node] 146 descendantElements = filter isElement . descendantNodes 147 148 149 ------------------------------------------------------------------------------ 150 -- | Gives the descendant elements with a given tag name. 151 descendantElementsTag :: Text -> Node -> [Node] 152 descendantElementsTag tag = filter ((== Just tag) . tagName) . descendantNodes 153 154 155 ------------------------------------------------------------------------------ 156 -- | Gives the first descendant element of the node with the given tag name, 157 -- or 'Nothing' if there is no such element. 158 descendantElementTag :: Text -> Node -> Maybe Node 159 descendantElementTag tag = listToMaybe . descendantElementsTag tag 160 161 162 ------------------------------------------------------------------------------ 163 -- | A document type declaration. Note that DTD internal subsets are 164 -- currently unimplemented. 165 data DocType = DocType !Text !ExternalID !InternalSubset 166 deriving (Eq, Show) 167 168 169 ------------------------------------------------------------------------------ 170 -- | An external ID, as in a document type declaration. This can be a 171 -- SYSTEM identifier, or a PUBLIC identifier, or can be omitted. 172 data ExternalID = Public !Text !Text 173 | System !Text 174 | NoExternalID 175 deriving (Eq, Show) 176 177 178 ------------------------------------------------------------------------------ 179 -- | The internal subset is unparsed, but preserved in case it's actually 180 -- wanted. 181 data InternalSubset = InternalText !Text 182 | NoInternalSubset 183 deriving (Eq, Show) 184 185 186 ------------------------------------------------------------------------------ 187 -- | The character encoding of a document. Currently only the required 188 -- character encodings are implemented. 189 data Encoding = UTF8 | UTF16BE | UTF16LE deriving (Eq, Show) 190 191 192 ------------------------------------------------------------------------------ 193 -- | Retrieves the preferred name of a character encoding for embedding in 194 -- a document. 195 encodingName :: Encoding -> Text 196 encodingName UTF8 = "UTF-8" 197 encodingName UTF16BE = "UTF-16" 198 encodingName UTF16LE = "UTF-16" 199 200 201 ------------------------------------------------------------------------------ 202 -- | Gets the encoding function from 'Text' to 'ByteString' for an encoding. 203 encoder :: Encoding -> Text -> ByteString 204 encoder UTF8 = T.encodeUtf8 205 encoder UTF16BE = T.encodeUtf16BE 206 encoder UTF16LE = T.encodeUtf16LE 207 208 209 ------------------------------------------------------------------------------ 210 -- | Gets the decoding function from 'ByteString' to 'Text' for an encoding. 211 decoder :: Encoding -> ByteString -> Text 212 decoder UTF8 = T.decodeUtf8With (TE.replace '\xFFFF') 213 decoder UTF16BE = T.decodeUtf16BEWith (TE.replace '\xFFFF') 214 decoder UTF16LE = T.decodeUtf16LEWith (TE.replace '\xFFFF') 215 216 217 ------------------------------------------------------------------------------ 218 isUTF16 :: Encoding -> Bool 219 isUTF16 e = e == UTF16BE || e == UTF16LE 220 221 222 ------------------------------------------------------------------------------ 223 fromText :: Encoding -> Text -> Builder 224 fromText e t = fromByteString (encoder e t) 225