1 {-# LANGUAGE OverloadedStrings     #-}
    2 {-# LANGUAGE FlexibleInstances     #-}
    3 {-# LANGUAGE MultiParamTypeClasses #-}
    4 
    5 module Text.XmlHtml.Common where
    6 
    7 import           Blaze.ByteString.Builder
    8 import           Data.Maybe
    9 
   10 import           Data.Text (Text)
   11 import qualified Data.Text as T
   12 import qualified Data.Text.Encoding as T
   13 import qualified Data.Text.Encoding.Error as TE
   14 
   15 import           Data.ByteString (ByteString)
   16 
   17 
   18 ------------------------------------------------------------------------------
   19 -- | Represents a document fragment, including the format, encoding, and
   20 -- document type declaration as well as its content.
   21 data Document = XmlDocument  {
   22                     docEncoding :: !Encoding,
   23                     docType     :: !(Maybe DocType),
   24                     docContent  :: ![Node]
   25                 }
   26               | HtmlDocument {
   27                     docEncoding :: !Encoding,
   28                     docType     :: !(Maybe DocType),
   29                     docContent  :: ![Node]
   30                 }
   31     deriving (Eq, Show)
   32 
   33 
   34 ------------------------------------------------------------------------------
   35 -- | A node of a document structure.  A node can be text, a comment, or an
   36 -- element.  XML processing instructions are intentionally omitted as a
   37 -- simplification, and CDATA and plain text are both text nodes, since they
   38 -- ought to be semantically interchangeable.
   39 data Node = TextNode !Text
   40           | Comment  !Text
   41           | Element {
   42                 elementTag      :: !Text,
   43                 elementAttrs    :: ![(Text, Text)],
   44                 elementChildren :: ![Node]
   45             }
   46     deriving (Eq, Show)
   47 
   48 
   49 ------------------------------------------------------------------------------
   50 -- | Determines whether the node is text or not.
   51 isTextNode :: Node -> Bool
   52 isTextNode (TextNode _) = True
   53 isTextNode _            = False
   54 
   55 
   56 ------------------------------------------------------------------------------
   57 -- | Determines whether the node is a comment or not.
   58 isComment :: Node -> Bool
   59 isComment (Comment _) = True
   60 isComment _           = False
   61 
   62 
   63 ------------------------------------------------------------------------------
   64 -- | Determines whether the node is an element or not.
   65 isElement :: Node -> Bool
   66 isElement (Element _ _ _) = True
   67 isElement _               = False
   68 
   69 
   70 ------------------------------------------------------------------------------
   71 -- | Gives the tag name of an element, or 'Nothing' if the node isn't an
   72 -- element.
   73 tagName :: Node -> Maybe Text
   74 tagName (Element t _ _) = Just t
   75 tagName _               = Nothing
   76 
   77 
   78 ------------------------------------------------------------------------------
   79 -- | Retrieves the attribute with the given name.  If the 'Node' is not an
   80 -- element, the result is always 'Nothing'
   81 getAttribute :: Text -> Node -> Maybe Text
   82 getAttribute name (Element _ attrs _) = lookup name attrs
   83 getAttribute _    _                   = Nothing
   84 
   85 
   86 ------------------------------------------------------------------------------
   87 -- | Checks if a given attribute exists in a 'Node'.
   88 hasAttribute :: Text -> Node -> Bool
   89 hasAttribute name = isJust . getAttribute name
   90 
   91 
   92 ------------------------------------------------------------------------------
   93 -- | Sets the attribute name to the given value.  If the 'Node' is not an
   94 -- element, this is the identity.
   95 setAttribute :: Text -> Text -> Node -> Node
   96 setAttribute name val (Element t a c) = Element t newAttrs c
   97   where newAttrs = (name, val) : filter ((/= name) . fst) a
   98 setAttribute _    _   n                   = n
   99 
  100 
  101 ------------------------------------------------------------------------------
  102 -- | Gives the entire text content of a node, ignoring markup.
  103 nodeText :: Node -> Text
  104 nodeText (TextNode t)    = t
  105 nodeText (Comment _)     = ""
  106 nodeText (Element _ _ c) = T.concat (map nodeText c)
  107 
  108 
  109 ------------------------------------------------------------------------------
  110 -- | Gives the child nodes of the given node.  Only elements have child nodes.
  111 childNodes :: Node -> [Node]
  112 childNodes (Element _ _ c) = c
  113 childNodes _               = []
  114 
  115 
  116 ------------------------------------------------------------------------------
  117 -- | Gives the child elements of the given node.
  118 childElements :: Node -> [Node]
  119 childElements = filter isElement . childNodes
  120 
  121 
  122 ------------------------------------------------------------------------------
  123 -- | Gives all of the child elements of the node with the given tag
  124 -- name.
  125 childElementsTag :: Text -> Node -> [Node]
  126 childElementsTag tag = filter ((== Just tag) . tagName) . childNodes
  127 
  128 
  129 ------------------------------------------------------------------------------
  130 -- | Gives the first child element of the node with the given tag name,
  131 -- or 'Nothing' if there is no such child element.
  132 childElementTag :: Text -> Node -> Maybe Node
  133 childElementTag tag = listToMaybe . childElementsTag tag
  134 
  135 
  136 ------------------------------------------------------------------------------
  137 -- | Gives the descendants of the given node in the order that they begin in
  138 -- the document.
  139 descendantNodes :: Node -> [Node]
  140 descendantNodes = concatMap (\n -> n : descendantNodes n) . childNodes
  141 
  142 ------------------------------------------------------------------------------
  143 -- | Gives the descendant elements of the given node, in the order that their
  144 -- start tags appear in the document.
  145 descendantElements :: Node -> [Node]
  146 descendantElements = filter isElement . descendantNodes
  147 
  148 
  149 ------------------------------------------------------------------------------
  150 -- | Gives the descendant elements with a given tag name.
  151 descendantElementsTag :: Text -> Node -> [Node]
  152 descendantElementsTag tag = filter ((== Just tag) . tagName) . descendantNodes
  153 
  154 
  155 ------------------------------------------------------------------------------
  156 -- | Gives the first descendant element of the node with the given tag name,
  157 -- or 'Nothing' if there is no such element.
  158 descendantElementTag :: Text -> Node -> Maybe Node
  159 descendantElementTag tag = listToMaybe . descendantElementsTag tag
  160 
  161 
  162 ------------------------------------------------------------------------------
  163 -- | A document type declaration.  Note that DTD internal subsets are
  164 -- currently unimplemented.
  165 data DocType = DocType !Text !ExternalID !InternalSubset
  166     deriving (Eq, Show)
  167 
  168 
  169 ------------------------------------------------------------------------------
  170 -- | An external ID, as in a document type declaration.  This can be a
  171 -- SYSTEM identifier, or a PUBLIC identifier, or can be omitted.
  172 data ExternalID = Public !Text !Text
  173                 | System !Text
  174                 | NoExternalID
  175     deriving (Eq, Show)
  176 
  177 
  178 ------------------------------------------------------------------------------
  179 -- | The internal subset is unparsed, but preserved in case it's actually
  180 -- wanted.
  181 data InternalSubset = InternalText !Text
  182                     | NoInternalSubset
  183     deriving (Eq, Show)
  184 
  185 
  186 ------------------------------------------------------------------------------
  187 -- | The character encoding of a document.  Currently only the required
  188 -- character encodings are implemented.
  189 data Encoding = UTF8 | UTF16BE | UTF16LE deriving (Eq, Show)
  190 
  191 
  192 ------------------------------------------------------------------------------
  193 -- | Retrieves the preferred name of a character encoding for embedding in
  194 -- a document.
  195 encodingName :: Encoding -> Text
  196 encodingName UTF8    = "UTF-8"
  197 encodingName UTF16BE = "UTF-16"
  198 encodingName UTF16LE = "UTF-16"
  199 
  200 
  201 ------------------------------------------------------------------------------
  202 -- | Gets the encoding function from 'Text' to 'ByteString' for an encoding.
  203 encoder :: Encoding -> Text -> ByteString
  204 encoder UTF8    = T.encodeUtf8
  205 encoder UTF16BE = T.encodeUtf16BE
  206 encoder UTF16LE = T.encodeUtf16LE
  207 
  208 
  209 ------------------------------------------------------------------------------
  210 -- | Gets the decoding function from 'ByteString' to 'Text' for an encoding.
  211 decoder :: Encoding -> ByteString -> Text
  212 decoder UTF8    = T.decodeUtf8With    (TE.replace '\xFFFF')
  213 decoder UTF16BE = T.decodeUtf16BEWith (TE.replace '\xFFFF')
  214 decoder UTF16LE = T.decodeUtf16LEWith (TE.replace '\xFFFF')
  215 
  216 
  217 ------------------------------------------------------------------------------
  218 isUTF16 :: Encoding -> Bool
  219 isUTF16 e = e == UTF16BE || e == UTF16LE
  220 
  221 
  222 ------------------------------------------------------------------------------
  223 fromText :: Encoding -> Text -> Builder
  224 fromText e t = fromByteString (encoder e t)
  225