pandoc/src/Text/Pandoc/Readers/OPML.hs

{-# LANGUAGE FlexibleContexts #-}
module Text.Pandoc.Readers.OPML ( readOPML ) where
import Data.Char (toUpper)
import Text.Pandoc.Options
import Text.Pandoc.Definition
import Text.Pandoc.Builder
import Text.Pandoc.Readers.HTML (readHtml)
import Text.Pandoc.Readers.Markdown (readMarkdown)
import Text.XML.Light
import Text.HTML.TagSoup.Entity (lookupEntity)
import Data.Generics
import Control.Monad.State
import Data.Default
import Text.Pandoc.Class (PandocMonad)

-- | Monad in which the OPML reader runs: a 'StateT' layer carrying
-- 'OPMLState' on top of an underlying monad @m@.
type OPML m = StateT OPMLState m

-- | State threaded through the OPML reader while it walks the XML tree.
data OPMLState = OPMLState{
                        opmlSectionLevel :: Int
                        -- ^ nesting depth of the @outline@ element currently
                        -- being processed; an outline's header level is this
                        -- value plus one
                      , opmlDocTitle     :: Inlines
                        -- ^ document title, taken from a @title@ element
                      , opmlDocAuthors   :: [Inlines]
                        -- ^ document authors, taken from an @ownerName@
                        -- element
                      , opmlDocDate      :: Inlines
                        -- ^ document date, taken from a @dateModified@
                        -- element
                      } deriving Show

-- | Initial reader state: section level 0 (not inside any outline) and
-- empty title, author list and date.
instance Default OPMLState where
  def = OPMLState{ opmlSectionLevel = 0
                 , opmlDocTitle = mempty
                 , opmlDocAuthors = []
                 , opmlDocDate = mempty
                  }

-- | Read an OPML document from a string and convert it to a 'Pandoc'
-- document.  The 'ReaderOptions' argument is currently ignored.
--
-- The input is parsed as XML, normalized with 'normalizeTree', and each
-- top-level content node is converted with 'parseBlock', starting from the
-- default 'OPMLState'.  Title, authors and date collected in the final
-- state become the document metadata.
readOPML :: PandocMonad m => ReaderOptions -> String -> m Pandoc
readOPML _ inp = do
  let contents = normalizeTree (parseXML inp)
  (blocks, finalState) <- runStateT (mapM parseBlock contents) def
  return
    . setTitle (opmlDocTitle finalState)
    . setAuthors (opmlDocAuthors finalState)
    . setDate (opmlDocDate finalState)
    . doc
    $ mconcat blocks

-- | Normalize parsed XML content by consolidating adjacent text nodes and
-- character references into a single text node.
--
-- The rewrite is applied to every @[Content]@ list in the tree via
-- 'everywhere'.  At the head of each list it:
--
-- * drops a raw CDATA node;
-- * merges two adjacent text nodes, keeping the first node's line info;
-- * merges a text node with a neighbouring character reference (expanded
--   with 'convertEntity'), keeping the text node's line info;
-- * merges two adjacent character references into a text node without
--   line info.
normalizeTree :: [Content] -> [Content]
normalizeTree = everywhere (mkT merge)
  where
    merge :: [Content] -> [Content]
    merge contents = case contents of
      Text (CData CDataRaw _ _) : rest ->
        rest
      Text (CData CDataText a ln) : Text (CData CDataText b _) : rest ->
        plain (a ++ b) ln : rest
      Text (CData CDataText a ln) : CRef ref : rest ->
        plain (a ++ convertEntity ref) ln : rest
      CRef ref : Text (CData CDataText a ln) : rest ->
        plain (convertEntity ref ++ a) ln : rest
      CRef refA : CRef refB : rest ->
        plain (convertEntity refA ++ convertEntity refB) Nothing : rest
      _ ->
        contents

    -- Build an ordinary (non-raw) text node.
    plain str ln = Text (CData CDataText str ln)

-- | Expand an entity name to its replacement text using 'lookupEntity'.
-- A name that 'lookupEntity' does not recognize is returned converted to
-- upper case.
convertEntity :: String -> String
convertEntity name =
  case lookupEntity name of
    Just expansion -> expansion
    Nothing        -> map toUpper name

-- | Value of the attribute of @elt@ whose unqualified name equals @attr@,
-- or the empty string when the element carries no such attribute.
attrValue :: String -> Element -> String
attrValue attr elt = maybe "" id (lookupAttrBy hasName (elAttribs elt))
  where
    -- Compare on the local part only, ignoring any namespace prefix.
    hasName key = qName key == attr

-- exceptT :: PandocMonad m => Either PandocError a -> OPML m a
-- exceptT = either throwError return

-- | Interpret a string as HTML (default reader options) and return its
-- inline content.
--
-- The inlines are returned when the HTML reader yields exactly one block
-- and that block is either 'Plain' or 'Para'.  Any other result (no
-- blocks, several blocks, or a single block of another kind) yields
-- 'mempty'.
--
-- 'Para' is accepted alongside 'Plain' so that text which the HTML reader
-- returns as a single paragraph is not silently discarded, which would
-- leave the outline header empty.
asHtml :: PandocMonad m => String -> OPML m Inlines
asHtml s = inlinesOf <$> lift (readHtml def s)
  where
    inlinesOf (Pandoc _ [Plain ils]) = fromList ils
    inlinesOf (Pandoc _ [Para ils])  = fromList ils
    inlinesOf _                      = mempty

-- | Interpret a string as Markdown (default reader options) and return the
-- resulting blocks; the parsed document's metadata is discarded.
asMarkdown :: PandocMonad m => String -> OPML m Blocks
asMarkdown s = fmap bodyOf (lift (readMarkdown def s))
  where
    bodyOf (Pandoc _ blks) = fromList blks

-- | Convert every child content node of an element with 'parseBlock' and
-- concatenate the results in document order.
getBlocks :: PandocMonad m => Element -> OPML m Blocks
getBlocks = fmap mconcat . mapM parseBlock . elContent

-- | Convert one XML content node to blocks, updating the reader state.
--
-- * @ownerName@, @dateModified@ and @title@ elements record their text in
--   the state as authors, date and title, and produce no blocks.
-- * @outline@ elements become a header one level deeper than the current
--   section level, followed by the Markdown-parsed @_note@ attribute and
--   the blocks of the nested content.
-- * @?xml@ elements produce nothing.
-- * Any other element is transparent: only its children are converted.
-- * Non-element content produces nothing.
parseBlock :: PandocMonad m => Content -> OPML m Blocks
parseBlock (Elem e) =
  case qName (elName e) of
    "ownerName"    -> updateMeta $ \st -> st{ opmlDocAuthors = [plainText] }
    "dateModified" -> updateMeta $ \st -> st{ opmlDocDate = plainText }
    "title"        -> updateMeta $ \st -> st{ opmlDocTitle = plainText }
    "outline"      -> do
      level <- gets opmlSectionLevel
      section (level + 1)
    "?xml"         -> return mempty
    _              -> getBlocks e
  where
    -- Text content of the element as inlines.
    plainText = text (strContent e)

    -- Apply a state update and contribute no blocks to the output.
    updateMeta f = modify f >> return mempty

    -- Build the section for an outline element at the given header level.
    -- The section level is raised while the nested content is converted
    -- and lowered again afterwards, so that sibling outlines get the same
    -- level as this one.
    section level = do
      title <- asHtml (attrValue "text" e)
      notes <- asMarkdown (attrValue "_note" e)
      modify $ \st -> st{ opmlSectionLevel = level }
      children <- getBlocks e
      modify $ \st -> st{ opmlSectionLevel = level - 1 }
      return $ header level (linkify title) <> notes <> children

    -- Outlines whose @type@ attribute is "link" (compared case-insensitively)
    -- get their header text wrapped in a link to the @url@ attribute.
    linkify title
      | map toUpper (attrValue "type" e) == "LINK" =
          link (attrValue "url" e) "" title
      | otherwise = title
parseBlock _ = return mempty
Change return type of OPML reader 2015-02-18 13:05:05 +00:00			`{-# LANGUAGE FlexibleContexts #-}`
Added Text.Pandoc.Readers.OPML, exporting readOPML. The _note attribute is supported. This is unofficial, but used e.g. in OmniOutliner and supported by multimarkdown. We treat the contents as markdown blocks under a section header. Added to documentation and tests. 2013-03-17 17:43:51 -07:00			`module Text.Pandoc.Readers.OPML ( readOPML ) where`
			`import Data.Char (toUpper)`
			`import Text.Pandoc.Options`
			`import Text.Pandoc.Definition`
			`import Text.Pandoc.Builder`
			`import Text.Pandoc.Readers.HTML (readHtml)`
			`import Text.Pandoc.Readers.Markdown (readMarkdown)`
			`import Text.XML.Light`
Remove TagSoup compat We already lower-bound tagsoup at 0.13.7, which means we were always running the compatibility layer (it was conditional on min value 0.13). Better to just use `lookupEntity` from the library directly, and convert a string to a char if need be. 2016-09-02 11:35:28 -04:00			`import Text.HTML.TagSoup.Entity (lookupEntity)`
Added Text.Pandoc.Readers.OPML, exporting readOPML. The _note attribute is supported. This is unofficial, but used e.g. in OmniOutliner and supported by multimarkdown. We treat the contents as markdown blocks under a section header. Added to documentation and tests. 2013-03-17 17:43:51 -07:00			`import Data.Generics`
			`import Control.Monad.State`
Change return type of OPML reader 2015-02-18 13:05:05 +00:00			`import Data.Default`
Unify Errors. 2016-12-01 12:13:51 -05:00			`import Text.Pandoc.Class (PandocMonad)`
Added Text.Pandoc.Readers.OPML, exporting readOPML. The _note attribute is supported. This is unofficial, but used e.g. in OmniOutliner and supported by multimarkdown. We treat the contents as markdown blocks under a section header. Added to documentation and tests. 2013-03-17 17:43:51 -07:00
Working on readers. 2016-11-28 17:13:46 -05:00			`type OPML m = StateT OPMLState m`
Added Text.Pandoc.Readers.OPML, exporting readOPML. The _note attribute is supported. This is unofficial, but used e.g. in OmniOutliner and supported by multimarkdown. We treat the contents as markdown blocks under a section header. Added to documentation and tests. 2013-03-17 17:43:51 -07:00
			`data OPMLState = OPMLState{`
			`opmlSectionLevel :: Int`
			`, opmlDocTitle :: Inlines`
			`, opmlDocAuthors :: [Inlines]`
			`, opmlDocDate :: Inlines`
			`} deriving Show`

Change return type of OPML reader 2015-02-18 13:05:05 +00:00			`instance Default OPMLState where`
			`def = OPMLState{ opmlSectionLevel = 0`
			`, opmlDocTitle = mempty`
			`, opmlDocAuthors = []`
			`, opmlDocDate = mempty`
			`}`

Working on readers. 2016-11-28 17:13:46 -05:00			`readOPML :: PandocMonad m => ReaderOptions -> String -> m Pandoc`
			`readOPML _ inp = do`
			`(bs, st') <- flip runStateT def (mapM parseBlock $ normalizeTree $ parseXML inp)`
			`return $`
			`setTitle (opmlDocTitle st') $`
			`setAuthors (opmlDocAuthors st') $`
			`setDate (opmlDocDate st') $`
			`doc $ mconcat bs`
Added Text.Pandoc.Readers.OPML, exporting readOPML. The _note attribute is supported. This is unofficial, but used e.g. in OmniOutliner and supported by multimarkdown. We treat the contents as markdown blocks under a section header. Added to documentation and tests. 2013-03-17 17:43:51 -07:00
			`-- normalize input, consolidating adjacent Text and CRef elements`
			`normalizeTree :: [Content] -> [Content]`
			`normalizeTree = everywhere (mkT go)`
			`where go :: [Content] -> [Content]`
			`go (Text (CData CDataRaw _ _):xs) = xs`
			`go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =`
			`Text (CData CDataText (s1 ++ s2) z):xs`
			`go (Text (CData CDataText s1 z):CRef r:xs) =`
			`Text (CData CDataText (s1 ++ convertEntity r) z):xs`
			`go (CRef r:Text (CData CDataText s1 z):xs) =`
			`Text (CData CDataText (convertEntity r ++ s1) z):xs`
			`go (CRef r1:CRef r2:xs) =`
			`Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs`
			`go xs = xs`

			`convertEntity :: String -> String`
Remove TagSoup compat We already lower-bound tagsoup at 0.13.7, which means we were always running the compatibility layer (it was conditional on min value 0.13). Better to just use `lookupEntity` from the library directly, and convert a string to a char if need be. 2016-09-02 11:35:28 -04:00			`convertEntity e = maybe (map toUpper e) id (lookupEntity e)`
Added Text.Pandoc.Readers.OPML, exporting readOPML. The _note attribute is supported. This is unofficial, but used e.g. in OmniOutliner and supported by multimarkdown. We treat the contents as markdown blocks under a section header. Added to documentation and tests. 2013-03-17 17:43:51 -07:00
			`-- convenience function to get an attribute value, defaulting to ""`
			`attrValue :: String -> Element -> String`
			`attrValue attr elt =`
			`case lookupAttrBy (\x -> qName x == attr) (elAttribs elt) of`
			`Just z -> z`
			`Nothing -> ""`

Unify Errors. 2016-12-01 12:13:51 -05:00			`-- exceptT :: PandocMonad m => Either PandocError a -> OPML m a`
Working on readers. 2016-11-28 17:13:46 -05:00			`-- exceptT = either throwError return`
Change return type of OPML reader 2015-02-18 13:05:05 +00:00
Working on readers. 2016-11-28 17:13:46 -05:00			`asHtml :: PandocMonad m => String -> OPML m Inlines`
			`asHtml s =`
			`(\(Pandoc _ bs) -> case bs of`
Change return type of OPML reader 2015-02-18 13:05:05 +00:00			`[Plain ils] -> fromList ils`
Working on readers. 2016-11-28 17:13:46 -05:00			`_ -> mempty) <$> (lift $ readHtml def s)`
Added Text.Pandoc.Readers.OPML, exporting readOPML. The _note attribute is supported. This is unofficial, but used e.g. in OmniOutliner and supported by multimarkdown. We treat the contents as markdown blocks under a section header. Added to documentation and tests. 2013-03-17 17:43:51 -07:00
Working on readers. 2016-11-28 17:13:46 -05:00			`asMarkdown :: PandocMonad m => String -> OPML m Blocks`
			`asMarkdown s = (\(Pandoc _ bs) -> fromList bs) <$> (lift $ readMarkdown def s)`
Added Text.Pandoc.Readers.OPML, exporting readOPML. The _note attribute is supported. This is unofficial, but used e.g. in OmniOutliner and supported by multimarkdown. We treat the contents as markdown blocks under a section header. Added to documentation and tests. 2013-03-17 17:43:51 -07:00
Working on readers. 2016-11-28 17:13:46 -05:00			`getBlocks :: PandocMonad m => Element -> OPML m Blocks`
Added Text.Pandoc.Readers.OPML, exporting readOPML. The _note attribute is supported. This is unofficial, but used e.g. in OmniOutliner and supported by multimarkdown. We treat the contents as markdown blocks under a section header. Added to documentation and tests. 2013-03-17 17:43:51 -07:00			`getBlocks e = mconcat <$> (mapM parseBlock $ elContent e)`

Working on readers. 2016-11-28 17:13:46 -05:00			`parseBlock :: PandocMonad m => Content -> OPML m Blocks`
Added Text.Pandoc.Readers.OPML, exporting readOPML. The _note attribute is supported. This is unofficial, but used e.g. in OmniOutliner and supported by multimarkdown. We treat the contents as markdown blocks under a section header. Added to documentation and tests. 2013-03-17 17:43:51 -07:00			`parseBlock (Elem e) =`
			`case qName (elName e) of`
			`"ownerName" -> mempty <$ modify (\st ->`
			`st{opmlDocAuthors = [text $ strContent e]})`
			`"dateModified" -> mempty <$ modify (\st ->`
			`st{opmlDocDate = text $ strContent e})`
			`"title" -> mempty <$ modify (\st ->`
			`st{opmlDocTitle = text $ strContent e})`
			`"outline" -> gets opmlSectionLevel >>= sect . (+1)`
			`"?xml" -> return mempty`
			`_ -> getBlocks e`
Change return type of OPML reader 2015-02-18 13:05:05 +00:00			`where sect n = do headerText <- asHtml $ attrValue "text" e`
			`noteBlocks <- asMarkdown $ attrValue "_note" e`
Added Text.Pandoc.Readers.OPML, exporting readOPML. The _note attribute is supported. This is unofficial, but used e.g. in OmniOutliner and supported by multimarkdown. We treat the contents as markdown blocks under a section header. Added to documentation and tests. 2013-03-17 17:43:51 -07:00			`modify $ \st -> st{ opmlSectionLevel = n }`
			`bs <- getBlocks e`
			`modify $ \st -> st{ opmlSectionLevel = n - 1 }`
OPML reader: Type attributes are not case sensitive. So, `type="link"` or `type="LINK"`. 2013-03-20 09:16:16 -07:00			`let headerText' = case map toUpper (attrValue "type" e) of`
			`"LINK" -> link`
Added Text.Pandoc.Readers.OPML, exporting readOPML. The _note attribute is supported. This is unofficial, but used e.g. in OmniOutliner and supported by multimarkdown. We treat the contents as markdown blocks under a section header. Added to documentation and tests. 2013-03-17 17:43:51 -07:00			`(attrValue "url" e) "" headerText`
			`_ -> headerText`
			`return $ header n headerText' <> noteBlocks <> bs`
			`parseBlock _ = return mempty`