pandoc/src/Text/Pandoc/Readers/DocBook.hs

104 lines
4.6 KiB
Haskell
Raw Normal View History

module Text.Pandoc.Readers.DocBook ( readDocBook ) where
import Text.Pandoc.Parsing (ParserState(..))
import Text.Pandoc.Definition
import Text.Pandoc.Builder
import Text.XML.Light
import Data.Monoid
import Data.Char (isSpace)
2012-04-14 17:33:56 -07:00
import Control.Monad.State
import Control.Applicative ((<$>))
type DB = State DBState
data DBState = DBState{ dbSectionLevel :: Int
, dbDocTitle :: Inlines
, dbDocAuthors :: [Inlines]
, dbDocDate :: Inlines
} deriving Show
readDocBook :: ParserState -> String -> Pandoc
readDocBook st inp = setTitle (dbDocTitle st')
$ setAuthors (dbDocAuthors st')
$ setDate (dbDocDate st')
$ doc $ mconcat bs
where (bs, st') = runState (mapM parseBlock $ parseXML inp)
DBState{ dbSectionLevel = 0
, dbDocTitle = mempty
, dbDocAuthors = []
, dbDocDate = mempty
}
2012-04-14 17:33:56 -07:00
parseBlock :: Content -> DB Blocks
parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE
2012-04-14 17:33:56 -07:00
parseBlock (Text (CData _ s _)) = if all isSpace s
then return mempty
else return $ plain $ text s
2012-04-14 17:33:56 -07:00
parseBlock (Elem e) =
case qName (elName e) of
2012-04-14 17:33:56 -07:00
"para" -> para <$> getInlines e
"blockquote" -> blockQuote <$> innerBlocks
2012-04-14 17:33:56 -07:00
"sect1" -> sect 1
"sect2" -> sect 2
"sect3" -> sect 3
"sect4" -> sect 4
"sect5" -> sect 5
"sect6" -> sect 6
"section" -> gets dbSectionLevel >>= sect . (+1)
"articleinfo" -> getTitle >> getAuthors >> getDate >> return mempty
"title" -> return mempty -- processed by sect
"?xml" -> return mempty
2012-04-14 17:33:56 -07:00
_ -> innerBlocks
where innerBlocks = mconcat <$> (mapM parseBlock $ elContent e)
getInlines e' = (trimInlines . mconcat) <$>
(mapM parseInline $ elContent e')
isTitle e' = qName (elName e') == "title"
skipWhite (Text (CData _ s _):xs) | all isSpace s = skipWhite xs
| otherwise = xs
skipWhite xs = xs
getTitle = case findChild (unqual "title") e of
Just t -> do
tit <- getInlines t
modify $ \st -> st{dbDocTitle = tit}
Nothing -> return ()
getAuthors = do
auths <- mapM getInlines
$ findChildren (unqual "author") e
modify $ \st -> st{dbDocAuthors = auths}
getDate = case findChild (unqual "date") e of
Just t -> do
dat <- getInlines t
modify $ \st -> st{dbDocDate = dat}
Nothing -> return ()
2012-04-14 17:33:56 -07:00
sect n = case skipWhite (elContent e) of
((Elem t):body)
| isTitle t -> do
h <- header n <$> (getInlines t)
modify $ \st -> st{ dbSectionLevel = n }
b <- mconcat <$> (mapM parseBlock body)
modify $ \st -> st{ dbSectionLevel = n - 1 }
return $ h <> b
body -> do
let h = header n mempty
modify $ \st -> st{ dbSectionLevel = n }
2012-04-14 17:33:56 -07:00
b <- mconcat <$> (mapM parseBlock body)
modify $ \st -> st{ dbSectionLevel = n - 1 }
2012-04-14 17:33:56 -07:00
return $ h <> b
parseBlock (CRef _) = return mempty
2012-04-14 17:33:56 -07:00
parseInline :: Content -> DB Inlines
parseInline (Text (CData _ s _)) = return $ text s
parseInline (Elem e) =
case qName (elName e) of
"subscript" -> subscript <$> innerInlines
"superscript" -> superscript <$> innerInlines
"emphasis" -> case lookupAttrBy (\attr -> qName attr == "role")
(elAttribs e) of
2012-04-14 17:33:56 -07:00
Just "strong" -> strong <$> innerInlines
_ -> emph <$> innerInlines
"footnote" -> (note . mconcat) <$> (mapM parseBlock $ elContent e)
_ -> innerInlines
2012-04-14 17:33:56 -07:00
where innerInlines = (trimInlines . mconcat) <$>
(mapM parseInline $ elContent e)
parseInline (CRef _) = return mempty