Move Docx reader to DocxContext monad

This is a ReaderT State stack, which keeps track of some environment info, such
as the options and the docx doc. The state will come in handy in the future,
for a couple of planned features (rewriting the section anchors as auto_idents,
and hopefully smart-quoting).
This commit is contained in:
Jesse Rosenthal 2014-06-26 16:48:41 -04:00
parent b1a8f1fa1a
commit 4248f25152

View file

@ -88,6 +88,9 @@ import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as B
import Data.ByteString.Base64 (encode)
import System.FilePath (combine)
import qualified Data.Map as M
import Control.Monad.Reader
import Control.Monad.State
readDocx :: ReaderOptions
-> B.ByteString
@ -97,11 +100,24 @@ readDocx opts bytes =
Just docx -> Pandoc nullMeta (docxToBlocks opts docx)
Nothing -> error $ "couldn't parse docx file"
data DState = DState { docxHdrLinks :: M.Map String String }
data DEnv = DEnv { docxOptions :: ReaderOptions
, docxDocument :: Docx}
type DocxContext = ReaderT DEnv (State DState)
evalDocxContext :: DocxContext a -> DEnv -> DState -> a
evalDocxContext ctx env st = evalState (runReaderT ctx env) st
concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b]
concatMapM f xs = liftM concat (mapM f xs)
-- This is empty, but we put it in for future-proofing.
spansToKeep :: [String]
spansToKeep = []
-- This is empty, but we put it in for future-proofing.
divsToKeep :: [String]
divsToKeep = ["list-item", "Definition", "DefinitionTerm"]
@ -213,57 +229,69 @@ inlineCodeContainer (Container f) = case f [] of
inlineCodeContainer _ = False
runToInlines :: ReaderOptions -> Docx -> Run -> [Inline]
runToInlines _ _ (Run rs runElems)
runToInlines :: Run -> DocxContext [Inline]
runToInlines (Run rs runElems)
| any inlineCodeContainer (runStyleToContainers rs) =
return $
rebuild (runStyleToContainers rs) $ [Str $ runElemsToString runElems]
| otherwise =
return $
rebuild (runStyleToContainers rs) (concatMap runElemToInlines runElems)
runToInlines opts docx@(Docx _ notes _ _ _ ) (Footnote fnId) =
runToInlines (Footnote fnId) = do
(Docx _ notes _ _ _ ) <- asks docxDocument
case (getFootNote fnId notes) of
Just bodyParts ->
[Note (concatMap (bodyPartToBlocks opts docx) bodyParts)]
Nothing ->
[Note []]
runToInlines opts docx@(Docx _ notes _ _ _) (Endnote fnId) =
Just bodyParts -> do
blks <- concatMapM bodyPartToBlocks bodyParts
return $ [Note blks]
Nothing -> return [Note []]
runToInlines (Endnote fnId) = do
(Docx _ notes _ _ _ ) <- asks docxDocument
case (getEndNote fnId notes) of
Just bodyParts ->
[Note (concatMap (bodyPartToBlocks opts docx) bodyParts)]
Nothing ->
[Note []]
Just bodyParts -> do
blks <- concatMapM bodyPartToBlocks bodyParts
return $ [Note blks]
Nothing -> return [Note []]
parPartToInlines :: ReaderOptions -> Docx -> ParPart -> [Inline]
parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r
parPartToInlines opts docx (Insertion _ author date runs) =
parPartToInlines :: ParPart -> DocxContext [Inline]
parPartToInlines (PlainRun r) = runToInlines r
parPartToInlines (Insertion _ author date runs) = do
opts <- asks docxOptions
case readerTrackChanges opts of
AcceptChanges -> concatMap (runToInlines opts docx) runs
RejectChanges -> []
AllChanges ->
[Span
("", ["insertion"], [("author", author), ("date", date)])
(concatMap (runToInlines opts docx) runs)]
parPartToInlines opts docx (Deletion _ author date runs) =
AcceptChanges -> concatMapM runToInlines runs >>= return
RejectChanges -> return []
AllChanges -> do
ils <- (concatMapM runToInlines runs)
return [Span
("", ["insertion"], [("author", author), ("date", date)])
ils]
parPartToInlines (Deletion _ author date runs) = do
opts <- asks docxOptions
case readerTrackChanges opts of
AcceptChanges -> []
RejectChanges -> concatMap (runToInlines opts docx) runs
AllChanges ->
[Span
("", ["deletion"], [("author", author), ("date", date)])
(concatMap (runToInlines opts docx) runs)]
parPartToInlines _ _ (BookMark _ anchor) | anchor `elem` dummyAnchors = []
parPartToInlines _ _ (BookMark _ anchor) = [Span (anchor, ["anchor"], []) []]
parPartToInlines _ (Docx _ _ _ rels _) (Drawing relid) =
case lookupRelationship relid rels of
AcceptChanges -> return []
RejectChanges -> concatMapM runToInlines runs >>= return
AllChanges -> do
ils <- concatMapM runToInlines runs
return [Span
("", ["deletion"], [("author", author), ("date", date)])
ils]
parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = return []
parPartToInlines (BookMark _ anchor) = return [Span (anchor, ["anchor"], []) []]
parPartToInlines (Drawing relid) = do
(Docx _ _ _ rels _) <- asks docxDocument
return $ case lookupRelationship relid rels of
Just target -> [Image [] (combine "word" target, "")]
Nothing -> [Image [] ("", "")]
parPartToInlines opts docx (InternalHyperLink anchor runs) =
[Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")]
parPartToInlines opts docx@(Docx _ _ _ rels _) (ExternalHyperLink relid runs) =
case lookupRelationship relid rels of
parPartToInlines (InternalHyperLink anchor runs) = do
ils <- concatMapM runToInlines runs
return [Link ils ('#' : anchor, "")]
parPartToInlines (ExternalHyperLink relid runs) = do
(Docx _ _ _ rels _) <- asks docxDocument
rs <- concatMapM runToInlines runs
return $ case lookupRelationship relid rels of
Just target ->
[Link (concatMap (runToInlines opts docx) runs) (target, "")]
[Link rs (target, "")]
Nothing ->
[Link (concatMap (runToInlines opts docx) runs) ("", "")]
[Link rs ("", "")]
isAnchorSpan :: Inline -> Bool
isAnchorSpan (Span (ident, classes, kvs) ils) =
@ -287,25 +315,18 @@ makeHeaderAnchors h@(Header n (_, classes, kvs) ils) =
_ -> h
makeHeaderAnchors blk = blk
parPartsToInlines :: ReaderOptions -> Docx -> [ParPart] -> [Inline]
parPartsToInlines opts docx parparts =
--
-- We're going to skip data-uri's for now. It should be an option,
-- not mandatory.
--
(if False -- TODO depend on option
then walk (makeImagesSelfContained docx)
else id) $
-- bottomUp spanTrim $
-- bottomUp spanCorrect $
-- bottomUp spanReduce $
reduceList $ concatMap (parPartToInlines opts docx) parparts
parPartsToInlines :: [ParPart] -> DocxContext [Inline]
parPartsToInlines parparts = do
ils <- concatMapM parPartToInlines parparts >>=
-- TODO: Option for self-containted images
(if False then (walkM makeImagesSelfContained) else return)
return $ reduceList $ ils
cellToBlocks :: ReaderOptions -> Docx -> Cell -> [Block]
cellToBlocks opts docx (Cell bps) = concatMap (bodyPartToBlocks opts docx) bps
cellToBlocks :: Cell -> DocxContext [Block]
cellToBlocks (Cell bps) = concatMapM bodyPartToBlocks bps
rowToBlocksList :: ReaderOptions -> Docx -> Row -> [[Block]]
rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells
rowToBlocksList :: Row -> DocxContext [[Block]]
rowToBlocksList (Row cells) = mapM cellToBlocks cells
blockCodeContainer :: Container Block -> Bool
blockCodeContainer (Container f) = case f [] of
@ -313,27 +334,32 @@ blockCodeContainer (Container f) = case f [] of
_ -> False
blockCodeContainer _ = False
bodyPartToBlocks :: ReaderOptions -> Docx -> BodyPart -> [Block]
bodyPartToBlocks _ _ (Paragraph pPr parparts)
bodyPartToBlocks :: BodyPart -> DocxContext [Block]
bodyPartToBlocks (Paragraph pPr parparts)
| any blockCodeContainer (parStyleToContainers pPr) =
let
otherConts = filter (not . blockCodeContainer) (parStyleToContainers pPr)
in
return $
rebuild
otherConts
[CodeBlock ("", [], []) (concatMap parPartToString parparts)]
bodyPartToBlocks opts docx (Paragraph pPr parparts) =
case parPartsToInlines opts docx parparts of
[] ->
[]
_ ->
let parContents = parPartsToInlines opts docx parparts
trimmedContents = reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) parContents
in
bodyPartToBlocks (Paragraph pPr parparts) = do
ils <- parPartsToInlines parparts
case ils of
[] -> return []
_ -> do
parContents <- parPartsToInlines parparts
let trimmedContents = reverse $
dropWhile (Space ==) $
reverse $
dropWhile (Space ==) parContents
return $
rebuild
(parStyleToContainers pPr)
[Para trimmedContents]
bodyPartToBlocks opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parparts) =
bodyPartToBlocks (ListItem pPr numId lvl parparts) = do
(Docx _ _ numbering _ _) <- asks docxDocument
let
kvs = case lookupLevel numId lvl numbering of
Just (_, fmt, txt, Just start) -> [ ("level", lvl)
@ -349,23 +375,22 @@ bodyPartToBlocks opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parp
, ("text", txt)
]
Nothing -> []
in
[Div
("", ["list-item"], kvs)
(bodyPartToBlocks opts docx (Paragraph pPr parparts))]
bodyPartToBlocks _ _ (Tbl _ _ _ []) =
[Para []]
bodyPartToBlocks opts docx (Tbl cap _ look (r:rs)) =
blks <- bodyPartToBlocks (Paragraph pPr parparts)
return $ [Div ("", ["list-item"], kvs) blks]
bodyPartToBlocks (Tbl _ _ _ []) =
return [Para []]
bodyPartToBlocks (Tbl cap _ look (r:rs)) = do
let caption = strToInlines cap
(hdr, rows) = case firstRowFormatting look of
True -> (Just r, rs)
False -> (Nothing, r:rs)
hdrCells = case hdr of
Just r' -> rowToBlocksList opts docx r'
Nothing -> []
cells = map (rowToBlocksList opts docx) rows
hdrCells <- case hdr of
Just r' -> rowToBlocksList r'
Nothing -> return []
cells <- mapM rowToBlocksList rows
size = case null hdrCells of
let size = case null hdrCells of
True -> length $ head cells
False -> length $ hdrCells
--
@ -374,34 +399,42 @@ bodyPartToBlocks opts docx (Tbl cap _ look (r:rs)) =
-- moment. Width information is in the TblGrid field of the Tbl,
-- so should be possible. Alignment might be more difficult,
-- since there doesn't seem to be a column entity in docx.
alignments = take size (repeat AlignDefault)
widths = take size (repeat 0) :: [Double]
in
[Table caption alignments widths hdrCells cells]
alignments = replicate size AlignDefault
widths = replicate size 0 :: [Double]
return [Table caption alignments widths hdrCells cells]
makeImagesSelfContained :: Docx -> Inline -> Inline
makeImagesSelfContained (Docx _ _ _ _ media) i@(Image alt (uri, title)) =
case lookup uri media of
Just bs -> case getMimeType uri of
Just mime -> let data_uri =
"data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs)
in
Image alt (data_uri, title)
Nothing -> i
makeImagesSelfContained :: Inline -> DocxContext Inline
makeImagesSelfContained i@(Image alt (uri, title)) = do
(Docx _ _ _ _ media) <- asks docxDocument
return $ case lookup uri media of
Just bs ->
case getMimeType uri of
Just mime ->
let data_uri = "data:" ++ mime ++ ";base64," ++
toString (encode $ BS.concat $ B.toChunks bs)
in
Image alt (data_uri, title)
Nothing -> i
Nothing -> i
makeImagesSelfContained _ inline = inline
makeImagesSelfContained inline = return inline
bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block]
bodyToBlocks opts docx (Body bps) =
map (makeHeaderAnchors) $
blocksToDefinitions $
blocksToBullets $
concatMap (bodyPartToBlocks opts docx) bps
bodyToBlocks :: Body -> DocxContext [Block]
bodyToBlocks (Body bps) = do
blks <- concatMapM bodyPartToBlocks bps
return $
map (makeHeaderAnchors) $
blocksToDefinitions $
blocksToBullets $ blks
docxToBlocks :: ReaderOptions -> Docx -> [Block]
docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = bodyToBlocks opts d body
docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) =
let dState = DState { docxHdrLinks = M.empty }
dEnv = DEnv { docxOptions = opts
, docxDocument = d}
in
evalDocxContext (bodyToBlocks body) dEnv dState
ilToCode :: Inline -> String
ilToCode (Str s) = s