Move Docx reader to DocxContext monad
This is a ReaderT State stack, which keeps track of some environment info, such as the options and the docx doc. The state will come in handy in the future, for a couple of planned features (rewriting the section anchors as auto_idents, and hopefully smart-quoting).
This commit is contained in:
parent
b1a8f1fa1a
commit
4248f25152
1 changed files with 135 additions and 102 deletions
|
@ -88,6 +88,9 @@ import qualified Data.ByteString as BS
|
|||
import qualified Data.ByteString.Lazy as B
|
||||
import Data.ByteString.Base64 (encode)
|
||||
import System.FilePath (combine)
|
||||
import qualified Data.Map as M
|
||||
import Control.Monad.Reader
|
||||
import Control.Monad.State
|
||||
|
||||
readDocx :: ReaderOptions
|
||||
-> B.ByteString
|
||||
|
@ -97,11 +100,24 @@ readDocx opts bytes =
|
|||
Just docx -> Pandoc nullMeta (docxToBlocks opts docx)
|
||||
Nothing -> error $ "couldn't parse docx file"
|
||||
|
||||
|
||||
data DState = DState { docxHdrLinks :: M.Map String String }
|
||||
|
||||
data DEnv = DEnv { docxOptions :: ReaderOptions
|
||||
, docxDocument :: Docx}
|
||||
|
||||
type DocxContext = ReaderT DEnv (State DState)
|
||||
|
||||
evalDocxContext :: DocxContext a -> DEnv -> DState -> a
|
||||
evalDocxContext ctx env st = evalState (runReaderT ctx env) st
|
||||
|
||||
concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b]
|
||||
concatMapM f xs = liftM concat (mapM f xs)
|
||||
|
||||
-- This is empty, but we put it in for future-proofing.
|
||||
spansToKeep :: [String]
|
||||
spansToKeep = []
|
||||
|
||||
|
||||
-- This is empty, but we put it in for future-proofing.
|
||||
divsToKeep :: [String]
|
||||
divsToKeep = ["list-item", "Definition", "DefinitionTerm"]
|
||||
|
||||
|
@ -213,57 +229,69 @@ inlineCodeContainer (Container f) = case f [] of
|
|||
inlineCodeContainer _ = False
|
||||
|
||||
|
||||
runToInlines :: ReaderOptions -> Docx -> Run -> [Inline]
|
||||
runToInlines _ _ (Run rs runElems)
|
||||
runToInlines :: Run -> DocxContext [Inline]
|
||||
runToInlines (Run rs runElems)
|
||||
| any inlineCodeContainer (runStyleToContainers rs) =
|
||||
return $
|
||||
rebuild (runStyleToContainers rs) $ [Str $ runElemsToString runElems]
|
||||
| otherwise =
|
||||
return $
|
||||
rebuild (runStyleToContainers rs) (concatMap runElemToInlines runElems)
|
||||
runToInlines opts docx@(Docx _ notes _ _ _ ) (Footnote fnId) =
|
||||
runToInlines (Footnote fnId) = do
|
||||
(Docx _ notes _ _ _ ) <- asks docxDocument
|
||||
case (getFootNote fnId notes) of
|
||||
Just bodyParts ->
|
||||
[Note (concatMap (bodyPartToBlocks opts docx) bodyParts)]
|
||||
Nothing ->
|
||||
[Note []]
|
||||
runToInlines opts docx@(Docx _ notes _ _ _) (Endnote fnId) =
|
||||
Just bodyParts -> do
|
||||
blks <- concatMapM bodyPartToBlocks bodyParts
|
||||
return $ [Note blks]
|
||||
Nothing -> return [Note []]
|
||||
runToInlines (Endnote fnId) = do
|
||||
(Docx _ notes _ _ _ ) <- asks docxDocument
|
||||
case (getEndNote fnId notes) of
|
||||
Just bodyParts ->
|
||||
[Note (concatMap (bodyPartToBlocks opts docx) bodyParts)]
|
||||
Nothing ->
|
||||
[Note []]
|
||||
Just bodyParts -> do
|
||||
blks <- concatMapM bodyPartToBlocks bodyParts
|
||||
return $ [Note blks]
|
||||
Nothing -> return [Note []]
|
||||
|
||||
parPartToInlines :: ReaderOptions -> Docx -> ParPart -> [Inline]
|
||||
parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r
|
||||
parPartToInlines opts docx (Insertion _ author date runs) =
|
||||
parPartToInlines :: ParPart -> DocxContext [Inline]
|
||||
parPartToInlines (PlainRun r) = runToInlines r
|
||||
parPartToInlines (Insertion _ author date runs) = do
|
||||
opts <- asks docxOptions
|
||||
case readerTrackChanges opts of
|
||||
AcceptChanges -> concatMap (runToInlines opts docx) runs
|
||||
RejectChanges -> []
|
||||
AllChanges ->
|
||||
[Span
|
||||
("", ["insertion"], [("author", author), ("date", date)])
|
||||
(concatMap (runToInlines opts docx) runs)]
|
||||
parPartToInlines opts docx (Deletion _ author date runs) =
|
||||
AcceptChanges -> concatMapM runToInlines runs >>= return
|
||||
RejectChanges -> return []
|
||||
AllChanges -> do
|
||||
ils <- (concatMapM runToInlines runs)
|
||||
return [Span
|
||||
("", ["insertion"], [("author", author), ("date", date)])
|
||||
ils]
|
||||
parPartToInlines (Deletion _ author date runs) = do
|
||||
opts <- asks docxOptions
|
||||
case readerTrackChanges opts of
|
||||
AcceptChanges -> []
|
||||
RejectChanges -> concatMap (runToInlines opts docx) runs
|
||||
AllChanges ->
|
||||
[Span
|
||||
("", ["deletion"], [("author", author), ("date", date)])
|
||||
(concatMap (runToInlines opts docx) runs)]
|
||||
parPartToInlines _ _ (BookMark _ anchor) | anchor `elem` dummyAnchors = []
|
||||
parPartToInlines _ _ (BookMark _ anchor) = [Span (anchor, ["anchor"], []) []]
|
||||
parPartToInlines _ (Docx _ _ _ rels _) (Drawing relid) =
|
||||
case lookupRelationship relid rels of
|
||||
AcceptChanges -> return []
|
||||
RejectChanges -> concatMapM runToInlines runs >>= return
|
||||
AllChanges -> do
|
||||
ils <- concatMapM runToInlines runs
|
||||
return [Span
|
||||
("", ["deletion"], [("author", author), ("date", date)])
|
||||
ils]
|
||||
parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = return []
|
||||
parPartToInlines (BookMark _ anchor) = return [Span (anchor, ["anchor"], []) []]
|
||||
parPartToInlines (Drawing relid) = do
|
||||
(Docx _ _ _ rels _) <- asks docxDocument
|
||||
return $ case lookupRelationship relid rels of
|
||||
Just target -> [Image [] (combine "word" target, "")]
|
||||
Nothing -> [Image [] ("", "")]
|
||||
parPartToInlines opts docx (InternalHyperLink anchor runs) =
|
||||
[Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")]
|
||||
parPartToInlines opts docx@(Docx _ _ _ rels _) (ExternalHyperLink relid runs) =
|
||||
case lookupRelationship relid rels of
|
||||
parPartToInlines (InternalHyperLink anchor runs) = do
|
||||
ils <- concatMapM runToInlines runs
|
||||
return [Link ils ('#' : anchor, "")]
|
||||
parPartToInlines (ExternalHyperLink relid runs) = do
|
||||
(Docx _ _ _ rels _) <- asks docxDocument
|
||||
rs <- concatMapM runToInlines runs
|
||||
return $ case lookupRelationship relid rels of
|
||||
Just target ->
|
||||
[Link (concatMap (runToInlines opts docx) runs) (target, "")]
|
||||
[Link rs (target, "")]
|
||||
Nothing ->
|
||||
[Link (concatMap (runToInlines opts docx) runs) ("", "")]
|
||||
[Link rs ("", "")]
|
||||
|
||||
isAnchorSpan :: Inline -> Bool
|
||||
isAnchorSpan (Span (ident, classes, kvs) ils) =
|
||||
|
@ -287,25 +315,18 @@ makeHeaderAnchors h@(Header n (_, classes, kvs) ils) =
|
|||
_ -> h
|
||||
makeHeaderAnchors blk = blk
|
||||
|
||||
parPartsToInlines :: ReaderOptions -> Docx -> [ParPart] -> [Inline]
|
||||
parPartsToInlines opts docx parparts =
|
||||
--
|
||||
-- We're going to skip data-uri's for now. It should be an option,
|
||||
-- not mandatory.
|
||||
--
|
||||
(if False -- TODO depend on option
|
||||
then walk (makeImagesSelfContained docx)
|
||||
else id) $
|
||||
-- bottomUp spanTrim $
|
||||
-- bottomUp spanCorrect $
|
||||
-- bottomUp spanReduce $
|
||||
reduceList $ concatMap (parPartToInlines opts docx) parparts
|
||||
parPartsToInlines :: [ParPart] -> DocxContext [Inline]
|
||||
parPartsToInlines parparts = do
|
||||
ils <- concatMapM parPartToInlines parparts >>=
|
||||
-- TODO: Option for self-containted images
|
||||
(if False then (walkM makeImagesSelfContained) else return)
|
||||
return $ reduceList $ ils
|
||||
|
||||
cellToBlocks :: ReaderOptions -> Docx -> Cell -> [Block]
|
||||
cellToBlocks opts docx (Cell bps) = concatMap (bodyPartToBlocks opts docx) bps
|
||||
cellToBlocks :: Cell -> DocxContext [Block]
|
||||
cellToBlocks (Cell bps) = concatMapM bodyPartToBlocks bps
|
||||
|
||||
rowToBlocksList :: ReaderOptions -> Docx -> Row -> [[Block]]
|
||||
rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells
|
||||
rowToBlocksList :: Row -> DocxContext [[Block]]
|
||||
rowToBlocksList (Row cells) = mapM cellToBlocks cells
|
||||
|
||||
blockCodeContainer :: Container Block -> Bool
|
||||
blockCodeContainer (Container f) = case f [] of
|
||||
|
@ -313,27 +334,32 @@ blockCodeContainer (Container f) = case f [] of
|
|||
_ -> False
|
||||
blockCodeContainer _ = False
|
||||
|
||||
bodyPartToBlocks :: ReaderOptions -> Docx -> BodyPart -> [Block]
|
||||
bodyPartToBlocks _ _ (Paragraph pPr parparts)
|
||||
bodyPartToBlocks :: BodyPart -> DocxContext [Block]
|
||||
bodyPartToBlocks (Paragraph pPr parparts)
|
||||
| any blockCodeContainer (parStyleToContainers pPr) =
|
||||
let
|
||||
otherConts = filter (not . blockCodeContainer) (parStyleToContainers pPr)
|
||||
in
|
||||
return $
|
||||
rebuild
|
||||
otherConts
|
||||
[CodeBlock ("", [], []) (concatMap parPartToString parparts)]
|
||||
bodyPartToBlocks opts docx (Paragraph pPr parparts) =
|
||||
case parPartsToInlines opts docx parparts of
|
||||
[] ->
|
||||
[]
|
||||
_ ->
|
||||
let parContents = parPartsToInlines opts docx parparts
|
||||
trimmedContents = reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) parContents
|
||||
in
|
||||
bodyPartToBlocks (Paragraph pPr parparts) = do
|
||||
ils <- parPartsToInlines parparts
|
||||
case ils of
|
||||
[] -> return []
|
||||
_ -> do
|
||||
parContents <- parPartsToInlines parparts
|
||||
let trimmedContents = reverse $
|
||||
dropWhile (Space ==) $
|
||||
reverse $
|
||||
dropWhile (Space ==) parContents
|
||||
return $
|
||||
rebuild
|
||||
(parStyleToContainers pPr)
|
||||
[Para trimmedContents]
|
||||
bodyPartToBlocks opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parparts) =
|
||||
bodyPartToBlocks (ListItem pPr numId lvl parparts) = do
|
||||
(Docx _ _ numbering _ _) <- asks docxDocument
|
||||
let
|
||||
kvs = case lookupLevel numId lvl numbering of
|
||||
Just (_, fmt, txt, Just start) -> [ ("level", lvl)
|
||||
|
@ -349,23 +375,22 @@ bodyPartToBlocks opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parp
|
|||
, ("text", txt)
|
||||
]
|
||||
Nothing -> []
|
||||
in
|
||||
[Div
|
||||
("", ["list-item"], kvs)
|
||||
(bodyPartToBlocks opts docx (Paragraph pPr parparts))]
|
||||
bodyPartToBlocks _ _ (Tbl _ _ _ []) =
|
||||
[Para []]
|
||||
bodyPartToBlocks opts docx (Tbl cap _ look (r:rs)) =
|
||||
blks <- bodyPartToBlocks (Paragraph pPr parparts)
|
||||
return $ [Div ("", ["list-item"], kvs) blks]
|
||||
bodyPartToBlocks (Tbl _ _ _ []) =
|
||||
return [Para []]
|
||||
bodyPartToBlocks (Tbl cap _ look (r:rs)) = do
|
||||
let caption = strToInlines cap
|
||||
(hdr, rows) = case firstRowFormatting look of
|
||||
True -> (Just r, rs)
|
||||
False -> (Nothing, r:rs)
|
||||
hdrCells = case hdr of
|
||||
Just r' -> rowToBlocksList opts docx r'
|
||||
Nothing -> []
|
||||
cells = map (rowToBlocksList opts docx) rows
|
||||
hdrCells <- case hdr of
|
||||
Just r' -> rowToBlocksList r'
|
||||
Nothing -> return []
|
||||
|
||||
cells <- mapM rowToBlocksList rows
|
||||
|
||||
size = case null hdrCells of
|
||||
let size = case null hdrCells of
|
||||
True -> length $ head cells
|
||||
False -> length $ hdrCells
|
||||
--
|
||||
|
@ -374,34 +399,42 @@ bodyPartToBlocks opts docx (Tbl cap _ look (r:rs)) =
|
|||
-- moment. Width information is in the TblGrid field of the Tbl,
|
||||
-- so should be possible. Alignment might be more difficult,
|
||||
-- since there doesn't seem to be a column entity in docx.
|
||||
alignments = take size (repeat AlignDefault)
|
||||
widths = take size (repeat 0) :: [Double]
|
||||
in
|
||||
[Table caption alignments widths hdrCells cells]
|
||||
alignments = replicate size AlignDefault
|
||||
widths = replicate size 0 :: [Double]
|
||||
|
||||
return [Table caption alignments widths hdrCells cells]
|
||||
|
||||
|
||||
makeImagesSelfContained :: Docx -> Inline -> Inline
|
||||
makeImagesSelfContained (Docx _ _ _ _ media) i@(Image alt (uri, title)) =
|
||||
case lookup uri media of
|
||||
Just bs -> case getMimeType uri of
|
||||
Just mime -> let data_uri =
|
||||
"data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs)
|
||||
in
|
||||
Image alt (data_uri, title)
|
||||
Nothing -> i
|
||||
makeImagesSelfContained :: Inline -> DocxContext Inline
|
||||
makeImagesSelfContained i@(Image alt (uri, title)) = do
|
||||
(Docx _ _ _ _ media) <- asks docxDocument
|
||||
return $ case lookup uri media of
|
||||
Just bs ->
|
||||
case getMimeType uri of
|
||||
Just mime ->
|
||||
let data_uri = "data:" ++ mime ++ ";base64," ++
|
||||
toString (encode $ BS.concat $ B.toChunks bs)
|
||||
in
|
||||
Image alt (data_uri, title)
|
||||
Nothing -> i
|
||||
Nothing -> i
|
||||
makeImagesSelfContained _ inline = inline
|
||||
makeImagesSelfContained inline = return inline
|
||||
|
||||
bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block]
|
||||
bodyToBlocks opts docx (Body bps) =
|
||||
map (makeHeaderAnchors) $
|
||||
blocksToDefinitions $
|
||||
blocksToBullets $
|
||||
concatMap (bodyPartToBlocks opts docx) bps
|
||||
bodyToBlocks :: Body -> DocxContext [Block]
|
||||
bodyToBlocks (Body bps) = do
|
||||
blks <- concatMapM bodyPartToBlocks bps
|
||||
return $
|
||||
map (makeHeaderAnchors) $
|
||||
blocksToDefinitions $
|
||||
blocksToBullets $ blks
|
||||
|
||||
docxToBlocks :: ReaderOptions -> Docx -> [Block]
|
||||
docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = bodyToBlocks opts d body
|
||||
|
||||
docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) =
|
||||
let dState = DState { docxHdrLinks = M.empty }
|
||||
dEnv = DEnv { docxOptions = opts
|
||||
, docxDocument = d}
|
||||
in
|
||||
evalDocxContext (bodyToBlocks body) dEnv dState
|
||||
|
||||
ilToCode :: Inline -> String
|
||||
ilToCode (Str s) = s
|
||||
|
|
Loading…
Add table
Reference in a new issue