Docx Reader: A nicer Docx type.

This modifies the Docx type in the parser to avoid all the extra files
(Notes, numbering, etc). A reader monad keeps track of these, and applies
them at the end. The reader monad is stacked with ErrorT to enable better
error-handling than the old Maybes. (Note that the better error handling
isn't really there yet, but it is now possible.)

One long-term goal of these changes is to make it easier to write the Docx
type. This should make it easier to develop a standalone docx package in the
future.
This commit is contained in:
Jesse Rosenthal 2014-07-08 13:22:20 -04:00
parent 7d6da118d3
commit d65fd58171
2 changed files with 713 additions and 752 deletions

View file

@ -91,7 +91,6 @@ import Data.List (delete, isPrefixOf, (\\), intercalate)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as B
import Data.ByteString.Base64 (encode)
import System.FilePath (combine)
import qualified Data.Map as M
import Control.Monad.Reader
import Control.Monad.State
@ -102,8 +101,8 @@ readDocx :: ReaderOptions
-> Pandoc
readDocx opts bytes =
case archiveToDocx (toArchive bytes) of
Just docx -> Pandoc nullMeta (docxToBlocks opts docx)
Nothing -> error $ "couldn't parse docx file"
Right docx -> Pandoc nullMeta (docxToBlocks opts docx)
Left _ -> error $ "couldn't parse docx file"
data DState = DState { docxAnchorMap :: M.Map String String
, docxInTexSubscript :: Bool }
@ -159,7 +158,7 @@ runStyleToContainers rPr =
, if isStrike rPr then (Just Strikeout) else Nothing
, if isSuperScript rPr then (Just Superscript) else Nothing
, if isSubScript rPr then (Just Subscript) else Nothing
, underline rPr >>= (\f -> Just $ Span ("", [], [("underline", f)]))
, rUnderline rPr >>= (\f -> Just $ Span ("", [], [("underline", f)]))
]
in
classContainers ++ formatters
@ -259,20 +258,17 @@ runToInlines (Run rs runElems)
| otherwise =
return $
rebuild (runStyleToContainers rs) (concatMap runElemToInlines runElems)
runToInlines (Footnote fnId) = do
(Docx _ notes _ _ _ ) <- asks docxDocument
case (getFootNote fnId notes) of
Just bodyParts -> do
blks <- concatMapM bodyPartToBlocks bodyParts
return $ [Note blks]
Nothing -> return [Note []]
runToInlines (Endnote fnId) = do
(Docx _ notes _ _ _ ) <- asks docxDocument
case (getEndNote fnId notes) of
Just bodyParts -> do
blks <- concatMapM bodyPartToBlocks bodyParts
return $ [Note blks]
Nothing -> return [Note []]
runToInlines (Footnote bps) =
concatMapM bodyPartToBlocks bps >>= (\blks -> return [Note blks])
runToInlines (Endnote bps) =
concatMapM bodyPartToBlocks bps >>= (\blks -> return [Note blks])
makeDataUrl :: String -> B.ByteString -> Maybe String
makeDataUrl fp bs =
case getMimeType fp of
Just mime -> Just $ "data:" ++ mime ++ ";base64," ++
toString (encode $ BS.concat $ B.toChunks bs)
Nothing -> Nothing
parPartToInlines :: ParPart -> DocxContext [Inline]
parPartToInlines (PlainRun r) = runToInlines r
@ -313,22 +309,18 @@ parPartToInlines (BookMark _ anchor) =
False -> anchor
updateDState $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap}
return [Span (anchor, ["anchor"], []) []]
parPartToInlines (Drawing relid) = do
(Docx _ _ _ rels _) <- asks docxDocument
return $ case lookupRelationship relid rels of
Just target -> [Image [] (combine "word" target, "")]
Nothing -> [Image [] ("", "")]
parPartToInlines (Drawing fp bs) = do
return $ case True of -- TODO: add self-contained images
True -> [Image [] (fp, "")]
False -> case makeDataUrl fp bs of
Just d -> [Image [] (d, "")]
Nothing -> [Image [] ("", "")]
parPartToInlines (InternalHyperLink anchor runs) = do
ils <- concatMapM runToInlines runs
return [Link ils ('#' : anchor, "")]
parPartToInlines (ExternalHyperLink relid runs) = do
(Docx _ _ _ rels _) <- asks docxDocument
rs <- concatMapM runToInlines runs
return $ case lookupRelationship relid rels of
Just target ->
[Link rs (target, "")]
Nothing ->
[Link rs ("", "")]
parPartToInlines (ExternalHyperLink target runs) = do
ils <- concatMapM runToInlines runs
return [Link ils (target, "")]
parPartToInlines (PlainOMath omath) = do
s <- oMathToTexString omath
return [Math InlineMath s]
@ -450,6 +442,9 @@ oMathElemToTexString (NAry _ sub sup base) = do
baseString <- baseToTexString base
return $ printf "\\int_{%s}^{%s}{%s}"
subString supString baseString
oMathElemToTexString (Phantom base) = do
baseString <- baseToTexString base
return $ printf "\\phantom{%s}" baseString
oMathElemToTexString (Radical degree base) = do
degString <- concatMapM oMathElemToTexString degree
baseString <- baseToTexString base
@ -475,7 +470,6 @@ oMathElemToTexString (Super base sup) = do
supString <- concatMapM oMathElemToTexString sup
return $ printf "%s^{%s}" baseString supString
oMathElemToTexString (OMathRun _ run) = return $ stringToTex $ runToString run
oMathElemToTexString _ = return "[NOT IMPLEMENTED]"
baseToTexString :: Base -> DocxContext String
baseToTexString (Base mathElems) =
@ -518,9 +512,7 @@ makeHeaderAnchor blk = return blk
parPartsToInlines :: [ParPart] -> DocxContext [Inline]
parPartsToInlines parparts = do
ils <- concatMapM parPartToInlines parparts >>=
-- TODO: Option for self-containted images
(if False then (walkM makeImagesSelfContained) else return)
ils <- concatMapM parPartToInlines parparts
return $ reduceList $ ils
cellToBlocks :: Cell -> DocxContext [Block]
@ -563,23 +555,21 @@ bodyPartToBlocks (Paragraph pPr parparts) = do
rebuild
(parStyleToContainers pPr)
[Para ils]
bodyPartToBlocks (ListItem pPr numId lvl parparts) = do
(Docx _ _ numbering _ _) <- asks docxDocument
bodyPartToBlocks (ListItem pPr numId lvl levelInfo parparts) = do
let
kvs = case lookupLevel numId lvl numbering of
Just (_, fmt, txt, Just start) -> [ ("level", lvl)
, ("num-id", numId)
, ("format", fmt)
, ("text", txt)
, ("start", (show start))
]
kvs = case levelInfo of
(_, fmt, txt, Just start) -> [ ("level", lvl)
, ("num-id", numId)
, ("format", fmt)
, ("text", txt)
, ("start", (show start))
]
Just (_, fmt, txt, Nothing) -> [ ("level", lvl)
, ("num-id", numId)
, ("format", fmt)
, ("text", txt)
]
Nothing -> []
(_, fmt, txt, Nothing) -> [ ("level", lvl)
, ("num-id", numId)
, ("format", fmt)
, ("text", txt)
]
blks <- bodyPartToBlocks (Paragraph pPr parparts)
return $ [Div ("", ["list-item"], kvs) blks]
bodyPartToBlocks (Tbl _ _ _ []) =
@ -622,20 +612,6 @@ rewriteLink l@(Link ils ('#':target, title)) = do
Nothing -> l
rewriteLink il = return il
makeImagesSelfContained :: Inline -> DocxContext Inline
makeImagesSelfContained i@(Image alt (uri, title)) = do
(Docx _ _ _ _ media) <- asks docxDocument
return $ case lookup uri media of
Just bs ->
case getMimeType uri of
Just mime ->
let data_uri = "data:" ++ mime ++ ";base64," ++
toString (encode $ BS.concat $ B.toChunks bs)
in
Image alt (data_uri, title)
Nothing -> i
Nothing -> i
makeImagesSelfContained inline = return inline
bodyToBlocks :: Body -> DocxContext [Block]
bodyToBlocks (Body bps) = do
@ -646,7 +622,7 @@ bodyToBlocks (Body bps) = do
blocksToBullets $ blks
docxToBlocks :: ReaderOptions -> Docx -> [Block]
docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) =
docxToBlocks opts d@(Docx (Document _ body)) =
let dState = DState { docxAnchorMap = M.empty
, docxInTexSubscript = False}
dEnv = DEnv { docxOptions = opts

File diff suppressed because it is too large Load diff