pandoc/src/Text/Pandoc/Readers/Docx.hs
John MacFarlane 9849ba7fd7 Use Control.Monad.State.Strict throughout.
This gives 20-30% speedup and reduction of memory
usage in most of the writers.
2017-06-17 07:45:28 +02:00

620 lines
22 KiB
Haskell

{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PatternGuards #-}
{-
Copyright (C) 2014-2017 Jesse Rosenthal <jrosenthal@jhu.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.Docx
Copyright : Copyright (C) 2014-2017 Jesse Rosenthal
License : GNU GPL, version 2 or above
Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
Stability : alpha
Portability : portable
Conversion of Docx type (defined in Text.Pandoc.Readers.Docx.Parse)
to 'Pandoc' document. -}
{-
Current state of implementation of Docx entities ([x] means
implemented, [-] means partially implemented):
* Blocks
- [X] Para
- [X] CodeBlock (styled with `SourceCode`)
- [X] BlockQuote (styled with `Quote`, `BlockQuote`, or, optionally,
indented)
- [X] OrderedList
- [X] BulletList
- [X] DefinitionList (styled with adjacent `DefinitionTerm` and `Definition`)
- [X] Header (styled with `Heading#`)
- [ ] HorizontalRule
- [-] Table (column widths and alignments not yet implemented)
* Inlines
- [X] Str
- [X] Emph (italics and underline both read as Emph)
- [X] Strong
- [X] Strikeout
- [X] Superscript
- [X] Subscript
- [X] SmallCaps
- [ ] Quoted
- [ ] Cite
- [X] Code (styled with `VerbatimChar`)
- [X] Space
- [X] LineBreak (these are invisible in Word: entered with Shift-Return)
- [X] Math
- [X] Link (links to an arbitrary bookmark create a span with the target as
id and "anchor" class)
- [X] Image
- [X] Note (Footnotes and Endnotes are silently combined.)
-}
module Text.Pandoc.Readers.Docx
( readDocx
) where
import Codec.Archive.Zip
import Control.Monad.Reader
import Control.Monad.State.Strict
import qualified Data.ByteString.Lazy as B
import Data.Default (Default)
import Data.List (delete, intersect)
import qualified Data.Map as M
import Data.Sequence (ViewL (..), viewl)
import qualified Data.Sequence as Seq
import qualified Data.Set as Set
import Text.Pandoc.Builder
-- import Text.Pandoc.Definition
import Text.Pandoc.MediaBag (MediaBag)
import Text.Pandoc.Options
import Text.Pandoc.Readers.Docx.Combine
import Text.Pandoc.Readers.Docx.Lists
import Text.Pandoc.Readers.Docx.Parse
import Text.Pandoc.Shared
import Text.Pandoc.Walk
import Text.TeXMath (writeTeX)
#if !(MIN_VERSION_base(4,8,0))
import Data.Traversable (traverse)
#endif
import Control.Monad.Except (throwError)
import Text.Pandoc.Class (PandocMonad)
import qualified Text.Pandoc.Class as P
import Text.Pandoc.Error
import Text.Pandoc.Logging
readDocx :: PandocMonad m
=> ReaderOptions
-> B.ByteString
-> m Pandoc
readDocx opts bytes
| Right archive <- toArchiveOrFail bytes
, Right (docx, parserWarnings) <- archiveToDocxWithWarnings archive = do
mapM_ (P.report . DocxParserWarning) parserWarnings
(meta, blks) <- docxToOutput opts docx
return $ Pandoc meta blks
readDocx _ _ =
throwError $ PandocSomeError "couldn't parse docx file"
data DState = DState { docxAnchorMap :: M.Map String String
, docxMediaBag :: MediaBag
, docxDropCap :: Inlines
, docxWarnings :: [String]
}
instance Default DState where
def = DState { docxAnchorMap = M.empty
, docxMediaBag = mempty
, docxDropCap = mempty
, docxWarnings = []
}
data DEnv = DEnv { docxOptions :: ReaderOptions
, docxInHeaderBlock :: Bool }
instance Default DEnv where
def = DEnv def False
type DocxContext m = ReaderT DEnv (StateT DState m)
evalDocxContext :: PandocMonad m => DocxContext m a -> DEnv -> DState -> m a
evalDocxContext ctx env st = flip evalStateT st $ flip runReaderT env $ ctx
-- This is empty, but we put it in for future-proofing.
spansToKeep :: [String]
spansToKeep = []
divsToKeep :: [String]
divsToKeep = ["list-item", "Definition", "DefinitionTerm"]
metaStyles :: M.Map String String
metaStyles = M.fromList [ ("Title", "title")
, ("Subtitle", "subtitle")
, ("Author", "author")
, ("Date", "date")
, ("Abstract", "abstract")]
sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart])
sepBodyParts = span (\bp -> (isMetaPar bp || isEmptyPar bp))
isMetaPar :: BodyPart -> Bool
isMetaPar (Paragraph pPr _) =
not $ null $ intersect (pStyle pPr) (M.keys metaStyles)
isMetaPar _ = False
isEmptyPar :: BodyPart -> Bool
isEmptyPar (Paragraph _ parParts) =
all isEmptyParPart parParts
where
isEmptyParPart (PlainRun (Run _ runElems)) = all isEmptyElem runElems
isEmptyParPart _ = False
isEmptyElem (TextRun s) = trim s == ""
isEmptyElem _ = True
isEmptyPar _ = False
bodyPartsToMeta' :: PandocMonad m => [BodyPart] -> DocxContext m (M.Map String MetaValue)
bodyPartsToMeta' [] = return M.empty
bodyPartsToMeta' (bp : bps)
| (Paragraph pPr parParts) <- bp
, (c : _)<- intersect (pStyle pPr) (M.keys metaStyles)
, (Just metaField) <- M.lookup c metaStyles = do
inlines <- smushInlines <$> mapM parPartToInlines parParts
remaining <- bodyPartsToMeta' bps
let
f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils']
f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks ((Para ils) : blks)
f m (MetaList mv) = MetaList (m : mv)
f m n = MetaList [m, n]
return $ M.insertWith f metaField (MetaInlines (toList inlines)) remaining
bodyPartsToMeta' (_ : bps) = bodyPartsToMeta' bps
bodyPartsToMeta :: PandocMonad m => [BodyPart] -> DocxContext m Meta
bodyPartsToMeta bps = do
mp <- bodyPartsToMeta' bps
let mp' =
case M.lookup "author" mp of
Just mv -> M.insert "author" (fixAuthors mv) mp
Nothing -> mp
return $ Meta mp'
fixAuthors :: MetaValue -> MetaValue
fixAuthors (MetaBlocks blks) =
MetaList $ map g $ filter f blks
where f (Para _) = True
f _ = False
g (Para ils) = MetaInlines ils
g _ = MetaInlines []
fixAuthors mv = mv
codeStyles :: [String]
codeStyles = ["VerbatimChar"]
codeDivs :: [String]
codeDivs = ["SourceCode"]
runElemToInlines :: RunElem -> Inlines
runElemToInlines (TextRun s) = text s
runElemToInlines (LnBrk) = linebreak
runElemToInlines (Tab) = space
runElemToInlines (SoftHyphen) = text "\xad"
runElemToInlines (NoBreakHyphen) = text "\x2011"
runElemToString :: RunElem -> String
runElemToString (TextRun s) = s
runElemToString (LnBrk) = ['\n']
runElemToString (Tab) = ['\t']
runElemToString (SoftHyphen) = ['\xad']
runElemToString (NoBreakHyphen) = ['\x2011']
runToString :: Run -> String
runToString (Run _ runElems) = concatMap runElemToString runElems
runToString _ = ""
parPartToString :: ParPart -> String
parPartToString (PlainRun run) = runToString run
parPartToString (InternalHyperLink _ runs) = concatMap runToString runs
parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs
parPartToString _ = ""
blacklistedCharStyles :: [String]
blacklistedCharStyles = ["Hyperlink"]
resolveDependentRunStyle :: RunStyle -> RunStyle
resolveDependentRunStyle rPr
| Just (s, _) <- rStyle rPr, s `elem` blacklistedCharStyles =
rPr
| Just (_, cs) <- rStyle rPr =
let rPr' = resolveDependentRunStyle cs
in
RunStyle { isBold = case isBold rPr of
Just bool -> Just bool
Nothing -> isBold rPr'
, isItalic = case isItalic rPr of
Just bool -> Just bool
Nothing -> isItalic rPr'
, isSmallCaps = case isSmallCaps rPr of
Just bool -> Just bool
Nothing -> isSmallCaps rPr'
, isStrike = case isStrike rPr of
Just bool -> Just bool
Nothing -> isStrike rPr'
, rVertAlign = case rVertAlign rPr of
Just valign -> Just valign
Nothing -> rVertAlign rPr'
, rUnderline = case rUnderline rPr of
Just ulstyle -> Just ulstyle
Nothing -> rUnderline rPr'
, rStyle = rStyle rPr }
| otherwise = rPr
runStyleToTransform :: RunStyle -> (Inlines -> Inlines)
runStyleToTransform rPr
| Just (s, _) <- rStyle rPr
, s `elem` spansToKeep =
let rPr' = rPr{rStyle = Nothing}
in
(spanWith ("", [s], [])) . (runStyleToTransform rPr')
| Just True <- isItalic rPr =
emph . (runStyleToTransform rPr {isItalic = Nothing})
| Just True <- isBold rPr =
strong . (runStyleToTransform rPr {isBold = Nothing})
| Just True <- isSmallCaps rPr =
smallcaps . (runStyleToTransform rPr {isSmallCaps = Nothing})
| Just True <- isStrike rPr =
strikeout . (runStyleToTransform rPr {isStrike = Nothing})
| Just SupScrpt <- rVertAlign rPr =
superscript . (runStyleToTransform rPr {rVertAlign = Nothing})
| Just SubScrpt <- rVertAlign rPr =
subscript . (runStyleToTransform rPr {rVertAlign = Nothing})
| Just "single" <- rUnderline rPr =
emph . (runStyleToTransform rPr {rUnderline = Nothing})
| otherwise = id
runToInlines :: PandocMonad m => Run -> DocxContext m Inlines
runToInlines (Run rs runElems)
| Just (s, _) <- rStyle rs
, s `elem` codeStyles =
let rPr = resolveDependentRunStyle rs
codeString = code $ concatMap runElemToString runElems
in
return $ case rVertAlign rPr of
Just SupScrpt -> superscript codeString
Just SubScrpt -> subscript codeString
_ -> codeString
| otherwise = do
let ils = smushInlines (map runElemToInlines runElems)
return $ (runStyleToTransform $ resolveDependentRunStyle rs) ils
runToInlines (Footnote bps) = do
blksList <- smushBlocks <$> (mapM bodyPartToBlocks bps)
return $ note blksList
runToInlines (Endnote bps) = do
blksList <- smushBlocks <$> (mapM bodyPartToBlocks bps)
return $ note blksList
runToInlines (InlineDrawing fp title alt bs ext) = do
(lift . lift) $ P.insertMedia fp Nothing bs
return $ imageWith (extentToAttr ext) fp title $ text alt
runToInlines InlineChart = return $ spanWith ("", ["chart"], []) $ text "[CHART]"
extentToAttr :: Extent -> Attr
extentToAttr (Just (w, h)) =
("", [], [("width", showDim w), ("height", showDim h)] )
where
showDim d = show (d / 914400) ++ "in"
extentToAttr _ = nullAttr
blocksToInlinesWarn :: PandocMonad m => String -> Blocks -> DocxContext m Inlines
blocksToInlinesWarn cmtId blks = do
let blkList = toList blks
notParaOrPlain :: Block -> Bool
notParaOrPlain (Para _) = False
notParaOrPlain (Plain _) = False
notParaOrPlain _ = True
when (not $ null $ filter notParaOrPlain blkList) $
lift $ P.report $ DocxParserWarning $
"Docx comment " ++ cmtId ++ " will not retain formatting"
return $ fromList $ blocksToInlines blkList
parPartToInlines :: PandocMonad m => ParPart -> DocxContext m Inlines
parPartToInlines (PlainRun r) = runToInlines r
parPartToInlines (Insertion _ author date runs) = do
opts <- asks docxOptions
case readerTrackChanges opts of
AcceptChanges -> smushInlines <$> mapM runToInlines runs
RejectChanges -> return mempty
AllChanges -> do
ils <- smushInlines <$> mapM runToInlines runs
let attr = ("", ["insertion"], [("author", author), ("date", date)])
return $ spanWith attr ils
parPartToInlines (Deletion _ author date runs) = do
opts <- asks docxOptions
case readerTrackChanges opts of
AcceptChanges -> return mempty
RejectChanges -> smushInlines <$> mapM runToInlines runs
AllChanges -> do
ils <- smushInlines <$> mapM runToInlines runs
let attr = ("", ["deletion"], [("author", author), ("date", date)])
return $ spanWith attr ils
parPartToInlines (CommentStart cmtId author date bodyParts) = do
opts <- asks docxOptions
case readerTrackChanges opts of
AllChanges -> do
blks <- smushBlocks <$> mapM bodyPartToBlocks bodyParts
ils <- blocksToInlinesWarn cmtId blks
let attr = ("", ["comment-start"], [("id", cmtId), ("author", author), ("date", date)])
return $ spanWith attr ils
_ -> return mempty
parPartToInlines (CommentEnd cmtId) = do
opts <- asks docxOptions
case readerTrackChanges opts of
AllChanges -> do
let attr = ("", ["comment-end"], [("id", cmtId)])
return $ spanWith attr mempty
_ -> return mempty
parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors =
return mempty
parPartToInlines (BookMark _ anchor) =
-- We record these, so we can make sure not to overwrite
-- user-defined anchor links with header auto ids.
do
-- get whether we're in a header.
inHdrBool <- asks docxInHeaderBlock
-- Get the anchor map.
anchorMap <- gets docxAnchorMap
-- We don't want to rewrite if we're in a header, since we'll take
-- care of that later, when we make the header anchor. If the
-- bookmark were already in uniqueIdent form, this would lead to a
-- duplication. Otherwise, we check to see if the id is already in
-- there. Rewrite if necessary. This will have the possible effect
-- of rewriting user-defined anchor links. However, since these
-- are not defined in pandoc, it seems like a necessary evil to
-- avoid an extra pass.
let newAnchor =
if not inHdrBool && anchor `elem` (M.elems anchorMap)
then uniqueIdent [Str anchor] (Set.fromList $ M.elems anchorMap)
else anchor
unless inHdrBool
(modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap})
return $ spanWith (newAnchor, ["anchor"], []) mempty
parPartToInlines (Drawing fp title alt bs ext) = do
(lift . lift) $ P.insertMedia fp Nothing bs
return $ imageWith (extentToAttr ext) fp title $ text alt
parPartToInlines Chart = do
return $ spanWith ("", ["chart"], []) $ text "[CHART]"
parPartToInlines (InternalHyperLink anchor runs) = do
ils <- smushInlines <$> mapM runToInlines runs
return $ link ('#' : anchor) "" ils
parPartToInlines (ExternalHyperLink target runs) = do
ils <- smushInlines <$> mapM runToInlines runs
return $ link target "" ils
parPartToInlines (PlainOMath exps) = do
return $ math $ writeTeX exps
parPartToInlines (SmartTag runs) = do
ils <- smushInlines <$> mapM runToInlines runs
return ils
isAnchorSpan :: Inline -> Bool
isAnchorSpan (Span (_, classes, kvs) _) =
classes == ["anchor"] &&
null kvs
isAnchorSpan _ = False
dummyAnchors :: [String]
dummyAnchors = ["_GoBack"]
makeHeaderAnchor :: PandocMonad m => Blocks -> DocxContext m Blocks
makeHeaderAnchor bs = traverse makeHeaderAnchor' bs
makeHeaderAnchor' :: PandocMonad m => Block -> DocxContext m Block
-- If there is an anchor already there (an anchor span in the header,
-- to be exact), we rename and associate the new id with the old one.
makeHeaderAnchor' (Header n (ident, classes, kvs) ils)
| (c:_) <- filter isAnchorSpan ils
, (Span (anchIdent, ["anchor"], _) cIls) <- c = do
hdrIDMap <- gets docxAnchorMap
let newIdent = if null ident
then uniqueIdent ils (Set.fromList $ M.elems hdrIDMap)
else ident
newIls = concatMap f ils where f il | il == c = cIls
| otherwise = [il]
modify $ \s -> s {docxAnchorMap = M.insert anchIdent newIdent hdrIDMap}
makeHeaderAnchor' $ Header n (newIdent, classes, kvs) newIls
-- Otherwise we just give it a name, and register that name (associate
-- it with itself.)
makeHeaderAnchor' (Header n (ident, classes, kvs) ils) =
do
hdrIDMap <- gets docxAnchorMap
let newIdent = if null ident
then uniqueIdent ils (Set.fromList $ M.elems hdrIDMap)
else ident
modify $ \s -> s {docxAnchorMap = M.insert newIdent newIdent hdrIDMap}
return $ Header n (newIdent, classes, kvs) ils
makeHeaderAnchor' blk = return blk
-- Rewrite a standalone paragraph block as a plain
singleParaToPlain :: Blocks -> Blocks
singleParaToPlain blks
| (Para (ils) :< seeq) <- viewl $ unMany blks
, Seq.null seeq =
singleton $ Plain ils
singleParaToPlain blks = blks
cellToBlocks :: PandocMonad m => Cell -> DocxContext m Blocks
cellToBlocks (Cell bps) = do
blks <- smushBlocks <$> mapM bodyPartToBlocks bps
return $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks
rowToBlocksList :: PandocMonad m => Row -> DocxContext m [Blocks]
rowToBlocksList (Row cells) = do
blksList <- mapM cellToBlocks cells
return $ map singleParaToPlain blksList
-- like trimInlines, but also take out linebreaks
trimSps :: Inlines -> Inlines
trimSps (Many ils) = Many $ Seq.dropWhileL isSp $ Seq.dropWhileR isSp $ ils
where isSp Space = True
isSp SoftBreak = True
isSp LineBreak = True
isSp _ = False
parStyleToTransform :: ParagraphStyle -> (Blocks -> Blocks)
parStyleToTransform pPr
| (c:cs) <- pStyle pPr
, c `elem` divsToKeep =
let pPr' = pPr { pStyle = cs }
in
(divWith ("", [c], [])) . (parStyleToTransform pPr')
| (c:cs) <- pStyle pPr,
c `elem` listParagraphDivs =
let pPr' = pPr { pStyle = cs, indentation = Nothing}
in
(divWith ("", [c], [])) . (parStyleToTransform pPr')
| (_:cs) <- pStyle pPr
, Just True <- pBlockQuote pPr =
let pPr' = pPr { pStyle = cs }
in
blockQuote . (parStyleToTransform pPr')
| (_:cs) <- pStyle pPr =
let pPr' = pPr { pStyle = cs}
in
parStyleToTransform pPr'
| null (pStyle pPr)
, Just left <- indentation pPr >>= leftParIndent
, Just hang <- indentation pPr >>= hangingParIndent =
let pPr' = pPr { indentation = Nothing }
in
case (left - hang) > 0 of
True -> blockQuote . (parStyleToTransform pPr')
False -> parStyleToTransform pPr'
| null (pStyle pPr),
Just left <- indentation pPr >>= leftParIndent =
let pPr' = pPr { indentation = Nothing }
in
case left > 0 of
True -> blockQuote . (parStyleToTransform pPr')
False -> parStyleToTransform pPr'
parStyleToTransform _ = id
bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks
bodyPartToBlocks (Paragraph pPr parparts)
| not $ null $ codeDivs `intersect` (pStyle pPr) =
return
$ parStyleToTransform pPr
$ codeBlock
$ concatMap parPartToString parparts
| Just (style, n) <- pHeading pPr = do
ils <- local (\s-> s{docxInHeaderBlock=True}) $
(smushInlines <$> mapM parPartToInlines parparts)
makeHeaderAnchor $
headerWith ("", delete style (pStyle pPr), []) n ils
| otherwise = do
ils <- (trimSps . smushInlines) <$> mapM parPartToInlines parparts
dropIls <- gets docxDropCap
let ils' = dropIls <> ils
if dropCap pPr
then do modify $ \s -> s { docxDropCap = ils' }
return mempty
else do modify $ \s -> s { docxDropCap = mempty }
return $ case isNull ils' of
True -> mempty
_ -> parStyleToTransform pPr $ para ils'
bodyPartToBlocks (ListItem pPr numId lvl (Just levelInfo) parparts) = do
let
kvs = case levelInfo of
(_, fmt, txt, Just start) -> [ ("level", lvl)
, ("num-id", numId)
, ("format", fmt)
, ("text", txt)
, ("start", (show start))
]
(_, fmt, txt, Nothing) -> [ ("level", lvl)
, ("num-id", numId)
, ("format", fmt)
, ("text", txt)
]
blks <- bodyPartToBlocks (Paragraph pPr parparts)
return $ divWith ("", ["list-item"], kvs) blks
bodyPartToBlocks (ListItem pPr _ _ _ parparts) =
let pPr' = pPr {pStyle = "ListParagraph": (pStyle pPr)}
in
bodyPartToBlocks $ Paragraph pPr' parparts
bodyPartToBlocks (Tbl _ _ _ []) =
return $ para mempty
bodyPartToBlocks (Tbl cap _ look (r:rs)) = do
let caption = text cap
(hdr, rows) = case firstRowFormatting look of
True | null rs -> (Nothing, [r])
| otherwise -> (Just r, rs)
False -> (Nothing, r:rs)
cells <- mapM rowToBlocksList rows
let width = case cells of
r':_ -> length r'
-- shouldn't happen
[] -> 0
hdrCells <- case hdr of
Just r' -> rowToBlocksList r'
Nothing -> return $ replicate width mempty
-- The two following variables (horizontal column alignment and
-- relative column widths) go to the default at the
-- moment. Width information is in the TblGrid field of the Tbl,
-- so should be possible. Alignment might be more difficult,
-- since there doesn't seem to be a column entity in docx.
let alignments = replicate width AlignDefault
widths = replicate width 0 :: [Double]
return $ table caption (zip alignments widths) hdrCells cells
bodyPartToBlocks (OMathPara e) = do
return $ para $ displayMath (writeTeX e)
-- replace targets with generated anchors.
rewriteLink' :: PandocMonad m => Inline -> DocxContext m Inline
rewriteLink' l@(Link attr ils ('#':target, title)) = do
anchorMap <- gets docxAnchorMap
return $ case M.lookup target anchorMap of
Just newTarget -> (Link attr ils ('#':newTarget, title))
Nothing -> l
rewriteLink' il = return il
rewriteLinks :: PandocMonad m => [Block] -> DocxContext m [Block]
rewriteLinks = mapM (walkM rewriteLink')
bodyToOutput :: PandocMonad m => Body -> DocxContext m (Meta, [Block])
bodyToOutput (Body bps) = do
let (metabps, blkbps) = sepBodyParts bps
meta <- bodyPartsToMeta metabps
blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps
blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks
return $ (meta, blks')
docxToOutput :: PandocMonad m
=> ReaderOptions
-> Docx
-> m (Meta, [Block])
docxToOutput opts (Docx (Document _ body)) =
let dEnv = def { docxOptions = opts} in
evalDocxContext (bodyToOutput body) dEnv def