Merge pull request #1366 from jkr/reducible3
Docx rewrite and cleanup (in terms of Reducible typeclass)
This commit is contained in:
commit
ac6756009f
7 changed files with 286 additions and 273 deletions
|
@ -327,6 +327,7 @@ Library
|
|||
Text.Pandoc.SelfContained,
|
||||
Text.Pandoc.Process
|
||||
Other-Modules: Text.Pandoc.Readers.Docx.Lists,
|
||||
Text.Pandoc.Readers.Docx.Reducible,
|
||||
Text.Pandoc.Readers.Docx.Parse,
|
||||
Text.Pandoc.Writers.Shared,
|
||||
Text.Pandoc.Asciify,
|
||||
|
|
|
@ -79,8 +79,10 @@ import Text.Pandoc.Builder (text, toList)
|
|||
import Text.Pandoc.Generic (bottomUp)
|
||||
import Text.Pandoc.MIME (getMimeType)
|
||||
import Text.Pandoc.UTF8 (toString)
|
||||
import Text.Pandoc.Walk
|
||||
import Text.Pandoc.Readers.Docx.Parse
|
||||
import Text.Pandoc.Readers.Docx.Lists
|
||||
import Text.Pandoc.Readers.Docx.Reducible
|
||||
import Data.Maybe (mapMaybe, isJust, fromJust)
|
||||
import Data.List (delete, isPrefixOf, (\\), intersect)
|
||||
import qualified Data.ByteString as BS
|
||||
|
@ -96,28 +98,65 @@ readDocx opts bytes =
|
|||
Just docx -> Pandoc nullMeta (docxToBlocks opts docx)
|
||||
Nothing -> error $ "couldn't parse docx file"
|
||||
|
||||
runStyleToSpanAttr :: RunStyle -> (String, [String], [(String, String)])
|
||||
runStyleToSpanAttr rPr = ("",
|
||||
mapMaybe id [
|
||||
if isBold rPr then (Just "strong") else Nothing,
|
||||
if isItalic rPr then (Just "emph") else Nothing,
|
||||
if isSmallCaps rPr then (Just "smallcaps") else Nothing,
|
||||
if isStrike rPr then (Just "strike") else Nothing,
|
||||
if isSuperScript rPr then (Just "superscript") else Nothing,
|
||||
if isSubScript rPr then (Just "subscript") else Nothing,
|
||||
rStyle rPr],
|
||||
case underline rPr of
|
||||
Just fmt -> [("underline", fmt)]
|
||||
_ -> []
|
||||
)
|
||||
spansToKeep :: [String]
|
||||
spansToKeep = ["list-item", "Definition", "DefinitionTerm"] ++ codeSpans
|
||||
|
||||
parStyleToDivAttr :: ParagraphStyle -> (String, [String], [(String, String)])
|
||||
parStyleToDivAttr pPr = ("",
|
||||
pStyle pPr,
|
||||
case indent pPr of
|
||||
Just n -> [("indent", (show n))]
|
||||
Nothing -> []
|
||||
)
|
||||
|
||||
-- This is empty, but we put it in for future-proofing.
|
||||
divsToKeep :: [String]
|
||||
divsToKeep = []
|
||||
|
||||
runStyleToContainers :: RunStyle -> [Container Inline]
|
||||
runStyleToContainers rPr =
|
||||
let formatters = mapMaybe id
|
||||
[ if isBold rPr then (Just Strong) else Nothing
|
||||
, if isItalic rPr then (Just Emph) else Nothing
|
||||
, if isSmallCaps rPr then (Just SmallCaps) else Nothing
|
||||
, if isStrike rPr then (Just Strikeout) else Nothing
|
||||
, if isSuperScript rPr then (Just Superscript) else Nothing
|
||||
, if isSubScript rPr then (Just Subscript) else Nothing
|
||||
, rStyle rPr >>=
|
||||
(\s -> if s `elem` spansToKeep then Just s else Nothing) >>=
|
||||
(\s -> Just $ Span ("", [s], []))
|
||||
, underline rPr >>= (\f -> Just $ Span ("", [], [("underline", f)]))
|
||||
]
|
||||
in
|
||||
map Container formatters
|
||||
|
||||
|
||||
divAttrToContainers :: [String] -> [(String, String)] -> [Container Block]
|
||||
divAttrToContainers [] [] = []
|
||||
divAttrToContainers (c:cs) _ | isJust (isHeaderClass c) =
|
||||
let n = fromJust (isHeaderClass c)
|
||||
in
|
||||
[(Container $ \blks ->
|
||||
Header n ("", delete ("Heading" ++ show n) cs, []) (blksToInlines blks))]
|
||||
divAttrToContainers (c:_) _ | c `elem` codeDivs =
|
||||
[Container $ \blks -> CodeBlock ("", [], []) (concatMap blkToCode blks)]
|
||||
divAttrToContainers (c:cs) kvs | c `elem` listParagraphDivs =
|
||||
let kvs' = filter (\(k,_) -> k /= "indent") kvs
|
||||
in
|
||||
(Container $ Div ("", [c], [])) : (divAttrToContainers cs kvs')
|
||||
divAttrToContainers (c:cs) kvs | c `elem` blockQuoteDivs =
|
||||
(Container BlockQuote) : (divAttrToContainers (cs \\ blockQuoteDivs) kvs)
|
||||
divAttrToContainers (c:cs) kvs | c `elem` divsToKeep =
|
||||
(Container $ Div ("", [c], [])) : (divAttrToContainers cs kvs)
|
||||
divAttrToContainers (_:cs) kvs = divAttrToContainers cs kvs
|
||||
divAttrToContainers [] (kv:kvs) | fst kv == "indent" =
|
||||
(Container BlockQuote) : divAttrToContainers [] kvs
|
||||
divAttrToContainers [] (_:kvs) =
|
||||
divAttrToContainers [] kvs
|
||||
|
||||
|
||||
parStyleToContainers :: ParagraphStyle -> [Container Block]
|
||||
parStyleToContainers pPr =
|
||||
let classes = pStyle pPr
|
||||
kvs = case indent pPr of
|
||||
Just n -> [("indent", show n)]
|
||||
Nothing -> []
|
||||
in
|
||||
divAttrToContainers classes kvs
|
||||
|
||||
|
||||
strToInlines :: String -> [Inline]
|
||||
strToInlines = toList . text
|
||||
|
@ -144,103 +183,42 @@ runElemToString (Tab) = ['\t']
|
|||
runElemsToString :: [RunElem] -> String
|
||||
runElemsToString = concatMap runElemToString
|
||||
|
||||
--- We use this instead of the more general
|
||||
--- Text.Pandoc.Shared.normalize for reasons of efficiency. For
|
||||
--- whatever reason, `normalize` makes a run take almost twice as
|
||||
--- long. (It does more, but this does what we need)
|
||||
inlineNormalize :: [Inline] -> [Inline]
|
||||
inlineNormalize [] = []
|
||||
inlineNormalize (Str "" : ils) = inlineNormalize ils
|
||||
inlineNormalize ((Str s) : (Str s') : l) =
|
||||
inlineNormalize (Str (s++s') : l)
|
||||
inlineNormalize ((Emph ils) : (Emph ils') : l) =
|
||||
inlineNormalize $ (Emph $ inlineNormalize (ils ++ ils')) : l
|
||||
inlineNormalize ((Emph ils) : l) =
|
||||
Emph (inlineNormalize ils) : (inlineNormalize l)
|
||||
inlineNormalize ((Strong ils) : (Strong ils') : l) =
|
||||
inlineNormalize $ (Strong $ inlineNormalize (ils ++ ils')) : l
|
||||
inlineNormalize ((Strong ils) : l) =
|
||||
Strong (inlineNormalize ils) : (inlineNormalize l)
|
||||
inlineNormalize ((Strikeout ils) : (Strikeout ils') : l) =
|
||||
inlineNormalize $ (Strikeout $ inlineNormalize (ils ++ ils')) : l
|
||||
inlineNormalize ((Strikeout ils) : l) =
|
||||
Strikeout (inlineNormalize ils) : (inlineNormalize l)
|
||||
inlineNormalize ((Superscript ils) : (Superscript ils') : l) =
|
||||
inlineNormalize $ (Superscript $ inlineNormalize (ils ++ ils')) : l
|
||||
inlineNormalize ((Superscript ils) : l) =
|
||||
Superscript (inlineNormalize ils) : (inlineNormalize l)
|
||||
inlineNormalize ((Subscript ils) : (Subscript ils') : l) =
|
||||
inlineNormalize $ (Subscript $ inlineNormalize (ils ++ ils')) : l
|
||||
inlineNormalize ((Subscript ils) : l) =
|
||||
Subscript (inlineNormalize ils) : (inlineNormalize l)
|
||||
inlineNormalize ((Space : Space : l)) =
|
||||
inlineNormalize $ (Space : l)
|
||||
inlineNormalize ((Quoted qt ils) : l) =
|
||||
Quoted qt (inlineNormalize ils) : inlineNormalize l
|
||||
inlineNormalize ((Cite cits ils) : l) =
|
||||
let
|
||||
f :: Citation -> Citation
|
||||
f (Citation s pref suff mode num hash) =
|
||||
Citation s (inlineNormalize pref) (inlineNormalize suff) mode num hash
|
||||
in
|
||||
Cite (map f cits) (inlineNormalize ils) : (inlineNormalize l)
|
||||
inlineNormalize ((Link ils s) : l) =
|
||||
Link (inlineNormalize ils) s : (inlineNormalize l)
|
||||
inlineNormalize ((Image ils s) : l) =
|
||||
Image (inlineNormalize ils) s : (inlineNormalize l)
|
||||
inlineNormalize ((Note blks) : l) =
|
||||
Note (map blockNormalize blks) : (inlineNormalize l)
|
||||
inlineNormalize ((Span attr ils) : l) =
|
||||
Span attr (inlineNormalize ils) : (inlineNormalize l)
|
||||
inlineNormalize (il : l) = il : (inlineNormalize l)
|
||||
|
||||
stripSpaces :: [Inline] -> [Inline]
|
||||
stripSpaces ils =
|
||||
reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) ils
|
||||
inlineCodeContainer :: Container Inline -> Bool
|
||||
inlineCodeContainer (Container f) = case f [] of
|
||||
Span (_, classes, _) _ -> (not . null) (classes `intersect` codeSpans)
|
||||
_ -> False
|
||||
inlineCodeContainer _ = False
|
||||
|
||||
blockNormalize :: Block -> Block
|
||||
blockNormalize (Plain ils) = Plain $ stripSpaces $ inlineNormalize ils
|
||||
blockNormalize (Para ils) = Para $ stripSpaces $ inlineNormalize ils
|
||||
blockNormalize (Header n attr ils) =
|
||||
Header n attr $ stripSpaces $ inlineNormalize ils
|
||||
blockNormalize (Table ils align width hdr cells) =
|
||||
Table (stripSpaces $ inlineNormalize ils) align width hdr cells
|
||||
blockNormalize (DefinitionList pairs) =
|
||||
DefinitionList $ map (\(ils, blklsts) -> (stripSpaces (inlineNormalize ils), (map (map blockNormalize) blklsts))) pairs
|
||||
blockNormalize (BlockQuote blks) = BlockQuote (map blockNormalize blks)
|
||||
blockNormalize (OrderedList attr blkslst) =
|
||||
OrderedList attr $ map (\blks -> map blockNormalize blks) blkslst
|
||||
blockNormalize (BulletList blkslst) =
|
||||
BulletList $ map (\blks -> map blockNormalize blks) blkslst
|
||||
blockNormalize (Div attr blks) = Div attr (map blockNormalize blks)
|
||||
blockNormalize blk = blk
|
||||
-- blockCodeContainer :: Container Block -> Bool
|
||||
-- blockCodeContainer (Container f) = case f [] of
|
||||
-- Div (ident, classes, kvs) _ -> (not . null) (classes `intersect` codeDivs)
|
||||
-- _ -> False
|
||||
-- blockCodeContainer _ = False
|
||||
|
||||
runToInlines :: ReaderOptions -> Docx -> Run -> [Inline]
|
||||
runToInlines _ _ (Run rs runElems)
|
||||
| isJust (rStyle rs) && (fromJust (rStyle rs)) `elem` codeSpans =
|
||||
case runStyleToSpanAttr rs == ("", [], []) of
|
||||
True -> [Str (runElemsToString runElems)]
|
||||
False -> [Span (runStyleToSpanAttr rs) [Str (runElemsToString runElems)]]
|
||||
| otherwise = case runStyleToSpanAttr rs == ("", [], []) of
|
||||
True -> concatMap runElemToInlines runElems
|
||||
False -> [Span (runStyleToSpanAttr rs) (concatMap runElemToInlines runElems)]
|
||||
| any inlineCodeContainer (runStyleToContainers rs) =
|
||||
rebuild (runStyleToContainers rs) $ [Str $ runElemsToString runElems]
|
||||
| otherwise =
|
||||
rebuild (runStyleToContainers rs) (concatMap runElemToInlines runElems)
|
||||
runToInlines opts docx@(Docx _ notes _ _ _ ) (Footnote fnId) =
|
||||
case (getFootNote fnId notes) of
|
||||
Just bodyParts ->
|
||||
[Note [Div ("", ["footnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]]
|
||||
[Note (concatMap (bodyPartToBlocks opts docx) bodyParts)]
|
||||
Nothing ->
|
||||
[Note [Div ("", ["footnote"], []) []]]
|
||||
[Note []]
|
||||
runToInlines opts docx@(Docx _ notes _ _ _) (Endnote fnId) =
|
||||
case (getEndNote fnId notes) of
|
||||
Just bodyParts ->
|
||||
[Note [Div ("", ["endnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]]
|
||||
[Note (concatMap (bodyPartToBlocks opts docx) bodyParts)]
|
||||
Nothing ->
|
||||
[Note [Div ("", ["endnote"], []) []]]
|
||||
[Note []]
|
||||
|
||||
parPartToInlines :: ReaderOptions -> Docx -> ParPart -> [Inline]
|
||||
parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r
|
||||
parPartToInlines _ _ (BookMark _ anchor) =
|
||||
[Span (anchor, ["anchor"], []) []]
|
||||
parPartToInlines _ _ (BookMark _ anchor) | anchor `elem` dummyAnchors = []
|
||||
parPartToInlines _ _ (BookMark _ anchor) = [Span (anchor, ["anchor"], []) []]
|
||||
parPartToInlines _ (Docx _ _ _ rels _) (Drawing relid) =
|
||||
case lookupRelationship relid rels of
|
||||
Just target -> [Image [] (combine "word" target, "")]
|
||||
|
@ -276,7 +254,6 @@ makeHeaderAnchors h@(Header n (_, classes, kvs) ils) =
|
|||
_ -> h
|
||||
makeHeaderAnchors blk = blk
|
||||
|
||||
|
||||
parPartsToInlines :: ReaderOptions -> Docx -> [ParPart] -> [Inline]
|
||||
parPartsToInlines opts docx parparts =
|
||||
--
|
||||
|
@ -284,23 +261,32 @@ parPartsToInlines opts docx parparts =
|
|||
-- not mandatory.
|
||||
--
|
||||
(if False -- TODO depend on option
|
||||
then bottomUp (makeImagesSelfContained docx)
|
||||
then walk (makeImagesSelfContained docx)
|
||||
else id) $
|
||||
bottomUp spanTrim $
|
||||
bottomUp spanCorrect $
|
||||
bottomUp spanReduce $
|
||||
concatMap (parPartToInlines opts docx) parparts
|
||||
-- bottomUp spanTrim $
|
||||
-- bottomUp spanCorrect $
|
||||
-- bottomUp spanReduce $
|
||||
reduceList $ concatMap (parPartToInlines opts docx) parparts
|
||||
|
||||
cellToBlocks :: ReaderOptions -> Docx -> Cell -> [Block]
|
||||
cellToBlocks opts docx (Cell bps) = map (bodyPartToBlock opts docx) bps
|
||||
cellToBlocks opts docx (Cell bps) = concatMap (bodyPartToBlocks opts docx) bps
|
||||
|
||||
rowToBlocksList :: ReaderOptions -> Docx -> Row -> [[Block]]
|
||||
rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells
|
||||
|
||||
bodyPartToBlock :: ReaderOptions -> Docx -> BodyPart -> Block
|
||||
bodyPartToBlock opts docx (Paragraph pPr parparts) =
|
||||
Div (parStyleToDivAttr pPr) [Para (parPartsToInlines opts docx parparts)]
|
||||
bodyPartToBlock opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parparts) =
|
||||
bodyPartToBlocks :: ReaderOptions -> Docx -> BodyPart -> [Block]
|
||||
bodyPartToBlocks opts docx (Paragraph pPr parparts) =
|
||||
case parPartsToInlines opts docx parparts of
|
||||
[] ->
|
||||
[]
|
||||
_ ->
|
||||
let parContents = parPartsToInlines opts docx parparts
|
||||
trimmedContents = reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) parContents
|
||||
in
|
||||
rebuild
|
||||
(parStyleToContainers pPr)
|
||||
[Para trimmedContents]
|
||||
bodyPartToBlocks opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parparts) =
|
||||
let
|
||||
kvs = case lookupLevel numId lvl numbering of
|
||||
Just (_, fmt, txt, Just start) -> [ ("level", lvl)
|
||||
|
@ -317,12 +303,12 @@ bodyPartToBlock opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parpa
|
|||
]
|
||||
Nothing -> []
|
||||
in
|
||||
Div
|
||||
("", ["list-item"], kvs)
|
||||
[bodyPartToBlock opts docx (Paragraph pPr parparts)]
|
||||
bodyPartToBlock _ _ (Tbl _ _ _ []) =
|
||||
Para []
|
||||
bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) =
|
||||
[Div
|
||||
("", ["list-item"], kvs)
|
||||
(bodyPartToBlocks opts docx (Paragraph pPr parparts))]
|
||||
bodyPartToBlocks _ _ (Tbl _ _ _ []) =
|
||||
[Para []]
|
||||
bodyPartToBlocks opts docx (Tbl cap _ look (r:rs)) =
|
||||
let caption = strToInlines cap
|
||||
(hdr, rows) = case firstRowFormatting look of
|
||||
True -> (Just r, rs)
|
||||
|
@ -344,7 +330,8 @@ bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) =
|
|||
alignments = take size (repeat AlignDefault)
|
||||
widths = take size (repeat 0) :: [Double]
|
||||
in
|
||||
Table caption alignments widths hdrCells cells
|
||||
[Table caption alignments widths hdrCells cells]
|
||||
|
||||
|
||||
makeImagesSelfContained :: Docx -> Inline -> Inline
|
||||
makeImagesSelfContained (Docx _ _ _ _ media) i@(Image alt (uri, title)) =
|
||||
|
@ -360,127 +347,19 @@ makeImagesSelfContained _ inline = inline
|
|||
|
||||
bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block]
|
||||
bodyToBlocks opts docx (Body bps) =
|
||||
bottomUp removeEmptyPars $
|
||||
map blockNormalize $
|
||||
bottomUp spanRemove $
|
||||
bottomUp divRemove $
|
||||
map (makeHeaderAnchors) $
|
||||
bottomUp divCorrect $
|
||||
bottomUp divReduce $
|
||||
bottomUp divCorrectPreReduce $
|
||||
bottomUp blocksToDefinitions $
|
||||
blocksToBullets $
|
||||
map (bodyPartToBlock opts docx) bps
|
||||
concatMap (bodyPartToBlocks opts docx) bps
|
||||
|
||||
docxToBlocks :: ReaderOptions -> Docx -> [Block]
|
||||
docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = bodyToBlocks opts d body
|
||||
|
||||
spanReduce :: [Inline] -> [Inline]
|
||||
spanReduce [] = []
|
||||
spanReduce ((Span (id1, classes1, kvs1) ils1) : ils)
|
||||
| (id1, classes1, kvs1) == ("", [], []) = ils1 ++ (spanReduce ils)
|
||||
spanReduce (s1@(Span (id1, classes1, kvs1) ils1) :
|
||||
s2@(Span (id2, classes2, kvs2) ils2) :
|
||||
ils) =
|
||||
let classes' = classes1 `intersect` classes2
|
||||
kvs' = kvs1 `intersect` kvs2
|
||||
classes1' = classes1 \\ classes'
|
||||
kvs1' = kvs1 \\ kvs'
|
||||
classes2' = classes2 \\ classes'
|
||||
kvs2' = kvs2 \\ kvs'
|
||||
in
|
||||
case null classes' && null kvs' of
|
||||
True -> s1 : (spanReduce (s2 : ils))
|
||||
False -> let attr' = ("", classes', kvs')
|
||||
attr1' = (id1, classes1', kvs1')
|
||||
attr2' = (id2, classes2', kvs2')
|
||||
in
|
||||
spanReduce (Span attr' [(Span attr1' ils1), (Span attr2' ils2)] :
|
||||
ils)
|
||||
spanReduce (il:ils) = il : (spanReduce ils)
|
||||
|
||||
ilToCode :: Inline -> String
|
||||
ilToCode (Str s) = s
|
||||
ilToCode _ = ""
|
||||
|
||||
spanRemove' :: Inline -> [Inline]
|
||||
spanRemove' s@(Span (ident, classes, _) [])
|
||||
-- "_GoBack" is automatically inserted. We don't want to keep it.
|
||||
| classes == ["anchor"] && not (ident `elem` dummyAnchors) = [s]
|
||||
spanRemove' (Span (_, _, kvs) ils) =
|
||||
case lookup "underline" kvs of
|
||||
Just val -> [Span ("", [], [("underline", val)]) ils]
|
||||
Nothing -> ils
|
||||
spanRemove' il = [il]
|
||||
|
||||
spanRemove :: [Inline] -> [Inline]
|
||||
spanRemove = concatMap spanRemove'
|
||||
|
||||
spanTrim' :: Inline -> [Inline]
|
||||
spanTrim' il@(Span _ []) = [il]
|
||||
spanTrim' il@(Span attr (il':[]))
|
||||
| il' == Space = [Span attr [], Space]
|
||||
| otherwise = [il]
|
||||
spanTrim' (Span attr ils)
|
||||
| head ils == Space && last ils == Space =
|
||||
[Space, Span attr (init $ tail ils), Space]
|
||||
| head ils == Space = [Space, Span attr (tail ils)]
|
||||
| last ils == Space = [Span attr (init ils), Space]
|
||||
spanTrim' il = [il]
|
||||
|
||||
spanTrim :: [Inline] -> [Inline]
|
||||
spanTrim = concatMap spanTrim'
|
||||
|
||||
spanCorrect' :: Inline -> [Inline]
|
||||
spanCorrect' (Span ("", [], []) ils) = ils
|
||||
spanCorrect' (Span (ident, classes, kvs) ils)
|
||||
| "emph" `elem` classes =
|
||||
[Emph $ spanCorrect' $ Span (ident, (delete "emph" classes), kvs) ils]
|
||||
| "strong" `elem` classes =
|
||||
[Strong $ spanCorrect' $ Span (ident, (delete "strong" classes), kvs) ils]
|
||||
| "smallcaps" `elem` classes =
|
||||
[SmallCaps $ spanCorrect' $ Span (ident, (delete "smallcaps" classes), kvs) ils]
|
||||
| "strike" `elem` classes =
|
||||
[Strikeout $ spanCorrect' $ Span (ident, (delete "strike" classes), kvs) ils]
|
||||
| "superscript" `elem` classes =
|
||||
[Superscript $ spanCorrect' $ Span (ident, (delete "superscript" classes), kvs) ils]
|
||||
| "subscript" `elem` classes =
|
||||
[Subscript $ spanCorrect' $ Span (ident, (delete "subscript" classes), kvs) ils]
|
||||
| (not . null) (codeSpans `intersect` classes) =
|
||||
[Code (ident, (classes \\ codeSpans), kvs) (init $ unlines $ map ilToCode ils)]
|
||||
| otherwise =
|
||||
[Span (ident, classes, kvs) ils]
|
||||
spanCorrect' il = [il]
|
||||
|
||||
spanCorrect :: [Inline] -> [Inline]
|
||||
spanCorrect = concatMap spanCorrect'
|
||||
|
||||
removeEmptyPars :: [Block] -> [Block]
|
||||
removeEmptyPars blks = filter (\b -> b /= (Para [])) blks
|
||||
|
||||
divReduce :: [Block] -> [Block]
|
||||
divReduce [] = []
|
||||
divReduce ((Div (id1, classes1, kvs1) blks1) : blks)
|
||||
| (id1, classes1, kvs1) == ("", [], []) = blks1 ++ (divReduce blks)
|
||||
divReduce (d1@(Div (id1, classes1, kvs1) blks1) :
|
||||
d2@(Div (id2, classes2, kvs2) blks2) :
|
||||
blks) =
|
||||
let classes' = classes1 `intersect` classes2
|
||||
kvs' = kvs1 `intersect` kvs2
|
||||
classes1' = classes1 \\ classes'
|
||||
kvs1' = kvs1 \\ kvs'
|
||||
classes2' = classes2 \\ classes'
|
||||
kvs2' = kvs2 \\ kvs'
|
||||
in
|
||||
case null classes' && null kvs' of
|
||||
True -> d1 : (divReduce (d2 : blks))
|
||||
False -> let attr' = ("", classes', kvs')
|
||||
attr1' = (id1, classes1', kvs1')
|
||||
attr2' = (id2, classes2', kvs2')
|
||||
in
|
||||
divReduce (Div attr' [(Div attr1' blks1), (Div attr2' blks2)] :
|
||||
blks)
|
||||
divReduce (blk:blks) = blk : (divReduce blks)
|
||||
|
||||
isHeaderClass :: String -> Maybe Int
|
||||
isHeaderClass s | "Heading" `isPrefixOf` s =
|
||||
|
@ -490,27 +369,12 @@ isHeaderClass s | "Heading" `isPrefixOf` s =
|
|||
_ -> Nothing
|
||||
isHeaderClass _ = Nothing
|
||||
|
||||
findHeaderClass :: [String] -> Maybe Int
|
||||
findHeaderClass ss = case mapMaybe id $ map isHeaderClass ss of
|
||||
[] -> Nothing
|
||||
n : _ -> Just n
|
||||
|
||||
blksToInlines :: [Block] -> [Inline]
|
||||
blksToInlines (Para ils : _) = ils
|
||||
blksToInlines (Plain ils : _) = ils
|
||||
blksToInlines _ = []
|
||||
|
||||
divCorrectPreReduce' :: Block -> [Block]
|
||||
divCorrectPreReduce' (Div (ident, classes, kvs) blks)
|
||||
| isJust $ findHeaderClass classes =
|
||||
let n = fromJust $ findHeaderClass classes
|
||||
in
|
||||
[Header n (ident, delete ("Heading" ++ (show n)) classes, kvs) (blksToInlines blks)]
|
||||
| otherwise = [Div (ident, classes, kvs) blks]
|
||||
divCorrectPreReduce' blk = [blk]
|
||||
|
||||
divCorrectPreReduce :: [Block] -> [Block]
|
||||
divCorrectPreReduce = concatMap divCorrectPreReduce'
|
||||
|
||||
blkToCode :: Block -> String
|
||||
blkToCode (Para []) = ""
|
||||
|
@ -520,29 +384,3 @@ blkToCode (Para ((Span (_, classes, _) ils'): ils))
|
|||
(init $ unlines $ map ilToCode ils') ++ (blkToCode (Para ils))
|
||||
blkToCode _ = ""
|
||||
|
||||
divRemove' :: Block -> [Block]
|
||||
divRemove' (Div (_, _, kvs) blks) =
|
||||
case lookup "indent" kvs of
|
||||
Just val -> [Div ("", [], [("indent", val)]) blks]
|
||||
Nothing -> blks
|
||||
divRemove' blk = [blk]
|
||||
|
||||
divRemove :: [Block] -> [Block]
|
||||
divRemove = concatMap divRemove'
|
||||
|
||||
divCorrect' :: Block -> [Block]
|
||||
divCorrect' b@(Div (ident, classes, kvs) blks)
|
||||
| (not . null) (blockQuoteDivs `intersect` classes) =
|
||||
[BlockQuote [Div (ident, classes \\ blockQuoteDivs, kvs) blks]]
|
||||
| (not . null) (codeDivs `intersect` classes) =
|
||||
[CodeBlock (ident, (classes \\ codeDivs), kvs) (init $ unlines $ map blkToCode blks)]
|
||||
| otherwise =
|
||||
case lookup "indent" kvs of
|
||||
Just "0" -> [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]
|
||||
Just _ ->
|
||||
[BlockQuote [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]]
|
||||
Nothing -> [b]
|
||||
divCorrect' blk = [blk]
|
||||
|
||||
divCorrect :: [Block] -> [Block]
|
||||
divCorrect = concatMap divCorrect'
|
||||
|
|
|
@ -29,9 +29,12 @@ Functions for converting flat docx paragraphs into nested lists.
|
|||
-}
|
||||
|
||||
module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets
|
||||
, blocksToDefinitions) where
|
||||
, blocksToDefinitions
|
||||
, listParagraphDivs
|
||||
) where
|
||||
|
||||
import Text.Pandoc.JSON
|
||||
import Text.Pandoc.Generic (bottomUp)
|
||||
import Text.Pandoc.Shared (trim)
|
||||
import Control.Monad
|
||||
import Data.List
|
||||
|
@ -159,10 +162,9 @@ flatToBullets elems = flatToBullets' (-1) elems
|
|||
|
||||
blocksToBullets :: [Block] -> [Block]
|
||||
blocksToBullets blks =
|
||||
-- bottomUp removeListItemDivs $
|
||||
bottomUp removeListDivs $
|
||||
flatToBullets $ (handleListParagraphs blks)
|
||||
|
||||
|
||||
plainParaInlines :: Block -> [Inline]
|
||||
plainParaInlines (Plain ils) = ils
|
||||
plainParaInlines (Para ils) = ils
|
||||
|
@ -199,6 +201,23 @@ blocksToDefinitions' [] acc (b:blks) =
|
|||
blocksToDefinitions' defAcc acc (b:blks) =
|
||||
blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks
|
||||
|
||||
removeListDivs' :: Block -> [Block]
|
||||
removeListDivs' (Div (ident, classes, kvs) blks)
|
||||
| "list-item" `elem` classes =
|
||||
case delete "list-item" classes of
|
||||
[] -> blks
|
||||
classes' -> [Div (ident, classes', kvs) $ blks]
|
||||
removeListDivs' (Div (ident, classes, kvs) blks)
|
||||
| not $ null $ listParagraphDivs `intersect` classes =
|
||||
case classes \\ listParagraphDivs of
|
||||
[] -> blks
|
||||
classes' -> [Div (ident, classes', kvs) blks]
|
||||
removeListDivs' blk = [blk]
|
||||
|
||||
removeListDivs :: [Block] -> [Block]
|
||||
removeListDivs = concatMap removeListDivs'
|
||||
|
||||
|
||||
|
||||
blocksToDefinitions :: [Block] -> [Block]
|
||||
blocksToDefinitions = blocksToDefinitions' [] []
|
||||
|
|
150
src/Text/Pandoc/Readers/Docx/Reducible.hs
Normal file
150
src/Text/Pandoc/Readers/Docx/Reducible.hs
Normal file
|
@ -0,0 +1,150 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
module Text.Pandoc.Readers.Docx.Reducible ((<++>),
|
||||
(<+++>),
|
||||
Reducible,
|
||||
Container(..),
|
||||
container,
|
||||
innards,
|
||||
reduceList,
|
||||
reduceListB,
|
||||
rebuild)
|
||||
where
|
||||
|
||||
import Text.Pandoc.Builder
|
||||
import Data.List ((\\), intersect)
|
||||
|
||||
data Container a = Container ([a] -> a) | NullContainer
|
||||
|
||||
instance (Eq a) => Eq (Container a) where
|
||||
(Container x) == (Container y) = ((x []) == (y []))
|
||||
NullContainer == NullContainer = True
|
||||
_ == _ = False
|
||||
|
||||
instance (Show a) => Show (Container a) where
|
||||
show (Container x) = "Container {" ++
|
||||
(reverse $ drop 3 $ reverse $ show $ x []) ++
|
||||
"}"
|
||||
show (NullContainer) = "NullContainer"
|
||||
|
||||
class Reducible a where
|
||||
(<++>) :: a -> a -> [a]
|
||||
container :: a -> Container a
|
||||
innards :: a -> [a]
|
||||
isSpace :: a -> Bool
|
||||
|
||||
(<+++>) :: (Reducible a) => Many a -> Many a -> Many a
|
||||
mr <+++> ms = fromList $ reduceList $ toList mr ++ toList ms
|
||||
|
||||
reduceListB :: (Reducible a) => Many a -> Many a
|
||||
reduceListB = fromList . reduceList . toList
|
||||
|
||||
reduceList' :: (Reducible a) => [a] -> [a] -> [a]
|
||||
reduceList' acc [] = acc
|
||||
reduceList' [] (x:xs) = reduceList' [x] xs
|
||||
reduceList' as (x:xs) = reduceList' (init as ++ (last as <++> x) ) xs
|
||||
|
||||
reduceList :: (Reducible a) => [a] -> [a]
|
||||
reduceList = reduceList' []
|
||||
|
||||
combineReducibles :: (Reducible a, Eq a) => a -> a -> [a]
|
||||
combineReducibles r s =
|
||||
let (conts, rs) = topLevelContainers r
|
||||
(conts', ss) = topLevelContainers s
|
||||
shared = conts `intersect` conts'
|
||||
remaining = conts \\ shared
|
||||
remaining' = conts' \\ shared
|
||||
in
|
||||
case null shared of
|
||||
True -> case (not . null) rs && isSpace (last rs) of
|
||||
True -> rebuild conts (init rs) ++ [last rs, s]
|
||||
False -> [r,s]
|
||||
False -> rebuild
|
||||
shared $
|
||||
reduceList $
|
||||
(rebuild remaining rs) ++ (rebuild remaining' ss)
|
||||
|
||||
instance Reducible Inline where
|
||||
s1@(Span (id1, classes1, kvs1) ils1) <++> s2@(Span (id2, classes2, kvs2) ils2) =
|
||||
let classes' = classes1 `intersect` classes2
|
||||
kvs' = kvs1 `intersect` kvs2
|
||||
classes1' = classes1 \\ classes'
|
||||
kvs1' = kvs1 \\ kvs'
|
||||
classes2' = classes2 \\ classes'
|
||||
kvs2' = kvs2 \\ kvs'
|
||||
in
|
||||
case null classes' && null kvs' of
|
||||
True -> [s1,s2]
|
||||
False -> let attr' = ("", classes', kvs')
|
||||
attr1' = (id1, classes1', kvs1')
|
||||
attr2' = (id2, classes2', kvs2')
|
||||
s1' = case null classes1' && null kvs1' of
|
||||
True -> ils1
|
||||
False -> [Span attr1' ils1]
|
||||
s2' = case null classes2' && null kvs2' of
|
||||
True -> ils2
|
||||
False -> [Span attr2' ils2]
|
||||
in
|
||||
[Span attr' $ reduceList $ s1' ++ s2']
|
||||
|
||||
(Str x) <++> (Str y) = [Str (x++y)]
|
||||
il <++> il' = combineReducibles il il'
|
||||
|
||||
container (Emph _) = Container Emph
|
||||
container (Strong _) = Container Strong
|
||||
container (Strikeout _) = Container Strikeout
|
||||
container (Subscript _) = Container Subscript
|
||||
container (Superscript _) = Container Superscript
|
||||
container (Quoted qt _) = Container $ Quoted qt
|
||||
container (Cite cs _) = Container $ Cite cs
|
||||
container (Span attr _) = Container $ Span attr
|
||||
container _ = NullContainer
|
||||
|
||||
innards (Emph ils) = ils
|
||||
innards (Strong ils) = ils
|
||||
innards (Strikeout ils) = ils
|
||||
innards (Subscript ils) = ils
|
||||
innards (Superscript ils) = ils
|
||||
innards (Quoted _ ils) = ils
|
||||
innards (Cite _ ils) = ils
|
||||
innards (Span _ ils) = ils
|
||||
innards _ = []
|
||||
|
||||
isSpace Space = True
|
||||
isSpace _ = False
|
||||
|
||||
instance Reducible Block where
|
||||
(Div (ident, classes, kvs) blks) <++> blk | "list-item" `elem` classes =
|
||||
[Div (ident, classes, kvs) (reduceList blks), blk]
|
||||
|
||||
blk <++> blk' = combineReducibles blk blk'
|
||||
|
||||
container (BlockQuote _) = Container BlockQuote
|
||||
container (Div attr _) = Container $ Div attr
|
||||
container _ = NullContainer
|
||||
|
||||
innards (BlockQuote bs) = bs
|
||||
innards (Div _ bs) = bs
|
||||
innards _ = []
|
||||
|
||||
isSpace _ = False
|
||||
|
||||
|
||||
topLevelContainers' :: (Reducible a) => [a] -> ([Container a], [a])
|
||||
topLevelContainers' (r : []) = case container r of
|
||||
NullContainer -> ([], [r])
|
||||
_ ->
|
||||
let (conts, inns) = topLevelContainers' (innards r)
|
||||
in
|
||||
((container r) : conts, inns)
|
||||
topLevelContainers' rs = ([], rs)
|
||||
|
||||
topLevelContainers :: (Reducible a) => a -> ([Container a], [a])
|
||||
topLevelContainers il = topLevelContainers' [il]
|
||||
|
||||
rebuild :: [Container a] -> [a] -> [a]
|
||||
rebuild [] xs = xs
|
||||
rebuild ((Container f) : cs) xs = rebuild cs $ [f xs]
|
||||
rebuild (NullContainer : cs) xs = rebuild cs $ xs
|
||||
|
||||
|
|
@ -82,6 +82,10 @@ tests = [ testGroup "inlines"
|
|||
"normalizing inlines deep inside blocks"
|
||||
"docx.deep_normalize.docx"
|
||||
"docx.deep_normalize.native"
|
||||
, testCompare
|
||||
"move trailing spaces outside of formatting"
|
||||
"docx.trailing_spaces_in_formatting.docx"
|
||||
"docx.trailing_spaces_in_formatting.native"
|
||||
]
|
||||
, testGroup "blocks"
|
||||
[ testCompare
|
||||
|
|
BIN
tests/docx.trailing_spaces_in_formatting.docx
Normal file
BIN
tests/docx.trailing_spaces_in_formatting.docx
Normal file
Binary file not shown.
1
tests/docx.trailing_spaces_in_formatting.native
Normal file
1
tests/docx.trailing_spaces_in_formatting.native
Normal file
|
@ -0,0 +1 @@
|
|||
[Para [Str "Turn",Space,Str "my",Space,Emph [Str "formatting"],Space,Str "off",Space,Str "after",Space,Str "the",Space,Str "spaces."]]
|
Loading…
Add table
Reference in a new issue