Docx reader: Make metavalues out of styled paragraphs.
This will make paragraphs styled with `Author`, `Title`, `Subtitle`, `Date`, and `Abstract` into pandoc metavalues, rather than text. The implementation only takes those elements from the beginning of the document (ignoring empty paragraphs). Multiple paragraphs in the `Author` style will be made into a metaList, one paragraph per item. Hard linebreaks (shift-return) in the paragraph will be maintained, and can be used for institution, email, etc.
This commit is contained in:
parent
3eff3782c1
commit
840108a9c1
8 changed files with 99 additions and 11 deletions
|
@ -87,7 +87,7 @@ import Text.Pandoc.Readers.Docx.Reducible
|
|||
import Text.Pandoc.Readers.Docx.TexChar
|
||||
import Text.Pandoc.Shared
|
||||
import Data.Maybe (mapMaybe, fromMaybe)
|
||||
import Data.List (delete, isPrefixOf, (\\), intercalate)
|
||||
import Data.List (delete, isPrefixOf, (\\), intercalate, intersect)
|
||||
import qualified Data.ByteString as BS
|
||||
import qualified Data.ByteString.Lazy as B
|
||||
import Data.ByteString.Base64 (encode)
|
||||
|
@ -101,7 +101,8 @@ readDocx :: ReaderOptions
|
|||
-> Pandoc
|
||||
readDocx opts bytes =
|
||||
case archiveToDocx (toArchive bytes) of
|
||||
Right docx -> Pandoc nullMeta (docxToBlocks opts docx)
|
||||
Right docx -> Pandoc meta blks where
|
||||
(meta, blks) = (docxToMetaAndBlocks opts docx)
|
||||
Left _ -> error $ "couldn't parse docx file"
|
||||
|
||||
data DState = DState { docxAnchorMap :: M.Map String String
|
||||
|
@ -134,6 +135,65 @@ spansToKeep = []
|
|||
divsToKeep :: [String]
|
||||
divsToKeep = ["list-item", "Definition", "DefinitionTerm"]
|
||||
|
||||
metaStyles :: M.Map String String
|
||||
metaStyles = M.fromList [ ("Title", "title")
|
||||
, ("Subtitle", "subtitle")
|
||||
, ("Author", "author")
|
||||
, ("Date", "date")
|
||||
, ("Abstract", "abstract")]
|
||||
|
||||
sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart])
|
||||
sepBodyParts = span (\bp -> (isMetaPar bp || isEmptyPar bp))
|
||||
|
||||
isMetaPar :: BodyPart -> Bool
|
||||
isMetaPar (Paragraph pPr _) =
|
||||
not $ null $ intersect (pStyle pPr) (M.keys metaStyles)
|
||||
isMetaPar _ = False
|
||||
|
||||
isEmptyPar :: BodyPart -> Bool
|
||||
isEmptyPar (Paragraph _ parParts) =
|
||||
all isEmptyParPart parParts
|
||||
where
|
||||
isEmptyParPart (PlainRun (Run _ runElems)) = all isEmptyElem runElems
|
||||
isEmptyParPart _ = False
|
||||
isEmptyElem (TextRun s) = trim s == ""
|
||||
isEmptyElem _ = True
|
||||
isEmptyPar _ = False
|
||||
|
||||
bodyPartsToMeta' :: [BodyPart] -> DocxContext (M.Map String MetaValue)
|
||||
bodyPartsToMeta' [] = return M.empty
|
||||
bodyPartsToMeta' (bp : bps)
|
||||
| (Paragraph pPr parParts) <- bp
|
||||
, (c : _)<- intersect (pStyle pPr) (M.keys metaStyles)
|
||||
, (Just metaField) <- M.lookup c metaStyles = do
|
||||
inlines <- parPartsToInlines parParts
|
||||
remaining <- bodyPartsToMeta' bps
|
||||
let
|
||||
f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils']
|
||||
f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks ((Para ils) : blks)
|
||||
f m (MetaList mv) = MetaList (m : mv)
|
||||
f m n = MetaList [m, n]
|
||||
return $ M.insertWith f metaField (MetaInlines inlines) remaining
|
||||
bodyPartsToMeta' (_ : bps) = bodyPartsToMeta' bps
|
||||
|
||||
bodyPartsToMeta :: [BodyPart] -> DocxContext Meta
|
||||
bodyPartsToMeta bps = do
|
||||
mp <- bodyPartsToMeta' bps
|
||||
let mp' =
|
||||
case M.lookup "author" mp of
|
||||
Just mv -> M.insert "author" (fixAuthors mv) mp
|
||||
Nothing -> mp
|
||||
return $ Meta mp'
|
||||
|
||||
fixAuthors :: MetaValue -> MetaValue
|
||||
fixAuthors (MetaBlocks blks) =
|
||||
MetaList $ map g $ filter f blks
|
||||
where f (Para _) = True
|
||||
f _ = False
|
||||
g (Para ils) = MetaInlines ils
|
||||
g _ = MetaInlines []
|
||||
fixAuthors mv = mv
|
||||
|
||||
runStyleToContainers :: RunStyle -> [Container Inline]
|
||||
runStyleToContainers rPr =
|
||||
let spanClassToContainers :: String -> [Container Inline]
|
||||
|
@ -615,24 +675,26 @@ rewriteLink l@(Link ils ('#':target, title)) = do
|
|||
Nothing -> l
|
||||
rewriteLink il = return il
|
||||
|
||||
|
||||
bodyToBlocks :: Body -> DocxContext [Block]
|
||||
bodyToBlocks (Body bps) = do
|
||||
blks <- concatMapM bodyPartToBlocks bps >>=
|
||||
bodyToMetaAndBlocks :: Body -> DocxContext (Meta, [Block])
|
||||
bodyToMetaAndBlocks (Body bps) = do
|
||||
let (metabps, blkbps) = sepBodyParts bps
|
||||
meta <- bodyPartsToMeta metabps
|
||||
blks <- concatMapM bodyPartToBlocks blkbps >>=
|
||||
walkM rewriteLink
|
||||
return $
|
||||
blocksToDefinitions $
|
||||
blocksToBullets $ blks
|
||||
(meta,
|
||||
blocksToDefinitions $
|
||||
blocksToBullets $ blks)
|
||||
|
||||
docxToBlocks :: ReaderOptions -> Docx -> [Block]
|
||||
docxToBlocks opts d@(Docx (Document _ body)) =
|
||||
docxToMetaAndBlocks :: ReaderOptions -> Docx -> (Meta, [Block])
|
||||
docxToMetaAndBlocks opts d@(Docx (Document _ body)) =
|
||||
let dState = DState { docxAnchorMap = M.empty
|
||||
, docxInHeaderBlock = False
|
||||
, docxInTexSubscript = False}
|
||||
dEnv = DEnv { docxOptions = opts
|
||||
, docxDocument = d}
|
||||
in
|
||||
evalDocxContext (bodyToBlocks body) dEnv dState
|
||||
evalDocxContext (bodyToMetaAndBlocks body) dEnv dState
|
||||
|
||||
ilToCode :: Inline -> String
|
||||
ilToCode (Str s) = s
|
||||
|
|
|
@ -164,5 +164,20 @@ tests = [ testGroup "inlines"
|
|||
"docx.track_changes_deletion.docx"
|
||||
"docx.track_changes_deletion_all.native"
|
||||
]
|
||||
, testGroup "metadata"
|
||||
[ testCompareWithOpts def{readerStandalone=True}
|
||||
"metadata fields"
|
||||
"docx.metadata.docx"
|
||||
"docx.metadata.native"
|
||||
, testCompareWithOpts def{readerStandalone=True}
|
||||
"linebreak between authors"
|
||||
"docx.metadata_author_linebreak.docx"
|
||||
"docx.metadata_author_linebreak.native"
|
||||
, testCompareWithOpts def{readerStandalone=True}
|
||||
"stop recording metadata with normal text"
|
||||
"docx.metadata_after_normal.docx"
|
||||
"docx.metadata_after_normal.native"
|
||||
]
|
||||
|
||||
]
|
||||
|
||||
|
|
BIN
tests/docx.metadata.docx
Normal file
BIN
tests/docx.metadata.docx
Normal file
Binary file not shown.
2
tests/docx.metadata.native
Normal file
2
tests/docx.metadata.native
Normal file
|
@ -0,0 +1,2 @@
|
|||
Pandoc (Meta {unMeta = fromList [("abstract",MetaInlines [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]),("author",MetaList [MetaInlines [Str "Mary",Space,Str "Ann",Space,Str "Evans"],MetaInlines [Str "Aurore",Space,Str "Dupin"]]),("date",MetaInlines [Str "July",Space,Str "28,",Space,Str "2014"]),("title",MetaInlines [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"])]})
|
||||
[Para [Str "And",Space,Str "now",Space,Str "this",Space,Str "is",Space,Str "normal",Space,Str "text."]]
|
BIN
tests/docx.metadata_after_normal.docx
Normal file
BIN
tests/docx.metadata_after_normal.docx
Normal file
Binary file not shown.
7
tests/docx.metadata_after_normal.native
Normal file
7
tests/docx.metadata_after_normal.native
Normal file
|
@ -0,0 +1,7 @@
|
|||
Pandoc (Meta {unMeta = fromList [("abstract",MetaInlines [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]),("author",MetaList [MetaInlines [Str "Mary",Space,Str "Ann",Space,Str "Evans"],MetaInlines [Str "Aurore",Space,Str "Dupin"]]),("date",MetaInlines [Str "July",Space,Str "28,",Space,Str "2014"]),("title",MetaInlines [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"])]})
|
||||
[Para [Str "And",Space,Str "now",Space,Str "this",Space,Str "is",Space,Str "normal",Space,Str "text."]
|
||||
,Para [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"]
|
||||
,Para [Str "Mary",Space,Str "Ann",Space,Str "Evans"]
|
||||
,Para [Str "Aurore",Space,Str "Dupin"]
|
||||
,Para [Str "July",Space,Str "28,",Space,Str "2014"]
|
||||
,Para [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]]
|
BIN
tests/docx.metadata_author_linebreak.docx
Normal file
BIN
tests/docx.metadata_author_linebreak.docx
Normal file
Binary file not shown.
2
tests/docx.metadata_author_linebreak.native
Normal file
2
tests/docx.metadata_author_linebreak.native
Normal file
|
@ -0,0 +1,2 @@
|
|||
Pandoc (Meta {unMeta = fromList [("abstract",MetaInlines [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]),("author",MetaList [MetaInlines [Str "Mary",Space,Str "Ann",Space,Str "Evans"],MetaInlines [Str "Aurore",Space,Str "Dupin"]]),("date",MetaInlines [Str "July",Space,Str "28,",Space,Str "2014"]),("title",MetaInlines [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"])]})
|
||||
[Para [Str "And",Space,Str "now",Space,Str "this",Space,Str "is",Space,Str "normal",Space,Str "text."]]
|
Loading…
Add table
Reference in a new issue