Merge branch 'docx' of https://github.com/jkr/pandoc into jkr-docx
This commit is contained in:
commit
bec9f3c641
27 changed files with 1511 additions and 33 deletions
18
README
18
README
|
@ -13,15 +13,15 @@ Description
|
|||
Pandoc is a [Haskell] library for converting from one markup format to
|
||||
another, and a command-line tool that uses this library. It can read
|
||||
[markdown] and (subsets of) [Textile], [reStructuredText], [HTML],
|
||||
[LaTeX], [MediaWiki markup], [Haddock markup], [OPML], [Emacs Org-mode]
|
||||
and [DocBook]; and it can write plain text, [markdown],
|
||||
[reStructuredText], [XHTML], [HTML 5], [LaTeX] (including [beamer] slide
|
||||
shows), [ConTeXt], [RTF], [OPML], [DocBook], [OpenDocument], [ODT],
|
||||
[Word docx], [GNU Texinfo], [MediaWiki markup], [EPUB] (v2 or v3),
|
||||
[FictionBook2], [Textile], [groff man] pages, [Emacs Org-Mode],
|
||||
[AsciiDoc], [InDesign ICML], and [Slidy], [Slideous], [DZSlides],
|
||||
[reveal.js] or [S5] HTML slide shows. It can also produce [PDF] output
|
||||
on systems where LaTeX is installed.
|
||||
[LaTeX], [MediaWiki markup], [Haddock markup], [OPML], [Emacs
|
||||
Org-mode], [DocBook], and [Word docx]; and it can write plain text,
|
||||
[markdown], [reStructuredText], [XHTML], [HTML 5], [LaTeX] (including
|
||||
[beamer] slide shows), [ConTeXt], [RTF], [OPML], [DocBook],
|
||||
[OpenDocument], [ODT], [Word docx], [GNU Texinfo], [MediaWiki markup],
|
||||
[EPUB] (v2 or v3), [FictionBook2], [Textile], [groff man] pages,
|
||||
[Emacs Org-Mode], [AsciiDoc], [InDesign ICML], and [Slidy],
|
||||
[Slideous], [DZSlides], [reveal.js] or [S5] HTML slide shows. It can
|
||||
also produce [PDF] output on systems where LaTeX is installed.
|
||||
|
||||
Pandoc's enhanced version of markdown includes syntax for footnotes,
|
||||
tables, flexible ordered lists, definition lists, fenced code blocks,
|
||||
|
|
24
pandoc.cabal
24
pandoc.cabal
|
@ -184,7 +184,25 @@ Extra-Source-Files:
|
|||
tests/fb2.math.markdown,
|
||||
tests/fb2.math.fb2,
|
||||
tests/fb2.test-small.png,
|
||||
tests/fb2.test.jpg
|
||||
tests/fb2.test.jpg,
|
||||
tests/docx.block_quotes.docx,
|
||||
tests/docx.block_quotes_parse_indent.native,
|
||||
tests/docx.headers.docx,
|
||||
tests/docx.headers.native,
|
||||
tests/docx.image.docx,
|
||||
tests/docx.image_no_embed.native,
|
||||
tests/docx.inline_formatting.docx,
|
||||
tests/docx.inline_formatting.native,
|
||||
tests/docx.links.docx,
|
||||
tests/docx.links.native,
|
||||
tests/docx.lists.docx,
|
||||
tests/docx.lists.native,
|
||||
tests/docx.notes.docx,
|
||||
tests/docx.notes.native,
|
||||
tests/docx.tables.docx,
|
||||
tests/docx.tables.native,
|
||||
tests/docx.unicode.docx,
|
||||
tests/docx.unicode.native
|
||||
Extra-Tmp-Files: man/man1/pandoc.1,
|
||||
man/man5/pandoc_markdown.5
|
||||
|
||||
|
@ -275,6 +293,7 @@ Library
|
|||
Text.Pandoc.Readers.Textile,
|
||||
Text.Pandoc.Readers.Native,
|
||||
Text.Pandoc.Readers.Haddock,
|
||||
Text.Pandoc.Readers.DocX,
|
||||
Text.Pandoc.Writers.Native,
|
||||
Text.Pandoc.Writers.Docbook,
|
||||
Text.Pandoc.Writers.OPML,
|
||||
|
@ -305,6 +324,8 @@ Library
|
|||
Text.Pandoc.Process
|
||||
Other-Modules: Text.Pandoc.Readers.Haddock.Lex,
|
||||
Text.Pandoc.Readers.Haddock.Parse,
|
||||
Text.Pandoc.Readers.DocX.Lists,
|
||||
Text.Pandoc.Readers.DocX.Parse,
|
||||
Text.Pandoc.Writers.Shared,
|
||||
Text.Pandoc.Asciify,
|
||||
Text.Pandoc.MIME,
|
||||
|
@ -390,6 +411,7 @@ Test-Suite test-pandoc
|
|||
Tests.Readers.Markdown
|
||||
Tests.Readers.Org
|
||||
Tests.Readers.RST
|
||||
Tests.Readers.DocX
|
||||
Tests.Writers.Native
|
||||
Tests.Writers.ConTeXt
|
||||
Tests.Writers.HTML
|
||||
|
|
13
pandoc.hs
13
pandoc.hs
|
@ -858,6 +858,7 @@ defaultReaderName fallback (x:xs) =
|
|||
".textile" -> "textile"
|
||||
".native" -> "native"
|
||||
".json" -> "json"
|
||||
".docx" -> "docx"
|
||||
_ -> defaultReaderName fallback xs
|
||||
|
||||
-- Returns True if extension of first source is .lhs
|
||||
|
@ -1158,15 +1159,21 @@ main = do
|
|||
Left e -> throwIO e
|
||||
Right (bs,_) -> return $ UTF8.toString bs
|
||||
|
||||
let readFiles [] = error "Cannot read archive from stdin"
|
||||
readFiles (x:_) = B.readFile x
|
||||
|
||||
let convertTabs = tabFilter (if preserveTabs then 0 else tabStop)
|
||||
|
||||
let handleIncludes' = if readerName' == "latex" || readerName' == "latex+lhs"
|
||||
then handleIncludes
|
||||
else return
|
||||
|
||||
doc <- readSources sources >>=
|
||||
handleIncludes' . convertTabs . intercalate "\n" >>=
|
||||
reader readerOpts
|
||||
doc <- case reader of
|
||||
StringReader r->
|
||||
readSources sources >>=
|
||||
handleIncludes' . convertTabs . intercalate "\n" >>=
|
||||
r readerOpts
|
||||
ByteStringReader r -> readFiles sources >>= r readerOpts
|
||||
|
||||
|
||||
let doc0 = M.foldWithKey setMeta doc metadata
|
||||
|
|
|
@ -62,6 +62,8 @@ module Text.Pandoc
|
|||
, readers
|
||||
, writers
|
||||
-- * Readers: converting /to/ Pandoc format
|
||||
, Reader (..)
|
||||
, readDocX
|
||||
, readMarkdown
|
||||
, readMediaWiki
|
||||
, readRST
|
||||
|
@ -125,6 +127,7 @@ import Text.Pandoc.Readers.HTML
|
|||
import Text.Pandoc.Readers.Textile
|
||||
import Text.Pandoc.Readers.Native
|
||||
import Text.Pandoc.Readers.Haddock
|
||||
import Text.Pandoc.Readers.DocX
|
||||
import Text.Pandoc.Writers.Native
|
||||
import Text.Pandoc.Writers.Markdown
|
||||
import Text.Pandoc.Writers.RST
|
||||
|
@ -192,24 +195,34 @@ markdown o s = do
|
|||
mapM_ warn warnings
|
||||
return doc
|
||||
|
||||
data Reader = StringReader (ReaderOptions -> String -> IO Pandoc)
|
||||
| ByteStringReader (ReaderOptions -> BL.ByteString -> IO Pandoc)
|
||||
|
||||
mkStringReader :: (ReaderOptions -> String -> Pandoc) -> Reader
|
||||
mkStringReader r = StringReader (\o s -> return $ r o s)
|
||||
|
||||
mkBSReader :: (ReaderOptions -> BL.ByteString -> Pandoc) -> Reader
|
||||
mkBSReader r = ByteStringReader (\o s -> return $ r o s)
|
||||
|
||||
-- | Association list of formats and readers.
|
||||
readers :: [(String, ReaderOptions -> String -> IO Pandoc)]
|
||||
readers = [ ("native" , \_ s -> return $ readNative s)
|
||||
,("json" , \o s -> return $ readJSON o s)
|
||||
,("markdown" , markdown)
|
||||
,("markdown_strict" , markdown)
|
||||
,("markdown_phpextra" , markdown)
|
||||
,("markdown_github" , markdown)
|
||||
,("markdown_mmd", markdown)
|
||||
,("rst" , \o s -> return $ readRST o s)
|
||||
,("mediawiki" , \o s -> return $ readMediaWiki o s)
|
||||
,("docbook" , \o s -> return $ readDocBook o s)
|
||||
,("opml" , \o s -> return $ readOPML o s)
|
||||
,("org" , \o s -> return $ readOrg o s)
|
||||
,("textile" , \o s -> return $ readTextile o s) -- TODO : textile+lhs
|
||||
,("html" , \o s -> return $ readHtml o s)
|
||||
,("latex" , \o s -> return $ readLaTeX o s)
|
||||
,("haddock" , \o s -> return $ readHaddock o s)
|
||||
readers :: [(String, Reader)]
|
||||
readers = [ ("native" , StringReader $ \_ s -> return $ readNative s)
|
||||
,("json" , mkStringReader readJSON )
|
||||
,("markdown" , StringReader markdown)
|
||||
,("markdown_strict" , StringReader markdown)
|
||||
,("markdown_phpextra" , StringReader markdown)
|
||||
,("markdown_github" , StringReader markdown)
|
||||
,("markdown_mmd", StringReader markdown)
|
||||
,("rst" , mkStringReader readRST )
|
||||
,("mediawiki" , mkStringReader readMediaWiki)
|
||||
,("docbook" , mkStringReader readDocBook)
|
||||
,("opml" , mkStringReader readOPML)
|
||||
,("org" , mkStringReader readOrg)
|
||||
,("textile" , mkStringReader readTextile) -- TODO : textile+lhs
|
||||
,("html" , mkStringReader readHtml)
|
||||
,("latex" , mkStringReader readLaTeX)
|
||||
,("haddock" , mkStringReader readHaddock)
|
||||
,("docx" , mkBSReader readDocX)
|
||||
]
|
||||
|
||||
data Writer = PureStringWriter (WriterOptions -> Pandoc -> String)
|
||||
|
@ -280,14 +293,17 @@ getDefaultExtensions "textile" = Set.fromList [Ext_auto_identifiers, Ext
|
|||
getDefaultExtensions _ = Set.fromList [Ext_auto_identifiers]
|
||||
|
||||
-- | Retrieve reader based on formatSpec (format+extensions).
|
||||
getReader :: String -> Either String (ReaderOptions -> String -> IO Pandoc)
|
||||
getReader :: String -> Either String Reader
|
||||
getReader s =
|
||||
case parseFormatSpec s of
|
||||
Left e -> Left $ intercalate "\n" $ [m | Message m <- errorMessages e]
|
||||
Right (readerName, setExts) ->
|
||||
Right (readerName, setExts) ->
|
||||
case lookup readerName readers of
|
||||
Nothing -> Left $ "Unknown reader: " ++ readerName
|
||||
Just r -> Right $ \o ->
|
||||
Just (StringReader r) -> Right $ StringReader $ \o ->
|
||||
r o{ readerExtensions = setExts $
|
||||
getDefaultExtensions readerName }
|
||||
Just (ByteStringReader r) -> Right $ ByteStringReader $ \o ->
|
||||
r o{ readerExtensions = setExts $
|
||||
getDefaultExtensions readerName }
|
||||
|
||||
|
|
479
src/Text/Pandoc/Readers/DocX.hs
Normal file
479
src/Text/Pandoc/Readers/DocX.hs
Normal file
|
@ -0,0 +1,479 @@
|
|||
{-
|
||||
Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
-}
|
||||
|
||||
{- |
|
||||
Module : Text.Pandoc.Readers.DocX
|
||||
Copyright : Copyright (C) 2014 Jesse Rosenthal
|
||||
License : GNU GPL, version 2 or above
|
||||
|
||||
Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
|
||||
Stability : alpha
|
||||
Portability : portable
|
||||
|
||||
Conversion of DocX type (defined in Text.Pandoc.Readers.DocX.Parse)
|
||||
to 'Pandoc' document. -}
|
||||
|
||||
{-
|
||||
Current state of implementation of DocX entities ([x] means
|
||||
implemented, [-] means partially implemented):
|
||||
|
||||
* Blocks
|
||||
|
||||
- [X] Para
|
||||
- [X] CodeBlock (styled with `SourceCode`)
|
||||
- [X] BlockQuote (styled with `Quote`, `BlockQuote`, or, optionally,
|
||||
indented)
|
||||
- [X] OrderedList
|
||||
- [X] BulletList
|
||||
- [X] DefinitionList (styled with adjacent `DefinitionTerm` and `Definition`)
|
||||
- [X] Header (styled with `Heading#`)
|
||||
- [ ] HorizontalRule
|
||||
- [-] Table (column widths and alignments not yet implemented)
|
||||
|
||||
* Inlines
|
||||
|
||||
- [X] Str
|
||||
- [X] Emph (From italics. `underline` currently read as span. In
|
||||
future, it might optionally be emph as well)
|
||||
- [X] Strong
|
||||
- [X] Strikeout
|
||||
- [X] Superscript
|
||||
- [X] Subscript
|
||||
- [X] SmallCaps
|
||||
- [ ] Quoted
|
||||
- [ ] Cite
|
||||
- [X] Code (styled with `VerbatimChar`)
|
||||
- [X] Space
|
||||
- [X] LineBreak (these are invisible in Word: entered with Shift-Return)
|
||||
- [ ] Math
|
||||
- [X] Link (links to an arbitrary bookmark create a span with the target as
|
||||
id and "anchor" class)
|
||||
- [-] Image (Links to path in archive. Future option for
|
||||
data-encoded URI likely.)
|
||||
- [X] Note (Footnotes and Endnotes are silently combined.)
|
||||
-}
|
||||
|
||||
module Text.Pandoc.Readers.DocX
|
||||
( readDocX
|
||||
) where
|
||||
|
||||
import Codec.Archive.Zip
|
||||
import Text.Pandoc.Definition
|
||||
import Text.Pandoc.Options
|
||||
import Text.Pandoc.Builder (text, toList)
|
||||
import Text.Pandoc.Generic (bottomUp)
|
||||
import Text.Pandoc.MIME (getMimeType)
|
||||
import Text.Pandoc.UTF8 (toString)
|
||||
import Text.Pandoc.Readers.DocX.Parse
|
||||
import Text.Pandoc.Readers.DocX.Lists
|
||||
import Data.Maybe (mapMaybe, isJust, fromJust)
|
||||
import Data.List (delete, isPrefixOf, (\\), intersect)
|
||||
import qualified Data.ByteString as BS
|
||||
import qualified Data.ByteString.Lazy as B
|
||||
import Data.ByteString.Base64 (encode)
|
||||
import System.FilePath (combine)
|
||||
|
||||
readDocX :: ReaderOptions
|
||||
-> B.ByteString
|
||||
-> Pandoc
|
||||
readDocX opts bytes =
|
||||
case archiveToDocX (toArchive bytes) of
|
||||
Just docx -> Pandoc nullMeta (docxToBlocks opts docx)
|
||||
Nothing -> error $ "couldn't parse docx file"
|
||||
|
||||
runStyleToSpanAttr :: RunStyle -> (String, [String], [(String, String)])
|
||||
runStyleToSpanAttr rPr = ("",
|
||||
mapMaybe id [
|
||||
if isBold rPr then (Just "strong") else Nothing,
|
||||
if isItalic rPr then (Just "emph") else Nothing,
|
||||
if isSmallCaps rPr then (Just "smallcaps") else Nothing,
|
||||
if isStrike rPr then (Just "strike") else Nothing,
|
||||
if isSuperScript rPr then (Just "superscript") else Nothing,
|
||||
if isSubScript rPr then (Just "subscript") else Nothing,
|
||||
rStyle rPr],
|
||||
case underline rPr of
|
||||
Just fmt -> [("underline", fmt)]
|
||||
_ -> []
|
||||
)
|
||||
|
||||
parStyleToDivAttr :: ParagraphStyle -> (String, [String], [(String, String)])
|
||||
parStyleToDivAttr pPr = ("",
|
||||
pStyle pPr,
|
||||
case indent pPr of
|
||||
Just n -> [("indent", (show n))]
|
||||
Nothing -> []
|
||||
)
|
||||
|
||||
strToInlines :: String -> [Inline]
|
||||
strToInlines = toList . text
|
||||
|
||||
codeSpans :: [String]
|
||||
codeSpans = ["VerbatimChar"]
|
||||
|
||||
blockQuoteDivs :: [String]
|
||||
blockQuoteDivs = ["Quote", "BlockQuote"]
|
||||
|
||||
codeDivs :: [String]
|
||||
codeDivs = ["SourceCode"]
|
||||
|
||||
runElemToInlines :: RunElem -> [Inline]
|
||||
runElemToInlines (TextRun s) = strToInlines s
|
||||
runElemToInlines (LnBrk) = [LineBreak]
|
||||
|
||||
runElemToString :: RunElem -> String
|
||||
runElemToString (TextRun s) = s
|
||||
runElemToString (LnBrk) = ['\n']
|
||||
|
||||
runElemsToString :: [RunElem] -> String
|
||||
runElemsToString = concatMap runElemToString
|
||||
|
||||
strNormalize :: [Inline] -> [Inline]
|
||||
strNormalize [] = []
|
||||
strNormalize (Str "" : ils) = strNormalize ils
|
||||
strNormalize ((Str s) : (Str s') : l) = strNormalize ((Str (s++s')) : l)
|
||||
strNormalize (il:ils) = il : (strNormalize ils)
|
||||
|
||||
runToInlines :: ReaderOptions -> DocX -> Run -> [Inline]
|
||||
runToInlines _ _ (Run rs runElems)
|
||||
| isJust (rStyle rs) && (fromJust (rStyle rs)) `elem` codeSpans =
|
||||
case runStyleToSpanAttr rs == ("", [], []) of
|
||||
True -> [Str (runElemsToString runElems)]
|
||||
False -> [Span (runStyleToSpanAttr rs) [Str (runElemsToString runElems)]]
|
||||
| otherwise = case runStyleToSpanAttr rs == ("", [], []) of
|
||||
True -> concatMap runElemToInlines runElems
|
||||
False -> [Span (runStyleToSpanAttr rs) (concatMap runElemToInlines runElems)]
|
||||
runToInlines opts docx@(DocX _ notes _ _ _ ) (Footnote fnId) =
|
||||
case (getFootNote fnId notes) of
|
||||
Just bodyParts ->
|
||||
[Note [Div ("", ["footnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]]
|
||||
Nothing ->
|
||||
[Note [Div ("", ["footnote"], []) []]]
|
||||
runToInlines opts docx@(DocX _ notes _ _ _) (Endnote fnId) =
|
||||
case (getEndNote fnId notes) of
|
||||
Just bodyParts ->
|
||||
[Note [Div ("", ["endnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]]
|
||||
Nothing ->
|
||||
[Note [Div ("", ["endnote"], []) []]]
|
||||
|
||||
parPartToInlines :: ReaderOptions -> DocX -> ParPart -> [Inline]
|
||||
parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r
|
||||
parPartToInlines _ _ (BookMark _ anchor) =
|
||||
[Span (anchor, ["anchor"], []) []]
|
||||
parPartToInlines _ (DocX _ _ _ rels _) (Drawing relid) =
|
||||
case lookupRelationship relid rels of
|
||||
Just target -> [Image [] (combine "word" target, "")]
|
||||
Nothing -> [Image [] ("", "")]
|
||||
parPartToInlines opts docx (InternalHyperLink anchor runs) =
|
||||
[Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")]
|
||||
parPartToInlines opts docx@(DocX _ _ _ rels _) (ExternalHyperLink relid runs) =
|
||||
case lookupRelationship relid rels of
|
||||
Just target ->
|
||||
[Link (concatMap (runToInlines opts docx) runs) (target, "")]
|
||||
Nothing ->
|
||||
[Link (concatMap (runToInlines opts docx) runs) ("", "")]
|
||||
|
||||
isAnchorSpan :: Inline -> Bool
|
||||
isAnchorSpan (Span (ident, classes, kvs) ils) =
|
||||
(not . null) ident &&
|
||||
classes == ["anchor"] &&
|
||||
null kvs &&
|
||||
null ils
|
||||
isAnchorSpan _ = False
|
||||
|
||||
dummyAnchors :: [String]
|
||||
dummyAnchors = ["_GoBack"]
|
||||
|
||||
makeHeaderAnchors :: Block -> Block
|
||||
makeHeaderAnchors h@(Header n (_, classes, kvs) ils) =
|
||||
case filter isAnchorSpan ils of
|
||||
[] -> h
|
||||
(x@(Span (ident, _, _) _) : xs) ->
|
||||
case ident `elem` dummyAnchors of
|
||||
True -> h
|
||||
False -> Header n (ident, classes, kvs) (ils \\ (x:xs))
|
||||
_ -> h
|
||||
makeHeaderAnchors blk = blk
|
||||
|
||||
|
||||
parPartsToInlines :: ReaderOptions -> DocX -> [ParPart] -> [Inline]
|
||||
parPartsToInlines opts docx parparts =
|
||||
--
|
||||
-- We're going to skip data-uri's for now. It should be an option,
|
||||
-- not mandatory.
|
||||
--
|
||||
--bottomUp (makeImagesSelfContained docx) $
|
||||
bottomUp spanCorrect $
|
||||
bottomUp spanTrim $
|
||||
bottomUp spanReduce $
|
||||
concatMap (parPartToInlines opts docx) parparts
|
||||
|
||||
cellToBlocks :: ReaderOptions -> DocX -> Cell -> [Block]
|
||||
cellToBlocks opts docx (Cell bps) = map (bodyPartToBlock opts docx) bps
|
||||
|
||||
rowToBlocksList :: ReaderOptions -> DocX -> Row -> [[Block]]
|
||||
rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells
|
||||
|
||||
bodyPartToBlock :: ReaderOptions -> DocX -> BodyPart -> Block
|
||||
bodyPartToBlock opts docx (Paragraph pPr parparts) =
|
||||
Div (parStyleToDivAttr pPr) [Para (parPartsToInlines opts docx parparts)]
|
||||
bodyPartToBlock opts docx@(DocX _ _ numbering _ _) (ListItem pPr numId lvl parparts) =
|
||||
let
|
||||
kvs = case lookupLevel numId lvl numbering of
|
||||
Just (_, fmt, txt, Just start) -> [ ("level", lvl)
|
||||
, ("num-id", numId)
|
||||
, ("format", fmt)
|
||||
, ("text", txt)
|
||||
, ("start", (show start))
|
||||
]
|
||||
|
||||
Just (_, fmt, txt, Nothing) -> [ ("level", lvl)
|
||||
, ("num-id", numId)
|
||||
, ("format", fmt)
|
||||
, ("text", txt)
|
||||
]
|
||||
Nothing -> []
|
||||
in
|
||||
Div
|
||||
("", ["list-item"], kvs)
|
||||
[bodyPartToBlock opts docx (Paragraph pPr parparts)]
|
||||
bodyPartToBlock _ _ (Tbl _ _ _ []) =
|
||||
Para []
|
||||
bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) =
|
||||
let caption = strToInlines cap
|
||||
(hdr, rows) = case firstRowFormatting look of
|
||||
True -> (Just r, rs)
|
||||
False -> (Nothing, r:rs)
|
||||
hdrCells = case hdr of
|
||||
Just r' -> rowToBlocksList opts docx r'
|
||||
Nothing -> []
|
||||
cells = map (rowToBlocksList opts docx) rows
|
||||
|
||||
size = case null hdrCells of
|
||||
True -> length $ head cells
|
||||
False -> length $ hdrCells
|
||||
--
|
||||
-- The two following variables (horizontal column alignment and
|
||||
-- relative column widths) go to the default at the
|
||||
-- moment. Width information is in the TblGrid field of the Tbl,
|
||||
-- so should be possible. Alignment might be more difficult,
|
||||
-- since there doesn't seem to be a column entity in docx.
|
||||
alignments = take size (repeat AlignDefault)
|
||||
widths = take size (repeat 0) :: [Double]
|
||||
in
|
||||
Table caption alignments widths hdrCells cells
|
||||
|
||||
makeImagesSelfContained :: DocX -> Inline -> Inline
|
||||
makeImagesSelfContained (DocX _ _ _ _ media) i@(Image alt (uri, title)) =
|
||||
case lookup uri media of
|
||||
Just bs -> case getMimeType uri of
|
||||
Just mime -> let data_uri =
|
||||
"data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs)
|
||||
in
|
||||
Image alt (data_uri, title)
|
||||
Nothing -> i
|
||||
Nothing -> i
|
||||
makeImagesSelfContained _ inline = inline
|
||||
|
||||
bodyToBlocks :: ReaderOptions -> DocX -> Body -> [Block]
|
||||
bodyToBlocks opts docx (Body bps) =
|
||||
bottomUp removeEmptyPars $
|
||||
bottomUp strNormalize $
|
||||
bottomUp spanRemove $
|
||||
bottomUp divRemove $
|
||||
map (makeHeaderAnchors) $
|
||||
bottomUp divCorrect $
|
||||
bottomUp divReduce $
|
||||
bottomUp divCorrectPreReduce $
|
||||
bottomUp blocksToDefinitions $
|
||||
blocksToBullets $
|
||||
map (bodyPartToBlock opts docx) bps
|
||||
|
||||
docxToBlocks :: ReaderOptions -> DocX -> [Block]
|
||||
docxToBlocks opts d@(DocX (Document _ body) _ _ _ _) = bodyToBlocks opts d body
|
||||
|
||||
spanReduce :: [Inline] -> [Inline]
|
||||
spanReduce [] = []
|
||||
spanReduce ((Span (id1, classes1, kvs1) ils1) : ils)
|
||||
| (id1, classes1, kvs1) == ("", [], []) = ils1 ++ (spanReduce ils)
|
||||
spanReduce (s1@(Span (id1, classes1, kvs1) ils1) :
|
||||
s2@(Span (id2, classes2, kvs2) ils2) :
|
||||
ils) =
|
||||
let classes' = classes1 `intersect` classes2
|
||||
kvs' = kvs1 `intersect` kvs2
|
||||
classes1' = classes1 \\ classes'
|
||||
kvs1' = kvs1 \\ kvs'
|
||||
classes2' = classes2 \\ classes'
|
||||
kvs2' = kvs2 \\ kvs'
|
||||
in
|
||||
case null classes' && null kvs' of
|
||||
True -> s1 : (spanReduce (s2 : ils))
|
||||
False -> let attr' = ("", classes', kvs')
|
||||
attr1' = (id1, classes1', kvs1')
|
||||
attr2' = (id2, classes2', kvs2')
|
||||
in
|
||||
spanReduce (Span attr' [(Span attr1' ils1), (Span attr2' ils2)] :
|
||||
ils)
|
||||
spanReduce (il:ils) = il : (spanReduce ils)
|
||||
|
||||
ilToCode :: Inline -> String
|
||||
ilToCode (Str s) = s
|
||||
ilToCode _ = ""
|
||||
|
||||
spanRemove' :: Inline -> [Inline]
|
||||
spanRemove' s@(Span (ident, classes, _) [])
|
||||
-- "_GoBack" is automatically inserted. We don't want to keep it.
|
||||
| classes == ["anchor"] && not (ident `elem` dummyAnchors) = [s]
|
||||
spanRemove' (Span (_, _, kvs) ils) =
|
||||
case lookup "underline" kvs of
|
||||
Just val -> [Span ("", [], [("underline", val)]) ils]
|
||||
Nothing -> ils
|
||||
spanRemove' il = [il]
|
||||
|
||||
spanRemove :: [Inline] -> [Inline]
|
||||
spanRemove = concatMap spanRemove'
|
||||
|
||||
spanTrim' :: Inline -> [Inline]
|
||||
spanTrim' il@(Span _ []) = [il]
|
||||
spanTrim' il@(Span attr (il':[]))
|
||||
| il' == Space = [Span attr [], Space]
|
||||
| otherwise = [il]
|
||||
spanTrim' (Span attr ils)
|
||||
| head ils == Space && last ils == Space =
|
||||
[Space, Span attr (init $ tail ils), Space]
|
||||
| head ils == Space = [Space, Span attr (tail ils)]
|
||||
| last ils == Space = [Span attr (init ils), Space]
|
||||
spanTrim' il = [il]
|
||||
|
||||
spanTrim :: [Inline] -> [Inline]
|
||||
spanTrim = concatMap spanTrim'
|
||||
|
||||
spanCorrect' :: Inline -> [Inline]
|
||||
spanCorrect' (Span ("", [], []) ils) = ils
|
||||
spanCorrect' (Span (ident, classes, kvs) ils)
|
||||
| "emph" `elem` classes =
|
||||
[Emph $ spanCorrect' $ Span (ident, (delete "emph" classes), kvs) ils]
|
||||
| "strong" `elem` classes =
|
||||
[Strong $ spanCorrect' $ Span (ident, (delete "strong" classes), kvs) ils]
|
||||
| "smallcaps" `elem` classes =
|
||||
[SmallCaps $ spanCorrect' $ Span (ident, (delete "smallcaps" classes), kvs) ils]
|
||||
| "strike" `elem` classes =
|
||||
[Strikeout $ spanCorrect' $ Span (ident, (delete "strike" classes), kvs) ils]
|
||||
| "superscript" `elem` classes =
|
||||
[Superscript $ spanCorrect' $ Span (ident, (delete "superscript" classes), kvs) ils]
|
||||
| "subscript" `elem` classes =
|
||||
[Subscript $ spanCorrect' $ Span (ident, (delete "subscript" classes), kvs) ils]
|
||||
| (not . null) (codeSpans `intersect` classes) =
|
||||
[Code (ident, (classes \\ codeSpans), kvs) (init $ unlines $ map ilToCode ils)]
|
||||
| otherwise =
|
||||
[Span (ident, classes, kvs) ils]
|
||||
spanCorrect' il = [il]
|
||||
|
||||
spanCorrect :: [Inline] -> [Inline]
|
||||
spanCorrect = concatMap spanCorrect'
|
||||
|
||||
removeEmptyPars :: [Block] -> [Block]
|
||||
removeEmptyPars blks = filter (\b -> b /= (Para [])) blks
|
||||
|
||||
divReduce :: [Block] -> [Block]
|
||||
divReduce [] = []
|
||||
divReduce ((Div (id1, classes1, kvs1) blks1) : blks)
|
||||
| (id1, classes1, kvs1) == ("", [], []) = blks1 ++ (divReduce blks)
|
||||
divReduce (d1@(Div (id1, classes1, kvs1) blks1) :
|
||||
d2@(Div (id2, classes2, kvs2) blks2) :
|
||||
blks) =
|
||||
let classes' = classes1 `intersect` classes2
|
||||
kvs' = kvs1 `intersect` kvs2
|
||||
classes1' = classes1 \\ classes'
|
||||
kvs1' = kvs1 \\ kvs'
|
||||
classes2' = classes2 \\ classes'
|
||||
kvs2' = kvs2 \\ kvs'
|
||||
in
|
||||
case null classes' && null kvs' of
|
||||
True -> d1 : (divReduce (d2 : blks))
|
||||
False -> let attr' = ("", classes', kvs')
|
||||
attr1' = (id1, classes1', kvs1')
|
||||
attr2' = (id2, classes2', kvs2')
|
||||
in
|
||||
divReduce (Div attr' [(Div attr1' blks1), (Div attr2' blks2)] :
|
||||
blks)
|
||||
divReduce (blk:blks) = blk : (divReduce blks)
|
||||
|
||||
isHeaderClass :: String -> Maybe Int
|
||||
isHeaderClass s | "Heading" `isPrefixOf` s =
|
||||
case reads (drop (length "Heading") s) :: [(Int, String)] of
|
||||
[] -> Nothing
|
||||
((n, "") : []) -> Just n
|
||||
_ -> Nothing
|
||||
isHeaderClass _ = Nothing
|
||||
|
||||
findHeaderClass :: [String] -> Maybe Int
|
||||
findHeaderClass ss = case mapMaybe id $ map isHeaderClass ss of
|
||||
[] -> Nothing
|
||||
n : _ -> Just n
|
||||
|
||||
blksToInlines :: [Block] -> [Inline]
|
||||
blksToInlines (Para ils : _) = ils
|
||||
blksToInlines (Plain ils : _) = ils
|
||||
blksToInlines _ = []
|
||||
|
||||
divCorrectPreReduce' :: Block -> [Block]
|
||||
divCorrectPreReduce' (Div (ident, classes, kvs) blks)
|
||||
| isJust $ findHeaderClass classes =
|
||||
let n = fromJust $ findHeaderClass classes
|
||||
in
|
||||
[Header n (ident, delete ("Heading" ++ (show n)) classes, kvs) (blksToInlines blks)]
|
||||
| otherwise = [Div (ident, classes, kvs) blks]
|
||||
divCorrectPreReduce' blk = [blk]
|
||||
|
||||
divCorrectPreReduce :: [Block] -> [Block]
|
||||
divCorrectPreReduce = concatMap divCorrectPreReduce'
|
||||
|
||||
blkToCode :: Block -> String
|
||||
blkToCode (Para []) = ""
|
||||
blkToCode (Para ((Code _ s):ils)) = s ++ (blkToCode (Para ils))
|
||||
blkToCode (Para ((Span (_, classes, _) ils'): ils))
|
||||
| (not . null) (codeSpans `intersect` classes) =
|
||||
(init $ unlines $ map ilToCode ils') ++ (blkToCode (Para ils))
|
||||
blkToCode _ = ""
|
||||
|
||||
divRemove' :: Block -> [Block]
|
||||
divRemove' (Div (_, _, kvs) blks) =
|
||||
case lookup "indent" kvs of
|
||||
Just val -> [Div ("", [], [("indent", val)]) blks]
|
||||
Nothing -> blks
|
||||
divRemove' blk = [blk]
|
||||
|
||||
divRemove :: [Block] -> [Block]
|
||||
divRemove = concatMap divRemove'
|
||||
|
||||
divCorrect' :: Block -> [Block]
|
||||
divCorrect' b@(Div (ident, classes, kvs) blks)
|
||||
| (not . null) (blockQuoteDivs `intersect` classes) =
|
||||
[BlockQuote [Div (ident, classes \\ blockQuoteDivs, kvs) blks]]
|
||||
| (not . null) (codeDivs `intersect` classes) =
|
||||
[CodeBlock (ident, (classes \\ codeDivs), kvs) (init $ unlines $ map blkToCode blks)]
|
||||
| otherwise =
|
||||
case lookup "indent" kvs of
|
||||
Just "0" -> [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]
|
||||
Just _ ->
|
||||
[BlockQuote [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]]
|
||||
Nothing -> [b]
|
||||
divCorrect' blk = [blk]
|
||||
|
||||
divCorrect :: [Block] -> [Block]
|
||||
divCorrect = concatMap divCorrect'
|
208
src/Text/Pandoc/Readers/DocX/Lists.hs
Normal file
208
src/Text/Pandoc/Readers/DocX/Lists.hs
Normal file
|
@ -0,0 +1,208 @@
|
|||
{-
|
||||
Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
-}
|
||||
|
||||
{- |
|
||||
Module : Text.Pandoc.Readers.DocX.Lists
|
||||
Copyright : Copyright (C) 2014 Jesse Rosenthal
|
||||
License : GNU GPL, version 2 or above
|
||||
|
||||
Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
|
||||
Stability : alpha
|
||||
Portability : portable
|
||||
|
||||
Functions for converting flat DocX paragraphs into nested lists.
|
||||
-}
|
||||
|
||||
module Text.Pandoc.Readers.DocX.Lists ( blocksToBullets
|
||||
, blocksToDefinitions) where
|
||||
|
||||
import Text.Pandoc.JSON
|
||||
import Text.Pandoc.Shared (trim)
|
||||
import Control.Monad
|
||||
import Data.List
|
||||
import Data.Maybe
|
||||
|
||||
isListItem :: Block -> Bool
|
||||
isListItem (Div (_, classes, _) _) | "list-item" `elem` classes = True
|
||||
isListItem _ = False
|
||||
|
||||
getLevel :: Block -> Maybe Integer
|
||||
getLevel (Div (_, _, kvs) _) = liftM read $ lookup "level" kvs
|
||||
getLevel _ = Nothing
|
||||
|
||||
getLevelN :: Block -> Integer
|
||||
getLevelN b = case getLevel b of
|
||||
Just n -> n
|
||||
Nothing -> -1
|
||||
|
||||
getNumId :: Block -> Maybe Integer
|
||||
getNumId (Div (_, _, kvs) _) = liftM read $ lookup "num-id" kvs
|
||||
getNumId _ = Nothing
|
||||
|
||||
getNumIdN :: Block -> Integer
|
||||
getNumIdN b = case getNumId b of
|
||||
Just n -> n
|
||||
Nothing -> -1
|
||||
|
||||
getText :: Block -> Maybe String
|
||||
getText (Div (_, _, kvs) _) = lookup "text" kvs
|
||||
getText _ = Nothing
|
||||
|
||||
data ListType = Itemized | Enumerated ListAttributes
|
||||
|
||||
listStyleMap :: [(String, ListNumberStyle)]
|
||||
listStyleMap = [("upperLetter", UpperAlpha),
|
||||
("lowerLetter", LowerAlpha),
|
||||
("upperRoman", UpperRoman),
|
||||
("lowerRoman", LowerRoman),
|
||||
("decimal", Decimal)]
|
||||
|
||||
listDelimMap :: [(String, ListNumberDelim)]
|
||||
listDelimMap = [("%1)", OneParen),
|
||||
("(%1)", TwoParens),
|
||||
("%1.", Period)]
|
||||
|
||||
getListType :: Block -> Maybe ListType
|
||||
getListType b@(Div (_, _, kvs) _) | isListItem b =
|
||||
let
|
||||
start = lookup "start" kvs
|
||||
frmt = lookup "format" kvs
|
||||
txt = lookup "text" kvs
|
||||
in
|
||||
case frmt of
|
||||
Just "bullet" -> Just Itemized
|
||||
Just f ->
|
||||
case txt of
|
||||
Just t -> Just $ Enumerated (
|
||||
read (fromMaybe "1" start) :: Int,
|
||||
fromMaybe DefaultStyle (lookup f listStyleMap),
|
||||
fromMaybe DefaultDelim (lookup t listDelimMap))
|
||||
Nothing -> Nothing
|
||||
_ -> Nothing
|
||||
getListType _ = Nothing
|
||||
|
||||
listParagraphDivs :: [String]
|
||||
listParagraphDivs = ["ListParagraph"]
|
||||
|
||||
-- This is a first stab at going through and attaching meaning to list
|
||||
-- paragraphs, without an item marker, following a list item. We
|
||||
-- assume that these are paragraphs in the same item.
|
||||
|
||||
handleListParagraphs :: [Block] -> [Block]
|
||||
handleListParagraphs [] = []
|
||||
handleListParagraphs (
|
||||
(Div attr1@(_, classes1, _) blks1) :
|
||||
(Div (ident2, classes2, kvs2) blks2) :
|
||||
blks
|
||||
) | "list-item" `elem` classes1 &&
|
||||
not ("list-item" `elem` classes2) &&
|
||||
(not . null) (listParagraphDivs `intersect` classes2) =
|
||||
-- We don't want to keep this indent.
|
||||
let newDiv2 =
|
||||
(Div (ident2, classes2, filter (\kv -> fst kv /= "indent") kvs2) blks2)
|
||||
in
|
||||
handleListParagraphs ((Div attr1 (blks1 ++ [newDiv2])) : blks)
|
||||
handleListParagraphs (blk:blks) = blk : (handleListParagraphs blks)
|
||||
|
||||
separateBlocks' :: Block -> [[Block]] -> [[Block]]
|
||||
separateBlocks' blk ([] : []) = [[blk]]
|
||||
separateBlocks' b@(BulletList _) acc = (init acc) ++ [(last acc) ++ [b]]
|
||||
separateBlocks' b@(OrderedList _ _) acc = (init acc) ++ [(last acc) ++ [b]]
|
||||
-- The following is for the invisible bullet lists. This is how
|
||||
-- pandoc-generated ooxml does multiparagraph item lists.
|
||||
separateBlocks' b acc | liftM trim (getText b) == Just "" =
|
||||
(init acc) ++ [(last acc) ++ [b]]
|
||||
separateBlocks' b acc = acc ++ [[b]]
|
||||
|
||||
separateBlocks :: [Block] -> [[Block]]
|
||||
separateBlocks blks = foldr separateBlocks' [[]] (reverse blks)
|
||||
|
||||
flatToBullets' :: Integer -> [Block] -> [Block]
|
||||
flatToBullets' _ [] = []
|
||||
flatToBullets' num xs@(b : elems)
|
||||
| getLevelN b == num = b : (flatToBullets' num elems)
|
||||
| otherwise =
|
||||
let bNumId = getNumIdN b
|
||||
bLevel = getLevelN b
|
||||
(children, remaining) =
|
||||
span
|
||||
(\b' ->
|
||||
((getLevelN b') > bLevel ||
|
||||
((getLevelN b') == bLevel && (getNumIdN b') == bNumId)))
|
||||
xs
|
||||
in
|
||||
case getListType b of
|
||||
Just (Enumerated attr) ->
|
||||
(OrderedList attr (separateBlocks $ flatToBullets' bLevel children)) :
|
||||
(flatToBullets' num remaining)
|
||||
_ ->
|
||||
(BulletList (separateBlocks $ flatToBullets' bLevel children)) :
|
||||
(flatToBullets' num remaining)
|
||||
|
||||
flatToBullets :: [Block] -> [Block]
|
||||
flatToBullets elems = flatToBullets' (-1) elems
|
||||
|
||||
blocksToBullets :: [Block] -> [Block]
|
||||
blocksToBullets blks =
|
||||
-- bottomUp removeListItemDivs $
|
||||
flatToBullets $ (handleListParagraphs blks)
|
||||
|
||||
|
||||
plainParaInlines :: Block -> [Inline]
|
||||
plainParaInlines (Plain ils) = ils
|
||||
plainParaInlines (Para ils) = ils
|
||||
plainParaInlines _ = []
|
||||
|
||||
blocksToDefinitions' :: [([Inline], [[Block]])] -> [Block] -> [Block] -> [Block]
|
||||
blocksToDefinitions' [] acc [] = reverse acc
|
||||
blocksToDefinitions' defAcc acc [] =
|
||||
reverse $ (DefinitionList (reverse defAcc)) : acc
|
||||
blocksToDefinitions' defAcc acc
|
||||
((Div (_, classes1, _) blks1) : (Div (ident2, classes2, kvs2) blks2) : blks)
|
||||
| "DefinitionTerm" `elem` classes1 && "Definition" `elem` classes2 =
|
||||
let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2)
|
||||
pair = case remainingAttr2 == ("", [], []) of
|
||||
True -> (concatMap plainParaInlines blks1, [blks2])
|
||||
False -> (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]])
|
||||
in
|
||||
blocksToDefinitions' (pair : defAcc) acc blks
|
||||
blocksToDefinitions' defAcc acc
|
||||
((Div (ident2, classes2, kvs2) blks2) : blks)
|
||||
| (not . null) defAcc && "Definition" `elem` classes2 =
|
||||
let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2)
|
||||
defItems2 = case remainingAttr2 == ("", [], []) of
|
||||
True -> blks2
|
||||
False -> [Div remainingAttr2 blks2]
|
||||
((defTerm, defItems):defs) = defAcc
|
||||
defAcc' = case null defItems of
|
||||
True -> (defTerm, [defItems2]) : defs
|
||||
False -> (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs
|
||||
in
|
||||
blocksToDefinitions' defAcc' acc blks
|
||||
blocksToDefinitions' [] acc (b:blks) =
|
||||
blocksToDefinitions' [] (b:acc) blks
|
||||
blocksToDefinitions' defAcc acc (b:blks) =
|
||||
blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks
|
||||
|
||||
|
||||
blocksToDefinitions :: [Block] -> [Block]
|
||||
blocksToDefinitions = blocksToDefinitions' [] []
|
||||
|
||||
|
||||
|
||||
|
604
src/Text/Pandoc/Readers/DocX/Parse.hs
Normal file
604
src/Text/Pandoc/Readers/DocX/Parse.hs
Normal file
|
@ -0,0 +1,604 @@
|
|||
{-
|
||||
Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
-}
|
||||
|
||||
{- |
|
||||
Module : Text.Pandoc.Readers.DocX.Parse
|
||||
Copyright : Copyright (C) 2014 Jesse Rosenthal
|
||||
License : GNU GPL, version 2 or above
|
||||
|
||||
Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
|
||||
Stability : alpha
|
||||
Portability : portable
|
||||
|
||||
Conversion of DocX archive into DocX haskell type
|
||||
-}
|
||||
|
||||
|
||||
module Text.Pandoc.Readers.DocX.Parse ( DocX(..)
|
||||
, Document(..)
|
||||
, Body(..)
|
||||
, BodyPart(..)
|
||||
, TblLook(..)
|
||||
, ParPart(..)
|
||||
, Run(..)
|
||||
, RunElem(..)
|
||||
, Notes
|
||||
, Numbering
|
||||
, Relationship
|
||||
, Media
|
||||
, RunStyle(..)
|
||||
, ParagraphStyle(..)
|
||||
, Row(..)
|
||||
, Cell(..)
|
||||
, getFootNote
|
||||
, getEndNote
|
||||
, lookupLevel
|
||||
, lookupRelationship
|
||||
, archiveToDocX
|
||||
) where
|
||||
import Codec.Archive.Zip
|
||||
import Text.XML.Light
|
||||
import Data.Maybe
|
||||
import Data.List
|
||||
import System.FilePath
|
||||
import Data.Bits ((.|.))
|
||||
import qualified Data.ByteString.Lazy as B
|
||||
import qualified Text.Pandoc.UTF8 as UTF8
|
||||
|
||||
attrToNSPair :: Attr -> Maybe (String, String)
|
||||
attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val)
|
||||
attrToNSPair _ = Nothing
|
||||
|
||||
|
||||
type NameSpaces = [(String, String)]
|
||||
|
||||
data DocX = DocX Document Notes Numbering [Relationship] Media
|
||||
deriving Show
|
||||
|
||||
archiveToDocX :: Archive -> Maybe DocX
|
||||
archiveToDocX archive = do
|
||||
let notes = archiveToNotes archive
|
||||
rels = archiveToRelationships archive
|
||||
media = archiveToMedia archive
|
||||
doc <- archiveToDocument archive
|
||||
numbering <- archiveToNumbering archive
|
||||
return $ DocX doc notes numbering rels media
|
||||
|
||||
data Document = Document NameSpaces Body
|
||||
deriving Show
|
||||
|
||||
archiveToDocument :: Archive -> Maybe Document
|
||||
archiveToDocument zf = do
|
||||
entry <- findEntryByPath "word/document.xml" zf
|
||||
docElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
|
||||
let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
|
||||
bodyElem <- findChild (QName "body" (lookup "w" namespaces) Nothing) docElem
|
||||
body <- elemToBody namespaces bodyElem
|
||||
return $ Document namespaces body
|
||||
|
||||
type Media = [(FilePath, B.ByteString)]
|
||||
|
||||
filePathIsMedia :: FilePath -> Bool
|
||||
filePathIsMedia fp =
|
||||
let (dir, _) = splitFileName fp
|
||||
in
|
||||
(dir == "word/media/")
|
||||
|
||||
getMediaPair :: Archive -> FilePath -> Maybe (FilePath, B.ByteString)
|
||||
getMediaPair zf fp =
|
||||
case findEntryByPath fp zf of
|
||||
Just e -> Just (fp, fromEntry e)
|
||||
Nothing -> Nothing
|
||||
|
||||
archiveToMedia :: Archive -> Media
|
||||
archiveToMedia zf =
|
||||
mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf))
|
||||
|
||||
data Numbering = Numbering NameSpaces [Numb] [AbstractNumb]
|
||||
deriving Show
|
||||
|
||||
data Numb = Numb String String -- right now, only a key to an abstract num
|
||||
deriving Show
|
||||
|
||||
data AbstractNumb = AbstractNumb String [Level]
|
||||
deriving Show
|
||||
|
||||
-- (ilvl, format, string, start)
|
||||
type Level = (String, String, String, Maybe Integer)
|
||||
|
||||
lookupLevel :: String -> String -> Numbering -> Maybe Level
|
||||
lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do
|
||||
absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs
|
||||
lvls <- lookup absNumId $ map (\(AbstractNumb aid ls) -> (aid, ls)) absNumbs
|
||||
lvl <- lookup ilvl $ map (\l@(i, _, _, _) -> (i, l)) lvls
|
||||
return lvl
|
||||
|
||||
numElemToNum :: NameSpaces -> Element -> Maybe Numb
|
||||
numElemToNum ns element |
|
||||
qName (elName element) == "num" &&
|
||||
qURI (elName element) == (lookup "w" ns) = do
|
||||
numId <- findAttr (QName "numId" (lookup "w" ns) (Just "w")) element
|
||||
absNumId <- findChild (QName "abstractNumId" (lookup "w" ns) (Just "w")) element
|
||||
>>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
|
||||
return $ Numb numId absNumId
|
||||
numElemToNum _ _ = Nothing
|
||||
|
||||
absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb
|
||||
absNumElemToAbsNum ns element |
|
||||
qName (elName element) == "abstractNum" &&
|
||||
qURI (elName element) == (lookup "w" ns) = do
|
||||
absNumId <- findAttr
|
||||
(QName "abstractNumId" (lookup "w" ns) (Just "w"))
|
||||
element
|
||||
let levelElems = findChildren
|
||||
(QName "lvl" (lookup "w" ns) (Just "w"))
|
||||
element
|
||||
levels = mapMaybe id $ map (levelElemToLevel ns) levelElems
|
||||
return $ AbstractNumb absNumId levels
|
||||
absNumElemToAbsNum _ _ = Nothing
|
||||
|
||||
levelElemToLevel :: NameSpaces -> Element -> Maybe Level
|
||||
levelElemToLevel ns element |
|
||||
qName (elName element) == "lvl" &&
|
||||
qURI (elName element) == (lookup "w" ns) = do
|
||||
ilvl <- findAttr (QName "ilvl" (lookup "w" ns) (Just "w")) element
|
||||
fmt <- findChild (QName "numFmt" (lookup "w" ns) (Just "w")) element
|
||||
>>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
|
||||
txt <- findChild (QName "lvlText" (lookup "w" ns) (Just "w")) element
|
||||
>>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
|
||||
let start = findChild (QName "start" (lookup "w" ns) (Just "w")) element
|
||||
>>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
|
||||
>>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)])))
|
||||
return (ilvl, fmt, txt, start)
|
||||
levelElemToLevel _ _ = Nothing
|
||||
|
||||
archiveToNumbering :: Archive -> Maybe Numbering
|
||||
archiveToNumbering zf =
|
||||
case findEntryByPath "word/numbering.xml" zf of
|
||||
Nothing -> Just $ Numbering [] [] []
|
||||
Just entry -> do
|
||||
numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
|
||||
let namespaces = mapMaybe attrToNSPair (elAttribs numberingElem)
|
||||
numElems = findChildren
|
||||
(QName "num" (lookup "w" namespaces) (Just "w"))
|
||||
numberingElem
|
||||
absNumElems = findChildren
|
||||
(QName "abstractNum" (lookup "w" namespaces) (Just "w"))
|
||||
numberingElem
|
||||
nums = mapMaybe id $ map (numElemToNum namespaces) numElems
|
||||
absNums = mapMaybe id $ map (absNumElemToAbsNum namespaces) absNumElems
|
||||
return $ Numbering namespaces nums absNums
|
||||
|
||||
data Notes = Notes NameSpaces (Maybe [(String, [BodyPart])]) (Maybe [(String, [BodyPart])])
|
||||
deriving Show
|
||||
|
||||
noteElemToNote :: NameSpaces -> Element -> Maybe (String, [BodyPart])
|
||||
noteElemToNote ns element
|
||||
| qName (elName element) `elem` ["endnote", "footnote"] &&
|
||||
qURI (elName element) == (lookup "w" ns) =
|
||||
do
|
||||
noteId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element
|
||||
let bps = map fromJust
|
||||
$ filter isJust
|
||||
$ map (elemToBodyPart ns)
|
||||
$ filterChildrenName (isParOrTbl ns) element
|
||||
return $ (noteId, bps)
|
||||
noteElemToNote _ _ = Nothing
|
||||
|
||||
getFootNote :: String -> Notes -> Maybe [BodyPart]
|
||||
getFootNote s (Notes _ fns _) = fns >>= (lookup s)
|
||||
|
||||
getEndNote :: String -> Notes -> Maybe [BodyPart]
|
||||
getEndNote s (Notes _ _ ens) = ens >>= (lookup s)
|
||||
|
||||
elemToNotes :: NameSpaces -> String -> Element -> Maybe [(String, [BodyPart])]
|
||||
elemToNotes ns notetype element
|
||||
| qName (elName element) == (notetype ++ "s") &&
|
||||
qURI (elName element) == (lookup "w" ns) =
|
||||
Just $ map fromJust
|
||||
$ filter isJust
|
||||
$ map (noteElemToNote ns)
|
||||
$ findChildren (QName notetype (lookup "w" ns) (Just "w")) element
|
||||
elemToNotes _ _ _ = Nothing
|
||||
|
||||
archiveToNotes :: Archive -> Notes
|
||||
archiveToNotes zf =
|
||||
let fnElem = findEntryByPath "word/footnotes.xml" zf
|
||||
>>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
|
||||
enElem = findEntryByPath "word/endnotes.xml" zf
|
||||
>>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
|
||||
fn_namespaces = case fnElem of
|
||||
Just e -> mapMaybe attrToNSPair (elAttribs e)
|
||||
Nothing -> []
|
||||
en_namespaces = case enElem of
|
||||
Just e -> mapMaybe attrToNSPair (elAttribs e)
|
||||
Nothing -> []
|
||||
ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces
|
||||
fn = fnElem >>= (elemToNotes ns "footnote")
|
||||
en = enElem >>= (elemToNotes ns "endnote")
|
||||
in
|
||||
Notes ns fn en
|
||||
|
||||
|
||||
data Relationship = Relationship (RelId, Target)
|
||||
deriving Show
|
||||
|
||||
lookupRelationship :: RelId -> [Relationship] -> Maybe Target
|
||||
lookupRelationship relid rels =
|
||||
lookup relid (map (\(Relationship pair) -> pair) rels)
|
||||
|
||||
filePathIsRel :: FilePath -> Bool
|
||||
filePathIsRel fp =
|
||||
let (dir, name) = splitFileName fp
|
||||
in
|
||||
(dir == "word/_rels/") && ((takeExtension name) == ".rels")
|
||||
|
||||
relElemToRelationship :: Element -> Maybe Relationship
|
||||
relElemToRelationship element | qName (elName element) == "Relationship" =
|
||||
do
|
||||
relId <- findAttr (QName "Id" Nothing Nothing) element
|
||||
target <- findAttr (QName "Target" Nothing Nothing) element
|
||||
return $ Relationship (relId, target)
|
||||
relElemToRelationship _ = Nothing
|
||||
|
||||
|
||||
archiveToRelationships :: Archive -> [Relationship]
|
||||
archiveToRelationships archive =
|
||||
let relPaths = filter filePathIsRel (filesInArchive archive)
|
||||
entries = map fromJust $ filter isJust $ map (\f -> findEntryByPath f archive) relPaths
|
||||
relElems = map fromJust $ filter isJust $ map (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries
|
||||
rels = map fromJust $ filter isJust $ map relElemToRelationship $ concatMap elChildren relElems
|
||||
in
|
||||
rels
|
||||
|
||||
data Body = Body [BodyPart]
|
||||
deriving Show
|
||||
|
||||
isParOrTbl :: NameSpaces -> QName -> Bool
|
||||
isParOrTbl ns q = qName q `elem` ["p", "tbl"] &&
|
||||
qURI q == (lookup "w" ns)
|
||||
|
||||
elemToBody :: NameSpaces -> Element -> Maybe Body
|
||||
elemToBody ns element | qName (elName element) == "body" && qURI (elName element) == (lookup "w" ns) =
|
||||
Just $ Body
|
||||
$ map fromJust
|
||||
$ filter isJust
|
||||
$ map (elemToBodyPart ns) $ filterChildrenName (isParOrTbl ns) element
|
||||
elemToBody _ _ = Nothing
|
||||
|
||||
isRunOrLinkOrBookmark :: NameSpaces -> QName -> Bool
|
||||
isRunOrLinkOrBookmark ns q = qName q `elem` ["r", "hyperlink", "bookmarkStart"] &&
|
||||
qURI q == (lookup "w" ns)
|
||||
|
||||
elemToNumInfo :: NameSpaces -> Element -> Maybe (String, String)
|
||||
elemToNumInfo ns element
|
||||
| qName (elName element) == "p" &&
|
||||
qURI (elName element) == (lookup "w" ns) =
|
||||
do
|
||||
pPr <- findChild (QName "pPr" (lookup "w" ns) (Just "w")) element
|
||||
numPr <- findChild (QName "numPr" (lookup "w" ns) (Just "w")) pPr
|
||||
lvl <- findChild (QName "ilvl" (lookup "w" ns) (Just "w")) numPr >>=
|
||||
findAttr (QName "val" (lookup "w" ns) (Just "w"))
|
||||
numId <- findChild (QName "numId" (lookup "w" ns) (Just "w")) numPr >>=
|
||||
findAttr (QName "val" (lookup "w" ns) (Just "w"))
|
||||
return (numId, lvl)
|
||||
elemToNumInfo _ _ = Nothing
|
||||
|
||||
-- isBookMarkTag :: NameSpaces -> QName -> Bool
|
||||
-- isBookMarkTag ns q = qName q `elem` ["bookmarkStart", "bookmarkEnd"] &&
|
||||
-- qURI q == (lookup "w" ns)
|
||||
|
||||
-- parChildrenToBookmark :: NameSpaces -> [Element] -> BookMark
|
||||
-- parChildrenToBookmark ns (bms : bme : _)
|
||||
-- | qName (elName bms) == "bookmarkStart" &&
|
||||
-- qURI (elName bms) == (lookup "w" ns) &&
|
||||
-- qName (elName bme) == "bookmarkEnd" &&
|
||||
-- qURI (elName bme) == (lookup "w" ns) = do
|
||||
-- bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) bms
|
||||
-- bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) bms
|
||||
-- return $ (bmId, bmName)
|
||||
-- parChildrenToBookmark _ _ = Nothing
|
||||
|
||||
elemToBodyPart :: NameSpaces -> Element -> Maybe BodyPart
|
||||
elemToBodyPart ns element
|
||||
| qName (elName element) == "p" &&
|
||||
qURI (elName element) == (lookup "w" ns) =
|
||||
let parstyle = elemToParagraphStyle ns element
|
||||
parparts = mapMaybe id
|
||||
$ map (elemToParPart ns)
|
||||
$ filterChildrenName (isRunOrLinkOrBookmark ns) element
|
||||
in
|
||||
case elemToNumInfo ns element of
|
||||
Just (numId, lvl) -> Just $ ListItem parstyle numId lvl parparts
|
||||
Nothing -> Just $ Paragraph parstyle parparts
|
||||
| qName (elName element) == "tbl" &&
|
||||
qURI (elName element) == (lookup "w" ns) =
|
||||
let
|
||||
caption = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element
|
||||
>>= findChild (QName "tblCaption" (lookup "w" ns) (Just "w"))
|
||||
>>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
|
||||
grid = case
|
||||
findChild (QName "tblGrid" (lookup "w" ns) (Just "w")) element
|
||||
of
|
||||
Just g -> elemToTblGrid ns g
|
||||
Nothing -> []
|
||||
tblLook = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element
|
||||
>>= findChild (QName "tblLook" (lookup "w" ns) (Just "w"))
|
||||
>>= elemToTblLook ns
|
||||
in
|
||||
Just $ Tbl
|
||||
(fromMaybe "" caption)
|
||||
grid
|
||||
(fromMaybe defaultTblLook tblLook)
|
||||
(mapMaybe (elemToRow ns) (elChildren element))
|
||||
| otherwise = Nothing
|
||||
|
||||
elemToTblLook :: NameSpaces -> Element -> Maybe TblLook
|
||||
elemToTblLook ns element
|
||||
| qName (elName element) == "tblLook" &&
|
||||
qURI (elName element) == (lookup "w" ns) =
|
||||
let firstRow = findAttr (QName "firstRow" (lookup "w" ns) (Just "w")) element
|
||||
val = findAttr (QName "val" (lookup "w" ns) (Just "w")) element
|
||||
firstRowFmt =
|
||||
case firstRow of
|
||||
Just "1" -> True
|
||||
Just _ -> False
|
||||
Nothing -> case val of
|
||||
Just bitMask -> testBitMask bitMask 0x020
|
||||
Nothing -> False
|
||||
in
|
||||
Just $ TblLook{firstRowFormatting = firstRowFmt}
|
||||
elemToTblLook _ _ = Nothing
|
||||
|
||||
testBitMask :: String -> Int -> Bool
|
||||
testBitMask bitMaskS n =
|
||||
case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of
|
||||
[] -> False
|
||||
((n', _) : _) -> ((n' .|. n) /= 0)
|
||||
|
||||
data ParagraphStyle = ParagraphStyle { pStyle :: [String]
|
||||
, indent :: Maybe Integer
|
||||
}
|
||||
deriving Show
|
||||
|
||||
defaultParagraphStyle :: ParagraphStyle
|
||||
defaultParagraphStyle = ParagraphStyle { pStyle = []
|
||||
, indent = Nothing
|
||||
}
|
||||
|
||||
elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle
|
||||
elemToParagraphStyle ns element =
|
||||
case findChild (QName "pPr" (lookup "w" ns) (Just "w")) element of
|
||||
Just pPr ->
|
||||
ParagraphStyle
|
||||
{pStyle =
|
||||
mapMaybe id $
|
||||
map
|
||||
(findAttr (QName "val" (lookup "w" ns) (Just "w")))
|
||||
(findChildren (QName "pStyle" (lookup "w" ns) (Just "w")) pPr)
|
||||
, indent =
|
||||
findChild (QName "ind" (lookup "w" ns) (Just "w")) pPr >>=
|
||||
findAttr (QName "left" (lookup "w" ns) (Just "w")) >>=
|
||||
stringToInteger
|
||||
}
|
||||
Nothing -> defaultParagraphStyle
|
||||
|
||||
|
||||
data BodyPart = Paragraph ParagraphStyle [ParPart]
|
||||
| ListItem ParagraphStyle String String [ParPart]
|
||||
| Tbl String TblGrid TblLook [Row]
|
||||
|
||||
deriving Show
|
||||
|
||||
type TblGrid = [Integer]
|
||||
|
||||
data TblLook = TblLook {firstRowFormatting::Bool}
|
||||
deriving Show
|
||||
|
||||
defaultTblLook :: TblLook
|
||||
defaultTblLook = TblLook{firstRowFormatting = False}
|
||||
|
||||
stringToInteger :: String -> Maybe Integer
|
||||
stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)])
|
||||
|
||||
elemToTblGrid :: NameSpaces -> Element -> TblGrid
|
||||
elemToTblGrid ns element
|
||||
| qName (elName element) == "tblGrid" &&
|
||||
qURI (elName element) == (lookup "w" ns) =
|
||||
let
|
||||
cols = findChildren (QName "gridCol" (lookup "w" ns) (Just "w")) element
|
||||
in
|
||||
mapMaybe (\e ->
|
||||
findAttr (QName "val" (lookup "w" ns) (Just ("w"))) e
|
||||
>>= stringToInteger
|
||||
)
|
||||
cols
|
||||
elemToTblGrid _ _ = []
|
||||
|
||||
data Row = Row [Cell]
|
||||
deriving Show
|
||||
|
||||
|
||||
elemToRow :: NameSpaces -> Element -> Maybe Row
|
||||
elemToRow ns element
|
||||
| qName (elName element) == "tr" &&
|
||||
qURI (elName element) == (lookup "w" ns) =
|
||||
let
|
||||
cells = findChildren (QName "tc" (lookup "w" ns) (Just "w")) element
|
||||
in
|
||||
Just $ Row (mapMaybe (elemToCell ns) cells)
|
||||
elemToRow _ _ = Nothing
|
||||
|
||||
data Cell = Cell [BodyPart]
|
||||
deriving Show
|
||||
|
||||
elemToCell :: NameSpaces -> Element -> Maybe Cell
|
||||
elemToCell ns element
|
||||
| qName (elName element) == "tc" &&
|
||||
qURI (elName element) == (lookup "w" ns) =
|
||||
Just $ Cell (mapMaybe (elemToBodyPart ns) (elChildren element))
|
||||
elemToCell _ _ = Nothing
|
||||
|
||||
data ParPart = PlainRun Run
|
||||
| BookMark BookMarkId Anchor
|
||||
| InternalHyperLink Anchor [Run]
|
||||
| ExternalHyperLink RelId [Run]
|
||||
| Drawing String
|
||||
deriving Show
|
||||
|
||||
data Run = Run RunStyle [RunElem]
|
||||
| Footnote String
|
||||
| Endnote String
|
||||
deriving Show
|
||||
|
||||
data RunElem = TextRun String | LnBrk
|
||||
deriving Show
|
||||
|
||||
data RunStyle = RunStyle { isBold :: Bool
|
||||
, isItalic :: Bool
|
||||
, isSmallCaps :: Bool
|
||||
, isStrike :: Bool
|
||||
, isSuperScript :: Bool
|
||||
, isSubScript :: Bool
|
||||
, underline :: Maybe String
|
||||
, rStyle :: Maybe String }
|
||||
deriving Show
|
||||
|
||||
defaultRunStyle :: RunStyle
|
||||
defaultRunStyle = RunStyle { isBold = False
|
||||
, isItalic = False
|
||||
, isSmallCaps = False
|
||||
, isStrike = False
|
||||
, isSuperScript = False
|
||||
, isSubScript = False
|
||||
, underline = Nothing
|
||||
, rStyle = Nothing
|
||||
}
|
||||
|
||||
elemToRunStyle :: NameSpaces -> Element -> RunStyle
|
||||
elemToRunStyle ns element =
|
||||
case findChild (QName "rPr" (lookup "w" ns) (Just "w")) element of
|
||||
Just rPr ->
|
||||
RunStyle
|
||||
{
|
||||
isBold = isJust $ findChild (QName "b" (lookup "w" ns) (Just "w")) rPr
|
||||
, isItalic = isJust $ findChild (QName "i" (lookup "w" ns) (Just "w")) rPr
|
||||
, isSmallCaps = isJust $ findChild (QName "smallCaps" (lookup "w" ns) (Just "w")) rPr
|
||||
, isStrike = isJust $ findChild (QName "strike" (lookup "w" ns) (Just "w")) rPr
|
||||
, isSuperScript =
|
||||
(Just "superscript" ==
|
||||
(findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>=
|
||||
findAttr (QName "val" (lookup "w" ns) (Just "w"))))
|
||||
, isSubScript =
|
||||
(Just "subscript" ==
|
||||
(findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>=
|
||||
findAttr (QName "val" (lookup "w" ns) (Just "w"))))
|
||||
, underline =
|
||||
findChild (QName "u" (lookup "w" ns) (Just "w")) rPr >>=
|
||||
findAttr (QName "val" (lookup "w" ns) (Just "w"))
|
||||
, rStyle =
|
||||
findChild (QName "rStyle" (lookup "w" ns) (Just "w")) rPr >>=
|
||||
findAttr (QName "val" (lookup "w" ns) (Just "w"))
|
||||
}
|
||||
Nothing -> defaultRunStyle
|
||||
|
||||
elemToRun :: NameSpaces -> Element -> Maybe Run
|
||||
elemToRun ns element
|
||||
| qName (elName element) == "r" &&
|
||||
qURI (elName element) == (lookup "w" ns) =
|
||||
case
|
||||
findChild (QName "footnoteReference" (lookup "w" ns) (Just "w")) element >>=
|
||||
findAttr (QName "id" (lookup "w" ns) (Just "w"))
|
||||
of
|
||||
Just s -> Just $ Footnote s
|
||||
Nothing ->
|
||||
case
|
||||
findChild (QName "endnoteReference" (lookup "w" ns) (Just "w")) element >>=
|
||||
findAttr (QName "id" (lookup "w" ns) (Just "w"))
|
||||
of
|
||||
Just s -> Just $ Endnote s
|
||||
Nothing -> Just $
|
||||
Run (elemToRunStyle ns element)
|
||||
(elemToRunElems ns element)
|
||||
elemToRun _ _ = Nothing
|
||||
|
||||
elemToRunElem :: NameSpaces -> Element -> Maybe RunElem
|
||||
elemToRunElem ns element
|
||||
| qName (elName element) == "t" &&
|
||||
qURI (elName element) == (lookup "w" ns) =
|
||||
Just $ TextRun (strContent element)
|
||||
| qName (elName element) == "br" &&
|
||||
qURI (elName element) == (lookup "w" ns) =
|
||||
Just $ LnBrk
|
||||
| otherwise = Nothing
|
||||
|
||||
|
||||
elemToRunElems :: NameSpaces -> Element -> [RunElem]
|
||||
elemToRunElems ns element
|
||||
| qName (elName element) == "r" &&
|
||||
qURI (elName element) == (lookup "w" ns) =
|
||||
mapMaybe (elemToRunElem ns) (elChildren element)
|
||||
| otherwise = []
|
||||
|
||||
elemToDrawing :: NameSpaces -> Element -> Maybe ParPart
|
||||
elemToDrawing ns element
|
||||
| qName (elName element) == "drawing" &&
|
||||
qURI (elName element) == (lookup "w" ns) =
|
||||
let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
|
||||
in
|
||||
findElement (QName "blip" (Just a_ns) (Just "a")) element
|
||||
>>= findAttr (QName "embed" (lookup "r" ns) (Just "r"))
|
||||
>>= (\s -> Just $ Drawing s)
|
||||
elemToDrawing _ _ = Nothing
|
||||
|
||||
|
||||
elemToParPart :: NameSpaces -> Element -> Maybe ParPart
|
||||
elemToParPart ns element
|
||||
| qName (elName element) == "r" &&
|
||||
qURI (elName element) == (lookup "w" ns) =
|
||||
case findChild (QName "drawing" (lookup "w" ns) (Just "w")) element of
|
||||
Just drawingElem -> elemToDrawing ns drawingElem
|
||||
Nothing -> do
|
||||
r <- elemToRun ns element
|
||||
return $ PlainRun r
|
||||
elemToParPart ns element
|
||||
| qName (elName element) == "bookmarkStart" &&
|
||||
qURI (elName element) == (lookup "w" ns) = do
|
||||
bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element
|
||||
bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) element
|
||||
return $ BookMark bmId bmName
|
||||
elemToParPart ns element
|
||||
| qName (elName element) == "hyperlink" &&
|
||||
qURI (elName element) == (lookup "w" ns) =
|
||||
let runs = map fromJust $ filter isJust $ map (elemToRun ns)
|
||||
$ findChildren (QName "r" (lookup "w" ns) (Just "w")) element
|
||||
in
|
||||
case findAttr (QName "anchor" (lookup "w" ns) (Just "w")) element of
|
||||
Just anchor ->
|
||||
Just $ InternalHyperLink anchor runs
|
||||
Nothing ->
|
||||
case findAttr (QName "id" (lookup "r" ns) (Just "r")) element of
|
||||
Just relId -> Just $ ExternalHyperLink relId runs
|
||||
Nothing -> Nothing
|
||||
elemToParPart _ _ = Nothing
|
||||
|
||||
type Target = String
|
||||
type Anchor = String
|
||||
type BookMarkId = String
|
||||
type RelId = String
|
||||
|
68
tests/Tests/Readers/DocX.hs
Normal file
68
tests/Tests/Readers/DocX.hs
Normal file
|
@ -0,0 +1,68 @@
|
|||
module Tests.Readers.DocX (tests) where
|
||||
|
||||
import Text.Pandoc.Options
|
||||
import Text.Pandoc.Readers.Native
|
||||
import Text.Pandoc.Definition
|
||||
import Tests.Helpers
|
||||
import Test.Framework
|
||||
import qualified Data.ByteString.Lazy as B
|
||||
import Text.Pandoc.Readers.DocX
|
||||
|
||||
compareOutput :: FilePath -> FilePath -> IO (Pandoc, Pandoc)
|
||||
compareOutput docxFile nativeFile = do
|
||||
df <- B.readFile docxFile
|
||||
nf <- Prelude.readFile nativeFile
|
||||
return $ (readDocX def df, readNative nf)
|
||||
|
||||
testCompare' :: String -> FilePath -> FilePath -> IO Test
|
||||
testCompare' name docxFile nativeFile = do
|
||||
(dp, np) <- compareOutput docxFile nativeFile
|
||||
return $ test id name (dp, np)
|
||||
|
||||
testCompare :: String -> FilePath -> FilePath -> Test
|
||||
testCompare name docxFile nativeFile =
|
||||
buildTest $ testCompare' name docxFile nativeFile
|
||||
|
||||
|
||||
tests :: [Test]
|
||||
tests = [ testGroup "inlines"
|
||||
[ testCompare
|
||||
"font formatting"
|
||||
"docx.inline_formatting.docx"
|
||||
"docx.inline_formatting.native"
|
||||
, testCompare
|
||||
"hyperlinks"
|
||||
"docx.links.docx"
|
||||
"docx.links.native"
|
||||
, testCompare
|
||||
"inline image with reference output"
|
||||
"docx.image.docx"
|
||||
"docx.image_no_embed.native"
|
||||
, testCompare
|
||||
"handling unicode input"
|
||||
"docx.unicode.docx"
|
||||
"docx.unicode.native"]
|
||||
, testGroup "blocks"
|
||||
[ testCompare
|
||||
"headers"
|
||||
"docx.headers.docx"
|
||||
"docx.headers.native"
|
||||
, testCompare
|
||||
"lists"
|
||||
"docx.lists.docx"
|
||||
"docx.lists.native"
|
||||
, testCompare
|
||||
"footnotes and endnotes"
|
||||
"docx.notes.docx"
|
||||
"docx.notes.native"
|
||||
, testCompare
|
||||
"blockquotes (parsing indent as blockquote)"
|
||||
"docx.block_quotes.docx"
|
||||
"docx.block_quotes_parse_indent.native"
|
||||
, testCompare
|
||||
"tables"
|
||||
"docx.tables.docx"
|
||||
"docx.tables.native"
|
||||
]
|
||||
]
|
||||
|
BIN
tests/docx.block_quotes.docx
Normal file
BIN
tests/docx.block_quotes.docx
Normal file
Binary file not shown.
8
tests/docx.block_quotes_parse_indent.native
Normal file
8
tests/docx.block_quotes_parse_indent.native
Normal file
|
@ -0,0 +1,8 @@
|
|||
[Header 2 ("",[],[]) [Str "Some",Space,Str "block",Space,Str "quotes,",Space,Str "in",Space,Str "different",Space,Str "ways"]
|
||||
,Para [Str "This",Space,Str "is",Space,Str "the",Space,Str "proper",Space,Str "way,",Space,Str "with",Space,Str "a",Space,Str "style"]
|
||||
,BlockQuote
|
||||
[Para [Str "I",Space,Str "don\8217t",Space,Str "know",Space,Str "why",Space,Str "this",Space,Str "would",Space,Str "be",Space,Str "in",Space,Str "italics,",Space,Str "but",Space,Str "so",Space,Str "it",Space,Str "appears",Space,Str "to",Space,Str "be",Space,Str "on",Space,Str "my",Space,Str "screen."]]
|
||||
,Para [Str "And",Space,Str "this",Space,Str "is",Space,Str "the",Space,Str "way",Space,Str "that",Space,Str "most",Space,Str "people",Space,Str "do",Space,Str "it:"]
|
||||
,BlockQuote
|
||||
[Para [Str "I",Space,Str "just",Space,Str "indented",Space,Str "this,",Space,Str "so",Space,Str "it",Space,Str "looks",Space,Str "like",Space,Str "a",Space,Str "block",Space,Str "quote.",Space,Str "I",Space,Str "think",Space,Str "this",Space,Str "is",Space,Str "how",Space,Str "most",Space,Str "people",Space,Str "do",Space,Str "block",Space,Str "quotes",Space,Str "in",Space,Str "their",Space,Str "documents."]]
|
||||
,Para [Str "And",Space,Str "back",Space,Str "to",Space,Str "the",Space,Str "normal",Space,Str "style."]]
|
BIN
tests/docx.headers.docx
Normal file
BIN
tests/docx.headers.docx
Normal file
Binary file not shown.
5
tests/docx.headers.native
Normal file
5
tests/docx.headers.native
Normal file
|
@ -0,0 +1,5 @@
|
|||
[Header 1 ("",[],[]) [Str "A",Space,Str "Test",Space,Str "of",Space,Str "Headers"]
|
||||
,Header 2 ("",[],[]) [Str "Second",Space,Str "Level"]
|
||||
,Para [Str "Some",Space,Str "plain",Space,Str "text."]
|
||||
,Header 3 ("",[],[]) [Str "Third",Space,Str "level"]
|
||||
,Para [Str "Some",Space,Str "more",Space,Str "plain",Space,Str "text."]]
|
BIN
tests/docx.image.docx
Normal file
BIN
tests/docx.image.docx
Normal file
Binary file not shown.
2
tests/docx.image_no_embed.native
Normal file
2
tests/docx.image_no_embed.native
Normal file
|
@ -0,0 +1,2 @@
|
|||
[Header 2 ("",[],[]) [Str "An",Space,Str "image"]
|
||||
,Para [Image [] ("word/media/image1.jpeg","")]]
|
BIN
tests/docx.inline_formatting.docx
Normal file
BIN
tests/docx.inline_formatting.docx
Normal file
Binary file not shown.
5
tests/docx.inline_formatting.native
Normal file
5
tests/docx.inline_formatting.native
Normal file
|
@ -0,0 +1,5 @@
|
|||
[Para [Str "Regular",Space,Str "text",Space,Emph [Str "italics"],Space,Strong [Str "bold",Space,Emph [Str "bold",Space,Str "italics"]],Str "."]
|
||||
,Para [Str "This",Space,Str "is",Space,SmallCaps [Str "Small",Space,Str "Caps"],Str ",",Space,Str "and",Space,Str "this",Space,Str "is",Space,Strikeout [Str "strikethrough"],Str "."]
|
||||
,Para [Str "Some",Space,Str "people",Space,Str "use",Space,Span ("",[],[("underline","single")]) [Str "single",Space,Str "underlines",Space,Str "for",Space,Emph [Str "emphasis"]],Str "."]
|
||||
,Para [Str "Above",Space,Str "the",Space,Str "line",Space,Str "is",Space,Superscript [Str "superscript"],Space,Str "and",Space,Str "below",Space,Str "the",Space,Str "line",Space,Str "is",Space,Subscript [Str "subscript"],Str "."]
|
||||
,Para [Str "A",Space,Str "line",LineBreak,Str "break."]]
|
BIN
tests/docx.links.docx
Normal file
BIN
tests/docx.links.docx
Normal file
Binary file not shown.
6
tests/docx.links.native
Normal file
6
tests/docx.links.native
Normal file
|
@ -0,0 +1,6 @@
|
|||
[Header 2 ("",[],[]) [Str "An",Space,Str "internal",Space,Str "link",Space,Str "and",Space,Str "an",Space,Str "external",Space,Str "link"]
|
||||
,Para [Str "An",Space,Link [Str "external",Space,Str "link"] ("http://google.com",""),Space,Str "to",Space,Str "a",Space,Str "popular",Space,Str "website."]
|
||||
,Para [Str "An",Space,Link [Str "internal",Space,Str "link"] ("#_A_section_for",""),Space,Str "to",Space,Str "a",Space,Str "section",Space,Str "header."]
|
||||
,Para [Str "An",Space,Link [Str "internal",Space,Str "link"] ("#my_bookmark",""),Space,Str "to",Space,Str "a",Space,Str "bookmark."]
|
||||
,Header 2 ("_A_section_for",[],[]) [Str "A",Space,Str "section",Space,Str "for",Space,Str "testing",Space,Str "link",Space,Str "targets"]
|
||||
,Para [Str "A",Space,Str "bookmark",Space,Str "right",Space,Span ("my_bookmark",["anchor"],[]) [],Str "here"]]
|
BIN
tests/docx.lists.docx
Normal file
BIN
tests/docx.lists.docx
Normal file
Binary file not shown.
18
tests/docx.lists.native
Normal file
18
tests/docx.lists.native
Normal file
|
@ -0,0 +1,18 @@
|
|||
[Header 2 ("",[],[]) [Str "Some",Space,Str "nested",Space,Str "lists"]
|
||||
,OrderedList (1,Decimal,Period)
|
||||
[[Para [Str "one"]]
|
||||
,[Para [Str "two"]
|
||||
,OrderedList (1,LowerAlpha,DefaultDelim)
|
||||
[[Para [Str "a"]]
|
||||
,[Para [Str "b"]]]]]
|
||||
,BulletList
|
||||
[[Para [Str "one"]]
|
||||
,[Para [Str "two"]
|
||||
,BulletList
|
||||
[[Para [Str "three"]
|
||||
,BulletList
|
||||
[[Para [Str "four"]
|
||||
,Para [Str "Sub",Space,Str "paragraph"]]]]]]
|
||||
,[Para [Str "Same",Space,Str "list"]]]
|
||||
,BulletList
|
||||
[[Para [Str "Different",Space,Str "list",Space,Str "adjacent",Space,Str "to",Space,Str "the",Space,Str "one",Space,Str "above."]]]]
|
BIN
tests/docx.notes.docx
Normal file
BIN
tests/docx.notes.docx
Normal file
Binary file not shown.
2
tests/docx.notes.native
Normal file
2
tests/docx.notes.native
Normal file
|
@ -0,0 +1,2 @@
|
|||
[Header 2 ("",[],[]) [Str "A",Space,Str "footnote"]
|
||||
,Para [Str "Test",Space,Str "footnote.",Note [Para [Space,Str "My",Space,Str "note."]],Space,Str "Test",Space,Str "endnote.",Note [Para [Space,Str "This",Space,Str "is",Space,Str "an",Space,Str "endnote",Space,Str "at",Space,Str "the",Space,Str "end",Space,Str "of",Space,Str "the",Space,Str "document."]]]]
|
BIN
tests/docx.tables.docx
Normal file
BIN
tests/docx.tables.docx
Normal file
Binary file not shown.
24
tests/docx.tables.native
Normal file
24
tests/docx.tables.native
Normal file
|
@ -0,0 +1,24 @@
|
|||
[Header 2 ("",[],[]) [Str "A",Space,Str "table,",Space,Str "with",Space,Str "and",Space,Str "without",Space,Str "a",Space,Str "header",Space,Str "row"]
|
||||
,Table [] [AlignDefault,AlignDefault,AlignDefault,AlignDefault] [0.0,0.0,0.0,0.0]
|
||||
[[Para [Str "Name"]]
|
||||
,[Para [Str "Game"]]
|
||||
,[Para [Str "Fame"]]
|
||||
,[Para [Str "Blame"]]]
|
||||
[[[Para [Str "Lebron",Space,Str "James"]]
|
||||
,[Para [Str "Basketball"]]
|
||||
,[Para [Str "Very",Space,Str "High"]]
|
||||
,[Para [Str "Leaving",Space,Str "Cleveland"]]]
|
||||
,[[Para [Str "Ryan",Space,Str "Braun"]]
|
||||
,[Para [Str "Baseball"]]
|
||||
,[Para [Str "Moderate"]]
|
||||
,[Para [Str "Steroids"]]]
|
||||
,[[Para [Str "Russell",Space,Str "Wilson"]]
|
||||
,[Para [Str "Football"]]
|
||||
,[Para [Str "High"]]
|
||||
,[Para [Str "Tacky",Space,Str "uniform"]]]]
|
||||
,Table [] [AlignDefault,AlignDefault] [0.0,0.0]
|
||||
[]
|
||||
[[[Para [Str "Sinple"]]
|
||||
,[Para [Str "Table"]]]
|
||||
,[[Para [Str "Without"]]
|
||||
,[Para [Str "Header"]]]]]
|
BIN
tests/docx.unicode.docx
Normal file
BIN
tests/docx.unicode.docx
Normal file
Binary file not shown.
1
tests/docx.unicode.native
Normal file
1
tests/docx.unicode.native
Normal file
|
@ -0,0 +1 @@
|
|||
[Para [Str "Hello,",Space,Str "\19990\30028.",Space,Str "This",Space,Str "costs",Space,Str "\8364\&10."]]
|
|
@ -9,6 +9,7 @@ import qualified Tests.Readers.LaTeX
|
|||
import qualified Tests.Readers.Markdown
|
||||
import qualified Tests.Readers.Org
|
||||
import qualified Tests.Readers.RST
|
||||
import qualified Tests.Readers.DocX
|
||||
import qualified Tests.Writers.ConTeXt
|
||||
import qualified Tests.Writers.LaTeX
|
||||
import qualified Tests.Writers.HTML
|
||||
|
@ -38,6 +39,8 @@ tests = [ testGroup "Old" Tests.Old.tests
|
|||
, testGroup "Markdown" Tests.Readers.Markdown.tests
|
||||
, testGroup "Org" Tests.Readers.Org.tests
|
||||
, testGroup "RST" Tests.Readers.RST.tests
|
||||
, testGroup "DocX" Tests.Readers.DocX.tests
|
||||
|
||||
]
|
||||
]
|
||||
|
||||
|
|
Loading…
Reference in a new issue