Merge branch 'docx' of https://github.com/jkr/pandoc into jkr-docx

This commit is contained in:
John MacFarlane 2014-06-16 22:16:45 -07:00
commit bec9f3c641
27 changed files with 1511 additions and 33 deletions

18
README
View file

@ -13,15 +13,15 @@ Description
Pandoc is a [Haskell] library for converting from one markup format to
another, and a command-line tool that uses this library. It can read
[markdown] and (subsets of) [Textile], [reStructuredText], [HTML],
[LaTeX], [MediaWiki markup], [Haddock markup], [OPML], [Emacs Org-mode]
and [DocBook]; and it can write plain text, [markdown],
[reStructuredText], [XHTML], [HTML 5], [LaTeX] (including [beamer] slide
shows), [ConTeXt], [RTF], [OPML], [DocBook], [OpenDocument], [ODT],
[Word docx], [GNU Texinfo], [MediaWiki markup], [EPUB] (v2 or v3),
[FictionBook2], [Textile], [groff man] pages, [Emacs Org-Mode],
[AsciiDoc], [InDesign ICML], and [Slidy], [Slideous], [DZSlides],
[reveal.js] or [S5] HTML slide shows. It can also produce [PDF] output
on systems where LaTeX is installed.
[LaTeX], [MediaWiki markup], [Haddock markup], [OPML], [Emacs
Org-mode], [DocBook], and [Word docx]; and it can write plain text,
[markdown], [reStructuredText], [XHTML], [HTML 5], [LaTeX] (including
[beamer] slide shows), [ConTeXt], [RTF], [OPML], [DocBook],
[OpenDocument], [ODT], [Word docx], [GNU Texinfo], [MediaWiki markup],
[EPUB] (v2 or v3), [FictionBook2], [Textile], [groff man] pages,
[Emacs Org-Mode], [AsciiDoc], [InDesign ICML], and [Slidy],
[Slideous], [DZSlides], [reveal.js] or [S5] HTML slide shows. It can
also produce [PDF] output on systems where LaTeX is installed.
Pandoc's enhanced version of markdown includes syntax for footnotes,
tables, flexible ordered lists, definition lists, fenced code blocks,

View file

@ -184,7 +184,25 @@ Extra-Source-Files:
tests/fb2.math.markdown,
tests/fb2.math.fb2,
tests/fb2.test-small.png,
tests/fb2.test.jpg
tests/fb2.test.jpg,
tests/docx.block_quotes.docx,
tests/docx.block_quotes_parse_indent.native,
tests/docx.headers.docx,
tests/docx.headers.native,
tests/docx.image.docx,
tests/docx.image_no_embed.native,
tests/docx.inline_formatting.docx,
tests/docx.inline_formatting.native,
tests/docx.links.docx,
tests/docx.links.native,
tests/docx.lists.docx,
tests/docx.lists.native,
tests/docx.notes.docx,
tests/docx.notes.native,
tests/docx.tables.docx,
tests/docx.tables.native,
tests/docx.unicode.docx,
tests/docx.unicode.native
Extra-Tmp-Files: man/man1/pandoc.1,
man/man5/pandoc_markdown.5
@ -275,6 +293,7 @@ Library
Text.Pandoc.Readers.Textile,
Text.Pandoc.Readers.Native,
Text.Pandoc.Readers.Haddock,
Text.Pandoc.Readers.DocX,
Text.Pandoc.Writers.Native,
Text.Pandoc.Writers.Docbook,
Text.Pandoc.Writers.OPML,
@ -305,6 +324,8 @@ Library
Text.Pandoc.Process
Other-Modules: Text.Pandoc.Readers.Haddock.Lex,
Text.Pandoc.Readers.Haddock.Parse,
Text.Pandoc.Readers.DocX.Lists,
Text.Pandoc.Readers.DocX.Parse,
Text.Pandoc.Writers.Shared,
Text.Pandoc.Asciify,
Text.Pandoc.MIME,
@ -390,6 +411,7 @@ Test-Suite test-pandoc
Tests.Readers.Markdown
Tests.Readers.Org
Tests.Readers.RST
Tests.Readers.DocX
Tests.Writers.Native
Tests.Writers.ConTeXt
Tests.Writers.HTML

View file

@ -858,6 +858,7 @@ defaultReaderName fallback (x:xs) =
".textile" -> "textile"
".native" -> "native"
".json" -> "json"
".docx" -> "docx"
_ -> defaultReaderName fallback xs
-- Returns True if extension of first source is .lhs
@ -1158,15 +1159,21 @@ main = do
Left e -> throwIO e
Right (bs,_) -> return $ UTF8.toString bs
let readFiles [] = error "Cannot read archive from stdin"
readFiles (x:_) = B.readFile x
let convertTabs = tabFilter (if preserveTabs then 0 else tabStop)
let handleIncludes' = if readerName' == "latex" || readerName' == "latex+lhs"
then handleIncludes
else return
doc <- readSources sources >>=
handleIncludes' . convertTabs . intercalate "\n" >>=
reader readerOpts
doc <- case reader of
StringReader r->
readSources sources >>=
handleIncludes' . convertTabs . intercalate "\n" >>=
r readerOpts
ByteStringReader r -> readFiles sources >>= r readerOpts
let doc0 = M.foldWithKey setMeta doc metadata

View file

@ -62,6 +62,8 @@ module Text.Pandoc
, readers
, writers
-- * Readers: converting /to/ Pandoc format
, Reader (..)
, readDocX
, readMarkdown
, readMediaWiki
, readRST
@ -125,6 +127,7 @@ import Text.Pandoc.Readers.HTML
import Text.Pandoc.Readers.Textile
import Text.Pandoc.Readers.Native
import Text.Pandoc.Readers.Haddock
import Text.Pandoc.Readers.DocX
import Text.Pandoc.Writers.Native
import Text.Pandoc.Writers.Markdown
import Text.Pandoc.Writers.RST
@ -192,24 +195,34 @@ markdown o s = do
mapM_ warn warnings
return doc
data Reader = StringReader (ReaderOptions -> String -> IO Pandoc)
| ByteStringReader (ReaderOptions -> BL.ByteString -> IO Pandoc)
mkStringReader :: (ReaderOptions -> String -> Pandoc) -> Reader
mkStringReader r = StringReader (\o s -> return $ r o s)
mkBSReader :: (ReaderOptions -> BL.ByteString -> Pandoc) -> Reader
mkBSReader r = ByteStringReader (\o s -> return $ r o s)
-- | Association list of formats and readers.
readers :: [(String, ReaderOptions -> String -> IO Pandoc)]
readers = [ ("native" , \_ s -> return $ readNative s)
,("json" , \o s -> return $ readJSON o s)
,("markdown" , markdown)
,("markdown_strict" , markdown)
,("markdown_phpextra" , markdown)
,("markdown_github" , markdown)
,("markdown_mmd", markdown)
,("rst" , \o s -> return $ readRST o s)
,("mediawiki" , \o s -> return $ readMediaWiki o s)
,("docbook" , \o s -> return $ readDocBook o s)
,("opml" , \o s -> return $ readOPML o s)
,("org" , \o s -> return $ readOrg o s)
,("textile" , \o s -> return $ readTextile o s) -- TODO : textile+lhs
,("html" , \o s -> return $ readHtml o s)
,("latex" , \o s -> return $ readLaTeX o s)
,("haddock" , \o s -> return $ readHaddock o s)
readers :: [(String, Reader)]
readers = [ ("native" , StringReader $ \_ s -> return $ readNative s)
,("json" , mkStringReader readJSON )
,("markdown" , StringReader markdown)
,("markdown_strict" , StringReader markdown)
,("markdown_phpextra" , StringReader markdown)
,("markdown_github" , StringReader markdown)
,("markdown_mmd", StringReader markdown)
,("rst" , mkStringReader readRST )
,("mediawiki" , mkStringReader readMediaWiki)
,("docbook" , mkStringReader readDocBook)
,("opml" , mkStringReader readOPML)
,("org" , mkStringReader readOrg)
,("textile" , mkStringReader readTextile) -- TODO : textile+lhs
,("html" , mkStringReader readHtml)
,("latex" , mkStringReader readLaTeX)
,("haddock" , mkStringReader readHaddock)
,("docx" , mkBSReader readDocX)
]
data Writer = PureStringWriter (WriterOptions -> Pandoc -> String)
@ -280,14 +293,17 @@ getDefaultExtensions "textile" = Set.fromList [Ext_auto_identifiers, Ext
getDefaultExtensions _ = Set.fromList [Ext_auto_identifiers]
-- | Retrieve reader based on formatSpec (format+extensions).
getReader :: String -> Either String (ReaderOptions -> String -> IO Pandoc)
getReader :: String -> Either String Reader
getReader s =
case parseFormatSpec s of
Left e -> Left $ intercalate "\n" $ [m | Message m <- errorMessages e]
Right (readerName, setExts) ->
Right (readerName, setExts) ->
case lookup readerName readers of
Nothing -> Left $ "Unknown reader: " ++ readerName
Just r -> Right $ \o ->
Just (StringReader r) -> Right $ StringReader $ \o ->
r o{ readerExtensions = setExts $
getDefaultExtensions readerName }
Just (ByteStringReader r) -> Right $ ByteStringReader $ \o ->
r o{ readerExtensions = setExts $
getDefaultExtensions readerName }

View file

@ -0,0 +1,479 @@
{-
Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.DocX
Copyright : Copyright (C) 2014 Jesse Rosenthal
License : GNU GPL, version 2 or above
Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
Stability : alpha
Portability : portable
Conversion of DocX type (defined in Text.Pandoc.Readers.DocX.Parse)
to 'Pandoc' document. -}
{-
Current state of implementation of DocX entities ([x] means
implemented, [-] means partially implemented):
* Blocks
- [X] Para
- [X] CodeBlock (styled with `SourceCode`)
- [X] BlockQuote (styled with `Quote`, `BlockQuote`, or, optionally,
indented)
- [X] OrderedList
- [X] BulletList
- [X] DefinitionList (styled with adjacent `DefinitionTerm` and `Definition`)
- [X] Header (styled with `Heading#`)
- [ ] HorizontalRule
- [-] Table (column widths and alignments not yet implemented)
* Inlines
- [X] Str
- [X] Emph (From italics. `underline` currently read as span. In
future, it might optionally be emph as well)
- [X] Strong
- [X] Strikeout
- [X] Superscript
- [X] Subscript
- [X] SmallCaps
- [ ] Quoted
- [ ] Cite
- [X] Code (styled with `VerbatimChar`)
- [X] Space
- [X] LineBreak (these are invisible in Word: entered with Shift-Return)
- [ ] Math
- [X] Link (links to an arbitrary bookmark create a span with the target as
id and "anchor" class)
- [-] Image (Links to path in archive. Future option for
data-encoded URI likely.)
- [X] Note (Footnotes and Endnotes are silently combined.)
-}
module Text.Pandoc.Readers.DocX
( readDocX
) where
import Codec.Archive.Zip
import Text.Pandoc.Definition
import Text.Pandoc.Options
import Text.Pandoc.Builder (text, toList)
import Text.Pandoc.Generic (bottomUp)
import Text.Pandoc.MIME (getMimeType)
import Text.Pandoc.UTF8 (toString)
import Text.Pandoc.Readers.DocX.Parse
import Text.Pandoc.Readers.DocX.Lists
import Data.Maybe (mapMaybe, isJust, fromJust)
import Data.List (delete, isPrefixOf, (\\), intersect)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as B
import Data.ByteString.Base64 (encode)
import System.FilePath (combine)
readDocX :: ReaderOptions
-> B.ByteString
-> Pandoc
readDocX opts bytes =
case archiveToDocX (toArchive bytes) of
Just docx -> Pandoc nullMeta (docxToBlocks opts docx)
Nothing -> error $ "couldn't parse docx file"
runStyleToSpanAttr :: RunStyle -> (String, [String], [(String, String)])
runStyleToSpanAttr rPr = ("",
mapMaybe id [
if isBold rPr then (Just "strong") else Nothing,
if isItalic rPr then (Just "emph") else Nothing,
if isSmallCaps rPr then (Just "smallcaps") else Nothing,
if isStrike rPr then (Just "strike") else Nothing,
if isSuperScript rPr then (Just "superscript") else Nothing,
if isSubScript rPr then (Just "subscript") else Nothing,
rStyle rPr],
case underline rPr of
Just fmt -> [("underline", fmt)]
_ -> []
)
parStyleToDivAttr :: ParagraphStyle -> (String, [String], [(String, String)])
parStyleToDivAttr pPr = ("",
pStyle pPr,
case indent pPr of
Just n -> [("indent", (show n))]
Nothing -> []
)
strToInlines :: String -> [Inline]
strToInlines = toList . text
codeSpans :: [String]
codeSpans = ["VerbatimChar"]
blockQuoteDivs :: [String]
blockQuoteDivs = ["Quote", "BlockQuote"]
codeDivs :: [String]
codeDivs = ["SourceCode"]
runElemToInlines :: RunElem -> [Inline]
runElemToInlines (TextRun s) = strToInlines s
runElemToInlines (LnBrk) = [LineBreak]
runElemToString :: RunElem -> String
runElemToString (TextRun s) = s
runElemToString (LnBrk) = ['\n']
runElemsToString :: [RunElem] -> String
runElemsToString = concatMap runElemToString
strNormalize :: [Inline] -> [Inline]
strNormalize [] = []
strNormalize (Str "" : ils) = strNormalize ils
strNormalize ((Str s) : (Str s') : l) = strNormalize ((Str (s++s')) : l)
strNormalize (il:ils) = il : (strNormalize ils)
runToInlines :: ReaderOptions -> DocX -> Run -> [Inline]
runToInlines _ _ (Run rs runElems)
| isJust (rStyle rs) && (fromJust (rStyle rs)) `elem` codeSpans =
case runStyleToSpanAttr rs == ("", [], []) of
True -> [Str (runElemsToString runElems)]
False -> [Span (runStyleToSpanAttr rs) [Str (runElemsToString runElems)]]
| otherwise = case runStyleToSpanAttr rs == ("", [], []) of
True -> concatMap runElemToInlines runElems
False -> [Span (runStyleToSpanAttr rs) (concatMap runElemToInlines runElems)]
runToInlines opts docx@(DocX _ notes _ _ _ ) (Footnote fnId) =
case (getFootNote fnId notes) of
Just bodyParts ->
[Note [Div ("", ["footnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]]
Nothing ->
[Note [Div ("", ["footnote"], []) []]]
runToInlines opts docx@(DocX _ notes _ _ _) (Endnote fnId) =
case (getEndNote fnId notes) of
Just bodyParts ->
[Note [Div ("", ["endnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]]
Nothing ->
[Note [Div ("", ["endnote"], []) []]]
parPartToInlines :: ReaderOptions -> DocX -> ParPart -> [Inline]
parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r
parPartToInlines _ _ (BookMark _ anchor) =
[Span (anchor, ["anchor"], []) []]
parPartToInlines _ (DocX _ _ _ rels _) (Drawing relid) =
case lookupRelationship relid rels of
Just target -> [Image [] (combine "word" target, "")]
Nothing -> [Image [] ("", "")]
parPartToInlines opts docx (InternalHyperLink anchor runs) =
[Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")]
parPartToInlines opts docx@(DocX _ _ _ rels _) (ExternalHyperLink relid runs) =
case lookupRelationship relid rels of
Just target ->
[Link (concatMap (runToInlines opts docx) runs) (target, "")]
Nothing ->
[Link (concatMap (runToInlines opts docx) runs) ("", "")]
isAnchorSpan :: Inline -> Bool
isAnchorSpan (Span (ident, classes, kvs) ils) =
(not . null) ident &&
classes == ["anchor"] &&
null kvs &&
null ils
isAnchorSpan _ = False
dummyAnchors :: [String]
dummyAnchors = ["_GoBack"]
makeHeaderAnchors :: Block -> Block
makeHeaderAnchors h@(Header n (_, classes, kvs) ils) =
case filter isAnchorSpan ils of
[] -> h
(x@(Span (ident, _, _) _) : xs) ->
case ident `elem` dummyAnchors of
True -> h
False -> Header n (ident, classes, kvs) (ils \\ (x:xs))
_ -> h
makeHeaderAnchors blk = blk
parPartsToInlines :: ReaderOptions -> DocX -> [ParPart] -> [Inline]
parPartsToInlines opts docx parparts =
--
-- We're going to skip data-uri's for now. It should be an option,
-- not mandatory.
--
--bottomUp (makeImagesSelfContained docx) $
bottomUp spanCorrect $
bottomUp spanTrim $
bottomUp spanReduce $
concatMap (parPartToInlines opts docx) parparts
cellToBlocks :: ReaderOptions -> DocX -> Cell -> [Block]
cellToBlocks opts docx (Cell bps) = map (bodyPartToBlock opts docx) bps
rowToBlocksList :: ReaderOptions -> DocX -> Row -> [[Block]]
rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells
bodyPartToBlock :: ReaderOptions -> DocX -> BodyPart -> Block
bodyPartToBlock opts docx (Paragraph pPr parparts) =
Div (parStyleToDivAttr pPr) [Para (parPartsToInlines opts docx parparts)]
bodyPartToBlock opts docx@(DocX _ _ numbering _ _) (ListItem pPr numId lvl parparts) =
let
kvs = case lookupLevel numId lvl numbering of
Just (_, fmt, txt, Just start) -> [ ("level", lvl)
, ("num-id", numId)
, ("format", fmt)
, ("text", txt)
, ("start", (show start))
]
Just (_, fmt, txt, Nothing) -> [ ("level", lvl)
, ("num-id", numId)
, ("format", fmt)
, ("text", txt)
]
Nothing -> []
in
Div
("", ["list-item"], kvs)
[bodyPartToBlock opts docx (Paragraph pPr parparts)]
bodyPartToBlock _ _ (Tbl _ _ _ []) =
Para []
bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) =
let caption = strToInlines cap
(hdr, rows) = case firstRowFormatting look of
True -> (Just r, rs)
False -> (Nothing, r:rs)
hdrCells = case hdr of
Just r' -> rowToBlocksList opts docx r'
Nothing -> []
cells = map (rowToBlocksList opts docx) rows
size = case null hdrCells of
True -> length $ head cells
False -> length $ hdrCells
--
-- The two following variables (horizontal column alignment and
-- relative column widths) go to the default at the
-- moment. Width information is in the TblGrid field of the Tbl,
-- so should be possible. Alignment might be more difficult,
-- since there doesn't seem to be a column entity in docx.
alignments = take size (repeat AlignDefault)
widths = take size (repeat 0) :: [Double]
in
Table caption alignments widths hdrCells cells
makeImagesSelfContained :: DocX -> Inline -> Inline
makeImagesSelfContained (DocX _ _ _ _ media) i@(Image alt (uri, title)) =
case lookup uri media of
Just bs -> case getMimeType uri of
Just mime -> let data_uri =
"data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs)
in
Image alt (data_uri, title)
Nothing -> i
Nothing -> i
makeImagesSelfContained _ inline = inline
bodyToBlocks :: ReaderOptions -> DocX -> Body -> [Block]
bodyToBlocks opts docx (Body bps) =
bottomUp removeEmptyPars $
bottomUp strNormalize $
bottomUp spanRemove $
bottomUp divRemove $
map (makeHeaderAnchors) $
bottomUp divCorrect $
bottomUp divReduce $
bottomUp divCorrectPreReduce $
bottomUp blocksToDefinitions $
blocksToBullets $
map (bodyPartToBlock opts docx) bps
docxToBlocks :: ReaderOptions -> DocX -> [Block]
docxToBlocks opts d@(DocX (Document _ body) _ _ _ _) = bodyToBlocks opts d body
spanReduce :: [Inline] -> [Inline]
spanReduce [] = []
spanReduce ((Span (id1, classes1, kvs1) ils1) : ils)
| (id1, classes1, kvs1) == ("", [], []) = ils1 ++ (spanReduce ils)
spanReduce (s1@(Span (id1, classes1, kvs1) ils1) :
s2@(Span (id2, classes2, kvs2) ils2) :
ils) =
let classes' = classes1 `intersect` classes2
kvs' = kvs1 `intersect` kvs2
classes1' = classes1 \\ classes'
kvs1' = kvs1 \\ kvs'
classes2' = classes2 \\ classes'
kvs2' = kvs2 \\ kvs'
in
case null classes' && null kvs' of
True -> s1 : (spanReduce (s2 : ils))
False -> let attr' = ("", classes', kvs')
attr1' = (id1, classes1', kvs1')
attr2' = (id2, classes2', kvs2')
in
spanReduce (Span attr' [(Span attr1' ils1), (Span attr2' ils2)] :
ils)
spanReduce (il:ils) = il : (spanReduce ils)
ilToCode :: Inline -> String
ilToCode (Str s) = s
ilToCode _ = ""
spanRemove' :: Inline -> [Inline]
spanRemove' s@(Span (ident, classes, _) [])
-- "_GoBack" is automatically inserted. We don't want to keep it.
| classes == ["anchor"] && not (ident `elem` dummyAnchors) = [s]
spanRemove' (Span (_, _, kvs) ils) =
case lookup "underline" kvs of
Just val -> [Span ("", [], [("underline", val)]) ils]
Nothing -> ils
spanRemove' il = [il]
spanRemove :: [Inline] -> [Inline]
spanRemove = concatMap spanRemove'
spanTrim' :: Inline -> [Inline]
spanTrim' il@(Span _ []) = [il]
spanTrim' il@(Span attr (il':[]))
| il' == Space = [Span attr [], Space]
| otherwise = [il]
spanTrim' (Span attr ils)
| head ils == Space && last ils == Space =
[Space, Span attr (init $ tail ils), Space]
| head ils == Space = [Space, Span attr (tail ils)]
| last ils == Space = [Span attr (init ils), Space]
spanTrim' il = [il]
spanTrim :: [Inline] -> [Inline]
spanTrim = concatMap spanTrim'
spanCorrect' :: Inline -> [Inline]
spanCorrect' (Span ("", [], []) ils) = ils
spanCorrect' (Span (ident, classes, kvs) ils)
| "emph" `elem` classes =
[Emph $ spanCorrect' $ Span (ident, (delete "emph" classes), kvs) ils]
| "strong" `elem` classes =
[Strong $ spanCorrect' $ Span (ident, (delete "strong" classes), kvs) ils]
| "smallcaps" `elem` classes =
[SmallCaps $ spanCorrect' $ Span (ident, (delete "smallcaps" classes), kvs) ils]
| "strike" `elem` classes =
[Strikeout $ spanCorrect' $ Span (ident, (delete "strike" classes), kvs) ils]
| "superscript" `elem` classes =
[Superscript $ spanCorrect' $ Span (ident, (delete "superscript" classes), kvs) ils]
| "subscript" `elem` classes =
[Subscript $ spanCorrect' $ Span (ident, (delete "subscript" classes), kvs) ils]
| (not . null) (codeSpans `intersect` classes) =
[Code (ident, (classes \\ codeSpans), kvs) (init $ unlines $ map ilToCode ils)]
| otherwise =
[Span (ident, classes, kvs) ils]
spanCorrect' il = [il]
spanCorrect :: [Inline] -> [Inline]
spanCorrect = concatMap spanCorrect'
removeEmptyPars :: [Block] -> [Block]
removeEmptyPars blks = filter (\b -> b /= (Para [])) blks
divReduce :: [Block] -> [Block]
divReduce [] = []
divReduce ((Div (id1, classes1, kvs1) blks1) : blks)
| (id1, classes1, kvs1) == ("", [], []) = blks1 ++ (divReduce blks)
divReduce (d1@(Div (id1, classes1, kvs1) blks1) :
d2@(Div (id2, classes2, kvs2) blks2) :
blks) =
let classes' = classes1 `intersect` classes2
kvs' = kvs1 `intersect` kvs2
classes1' = classes1 \\ classes'
kvs1' = kvs1 \\ kvs'
classes2' = classes2 \\ classes'
kvs2' = kvs2 \\ kvs'
in
case null classes' && null kvs' of
True -> d1 : (divReduce (d2 : blks))
False -> let attr' = ("", classes', kvs')
attr1' = (id1, classes1', kvs1')
attr2' = (id2, classes2', kvs2')
in
divReduce (Div attr' [(Div attr1' blks1), (Div attr2' blks2)] :
blks)
divReduce (blk:blks) = blk : (divReduce blks)
isHeaderClass :: String -> Maybe Int
isHeaderClass s | "Heading" `isPrefixOf` s =
case reads (drop (length "Heading") s) :: [(Int, String)] of
[] -> Nothing
((n, "") : []) -> Just n
_ -> Nothing
isHeaderClass _ = Nothing
findHeaderClass :: [String] -> Maybe Int
findHeaderClass ss = case mapMaybe id $ map isHeaderClass ss of
[] -> Nothing
n : _ -> Just n
blksToInlines :: [Block] -> [Inline]
blksToInlines (Para ils : _) = ils
blksToInlines (Plain ils : _) = ils
blksToInlines _ = []
divCorrectPreReduce' :: Block -> [Block]
divCorrectPreReduce' (Div (ident, classes, kvs) blks)
| isJust $ findHeaderClass classes =
let n = fromJust $ findHeaderClass classes
in
[Header n (ident, delete ("Heading" ++ (show n)) classes, kvs) (blksToInlines blks)]
| otherwise = [Div (ident, classes, kvs) blks]
divCorrectPreReduce' blk = [blk]
divCorrectPreReduce :: [Block] -> [Block]
divCorrectPreReduce = concatMap divCorrectPreReduce'
blkToCode :: Block -> String
blkToCode (Para []) = ""
blkToCode (Para ((Code _ s):ils)) = s ++ (blkToCode (Para ils))
blkToCode (Para ((Span (_, classes, _) ils'): ils))
| (not . null) (codeSpans `intersect` classes) =
(init $ unlines $ map ilToCode ils') ++ (blkToCode (Para ils))
blkToCode _ = ""
divRemove' :: Block -> [Block]
divRemove' (Div (_, _, kvs) blks) =
case lookup "indent" kvs of
Just val -> [Div ("", [], [("indent", val)]) blks]
Nothing -> blks
divRemove' blk = [blk]
divRemove :: [Block] -> [Block]
divRemove = concatMap divRemove'
divCorrect' :: Block -> [Block]
divCorrect' b@(Div (ident, classes, kvs) blks)
| (not . null) (blockQuoteDivs `intersect` classes) =
[BlockQuote [Div (ident, classes \\ blockQuoteDivs, kvs) blks]]
| (not . null) (codeDivs `intersect` classes) =
[CodeBlock (ident, (classes \\ codeDivs), kvs) (init $ unlines $ map blkToCode blks)]
| otherwise =
case lookup "indent" kvs of
Just "0" -> [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]
Just _ ->
[BlockQuote [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]]
Nothing -> [b]
divCorrect' blk = [blk]
divCorrect :: [Block] -> [Block]
divCorrect = concatMap divCorrect'

View file

@ -0,0 +1,208 @@
{-
Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.DocX.Lists
Copyright : Copyright (C) 2014 Jesse Rosenthal
License : GNU GPL, version 2 or above
Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
Stability : alpha
Portability : portable
Functions for converting flat DocX paragraphs into nested lists.
-}
module Text.Pandoc.Readers.DocX.Lists ( blocksToBullets
, blocksToDefinitions) where
import Text.Pandoc.JSON
import Text.Pandoc.Shared (trim)
import Control.Monad
import Data.List
import Data.Maybe
isListItem :: Block -> Bool
isListItem (Div (_, classes, _) _) | "list-item" `elem` classes = True
isListItem _ = False
getLevel :: Block -> Maybe Integer
getLevel (Div (_, _, kvs) _) = liftM read $ lookup "level" kvs
getLevel _ = Nothing
getLevelN :: Block -> Integer
getLevelN b = case getLevel b of
Just n -> n
Nothing -> -1
getNumId :: Block -> Maybe Integer
getNumId (Div (_, _, kvs) _) = liftM read $ lookup "num-id" kvs
getNumId _ = Nothing
getNumIdN :: Block -> Integer
getNumIdN b = case getNumId b of
Just n -> n
Nothing -> -1
getText :: Block -> Maybe String
getText (Div (_, _, kvs) _) = lookup "text" kvs
getText _ = Nothing
data ListType = Itemized | Enumerated ListAttributes
listStyleMap :: [(String, ListNumberStyle)]
listStyleMap = [("upperLetter", UpperAlpha),
("lowerLetter", LowerAlpha),
("upperRoman", UpperRoman),
("lowerRoman", LowerRoman),
("decimal", Decimal)]
listDelimMap :: [(String, ListNumberDelim)]
listDelimMap = [("%1)", OneParen),
("(%1)", TwoParens),
("%1.", Period)]
getListType :: Block -> Maybe ListType
getListType b@(Div (_, _, kvs) _) | isListItem b =
let
start = lookup "start" kvs
frmt = lookup "format" kvs
txt = lookup "text" kvs
in
case frmt of
Just "bullet" -> Just Itemized
Just f ->
case txt of
Just t -> Just $ Enumerated (
read (fromMaybe "1" start) :: Int,
fromMaybe DefaultStyle (lookup f listStyleMap),
fromMaybe DefaultDelim (lookup t listDelimMap))
Nothing -> Nothing
_ -> Nothing
getListType _ = Nothing
listParagraphDivs :: [String]
listParagraphDivs = ["ListParagraph"]
-- This is a first stab at going through and attaching meaning to list
-- paragraphs, without an item marker, following a list item. We
-- assume that these are paragraphs in the same item.
handleListParagraphs :: [Block] -> [Block]
handleListParagraphs [] = []
handleListParagraphs (
(Div attr1@(_, classes1, _) blks1) :
(Div (ident2, classes2, kvs2) blks2) :
blks
) | "list-item" `elem` classes1 &&
not ("list-item" `elem` classes2) &&
(not . null) (listParagraphDivs `intersect` classes2) =
-- We don't want to keep this indent.
let newDiv2 =
(Div (ident2, classes2, filter (\kv -> fst kv /= "indent") kvs2) blks2)
in
handleListParagraphs ((Div attr1 (blks1 ++ [newDiv2])) : blks)
handleListParagraphs (blk:blks) = blk : (handleListParagraphs blks)
separateBlocks' :: Block -> [[Block]] -> [[Block]]
separateBlocks' blk ([] : []) = [[blk]]
separateBlocks' b@(BulletList _) acc = (init acc) ++ [(last acc) ++ [b]]
separateBlocks' b@(OrderedList _ _) acc = (init acc) ++ [(last acc) ++ [b]]
-- The following is for the invisible bullet lists. This is how
-- pandoc-generated ooxml does multiparagraph item lists.
separateBlocks' b acc | liftM trim (getText b) == Just "" =
(init acc) ++ [(last acc) ++ [b]]
separateBlocks' b acc = acc ++ [[b]]
separateBlocks :: [Block] -> [[Block]]
separateBlocks blks = foldr separateBlocks' [[]] (reverse blks)
flatToBullets' :: Integer -> [Block] -> [Block]
flatToBullets' _ [] = []
flatToBullets' num xs@(b : elems)
| getLevelN b == num = b : (flatToBullets' num elems)
| otherwise =
let bNumId = getNumIdN b
bLevel = getLevelN b
(children, remaining) =
span
(\b' ->
((getLevelN b') > bLevel ||
((getLevelN b') == bLevel && (getNumIdN b') == bNumId)))
xs
in
case getListType b of
Just (Enumerated attr) ->
(OrderedList attr (separateBlocks $ flatToBullets' bLevel children)) :
(flatToBullets' num remaining)
_ ->
(BulletList (separateBlocks $ flatToBullets' bLevel children)) :
(flatToBullets' num remaining)
flatToBullets :: [Block] -> [Block]
flatToBullets elems = flatToBullets' (-1) elems
blocksToBullets :: [Block] -> [Block]
blocksToBullets blks =
-- bottomUp removeListItemDivs $
flatToBullets $ (handleListParagraphs blks)
plainParaInlines :: Block -> [Inline]
plainParaInlines (Plain ils) = ils
plainParaInlines (Para ils) = ils
plainParaInlines _ = []
blocksToDefinitions' :: [([Inline], [[Block]])] -> [Block] -> [Block] -> [Block]
blocksToDefinitions' [] acc [] = reverse acc
blocksToDefinitions' defAcc acc [] =
reverse $ (DefinitionList (reverse defAcc)) : acc
blocksToDefinitions' defAcc acc
((Div (_, classes1, _) blks1) : (Div (ident2, classes2, kvs2) blks2) : blks)
| "DefinitionTerm" `elem` classes1 && "Definition" `elem` classes2 =
let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2)
pair = case remainingAttr2 == ("", [], []) of
True -> (concatMap plainParaInlines blks1, [blks2])
False -> (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]])
in
blocksToDefinitions' (pair : defAcc) acc blks
blocksToDefinitions' defAcc acc
((Div (ident2, classes2, kvs2) blks2) : blks)
| (not . null) defAcc && "Definition" `elem` classes2 =
let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2)
defItems2 = case remainingAttr2 == ("", [], []) of
True -> blks2
False -> [Div remainingAttr2 blks2]
((defTerm, defItems):defs) = defAcc
defAcc' = case null defItems of
True -> (defTerm, [defItems2]) : defs
False -> (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs
in
blocksToDefinitions' defAcc' acc blks
blocksToDefinitions' [] acc (b:blks) =
blocksToDefinitions' [] (b:acc) blks
blocksToDefinitions' defAcc acc (b:blks) =
blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks
blocksToDefinitions :: [Block] -> [Block]
blocksToDefinitions = blocksToDefinitions' [] []

View file

@ -0,0 +1,604 @@
{-
Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.DocX.Parse
Copyright : Copyright (C) 2014 Jesse Rosenthal
License : GNU GPL, version 2 or above
Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
Stability : alpha
Portability : portable
Conversion of DocX archive into DocX haskell type
-}
module Text.Pandoc.Readers.DocX.Parse ( DocX(..)
, Document(..)
, Body(..)
, BodyPart(..)
, TblLook(..)
, ParPart(..)
, Run(..)
, RunElem(..)
, Notes
, Numbering
, Relationship
, Media
, RunStyle(..)
, ParagraphStyle(..)
, Row(..)
, Cell(..)
, getFootNote
, getEndNote
, lookupLevel
, lookupRelationship
, archiveToDocX
) where
import Codec.Archive.Zip
import Text.XML.Light
import Data.Maybe
import Data.List
import System.FilePath
import Data.Bits ((.|.))
import qualified Data.ByteString.Lazy as B
import qualified Text.Pandoc.UTF8 as UTF8
attrToNSPair :: Attr -> Maybe (String, String)
attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val)
attrToNSPair _ = Nothing
type NameSpaces = [(String, String)]
data DocX = DocX Document Notes Numbering [Relationship] Media
deriving Show
archiveToDocX :: Archive -> Maybe DocX
archiveToDocX archive = do
let notes = archiveToNotes archive
rels = archiveToRelationships archive
media = archiveToMedia archive
doc <- archiveToDocument archive
numbering <- archiveToNumbering archive
return $ DocX doc notes numbering rels media
data Document = Document NameSpaces Body
deriving Show
archiveToDocument :: Archive -> Maybe Document
archiveToDocument zf = do
entry <- findEntryByPath "word/document.xml" zf
docElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
bodyElem <- findChild (QName "body" (lookup "w" namespaces) Nothing) docElem
body <- elemToBody namespaces bodyElem
return $ Document namespaces body
type Media = [(FilePath, B.ByteString)]
filePathIsMedia :: FilePath -> Bool
filePathIsMedia fp =
let (dir, _) = splitFileName fp
in
(dir == "word/media/")
getMediaPair :: Archive -> FilePath -> Maybe (FilePath, B.ByteString)
getMediaPair zf fp =
case findEntryByPath fp zf of
Just e -> Just (fp, fromEntry e)
Nothing -> Nothing
archiveToMedia :: Archive -> Media
archiveToMedia zf =
mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf))
data Numbering = Numbering NameSpaces [Numb] [AbstractNumb]
deriving Show
data Numb = Numb String String -- right now, only a key to an abstract num
deriving Show
data AbstractNumb = AbstractNumb String [Level]
deriving Show
-- (ilvl, format, string, start)
type Level = (String, String, String, Maybe Integer)
lookupLevel :: String -> String -> Numbering -> Maybe Level
lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do
absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs
lvls <- lookup absNumId $ map (\(AbstractNumb aid ls) -> (aid, ls)) absNumbs
lvl <- lookup ilvl $ map (\l@(i, _, _, _) -> (i, l)) lvls
return lvl
numElemToNum :: NameSpaces -> Element -> Maybe Numb
numElemToNum ns element |
qName (elName element) == "num" &&
qURI (elName element) == (lookup "w" ns) = do
numId <- findAttr (QName "numId" (lookup "w" ns) (Just "w")) element
absNumId <- findChild (QName "abstractNumId" (lookup "w" ns) (Just "w")) element
>>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
return $ Numb numId absNumId
numElemToNum _ _ = Nothing
absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb
absNumElemToAbsNum ns element |
qName (elName element) == "abstractNum" &&
qURI (elName element) == (lookup "w" ns) = do
absNumId <- findAttr
(QName "abstractNumId" (lookup "w" ns) (Just "w"))
element
let levelElems = findChildren
(QName "lvl" (lookup "w" ns) (Just "w"))
element
levels = mapMaybe id $ map (levelElemToLevel ns) levelElems
return $ AbstractNumb absNumId levels
absNumElemToAbsNum _ _ = Nothing
levelElemToLevel :: NameSpaces -> Element -> Maybe Level
levelElemToLevel ns element |
qName (elName element) == "lvl" &&
qURI (elName element) == (lookup "w" ns) = do
ilvl <- findAttr (QName "ilvl" (lookup "w" ns) (Just "w")) element
fmt <- findChild (QName "numFmt" (lookup "w" ns) (Just "w")) element
>>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
txt <- findChild (QName "lvlText" (lookup "w" ns) (Just "w")) element
>>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
let start = findChild (QName "start" (lookup "w" ns) (Just "w")) element
>>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
>>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)])))
return (ilvl, fmt, txt, start)
levelElemToLevel _ _ = Nothing
archiveToNumbering :: Archive -> Maybe Numbering
archiveToNumbering zf =
case findEntryByPath "word/numbering.xml" zf of
Nothing -> Just $ Numbering [] [] []
Just entry -> do
numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
let namespaces = mapMaybe attrToNSPair (elAttribs numberingElem)
numElems = findChildren
(QName "num" (lookup "w" namespaces) (Just "w"))
numberingElem
absNumElems = findChildren
(QName "abstractNum" (lookup "w" namespaces) (Just "w"))
numberingElem
nums = mapMaybe id $ map (numElemToNum namespaces) numElems
absNums = mapMaybe id $ map (absNumElemToAbsNum namespaces) absNumElems
return $ Numbering namespaces nums absNums
data Notes = Notes NameSpaces (Maybe [(String, [BodyPart])]) (Maybe [(String, [BodyPart])])
deriving Show
noteElemToNote :: NameSpaces -> Element -> Maybe (String, [BodyPart])
noteElemToNote ns element
| qName (elName element) `elem` ["endnote", "footnote"] &&
qURI (elName element) == (lookup "w" ns) =
do
noteId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element
let bps = map fromJust
$ filter isJust
$ map (elemToBodyPart ns)
$ filterChildrenName (isParOrTbl ns) element
return $ (noteId, bps)
noteElemToNote _ _ = Nothing
getFootNote :: String -> Notes -> Maybe [BodyPart]
getFootNote s (Notes _ fns _) = fns >>= (lookup s)
getEndNote :: String -> Notes -> Maybe [BodyPart]
getEndNote s (Notes _ _ ens) = ens >>= (lookup s)
elemToNotes :: NameSpaces -> String -> Element -> Maybe [(String, [BodyPart])]
elemToNotes ns notetype element
| qName (elName element) == (notetype ++ "s") &&
qURI (elName element) == (lookup "w" ns) =
Just $ map fromJust
$ filter isJust
$ map (noteElemToNote ns)
$ findChildren (QName notetype (lookup "w" ns) (Just "w")) element
elemToNotes _ _ _ = Nothing
archiveToNotes :: Archive -> Notes
archiveToNotes zf =
let fnElem = findEntryByPath "word/footnotes.xml" zf
>>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
enElem = findEntryByPath "word/endnotes.xml" zf
>>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
fn_namespaces = case fnElem of
Just e -> mapMaybe attrToNSPair (elAttribs e)
Nothing -> []
en_namespaces = case enElem of
Just e -> mapMaybe attrToNSPair (elAttribs e)
Nothing -> []
ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces
fn = fnElem >>= (elemToNotes ns "footnote")
en = enElem >>= (elemToNotes ns "endnote")
in
Notes ns fn en
data Relationship = Relationship (RelId, Target)
deriving Show
lookupRelationship :: RelId -> [Relationship] -> Maybe Target
lookupRelationship relid rels =
lookup relid (map (\(Relationship pair) -> pair) rels)
filePathIsRel :: FilePath -> Bool
filePathIsRel fp =
let (dir, name) = splitFileName fp
in
(dir == "word/_rels/") && ((takeExtension name) == ".rels")
relElemToRelationship :: Element -> Maybe Relationship
relElemToRelationship element | qName (elName element) == "Relationship" =
do
relId <- findAttr (QName "Id" Nothing Nothing) element
target <- findAttr (QName "Target" Nothing Nothing) element
return $ Relationship (relId, target)
relElemToRelationship _ = Nothing
archiveToRelationships :: Archive -> [Relationship]
archiveToRelationships archive =
let relPaths = filter filePathIsRel (filesInArchive archive)
entries = map fromJust $ filter isJust $ map (\f -> findEntryByPath f archive) relPaths
relElems = map fromJust $ filter isJust $ map (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries
rels = map fromJust $ filter isJust $ map relElemToRelationship $ concatMap elChildren relElems
in
rels
data Body = Body [BodyPart]
deriving Show
isParOrTbl :: NameSpaces -> QName -> Bool
isParOrTbl ns q = qName q `elem` ["p", "tbl"] &&
qURI q == (lookup "w" ns)
elemToBody :: NameSpaces -> Element -> Maybe Body
elemToBody ns element | qName (elName element) == "body" && qURI (elName element) == (lookup "w" ns) =
Just $ Body
$ map fromJust
$ filter isJust
$ map (elemToBodyPart ns) $ filterChildrenName (isParOrTbl ns) element
elemToBody _ _ = Nothing
isRunOrLinkOrBookmark :: NameSpaces -> QName -> Bool
isRunOrLinkOrBookmark ns q = qName q `elem` ["r", "hyperlink", "bookmarkStart"] &&
qURI q == (lookup "w" ns)
elemToNumInfo :: NameSpaces -> Element -> Maybe (String, String)
elemToNumInfo ns element
| qName (elName element) == "p" &&
qURI (elName element) == (lookup "w" ns) =
do
pPr <- findChild (QName "pPr" (lookup "w" ns) (Just "w")) element
numPr <- findChild (QName "numPr" (lookup "w" ns) (Just "w")) pPr
lvl <- findChild (QName "ilvl" (lookup "w" ns) (Just "w")) numPr >>=
findAttr (QName "val" (lookup "w" ns) (Just "w"))
numId <- findChild (QName "numId" (lookup "w" ns) (Just "w")) numPr >>=
findAttr (QName "val" (lookup "w" ns) (Just "w"))
return (numId, lvl)
elemToNumInfo _ _ = Nothing
-- isBookMarkTag :: NameSpaces -> QName -> Bool
-- isBookMarkTag ns q = qName q `elem` ["bookmarkStart", "bookmarkEnd"] &&
-- qURI q == (lookup "w" ns)
-- parChildrenToBookmark :: NameSpaces -> [Element] -> BookMark
-- parChildrenToBookmark ns (bms : bme : _)
-- | qName (elName bms) == "bookmarkStart" &&
-- qURI (elName bms) == (lookup "w" ns) &&
-- qName (elName bme) == "bookmarkEnd" &&
-- qURI (elName bme) == (lookup "w" ns) = do
-- bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) bms
-- bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) bms
-- return $ (bmId, bmName)
-- parChildrenToBookmark _ _ = Nothing
elemToBodyPart :: NameSpaces -> Element -> Maybe BodyPart
elemToBodyPart ns element
| qName (elName element) == "p" &&
qURI (elName element) == (lookup "w" ns) =
let parstyle = elemToParagraphStyle ns element
parparts = mapMaybe id
$ map (elemToParPart ns)
$ filterChildrenName (isRunOrLinkOrBookmark ns) element
in
case elemToNumInfo ns element of
Just (numId, lvl) -> Just $ ListItem parstyle numId lvl parparts
Nothing -> Just $ Paragraph parstyle parparts
| qName (elName element) == "tbl" &&
qURI (elName element) == (lookup "w" ns) =
let
caption = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element
>>= findChild (QName "tblCaption" (lookup "w" ns) (Just "w"))
>>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
grid = case
findChild (QName "tblGrid" (lookup "w" ns) (Just "w")) element
of
Just g -> elemToTblGrid ns g
Nothing -> []
tblLook = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element
>>= findChild (QName "tblLook" (lookup "w" ns) (Just "w"))
>>= elemToTblLook ns
in
Just $ Tbl
(fromMaybe "" caption)
grid
(fromMaybe defaultTblLook tblLook)
(mapMaybe (elemToRow ns) (elChildren element))
| otherwise = Nothing
elemToTblLook :: NameSpaces -> Element -> Maybe TblLook
elemToTblLook ns element
| qName (elName element) == "tblLook" &&
qURI (elName element) == (lookup "w" ns) =
let firstRow = findAttr (QName "firstRow" (lookup "w" ns) (Just "w")) element
val = findAttr (QName "val" (lookup "w" ns) (Just "w")) element
firstRowFmt =
case firstRow of
Just "1" -> True
Just _ -> False
Nothing -> case val of
Just bitMask -> testBitMask bitMask 0x020
Nothing -> False
in
Just $ TblLook{firstRowFormatting = firstRowFmt}
elemToTblLook _ _ = Nothing
testBitMask :: String -> Int -> Bool
testBitMask bitMaskS n =
case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of
[] -> False
((n', _) : _) -> ((n' .|. n) /= 0)
data ParagraphStyle = ParagraphStyle { pStyle :: [String]
, indent :: Maybe Integer
}
deriving Show
defaultParagraphStyle :: ParagraphStyle
defaultParagraphStyle = ParagraphStyle { pStyle = []
, indent = Nothing
}
elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle
elemToParagraphStyle ns element =
case findChild (QName "pPr" (lookup "w" ns) (Just "w")) element of
Just pPr ->
ParagraphStyle
{pStyle =
mapMaybe id $
map
(findAttr (QName "val" (lookup "w" ns) (Just "w")))
(findChildren (QName "pStyle" (lookup "w" ns) (Just "w")) pPr)
, indent =
findChild (QName "ind" (lookup "w" ns) (Just "w")) pPr >>=
findAttr (QName "left" (lookup "w" ns) (Just "w")) >>=
stringToInteger
}
Nothing -> defaultParagraphStyle
data BodyPart = Paragraph ParagraphStyle [ParPart]
| ListItem ParagraphStyle String String [ParPart]
| Tbl String TblGrid TblLook [Row]
deriving Show
type TblGrid = [Integer]
data TblLook = TblLook {firstRowFormatting::Bool}
deriving Show
defaultTblLook :: TblLook
defaultTblLook = TblLook{firstRowFormatting = False}
stringToInteger :: String -> Maybe Integer
stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)])
elemToTblGrid :: NameSpaces -> Element -> TblGrid
elemToTblGrid ns element
| qName (elName element) == "tblGrid" &&
qURI (elName element) == (lookup "w" ns) =
let
cols = findChildren (QName "gridCol" (lookup "w" ns) (Just "w")) element
in
mapMaybe (\e ->
findAttr (QName "val" (lookup "w" ns) (Just ("w"))) e
>>= stringToInteger
)
cols
elemToTblGrid _ _ = []
data Row = Row [Cell]
deriving Show
elemToRow :: NameSpaces -> Element -> Maybe Row
elemToRow ns element
| qName (elName element) == "tr" &&
qURI (elName element) == (lookup "w" ns) =
let
cells = findChildren (QName "tc" (lookup "w" ns) (Just "w")) element
in
Just $ Row (mapMaybe (elemToCell ns) cells)
elemToRow _ _ = Nothing
data Cell = Cell [BodyPart]
deriving Show
elemToCell :: NameSpaces -> Element -> Maybe Cell
elemToCell ns element
| qName (elName element) == "tc" &&
qURI (elName element) == (lookup "w" ns) =
Just $ Cell (mapMaybe (elemToBodyPart ns) (elChildren element))
elemToCell _ _ = Nothing
data ParPart = PlainRun Run
| BookMark BookMarkId Anchor
| InternalHyperLink Anchor [Run]
| ExternalHyperLink RelId [Run]
| Drawing String
deriving Show
data Run = Run RunStyle [RunElem]
| Footnote String
| Endnote String
deriving Show
data RunElem = TextRun String | LnBrk
deriving Show
data RunStyle = RunStyle { isBold :: Bool
, isItalic :: Bool
, isSmallCaps :: Bool
, isStrike :: Bool
, isSuperScript :: Bool
, isSubScript :: Bool
, underline :: Maybe String
, rStyle :: Maybe String }
deriving Show
defaultRunStyle :: RunStyle
defaultRunStyle = RunStyle { isBold = False
, isItalic = False
, isSmallCaps = False
, isStrike = False
, isSuperScript = False
, isSubScript = False
, underline = Nothing
, rStyle = Nothing
}
elemToRunStyle :: NameSpaces -> Element -> RunStyle
elemToRunStyle ns element =
case findChild (QName "rPr" (lookup "w" ns) (Just "w")) element of
Just rPr ->
RunStyle
{
isBold = isJust $ findChild (QName "b" (lookup "w" ns) (Just "w")) rPr
, isItalic = isJust $ findChild (QName "i" (lookup "w" ns) (Just "w")) rPr
, isSmallCaps = isJust $ findChild (QName "smallCaps" (lookup "w" ns) (Just "w")) rPr
, isStrike = isJust $ findChild (QName "strike" (lookup "w" ns) (Just "w")) rPr
, isSuperScript =
(Just "superscript" ==
(findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>=
findAttr (QName "val" (lookup "w" ns) (Just "w"))))
, isSubScript =
(Just "subscript" ==
(findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>=
findAttr (QName "val" (lookup "w" ns) (Just "w"))))
, underline =
findChild (QName "u" (lookup "w" ns) (Just "w")) rPr >>=
findAttr (QName "val" (lookup "w" ns) (Just "w"))
, rStyle =
findChild (QName "rStyle" (lookup "w" ns) (Just "w")) rPr >>=
findAttr (QName "val" (lookup "w" ns) (Just "w"))
}
Nothing -> defaultRunStyle
elemToRun :: NameSpaces -> Element -> Maybe Run
elemToRun ns element
| qName (elName element) == "r" &&
qURI (elName element) == (lookup "w" ns) =
case
findChild (QName "footnoteReference" (lookup "w" ns) (Just "w")) element >>=
findAttr (QName "id" (lookup "w" ns) (Just "w"))
of
Just s -> Just $ Footnote s
Nothing ->
case
findChild (QName "endnoteReference" (lookup "w" ns) (Just "w")) element >>=
findAttr (QName "id" (lookup "w" ns) (Just "w"))
of
Just s -> Just $ Endnote s
Nothing -> Just $
Run (elemToRunStyle ns element)
(elemToRunElems ns element)
elemToRun _ _ = Nothing
elemToRunElem :: NameSpaces -> Element -> Maybe RunElem
elemToRunElem ns element
| qName (elName element) == "t" &&
qURI (elName element) == (lookup "w" ns) =
Just $ TextRun (strContent element)
| qName (elName element) == "br" &&
qURI (elName element) == (lookup "w" ns) =
Just $ LnBrk
| otherwise = Nothing
elemToRunElems :: NameSpaces -> Element -> [RunElem]
elemToRunElems ns element
| qName (elName element) == "r" &&
qURI (elName element) == (lookup "w" ns) =
mapMaybe (elemToRunElem ns) (elChildren element)
| otherwise = []
elemToDrawing :: NameSpaces -> Element -> Maybe ParPart
elemToDrawing ns element
| qName (elName element) == "drawing" &&
qURI (elName element) == (lookup "w" ns) =
let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
in
findElement (QName "blip" (Just a_ns) (Just "a")) element
>>= findAttr (QName "embed" (lookup "r" ns) (Just "r"))
>>= (\s -> Just $ Drawing s)
elemToDrawing _ _ = Nothing
elemToParPart :: NameSpaces -> Element -> Maybe ParPart
elemToParPart ns element
| qName (elName element) == "r" &&
qURI (elName element) == (lookup "w" ns) =
case findChild (QName "drawing" (lookup "w" ns) (Just "w")) element of
Just drawingElem -> elemToDrawing ns drawingElem
Nothing -> do
r <- elemToRun ns element
return $ PlainRun r
elemToParPart ns element
| qName (elName element) == "bookmarkStart" &&
qURI (elName element) == (lookup "w" ns) = do
bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element
bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) element
return $ BookMark bmId bmName
elemToParPart ns element
| qName (elName element) == "hyperlink" &&
qURI (elName element) == (lookup "w" ns) =
let runs = map fromJust $ filter isJust $ map (elemToRun ns)
$ findChildren (QName "r" (lookup "w" ns) (Just "w")) element
in
case findAttr (QName "anchor" (lookup "w" ns) (Just "w")) element of
Just anchor ->
Just $ InternalHyperLink anchor runs
Nothing ->
case findAttr (QName "id" (lookup "r" ns) (Just "r")) element of
Just relId -> Just $ ExternalHyperLink relId runs
Nothing -> Nothing
elemToParPart _ _ = Nothing
type Target = String
type Anchor = String
type BookMarkId = String
type RelId = String

View file

@ -0,0 +1,68 @@
module Tests.Readers.DocX (tests) where
import Text.Pandoc.Options
import Text.Pandoc.Readers.Native
import Text.Pandoc.Definition
import Tests.Helpers
import Test.Framework
import qualified Data.ByteString.Lazy as B
import Text.Pandoc.Readers.DocX
compareOutput :: FilePath -> FilePath -> IO (Pandoc, Pandoc)
compareOutput docxFile nativeFile = do
df <- B.readFile docxFile
nf <- Prelude.readFile nativeFile
return $ (readDocX def df, readNative nf)
testCompare' :: String -> FilePath -> FilePath -> IO Test
testCompare' name docxFile nativeFile = do
(dp, np) <- compareOutput docxFile nativeFile
return $ test id name (dp, np)
testCompare :: String -> FilePath -> FilePath -> Test
testCompare name docxFile nativeFile =
buildTest $ testCompare' name docxFile nativeFile
tests :: [Test]
tests = [ testGroup "inlines"
[ testCompare
"font formatting"
"docx.inline_formatting.docx"
"docx.inline_formatting.native"
, testCompare
"hyperlinks"
"docx.links.docx"
"docx.links.native"
, testCompare
"inline image with reference output"
"docx.image.docx"
"docx.image_no_embed.native"
, testCompare
"handling unicode input"
"docx.unicode.docx"
"docx.unicode.native"]
, testGroup "blocks"
[ testCompare
"headers"
"docx.headers.docx"
"docx.headers.native"
, testCompare
"lists"
"docx.lists.docx"
"docx.lists.native"
, testCompare
"footnotes and endnotes"
"docx.notes.docx"
"docx.notes.native"
, testCompare
"blockquotes (parsing indent as blockquote)"
"docx.block_quotes.docx"
"docx.block_quotes_parse_indent.native"
, testCompare
"tables"
"docx.tables.docx"
"docx.tables.native"
]
]

Binary file not shown.

View file

@ -0,0 +1,8 @@
[Header 2 ("",[],[]) [Str "Some",Space,Str "block",Space,Str "quotes,",Space,Str "in",Space,Str "different",Space,Str "ways"]
,Para [Str "This",Space,Str "is",Space,Str "the",Space,Str "proper",Space,Str "way,",Space,Str "with",Space,Str "a",Space,Str "style"]
,BlockQuote
[Para [Str "I",Space,Str "don\8217t",Space,Str "know",Space,Str "why",Space,Str "this",Space,Str "would",Space,Str "be",Space,Str "in",Space,Str "italics,",Space,Str "but",Space,Str "so",Space,Str "it",Space,Str "appears",Space,Str "to",Space,Str "be",Space,Str "on",Space,Str "my",Space,Str "screen."]]
,Para [Str "And",Space,Str "this",Space,Str "is",Space,Str "the",Space,Str "way",Space,Str "that",Space,Str "most",Space,Str "people",Space,Str "do",Space,Str "it:"]
,BlockQuote
[Para [Str "I",Space,Str "just",Space,Str "indented",Space,Str "this,",Space,Str "so",Space,Str "it",Space,Str "looks",Space,Str "like",Space,Str "a",Space,Str "block",Space,Str "quote.",Space,Str "I",Space,Str "think",Space,Str "this",Space,Str "is",Space,Str "how",Space,Str "most",Space,Str "people",Space,Str "do",Space,Str "block",Space,Str "quotes",Space,Str "in",Space,Str "their",Space,Str "documents."]]
,Para [Str "And",Space,Str "back",Space,Str "to",Space,Str "the",Space,Str "normal",Space,Str "style."]]

BIN
tests/docx.headers.docx Normal file

Binary file not shown.

View file

@ -0,0 +1,5 @@
[Header 1 ("",[],[]) [Str "A",Space,Str "Test",Space,Str "of",Space,Str "Headers"]
,Header 2 ("",[],[]) [Str "Second",Space,Str "Level"]
,Para [Str "Some",Space,Str "plain",Space,Str "text."]
,Header 3 ("",[],[]) [Str "Third",Space,Str "level"]
,Para [Str "Some",Space,Str "more",Space,Str "plain",Space,Str "text."]]

BIN
tests/docx.image.docx Normal file

Binary file not shown.

View file

@ -0,0 +1,2 @@
[Header 2 ("",[],[]) [Str "An",Space,Str "image"]
,Para [Image [] ("word/media/image1.jpeg","")]]

Binary file not shown.

View file

@ -0,0 +1,5 @@
[Para [Str "Regular",Space,Str "text",Space,Emph [Str "italics"],Space,Strong [Str "bold",Space,Emph [Str "bold",Space,Str "italics"]],Str "."]
,Para [Str "This",Space,Str "is",Space,SmallCaps [Str "Small",Space,Str "Caps"],Str ",",Space,Str "and",Space,Str "this",Space,Str "is",Space,Strikeout [Str "strikethrough"],Str "."]
,Para [Str "Some",Space,Str "people",Space,Str "use",Space,Span ("",[],[("underline","single")]) [Str "single",Space,Str "underlines",Space,Str "for",Space,Emph [Str "emphasis"]],Str "."]
,Para [Str "Above",Space,Str "the",Space,Str "line",Space,Str "is",Space,Superscript [Str "superscript"],Space,Str "and",Space,Str "below",Space,Str "the",Space,Str "line",Space,Str "is",Space,Subscript [Str "subscript"],Str "."]
,Para [Str "A",Space,Str "line",LineBreak,Str "break."]]

BIN
tests/docx.links.docx Normal file

Binary file not shown.

6
tests/docx.links.native Normal file
View file

@ -0,0 +1,6 @@
[Header 2 ("",[],[]) [Str "An",Space,Str "internal",Space,Str "link",Space,Str "and",Space,Str "an",Space,Str "external",Space,Str "link"]
,Para [Str "An",Space,Link [Str "external",Space,Str "link"] ("http://google.com",""),Space,Str "to",Space,Str "a",Space,Str "popular",Space,Str "website."]
,Para [Str "An",Space,Link [Str "internal",Space,Str "link"] ("#_A_section_for",""),Space,Str "to",Space,Str "a",Space,Str "section",Space,Str "header."]
,Para [Str "An",Space,Link [Str "internal",Space,Str "link"] ("#my_bookmark",""),Space,Str "to",Space,Str "a",Space,Str "bookmark."]
,Header 2 ("_A_section_for",[],[]) [Str "A",Space,Str "section",Space,Str "for",Space,Str "testing",Space,Str "link",Space,Str "targets"]
,Para [Str "A",Space,Str "bookmark",Space,Str "right",Space,Span ("my_bookmark",["anchor"],[]) [],Str "here"]]

BIN
tests/docx.lists.docx Normal file

Binary file not shown.

18
tests/docx.lists.native Normal file
View file

@ -0,0 +1,18 @@
[Header 2 ("",[],[]) [Str "Some",Space,Str "nested",Space,Str "lists"]
,OrderedList (1,Decimal,Period)
[[Para [Str "one"]]
,[Para [Str "two"]
,OrderedList (1,LowerAlpha,DefaultDelim)
[[Para [Str "a"]]
,[Para [Str "b"]]]]]
,BulletList
[[Para [Str "one"]]
,[Para [Str "two"]
,BulletList
[[Para [Str "three"]
,BulletList
[[Para [Str "four"]
,Para [Str "Sub",Space,Str "paragraph"]]]]]]
,[Para [Str "Same",Space,Str "list"]]]
,BulletList
[[Para [Str "Different",Space,Str "list",Space,Str "adjacent",Space,Str "to",Space,Str "the",Space,Str "one",Space,Str "above."]]]]

BIN
tests/docx.notes.docx Normal file

Binary file not shown.

2
tests/docx.notes.native Normal file
View file

@ -0,0 +1,2 @@
[Header 2 ("",[],[]) [Str "A",Space,Str "footnote"]
,Para [Str "Test",Space,Str "footnote.",Note [Para [Space,Str "My",Space,Str "note."]],Space,Str "Test",Space,Str "endnote.",Note [Para [Space,Str "This",Space,Str "is",Space,Str "an",Space,Str "endnote",Space,Str "at",Space,Str "the",Space,Str "end",Space,Str "of",Space,Str "the",Space,Str "document."]]]]

BIN
tests/docx.tables.docx Normal file

Binary file not shown.

24
tests/docx.tables.native Normal file
View file

@ -0,0 +1,24 @@
[Header 2 ("",[],[]) [Str "A",Space,Str "table,",Space,Str "with",Space,Str "and",Space,Str "without",Space,Str "a",Space,Str "header",Space,Str "row"]
,Table [] [AlignDefault,AlignDefault,AlignDefault,AlignDefault] [0.0,0.0,0.0,0.0]
[[Para [Str "Name"]]
,[Para [Str "Game"]]
,[Para [Str "Fame"]]
,[Para [Str "Blame"]]]
[[[Para [Str "Lebron",Space,Str "James"]]
,[Para [Str "Basketball"]]
,[Para [Str "Very",Space,Str "High"]]
,[Para [Str "Leaving",Space,Str "Cleveland"]]]
,[[Para [Str "Ryan",Space,Str "Braun"]]
,[Para [Str "Baseball"]]
,[Para [Str "Moderate"]]
,[Para [Str "Steroids"]]]
,[[Para [Str "Russell",Space,Str "Wilson"]]
,[Para [Str "Football"]]
,[Para [Str "High"]]
,[Para [Str "Tacky",Space,Str "uniform"]]]]
,Table [] [AlignDefault,AlignDefault] [0.0,0.0]
[]
[[[Para [Str "Sinple"]]
,[Para [Str "Table"]]]
,[[Para [Str "Without"]]
,[Para [Str "Header"]]]]]

BIN
tests/docx.unicode.docx Normal file

Binary file not shown.

View file

@ -0,0 +1 @@
[Para [Str "Hello,",Space,Str "\19990\30028.",Space,Str "This",Space,Str "costs",Space,Str "\8364\&10."]]

View file

@ -9,6 +9,7 @@ import qualified Tests.Readers.LaTeX
import qualified Tests.Readers.Markdown
import qualified Tests.Readers.Org
import qualified Tests.Readers.RST
import qualified Tests.Readers.DocX
import qualified Tests.Writers.ConTeXt
import qualified Tests.Writers.LaTeX
import qualified Tests.Writers.HTML
@ -38,6 +39,8 @@ tests = [ testGroup "Old" Tests.Old.tests
, testGroup "Markdown" Tests.Readers.Markdown.tests
, testGroup "Org" Tests.Readers.Org.tests
, testGroup "RST" Tests.Readers.RST.tests
, testGroup "DocX" Tests.Readers.DocX.tests
]
]