Docx reader: pass code tests.

This commit is contained in:
Jesse Rosenthal 2014-06-24 10:34:07 -04:00
parent 21295c5ab5
commit bebea5e936

View file

@ -84,7 +84,7 @@ import Text.Pandoc.Readers.Docx.Parse
import Text.Pandoc.Readers.Docx.Lists import Text.Pandoc.Readers.Docx.Lists
import Text.Pandoc.Readers.Docx.Reducible import Text.Pandoc.Readers.Docx.Reducible
import Data.Maybe (mapMaybe, isJust, fromJust) import Data.Maybe (mapMaybe, isJust, fromJust)
import Data.List (delete, isPrefixOf, (\\), intersect) import Data.List (delete, isPrefixOf, (\\))
import qualified Data.ByteString as BS import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy as B
import Data.ByteString.Base64 (encode) import Data.ByteString.Base64 (encode)
@ -99,7 +99,7 @@ readDocx opts bytes =
Nothing -> error $ "couldn't parse docx file" Nothing -> error $ "couldn't parse docx file"
spansToKeep :: [String] spansToKeep :: [String]
spansToKeep = ["list-item", "Definition", "DefinitionTerm"] ++ codeSpans spansToKeep = ["list-item", "Definition", "DefinitionTerm"]
-- This is empty, but we put it in for future-proofing. -- This is empty, but we put it in for future-proofing.
@ -108,20 +108,28 @@ divsToKeep = []
runStyleToContainers :: RunStyle -> [Container Inline] runStyleToContainers :: RunStyle -> [Container Inline]
runStyleToContainers rPr = runStyleToContainers rPr =
let formatters = mapMaybe id let spanClassToContainers :: String -> [Container Inline]
spanClassToContainers s | s `elem` codeSpans =
[Container $ (\ils -> Code ("", [], []) (concatMap ilToCode ils))]
spanClassToContainers s | s `elem` spansToKeep =
[Container $ Span ("", [s], [])]
spanClassToContainers _ = []
classContainers = case rStyle rPr of
Nothing -> []
Just s -> spanClassToContainers s
formatters = map Container $ mapMaybe id
[ if isBold rPr then (Just Strong) else Nothing [ if isBold rPr then (Just Strong) else Nothing
, if isItalic rPr then (Just Emph) else Nothing , if isItalic rPr then (Just Emph) else Nothing
, if isSmallCaps rPr then (Just SmallCaps) else Nothing , if isSmallCaps rPr then (Just SmallCaps) else Nothing
, if isStrike rPr then (Just Strikeout) else Nothing , if isStrike rPr then (Just Strikeout) else Nothing
, if isSuperScript rPr then (Just Superscript) else Nothing , if isSuperScript rPr then (Just Superscript) else Nothing
, if isSubScript rPr then (Just Subscript) else Nothing , if isSubScript rPr then (Just Subscript) else Nothing
, rStyle rPr >>=
(\s -> if s `elem` spansToKeep then Just s else Nothing) >>=
(\s -> Just $ Span ("", [s], []))
, underline rPr >>= (\f -> Just $ Span ("", [], [("underline", f)])) , underline rPr >>= (\f -> Just $ Span ("", [], [("underline", f)]))
] ]
in in
map Container formatters classContainers ++ formatters
divAttrToContainers :: [String] -> [(String, String)] -> [Container Block] divAttrToContainers :: [String] -> [(String, String)] -> [Container Block]
@ -132,7 +140,9 @@ divAttrToContainers (c:cs) _ | isJust (isHeaderClass c) =
[(Container $ \blks -> [(Container $ \blks ->
Header n ("", delete ("Heading" ++ show n) cs, []) (blksToInlines blks))] Header n ("", delete ("Heading" ++ show n) cs, []) (blksToInlines blks))]
divAttrToContainers (c:_) _ | c `elem` codeDivs = divAttrToContainers (c:_) _ | c `elem` codeDivs =
[Container $ \blks -> CodeBlock ("", [], []) (concatMap blkToCode blks)] -- This is a bit of a cludge. We make the codeblock from the raw
-- parparts in bodyPartToBlocks. But we need something to match against.
[Container $ \_ -> CodeBlock ("", [], []) ""]
divAttrToContainers (c:cs) kvs | c `elem` listParagraphDivs = divAttrToContainers (c:cs) kvs | c `elem` listParagraphDivs =
let kvs' = filter (\(k,_) -> k /= "indent") kvs let kvs' = filter (\(k,_) -> k /= "indent") kvs
in in
@ -183,18 +193,23 @@ runElemToString (Tab) = ['\t']
runElemsToString :: [RunElem] -> String runElemsToString :: [RunElem] -> String
runElemsToString = concatMap runElemToString runElemsToString = concatMap runElemToString
runToString :: Run -> String
runToString (Run _ runElems) = runElemsToString runElems
runToString _ = ""
parPartToString :: ParPart -> String
parPartToString (PlainRun run) = runToString run
parPartToString (InternalHyperLink _ runs) = concatMap runToString runs
parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs
parPartToString _ = ""
inlineCodeContainer :: Container Inline -> Bool inlineCodeContainer :: Container Inline -> Bool
inlineCodeContainer (Container f) = case f [] of inlineCodeContainer (Container f) = case f [] of
Span (_, classes, _) _ -> (not . null) (classes `intersect` codeSpans) Code _ "" -> True
_ -> False _ -> False
inlineCodeContainer _ = False inlineCodeContainer _ = False
-- blockCodeContainer :: Container Block -> Bool
-- blockCodeContainer (Container f) = case f [] of
-- Div (ident, classes, kvs) _ -> (not . null) (classes `intersect` codeDivs)
-- _ -> False
-- blockCodeContainer _ = False
runToInlines :: ReaderOptions -> Docx -> Run -> [Inline] runToInlines :: ReaderOptions -> Docx -> Run -> [Inline]
runToInlines _ _ (Run rs runElems) runToInlines _ _ (Run rs runElems)
@ -274,7 +289,16 @@ cellToBlocks opts docx (Cell bps) = concatMap (bodyPartToBlocks opts docx) bps
rowToBlocksList :: ReaderOptions -> Docx -> Row -> [[Block]] rowToBlocksList :: ReaderOptions -> Docx -> Row -> [[Block]]
rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells
blockCodeContainer :: Container Block -> Bool
blockCodeContainer (Container f) = case f [] of
CodeBlock _ _ -> True
_ -> False
blockCodeContainer _ = False
bodyPartToBlocks :: ReaderOptions -> Docx -> BodyPart -> [Block] bodyPartToBlocks :: ReaderOptions -> Docx -> BodyPart -> [Block]
bodyPartToBlocks _ _ (Paragraph pPr parparts)
| any blockCodeContainer (parStyleToContainers pPr) =
[CodeBlock ("", [], []) (concatMap parPartToString parparts)]
bodyPartToBlocks opts docx (Paragraph pPr parparts) = bodyPartToBlocks opts docx (Paragraph pPr parparts) =
case parPartsToInlines opts docx parparts of case parPartsToInlines opts docx parparts of
[] -> [] ->
@ -358,6 +382,7 @@ docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = bodyToBlocks opts d body
ilToCode :: Inline -> String ilToCode :: Inline -> String
ilToCode (Str s) = s ilToCode (Str s) = s
ilToCode Space = " "
ilToCode _ = "" ilToCode _ = ""
@ -369,18 +394,7 @@ isHeaderClass s | "Heading" `isPrefixOf` s =
_ -> Nothing _ -> Nothing
isHeaderClass _ = Nothing isHeaderClass _ = Nothing
blksToInlines :: [Block] -> [Inline] blksToInlines :: [Block] -> [Inline]
blksToInlines (Para ils : _) = ils blksToInlines (Para ils : _) = ils
blksToInlines (Plain ils : _) = ils blksToInlines (Plain ils : _) = ils
blksToInlines _ = [] blksToInlines _ = []
blkToCode :: Block -> String
blkToCode (Para []) = ""
blkToCode (Para ((Code _ s):ils)) = s ++ (blkToCode (Para ils))
blkToCode (Para ((Span (_, classes, _) ils'): ils))
| (not . null) (codeSpans `intersect` classes) =
(init $ unlines $ map ilToCode ils') ++ (blkToCode (Para ils))
blkToCode _ = ""