[Docx Reader] Use style names, not ids, for assigning semantic meaning

Motivating issues: #5523, #5052, #5074

Style name comparisons are case-insensitive, since those are
case-insensitive in Word.

w:styleId will be used as style name if w:name is missing (this should
only happen for malformed docx and is kept as a fallback to avoid
failing altogether on malformed documents)

Block quote detection code moved from Docx.Parser to Readers.Docx

Code styles, i.e. "Source Code" and "Verbatim Char" now honor style
inheritance

Docx Reader now honours "Compact" style (used in Pandoc-generated docx).
The side-effect is that "Compact" style no longer shows up in
docx+styles output. Styles inherited from "Compact" will still
show up.

Removed obsolete list-item style from divsToKeep. That didn't
really do anything for a while now.

Add newtypes to differentiate between style names, ids, and
different style types (that is, paragraph and character styles)

Since docx style names can have spaces in them, and pandoc-markdown
classes can't, anywhere when style name is used as a class name,
spaces are replaced with ASCII dashes `-`.

Get rid of extraneous intermediate types, carrying styleId information.
Instead, styleId is saved with other style data.

Use RunStyle for inline style definitions only (lacking styleId and styleName);
for Character Styles use CharStyle type (which is basicaly RunStyle with styleId
and StyleName bolted onto it).
This commit is contained in:
Nikolay Yakimov 2019-09-15 01:40:23 +03:00 committed by John MacFarlane
parent fd14ad5261
commit c113ca6717
8 changed files with 306 additions and 183 deletions

View file

@ -3,6 +3,7 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PatternGuards #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE FlexibleContexts #-}
{- |
Module : Text.Pandoc.Readers.Docx
Copyright : Copyright (C) 2014-2019 Jesse Rosenthal
@ -65,6 +66,7 @@ import Control.Monad.State.Strict
import qualified Data.ByteString.Lazy as B
import Data.Default (Default)
import Data.List (delete, intersect)
import Data.Char (isSpace)
import qualified Data.Map as M
import Data.Maybe (isJust, fromMaybe)
import Data.Sequence (ViewL (..), viewl)
@ -133,13 +135,13 @@ evalDocxContext :: PandocMonad m => DocxContext m a -> DEnv -> DState -> m a
evalDocxContext ctx env st = flip evalStateT st $ runReaderT ctx env
-- This is empty, but we put it in for future-proofing.
spansToKeep :: [String]
spansToKeep :: [CharStyleName]
spansToKeep = []
divsToKeep :: [String]
divsToKeep = ["list-item", "Definition", "DefinitionTerm"]
divsToKeep :: [ParaStyleName]
divsToKeep = ["Definition", "Definition Term"]
metaStyles :: M.Map String String
metaStyles :: M.Map ParaStyleName String
metaStyles = M.fromList [ ("Title", "title")
, ("Subtitle", "subtitle")
, ("Author", "author")
@ -151,7 +153,7 @@ sepBodyParts = span (\bp -> isMetaPar bp || isEmptyPar bp)
isMetaPar :: BodyPart -> Bool
isMetaPar (Paragraph pPr _) =
not $ null $ intersect (pStyle pPr) (M.keys metaStyles)
not $ null $ intersect (getStyleNames $ pStyle pPr) (M.keys metaStyles)
isMetaPar _ = False
isEmptyPar :: BodyPart -> Bool
@ -168,7 +170,7 @@ bodyPartsToMeta' :: PandocMonad m => [BodyPart] -> DocxContext m (M.Map String M
bodyPartsToMeta' [] = return M.empty
bodyPartsToMeta' (bp : bps)
| (Paragraph pPr parParts) <- bp
, (c : _)<- (pStyle pPr) `intersect` (M.keys metaStyles)
, (c : _)<- getStyleNames (pStyle pPr) `intersect` M.keys metaStyles
, (Just metaField) <- M.lookup c metaStyles = do
inlines <- smushInlines <$> mapM parPartToInlines parParts
remaining <- bodyPartsToMeta' bps
@ -198,11 +200,29 @@ fixAuthors (MetaBlocks blks) =
g _ = MetaInlines []
fixAuthors mv = mv
codeStyles :: [String]
codeStyles = ["VerbatimChar"]
isInheritedFromStyles :: (Eq (StyleName s), HasStyleName s, HasParentStyle s) => [StyleName s] -> s -> Bool
isInheritedFromStyles names sty
| getStyleName sty `elem` names = True
| Just psty <- getParentStyle sty = isInheritedFromStyles names psty
| otherwise = False
codeDivs :: [String]
codeDivs = ["SourceCode"]
hasStylesInheritedFrom :: [ParaStyleName] -> ParagraphStyle -> Bool
hasStylesInheritedFrom ns s = any (isInheritedFromStyles ns) $ pStyle s
removeStyleNamed :: ParaStyleName -> ParagraphStyle -> ParagraphStyle
removeStyleNamed sn ps = ps{pStyle = filter (\psd -> getStyleName psd /= sn) $ pStyle ps}
isCodeCharStyle :: CharStyle -> Bool
isCodeCharStyle = isInheritedFromStyles ["Verbatim Char"]
isCodeDiv :: ParagraphStyle -> Bool
isCodeDiv = hasStylesInheritedFrom ["Source Code"]
isBlockQuote :: ParStyle -> Bool
isBlockQuote =
isInheritedFromStyles [
"Quote", "Block Text", "Block Quote", "Block Quotation"
]
runElemToInlines :: RunElem -> Inlines
runElemToInlines (TextRun s) = text s
@ -228,57 +248,31 @@ parPartToString (InternalHyperLink _ runs) = concatMap runToString runs
parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs
parPartToString _ = ""
blacklistedCharStyles :: [String]
blacklistedCharStyles :: [CharStyleName]
blacklistedCharStyles = ["Hyperlink"]
resolveDependentRunStyle :: PandocMonad m => RunStyle -> DocxContext m RunStyle
resolveDependentRunStyle rPr
| Just (s, _) <- rStyle rPr, s `elem` blacklistedCharStyles =
| Just s <- rParentStyle rPr
, getStyleName s `elem` blacklistedCharStyles =
return rPr
| Just (_, cs) <- rStyle rPr = do
| Just s <- rParentStyle rPr = do
opts <- asks docxOptions
if isEnabled Ext_styles opts
then return rPr
else do rPr' <- resolveDependentRunStyle cs
return $
RunStyle { isBold = case isBold rPr of
Just bool -> Just bool
Nothing -> isBold rPr'
, isItalic = case isItalic rPr of
Just bool -> Just bool
Nothing -> isItalic rPr'
, isSmallCaps = case isSmallCaps rPr of
Just bool -> Just bool
Nothing -> isSmallCaps rPr'
, isStrike = case isStrike rPr of
Just bool -> Just bool
Nothing -> isStrike rPr'
, isRTL = case isRTL rPr of
Just bool -> Just bool
Nothing -> isRTL rPr'
, rVertAlign = case rVertAlign rPr of
Just valign -> Just valign
Nothing -> rVertAlign rPr'
, rUnderline = case rUnderline rPr of
Just ulstyle -> Just ulstyle
Nothing -> rUnderline rPr'
, rStyle = rStyle rPr
}
else leftBiasedMergeRunStyle rPr <$> resolveDependentRunStyle (cStyleData s)
| otherwise = return rPr
runStyleToTransform :: PandocMonad m => RunStyle -> DocxContext m (Inlines -> Inlines)
runStyleToTransform rPr
| Just (s, _) <- rStyle rPr
, s `elem` spansToKeep = do
transform <- runStyleToTransform rPr{rStyle = Nothing}
return $ spanWith ("", [s], []) . transform
| Just (s, _) <- rStyle rPr = do
opts <- asks docxOptions
let extraInfo = if isEnabled Ext_styles opts
then spanWith ("", [], [("custom-style", s)])
else id
transform <- runStyleToTransform rPr{rStyle = Nothing}
return $ extraInfo . transform
| Just sn <- getStyleName <$> rParentStyle rPr
, sn `elem` spansToKeep = do
transform <- runStyleToTransform rPr{rParentStyle = Nothing}
return $ spanWith ("", [normalizeToClassName sn], []) . transform
| Just s <- rParentStyle rPr = do
ei <- extraInfo spanWith s
transform <- runStyleToTransform rPr{rParentStyle = Nothing}
return $ ei . transform
| Just True <- isItalic rPr = do
transform <- runStyleToTransform rPr{isItalic = Nothing}
return $ emph . transform
@ -310,8 +304,7 @@ runStyleToTransform rPr
runToInlines :: PandocMonad m => Run -> DocxContext m Inlines
runToInlines (Run rs runElems)
| Just (s, _) <- rStyle rs
, s `elem` codeStyles = do
| maybe False isCodeCharStyle $ rParentStyle rs = do
rPr <- resolveDependentRunStyle rs
let codeString = code $ concatMap runElemToString runElems
return $ case rVertAlign rPr of
@ -526,39 +519,49 @@ trimSps (Many ils) = Many $ Seq.dropWhileL isSp $Seq.dropWhileR isSp ils
isSp LineBreak = True
isSp _ = False
extraInfo :: (Eq (StyleName a), PandocMonad m, HasStyleName a)
=> (Attr -> i -> i) -> a -> DocxContext m (i -> i)
extraInfo f s = do
opts <- asks docxOptions
return $ if | isEnabled Ext_styles opts
-> f ("", [], [("custom-style", fromStyleName $ getStyleName s)])
| otherwise -> id
parStyleToTransform :: PandocMonad m => ParagraphStyle -> DocxContext m (Blocks -> Blocks)
parStyleToTransform pPr
| (c:cs) <- pStyle pPr
, c `elem` divsToKeep = do
, getStyleName c `elem` divsToKeep = do
let pPr' = pPr { pStyle = cs }
transform <- parStyleToTransform pPr'
return $ divWith ("", [c], []) . transform
return $ divWith ("", [normalizeToClassName $ getStyleName c], []) . transform
| (c:cs) <- pStyle pPr,
c `elem` listParagraphDivs = do
getStyleName c `elem` listParagraphStyles = do
let pPr' = pPr { pStyle = cs, indentation = Nothing}
transform <- parStyleToTransform pPr'
return $ divWith ("", [c], []) . transform
return $ divWith ("", [normalizeToClassName $ getStyleName c], []) . transform
| (c:cs) <- pStyle pPr = do
opts <- asks docxOptions
let pPr' = pPr { pStyle = cs}
let pPr' = pPr { pStyle = cs }
transform <- parStyleToTransform pPr'
let extraInfo = if isEnabled Ext_styles opts
then divWith ("", [], [("custom-style", c)])
else id
return $ extraInfo . (if fromMaybe False (pBlockQuote pPr) then blockQuote else id) . transform
ei <- extraInfo divWith c
return $ ei . (if isBlockQuote c then blockQuote else id) . transform
| null (pStyle pPr)
, Just left <- indentation pPr >>= leftParIndent = do
let pPr' = pPr { indentation = Nothing }
hang = fromMaybe 0 $ indentation pPr >>= hangingParIndent
transform <- parStyleToTransform pPr'
return $ if (left - hang) > 0
return $ if (left - hang) > 0
then blockQuote . transform
else transform
parStyleToTransform _ = return id
normalizeToClassName :: (FromStyleName a) => a -> String
normalizeToClassName = map go . fromStyleName
where go c | isSpace c = '-'
| otherwise = c
bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks
bodyPartToBlocks (Paragraph pPr parparts)
| not $ null $ codeDivs `intersect` (pStyle pPr) = do
| isCodeDiv pPr = do
transform <- parStyleToTransform pPr
return $
transform $
@ -568,13 +571,16 @@ bodyPartToBlocks (Paragraph pPr parparts)
ils <-local (\s-> s{docxInHeaderBlock=True})
(smushInlines <$> mapM parPartToInlines parparts)
makeHeaderAnchor $
headerWith ("", delete style (pStyle pPr), []) n ils
headerWith ("", map normalizeToClassName . delete style $ getStyleNames (pStyle pPr), []) n ils
| otherwise = do
ils <- trimSps . smushInlines <$> mapM parPartToInlines parparts
prevParaIls <- gets docxPrevPara
dropIls <- gets docxDropCap
let ils' = dropIls <> ils
if dropCap pPr
let (paraOrPlain, pPr')
| hasStylesInheritedFrom ["Compact"] pPr = (plain, removeStyleNamed "Compact" pPr)
| otherwise = (para, pPr)
if dropCap pPr'
then do modify $ \s -> s { docxDropCap = ils' }
return mempty
else do modify $ \s -> s { docxDropCap = mempty }
@ -583,41 +589,41 @@ bodyPartToBlocks (Paragraph pPr parparts)
ils'
handleInsertion = do
modify $ \s -> s {docxPrevPara = mempty}
transform <- parStyleToTransform pPr
return $ transform $ para ils''
transform <- parStyleToTransform pPr'
return $ transform $ paraOrPlain ils''
opts <- asks docxOptions
if | isNull ils'' && not (isEnabled Ext_empty_paragraphs opts) ->
return mempty
| Just (TrackedChange Insertion _) <- pChange pPr
| Just (TrackedChange Insertion _) <- pChange pPr'
, AcceptChanges <- readerTrackChanges opts ->
handleInsertion
| Just (TrackedChange Insertion _) <- pChange pPr
| Just (TrackedChange Insertion _) <- pChange pPr'
, RejectChanges <- readerTrackChanges opts -> do
modify $ \s -> s {docxPrevPara = ils''}
return mempty
| Just (TrackedChange Insertion cInfo) <- pChange pPr
| Just (TrackedChange Insertion cInfo) <- pChange pPr'
, AllChanges <- readerTrackChanges opts
, ChangeInfo _ cAuthor cDate <- cInfo -> do
let attr = ("", ["paragraph-insertion"], [("author", cAuthor), ("date", cDate)])
insertMark = spanWith attr mempty
transform <- parStyleToTransform pPr
transform <- parStyleToTransform pPr'
return $ transform $
para $ ils'' <> insertMark
| Just (TrackedChange Deletion _) <- pChange pPr
paraOrPlain $ ils'' <> insertMark
| Just (TrackedChange Deletion _) <- pChange pPr'
, AcceptChanges <- readerTrackChanges opts -> do
modify $ \s -> s {docxPrevPara = ils''}
return mempty
| Just (TrackedChange Deletion _) <- pChange pPr
| Just (TrackedChange Deletion _) <- pChange pPr'
, RejectChanges <- readerTrackChanges opts ->
handleInsertion
| Just (TrackedChange Deletion cInfo) <- pChange pPr
| Just (TrackedChange Deletion cInfo) <- pChange pPr'
, AllChanges <- readerTrackChanges opts
, ChangeInfo _ cAuthor cDate <- cInfo -> do
let attr = ("", ["paragraph-deletion"], [("author", cAuthor), ("date", cDate)])
insertMark = spanWith attr mempty
transform <- parStyleToTransform pPr
transform <- parStyleToTransform pPr'
return $ transform $
para $ ils'' <> insertMark
paraOrPlain $ ils'' <> insertMark
| otherwise -> handleInsertion
bodyPartToBlocks (ListItem pPr numId lvl (Just levelInfo) parparts) = do
-- We check whether this current numId has previously been used,
@ -638,7 +644,7 @@ bodyPartToBlocks (ListItem pPr numId lvl (Just levelInfo) parparts) = do
blks <- bodyPartToBlocks (Paragraph pPr parparts)
return $ divWith ("", ["list-item"], kvs) blks
bodyPartToBlocks (ListItem pPr _ _ _ parparts) =
let pPr' = pPr {pStyle = "ListParagraph": pStyle pPr}
let pPr' = pPr {pStyle = constructBogusParStyleData "list-paragraph": pStyle pPr}
in
bodyPartToBlocks $ Paragraph pPr' parparts
bodyPartToBlocks (Tbl _ _ _ []) =

View file

@ -1,4 +1,5 @@
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Readers.Docx.Lists
Copyright : Copyright (C) 2014-2019 Jesse Rosenthal
@ -14,13 +15,16 @@ Functions for converting flat docx paragraphs into nested lists.
module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets
, blocksToDefinitions
, listParagraphDivs
, listParagraphStyles
) where
import Prelude
import Data.List
import Data.Maybe
import Data.String (fromString)
import Text.Pandoc.Generic (bottomUp)
import Text.Pandoc.JSON
import Text.Pandoc.Readers.Docx.Parse (ParaStyleName)
import Text.Pandoc.Shared (trim, safeRead)
isListItem :: Block -> Bool
@ -79,7 +83,10 @@ getListType b@(Div (_, _, kvs) _) | isListItem b =
getListType _ = Nothing
listParagraphDivs :: [String]
listParagraphDivs = ["ListParagraph"]
listParagraphDivs = ["list-paragraph"]
listParagraphStyles :: [ParaStyleName]
listParagraphStyles = map fromString listParagraphDivs
-- This is a first stab at going through and attaching meaning to list
-- paragraphs, without an item marker, following a list item. We
@ -160,7 +167,7 @@ blocksToDefinitions' defAcc acc [] =
reverse $ DefinitionList (reverse defAcc) : acc
blocksToDefinitions' defAcc acc
(Div (_, classes1, _) blks1 : Div (ident2, classes2, kvs2) blks2 : blks)
| "DefinitionTerm" `elem` classes1 && "Definition" `elem` classes2 =
| "Definition-Term" `elem` classes1 && "Definition" `elem` classes2 =
let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2)
pair = if remainingAttr2 == ("", [], []) then (concatMap plainParaInlines blks1, [blks2]) else (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]])
in
@ -169,12 +176,12 @@ blocksToDefinitions' ((defTerm, defItems):defs) acc
(Div (ident2, classes2, kvs2) blks2 : blks)
| "Definition" `elem` classes2 =
let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2)
defItems2 = case remainingAttr2 == ("", [], []) of
True -> blks2
False -> [Div remainingAttr2 blks2]
defAcc' = case null defItems of
True -> (defTerm, [defItems2]) : defs
False -> (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs
defItems2 = if remainingAttr2 == ("", [], [])
then blks2
else [Div remainingAttr2 blks2]
defAcc' = if null defItems
then (defTerm, [defItems2]) : defs
else (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs
in
blocksToDefinitions' defAcc' acc blks
blocksToDefinitions' [] acc (b:blks) =
@ -198,7 +205,5 @@ removeListDivs' blk = [blk]
removeListDivs :: [Block] -> [Block]
removeListDivs = concatMap removeListDivs'
blocksToDefinitions :: [Block] -> [Block]
blocksToDefinitions = blocksToDefinitions' [] []

View file

@ -1,7 +1,11 @@
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE PatternGuards #-}
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{- |
Module : Text.Pandoc.Readers.Docx.Parse
Copyright : Copyright (C) 2014-2019 Jesse Rosenthal
@ -31,6 +35,8 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
, VertAlign(..)
, ParIndentation(..)
, ParagraphStyle(..)
, ParStyle
, CharStyle(cStyleData)
, Row(..)
, Cell(..)
, TrackedChange(..)
@ -38,8 +44,17 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
, ChangeInfo(..)
, FieldInfo(..)
, Level(..)
, ParaStyleName
, CharStyleName
, FromStyleName(..)
, HasStyleName(..)
, HasParentStyle(..)
, archiveToDocx
, archiveToDocxWithWarnings
, getStyleNames
, pHeading
, constructBogusParStyleData
, leftBiasedMergeRunStyle
) where
import Prelude
import Codec.Archive.Zip
@ -49,10 +64,13 @@ import Control.Monad.Reader
import Control.Monad.State.Strict
import Data.Bits ((.|.))
import qualified Data.ByteString.Lazy as B
import Data.Char (chr, ord, readLitChar)
import Data.Char (chr, ord, readLitChar, toLower)
import Data.List
import Data.Function (on)
import Data.String (IsString(..))
import qualified Data.Map as M
import Data.Maybe
import Data.Coerce
import System.FilePath
import Text.Pandoc.Readers.Docx.Util
import Text.Pandoc.Readers.Docx.Fields
@ -160,13 +178,9 @@ newtype Body = Body [BodyPart]
type Media = [(FilePath, B.ByteString)]
type CharStyle = (String, RunStyle)
type CharStyleMap = M.Map CharStyleId CharStyle
type ParStyle = (String, ParStyleData)
type CharStyleMap = M.Map String RunStyle
type ParStyleMap = M.Map String ParStyleData
type ParStyleMap = M.Map ParaStyleId ParStyle
data Numbering = Numbering NameSpaces [Numb] [AbstractNumb]
deriving Show
@ -213,12 +227,9 @@ data ChangeInfo = ChangeInfo ChangeId Author ChangeDate
data TrackedChange = TrackedChange ChangeType ChangeInfo
deriving Show
data ParagraphStyle = ParagraphStyle { pStyle :: [String]
data ParagraphStyle = ParagraphStyle { pStyle :: [ParStyle]
, indentation :: Maybe ParIndentation
, dropCap :: Bool
, pHeading :: Maybe (String, Int)
, pNumInfo :: Maybe (String, String)
, pBlockQuote :: Maybe Bool
, pChange :: Maybe TrackedChange
}
deriving Show
@ -227,9 +238,6 @@ defaultParagraphStyle :: ParagraphStyle
defaultParagraphStyle = ParagraphStyle { pStyle = []
, indentation = Nothing
, dropCap = False
, pHeading = Nothing
, pNumInfo = Nothing
, pBlockQuote = Nothing
, pChange = Nothing
}
@ -254,6 +262,49 @@ newtype Row = Row [Cell]
newtype Cell = Cell [BodyPart]
deriving Show
newtype CharStyleId = CharStyleId { fromCharStyleId :: String }
deriving (Show, Eq, Ord, FromStyleId)
newtype ParaStyleId = ParaStyleId { fromParaStyleId :: String }
deriving (Show, Eq, Ord, FromStyleId)
newtype CharStyleName = CharStyleName { fromCharStyleName :: CIString }
deriving (Show, Eq, Ord, IsString, FromStyleName)
newtype ParaStyleName = ParaStyleName { fromParaStyleName :: CIString }
deriving (Show, Eq, Ord, IsString, FromStyleName)
-- Case-insensitive comparisons
newtype CIString = CIString String deriving (Show, IsString, FromStyleName)
class FromStyleName a where
fromStyleName :: a -> String
instance FromStyleName String where
fromStyleName = id
class FromStyleId a where
fromStyleId :: a -> String
instance FromStyleId String where
fromStyleId = id
instance Eq CIString where
(==) = (==) `on` map toLower . coerce
instance Ord CIString where
compare = compare `on` map toLower . coerce
leftBiasedMergeRunStyle :: RunStyle -> RunStyle -> RunStyle
leftBiasedMergeRunStyle a b = RunStyle
{ isBold = isBold a <|> isBold b
, isItalic = isItalic a <|> isItalic b
, isSmallCaps = isSmallCaps a <|> isSmallCaps b
, isStrike = isStrike a <|> isStrike b
, isRTL = isRTL a <|> isRTL b
, rVertAlign = rVertAlign a <|> rVertAlign b
, rUnderline = rUnderline a <|> rUnderline b
, rParentStyle = rParentStyle a
}
-- (width, height) in EMUs
type Extent = Maybe (Double, Double)
@ -285,21 +336,28 @@ data RunElem = TextRun String | LnBrk | Tab | SoftHyphen | NoBreakHyphen
data VertAlign = BaseLn | SupScrpt | SubScrpt
deriving Show
data RunStyle = RunStyle { isBold :: Maybe Bool
, isItalic :: Maybe Bool
, isSmallCaps :: Maybe Bool
, isStrike :: Maybe Bool
, isRTL :: Maybe Bool
, rVertAlign :: Maybe VertAlign
, rUnderline :: Maybe String
, rStyle :: Maybe CharStyle
data CharStyle = CharStyle { cStyleId :: CharStyleId
, cStyleName :: CharStyleName
, cStyleData :: RunStyle
} deriving (Show)
data RunStyle = RunStyle { isBold :: Maybe Bool
, isItalic :: Maybe Bool
, isSmallCaps :: Maybe Bool
, isStrike :: Maybe Bool
, isRTL :: Maybe Bool
, rVertAlign :: Maybe VertAlign
, rUnderline :: Maybe String
, rParentStyle :: Maybe CharStyle
}
deriving Show
data ParStyleData = ParStyleData { headingLev :: Maybe (String, Int)
, isBlockQuote :: Maybe Bool
, numInfo :: Maybe (String, String)
, psStyle :: Maybe ParStyle}
data ParStyle = ParStyle { headingLev :: Maybe (ParaStyleName, Int)
, numInfo :: Maybe (String, String)
, psParentStyle :: Maybe ParStyle
, pStyleName :: ParaStyleName
, pStyleId :: ParaStyleId
}
deriving Show
defaultRunStyle :: RunStyle
@ -310,7 +368,7 @@ defaultRunStyle = RunStyle { isBold = Nothing
, isRTL = Nothing
, rVertAlign = Nothing
, rUnderline = Nothing
, rStyle = Nothing
, rParentStyle = Nothing
}
type Target = String
@ -390,7 +448,10 @@ elemToBody ns element | isElem ns "w" "body" element =
elemToBody _ _ = throwError WrongElem
archiveToStyles :: Archive -> (CharStyleMap, ParStyleMap)
archiveToStyles zf =
archiveToStyles = archiveToStyles' getStyleId getStyleId
archiveToStyles' :: (Ord k1, Ord k2, ElemToStyle a1, ElemToStyle a2) =>
(a1 -> k1) -> (a2 -> k2) -> Archive -> (M.Map k1 a1, M.Map k2 a2)
archiveToStyles' conv1 conv2 zf =
let stylesElem = findEntryByPath "word/styles.xml" zf >>=
(parseXMLDoc . UTF8.toStringLazy . fromEntry)
in
@ -399,19 +460,17 @@ archiveToStyles zf =
Just styElem ->
let namespaces = elemToNameSpaces styElem
in
( M.fromList $ buildBasedOnList namespaces styElem
(Nothing :: Maybe CharStyle),
M.fromList $ buildBasedOnList namespaces styElem
(Nothing :: Maybe ParStyle) )
( M.fromList $ map (\r -> (conv1 r, r)) $ buildBasedOnList namespaces styElem Nothing,
M.fromList $ map (\p -> (conv2 p, p)) $ buildBasedOnList namespaces styElem Nothing)
isBasedOnStyle :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> Bool
isBasedOnStyle :: (ElemToStyle a, FromStyleId (StyleId a)) => NameSpaces -> Element -> Maybe a -> Bool
isBasedOnStyle ns element parentStyle
| isElem ns "w" "style" element
, Just styleType <- findAttrByName ns "w" "type" element
, styleType == cStyleType parentStyle
, Just basedOnVal <- findChildByName ns "w" "basedOn" element >>=
findAttrByName ns "w" "val"
, Just ps <- parentStyle = basedOnVal == getStyleId ps
, Just ps <- parentStyle = basedOnVal == fromStyleId (getStyleId ps)
| isElem ns "w" "style" element
, Just styleType <- findAttrByName ns "w" "type" element
, styleType == cStyleType parentStyle
@ -419,30 +478,70 @@ isBasedOnStyle ns element parentStyle
, Nothing <- parentStyle = True
| otherwise = False
class ElemToStyle a where
class HasStyleId a => ElemToStyle a where
cStyleType :: Maybe a -> String
elemToStyle :: NameSpaces -> Element -> Maybe a -> Maybe a
getStyleId :: a -> String
class FromStyleId (StyleId a) => HasStyleId a where
type StyleId a
getStyleId :: a -> StyleId a
class FromStyleName (StyleName a) => HasStyleName a where
type StyleName a
getStyleName :: a -> StyleName a
class HasParentStyle a where
getParentStyle :: a -> Maybe a
instance HasParentStyle CharStyle where
getParentStyle = rParentStyle . cStyleData
instance HasParentStyle ParStyle where
getParentStyle = psParentStyle
getStyleNames :: (Functor t, HasStyleName a) => t a -> t (StyleName a)
getStyleNames = fmap getStyleName
constructBogusParStyleData :: ParaStyleName -> ParStyle
constructBogusParStyleData stName = ParStyle
{ headingLev = Nothing
, numInfo = Nothing
, psParentStyle = Nothing
, pStyleName = stName
, pStyleId = ParaStyleId . filter (/=' ') . fromStyleName $ stName
}
instance ElemToStyle CharStyle where
cStyleType _ = "character"
elemToStyle ns element parentStyle
| isElem ns "w" "style" element
, Just "character" <- findAttrByName ns "w" "type" element
, Just styleId <- findAttrByName ns "w" "styleId" element =
Just (styleId, elemToRunStyle ns element parentStyle)
, Just "character" <- findAttrByName ns "w" "type" element =
elemToCharStyle ns element parentStyle
| otherwise = Nothing
getStyleId s = fst s
instance HasStyleId CharStyle where
type StyleId CharStyle = CharStyleId
getStyleId = cStyleId
instance HasStyleName CharStyle where
type StyleName CharStyle = CharStyleName
getStyleName = cStyleName
instance ElemToStyle ParStyle where
cStyleType _ = "paragraph"
elemToStyle ns element parentStyle
| isElem ns "w" "style" element
, Just "paragraph" <- findAttrByName ns "w" "type" element
, Just styleId <- findAttrByName ns "w" "styleId" element =
Just (styleId, elemToParStyleData ns element parentStyle)
= elemToParStyleData ns element parentStyle
| otherwise = Nothing
getStyleId s = fst s
instance HasStyleId ParStyle where
type StyleId ParStyle = ParaStyleId
getStyleId = pStyleId
instance HasStyleName ParStyle where
type StyleName ParStyle = ParaStyleName
getStyleName = pStyleName
getStyleChildren :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> [a]
getStyleChildren ns element parentStyle
@ -693,6 +792,12 @@ testBitMask bitMaskS n =
stringToInteger :: String -> Maybe Integer
stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)])
pHeading :: ParagraphStyle -> Maybe (ParaStyleName, Int)
pHeading = getParStyleField headingLev . pStyle
pNumInfo :: ParagraphStyle -> Maybe (String, String)
pNumInfo = getParStyleField numInfo . pStyle
elemToBodyPart :: NameSpaces -> Element -> D BodyPart
elemToBodyPart ns element
| isElem ns "w" "p" element
@ -1003,20 +1108,18 @@ elemToRun ns element
return $ Run runStyle runElems
elemToRun _ _ = throwError WrongElem
getParentStyleValue :: (ParStyleData -> Maybe a) -> ParStyleData -> Maybe a
getParentStyleValue :: (ParStyle -> Maybe a) -> ParStyle -> Maybe a
getParentStyleValue field style
| Just value <- field style = Just value
| Just parentStyle <- psStyle style
= getParentStyleValue field (snd parentStyle)
| Just parentStyle <- psParentStyle style
= getParentStyleValue field parentStyle
getParentStyleValue _ _ = Nothing
getParStyleField :: (ParStyleData -> Maybe a) -> ParStyleMap -> [String] ->
Maybe a
getParStyleField field stylemap styles
| x <- mapMaybe (\x -> M.lookup x stylemap) styles
, (y:_) <- mapMaybe (getParentStyleValue field) x
getParStyleField :: (ParStyle -> Maybe a) -> [ParStyle] -> Maybe a
getParStyleField field styles
| (y:_) <- mapMaybe (getParentStyleValue field) styles
= Just y
getParStyleField _ _ _ = Nothing
getParStyleField _ _ = Nothing
getTrackedChange :: NameSpaces -> Element -> Maybe TrackedChange
getTrackedChange ns element
@ -1038,10 +1141,10 @@ elemToParagraphStyle ns element sty
| Just pPr <- findChildByName ns "w" "pPr" element =
let style =
mapMaybe
(findAttrByName ns "w" "val")
(fmap ParaStyleId . findAttrByName ns "w" "val")
(findChildrenByName ns "w" "pStyle" pPr)
in ParagraphStyle
{pStyle = style
{pStyle = mapMaybe (`M.lookup` sty) style
, indentation =
findChildByName ns "w" "ind" pPr >>=
elemToParIndentation ns
@ -1053,9 +1156,6 @@ elemToParagraphStyle ns element sty
Just "none" -> False
Just _ -> True
Nothing -> False
, pHeading = getParStyleField headingLev sty style
, pNumInfo = getParStyleField numInfo sty style
, pBlockQuote = getParStyleField isBlockQuote sty style
, pChange = findChildByName ns "w" "rPr" pPr >>=
filterChild (\e -> isElem ns "w" "ins" e ||
isElem ns "w" "moveTo" e ||
@ -1085,16 +1185,20 @@ elemToRunStyleD :: NameSpaces -> Element -> D RunStyle
elemToRunStyleD ns element
| Just rPr <- findChildByName ns "w" "rPr" element = do
charStyles <- asks envCharStyles
let parentSty = case
let parentSty =
findChildByName ns "w" "rStyle" rPr >>=
findAttrByName ns "w" "val"
of
Just styName | Just style <- M.lookup styName charStyles ->
Just (styName, style)
_ -> Nothing
findAttrByName ns "w" "val" >>=
flip M.lookup charStyles . CharStyleId
return $ elemToRunStyle ns element parentSty
elemToRunStyleD _ _ = return defaultRunStyle
elemToCharStyle :: NameSpaces
-> Element -> Maybe CharStyle -> Maybe CharStyle
elemToCharStyle ns element parentStyle
= CharStyle <$> (CharStyleId <$> findAttrByName ns "w" "styleId" element)
<*> getElementStyleName ns element
<*> (Just $ elemToRunStyle ns element parentStyle)
elemToRunStyle :: NameSpaces -> Element -> Maybe CharStyle -> RunStyle
elemToRunStyle ns element parentStyle
| Just rPr <- findChildByName ns "w" "rPr" element =
@ -1117,38 +1221,23 @@ elemToRunStyle ns element parentStyle
, rUnderline =
findChildByName ns "w" "u" rPr >>=
findAttrByName ns "w" "val"
, rStyle = parentStyle
, rParentStyle = parentStyle
}
elemToRunStyle _ _ _ = defaultRunStyle
getHeaderLevel :: NameSpaces -> Element -> Maybe (String,Int)
getHeaderLevel :: NameSpaces -> Element -> Maybe (ParaStyleName, Int)
getHeaderLevel ns element
| Just styleId <- findAttrByName ns "w" "styleId" element
, Just index <- stripPrefix "Heading" styleId
, Just n <- stringToInteger index
, n > 0 = Just (styleId, fromInteger n)
| Just styleId <- findAttrByName ns "w" "styleId" element
, Just index <- findChildByName ns "w" "name" element >>=
findAttrByName ns "w" "val" >>=
stripPrefix "heading "
, Just n <- stringToInteger index
, n > 0 = Just (styleId, fromInteger n)
| Just styleName <- getElementStyleName ns element
, Just n <- stringToInteger =<<
(stripPrefix "heading " . map toLower $
fromStyleName styleName)
, n > 0 = Just (styleName, fromInteger n)
getHeaderLevel _ _ = Nothing
blockQuoteStyleIds :: [String]
blockQuoteStyleIds = ["Quote", "BlockQuote", "BlockQuotation"]
blockQuoteStyleNames :: [String]
blockQuoteStyleNames = ["Quote", "Block Text"]
getBlockQuote :: NameSpaces -> Element -> Maybe Bool
getBlockQuote ns element
| Just styleId <- findAttrByName ns "w" "styleId" element
, styleId `elem` blockQuoteStyleIds = Just True
| Just styleName <- findChildByName ns "w" "name" element >>=
findAttrByName ns "w" "val"
, styleName `elem` blockQuoteStyleNames = Just True
getBlockQuote _ _ = Nothing
getElementStyleName :: Coercible String a => NameSpaces -> Element -> Maybe a
getElementStyleName ns el = coerce <$>
((findChildByName ns "w" "name" el >>= findAttrByName ns "w" "val")
<|> findAttrByName ns "w" "styleId" el)
getNumInfo :: NameSpaces -> Element -> Maybe (String, String)
getNumInfo ns element = do
@ -1163,15 +1252,19 @@ getNumInfo ns element = do
return (numId, lvl)
elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> ParStyleData
elemToParStyleData ns element parentStyle =
ParStyleData
elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> Maybe ParStyle
elemToParStyleData ns element parentStyle
| Just styleId <- findAttrByName ns "w" "styleId" element
, Just styleName <- getElementStyleName ns element
= Just $ ParStyle
{
headingLev = getHeaderLevel ns element
, isBlockQuote = getBlockQuote ns element
, numInfo = getNumInfo ns element
, psStyle = parentStyle
}
, psParentStyle = parentStyle
, pStyleName = styleName
, pStyleId = ParaStyleId styleId
}
elemToParStyleData _ _ _ = Nothing
elemToRunElem :: NameSpaces -> Element -> D RunElem
elemToRunElem ns element

View file

@ -255,6 +255,10 @@ tests = [ testGroup "document"
"lists"
"docx/lists.docx"
"docx/lists.native"
, testCompare
"compact lists"
"docx/lists-compact.docx"
"docx/lists-compact.native"
, testCompare
"lists with level overrides"
"docx/lists_level_override.docx"
@ -425,6 +429,11 @@ tests = [ testGroup "document"
"custom styles (`+styles`) enabled"
"docx/custom-style-reference.docx"
"docx/custom-style-with-styles.native"
, testCompareWithOpts
def{readerExtensions=extensionsFromList [Ext_styles]}
"custom styles (`+styles`): Compact style is removed from output"
"docx/compact-style-removal.docx"
"docx/compact-style-removal.native"
]
, testGroup "metadata"
[ testCompareWithOpts def{readerStandalone=True}

Binary file not shown.

View file

@ -0,0 +1,5 @@
[OrderedList (1,Decimal,Period)
[[Plain [Str "One"]]
,[Plain [Str "Two"]]
,[Plain [Str "Three"]]
,[Plain [Str "Four"]]]]

Binary file not shown.

View file

@ -0,0 +1,5 @@
[OrderedList (1,Decimal,Period)
[[Plain [Str "One"]]
,[Plain [Str "Two"]]
,[Plain [Str "Three"]]
,[Plain [Str "Four"]]]]