Add empty_paragraphs extension.

* Deprecate `--strip-empty-paragraphs` option.  Instead we now
  use an `empty_paragraphs` extension that can be enabled on
  the reader or writer.  By default, disabled.

* Add `Ext_empty_paragraphs` constructor to `Extension`.

* Revert "Docx reader: don't strip out empty paragraphs."
  This reverts commit d6c58eb836.

* Implement `empty_paragraphs` extension in docx reader and writer,
  opendocument writer, html reader and writer.

* Add tests for `empty_paragraphs` extension.
This commit is contained in:
John MacFarlane 2017-12-04 10:31:06 -08:00
parent c58ecde937
commit ae60e0196c
15 changed files with 175 additions and 80 deletions

View file

@ -429,7 +429,8 @@ Reader options
`--strip-empty-paragraphs`
: Ignore paragraphs with non content. This option is useful
: *Deprecated. Use the `+empty_paragraphs` extension instead.*
Ignore paragraphs with no content. This option is useful
for converting word processing documents where users have
used empty paragraphs to create inter-paragraph space.
@ -3817,6 +3818,12 @@ in several respects:
we must either disallow lazy wrapping or require a blank line between
list items.
#### Extension: `empty_paragraphs` ####
Allows empty paragraphs. By default empty paragraphs are
omitted. This affects the `docx` reader and writer, the
`opendocument` and `odt` writer, and all HTML-based readers and writers.
Markdown variants
-----------------

View file

@ -947,7 +947,10 @@ options =
, Option "" ["strip-empty-paragraphs"]
(NoArg
(\opt -> return opt{ optStripEmptyParagraphs = True }))
(\opt -> do
deprecatedOption "--stripEmptyParagraphs"
"Use +empty_paragraphs extension."
return opt{ optStripEmptyParagraphs = True }))
"" -- "Strip empty paragraphs"
, Option "" ["indented-code-classes"]
@ -1472,7 +1475,7 @@ options =
, Option "m" ["latexmathml", "asciimathml"]
(OptArg
(\arg opt -> do
deprecatedOption "--latexmathml, --asciimathml, -m"
deprecatedOption "--latexmathml, --asciimathml, -m" ""
return opt { optHTMLMathMethod = LaTeXMathML arg })
"URL")
"" -- "Use LaTeXMathML script in html output"
@ -1480,7 +1483,7 @@ options =
, Option "" ["mimetex"]
(OptArg
(\arg opt -> do
deprecatedOption "--mimetex"
deprecatedOption "--mimetex" ""
let url' = case arg of
Just u -> u ++ "?"
Nothing -> "/cgi-bin/mimetex.cgi?"
@ -1491,7 +1494,7 @@ options =
, Option "" ["jsmath"]
(OptArg
(\arg opt -> do
deprecatedOption "--jsmath"
deprecatedOption "--jsmath" ""
return opt { optHTMLMathMethod = JsMath arg})
"URL")
"" -- "Use jsMath for HTML math"
@ -1499,7 +1502,7 @@ options =
, Option "" ["gladtex"]
(NoArg
(\opt -> do
deprecatedOption "--gladtex"
deprecatedOption "--gladtex" ""
return opt { optHTMLMathMethod = GladTeX }))
"" -- "Use gladtex for HTML math"
@ -1699,9 +1702,9 @@ splitField s =
baseWriterName :: String -> String
baseWriterName = takeWhile (\c -> c /= '+' && c /= '-')
deprecatedOption :: String -> IO ()
deprecatedOption o =
runIO (report $ Deprecated o "") >>=
deprecatedOption :: String -> String -> IO ()
deprecatedOption o msg =
runIO (report $ Deprecated o msg) >>=
\r -> case r of
Right () -> return ()
Left e -> E.throwIO e

View file

@ -152,6 +152,7 @@ data Extension =
| Ext_old_dashes -- ^ -- = em, - before number = en
| Ext_spaced_reference_links -- ^ Allow space between two parts of ref link
| Ext_amuse -- ^ Enable Text::Amuse extensions to Emacs Muse markup
| Ext_empty_paragraphs -- ^ Allow empty paragraphs
deriving (Show, Read, Enum, Eq, Ord, Bounded, Data, Typeable, Generic)
-- | Extensions to be used with pandoc-flavored markdown.

View file

@ -534,7 +534,10 @@ bodyPartToBlocks (Paragraph pPr parparts)
then do modify $ \s -> s { docxDropCap = ils' }
return mempty
else do modify $ \s -> s { docxDropCap = mempty }
return $ parStyleToTransform pPr $ para ils'
opts <- asks docxOptions
if isNull ils' && not (isEnabled Ext_empty_paragraphs opts)
then return mempty
else return $ parStyleToTransform pPr $ para ils'
bodyPartToBlocks (ListItem pPr numId lvl (Just levelInfo) parparts) = do
let
kvs = case levelInfo of

View file

@ -68,9 +68,11 @@ import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
import Text.Pandoc.Definition
import Text.Pandoc.Error
import Text.Pandoc.Logging
import Text.Pandoc.Options (Extension (Ext_epub_html_exts, Ext_native_divs, Ext_native_spans, Ext_raw_html),
ReaderOptions (readerExtensions, readerStripComments),
extensionEnabled)
import Text.Pandoc.Options (
Extension (Ext_epub_html_exts, Ext_empty_paragraphs, Ext_native_divs,
Ext_native_spans, Ext_raw_html),
ReaderOptions (readerExtensions, readerStripComments),
extensionEnabled)
import Text.Pandoc.Parsing hiding ((<|>))
import Text.Pandoc.Shared (addMetaField, crFilter, escapeURI, extractSpaces,
safeRead, underlineSpan)
@ -575,7 +577,10 @@ pPlain = do
pPara :: PandocMonad m => TagParser m Blocks
pPara = do
contents <- trimInlines <$> pInTags "p" inline
return $ B.para contents
(do guardDisabled Ext_empty_paragraphs
guard (B.isNull contents)
return mempty)
<|> return (B.para contents)
pFigure :: PandocMonad m => TagParser m Blocks
pFigure = try $ do

View file

@ -922,19 +922,22 @@ blockToOpenXML' opts (Para [Image attr alt (src,'f':'i':'g':':':tit)]) = do
captionNode <- withParaProp (pCustomStyle "ImageCaption")
$ blockToOpenXML opts (Para alt)
return $ mknode "w:p" [] (paraProps ++ contents) : captionNode
blockToOpenXML' opts (Para lst) = do
isFirstPara <- gets stFirstPara
paraProps <- getParaProps $ case lst of
[Math DisplayMath _] -> True
_ -> False
bodyTextStyle <- pStyleM "Body Text"
let paraProps' = case paraProps of
[] | isFirstPara -> [mknode "w:pPr" [] [pCustomStyle "FirstParagraph"]]
[] -> [mknode "w:pPr" [] [bodyTextStyle]]
ps -> ps
modify $ \s -> s { stFirstPara = False }
contents <- inlinesToOpenXML opts lst
return [mknode "w:p" [] (paraProps' ++ contents)]
blockToOpenXML' opts (Para lst)
| null lst && not (isEnabled Ext_empty_paragraphs opts) = return []
| otherwise = do
isFirstPara <- gets stFirstPara
paraProps <- getParaProps $ case lst of
[Math DisplayMath _] -> True
_ -> False
bodyTextStyle <- pStyleM "Body Text"
let paraProps' = case paraProps of
[] | isFirstPara -> [mknode "w:pPr" []
[pCustomStyle "FirstParagraph"]]
[] -> [mknode "w:pPr" [] [bodyTextStyle]]
ps -> ps
modify $ \s -> s { stFirstPara = False }
contents <- inlinesToOpenXML opts lst
return [mknode "w:p" [] (paraProps' ++ contents)]
blockToOpenXML' opts (LineBlock lns) = blockToOpenXML opts $ linesToPara lns
blockToOpenXML' _ b@(RawBlock format str)
| format == Format "openxml" = return [ x | Elem x <- parseXML str ]

View file

@ -56,7 +56,7 @@ import qualified Data.Text.Lazy as TL
import Network.HTTP (urlEncode)
import Network.URI (URI (..), parseURIReference, unEscapeString)
import Numeric (showHex)
import Text.Blaze.Internal (customLeaf)
import Text.Blaze.Internal (customLeaf, MarkupM(Empty))
import Text.Blaze.Html hiding (contents)
import Text.Pandoc.Definition
import Text.Pandoc.Highlighting (formatHtmlBlock, formatHtmlInline, highlight,
@ -658,6 +658,7 @@ blockToHtml opts (Para [Image attr txt (s,'f':'i':'g':':':tit)]) =
figure opts attr txt (s,tit)
blockToHtml opts (Para lst)
| isEmptyRaw lst = return mempty
| null lst && not (isEnabled Ext_empty_paragraphs opts) = return mempty
| otherwise = do
contents <- inlineListToHtml opts lst
return $ H.p contents
@ -902,8 +903,7 @@ tableItemToHtml opts tag' align' item = do
let tag'' = if null alignStr
then tag'
else tag' ! attribs
return $ (
tag'' contents) >> nl opts
return $ tag'' contents >> nl opts
toListItems :: WriterOptions -> [Html] -> [Html]
toListItems opts items = map (toListItem opts) items ++ [nl opts]
@ -911,9 +911,13 @@ toListItems opts items = map (toListItem opts) items ++ [nl opts]
toListItem :: WriterOptions -> Html -> Html
toListItem opts item = nl opts >> H.li item
blockListToHtml :: PandocMonad m => WriterOptions -> [Block] -> StateT WriterState m Html
blockListToHtml :: PandocMonad m
=> WriterOptions -> [Block] -> StateT WriterState m Html
blockListToHtml opts lst =
(mconcat . intersperse (nl opts)) <$> mapM (blockToHtml opts) lst
(mconcat . intersperse (nl opts) . filter nonempty)
<$> mapM (blockToHtml opts) lst
where nonempty (Empty _) = False
nonempty _ = True
-- | Convert list of Pandoc inline elements to HTML.
inlineListToHtml :: PandocMonad m => WriterOptions -> [Inline] -> StateT WriterState m Html

View file

@ -130,7 +130,6 @@ setFirstPara :: PandocMonad m => OD m ()
setFirstPara = modify $ \s -> s { stFirstPara = True }
inParagraphTags :: PandocMonad m => Doc -> OD m Doc
inParagraphTags d | isEmpty d = return empty
inParagraphTags d = do
b <- gets stFirstPara
a <- if b
@ -323,7 +322,8 @@ blockToOpenDocument o bs
else inParagraphTags =<< inlinesToOpenDocument o b
| Para [Image attr c (s,'f':'i':'g':':':t)] <- bs
= figure attr c s t
| Para b <- bs = if null b
| Para b <- bs = if null b &&
not (isEnabled Ext_empty_paragraphs o)
then return empty
else inParagraphTags =<< inlinesToOpenDocument o b
| LineBlock b <- bs = blockToOpenDocument o $ linesToPara b

View file

@ -10,7 +10,6 @@ import Test.Tasty
import Test.Tasty.HUnit
import Tests.Helpers
import Text.Pandoc
import Text.Pandoc.Shared (stripEmptyParagraphs)
import qualified Text.Pandoc.Class as P
import Text.Pandoc.MediaBag (MediaBag, lookupMedia, mediaDirectory)
import Text.Pandoc.UTF8 as UTF8
@ -38,23 +37,20 @@ instance ToString NoNormPandoc where
instance ToPandoc NoNormPandoc where
toPandoc = unNoNorm
compareOutput :: Bool
-> ReaderOptions
-> FilePath
-> FilePath
-> IO (NoNormPandoc, NoNormPandoc)
compareOutput strip opts docxFile nativeFile = do
compareOutput :: ReaderOptions
-> FilePath
-> FilePath
-> IO (NoNormPandoc, NoNormPandoc)
compareOutput opts docxFile nativeFile = do
df <- B.readFile docxFile
nf <- UTF8.toText <$> BS.readFile nativeFile
p <- runIOorExplode $ readDocx opts df
df' <- runIOorExplode $ readNative def nf
return $ (noNorm (if strip
then stripEmptyParagraphs p
else p), noNorm df')
return $ (noNorm p, noNorm df')
testCompareWithOptsIO :: ReaderOptions -> String -> FilePath -> FilePath -> IO TestTree
testCompareWithOptsIO opts name docxFile nativeFile = do
(dp, np) <- compareOutput True opts docxFile nativeFile
(dp, np) <- compareOutput opts docxFile nativeFile
return $ test id name (dp, np)
testCompareWithOpts :: ReaderOptions -> String -> FilePath -> FilePath -> TestTree
@ -75,11 +71,6 @@ testForWarningsWithOpts :: ReaderOptions -> String -> FilePath -> [String] -> Te
testForWarningsWithOpts opts name docxFile expected =
unsafePerformIO $ testForWarningsWithOptsIO opts name docxFile expected
testCompareNoStrip :: String -> FilePath -> FilePath -> TestTree
testCompareNoStrip name docxFile nativeFile = unsafePerformIO $ do
(dp, np) <- compareOutput False defopts docxFile nativeFile
return $ test id name (dp, np)
-- testForWarnings :: String -> FilePath -> [String] -> TestTree
-- testForWarnings = testForWarningsWithOpts defopts
@ -266,10 +257,6 @@ tests = [ testGroup "inlines"
"dropcap paragraphs"
"docx/drop_cap.docx"
"docx/drop_cap.native"
, testCompareNoStrip
"empty paragraphs without stripping"
"docx/drop_cap.docx"
"docx/drop_cap_nostrip.native"
]
, testGroup "track changes"
[ testCompare

View file

@ -88,20 +88,17 @@
</tr>
<tr class="even">
<td><p>1</p></td>
<td>
<p><a href="Sébastien_Loeb" title="wikilink">Sébastien Loeb</a></p></td>
<td><p><a href="Sébastien_Loeb" title="wikilink">Sébastien Loeb</a></p></td>
<td><p>78</p></td>
</tr>
<tr class="odd">
<td><p>2</p></td>
<td>
<p><strong><a href="Sébastien_Ogier" title="wikilink">Sébastien Ogier</a></strong></p></td>
<td><p><strong><a href="Sébastien_Ogier" title="wikilink">Sébastien Ogier</a></strong></p></td>
<td><p>38</p></td>
</tr>
<tr class="even">
<td><p>10</p></td>
<td>
<p><a href="Hannu_Mikkola" title="wikilink">Hannu Mikkola</a></p></td>
<td><p><a href="Hannu_Mikkola" title="wikilink">Hannu Mikkola</a></p></td>
<td><p>18</p></td>
</tr>
</tbody>

View file

@ -25,8 +25,7 @@
<td style="text-align: left;">thank you</td>
</tr>
<tr class="odd">
<td style="text-align: right;">
<p><em>blah</em></p></td>
<td style="text-align: right;"><p><em>blah</em></p></td>
<td style="text-align: left;"><em>blah</em></td>
<td style="text-align: left;"><em>blah</em></td>
</tr>

View file

@ -0,0 +1,95 @@
```
% pandoc -f native -t docx | pandoc -f docx -t native
[Para [Str "hi"], Para [], Para [], Para [Str "lo"]]
^D
[Para [Str "hi"]
,Para [Str "lo"]]
```
```
% pandoc -f native -t docx+empty_paragraphs | pandoc -f docx -t native
[Para [Str "hi"], Para [], Para [], Para [Str "lo"]]
^D
[Para [Str "hi"]
,Para [Str "lo"]]
```
```
% pandoc -f native -t docx | pandoc -f docx+empty_paragraphs -t native
[Para [Str "hi"], Para [], Para [], Para [Str "lo"]]
^D
[Para [Str "hi"]
,Para [Str "lo"]]
```
```
% pandoc -f native -t docx+empty_paragraphs | pandoc -f docx+empty_paragraphs -t native
[Para [Str "hi"], Para [], Para [], Para [Str "lo"]]
^D
[Para [Str "hi"]
,Para []
,Para []
,Para [Str "lo"]]
```
```
% pandoc -f native -t html5
[Para [Str "hi"], Para [], Para [], Para [Str "lo"]]
^D
<p>hi</p>
<p>lo</p>
```
```
% pandoc -f native -t html5+empty_paragraphs
[Para [Str "hi"], Para [], Para [], Para [Str "lo"]]
^D
<p>hi</p>
<p></p>
<p></p>
<p>lo</p>
```
```
% pandoc -f html+empty_paragraphs -t native
<p>hi</p>
<p></p>
<p></p>
<p>lo</p>
^D
[Para [Str "hi"]
,Para []
,Para []
,Para [Str "lo"]]
```
```
% pandoc -f html -t native
<p>hi</p>
<p></p>
<p></p>
<p>lo</p>
^D
[Para [Str "hi"]
,Para [Str "lo"]]
```
```
% pandoc -f native -t opendocument+empty_paragraphs
[Para [Str "hi"], Para [], Para [], Para [Str "lo"]]
^D
<text:p text:style-name="Text_20_body">hi</text:p>
<text:p text:style-name="Text_20_body"></text:p>
<text:p text:style-name="Text_20_body"></text:p>
<text:p text:style-name="Text_20_body">lo</text:p>
```
```
% pandoc -f native -t opendocument
[Para [Str "hi"], Para [], Para [], Para [Str "lo"]]
^D
<text:p text:style-name="Text_20_body">hi</text:p>
<text:p text:style-name="Text_20_body">lo</text:p>
```

View file

@ -1,15 +1,15 @@
[Table [] [AlignDefault] [0.0]
[[]]
[[[Plain []]]
[[[]]
,[[Plain [Str "User\8217s",Space,Str "Guide"]]]
,[[Plain []]]
,[[Plain []]]
,[[Plain []]]
,[[]]
,[[]]
,[[]]
,[[Plain [Str "11",Space,Str "August",Space,Str "2017"]]]
,[[Plain []]]
,[[Plain []]]
,[[Plain []]]
,[[Plain []]]]
,[[]]
,[[]]
,[[]]
,[[]]]
,Para [Str "CONTENTS"]
,Para [Strong [Str "Section",Space,Str "Page"]]
,Para [Str "FIGURES",Space,Str "iv"]

View file

@ -1,4 +1,4 @@
[Para [Str "I",Space,Str "want",Space,Span ("",["comment-start"],[("id","0"),("author","Jesse Rosenthal"),("date","2016-05-09T16:13:00Z")]) [Str "I",Space,Str "left",Space,Str "a",Space,Str "comment."],Str "some",Space,Str "text",Space,Str "to",Space,Str "have",Space,Str "a",Space,Str "comment",Space,Span ("",["comment-end"],[("id","0")]) [],Str "on",Space,Str "it."]
,Para [Str "This",Space,Str "is",Space,Span ("",["comment-start"],[("id","1"),("author","Jesse Rosenthal"),("date","2016-05-09T16:13:00Z")]) [Str "A",Space,Str "comment",Space,Str "across",Space,Str "paragraphs."],Str "a",Space,Str "new",Space,Str "paragraph."]
,Para [Str "And",Space,Str "so",Span ("",["comment-end"],[("id","1")]) [],Space,Str "is",Space,Str "this."]
,Para [Str "One",Space,Span ("",["comment-start"],[("id","2"),("author","Jesse Rosenthal"),("date","2016-05-09T16:14:00Z")]) [Str "This",Space,Str "one",Space,Str "has",Space,Str "multiple",Space,Str "paragraphs.",Space,Str "\182",Space,Str "\182",Space,Str "See?"],Str "more",Span ("",["comment-end"],[("id","2")]) [],Str ".",Space,Str "And",Space,Str "this",Space,Str "is",Space,Str "one",Space,Str "with",Space,Str "a",Space,Span ("",["comment-start"],[("id","3"),("author","Jesse Rosenthal"),("date","2016-06-22T14:35:00Z")]) [Str "Do",Space,Str "something."],Span ("",["comment-start"],[("id","4"),("author","Jesse Rosenthal"),("date","2016-06-22T14:36:00Z")]) [Str "Do",Space,Str "something",Space,Str "else."],Str "comment",Space,Str "in",Space,Str "a",Space,Str "comment",Span ("",["comment-end"],[("id","3")]) [Span ("",["comment-end"],[("id","4")]) []],Str "."]]
,Para [Str "One",Space,Span ("",["comment-start"],[("id","2"),("author","Jesse Rosenthal"),("date","2016-05-09T16:14:00Z")]) [Str "This",Space,Str "one",Space,Str "has",Space,Str "multiple",Space,Str "paragraphs.",Space,Str "\182",Space,Str "See?"],Str "more",Span ("",["comment-end"],[("id","2")]) [],Str ".",Space,Str "And",Space,Str "this",Space,Str "is",Space,Str "one",Space,Str "with",Space,Str "a",Space,Span ("",["comment-start"],[("id","3"),("author","Jesse Rosenthal"),("date","2016-06-22T14:35:00Z")]) [Str "Do",Space,Str "something."],Span ("",["comment-start"],[("id","4"),("author","Jesse Rosenthal"),("date","2016-06-22T14:36:00Z")]) [Str "Do",Space,Str "something",Space,Str "else."],Str "comment",Space,Str "in",Space,Str "a",Space,Str "comment",Span ("",["comment-end"],[("id","3")]) [Span ("",["comment-end"],[("id","4")]) []],Str "."]]

View file

@ -1,9 +0,0 @@
[Para [Str "Drop",Space,Str "cap."]
,Para []
,Para [Str "Next",Space,Str "paragraph."]
,Para []
,Para [Str "Drop",Space,Str "cap",Space,Str "in",Space,Str "margin."]
,Para []
,Para []
,Para []
,Para [Str "Drop",Space,Str "cap",Space,Str "(not",Space,Str "really)."]]