From b1ee747a249e0c1b1840222ef77607218157f099 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 17 Sep 2017 12:49:15 -0700 Subject: [PATCH] Added `--strip-comments` option, `readerStripComments` in `ReaderOptions`. * Options: Added readerStripComments to ReaderOptions. * Added `--strip-comments` command-line option. * Made `htmlTag` from the HTML reader sensitive to this feature. This affects Markdown and Textile input. Closes #2552. --- MANUAL.txt | 8 ++++++++ src/Text/Pandoc/App.hs | 8 ++++++++ src/Text/Pandoc/Options.hs | 4 +++- src/Text/Pandoc/Readers/HTML.hs | 16 ++++++++++------ src/Text/Pandoc/Readers/Markdown.hs | 4 +++- test/command/2552.md | 14 ++++++++++++++ 6 files changed, 46 insertions(+), 8 deletions(-) create mode 100644 test/command/2552.md diff --git a/MANUAL.txt b/MANUAL.txt index 8491c1998..564f7f317 100644 --- a/MANUAL.txt +++ b/MANUAL.txt @@ -655,6 +655,14 @@ General writer options of contents. The default is 3 (which means that level 1, 2, and 3 headers will be listed in the contents). +`--strip-comments` + +: Strip out HTML comments in the Markdown or Textile source, + rather than passing them on to Markdown, Textile or HTML + output as raw HTML. This does not apply to HTML comments + inside raw HTML blocks when the `markdown_in_html_blocks` + extension is not set. + `--no-highlight` : Disables syntax highlighting for code blocks and inlines, even when diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs index f8e23b10c..e5be7e620 100644 --- a/src/Text/Pandoc/App.hs +++ b/src/Text/Pandoc/App.hs @@ -483,6 +483,7 @@ convertWithOpts opts = do , readerTrackChanges = optTrackChanges opts , readerAbbreviations = abbrevs , readerExtensions = readerExts + , readerStripComments = optStripComments opts } let transforms = (case optBaseHeaderLevel opts of @@ -666,6 +667,7 @@ data Opt = Opt , optIncludeInHeader :: [FilePath] -- ^ Files to include in header , optResourcePath :: [FilePath] -- ^ Path to search for images etc , optEol :: LineEnding -- ^ Style of line-endings to use + , optStripComments :: Bool -- ^ Skip HTML comments } deriving (Generic, Show) instance ToJSON Opt where @@ -742,6 +744,7 @@ defaultOpts = Opt , optIncludeInHeader = [] , optResourcePath = ["."] , optEol = Native + , optStripComments = False } addMetadata :: (String, String) -> Pandoc -> Pandoc @@ -1114,6 +1117,11 @@ options = "NUMBER") "" -- "Length of line in characters" + , Option "" ["strip-comments"] + (NoArg + (\opt -> return opt { optStripComments = True })) + "" -- "Strip HTML comments" + , Option "" ["toc", "table-of-contents"] (NoArg (\opt -> return opt { optTableOfContents = True })) diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs index cd353e18e..345245855 100644 --- a/src/Text/Pandoc/Options.hs +++ b/src/Text/Pandoc/Options.hs @@ -65,7 +65,8 @@ data ReaderOptions = ReaderOptions{ -- indented code blocks , readerAbbreviations :: Set.Set String -- ^ Strings to treat as abbreviations , readerDefaultImageExtension :: String -- ^ Default extension for images - , readerTrackChanges :: TrackChanges + , readerTrackChanges :: TrackChanges -- ^ Track changes setting for docx + , readerStripComments :: Bool -- ^ Strip HTML comments instead of parsing as raw HTML } deriving (Show, Read, Data, Typeable, Generic) instance ToJSON ReaderOptions where @@ -82,6 +83,7 @@ instance Default ReaderOptions , readerAbbreviations = defaultAbbrevs , readerDefaultImageExtension = "" , readerTrackChanges = AcceptChanges + , readerStripComments = False } defaultAbbrevs :: Set.Set String diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 2093be19c..4cbc03089 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -46,9 +46,10 @@ import qualified Text.Pandoc.Builder as B import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..)) import Text.Pandoc.Shared ( extractSpaces, addMetaField , escapeURI, safeRead, crFilter ) -import Text.Pandoc.Options (ReaderOptions(readerExtensions), extensionEnabled, - Extension (Ext_epub_html_exts, - Ext_raw_html, Ext_native_divs, Ext_native_spans)) +import Text.Pandoc.Options ( + ReaderOptions(readerExtensions,readerStripComments), extensionEnabled, + Extension (Ext_epub_html_exts, + Ext_raw_html, Ext_native_divs, Ext_native_spans)) import Text.Pandoc.Logging import Text.Pandoc.Parsing hiding ((<|>)) import Text.Pandoc.Walk @@ -1070,7 +1071,7 @@ _ `closes` _ = False --- parsers for use in markdown, textile readers -- | Matches a stretch of HTML in balanced tags. -htmlInBalanced :: (Monad m) +htmlInBalanced :: (HasReaderOptions st, Monad m) => (Tag String -> Bool) -> ParserT String st m String htmlInBalanced f = try $ do @@ -1118,7 +1119,7 @@ hasTagWarning (TagWarning _:_) = True hasTagWarning _ = False -- | Matches a tag meeting a certain condition. -htmlTag :: Monad m +htmlTag :: (HasReaderOptions st, Monad m) => (Tag String -> Bool) -> ParserT [Char] st m (Tag String, String) htmlTag f = try $ do @@ -1153,7 +1154,10 @@ htmlTag f = try $ do count (length s + 4) anyChar skipMany (satisfy (/='>')) char '>' - return (next, "") + stripComments <- getOption readerStripComments + if stripComments + then return (next, "") + else return (next, "") | otherwise -> fail "bogus comment mode, HTML5 parse error" TagOpen tagname attr -> do guard $ all (isName . fst) attr diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index c2a73dcc5..1364f25cb 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -1079,7 +1079,9 @@ htmlBlock' = try $ do first <- htmlElement skipMany spaceChar optional blanklines - return $ return $ B.rawBlock "html" first + return $ if null first + then mempty + else return $ B.rawBlock "html" first strictHtmlBlock :: PandocMonad m => MarkdownParser m String strictHtmlBlock = htmlInBalanced (not . isInlineTag) diff --git a/test/command/2552.md b/test/command/2552.md new file mode 100644 index 000000000..90a3a381c --- /dev/null +++ b/test/command/2552.md @@ -0,0 +1,14 @@ +``` +% pandoc --strip-comments +Foo + +bar + + + +bazboop +^D +

Foo

+

bar

+

bazboop

+```