Added --strip-comments option, readerStripComments in ReaderOptions.

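* Options: Added `readerStripComments` to `ReaderOptions`.
* Added `--strip-comments` command-line option.
* Made `htmlTag` in the HTML reader honor this option: when it is set,
  an HTML comment is still consumed but is returned as an empty string
  instead of as raw HTML.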

This affects Markdown and Textile input.

Closes #2552.
This commit is contained in:
John MacFarlane 2017-09-17 12:49:15 -07:00
parent ce05814372
commit b1ee747a24
6 changed files with 46 additions and 8 deletions

View file

@ -655,6 +655,14 @@ General writer options
of contents. The default is 3 (which means that level 1, 2, and 3 of contents. The default is 3 (which means that level 1, 2, and 3
headers will be listed in the contents). headers will be listed in the contents).
`--strip-comments`
: Strip out HTML comments in the Markdown or Textile source,
rather than passing them on to Markdown, Textile or HTML
output as raw HTML. This does not apply to HTML comments
inside raw HTML blocks when the `markdown_in_html_blocks`
extension is not set.
`--no-highlight` `--no-highlight`
: Disables syntax highlighting for code blocks and inlines, even when : Disables syntax highlighting for code blocks and inlines, even when

View file

@ -483,6 +483,7 @@ convertWithOpts opts = do
, readerTrackChanges = optTrackChanges opts , readerTrackChanges = optTrackChanges opts
, readerAbbreviations = abbrevs , readerAbbreviations = abbrevs
, readerExtensions = readerExts , readerExtensions = readerExts
, readerStripComments = optStripComments opts
} }
let transforms = (case optBaseHeaderLevel opts of let transforms = (case optBaseHeaderLevel opts of
@ -666,6 +667,7 @@ data Opt = Opt
, optIncludeInHeader :: [FilePath] -- ^ Files to include in header , optIncludeInHeader :: [FilePath] -- ^ Files to include in header
, optResourcePath :: [FilePath] -- ^ Path to search for images etc , optResourcePath :: [FilePath] -- ^ Path to search for images etc
, optEol :: LineEnding -- ^ Style of line-endings to use , optEol :: LineEnding -- ^ Style of line-endings to use
, optStripComments :: Bool -- ^ Skip HTML comments
} deriving (Generic, Show) } deriving (Generic, Show)
instance ToJSON Opt where instance ToJSON Opt where
@ -742,6 +744,7 @@ defaultOpts = Opt
, optIncludeInHeader = [] , optIncludeInHeader = []
, optResourcePath = ["."] , optResourcePath = ["."]
, optEol = Native , optEol = Native
, optStripComments = False
} }
addMetadata :: (String, String) -> Pandoc -> Pandoc addMetadata :: (String, String) -> Pandoc -> Pandoc
@ -1114,6 +1117,11 @@ options =
"NUMBER") "NUMBER")
"" -- "Length of line in characters" "" -- "Length of line in characters"
, Option "" ["strip-comments"]
(NoArg
(\opt -> return opt { optStripComments = True }))
"" -- "Strip HTML comments"
, Option "" ["toc", "table-of-contents"] , Option "" ["toc", "table-of-contents"]
(NoArg (NoArg
(\opt -> return opt { optTableOfContents = True })) (\opt -> return opt { optTableOfContents = True }))

View file

@ -65,7 +65,8 @@ data ReaderOptions = ReaderOptions{
-- indented code blocks -- indented code blocks
, readerAbbreviations :: Set.Set String -- ^ Strings to treat as abbreviations , readerAbbreviations :: Set.Set String -- ^ Strings to treat as abbreviations
, readerDefaultImageExtension :: String -- ^ Default extension for images , readerDefaultImageExtension :: String -- ^ Default extension for images
, readerTrackChanges :: TrackChanges , readerTrackChanges :: TrackChanges -- ^ Track changes setting for docx
, readerStripComments :: Bool -- ^ Strip HTML comments instead of parsing as raw HTML
} deriving (Show, Read, Data, Typeable, Generic) } deriving (Show, Read, Data, Typeable, Generic)
instance ToJSON ReaderOptions where instance ToJSON ReaderOptions where
@ -82,6 +83,7 @@ instance Default ReaderOptions
, readerAbbreviations = defaultAbbrevs , readerAbbreviations = defaultAbbrevs
, readerDefaultImageExtension = "" , readerDefaultImageExtension = ""
, readerTrackChanges = AcceptChanges , readerTrackChanges = AcceptChanges
, readerStripComments = False
} }
defaultAbbrevs :: Set.Set String defaultAbbrevs :: Set.Set String

View file

@ -46,9 +46,10 @@ import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..)) import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..))
import Text.Pandoc.Shared ( extractSpaces, addMetaField import Text.Pandoc.Shared ( extractSpaces, addMetaField
, escapeURI, safeRead, crFilter ) , escapeURI, safeRead, crFilter )
import Text.Pandoc.Options (ReaderOptions(readerExtensions), extensionEnabled, import Text.Pandoc.Options (
Extension (Ext_epub_html_exts, ReaderOptions(readerExtensions,readerStripComments), extensionEnabled,
Ext_raw_html, Ext_native_divs, Ext_native_spans)) Extension (Ext_epub_html_exts,
Ext_raw_html, Ext_native_divs, Ext_native_spans))
import Text.Pandoc.Logging import Text.Pandoc.Logging
import Text.Pandoc.Parsing hiding ((<|>)) import Text.Pandoc.Parsing hiding ((<|>))
import Text.Pandoc.Walk import Text.Pandoc.Walk
@ -1070,7 +1071,7 @@ _ `closes` _ = False
--- parsers for use in markdown, textile readers --- parsers for use in markdown, textile readers
-- | Matches a stretch of HTML in balanced tags. -- | Matches a stretch of HTML in balanced tags.
htmlInBalanced :: (Monad m) htmlInBalanced :: (HasReaderOptions st, Monad m)
=> (Tag String -> Bool) => (Tag String -> Bool)
-> ParserT String st m String -> ParserT String st m String
htmlInBalanced f = try $ do htmlInBalanced f = try $ do
@ -1118,7 +1119,7 @@ hasTagWarning (TagWarning _:_) = True
hasTagWarning _ = False hasTagWarning _ = False
-- | Matches a tag meeting a certain condition. -- | Matches a tag meeting a certain condition.
htmlTag :: Monad m htmlTag :: (HasReaderOptions st, Monad m)
=> (Tag String -> Bool) => (Tag String -> Bool)
-> ParserT [Char] st m (Tag String, String) -> ParserT [Char] st m (Tag String, String)
htmlTag f = try $ do htmlTag f = try $ do
@ -1153,7 +1154,10 @@ htmlTag f = try $ do
count (length s + 4) anyChar count (length s + 4) anyChar
skipMany (satisfy (/='>')) skipMany (satisfy (/='>'))
char '>' char '>'
return (next, "<!--" <> s <> "-->") stripComments <- getOption readerStripComments
if stripComments
then return (next, "")
else return (next, "<!--" <> s <> "-->")
| otherwise -> fail "bogus comment mode, HTML5 parse error" | otherwise -> fail "bogus comment mode, HTML5 parse error"
TagOpen tagname attr -> do TagOpen tagname attr -> do
guard $ all (isName . fst) attr guard $ all (isName . fst) attr

View file

@ -1079,7 +1079,9 @@ htmlBlock' = try $ do
first <- htmlElement first <- htmlElement
skipMany spaceChar skipMany spaceChar
optional blanklines optional blanklines
return $ return $ B.rawBlock "html" first return $ if null first
then mempty
else return $ B.rawBlock "html" first
strictHtmlBlock :: PandocMonad m => MarkdownParser m String strictHtmlBlock :: PandocMonad m => MarkdownParser m String
strictHtmlBlock = htmlInBalanced (not . isInlineTag) strictHtmlBlock = htmlInBalanced (not . isInlineTag)

14
test/command/2552.md Normal file
View file

@ -0,0 +1,14 @@
```
% pandoc --strip-comments
Foo

bar

<!-- comment -->

baz<!-- bim -->boop
^D
<p>Foo</p>
<p>bar</p>
<p>bazboop</p>
```