From 19daf6cf0a336e0ffa08b2fb0e0c9932d6fef2a6 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Fri, 8 Aug 2014 21:04:25 -0700
Subject: [PATCH] Added `native_divs` and `native_spans` extensions.

This allows users to turn off the default pandoc behavior of
parsing contents of div and span tags in markdown and HTML
as native pandoc Div blocks and Span inlines.

Setting of default epub extensions has been moved from the EPUB
reader to Text.Pandoc.
---
 README                              | 14 ++++++++++++++
 src/Text/Pandoc.hs                  | 12 +++++++++++-
 src/Text/Pandoc/Options.hs          |  4 ++++
 src/Text/Pandoc/Readers/EPUB.hs     | 10 +++-------
 src/Text/Pandoc/Readers/HTML.hs     |  5 ++++-
 src/Text/Pandoc/Readers/Markdown.hs |  4 ++--
 6 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/README b/README
index 161092550..d7031d9fa 100644
--- a/README
+++ b/README
@@ -2238,6 +2238,20 @@ markdown with HTML block elements.  For example, one can surround
 a block of markdown text with `<div>` tags without preventing it
 from being interpreted as markdown.
 
+#### Extension: `native_divs` ####
+
+Use native pandoc `Div` blocks for content inside `<div>` tags.
+For the most part this should give the same output as
+`markdown_in_html_blocks`, but it makes it easier to write pandoc
+filters to manipulate groups of blocks.
+
+#### Extension: `native_spans` ####
+
+Use native pandoc `Span` blocks for content inside `<span>` tags.
+For the most part this should give the same output as `raw_html`,
+but it makes it easier to write pandoc filters to manipulate groups
+of inlines.
+
 Raw TeX
 -------
 
diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs
index 589a6af98..c7c64f0fc 100644
--- a/src/Text/Pandoc.hs
+++ b/src/Text/Pandoc.hs
@@ -304,7 +304,17 @@ getDefaultExtensions "markdown_github" = githubMarkdownExtensions
 getDefaultExtensions "markdown"        = pandocExtensions
 getDefaultExtensions "plain"           = pandocExtensions
 getDefaultExtensions "org"             = Set.fromList [Ext_citations]
-getDefaultExtensions "textile"         = Set.fromList [Ext_auto_identifiers, Ext_raw_tex]
+getDefaultExtensions "textile"         = Set.fromList [Ext_auto_identifiers,
+                                                       Ext_raw_tex]
+getDefaultExtensions "html"            = Set.fromList [Ext_auto_identifiers,
+                                                       Ext_native_divs,
+                                                       Ext_native_spans]
+getDefaultExtensions "html5"           = getDefaultExtensions "html"
+getDefaultExtensions "epub"            = Set.fromList [Ext_auto_identifiers,
+                                                       Ext_raw_html,
+                                                       Ext_native_divs,
+                                                       Ext_native_spans,
+                                                       Ext_epub_html_exts]
 getDefaultExtensions _                 = Set.fromList [Ext_auto_identifiers]
 
 -- | Retrieve reader based on formatSpec (format+extensions).
diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs
index bb213bac0..84ccbbdc9 100644
--- a/src/Text/Pandoc/Options.hs
+++ b/src/Text/Pandoc/Options.hs
@@ -77,6 +77,8 @@ data Extension =
     | Ext_backtick_code_blocks    -- ^ Github style ``` code blocks
     | Ext_inline_code_attributes  -- ^ Allow attributes on inline code
     | Ext_markdown_in_html_blocks -- ^ Interpret as markdown inside HTML blocks
+    | Ext_native_divs             -- ^ Use Div blocks for contents of <div> tags
+    | Ext_native_spans            -- ^ Use Span inlines for contents of <span>
     | Ext_markdown_attribute      -- ^ Interpret text inside HTML as markdown
                                   --   iff container has attribute 'markdown'
     | Ext_escaped_line_breaks     -- ^ Treat a backslash at EOL as linebreak
@@ -131,6 +133,8 @@ pandocExtensions = Set.fromList
   , Ext_backtick_code_blocks
   , Ext_inline_code_attributes
   , Ext_markdown_in_html_blocks
+  , Ext_native_divs
+  , Ext_native_spans
   , Ext_escaped_line_breaks
   , Ext_fancy_lists
   , Ext_startnum
diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs
index 968b815c0..7462b3711 100644
--- a/src/Text/Pandoc/Readers/EPUB.hs
+++ b/src/Text/Pandoc/Readers/EPUB.hs
@@ -13,8 +13,7 @@ import Text.Pandoc.Definition hiding (Attr)
 import Text.Pandoc.Walk (walk, query)
 import Text.Pandoc.Generic(bottomUp)
 import Text.Pandoc.Readers.HTML (readHtml)
-import Text.Pandoc.Options ( ReaderOptions(..), readerExtensions, Extension(..)
-                           , readerTrace)
+import Text.Pandoc.Options ( ReaderOptions(..), readerTrace)
 import Text.Pandoc.Shared (escapeURI)
 import Text.Pandoc.MediaBag (MediaBag, insertMedia)
 import Text.Pandoc.Compat.Except (MonadError, throwError, runExcept, Except)
@@ -32,7 +31,6 @@ import Data.Monoid (mempty, (<>))
 import Data.List (isPrefixOf, isInfixOf)
 import Data.Maybe (mapMaybe, fromMaybe)
 import qualified Data.Map as M (Map, lookup, fromList, elems)
-import qualified Data.Set as S (insert)
 import Control.DeepSeq.Generics (deepseq, NFData)
 
 import Debug.Trace (trace)
@@ -65,9 +63,7 @@ archiveToEPUB os archive = do
   let mediaBag = fetchImages (M.elems items) root archive ast
   return $ (ast, mediaBag)
   where
-    rs = readerExtensions os
-    os' = os {readerExtensions = foldr S.insert rs [Ext_epub_html_exts, Ext_raw_html]}
-    os'' = os' {readerParseRaw = True}
+    os' = os {readerParseRaw = True}
     parseSpineElem :: MonadError String m => FilePath -> (FilePath, MIME) -> m Pandoc
     parseSpineElem (normalise -> r) (normalise -> path, mime) = do
       when (readerTrace os) (traceM path)
@@ -78,7 +74,7 @@ archiveToEPUB os archive = do
     mimeToReader "application/xhtml+xml" r path = do
       fname <- findEntryByPathE (r </> path) archive
       return $ fixInternalReferences (r </> path) .
-                readHtml os'' .
+                readHtml os' .
                   UTF8.toStringLazy $
                     fromEntry fname
     mimeToReader s _ path
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 3d988cd80..1789b865f 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -45,7 +45,8 @@ import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..))
 import Text.Pandoc.Shared ( extractSpaces, renderTags'
                           , escapeURI, safeRead )
 import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace)
-                           , Extension (Ext_epub_html_exts))
+                           , Extension (Ext_epub_html_exts,
+                               Ext_native_divs, Ext_native_spans))
 import Text.Pandoc.Parsing hiding ((<|>))
 import Text.Pandoc.Walk
 import Data.Maybe ( fromMaybe, isJust)
@@ -296,6 +297,7 @@ pRawTag = do
 
 pDiv :: TagParser Blocks
 pDiv = try $ do
+  guardEnabled Ext_native_divs
   TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="div") (const True)
   contents <- pInTags "div" block
   return $ B.divWith (mkAttr attr) contents
@@ -560,6 +562,7 @@ pCode = try $ do
 
 pSpan :: TagParser Inlines
 pSpan = try $ do
+  guardEnabled Ext_native_spans
   TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True)
   contents <- pInTags "span" inline
   return $ B.spanWith (mkAttr attr) contents
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 04b3fa684..861f81b23 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -1764,7 +1764,7 @@ inBrackets parser = do
 
 spanHtml :: MarkdownParser (F Inlines)
 spanHtml = try $ do
-  guardEnabled Ext_markdown_in_html_blocks
+  guardEnabled Ext_native_spans
   (TagOpen _ attrs, _) <- htmlTag (~== TagOpen "span" [])
   contents <- mconcat <$> manyTill inline (htmlTag (~== TagClose "span"))
   let ident = fromMaybe "" $ lookup "id" attrs
@@ -1779,7 +1779,7 @@ spanHtml = try $ do
 
 divHtml :: MarkdownParser (F Blocks)
 divHtml = try $ do
-  guardEnabled Ext_markdown_in_html_blocks
+  guardEnabled Ext_native_divs
   (TagOpen _ attrs, rawtag) <- htmlTag (~== TagOpen "div" [])
   -- we set stateInHtmlBlock so that closing tags that can be either block or
   -- inline will not be parsed as inline tags