Implemented east_asian_line_breaks extension.

Text.Pandoc.Options: Added `Ext_east_asian_line_breaks` constructor to `Extension` (API change). This extension is like `ignore_line_breaks`, but smarter -- it only ignores line breaks between two East Asian wide characters. This makes it better suited for writing with a mix of East Asian and non-East Asian scripts. Closes #2586.
2015-12-12 17:28:52 -08:00 · 2015-12-12 17:28:52 -08:00 · 44120ea716
commit 44120ea716
parent 60d383e27e
3 changed files with 26 additions and 2 deletions
--- a/10
+++ b/10
@@ -3224,7 +3224,15 @@ treated as spaces or as hard line breaks.  This option is intended for
 use with East Asian languages where spaces are not used between words,
 but text is divided into lines for readability.

-#### Extension: `emoji` ####
+#### Extension: `east_asian_line_breaks` ####
+
+Causes newlines within a paragraph to be ignored, rather than
+being treated as spaces or as hard line breaks, when they occur
+between two East Asian wide characters.  This is a better choice
+than `ignore_line_breaks` for texts that include a mix of East
+Asian wide characters and other characters.
+
+#### Extension: `emoji` ####

 Parses textual emojis like `:smile:` as Unicode emoticons.

--- a/src/Text/Pandoc/Options.hs
+++ b/src/Text/Pandoc/Options.hs
@@ -106,6 +106,8 @@ data Extension =
    | Ext_subscript           -- ^ Subscript using ~this~ syntax
    | Ext_hard_line_breaks    -- ^ All newlines become hard line breaks
    | Ext_ignore_line_breaks  -- ^ Newlines in paragraphs are ignored
+    | Ext_east_asian_line_breaks  -- ^ Newlines in paragraphs are ignored between
+                              -- East Asian wide characters
    | Ext_literate_haskell    -- ^ Enable literate Haskell conventions
    | Ext_abbreviations       -- ^ PHP markdown extra abbreviation definitions
    | Ext_emoji               -- ^ Support emoji like :smile:
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -40,6 +40,7 @@ import Data.Char ( isSpace, isAlphaNum, toLower )
 import Data.Maybe
 import Text.Pandoc.Definition
 import Text.Pandoc.Emoji (emojis)
+import Text.Pandoc.Generic (bottomUp)
 import qualified Data.Text as T
 import Data.Text (Text)
 import qualified Data.Yaml as Yaml
@@ -51,6 +52,7 @@ import qualified Data.Vector as V
 import Text.Pandoc.Builder (Inlines, Blocks, trimInlines)
 import Text.Pandoc.Options
 import Text.Pandoc.Shared
+import Text.Pandoc.Pretty (charWidth)
 import Text.Pandoc.XML (fromEntities)
 import Text.Pandoc.Parsing hiding (tableWith)
 import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock )
@@ -356,7 +358,19 @@ parseMarkdown = do
  st <- getState
  let meta = runF (stateMeta' st) st
  let Pandoc _ bs = B.doc $ runF blocks st
-  return $ Pandoc meta bs
+  eastAsianLineBreaks <- option False $
+                    True <$ guardEnabled Ext_east_asian_line_breaks
+  return $ (if eastAsianLineBreaks
+               then bottomUp softBreakFilter
+               else id) $ Pandoc meta bs
+
+softBreakFilter :: [Inline] -> [Inline]  -- removes a SoftBreak whose two neighbours meet at width-2 characters
+softBreakFilter (x:SoftBreak:y:zs) =  -- only a SoftBreak in second position, with an inline on each side, is examined
+  case (stringify x, stringify y) of  -- stringify comes from Text.Pandoc.Shared; presumably the plain-text content (confirm)
+        (xs@(_:_), (c:_))  -- both strings must be non-empty; this also makes `last xs` below safe
+          | charWidth (last xs) == 2 && charWidth c == 2 -> x:y:zs  -- last char before and first char after are both width 2: drop the break
+        _ -> x:SoftBreak:y:zs  -- otherwise the list is rebuilt with the SoftBreak kept
+softBreakFilter xs = xs  -- every other list shape is returned unchanged

 referenceKey :: MarkdownParser (F Blocks)
 referenceKey = try $ do