From 44120ea7165546152af88fd442c52ab0f201052e Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Sat, 12 Dec 2015 17:28:52 -0800
Subject: [PATCH] Implemented `east_asian_line_breaks` extension.

Text.Pandoc.Options: Added `Ext_east_asian_line_breaks` constructor to
`Extension` (API change).

This extension is like `ignore_line_breaks`, but smarter -- it
only ignores line breaks between two East Asian wide characters.
This makes it better suited for writing with a mix of East Asian
and non-East Asian scripts.

Closes #2586.
---
 README                              | 10 +++++++++-
 src/Text/Pandoc/Options.hs          |  2 ++
 src/Text/Pandoc/Readers/Markdown.hs | 16 +++++++++++++++-
 3 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/README b/README
index 8f00ac3a8..ea6a25871 100644
--- a/README
+++ b/README
@@ -3224,7 +3224,15 @@ treated as spaces or as hard line breaks.  This option is intended for
 use with East Asian languages where spaces are not used between words,
 but text is divided into lines for readability.
 
-#### Extension: `emoji` ####
+#### Extension: `east_asian_line_breaks` ####
+
+Causes newlines within a paragraph to be ignored, rather than
+being treated as spaces or as hard line breaks, when they occur
+between two East Asian wide characters.  This is a better choice
+than `ignore_line_breaks` for texts that include a mix of East
+Asian wide characters and other characters.
+
+#### Extension: `emoji` ####
 
 Parses textual emojis like `:smile:` as Unicode emoticons.
 
diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs
index 9f27f46f9..7dd47cd59 100644
--- a/src/Text/Pandoc/Options.hs
+++ b/src/Text/Pandoc/Options.hs
@@ -106,6 +106,8 @@ data Extension =
     | Ext_subscript           -- ^ Subscript using ~this~ syntax
     | Ext_hard_line_breaks    -- ^ All newlines become hard line breaks
     | Ext_ignore_line_breaks  -- ^ Newlines in paragraphs are ignored
+    | Ext_east_asian_line_breaks  -- ^ Newlines in paragraphs are ignored between
+                              -- East Asian wide characters
     | Ext_literate_haskell    -- ^ Enable literate Haskell conventions
     | Ext_abbreviations       -- ^ PHP markdown extra abbreviation definitions
     | Ext_emoji               -- ^ Support emoji like :smile:
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 0b7faadb7..999ab11de 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -40,6 +40,7 @@ import Data.Char ( isSpace, isAlphaNum, toLower )
 import Data.Maybe
 import Text.Pandoc.Definition
 import Text.Pandoc.Emoji (emojis)
+import Text.Pandoc.Generic (bottomUp)
 import qualified Data.Text as T
 import Data.Text (Text)
 import qualified Data.Yaml as Yaml
@@ -51,6 +52,7 @@ import qualified Data.Vector as V
 import Text.Pandoc.Builder (Inlines, Blocks, trimInlines)
 import Text.Pandoc.Options
 import Text.Pandoc.Shared
+import Text.Pandoc.Pretty (charWidth)
 import Text.Pandoc.XML (fromEntities)
 import Text.Pandoc.Parsing hiding (tableWith)
 import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock )
@@ -356,7 +358,19 @@ parseMarkdown = do
   st <- getState
   let meta = runF (stateMeta' st) st
   let Pandoc _ bs = B.doc $ runF blocks st
-  return $ Pandoc meta bs
+  eastAsianLineBreaks <- option False $
+                    True <$ guardEnabled Ext_east_asian_line_breaks
+  return $ (if eastAsianLineBreaks
+               then bottomUp softBreakFilter
+               else id) $ Pandoc meta bs
+
+-- | Drop a leading-position 'SoftBreak' flanked by characters of 'charWidth' 2.
+softBreakFilter :: [Inline] -> [Inline]
+softBreakFilter (l:SoftBreak:r:rest)
+  | wide (reverse (stringify l)) && wide (stringify r) = l:r:rest
+  where wide (c:_) = charWidth c == 2
+        wide []    = False
+softBreakFilter inlines = inlines
 
 referenceKey :: MarkdownParser (F Blocks)
 referenceKey = try $ do