From 7b111542c0ef62802a65986b41829196510e5b3e Mon Sep 17 00:00:00 2001
From: "paul.rivier" <paul.r.ml@gmail.com>
Date: Tue, 24 Apr 2012 15:56:59 +0200
Subject: [PATCH] textile reader improvements : better conformance to RedCloth
 Textile inlines

---
 src/Text/Pandoc/Parsing.hs         |   5 ++
 src/Text/Pandoc/Readers/Textile.hs | 118 ++++++++++++++++-------------
 tests/textile-reader.native        |   6 +-
 tests/textile-reader.textile       |   5 +-
 4 files changed, 78 insertions(+), 56 deletions(-)

diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index 22a8d4d50..140b96cfa 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -52,6 +52,7 @@ module Text.Pandoc.Parsing ( (>>~),
                              failUnlessLHS,
                              escaped,
                              characterReference,
+                             updateLastStrPos,
                              anyOrderedListMarker,
                              orderedListMarker,
                              charRef,
@@ -786,6 +787,10 @@ charOrRef cs =
                        guard (c `elem` cs)
                        return c)
 
+updateLastStrPos :: GenParser Char ParserState ()
+updateLastStrPos = getPosition >>= \p -> 
+  updateState $ \s -> s{ stateLastStrPos = Just p }
+
 singleQuoteStart :: GenParser Char ParserState ()
 singleQuoteStart = do
   failIfInQuoteContext InSingleQuote
diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs
index 796f96e06..f9221ef9a 100644
--- a/src/Text/Pandoc/Readers/Textile.hs
+++ b/src/Text/Pandoc/Readers/Textile.hs
@@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
 {- |
    Module      : Text.Pandoc.Readers.Textile
-   Copyright   : Copyright (C) 2010-2011 Paul Rivier and John MacFarlane
+   Copyright   : Copyright (C) 2010-2012 Paul Rivier and John MacFarlane
    License     : GNU GPL, version 2 or above 
 
    Maintainer  : Paul Rivier <paul*rivier#demotera*com>
@@ -62,7 +62,7 @@ import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag )
 import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock )
 import Text.ParserCombinators.Parsec
 import Text.HTML.TagSoup.Match
-import Data.Char ( digitToInt, isLetter )
+import Data.Char ( digitToInt, isUpper )
 import Control.Monad ( guard, liftM )
 import Control.Applicative ((<$>), (*>), (<*))
 
@@ -74,14 +74,6 @@ readTextile state s =
   (readWith parseTextile) state{ stateOldDashes = True } (s ++ "\n\n")
 
 
---
--- Constants and data structure definitions
---
-
--- | Special chars border strings parsing
-specialChars :: [Char]
-specialChars = "\\[]<>*#_@~-+^&,.;:!?|\"'%()="
-
 -- | Generate a Pandoc ADT from a textile document
 parseTextile :: GenParser Char ParserState Pandoc
 parseTextile = do
@@ -360,14 +352,8 @@ inlineParsers = [ autoLink
                 , rawHtmlInline
                 , rawLaTeXInline'
                 , note
-                , simpleInline (string "??") (Cite [])
-                , simpleInline (string "**") Strong
-                , simpleInline (string "__") Emph
-                , simpleInline (char '*') Strong
-                , simpleInline (char '_') Emph
-                , simpleInline (char '-') Strikeout
-                , simpleInline (char '^') Superscript
-                , simpleInline (char '~') Subscript
+                , try $ (char '[' *> inlineMarkup <* char ']')
+                , inlineMarkup
                 , link
                 , image
                 , mark
@@ -375,6 +361,18 @@ inlineParsers = [ autoLink
                 , symbol
                 ]
 
+-- | Inline markups
+inlineMarkup :: GenParser Char ParserState Inline
+inlineMarkup = choice [ simpleInline (string "??") (Cite [])
+                      , simpleInline (string "**") Strong
+                      , simpleInline (string "__") Emph
+                      , simpleInline (char '*') Strong
+                      , simpleInline (char '_') Emph
+                      , simpleInline (char '-') Strikeout
+                      , simpleInline (char '^') Superscript
+                      , simpleInline (char '~') Subscript
+                      ]
+
 -- | Trademark, registered, copyright
 mark :: GenParser Char st Inline
 mark = try $ char '(' >> (try tm <|> try reg <|> copy)
@@ -400,33 +398,49 @@ copy = do
 
 note :: GenParser Char ParserState Inline
 note = try $ do
-  char '['
-  ref <- many1 digit
-  char ']'
-  state <- getState
-  let notes = stateNotes state
+  ref <- (char '[' *> many1 digit <* char ']')
+  notes <- stateNotes <$> getState
   case lookup ref notes of
     Nothing   -> fail "note not found"
     Just raw  -> liftM Note $ parseFromString parseBlocks raw
 
+-- | Special chars 
+markupChars :: [Char]
+markupChars = "\\[]*#_@~-+^|%="
+
+-- | Break strings on following chars. Space tab and newline break for
+--  inlines breaking. Open paren breaks for mark. Quote, dash and dot
+--  break for smart punctuation. Punctuation breaks for regular
+--  punctuation. Double quote breaks for named links. > and < break
+--  for inline html.
+stringBreakers :: [Char]
+stringBreakers = " \t\n('-.,:!?;\"<>"
+
+wordBoundaries :: [Char]
+wordBoundaries = markupChars ++ stringBreakers
+
+-- | Parse a hyphened sequence of words
+hyphenedWords :: GenParser Char ParserState String
+hyphenedWords = try $ do
+  hd <- noneOf wordBoundaries
+  tl <- many ( (noneOf wordBoundaries) <|> 
+               try (oneOf markupChars <* lookAhead (noneOf wordBoundaries) ) )
+  let wd = hd:tl
+  option wd $ try $ 
+    (\r -> concat [wd, "-", r]) <$> (char '-' *> hyphenedWords)
+
 -- | Any string
 str :: GenParser Char ParserState Inline
 str = do
-  xs <- many1 (noneOf (specialChars ++ "\t\n "))
-  optional $ try $ do
-    lookAhead (char '(')
-    notFollowedBy' mark
-    getInput >>= setInput . (' ':) -- add space before acronym explanation
-  -- parse a following hyphen if followed by a letter
-  -- (this prevents unwanted interpretation as starting a strikeout section)
-  result <- option xs $ try $ do
-              char '-'
-              next <- lookAhead letter
-              guard $ isLetter (last xs) || isLetter next
-              return $ xs ++ "-"
-  pos <- getPosition
-  updateState $ \s -> s{ stateLastStrPos = Just pos }
-  return $ Str result
+  baseStr <- hyphenedWords
+  -- RedCloth compliance : if parsed word is uppercase and immediatly
+  -- followed by parens, parens content is unconditionally word acronym
+  fullStr <- option baseStr $ try $ do
+    guard $ all isUpper baseStr
+    acro <- enclosed (char '(') (char ')') anyChar
+    return $ concat [baseStr, " (", acro, ")"]
+  updateLastStrPos
+  return $ Str fullStr
 
 -- | Textile allows HTML span infos, we discard them
 htmlSpan :: GenParser Char ParserState Inline
@@ -477,34 +491,36 @@ image = try $ do
 escapedInline :: GenParser Char ParserState Inline
 escapedInline = escapedEqs <|> escapedTag
 
--- | literal text escaped between == ... ==
 escapedEqs :: GenParser Char ParserState Inline
-escapedEqs = try $ do
-  string "=="
-  contents <- manyTill anyChar (try $ string "==")
-  return $ Str contents
+escapedEqs = Str <$> (try $ surrounded (string "==") anyChar)
+
+-- -- | literal text escaped between == ... ==
+-- escapedEqs :: GenParser Char ParserState Inline
+-- escapedEqs = try $ do
+--   string "=="
+--   contents <- manyTill anyChar (try $ string "==")
+--   return $ Str contents
 
 -- | literal text escaped btw <notextile> tags
 escapedTag :: GenParser Char ParserState Inline
-escapedTag = try $ Str <$> ( string "<notextile>" *> 
-                             manyTill anyChar (try $ string "</notextile>") )
+escapedTag = try $ Str <$>
+  enclosed (string "<notextile>") (string "</notextile>") anyChar
 
--- | Any special symbol defined in specialChars
+-- | Any special symbol defined in wordBoundaries
 symbol :: GenParser Char ParserState Inline
-symbol = Str . singleton <$> oneOf specialChars
+symbol = Str . singleton <$> oneOf wordBoundaries
 
 -- | Inline code
 code :: GenParser Char ParserState Inline
 code = code1 <|> code2
 
 code1 :: GenParser Char ParserState Inline
-code1 = surrounded (char '@') anyChar >>= return . Code nullAttr
+code1 = Code nullAttr <$> surrounded (char '@') anyChar
 
 code2 :: GenParser Char ParserState Inline
 code2 = do
   htmlTag (tagOpen (=="tt") null)
-  result' <- manyTill anyChar (try $ htmlTag $ tagClose (=="tt"))
-  return $ Code nullAttr result'
+  Code nullAttr <$> manyTill anyChar (try $ htmlTag $ tagClose (=="tt"))
 
 -- | Html / CSS attributes
 attributes :: GenParser Char ParserState String
@@ -528,4 +544,4 @@ simpleInline border construct = surrounded border (inlineWithAttribute) >>=
 
 -- | Create a singleton list
 singleton :: a -> [a]
-singleton x = [x]
\ No newline at end of file
+singleton x = [x]
diff --git a/tests/textile-reader.native b/tests/textile-reader.native
index a40e07ae9..d9fbc4672 100644
--- a/tests/textile-reader.native
+++ b/tests/textile-reader.native
@@ -67,9 +67,9 @@ Pandoc (Meta {docTitle = [], docAuthors = [], docDate = []})
  ,([Str "beer"],
    [[Plain [Str "fresh",Space,Str "and",Space,Str "bitter"]]])]
 ,Header 1 [Str "Inline",Space,Str "Markup"]
-,Para [Str "This",Space,Str "is",Space,Emph [Str "emphasized"],Str ",",Space,Str "and",Space,Str "so",Space,Emph [Str "is",Space,Str "this"],Str ".",LineBreak,Str "This",Space,Str "is",Space,Strong [Str "strong"],Str ",",Space,Str "and",Space,Str "so",Space,Strong [Str "is",Space,Str "this"],Str ".",LineBreak,Str "A",Space,Link [Strong [Str "strong",Space,Str "link"]] ("http://www.foobar.com",""),Str "."]
+,Para [Str "This",Space,Str "is",Space,Emph [Str "emphasized"],Str ",",Space,Str "and",Space,Str "so",Space,Emph [Str "is",Space,Str "this"],Str ".",LineBreak,Str "This",Space,Str "is",Space,Strong [Str "strong"],Str ",",Space,Str "and",Space,Str "so",Space,Strong [Str "is",Space,Str "this"],Str ".",LineBreak,Str "Hyphenated-words-are-ok",Str ",",Space,Str "as",Space,Str "well",Space,Str "as",Space,Str "strange_underscore_notation",Str ".",LineBreak,Str "A",Space,Link [Strong [Str "strong",Space,Str "link"]] ("http://www.foobar.com",""),Str "."]
 ,Para [Emph [Strong [Str "This",Space,Str "is",Space,Str "strong",Space,Str "and",Space,Str "em",Str "."]],LineBreak,Str "So",Space,Str "is",Space,Strong [Emph [Str "this"]],Space,Str "word",Space,Str "and",Space,Emph [Strong [Str "that",Space,Str "one"]],Str ".",LineBreak,Strikeout [Str "This",Space,Str "is",Space,Str "strikeout",Space,Str "and",Space,Strong [Str "strong"]]]
-,Para [Str "Superscripts",Str ":",Space,Str "a",Superscript [Str "bc"],Str "d",Space,Str "a",Superscript [Strong [Str "hello"]],Space,Str "a",Superscript [Str "hello",Space,Str "there"],Str ".",LineBreak,Str "Subscripts",Str ":",Space,Str "H",Subscript [Str "2"],Str "O",Str ",",Space,Str "H",Subscript [Str "23"],Str "O",Str ",",Space,Str "H",Subscript [Str "many",Space,Str "of",Space,Str "them"],Str "O",Str "."]
+,Para [Str "Superscripts",Str ":",Space,Str "a",Superscript [Str "bc"],Str "d",Space,Str "a",Superscript [Strong [Str "hello"]],Space,Str "a",Superscript [Str "hello",Space,Str "there"],Str ".",LineBreak,Str "Subscripts",Str ":",Space,Subscript [Str "here"],Space,Str "H",Subscript [Str "2"],Str "O",Str ",",Space,Str "H",Subscript [Str "23"],Str "O",Str ",",Space,Str "H",Subscript [Str "many",Space,Str "of",Space,Str "them"],Str "O",Str "."]
 ,Para [Str "Dashes",Space,Str ":",Space,Str "How",Space,Str "cool",Space,Str "\8212",Space,Str "automatic",Space,Str "dashes",Str "."]
 ,Para [Str "Elipses",Space,Str ":",Space,Str "He",Space,Str "thought",Space,Str "and",Space,Str "thought",Space,Str "\8230",Space,Str "and",Space,Str "then",Space,Str "thought",Space,Str "some",Space,Str "more",Str "."]
 ,Para [Str "Quotes",Space,Str "and",Space,Str "apostrophes",Space,Str ":",Space,Quoted DoubleQuote [Str "I",Str "\8217",Str "d",Space,Str "like",Space,Str "to",Space,Str "thank",Space,Str "you"],Space,Str "for",Space,Str "example",Str "."]
@@ -144,7 +144,7 @@ Pandoc (Meta {docTitle = [], docAuthors = [], docDate = []})
 ,RawBlock "latex" "\\begin{itemize}\n  \\item one\n  \\item two\n\\end{itemize}"
 ,Para [Str "and",Space,Str "for",Space,RawInline "latex" "\\emph{inlines}",Str "."]
 ,Header 1 [Str "Acronyms",Space,Str "and",Space,Str "marks"]
-,Para [Str "PBS",Space,Str "(",Str "Public",Space,Str "Broadcasting",Space,Str "System",Str ")"]
+,Para [Str "PBS (Public Broadcasting System)"]
 ,Para [Str "Hi",Str "\8482"]
 ,Para [Str "Hi",Space,Str "\8482"]
 ,Para [Str "\174",Space,Str "Hi",Str "\174"]
diff --git a/tests/textile-reader.textile b/tests/textile-reader.textile
index cf165e1bc..c6450fdfb 100644
--- a/tests/textile-reader.textile
+++ b/tests/textile-reader.textile
@@ -115,14 +115,15 @@ h1. Inline Markup
 
 This is _emphasized_, and so __is this__.
 This is *strong*, and so **is this**.
+Hyphenated-words-are-ok, as well as strange_underscore_notation.
 A "*strong link*":http://www.foobar.com.
 
 _*This is strong and em.*_
 So is *_this_* word and __**that one**__.
 -This is strikeout and *strong*-
 
-Superscripts:  a^bc^d a^*hello*^ a^hello there^.
-Subscripts: H~2~O, H~23~O, H~many of them~O.
+Superscripts: a[^bc^]d a^*hello*^ a[^hello there^].
+Subscripts: ~here~ H[~2~]O, H[~23~]O, H[~many of them~]O.
 
 Dashes : How cool -- automatic dashes.