From c3866f3c6671300f27ae40927fe6f8a2a85cb28b Mon Sep 17 00:00:00 2001
From: "paul.rivier" <paul.r.ml@gmail.com>
Date: Fri, 3 Dec 2010 17:26:22 +0100
Subject: [PATCH] punctuation handling, and more html-specific handling

---
 src/Text/Pandoc/Readers/Markdown.hs |  3 +-
 src/Text/Pandoc/Readers/Textile.hs  | 38 +++++++++++++---
 tests/textile-reader.native         | 68 ++++++++++++++---------------
 tests/textile-reader.textile        | 28 ++++++------
 4 files changed, 81 insertions(+), 56 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 9fe5c9f06..c2a81ea0b 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -28,7 +28,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 Conversion of markdown-formatted plain text to 'Pandoc' document.
 -}
 module Text.Pandoc.Readers.Markdown ( 
-                                     readMarkdown 
+                                     readMarkdown,
+                                     smartPunctuation
                                     ) where
 
 import Data.List ( transpose, isSuffixOf, sortBy, findIndex, intercalate )
diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs
index aec8b48a8..650060c3d 100644
--- a/src/Text/Pandoc/Readers/Textile.hs
+++ b/src/Text/Pandoc/Readers/Textile.hs
@@ -26,7 +26,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    Portability : portable
 
 Conversion from Textile to 'Pandoc' document, based on the spec
-available at ahttp://redcloth.org/hobix.com/textile/
+available at http://redcloth.org/textile.
 
 Implemented and parsed:
  - Paragraphs
@@ -34,7 +34,7 @@ Implemented and parsed:
  - Lists
  - blockquote
  - Inlines : strong, emph, cite, code, deleted, superscript,
-   subscript, links
+   subscript, links, smart punctuation
 
 Implemented but discarded:
  - HTML-specific and CSS-specific attributes
@@ -42,7 +42,20 @@ Implemented but discarded:
 Left to be implemented:
  - Pandoc Meta Information (title, author, date)
  - footnotes
- - should autolink be shared through Parsing.hs ?
+ - dimension sign
+ - registered, trademark, and copyright symbols
+ - acronyms
+ - uppercase
+ - definition lists
+ - continued blocks (ex bq..)
+ - 
+
+
+
+TODO : refactor common patterns across readers :
+ - autolink
+ - smartPunctuation
+ - more ...
 
 -}
 
@@ -55,6 +68,7 @@ import Text.Pandoc.Shared
 import Text.Pandoc.Parsing
 import Text.Pandoc.Readers.HTML ( htmlTag, htmlEndTag, -- find code blocks
                                   rawHtmlBlock, rawHtmlInline )
+import Text.Pandoc.Readers.Markdown (smartPunctuation)
 import Text.ParserCombinators.Parsec
 import Data.Char ( digitToInt )
 
@@ -71,7 +85,7 @@ readTextile state s = (readWith parseTextile) state (s ++ "\n\n")
 
 -- | Special chars border strings parsing
 specialChars :: [Char]
-specialChars = "\\[]*#_@~<>!?-+^&'\";:|"
+specialChars = "\\[]<>*#_@~-+^&,.;:!?|\"'%"
 
 -- | Generate a Pandoc ADT from a textile document
 parseTextile :: GenParser Char ParserState Pandoc
@@ -275,6 +289,8 @@ inlines = manyTill inline newline
 inlineParsers :: [GenParser Char ParserState Inline]
 inlineParsers = [ autoLink
                 , str
+                , htmlSpan
+                , smartPunctuation -- from markdown reader
                 , whitespace
                 , endline
                 , rawHtmlInline
@@ -296,17 +312,25 @@ inlineParsers = [ autoLink
 str :: GenParser Char ParserState Inline
 str = many1 (noneOf (specialChars ++ "\t\n ")) >>= return . Str
 
+-- | Textile allows HTML span infos, we discard them
+htmlSpan :: GenParser Char ParserState Inline
+htmlSpan = try $ do
+  char '%'
+  _ <- attributes
+  content <- manyTill anyChar (char '%')
+  return $ Str content
+
 -- | Some number of space chars
 whitespace :: GenParser Char ParserState Inline
 whitespace = many1 spaceChar >> return Space <?> "whitespace"
 
--- | In Textile, an endline character that can be treated as a space,
--- not a structural break
+-- | In Textile, an isolated endline character is a line break
 endline :: GenParser Char ParserState Inline
 endline = try $ do
   newline >> notFollowedBy blankline
-  return Space
+  return LineBreak
 
+-- | Textile standard link syntax is label:"target"
 link :: GenParser Char ParserState Inline
 link = try $ do
   name <- surrounded (char '"') inline
diff --git a/tests/textile-reader.native b/tests/textile-reader.native
index 738362b09..f7d88438a 100644
--- a/tests/textile-reader.native
+++ b/tests/textile-reader.native
@@ -1,5 +1,5 @@
 Pandoc (Meta {docTitle = [Str ""], docAuthors = [[Str ""]], docDate = [Str ""]})
-[ Para [Str "This",Space,Str "is",Space,Str "a",Space,Str "set",Space,Str "of",Space,Str "tests",Space,Str "for",Space,Str "pandoc.",Space,Str "Most",Space,Str "of",Space,Str "them",Space,Str "are",Space,Str "adapted",Space,Str "from",Space,Str "John",Space,Str "Gruber",Str "'",Str "s",Space,Str "markdown",Space,Str "test",Space,Str "suite."]
+[ Para [Str "This",Space,Str "is",Space,Str "a",Space,Str "set",Space,Str "of",Space,Str "tests",Space,Str "for",Space,Str "pandoc",Space,Str "Textile",Space,Str "Reader",Str ".",Space,Str "Part",Space,Str "of",Space,Str "it",Space,Str "comes",LineBreak,Str "from",Space,Str "John",Space,Str "Gruber",Str "'",Str "s",Space,Str "markdown",Space,Str "test",Space,Str "suite",Str "."]
 , Para [Strikeout [Str "-"],Str "-",Str "-"]
 , Header 1 [Str "Headers"]
 , Header 2 [Str "Level",Space,Str "2",Space,Str "with",Space,Str "an",Space,Link [Str "embeded",Space,Str "link"] ("http://www.example.com","")]
@@ -8,19 +8,19 @@ Pandoc (Meta {docTitle = [Str ""], docAuthors = [[Str ""]], docDate = [Str ""]})
 , Header 5 [Str "Level",Space,Str "5"]
 , Header 6 [Str "Level",Space,Str "6"]
 , Header 1 [Str "Paragraphs"]
-, Para [Str "Here",Str "'",Str "s",Space,Str "a",Space,Str "regular",Space,Str "paragraph."]
-, Para [Str "Line",Space,Str "break",Space,Str "are",Space,Str "not",Space,Str "paragraph",Space,Str "break",Space,Str "in",Space,Str "textile,",Space,Str "so",Space,Str "you",Space,Str "can",Space,Str "wrap",Space,Str "your",Space,Str "very",Space,Str "long",Space,Str "paragraph",Space,Str "with",Space,Str "your",Space,Str "favourite",Space,Str "text",Space,Str "editor,",Space,Str "it",Space,Str "will",Space,Str "be",Space,Str "rendered",Space,Str "as",Space,Str "a",Space,Str "single",Space,Str "one."]
-, Para [Str "Here",Str "'",Str "s",Space,Str "one",Space,Str "with",Space,Str "a",Space,Str "bullet."]
+, Para [Str "Here",Str "'",Str "s",Space,Str "a",Space,Str "regular",Space,Str "paragraph",Str "."]
+, Para [Str "Line",Space,Str "breaks",Space,Str "are",Space,Str "preserved",Space,Str "in",Space,Str "textile",Str ",",Space,Str "so",Space,Str "you",Space,Str "can",Space,Str "not",Space,Str "wrap",Space,Str "your",Space,Str "very",LineBreak,Str "long",Space,Str "paragraph",Space,Str "with",Space,Str "your",Space,Str "favourite",Space,Str "text",Space,Str "editor",Space,Str "and",Space,Str "have",Space,Str "it",Space,Str "rendered",LineBreak,Str "with",Space,Str "no",Space,Str "break",Str "."]
+, Para [Str "Here",Str "'",Str "s",Space,Str "one",Space,Str "with",Space,Str "a",Space,Str "bullet",Str "."]
 , BulletList
-  [ [ Plain [Str "criminey."] ]
+  [ [ Plain [Str "criminey",Str "."] ]
  ]
-, Para [Str "There",Space,Str "should",Space,Str "be",Space,Str "a",Space,Str "hard",Space,Str "line",Space,Str "break"]
-, Para [Str "here."]
+, Para [Str "There",Space,Str "should",Space,Str "be",Space,Str "a",Space,Str "paragraph",Space,Str "break",Space,Str "between",Space,Str "here"]
+, Para [Str "and",Space,Str "here",Str "."]
 , Header 1 [Str "Block",Space,Str "Quotes"]
 , BlockQuote
-  [ Para [Str "This",Space,Str "is",Space,Str "a",Space,Str "famous",Space,Str "quote",Space,Str "from",Space,Str "somebody.",Space,Str "He",Space,Str "had",Space,Str "a",Space,Str "lot",Space,Str "of",Space,Str "things",Space,Str "to",Space,Str "say,",Space,Str "so",Space,Str "the",Space,Str "text",Space,Str "is",Space,Str "really",Space,Str "really",Space,Str "long",Space,Str "and",Space,Str "spans",Space,Str "on",Space,Str "multiple",Space,Str "lines."] ]
+  [ Para [Str "This",Space,Str "is",Space,Str "a",Space,Str "famous",Space,Str "quote",Space,Str "from",Space,Str "somebody",Str ".",Space,Str "He",Space,Str "had",Space,Str "a",Space,Str "lot",Space,Str "of",Space,Str "things",Space,Str "to",LineBreak,Str "say",Str ",",Space,Str "so",Space,Str "the",Space,Str "text",Space,Str "is",Space,Str "really",Space,Str "really",Space,Str "long",Space,Str "and",Space,Str "spans",Space,Str "on",Space,Str "multiple",Space,Str "lines",Str "."] ]
 
-, Para [Str "And",Space,Str "a",Space,Str "following",Space,Str "paragraph."]
+, Para [Str "And",Space,Str "a",Space,Str "following",Space,Str "paragraph",Str "."]
 , Header 1 [Str "Code",Space,Str "Blocks"]
 , Para [Str "Code",Str ":"]
 , CodeBlock ("",[],[]) "\n    ---- (should be four hyphens)\n\n    sub status {\n        print \"working\";\n    }\n\n    this code block is indented by one tab\n"
@@ -43,24 +43,22 @@ Pandoc (Meta {docTitle = [Str ""], docAuthors = [[Str ""]], docDate = [Str ""]})
 , BulletList
   [ [ Plain [Str "ui",Space,Str "1"]
     , BulletList
-      [ [ Plain [Str "ui",Space,Str "1.1"]
+      [ [ Plain [Str "ui",Space,Str "1",Str ".",Str "1"]
         , OrderedList (1,DefaultStyle,DefaultDelim)
-          [ [ Plain [Str "oi",Space,Str "1.1.1"] ]
-          , [ Plain [Str "oi",Space,Str "1.1.2"] ] ] ], [ Plain [Str "ui",Space,Str "1.2"] ] ] ], [ Plain [Str "ui",Space,Str "2"]
+          [ [ Plain [Str "oi",Space,Str "1",Str ".",Str "1",Str ".",Str "1"] ]
+          , [ Plain [Str "oi",Space,Str "1",Str ".",Str "1",Str ".",Str "2"] ] ] ], [ Plain [Str "ui",Space,Str "1",Str ".",Str "2"] ] ] ], [ Plain [Str "ui",Space,Str "2"]
     , OrderedList (1,DefaultStyle,DefaultDelim)
-      [ [ Plain [Str "oi",Space,Str "2.1"]
+      [ [ Plain [Str "oi",Space,Str "2",Str ".",Str "1"]
         , BulletList
-          [ [ Plain [Str "ui",Space,Str "2.1.1"] ]
-          , [ Plain [Str "ui",Space,Str "2.1.2"] ] ] ] ] ] ]
+          [ [ Plain [Str "ui",Space,Str "2",Str ".",Str "1",Str ".",Str "1"] ]
+          , [ Plain [Str "ui",Space,Str "2",Str ".",Str "1",Str ".",Str "2"] ] ] ] ] ] ]
 , Header 1 [Str "Inline",Space,Str "Markup"]
-, Para [Str "This",Space,Str "is",Space,Emph [Str "emphasized"],Str ",",Space,Str "and",Space,Str "so",Space,Emph [Str "is",Space,Str "this"],Str "."]
-, Para [Str "This",Space,Str "is",Space,Strong [Str "strong"],Str ",",Space,Str "and",Space,Str "so",Space,Strong [Str "is",Space,Str "this"],Str "."]
-, Para [Str "A",Space,Link [Strong [Str "strong",Space,Str "link"]] ("http://www.foobar.com",""),Str "."]
-, Para [Emph [Strong [Str "This",Space,Str "is",Space,Str "strong",Space,Str "and",Space,Str "em."]]]
-, Para [Str "So",Space,Str "is",Space,Strong [Emph [Str "this"]],Space,Str "word",Space,Str "and",Space,Emph [Strong [Str "that",Space,Str "one"]],Str "."]
-, Para [Strikeout [Str "This",Space,Str "is",Space,Str "strikeout",Space,Str "and",Space,Strong [Str "strong"]]]
-, Para [Str "Superscripts",Str ":",Space,Str "a",Superscript [Str "bc"],Str "d",Space,Str "a",Superscript [Strong [Str "hello"]],Space,Str "a",Superscript [Str "hello",Space,Str "there"],Str "."]
-, Para [Str "Subscripts",Str ":",Space,Str "H",Subscript [Str "2"],Str "O,",Space,Str "H",Subscript [Str "23"],Str "O,",Space,Str "H",Subscript [Str "many",Space,Str "of",Space,Str "them"],Str "O."]
+, Para [Str "This",Space,Str "is",Space,Emph [Str "emphasized"],Str ",",Space,Str "and",Space,Str "so",Space,Emph [Str "is",Space,Str "this"],Str ".",LineBreak,Str "This",Space,Str "is",Space,Strong [Str "strong"],Str ",",Space,Str "and",Space,Str "so",Space,Strong [Str "is",Space,Str "this"],Str ".",LineBreak,Str "A",Space,Link [Strong [Str "strong",Space,Str "link"]] ("http://www.foobar.com",""),Str "."]
+, Para [Emph [Strong [Str "This",Space,Str "is",Space,Str "strong",Space,Str "and",Space,Str "em",Str "."]],LineBreak,Str "So",Space,Str "is",Space,Strong [Emph [Str "this"]],Space,Str "word",Space,Str "and",Space,Emph [Strong [Str "that",Space,Str "one"]],Str ".",LineBreak,Strikeout [Str "This",Space,Str "is",Space,Str "strikeout",Space,Str "and",Space,Strong [Str "strong"]]]
+, Para [Str "Superscripts",Str ":",Space,Str "a",Superscript [Str "bc"],Str "d",Space,Str "a",Superscript [Strong [Str "hello"]],Space,Str "a",Superscript [Str "hello",Space,Str "there"],Str ".",LineBreak,Str "Subscripts",Str ":",Space,Str "H",Subscript [Str "2"],Str "O",Str ",",Space,Str "H",Subscript [Str "23"],Str "O",Str ",",Space,Str "H",Subscript [Str "many",Space,Str "of",Space,Str "them"],Str "O",Str "."]
+, Para [Str "Dashes",Space,Str ":",Space,Str "How",Space,Str "cool",Space,Str "-",Str "-",Space,Str "automatic",Space,Str "dashes",Str "."]
+, Para [Str "Elipses",Space,Str ":",Space,Str "He",Space,Str "thought",Space,Str "and",Space,Str "thought",Space,Str ".",Str ".",Str ".",Space,Str "and",Space,Str "then",Space,Str "thought",Space,Str "some",Space,Str "more",Str "."]
+, Para [Str "Quotes",Space,Str "and",Space,Str "apostrophes",Space,Str ":",Space,Str "\"",Str "I",Str "'",Str "d",Space,Str "like",Space,Str "to",Space,Str "thank",Space,Str "you",Str "\"",Space,Str "for",Space,Str "example",Str "."]
 , Header 1 [Str "Links"]
 , Header 2 [Str "Explicit"]
 , Para [Str "Just",Space,Str "a",Space,Link [Str "url"] ("http://www.url.com","")]
@@ -84,7 +82,7 @@ Pandoc (Meta {docTitle = [Str ""], docAuthors = [[Str ""]], docDate = [Str ""]})
   [ [ Plain [Str "bella"] ]
   , [ Plain [Str "45"] ]
   , [ Plain [Str "f"] ] ] ]
-, Para [Str "and",Space,Str "some",Space,Str "text",Space,Str "following",Space,Str "..."]
+, Para [Str "and",Space,Str "some",Space,Str "text",Space,Str "following",Space,Str ".",Str ".",Str "."]
 , Header 2 [Str "With",Space,Str "headers"]
 , Table [] [AlignDefault,AlignDefault,AlignDefault] [0.0,0.0,0.0]
   [ [ Plain [Str "name"] ]
@@ -100,11 +98,11 @@ Pandoc (Meta {docTitle = [Str ""], docAuthors = [[Str ""]], docDate = [Str ""]})
   , [ Plain [Str "45"] ]
   , [ Plain [Str "f"] ] ] ]
 , Header 1 [Str "Images"]
-, Para [Str "Textile",Space,Str "inline",Space,Str "image",Space,Str "syntax,",Space,Str "like",Space,Str "here",Space,Image [Str "this is the alt text"] ("this_is_an_image.png","this is the alt text"),Space,Str "and",Space,Str "here",Space,Image [Str ""] ("this_is_an_image.png",""),Str "."]
+, Para [Str "Textile",Space,Str "inline",Space,Str "image",Space,Str "syntax",Str ",",Space,Str "like",Space,LineBreak,Str "here",Space,Image [Str "this is the alt text"] ("this_is_an_image.png","this is the alt text"),LineBreak,Str "and",Space,Str "here",Space,Image [Str ""] ("this_is_an_image.png",""),Str "."]
 , Header 1 [Str "Attributes"]
 , Header 2 [Str "HTML",Space,Str "and",Space,Str "CSS",Space,Str "attributes",Space,Str "are",Space,Str "ignored"]
-, Para [Str "as",Space,Str "well",Space,Str "as",Space,Strong [Str "inline",Space,Str "attributes"]]
-, Para [Str "and",Space,Str "paragraph",Space,Str "attributes,",Space,Str "and",Space,Str "table",Space,Str "attributes."]
+, Para [Str "as",Space,Str "well",Space,Str "as",Space,Strong [Str "inline",Space,Str "attributes"],Space,Str "of",Space,Str " all kind"]
+, Para [Str "and",Space,Str "paragraph",Space,Str "attributes",Str ",",Space,Str "and",Space,Str "table",Space,Str "attributes",Str "."]
 , Table [] [AlignDefault,AlignDefault,AlignDefault] [0.0,0.0,0.0]
   [ 
  ] [
@@ -115,16 +113,16 @@ Pandoc (Meta {docTitle = [Str ""], docAuthors = [[Str ""]], docDate = [Str ""]})
   , [ Plain [Str "24"] ]
   , [ Plain [Str "f"] ] ] ]
 , Header 1 [Str "Raw",Space,Str "HTML"]
-, Para [Str "However,",Space,HtmlInline "<strong>",Space,Str "raw",Space,Str "HTML",Space,Str "inlines",Space,HtmlInline "</strong>",Space,Str "are",Space,Str "accepted,",Space,Str "as",Space,Str "well",Space,Str "as",Space,Str ":"]
-, RawHtml "<div class=\"foobar\">"
-, Para [Str "any",Space,Strong [Str "Raw",Space,Str "HTML",Space,Str "Block"],Space,Str "with",Space,Str "bold"]
-, RawHtml "</div>"
+, Para [Str "However",Str ",",Space,Str "raw",Space,Str "HTML",Space,Str "inlines",Space,Str "are",Space,Str "accepted",Str ",",Space,Str "as",Space,Str "well",Space,Str "as",Space,Str ":"]
+, Null
+, Para [Str "any",Space,Strong [Str "Raw",Space,Str "HTML",Space,Str "Block"],Space,Str "with",Space,Str "bold",LineBreak]
+, Null
 , Para [Str "Html",Space,Str "blocks",Space,Str "can",Space,Str "be"]
-, RawHtml "<div>"
+, Null
 , Para [Str "inlined"]
-, RawHtml "</div>"
-, Para [Str "as",Space,Str "well."]
+, Null
+, Para [Str "as",Space,Str "well",Str "."]
 , BulletList
   [ [ Plain [Str "this",Space,Str "<",Str "div",Str ">",Space,Str "won",Str "'",Str "t",Space,Str "produce",Space,Str "raw",Space,Str "html",Space,Str "blocks",Space,Str "<",Str "/div",Str ">"] ]
-  , [ Plain [Str "but",Space,Str "this",Space,HtmlInline "<strong>",Space,Str "will",Space,Str "produce",Space,Str "inline",Space,Str "html",Space,HtmlInline "</strong>"] ] ]
+  , [ Plain [Str "but",Space,Str "this",Space,Str "",Space,Str "will",Space,Str "produce",Space,Str "inline",Space,Str "html",Space,Str ""] ] ]
 , Para [Str "Can",Space,Str "you",Space,Str "prove",Space,Str "that",Space,Str "2",Space,Str "<",Space,Str "3",Space,Str "?"] ]
diff --git a/tests/textile-reader.textile b/tests/textile-reader.textile
index 7384abb64..0b65e11bb 100644
--- a/tests/textile-reader.textile
+++ b/tests/textile-reader.textile
@@ -1,5 +1,5 @@
-This is a set of tests for pandoc. Most of them are adapted from John
-Gruber's markdown test suite.
+This is a set of tests for pandoc Textile Reader. Part of it comes
+from John Gruber's markdown test suite.
 
 -----
 
@@ -20,17 +20,18 @@ h1. Paragraphs
 
 Here's a regular paragraph.
 
-Line break are not paragraph break in textile, so you can wrap your
-very long paragraph with your favourite text editor, it will be
-rendered as a single one.
+Line breaks are preserved in textile, so you can not wrap your very
+long paragraph with your favourite text editor and have it rendered
+with no break.
+
 
 Here's one with a bullet.
 
 * criminey.
 
-There should be a hard line break  
+There should be a paragraph break between here
 
-here.
+and here.
 
 h1. Block Quotes
 
@@ -96,21 +97,22 @@ h2. Nested
 h1. Inline Markup
 
 This is _emphasized_, and so __is this__.
-
 This is *strong*, and so **is this**.
-
 A "*strong link*":http://www.foobar.com.
 
 _*This is strong and em.*_
-
 So is *_this_* word and __**that one**__.
-
 -This is strikeout and *strong*-
 
 Superscripts:  a^bc^d a^*hello*^ a^hello there^.
-
 Subscripts: H~2~O, H~23~O, H~many of them~O.
 
+Dashes : How cool -- automatic dashes.
+
+Elipses : He thought and thought ... and then thought some more.
+
+Quotes and apostrophes : "I'd like to thank you" for example.
+
 
 h1. Links
 
@@ -154,7 +156,7 @@ h1. Attributes
 
 h2{color:red}. HTML and CSS attributes are ignored
 
-as well as *(foo)inline attributes*
+as well as *(foo)inline attributes* of %{color:red} all kind%
 
 p{color:green}. and paragraph attributes, and table attributes.