Merge pull request #485 from paul-r-ml/master

Textile reader update
2012-04-27 15:00:02 -07:00 · 2012-04-27 15:00:02 -07:00 · 4b77def061
commit 4b77def061
parent 77125ea68c 7b111542c0
5 changed files with 162 additions and 116 deletions
--- a/22
+++ b/22
@ -130,3 +130,25 @@ This is essentially what the binary installer does.
 [blaze-html]: http://hackage.haskell.org/package/blaze-html
 [Cabal User's Guide]: http://www.haskell.org/cabal/release/latest/doc/users-guide/builders.html#setup-configure-paths

+
+Running tests
+-------------
+
+Pandoc comes with an automated test suite integrated to cabal. Data
+files are located under the 'tests' directory. If you implement a new
+feature, please update them to improve covering, and make sure by any
+necessary mean that the new reference native file is 100% correct.
+
+Also, tests require templates that leave in a separate git repository,
+tied into the main one as a git submodule. To populate 'template'
+directory, you must therefore run first :
+
+    git submodule update --init templates
+
+You are now ready to build tests :
+
+    cabal-dev install -ftests
+
+And finally run them !
+
+    cabal-dev test
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@ -52,6 +52,7 @@ module Text.Pandoc.Parsing ( (>>~),
                             failUnlessLHS,
                             escaped,
                             characterReference,
+                             updateLastStrPos,
                             anyOrderedListMarker,
                             orderedListMarker,
                             charRef,
@ -786,6 +787,10 @@ charOrRef cs =
                       guard (c `elem` cs)
                       return c)

+updateLastStrPos :: GenParser Char ParserState ()
+updateLastStrPos = getPosition >>= \p -> 
+  updateState $ \s -> s{ stateLastStrPos = Just p }
+
 singleQuoteStart :: GenParser Char ParserState ()
 singleQuoteStart = do
  failIfInQuoteContext InSingleQuote
--- a/src/Text/Pandoc/Readers/Textile.hs
+++ b/src/Text/Pandoc/Readers/Textile.hs
@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

 {- |
   Module      : Text.Pandoc.Readers.Textile
-   Copyright   : Copyright (C) 2010-2011 Paul Rivier and John MacFarlane
+   Copyright   : Copyright (C) 2010-2012 Paul Rivier and John MacFarlane
   License     : GNU GPL, version 2 or above 

   Maintainer  : Paul Rivier <paul*rivier#demotera*com>
@ -59,10 +59,12 @@ import Text.Pandoc.Definition
 import Text.Pandoc.Shared 
 import Text.Pandoc.Parsing
 import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag )
+import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock )
 import Text.ParserCombinators.Parsec
 import Text.HTML.TagSoup.Match
-import Data.Char ( digitToInt, isLetter )
+import Data.Char ( digitToInt, isUpper )
 import Control.Monad ( guard, liftM )
+import Control.Applicative ((<$>), (*>), (<*))

 -- | Parse a Textile text and return a Pandoc document.
 readTextile :: ParserState -- ^ Parser state, including options for parser
@ -72,14 +74,6 @@ readTextile state s =
  (readWith parseTextile) state{ stateOldDashes = True } (s ++ "\n\n")


--
-- Constants and data structure definitions
--
-
-- | Special chars border strings parsing
-specialChars :: [Char]
-specialChars = "\\[]<>*#_@~-+^&,.;:!?|\"'%()="
-
 -- | Generate a Pandoc ADT from a textile document
 parseTextile :: GenParser Char ParserState Pandoc
 parseTextile = do
@ -128,6 +122,7 @@ blockParsers = [ codeBlock
               , hrule
               , anyList
               , rawHtmlBlock
+               , rawLaTeXBlock'
               , maybeExplicitBlock "table" table
               , maybeExplicitBlock "p" para
               , nullBlock ]
@ -164,21 +159,16 @@ codeBlockPre = try $ do
 header :: GenParser Char ParserState Block
 header = try $ do
  char 'h'
-  level <- oneOf "123456" >>= return . digitToInt
-  optional attributes
-  char '.'
-  whitespace
-  name <- manyTill inline blockBreak
-  return $ Header level (normalizeSpaces name)
+  level <- digitToInt <$> oneOf "123456"
+  optional attributes >> char '.' >> whitespace
+  name <- normalizeSpaces <$> manyTill inline blockBreak
+  return $ Header level name

 -- | Blockquote of the form "bq. content"
 blockQuote :: GenParser Char ParserState Block
 blockQuote = try $ do
-  string "bq"
-  optional attributes
-  char '.'
-  whitespace
-  para >>= return . BlockQuote . (:[])
+  string "bq" >> optional attributes >> char '.' >> whitespace
+  BlockQuote . singleton <$> para

 -- Horizontal rule

@ -198,10 +188,7 @@ hrule = try $ do
 -- strict in the nesting, sublist must start at exactly "parent depth
 -- plus one"
 anyList :: GenParser Char ParserState Block
-anyList = try $ do
-  l <- anyListAtDepth 1
-  blanklines
-  return l
+anyList = try $ ( (anyListAtDepth 1) <* blanklines )

 -- | This allow one type of list to be nested into an other type,
 -- provided correct nesting
@ -212,20 +199,12 @@ anyListAtDepth depth = choice [ bulletListAtDepth depth,

 -- | Bullet List of given depth, depth being the number of leading '*'
 bulletListAtDepth :: Int -> GenParser Char ParserState Block
-bulletListAtDepth depth = try $ do
-  items <- many1 (bulletListItemAtDepth depth)
-  return (BulletList items)
+bulletListAtDepth depth = try $ BulletList <$> many1 (bulletListItemAtDepth depth)

 -- | Bullet List Item of given depth, depth being the number of
 -- leading '*'
 bulletListItemAtDepth :: Int -> GenParser Char ParserState [Block]
-bulletListItemAtDepth depth = try $ do
-  count depth (char '*')
-  optional attributes
-  whitespace
-  p <- inlines >>= return . Plain
-  sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[]))
-  return (p:sublist)
+bulletListItemAtDepth = genericListItemAtDepth '*'

 -- | Ordered List of given depth, depth being the number of
 -- leading '#'
@ -237,19 +216,19 @@ orderedListAtDepth depth = try $ do
 -- | Ordered List Item of given depth, depth being the number of
 -- leading '#'
 orderedListItemAtDepth :: Int -> GenParser Char ParserState [Block]
-orderedListItemAtDepth depth = try $ do
-  count depth (char '#')
-  optional attributes
-  whitespace
-  p <- inlines >>= return . Plain
-  sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[]))
-  return (p:sublist)
+orderedListItemAtDepth = genericListItemAtDepth '#'
+
+-- | Common implementation of list items
+genericListItemAtDepth :: Char -> Int -> GenParser Char ParserState [Block]
+genericListItemAtDepth c depth = try $ do
+  count depth (char c) >> optional attributes >> whitespace
+  p <- inlines
+  sublist <- option [] (singleton <$> anyListAtDepth (depth + 1))
+  return ((Plain p):sublist)

 -- | A definition list is a set of consecutive definition items
 definitionList :: GenParser Char ParserState Block  
-definitionList = try $ do
-  items <- many1 definitionListItem
-  return $ DefinitionList items
+definitionList = try $ DefinitionList <$> many1 definitionListItem
  
 -- | A definition list item in textile begins with '- ', followed by
 -- the term defined, then spaces and ":=". The definition follows, on
@ -277,6 +256,8 @@ blockBreak :: GenParser Char ParserState ()
 blockBreak = try (newline >> blanklines >> return ()) <|>
              (lookAhead rawHtmlBlock >> return ())

+-- raw content
+
 -- | A raw Html Block, optionally followed by blanklines
 rawHtmlBlock :: GenParser Char ParserState Block
 rawHtmlBlock = try $ do
@ -284,11 +265,16 @@ rawHtmlBlock = try $ do
  optional blanklines
  return $ RawBlock "html" b

+-- | Raw block of LaTeX content
+rawLaTeXBlock' :: GenParser Char ParserState Block
+rawLaTeXBlock' = do
+  failIfStrict
+  RawBlock "latex" <$> (rawLaTeXBlock <* spaces)
+
+
 -- | In textile, paragraphs are separated by blank lines.
 para :: GenParser Char ParserState Block
-para = try $ do
-  content <- manyTill inline blockBreak
-  return $ Para $ normalizeSpaces content
+para = try $ Para . normalizeSpaces <$> manyTill inline blockBreak


 -- Tables
@ -302,11 +288,7 @@ tableCell = do

 -- | A table row is made of many table cells
 tableRow :: GenParser Char ParserState [TableCell]
-tableRow = try $ do
-  char '|'
-  cells <- endBy1 tableCell (char '|')
-  newline
-  return cells
+tableRow = try $ ( char '|' *> (endBy1 tableCell (char '|')) <* newline)

 -- | Many table rows
 tableRows :: GenParser Char ParserState [[TableCell]]
@ -314,13 +296,8 @@ tableRows = many1 tableRow

 -- | Table headers are made of cells separated by a tag "|_."
 tableHeaders :: GenParser Char ParserState [TableCell]
-tableHeaders = try $ do
-  let separator = (try $ string "|_.")
-  separator
-  headers <- sepBy1 tableCell separator
-  char '|'
-  newline
-  return headers
+tableHeaders = let separator = (try $ string "|_.") in
+  try $ ( separator *> (sepBy1 tableCell separator) <* char '|' <* newline )
  
 -- | A table with an optional header. Current implementation can
 -- handle tables with and without header, but will parse cells
@ -373,15 +350,10 @@ inlineParsers = [ autoLink
                , escapedInline
                , htmlSpan
                , rawHtmlInline
+                , rawLaTeXInline'
                , note
-                , simpleInline (string "??") (Cite [])
-                , simpleInline (string "**") Strong
-                , simpleInline (string "__") Emph
-                , simpleInline (char '*') Strong
-                , simpleInline (char '_') Emph
-                , simpleInline (char '-') Strikeout
-                , simpleInline (char '^') Superscript
-                , simpleInline (char '~') Subscript
+                , try $ (char '[' *> inlineMarkup <* char ']')
+                , inlineMarkup
                , link
                , image
                , mark
@ -389,6 +361,18 @@ inlineParsers = [ autoLink
                , symbol
                ]

+-- | Inline markups
+inlineMarkup :: GenParser Char ParserState Inline
+inlineMarkup = choice [ simpleInline (string "??") (Cite [])
+                      , simpleInline (string "**") Strong
+                      , simpleInline (string "__") Emph
+                      , simpleInline (char '*') Strong
+                      , simpleInline (char '_') Emph
+                      , simpleInline (char '-') Strikeout
+                      , simpleInline (char '^') Superscript
+                      , simpleInline (char '~') Subscript
+                      ]
+
 -- | Trademark, registered, copyright
 mark :: GenParser Char st Inline
 mark = try $ char '(' >> (try tm <|> try reg <|> copy)
@ -414,41 +398,53 @@ copy = do

 note :: GenParser Char ParserState Inline
 note = try $ do
-  char '['
-  ref <- many1 digit
-  char ']'
-  state <- getState
-  let notes = stateNotes state
+  ref <- (char '[' *> many1 digit <* char ']')
+  notes <- stateNotes <$> getState
  case lookup ref notes of
    Nothing   -> fail "note not found"
    Just raw  -> liftM Note $ parseFromString parseBlocks raw

+-- | Special chars 
+markupChars :: [Char]
+markupChars = "\\[]*#_@~-+^|%="
+
+-- | Break strings on following chars. Space tab and newline break for
+--  inlines breaking. Open paren breaks for mark. Quote, dash and dot
+--  break for smart punctuation. Punctuation breaks for regular
+--  punctuation. Double quote breaks for named links. > and < break
+--  for inline html.
+stringBreakers :: [Char]
+stringBreakers = " \t\n('-.,:!?;\"<>"
+
+wordBoundaries :: [Char]
+wordBoundaries = markupChars ++ stringBreakers
+
+-- | Parse a hyphened sequence of words
+hyphenedWords :: GenParser Char ParserState String
+hyphenedWords = try $ do
+  hd <- noneOf wordBoundaries
+  tl <- many ( (noneOf wordBoundaries) <|> 
+               try (oneOf markupChars <* lookAhead (noneOf wordBoundaries) ) )
+  let wd = hd:tl
+  option wd $ try $ 
+    (\r -> concat [wd, "-", r]) <$> (char '-' *> hyphenedWords)
+
 -- | Any string
 str :: GenParser Char ParserState Inline
 str = do
-  xs <- many1 (noneOf (specialChars ++ "\t\n "))
-  optional $ try $ do
-    lookAhead (char '(')
-    notFollowedBy' mark
-    getInput >>= setInput . (' ':) -- add space before acronym explanation
-  -- parse a following hyphen if followed by a letter
-  -- (this prevents unwanted interpretation as starting a strikeout section)
-  result <- option xs $ try $ do
-              char '-'
-              next <- lookAhead letter
-              guard $ isLetter (last xs) || isLetter next
-              return $ xs ++ "-"
-  pos <- getPosition
-  updateState $ \s -> s{ stateLastStrPos = Just pos }
-  return $ Str result
+  baseStr <- hyphenedWords
+  -- RedCloth compliance : if parsed word is uppercase and immediatly
+  -- followed by parens, parens content is unconditionally word acronym
+  fullStr <- option baseStr $ try $ do
+    guard $ all isUpper baseStr
+    acro <- enclosed (char '(') (char ')') anyChar
+    return $ concat [baseStr, " (", acro, ")"]
+  updateLastStrPos
+  return $ Str fullStr

 -- | Textile allows HTML span infos, we discard them
 htmlSpan :: GenParser Char ParserState Inline
-htmlSpan = try $ do
-  char '%'
-  _ <- attributes
-  content <- manyTill anyChar (char '%')
-  return $ Str content
+htmlSpan = try $ Str <$> ( char '%' *> attributes *> manyTill anyChar (char '%') )

 -- | Some number of space chars
 whitespace :: GenParser Char ParserState Inline
@ -461,8 +457,13 @@ endline = try $ do
  return LineBreak

 rawHtmlInline :: GenParser Char ParserState Inline
-rawHtmlInline = liftM (RawInline "html" . snd)
-                $ htmlTag isInlineTag
+rawHtmlInline = RawInline "html" . snd <$> htmlTag isInlineTag
+                
+-- | Raw LaTeX Inline 
+rawLaTeXInline' :: GenParser Char ParserState Inline
+rawLaTeXInline' = try $ do
+  failIfStrict
+  rawLaTeXInline

 -- | Textile standard link syntax is "label":target
 link :: GenParser Char ParserState Inline
@ -490,38 +491,36 @@ image = try $ do
 escapedInline :: GenParser Char ParserState Inline
 escapedInline = escapedEqs <|> escapedTag

-- | literal text escaped between == ... ==
 escapedEqs :: GenParser Char ParserState Inline
-escapedEqs = try $ do
-  string "=="
-  contents <- manyTill anyChar (try $ string "==")
-  return $ Str contents
+escapedEqs = Str <$> (try $ surrounded (string "==") anyChar)
+
+-- -- | literal text escaped between == ... ==
+-- escapedEqs :: GenParser Char ParserState Inline
+-- escapedEqs = try $ do
+--   string "=="
+--   contents <- manyTill anyChar (try $ string "==")
+--   return $ Str contents

 -- | literal text escaped btw <notextile> tags
 escapedTag :: GenParser Char ParserState Inline
-escapedTag = try $ do
-  string "<notextile>"
-  contents <- manyTill anyChar (try $ string "</notextile>")
-  return $ Str contents
+escapedTag = try $ Str <$>
+  enclosed (string "<notextile>") (string "</notextile>") anyChar

-- | Any special symbol defined in specialChars
+-- | Any special symbol defined in wordBoundaries
 symbol :: GenParser Char ParserState Inline
-symbol = do
-  result <- oneOf specialChars
-  return $ Str [result]
+symbol = Str . singleton <$> oneOf wordBoundaries

 -- | Inline code
 code :: GenParser Char ParserState Inline
 code = code1 <|> code2

 code1 :: GenParser Char ParserState Inline
-code1 = surrounded (char '@') anyChar >>= return . Code nullAttr
+code1 = Code nullAttr <$> surrounded (char '@') anyChar

 code2 :: GenParser Char ParserState Inline
 code2 = do
  htmlTag (tagOpen (=="tt") null)
-  result' <- manyTill anyChar (try $ htmlTag $ tagClose (=="tt"))
-  return $ Code nullAttr result'
+  Code nullAttr <$> manyTill anyChar (try $ htmlTag $ tagClose (=="tt"))

 -- | Html / CSS attributes
 attributes :: GenParser Char ParserState String
@ -542,3 +541,7 @@ simpleInline :: GenParser Char ParserState t           -- ^ surrounding parser
 simpleInline border construct = surrounded border (inlineWithAttribute) >>=
                                return . construct . normalizeSpaces
  where inlineWithAttribute = (try $ optional attributes) >> inline
+
+-- | Create a singleton list
+singleton :: a -> [a]
+singleton x = [x]
--- a/tests/textile-reader.native
+++ b/tests/textile-reader.native
@ -67,9 +67,9 @@ Pandoc (Meta {docTitle = [], docAuthors = [], docDate = []})
 ,([Str "beer"],
   [[Plain [Str "fresh",Space,Str "and",Space,Str "bitter"]]])]
 ,Header 1 [Str "Inline",Space,Str "Markup"]
-,Para [Str "This",Space,Str "is",Space,Emph [Str "emphasized"],Str ",",Space,Str "and",Space,Str "so",Space,Emph [Str "is",Space,Str "this"],Str ".",LineBreak,Str "This",Space,Str "is",Space,Strong [Str "strong"],Str ",",Space,Str "and",Space,Str "so",Space,Strong [Str "is",Space,Str "this"],Str ".",LineBreak,Str "A",Space,Link [Strong [Str "strong",Space,Str "link"]] ("http://www.foobar.com",""),Str "."]
+,Para [Str "This",Space,Str "is",Space,Emph [Str "emphasized"],Str ",",Space,Str "and",Space,Str "so",Space,Emph [Str "is",Space,Str "this"],Str ".",LineBreak,Str "This",Space,Str "is",Space,Strong [Str "strong"],Str ",",Space,Str "and",Space,Str "so",Space,Strong [Str "is",Space,Str "this"],Str ".",LineBreak,Str "Hyphenated-words-are-ok",Str ",",Space,Str "as",Space,Str "well",Space,Str "as",Space,Str "strange_underscore_notation",Str ".",LineBreak,Str "A",Space,Link [Strong [Str "strong",Space,Str "link"]] ("http://www.foobar.com",""),Str "."]
 ,Para [Emph [Strong [Str "This",Space,Str "is",Space,Str "strong",Space,Str "and",Space,Str "em",Str "."]],LineBreak,Str "So",Space,Str "is",Space,Strong [Emph [Str "this"]],Space,Str "word",Space,Str "and",Space,Emph [Strong [Str "that",Space,Str "one"]],Str ".",LineBreak,Strikeout [Str "This",Space,Str "is",Space,Str "strikeout",Space,Str "and",Space,Strong [Str "strong"]]]
-,Para [Str "Superscripts",Str ":",Space,Str "a",Superscript [Str "bc"],Str "d",Space,Str "a",Superscript [Strong [Str "hello"]],Space,Str "a",Superscript [Str "hello",Space,Str "there"],Str ".",LineBreak,Str "Subscripts",Str ":",Space,Str "H",Subscript [Str "2"],Str "O",Str ",",Space,Str "H",Subscript [Str "23"],Str "O",Str ",",Space,Str "H",Subscript [Str "many",Space,Str "of",Space,Str "them"],Str "O",Str "."]
+,Para [Str "Superscripts",Str ":",Space,Str "a",Superscript [Str "bc"],Str "d",Space,Str "a",Superscript [Strong [Str "hello"]],Space,Str "a",Superscript [Str "hello",Space,Str "there"],Str ".",LineBreak,Str "Subscripts",Str ":",Space,Subscript [Str "here"],Space,Str "H",Subscript [Str "2"],Str "O",Str ",",Space,Str "H",Subscript [Str "23"],Str "O",Str ",",Space,Str "H",Subscript [Str "many",Space,Str "of",Space,Str "them"],Str "O",Str "."]
 ,Para [Str "Dashes",Space,Str ":",Space,Str "How",Space,Str "cool",Space,Str "\8212",Space,Str "automatic",Space,Str "dashes",Str "."]
 ,Para [Str "Elipses",Space,Str ":",Space,Str "He",Space,Str "thought",Space,Str "and",Space,Str "thought",Space,Str "\8230",Space,Str "and",Space,Str "then",Space,Str "thought",Space,Str "some",Space,Str "more",Str "."]
 ,Para [Str "Quotes",Space,Str "and",Space,Str "apostrophes",Space,Str ":",Space,Quoted DoubleQuote [Str "I",Str "\8217",Str "d",Space,Str "like",Space,Str "to",Space,Str "thank",Space,Str "you"],Space,Str "for",Space,Str "example",Str "."]
@ -139,8 +139,12 @@ Pandoc (Meta {docTitle = [], docAuthors = [], docDate = []})
 [[Plain [Str "this",Space,Str "<",Str "div",Str ">",Space,Str "won",Str "\8217",Str "t",Space,Str "produce",Space,Str "raw",Space,Str "html",Space,Str "blocks",Space,Str "<",Str "/div",Str ">"]]
 ,[Plain [Str "but",Space,Str "this",Space,RawInline "html" "<strong>",Space,Str "will",Space,Str "produce",Space,Str "inline",Space,Str "html",Space,RawInline "html" "</strong>"]]]
 ,Para [Str "Can",Space,Str "you",Space,Str "prove",Space,Str "that",Space,Str "2",Space,Str "<",Space,Str "3",Space,Str "?"]
+,Header 1 [Str "Raw",Space,Str "LaTeX"]
+,Para [Str "This",Space,Str "Textile",Space,Str "reader",Space,Str "also",Space,Str "accepts",Space,Str "raw",Space,Str "LaTeX",Space,Str "for",Space,Str "blocks",Space,Str ":"]
+,RawBlock "latex" "\\begin{itemize}\n  \\item one\n  \\item two\n\\end{itemize}"
+,Para [Str "and",Space,Str "for",Space,RawInline "latex" "\\emph{inlines}",Str "."]
 ,Header 1 [Str "Acronyms",Space,Str "and",Space,Str "marks"]
-,Para [Str "PBS",Space,Str "(",Str "Public",Space,Str "Broadcasting",Space,Str "System",Str ")"]
+,Para [Str "PBS (Public Broadcasting System)"]
 ,Para [Str "Hi",Str "\8482"]
 ,Para [Str "Hi",Space,Str "\8482"]
 ,Para [Str "\174",Space,Str "Hi",Str "\174"]
--- a/tests/textile-reader.textile
+++ b/tests/textile-reader.textile
@ -115,14 +115,15 @@ h1. Inline Markup

 This is _emphasized_, and so __is this__.
 This is *strong*, and so **is this**.
+Hyphenated-words-are-ok, as well as strange_underscore_notation.
 A "*strong link*":http://www.foobar.com.

 _*This is strong and em.*_
 So is *_this_* word and __**that one**__.
 -This is strikeout and *strong*-

-Superscripts:  a^bc^d a^*hello*^ a^hello there^.
-Subscripts: H~2~O, H~23~O, H~many of them~O.
+Superscripts: a[^bc^]d a^*hello*^ a[^hello there^].
+Subscripts: ~here~ H[~2~]O, H[~23~]O, H[~many of them~]O.

 Dashes : How cool -- automatic dashes.

@ -198,6 +199,17 @@ Html blocks can be <div>inlined</div> as well.

 Can you prove that 2 < 3 ?

+h1. Raw LaTeX
+
+This Textile reader also accepts raw LaTeX for blocks :
+
+\begin{itemize}
+  \item one
+  \item two
+\end{itemize}
+
+and for \emph{inlines}.
+
 h1. Acronyms and marks

 PBS(Public Broadcasting System)