Merge pull request #485 from paul-r-ml/master

Textile reader update
This commit is contained in:
John MacFarlane 2012-04-27 15:00:02 -07:00
commit 4b77def061
5 changed files with 162 additions and 116 deletions

22
INSTALL
View file

@ -130,3 +130,25 @@ This is essentially what the binary installer does.
[blaze-html]: http://hackage.haskell.org/package/blaze-html
[Cabal User's Guide]: http://www.haskell.org/cabal/release/latest/doc/users-guide/builders.html#setup-configure-paths
Running tests
-------------
Pandoc comes with an automated test suite integrated to cabal. Data
files are located under the 'tests' directory. If you implement a new
feature, please update them to improve covering, and make sure by any
necessary mean that the new reference native file is 100% correct.
Also, tests require templates that leave in a separate git repository,
tied into the main one as a git submodule. To populate 'template'
directory, you must therefore run first :
git submodule update --init templates
You are now ready to build tests :
cabal-dev install -ftests
And finally run them !
cabal-dev test

View file

@ -52,6 +52,7 @@ module Text.Pandoc.Parsing ( (>>~),
failUnlessLHS,
escaped,
characterReference,
updateLastStrPos,
anyOrderedListMarker,
orderedListMarker,
charRef,
@ -786,6 +787,10 @@ charOrRef cs =
guard (c `elem` cs)
return c)
updateLastStrPos :: GenParser Char ParserState ()
updateLastStrPos = getPosition >>= \p ->
updateState $ \s -> s{ stateLastStrPos = Just p }
singleQuoteStart :: GenParser Char ParserState ()
singleQuoteStart = do
failIfInQuoteContext InSingleQuote

View file

@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Text.Pandoc.Readers.Textile
Copyright : Copyright (C) 2010-2011 Paul Rivier and John MacFarlane
Copyright : Copyright (C) 2010-2012 Paul Rivier and John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : Paul Rivier <paul*rivier#demotera*com>
@ -59,10 +59,12 @@ import Text.Pandoc.Definition
import Text.Pandoc.Shared
import Text.Pandoc.Parsing
import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag )
import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock )
import Text.ParserCombinators.Parsec
import Text.HTML.TagSoup.Match
import Data.Char ( digitToInt, isLetter )
import Data.Char ( digitToInt, isUpper )
import Control.Monad ( guard, liftM )
import Control.Applicative ((<$>), (*>), (<*))
-- | Parse a Textile text and return a Pandoc document.
readTextile :: ParserState -- ^ Parser state, including options for parser
@ -72,14 +74,6 @@ readTextile state s =
(readWith parseTextile) state{ stateOldDashes = True } (s ++ "\n\n")
--
-- Constants and data structure definitions
--
-- | Special chars border strings parsing
specialChars :: [Char]
specialChars = "\\[]<>*#_@~-+^&,.;:!?|\"'%()="
-- | Generate a Pandoc ADT from a textile document
parseTextile :: GenParser Char ParserState Pandoc
parseTextile = do
@ -128,6 +122,7 @@ blockParsers = [ codeBlock
, hrule
, anyList
, rawHtmlBlock
, rawLaTeXBlock'
, maybeExplicitBlock "table" table
, maybeExplicitBlock "p" para
, nullBlock ]
@ -164,21 +159,16 @@ codeBlockPre = try $ do
header :: GenParser Char ParserState Block
header = try $ do
char 'h'
level <- oneOf "123456" >>= return . digitToInt
optional attributes
char '.'
whitespace
name <- manyTill inline blockBreak
return $ Header level (normalizeSpaces name)
level <- digitToInt <$> oneOf "123456"
optional attributes >> char '.' >> whitespace
name <- normalizeSpaces <$> manyTill inline blockBreak
return $ Header level name
-- | Blockquote of the form "bq. content"
blockQuote :: GenParser Char ParserState Block
blockQuote = try $ do
string "bq"
optional attributes
char '.'
whitespace
para >>= return . BlockQuote . (:[])
string "bq" >> optional attributes >> char '.' >> whitespace
BlockQuote . singleton <$> para
-- Horizontal rule
@ -198,10 +188,7 @@ hrule = try $ do
-- strict in the nesting, sublist must start at exactly "parent depth
-- plus one"
anyList :: GenParser Char ParserState Block
anyList = try $ do
l <- anyListAtDepth 1
blanklines
return l
anyList = try $ ( (anyListAtDepth 1) <* blanklines )
-- | This allow one type of list to be nested into an other type,
-- provided correct nesting
@ -212,20 +199,12 @@ anyListAtDepth depth = choice [ bulletListAtDepth depth,
-- | Bullet List of given depth, depth being the number of leading '*'
bulletListAtDepth :: Int -> GenParser Char ParserState Block
bulletListAtDepth depth = try $ do
items <- many1 (bulletListItemAtDepth depth)
return (BulletList items)
bulletListAtDepth depth = try $ BulletList <$> many1 (bulletListItemAtDepth depth)
-- | Bullet List Item of given depth, depth being the number of
-- leading '*'
bulletListItemAtDepth :: Int -> GenParser Char ParserState [Block]
bulletListItemAtDepth depth = try $ do
count depth (char '*')
optional attributes
whitespace
p <- inlines >>= return . Plain
sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[]))
return (p:sublist)
bulletListItemAtDepth = genericListItemAtDepth '*'
-- | Ordered List of given depth, depth being the number of
-- leading '#'
@ -237,19 +216,19 @@ orderedListAtDepth depth = try $ do
-- | Ordered List Item of given depth, depth being the number of
-- leading '#'
orderedListItemAtDepth :: Int -> GenParser Char ParserState [Block]
orderedListItemAtDepth depth = try $ do
count depth (char '#')
optional attributes
whitespace
p <- inlines >>= return . Plain
sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[]))
return (p:sublist)
orderedListItemAtDepth = genericListItemAtDepth '#'
-- | Common implementation of list items
genericListItemAtDepth :: Char -> Int -> GenParser Char ParserState [Block]
genericListItemAtDepth c depth = try $ do
count depth (char c) >> optional attributes >> whitespace
p <- inlines
sublist <- option [] (singleton <$> anyListAtDepth (depth + 1))
return ((Plain p):sublist)
-- | A definition list is a set of consecutive definition items
definitionList :: GenParser Char ParserState Block
definitionList = try $ do
items <- many1 definitionListItem
return $ DefinitionList items
definitionList = try $ DefinitionList <$> many1 definitionListItem
-- | A definition list item in textile begins with '- ', followed by
-- the term defined, then spaces and ":=". The definition follows, on
@ -277,6 +256,8 @@ blockBreak :: GenParser Char ParserState ()
blockBreak = try (newline >> blanklines >> return ()) <|>
(lookAhead rawHtmlBlock >> return ())
-- raw content
-- | A raw Html Block, optionally followed by blanklines
rawHtmlBlock :: GenParser Char ParserState Block
rawHtmlBlock = try $ do
@ -284,11 +265,16 @@ rawHtmlBlock = try $ do
optional blanklines
return $ RawBlock "html" b
-- | Raw block of LaTeX content
rawLaTeXBlock' :: GenParser Char ParserState Block
rawLaTeXBlock' = do
failIfStrict
RawBlock "latex" <$> (rawLaTeXBlock <* spaces)
-- | In textile, paragraphs are separated by blank lines.
para :: GenParser Char ParserState Block
para = try $ do
content <- manyTill inline blockBreak
return $ Para $ normalizeSpaces content
para = try $ Para . normalizeSpaces <$> manyTill inline blockBreak
-- Tables
@ -302,11 +288,7 @@ tableCell = do
-- | A table row is made of many table cells
tableRow :: GenParser Char ParserState [TableCell]
tableRow = try $ do
char '|'
cells <- endBy1 tableCell (char '|')
newline
return cells
tableRow = try $ ( char '|' *> (endBy1 tableCell (char '|')) <* newline)
-- | Many table rows
tableRows :: GenParser Char ParserState [[TableCell]]
@ -314,13 +296,8 @@ tableRows = many1 tableRow
-- | Table headers are made of cells separated by a tag "|_."
tableHeaders :: GenParser Char ParserState [TableCell]
tableHeaders = try $ do
let separator = (try $ string "|_.")
separator
headers <- sepBy1 tableCell separator
char '|'
newline
return headers
tableHeaders = let separator = (try $ string "|_.") in
try $ ( separator *> (sepBy1 tableCell separator) <* char '|' <* newline )
-- | A table with an optional header. Current implementation can
-- handle tables with and without header, but will parse cells
@ -373,15 +350,10 @@ inlineParsers = [ autoLink
, escapedInline
, htmlSpan
, rawHtmlInline
, rawLaTeXInline'
, note
, simpleInline (string "??") (Cite [])
, simpleInline (string "**") Strong
, simpleInline (string "__") Emph
, simpleInline (char '*') Strong
, simpleInline (char '_') Emph
, simpleInline (char '-') Strikeout
, simpleInline (char '^') Superscript
, simpleInline (char '~') Subscript
, try $ (char '[' *> inlineMarkup <* char ']')
, inlineMarkup
, link
, image
, mark
@ -389,6 +361,18 @@ inlineParsers = [ autoLink
, symbol
]
-- | Inline markups
inlineMarkup :: GenParser Char ParserState Inline
inlineMarkup = choice [ simpleInline (string "??") (Cite [])
, simpleInline (string "**") Strong
, simpleInline (string "__") Emph
, simpleInline (char '*') Strong
, simpleInline (char '_') Emph
, simpleInline (char '-') Strikeout
, simpleInline (char '^') Superscript
, simpleInline (char '~') Subscript
]
-- | Trademark, registered, copyright
mark :: GenParser Char st Inline
mark = try $ char '(' >> (try tm <|> try reg <|> copy)
@ -414,41 +398,53 @@ copy = do
note :: GenParser Char ParserState Inline
note = try $ do
char '['
ref <- many1 digit
char ']'
state <- getState
let notes = stateNotes state
ref <- (char '[' *> many1 digit <* char ']')
notes <- stateNotes <$> getState
case lookup ref notes of
Nothing -> fail "note not found"
Just raw -> liftM Note $ parseFromString parseBlocks raw
-- | Special chars
markupChars :: [Char]
markupChars = "\\[]*#_@~-+^|%="
-- | Break strings on following chars. Space tab and newline break for
-- inlines breaking. Open paren breaks for mark. Quote, dash and dot
-- break for smart punctuation. Punctuation breaks for regular
-- punctuation. Double quote breaks for named links. > and < break
-- for inline html.
stringBreakers :: [Char]
stringBreakers = " \t\n('-.,:!?;\"<>"
wordBoundaries :: [Char]
wordBoundaries = markupChars ++ stringBreakers
-- | Parse a hyphened sequence of words
hyphenedWords :: GenParser Char ParserState String
hyphenedWords = try $ do
hd <- noneOf wordBoundaries
tl <- many ( (noneOf wordBoundaries) <|>
try (oneOf markupChars <* lookAhead (noneOf wordBoundaries) ) )
let wd = hd:tl
option wd $ try $
(\r -> concat [wd, "-", r]) <$> (char '-' *> hyphenedWords)
-- | Any string
str :: GenParser Char ParserState Inline
str = do
xs <- many1 (noneOf (specialChars ++ "\t\n "))
optional $ try $ do
lookAhead (char '(')
notFollowedBy' mark
getInput >>= setInput . (' ':) -- add space before acronym explanation
-- parse a following hyphen if followed by a letter
-- (this prevents unwanted interpretation as starting a strikeout section)
result <- option xs $ try $ do
char '-'
next <- lookAhead letter
guard $ isLetter (last xs) || isLetter next
return $ xs ++ "-"
pos <- getPosition
updateState $ \s -> s{ stateLastStrPos = Just pos }
return $ Str result
baseStr <- hyphenedWords
-- RedCloth compliance : if parsed word is uppercase and immediatly
-- followed by parens, parens content is unconditionally word acronym
fullStr <- option baseStr $ try $ do
guard $ all isUpper baseStr
acro <- enclosed (char '(') (char ')') anyChar
return $ concat [baseStr, " (", acro, ")"]
updateLastStrPos
return $ Str fullStr
-- | Textile allows HTML span infos, we discard them
htmlSpan :: GenParser Char ParserState Inline
htmlSpan = try $ do
char '%'
_ <- attributes
content <- manyTill anyChar (char '%')
return $ Str content
htmlSpan = try $ Str <$> ( char '%' *> attributes *> manyTill anyChar (char '%') )
-- | Some number of space chars
whitespace :: GenParser Char ParserState Inline
@ -461,8 +457,13 @@ endline = try $ do
return LineBreak
rawHtmlInline :: GenParser Char ParserState Inline
rawHtmlInline = liftM (RawInline "html" . snd)
$ htmlTag isInlineTag
rawHtmlInline = RawInline "html" . snd <$> htmlTag isInlineTag
-- | Raw LaTeX Inline
rawLaTeXInline' :: GenParser Char ParserState Inline
rawLaTeXInline' = try $ do
failIfStrict
rawLaTeXInline
-- | Textile standard link syntax is "label":target
link :: GenParser Char ParserState Inline
@ -490,38 +491,36 @@ image = try $ do
escapedInline :: GenParser Char ParserState Inline
escapedInline = escapedEqs <|> escapedTag
-- | literal text escaped between == ... ==
escapedEqs :: GenParser Char ParserState Inline
escapedEqs = try $ do
string "=="
contents <- manyTill anyChar (try $ string "==")
return $ Str contents
escapedEqs = Str <$> (try $ surrounded (string "==") anyChar)
-- -- | literal text escaped between == ... ==
-- escapedEqs :: GenParser Char ParserState Inline
-- escapedEqs = try $ do
-- string "=="
-- contents <- manyTill anyChar (try $ string "==")
-- return $ Str contents
-- | literal text escaped btw <notextile> tags
escapedTag :: GenParser Char ParserState Inline
escapedTag = try $ do
string "<notextile>"
contents <- manyTill anyChar (try $ string "</notextile>")
return $ Str contents
escapedTag = try $ Str <$>
enclosed (string "<notextile>") (string "</notextile>") anyChar
-- | Any special symbol defined in specialChars
-- | Any special symbol defined in wordBoundaries
symbol :: GenParser Char ParserState Inline
symbol = do
result <- oneOf specialChars
return $ Str [result]
symbol = Str . singleton <$> oneOf wordBoundaries
-- | Inline code
code :: GenParser Char ParserState Inline
code = code1 <|> code2
code1 :: GenParser Char ParserState Inline
code1 = surrounded (char '@') anyChar >>= return . Code nullAttr
code1 = Code nullAttr <$> surrounded (char '@') anyChar
code2 :: GenParser Char ParserState Inline
code2 = do
htmlTag (tagOpen (=="tt") null)
result' <- manyTill anyChar (try $ htmlTag $ tagClose (=="tt"))
return $ Code nullAttr result'
Code nullAttr <$> manyTill anyChar (try $ htmlTag $ tagClose (=="tt"))
-- | Html / CSS attributes
attributes :: GenParser Char ParserState String
@ -542,3 +541,7 @@ simpleInline :: GenParser Char ParserState t -- ^ surrounding parser
simpleInline border construct = surrounded border (inlineWithAttribute) >>=
return . construct . normalizeSpaces
where inlineWithAttribute = (try $ optional attributes) >> inline
-- | Create a singleton list
singleton :: a -> [a]
singleton x = [x]

View file

@ -67,9 +67,9 @@ Pandoc (Meta {docTitle = [], docAuthors = [], docDate = []})
,([Str "beer"],
[[Plain [Str "fresh",Space,Str "and",Space,Str "bitter"]]])]
,Header 1 [Str "Inline",Space,Str "Markup"]
,Para [Str "This",Space,Str "is",Space,Emph [Str "emphasized"],Str ",",Space,Str "and",Space,Str "so",Space,Emph [Str "is",Space,Str "this"],Str ".",LineBreak,Str "This",Space,Str "is",Space,Strong [Str "strong"],Str ",",Space,Str "and",Space,Str "so",Space,Strong [Str "is",Space,Str "this"],Str ".",LineBreak,Str "A",Space,Link [Strong [Str "strong",Space,Str "link"]] ("http://www.foobar.com",""),Str "."]
,Para [Str "This",Space,Str "is",Space,Emph [Str "emphasized"],Str ",",Space,Str "and",Space,Str "so",Space,Emph [Str "is",Space,Str "this"],Str ".",LineBreak,Str "This",Space,Str "is",Space,Strong [Str "strong"],Str ",",Space,Str "and",Space,Str "so",Space,Strong [Str "is",Space,Str "this"],Str ".",LineBreak,Str "Hyphenated-words-are-ok",Str ",",Space,Str "as",Space,Str "well",Space,Str "as",Space,Str "strange_underscore_notation",Str ".",LineBreak,Str "A",Space,Link [Strong [Str "strong",Space,Str "link"]] ("http://www.foobar.com",""),Str "."]
,Para [Emph [Strong [Str "This",Space,Str "is",Space,Str "strong",Space,Str "and",Space,Str "em",Str "."]],LineBreak,Str "So",Space,Str "is",Space,Strong [Emph [Str "this"]],Space,Str "word",Space,Str "and",Space,Emph [Strong [Str "that",Space,Str "one"]],Str ".",LineBreak,Strikeout [Str "This",Space,Str "is",Space,Str "strikeout",Space,Str "and",Space,Strong [Str "strong"]]]
,Para [Str "Superscripts",Str ":",Space,Str "a",Superscript [Str "bc"],Str "d",Space,Str "a",Superscript [Strong [Str "hello"]],Space,Str "a",Superscript [Str "hello",Space,Str "there"],Str ".",LineBreak,Str "Subscripts",Str ":",Space,Str "H",Subscript [Str "2"],Str "O",Str ",",Space,Str "H",Subscript [Str "23"],Str "O",Str ",",Space,Str "H",Subscript [Str "many",Space,Str "of",Space,Str "them"],Str "O",Str "."]
,Para [Str "Superscripts",Str ":",Space,Str "a",Superscript [Str "bc"],Str "d",Space,Str "a",Superscript [Strong [Str "hello"]],Space,Str "a",Superscript [Str "hello",Space,Str "there"],Str ".",LineBreak,Str "Subscripts",Str ":",Space,Subscript [Str "here"],Space,Str "H",Subscript [Str "2"],Str "O",Str ",",Space,Str "H",Subscript [Str "23"],Str "O",Str ",",Space,Str "H",Subscript [Str "many",Space,Str "of",Space,Str "them"],Str "O",Str "."]
,Para [Str "Dashes",Space,Str ":",Space,Str "How",Space,Str "cool",Space,Str "\8212",Space,Str "automatic",Space,Str "dashes",Str "."]
,Para [Str "Elipses",Space,Str ":",Space,Str "He",Space,Str "thought",Space,Str "and",Space,Str "thought",Space,Str "\8230",Space,Str "and",Space,Str "then",Space,Str "thought",Space,Str "some",Space,Str "more",Str "."]
,Para [Str "Quotes",Space,Str "and",Space,Str "apostrophes",Space,Str ":",Space,Quoted DoubleQuote [Str "I",Str "\8217",Str "d",Space,Str "like",Space,Str "to",Space,Str "thank",Space,Str "you"],Space,Str "for",Space,Str "example",Str "."]
@ -139,8 +139,12 @@ Pandoc (Meta {docTitle = [], docAuthors = [], docDate = []})
[[Plain [Str "this",Space,Str "<",Str "div",Str ">",Space,Str "won",Str "\8217",Str "t",Space,Str "produce",Space,Str "raw",Space,Str "html",Space,Str "blocks",Space,Str "<",Str "/div",Str ">"]]
,[Plain [Str "but",Space,Str "this",Space,RawInline "html" "<strong>",Space,Str "will",Space,Str "produce",Space,Str "inline",Space,Str "html",Space,RawInline "html" "</strong>"]]]
,Para [Str "Can",Space,Str "you",Space,Str "prove",Space,Str "that",Space,Str "2",Space,Str "<",Space,Str "3",Space,Str "?"]
,Header 1 [Str "Raw",Space,Str "LaTeX"]
,Para [Str "This",Space,Str "Textile",Space,Str "reader",Space,Str "also",Space,Str "accepts",Space,Str "raw",Space,Str "LaTeX",Space,Str "for",Space,Str "blocks",Space,Str ":"]
,RawBlock "latex" "\\begin{itemize}\n \\item one\n \\item two\n\\end{itemize}"
,Para [Str "and",Space,Str "for",Space,RawInline "latex" "\\emph{inlines}",Str "."]
,Header 1 [Str "Acronyms",Space,Str "and",Space,Str "marks"]
,Para [Str "PBS",Space,Str "(",Str "Public",Space,Str "Broadcasting",Space,Str "System",Str ")"]
,Para [Str "PBS (Public Broadcasting System)"]
,Para [Str "Hi",Str "\8482"]
,Para [Str "Hi",Space,Str "\8482"]
,Para [Str "\174",Space,Str "Hi",Str "\174"]

View file

@ -115,14 +115,15 @@ h1. Inline Markup
This is _emphasized_, and so __is this__.
This is *strong*, and so **is this**.
Hyphenated-words-are-ok, as well as strange_underscore_notation.
A "*strong link*":http://www.foobar.com.
_*This is strong and em.*_
So is *_this_* word and __**that one**__.
-This is strikeout and *strong*-
Superscripts: a^bc^d a^*hello*^ a^hello there^.
Subscripts: H~2~O, H~23~O, H~many of them~O.
Superscripts: a[^bc^]d a^*hello*^ a[^hello there^].
Subscripts: ~here~ H[~2~]O, H[~23~]O, H[~many of them~]O.
Dashes : How cool -- automatic dashes.
@ -198,6 +199,17 @@ Html blocks can be <div>inlined</div> as well.
Can you prove that 2 < 3 ?
h1. Raw LaTeX
This Textile reader also accepts raw LaTeX for blocks :
\begin{itemize}
\item one
\item two
\end{itemize}
and for \emph{inlines}.
h1. Acronyms and marks
PBS(Public Broadcasting System)