textile reader improvements : better conformance to RedCloth Textile inlines
This commit is contained in:
parent
411d54ce98
commit
7b111542c0
4 changed files with 78 additions and 56 deletions
|
@ -52,6 +52,7 @@ module Text.Pandoc.Parsing ( (>>~),
|
|||
failUnlessLHS,
|
||||
escaped,
|
||||
characterReference,
|
||||
updateLastStrPos,
|
||||
anyOrderedListMarker,
|
||||
orderedListMarker,
|
||||
charRef,
|
||||
|
@ -786,6 +787,10 @@ charOrRef cs =
|
|||
guard (c `elem` cs)
|
||||
return c)
|
||||
|
||||
updateLastStrPos :: GenParser Char ParserState ()
|
||||
updateLastStrPos = getPosition >>= \p ->
|
||||
updateState $ \s -> s{ stateLastStrPos = Just p }
|
||||
|
||||
singleQuoteStart :: GenParser Char ParserState ()
|
||||
singleQuoteStart = do
|
||||
failIfInQuoteContext InSingleQuote
|
||||
|
|
|
@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|||
|
||||
{- |
|
||||
Module : Text.Pandoc.Readers.Textile
|
||||
Copyright : Copyright (C) 2010-2011 Paul Rivier and John MacFarlane
|
||||
Copyright : Copyright (C) 2010-2012 Paul Rivier and John MacFarlane
|
||||
License : GNU GPL, version 2 or above
|
||||
|
||||
Maintainer : Paul Rivier <paul*rivier#demotera*com>
|
||||
|
@ -62,7 +62,7 @@ import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag )
|
|||
import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock )
|
||||
import Text.ParserCombinators.Parsec
|
||||
import Text.HTML.TagSoup.Match
|
||||
import Data.Char ( digitToInt, isLetter )
|
||||
import Data.Char ( digitToInt, isUpper )
|
||||
import Control.Monad ( guard, liftM )
|
||||
import Control.Applicative ((<$>), (*>), (<*))
|
||||
|
||||
|
@ -74,14 +74,6 @@ readTextile state s =
|
|||
(readWith parseTextile) state{ stateOldDashes = True } (s ++ "\n\n")
|
||||
|
||||
|
||||
--
|
||||
-- Constants and data structure definitions
|
||||
--
|
||||
|
||||
-- | Special chars border strings parsing
|
||||
specialChars :: [Char]
|
||||
specialChars = "\\[]<>*#_@~-+^&,.;:!?|\"'%()="
|
||||
|
||||
-- | Generate a Pandoc ADT from a textile document
|
||||
parseTextile :: GenParser Char ParserState Pandoc
|
||||
parseTextile = do
|
||||
|
@ -360,14 +352,8 @@ inlineParsers = [ autoLink
|
|||
, rawHtmlInline
|
||||
, rawLaTeXInline'
|
||||
, note
|
||||
, simpleInline (string "??") (Cite [])
|
||||
, simpleInline (string "**") Strong
|
||||
, simpleInline (string "__") Emph
|
||||
, simpleInline (char '*') Strong
|
||||
, simpleInline (char '_') Emph
|
||||
, simpleInline (char '-') Strikeout
|
||||
, simpleInline (char '^') Superscript
|
||||
, simpleInline (char '~') Subscript
|
||||
, try $ (char '[' *> inlineMarkup <* char ']')
|
||||
, inlineMarkup
|
||||
, link
|
||||
, image
|
||||
, mark
|
||||
|
@ -375,6 +361,18 @@ inlineParsers = [ autoLink
|
|||
, symbol
|
||||
]
|
||||
|
||||
-- | Inline markups
|
||||
inlineMarkup :: GenParser Char ParserState Inline
|
||||
inlineMarkup = choice [ simpleInline (string "??") (Cite [])
|
||||
, simpleInline (string "**") Strong
|
||||
, simpleInline (string "__") Emph
|
||||
, simpleInline (char '*') Strong
|
||||
, simpleInline (char '_') Emph
|
||||
, simpleInline (char '-') Strikeout
|
||||
, simpleInline (char '^') Superscript
|
||||
, simpleInline (char '~') Subscript
|
||||
]
|
||||
|
||||
-- | Trademark, registered, copyright
|
||||
mark :: GenParser Char st Inline
|
||||
mark = try $ char '(' >> (try tm <|> try reg <|> copy)
|
||||
|
@ -400,33 +398,49 @@ copy = do
|
|||
|
||||
note :: GenParser Char ParserState Inline
|
||||
note = try $ do
|
||||
char '['
|
||||
ref <- many1 digit
|
||||
char ']'
|
||||
state <- getState
|
||||
let notes = stateNotes state
|
||||
ref <- (char '[' *> many1 digit <* char ']')
|
||||
notes <- stateNotes <$> getState
|
||||
case lookup ref notes of
|
||||
Nothing -> fail "note not found"
|
||||
Just raw -> liftM Note $ parseFromString parseBlocks raw
|
||||
|
||||
-- | Special chars
|
||||
markupChars :: [Char]
|
||||
markupChars = "\\[]*#_@~-+^|%="
|
||||
|
||||
-- | Break strings on following chars. Space tab and newline break for
|
||||
-- inlines breaking. Open paren breaks for mark. Quote, dash and dot
|
||||
-- break for smart punctuation. Punctuation breaks for regular
|
||||
-- punctuation. Double quote breaks for named links. > and < break
|
||||
-- for inline html.
|
||||
stringBreakers :: [Char]
|
||||
stringBreakers = " \t\n('-.,:!?;\"<>"
|
||||
|
||||
wordBoundaries :: [Char]
|
||||
wordBoundaries = markupChars ++ stringBreakers
|
||||
|
||||
-- | Parse a hyphened sequence of words
|
||||
hyphenedWords :: GenParser Char ParserState String
|
||||
hyphenedWords = try $ do
|
||||
hd <- noneOf wordBoundaries
|
||||
tl <- many ( (noneOf wordBoundaries) <|>
|
||||
try (oneOf markupChars <* lookAhead (noneOf wordBoundaries) ) )
|
||||
let wd = hd:tl
|
||||
option wd $ try $
|
||||
(\r -> concat [wd, "-", r]) <$> (char '-' *> hyphenedWords)
|
||||
|
||||
-- | Any string
|
||||
str :: GenParser Char ParserState Inline
|
||||
str = do
|
||||
xs <- many1 (noneOf (specialChars ++ "\t\n "))
|
||||
optional $ try $ do
|
||||
lookAhead (char '(')
|
||||
notFollowedBy' mark
|
||||
getInput >>= setInput . (' ':) -- add space before acronym explanation
|
||||
-- parse a following hyphen if followed by a letter
|
||||
-- (this prevents unwanted interpretation as starting a strikeout section)
|
||||
result <- option xs $ try $ do
|
||||
char '-'
|
||||
next <- lookAhead letter
|
||||
guard $ isLetter (last xs) || isLetter next
|
||||
return $ xs ++ "-"
|
||||
pos <- getPosition
|
||||
updateState $ \s -> s{ stateLastStrPos = Just pos }
|
||||
return $ Str result
|
||||
baseStr <- hyphenedWords
|
||||
-- RedCloth compliance : if parsed word is uppercase and immediatly
|
||||
-- followed by parens, parens content is unconditionally word acronym
|
||||
fullStr <- option baseStr $ try $ do
|
||||
guard $ all isUpper baseStr
|
||||
acro <- enclosed (char '(') (char ')') anyChar
|
||||
return $ concat [baseStr, " (", acro, ")"]
|
||||
updateLastStrPos
|
||||
return $ Str fullStr
|
||||
|
||||
-- | Textile allows HTML span infos, we discard them
|
||||
htmlSpan :: GenParser Char ParserState Inline
|
||||
|
@ -477,34 +491,36 @@ image = try $ do
|
|||
escapedInline :: GenParser Char ParserState Inline
|
||||
escapedInline = escapedEqs <|> escapedTag
|
||||
|
||||
-- | literal text escaped between == ... ==
|
||||
escapedEqs :: GenParser Char ParserState Inline
|
||||
escapedEqs = try $ do
|
||||
string "=="
|
||||
contents <- manyTill anyChar (try $ string "==")
|
||||
return $ Str contents
|
||||
escapedEqs = Str <$> (try $ surrounded (string "==") anyChar)
|
||||
|
||||
-- -- | literal text escaped between == ... ==
|
||||
-- escapedEqs :: GenParser Char ParserState Inline
|
||||
-- escapedEqs = try $ do
|
||||
-- string "=="
|
||||
-- contents <- manyTill anyChar (try $ string "==")
|
||||
-- return $ Str contents
|
||||
|
||||
-- | literal text escaped btw <notextile> tags
|
||||
escapedTag :: GenParser Char ParserState Inline
|
||||
escapedTag = try $ Str <$> ( string "<notextile>" *>
|
||||
manyTill anyChar (try $ string "</notextile>") )
|
||||
escapedTag = try $ Str <$>
|
||||
enclosed (string "<notextile>") (string "</notextile>") anyChar
|
||||
|
||||
-- | Any special symbol defined in specialChars
|
||||
-- | Any special symbol defined in wordBoundaries
|
||||
symbol :: GenParser Char ParserState Inline
|
||||
symbol = Str . singleton <$> oneOf specialChars
|
||||
symbol = Str . singleton <$> oneOf wordBoundaries
|
||||
|
||||
-- | Inline code
|
||||
code :: GenParser Char ParserState Inline
|
||||
code = code1 <|> code2
|
||||
|
||||
code1 :: GenParser Char ParserState Inline
|
||||
code1 = surrounded (char '@') anyChar >>= return . Code nullAttr
|
||||
code1 = Code nullAttr <$> surrounded (char '@') anyChar
|
||||
|
||||
code2 :: GenParser Char ParserState Inline
|
||||
code2 = do
|
||||
htmlTag (tagOpen (=="tt") null)
|
||||
result' <- manyTill anyChar (try $ htmlTag $ tagClose (=="tt"))
|
||||
return $ Code nullAttr result'
|
||||
Code nullAttr <$> manyTill anyChar (try $ htmlTag $ tagClose (=="tt"))
|
||||
|
||||
-- | Html / CSS attributes
|
||||
attributes :: GenParser Char ParserState String
|
||||
|
@ -528,4 +544,4 @@ simpleInline border construct = surrounded border (inlineWithAttribute) >>=
|
|||
|
||||
-- | Create a singleton list
|
||||
singleton :: a -> [a]
|
||||
singleton x = [x]
|
||||
singleton x = [x]
|
||||
|
|
|
@ -67,9 +67,9 @@ Pandoc (Meta {docTitle = [], docAuthors = [], docDate = []})
|
|||
,([Str "beer"],
|
||||
[[Plain [Str "fresh",Space,Str "and",Space,Str "bitter"]]])]
|
||||
,Header 1 [Str "Inline",Space,Str "Markup"]
|
||||
,Para [Str "This",Space,Str "is",Space,Emph [Str "emphasized"],Str ",",Space,Str "and",Space,Str "so",Space,Emph [Str "is",Space,Str "this"],Str ".",LineBreak,Str "This",Space,Str "is",Space,Strong [Str "strong"],Str ",",Space,Str "and",Space,Str "so",Space,Strong [Str "is",Space,Str "this"],Str ".",LineBreak,Str "A",Space,Link [Strong [Str "strong",Space,Str "link"]] ("http://www.foobar.com",""),Str "."]
|
||||
,Para [Str "This",Space,Str "is",Space,Emph [Str "emphasized"],Str ",",Space,Str "and",Space,Str "so",Space,Emph [Str "is",Space,Str "this"],Str ".",LineBreak,Str "This",Space,Str "is",Space,Strong [Str "strong"],Str ",",Space,Str "and",Space,Str "so",Space,Strong [Str "is",Space,Str "this"],Str ".",LineBreak,Str "Hyphenated-words-are-ok",Str ",",Space,Str "as",Space,Str "well",Space,Str "as",Space,Str "strange_underscore_notation",Str ".",LineBreak,Str "A",Space,Link [Strong [Str "strong",Space,Str "link"]] ("http://www.foobar.com",""),Str "."]
|
||||
,Para [Emph [Strong [Str "This",Space,Str "is",Space,Str "strong",Space,Str "and",Space,Str "em",Str "."]],LineBreak,Str "So",Space,Str "is",Space,Strong [Emph [Str "this"]],Space,Str "word",Space,Str "and",Space,Emph [Strong [Str "that",Space,Str "one"]],Str ".",LineBreak,Strikeout [Str "This",Space,Str "is",Space,Str "strikeout",Space,Str "and",Space,Strong [Str "strong"]]]
|
||||
,Para [Str "Superscripts",Str ":",Space,Str "a",Superscript [Str "bc"],Str "d",Space,Str "a",Superscript [Strong [Str "hello"]],Space,Str "a",Superscript [Str "hello",Space,Str "there"],Str ".",LineBreak,Str "Subscripts",Str ":",Space,Str "H",Subscript [Str "2"],Str "O",Str ",",Space,Str "H",Subscript [Str "23"],Str "O",Str ",",Space,Str "H",Subscript [Str "many",Space,Str "of",Space,Str "them"],Str "O",Str "."]
|
||||
,Para [Str "Superscripts",Str ":",Space,Str "a",Superscript [Str "bc"],Str "d",Space,Str "a",Superscript [Strong [Str "hello"]],Space,Str "a",Superscript [Str "hello",Space,Str "there"],Str ".",LineBreak,Str "Subscripts",Str ":",Space,Subscript [Str "here"],Space,Str "H",Subscript [Str "2"],Str "O",Str ",",Space,Str "H",Subscript [Str "23"],Str "O",Str ",",Space,Str "H",Subscript [Str "many",Space,Str "of",Space,Str "them"],Str "O",Str "."]
|
||||
,Para [Str "Dashes",Space,Str ":",Space,Str "How",Space,Str "cool",Space,Str "\8212",Space,Str "automatic",Space,Str "dashes",Str "."]
|
||||
,Para [Str "Elipses",Space,Str ":",Space,Str "He",Space,Str "thought",Space,Str "and",Space,Str "thought",Space,Str "\8230",Space,Str "and",Space,Str "then",Space,Str "thought",Space,Str "some",Space,Str "more",Str "."]
|
||||
,Para [Str "Quotes",Space,Str "and",Space,Str "apostrophes",Space,Str ":",Space,Quoted DoubleQuote [Str "I",Str "\8217",Str "d",Space,Str "like",Space,Str "to",Space,Str "thank",Space,Str "you"],Space,Str "for",Space,Str "example",Str "."]
|
||||
|
@ -144,7 +144,7 @@ Pandoc (Meta {docTitle = [], docAuthors = [], docDate = []})
|
|||
,RawBlock "latex" "\\begin{itemize}\n \\item one\n \\item two\n\\end{itemize}"
|
||||
,Para [Str "and",Space,Str "for",Space,RawInline "latex" "\\emph{inlines}",Str "."]
|
||||
,Header 1 [Str "Acronyms",Space,Str "and",Space,Str "marks"]
|
||||
,Para [Str "PBS",Space,Str "(",Str "Public",Space,Str "Broadcasting",Space,Str "System",Str ")"]
|
||||
,Para [Str "PBS (Public Broadcasting System)"]
|
||||
,Para [Str "Hi",Str "\8482"]
|
||||
,Para [Str "Hi",Space,Str "\8482"]
|
||||
,Para [Str "\174",Space,Str "Hi",Str "\174"]
|
||||
|
|
|
@ -115,14 +115,15 @@ h1. Inline Markup
|
|||
|
||||
This is _emphasized_, and so __is this__.
|
||||
This is *strong*, and so **is this**.
|
||||
Hyphenated-words-are-ok, as well as strange_underscore_notation.
|
||||
A "*strong link*":http://www.foobar.com.
|
||||
|
||||
_*This is strong and em.*_
|
||||
So is *_this_* word and __**that one**__.
|
||||
-This is strikeout and *strong*-
|
||||
|
||||
Superscripts: a^bc^d a^*hello*^ a^hello there^.
|
||||
Subscripts: H~2~O, H~23~O, H~many of them~O.
|
||||
Superscripts: a[^bc^]d a^*hello*^ a[^hello there^].
|
||||
Subscripts: ~here~ H[~2~]O, H[~23~]O, H[~many of them~]O.
|
||||
|
||||
Dashes : How cool -- automatic dashes.
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue