Basic Textile Reader

This commit is contained in:
paul.rivier 2010-09-16 11:26:12 +02:00 committed by John MacFarlane
parent 3ffd724617
commit d532c72c5b
7 changed files with 262 additions and 1 deletions

View file

@ -62,6 +62,7 @@ module Text.Pandoc
, readRST
, readLaTeX
, readHtml
, readTextile
-- * Parser state used in readers
, ParserState (..)
, defaultParserState
@ -104,6 +105,7 @@ import Text.Pandoc.Readers.Markdown
import Text.Pandoc.Readers.RST
import Text.Pandoc.Readers.LaTeX
import Text.Pandoc.Readers.HTML
import Text.Pandoc.Readers.Textile
import Text.Pandoc.Writers.Native
import Text.Pandoc.Writers.Markdown
import Text.Pandoc.Writers.RST

View file

@ -108,6 +108,7 @@ data Inline
| Emph [Inline] -- ^ Emphasized text (list of inlines)
| Strong [Inline] -- ^ Strongly emphasized text (list of inlines)
| Strikeout [Inline] -- ^ Strikeout text (list of inlines)
| Inserted [Inline] -- ^ Underlined text (list of inlines)
| Superscript [Inline] -- ^ Superscripted text (list of inlines)
| Subscript [Inline] -- ^ Subscripted text (list of inlines)
| SmallCaps [Inline] -- ^ Small caps text (list of inlines)

View file

@ -31,6 +31,7 @@ module Text.Pandoc.Readers.HTML (
readHtml,
rawHtmlInline,
rawHtmlBlock,
htmlTag,
anyHtmlBlockTag,
anyHtmlInlineTag,
anyHtmlTag,

View file

@ -0,0 +1,253 @@
{-
Copyright (C) 2010 Paul Rivier <paul*rivier#demotera*com> | tr '*#' '.@'
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.Textile
Copyright : Copyright (C) 2010 Paul Rivier
License : GNU GPL, version 2 or above
Maintainer : Paul Rivier <paul*rivier#demotera*com>
Stability : alpha
Portability : portable
Conversion from Textile to 'Pandoc' document.
Implemented :
- Paragraphs
- Code blocks
- Lists
- blockquote
- Inlines : strong, emph, cite, code, deleted, inserted, superscript, subscript
Not implemented :
- HTML-specific and CSS-specific inlines
-}
module Text.Pandoc.Readers.Textile (
readTextile
) where
import Text.Pandoc.Definition
import Text.Pandoc.Shared
import Text.Pandoc.Readers.HTML ( htmlTag, htmlEndTag )
import Text.ParserCombinators.Parsec
import Data.Char ( digitToInt )
-- | Parse a Textile text and return a Pandoc document.
readTextile :: ParserState -- ^ Parser state, including options for parser
-> String -- ^ String to parse (assuming @'\n'@ line endings)
-> Pandoc
readTextile state s = (readWith parseTextile) state (s ++ "\n\n")
--
-- Constants and data structure definitions
--
-- | Special chars border strings parsing
specialChars :: [Char]
specialChars = "\\[]*#_@~<>!?-+^&'\";:"
-- | Generate a Pandoc ADT from a textile document
parseTextile :: GenParser Char ParserState Pandoc
parseTextile = do
many blankline
blocks <- parseBlocks
return $ Pandoc (Meta [Str ""] [[Str ""]] [Str ""]) blocks -- FIXME
-- | Parse document blocks
parseBlocks :: GenParser Char ParserState [Block]
parseBlocks = manyTill block eof
-- | Block parsers list tried in definition order
blockParsers :: [GenParser Char ParserState Block]
blockParsers = [ codeBlock
, header
, blockQuote
, anyList
, para
, nullBlock ]
-- | Any block in the order of definition of blockParsers
block :: GenParser Char ParserState Block
block = choice blockParsers <?> "block"
-- | Code Blocks in Textile are between <pre> and </pre>
codeBlock :: GenParser Char ParserState Block
codeBlock = try $ do
htmlTag "pre"
content <- manyTill anyChar (try $ htmlEndTag "pre" >> blockBreak)
return $ CodeBlock ("",[],[]) content
-- | Header of the form "hN. content" with N in 1..6
header :: GenParser Char ParserState Block
header = try $ do
char 'h'
level <- oneOf "123456" >>= return . digitToInt
char '.'
whitespace
name <- manyTill inline blockBreak
return $ Header level (normalizeSpaces name)
-- | Blockquote of the form "bq. content"
blockQuote :: GenParser Char ParserState Block
blockQuote = try $ do
string "bq."
whitespace
para >>= return . BlockQuote . (:[])
-- Lists handling
-- | Can be a bullet list or an ordered list. This implementation is
-- strict in the nesting, sublist must start at exactly "parent depth
-- plus one"
anyList :: GenParser Char ParserState Block
anyList = try $ do
l <- anyListAtDepth 1
blanklines
return l
-- | This allow one type of list to be nested into an other type,
-- provided correct nesting
anyListAtDepth :: Int -> GenParser Char ParserState Block
anyListAtDepth depth = choice [ bulletListAtDepth depth,
orderedListAtDepth depth ]
-- | Bullet List of given depth, depth being the number of leading '*'
bulletListAtDepth :: Int -> GenParser Char ParserState Block
bulletListAtDepth depth = try $ do
items <- many1 (bulletListItemAtDepth depth)
return (BulletList items)
-- | Bullet List Item of given depth, depth being the number of
-- leading '*'
bulletListItemAtDepth :: Int -> GenParser Char ParserState [Block]
bulletListItemAtDepth depth = try $ do
count depth (char '*')
whitespace
p <- inlines >>= return . Plain
sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[]))
return (p:sublist)
-- | Ordered List of given depth, depth being the number of
-- leading '#'
orderedListAtDepth :: Int -> GenParser Char ParserState Block
orderedListAtDepth depth = try $ do
items <- many1 (orderedListItemAtDepth depth)
return (OrderedList (1, DefaultStyle, DefaultDelim) items)
-- | Ordered List Item of given depth, depth being the number of
-- leading '#'
orderedListItemAtDepth :: Int -> GenParser Char ParserState [Block]
orderedListItemAtDepth depth = try $ do
count depth (char '#')
whitespace
p <- inlines >>= return . Plain
sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[]))
return (p:sublist)
-- | This terminates a block such as a paragraph.
blockBreak :: GenParser Char ParserState ()
blockBreak = try $ newline >> blanklines >> return ()
-- | In textile, paragraphs are separated by blank lines.
para :: GenParser Char ParserState Block
para = try $ do
content <- manyTill inline blockBreak
return $ Para $ normalizeSpaces content
-- | Any inline element
inline :: GenParser Char ParserState Inline
inline = choice inlineParsers <?> "inline"
-- | List of consecutive inlines before a newline
inlines :: GenParser Char ParserState [Inline]
inlines = manyTill inline newline
-- | Inline parsers tried in order
inlineParsers :: [GenParser Char ParserState Inline]
inlineParsers = [ str
, whitespace
, endline
, code
, simpleInline (string "??") (Cite [])
, simpleInline (char '*') Strong
, simpleInline (char '_') Emph
, simpleInline (string "**") Strong
, simpleInline (string "__") Emph
, simpleInline (char '-') Strikeout
, simpleInline (char '+') Inserted
, simpleInline (char '^') Superscript
, simpleInline (char '~') Subscript
-- , link
-- , image
-- , math
-- , autoLink
, symbol
]
-- | Any string
str :: GenParser Char ParserState Inline
str = many1 (noneOf (specialChars ++ "\t\n ")) >>= return . Str
-- | Some number of space chars
whitespace :: GenParser Char ParserState Inline
whitespace = many1 spaceChar >> return Space <?> "whitespace"
-- | In Textile, an endline character that can be treated as a space,
-- not a structural break
endline :: GenParser Char ParserState Inline
endline = try $ do
newline >> notFollowedBy blankline
return Space
-- | Any special symbol defined in specialChars
symbol :: GenParser Char ParserState Inline
symbol = do
result <- oneOf specialChars
return $ Str [result]
-- | Inline code
code :: GenParser Char ParserState Inline
code = surrounded (char '@') anyChar >>=
return . Code
-- | Parses material surrounded by a parser.
surrounded :: GenParser Char st t -- ^ surrounding parser
-> GenParser Char st a -- ^ content parser (to be used repeatedly)
-> GenParser Char st [a]
surrounded border = enclosed border border
-- | Inlines are most of the time of the same form
simpleInline :: GenParser Char ParserState t -- ^ surrounding parser
-> ([Inline] -> Inline) -- ^ Inline constructor
-> GenParser Char ParserState Inline -- ^ content parser (to be used repeatedly)
simpleInline border construct = surrounded border inline >>=
return . construct . normalizeSpaces
-- TODO
--
-- - Pandoc Meta Information
-- - footnotes
-- - hyperlink "label":target
-- - tables
-- - doc
-- - tests
-- - Inserted inline handling in writers

View file

@ -377,6 +377,7 @@ inlineListToIdentifier =
Str s -> s
Emph lst -> concatMap extractText lst
Strikeout lst -> concatMap extractText lst
Inserted lst -> concatMap extractText lst
Superscript lst -> concatMap extractText lst
SmallCaps lst -> concatMap extractText lst
Subscript lst -> concatMap extractText lst

View file

@ -454,6 +454,8 @@ inlineToHtml opts inline =
(Code str) -> return $ thecode << str
(Strikeout lst) -> inlineListToHtml opts lst >>=
return . (thespan ! [thestyle "text-decoration: line-through;"])
(Inserted lst) -> inlineListToHtml opts lst >>=
return . (thespan ! [thestyle "text-decoration: underline;"])
(SmallCaps lst) -> inlineListToHtml opts lst >>=
return . (thespan ! [thestyle "font-variant: small-caps;"])
(Superscript lst) -> inlineListToHtml opts lst >>= return . sup

View file

@ -90,6 +90,7 @@ readers = [("native" , readPandoc)
,("markdown" , readMarkdown)
,("markdown+lhs" , readMarkdown)
,("rst" , readRST)
,("textile" , readTextile) -- TODO : textile+lhs
,("rst+lhs" , readRST)
,("html" , readHtml)
,("latex" , readLaTeX)