From d532c72c5b3249ae83733d9fe064764d123603c0 Mon Sep 17 00:00:00 2001 From: "paul.rivier" Date: Thu, 16 Sep 2010 11:26:12 +0200 Subject: [PATCH] Basic Textile Reader --- src/Text/Pandoc.hs | 2 + src/Text/Pandoc/Definition.hs | 1 + src/Text/Pandoc/Readers/HTML.hs | 3 +- src/Text/Pandoc/Readers/Textile.hs | 253 +++++++++++++++++++++++++++++ src/Text/Pandoc/Shared.hs | 1 + src/Text/Pandoc/Writers/HTML.hs | 2 + src/pandoc.hs | 1 + 7 files changed, 262 insertions(+), 1 deletion(-) create mode 100644 src/Text/Pandoc/Readers/Textile.hs diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs index 6cb8130a4..d11f084a5 100644 --- a/src/Text/Pandoc.hs +++ b/src/Text/Pandoc.hs @@ -62,6 +62,7 @@ module Text.Pandoc , readRST , readLaTeX , readHtml + , readTextile -- * Parser state used in readers , ParserState (..) , defaultParserState @@ -104,6 +105,7 @@ import Text.Pandoc.Readers.Markdown import Text.Pandoc.Readers.RST import Text.Pandoc.Readers.LaTeX import Text.Pandoc.Readers.HTML +import Text.Pandoc.Readers.Textile import Text.Pandoc.Writers.Native import Text.Pandoc.Writers.Markdown import Text.Pandoc.Writers.RST diff --git a/src/Text/Pandoc/Definition.hs b/src/Text/Pandoc/Definition.hs index fffca3b2e..0972d5f4c 100644 --- a/src/Text/Pandoc/Definition.hs +++ b/src/Text/Pandoc/Definition.hs @@ -108,6 +108,7 @@ data Inline | Emph [Inline] -- ^ Emphasized text (list of inlines) | Strong [Inline] -- ^ Strongly emphasized text (list of inlines) | Strikeout [Inline] -- ^ Strikeout text (list of inlines) + | Inserted [Inline] -- ^ Underlined text (list of inlines) | Superscript [Inline] -- ^ Superscripted text (list of inlines) | Subscript [Inline] -- ^ Subscripted text (list of inlines) | SmallCaps [Inline] -- ^ Small caps text (list of inlines) diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 5ccbc4fb1..fdc727170 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -30,7 +30,8 @@ Conversion of HTML to 'Pandoc' document. module Text.Pandoc.Readers.HTML ( readHtml, rawHtmlInline, - rawHtmlBlock, + rawHtmlBlock, + htmlTag, anyHtmlBlockTag, anyHtmlInlineTag, anyHtmlTag, diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs new file mode 100644 index 000000000..dba92ecd8 --- /dev/null +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -0,0 +1,253 @@ +{- +Copyright (C) 2010 Paul Rivier | tr '*#' '.@' + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Textile + Copyright : Copyright (C) 2010 Paul Rivier + License : GNU GPL, version 2 or above + + Maintainer : Paul Rivier + Stability : alpha + Portability : portable + +Conversion from Textile to 'Pandoc' document. + +Implemented : + - Paragraphs + - Code blocks + - Lists + - blockquote + - Inlines : strong, emph, cite, code, deleted, inserted, superscript, subscript + + +Not implemented : + - HTML-specific and CSS-specific inlines +-} + + +module Text.Pandoc.Readers.Textile ( + readTextile + ) where +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.Pandoc.Readers.HTML ( htmlTag, htmlEndTag ) +import Text.ParserCombinators.Parsec +import Data.Char ( digitToInt ) + +-- | Parse a Textile text and return a Pandoc document. +readTextile :: ParserState -- ^ Parser state, including options for parser + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> Pandoc +readTextile state s = (readWith parseTextile) state (s ++ "\n\n") + + +-- +-- Constants and data structure definitions +-- + +-- | Special chars border strings parsing +specialChars :: [Char] +specialChars = "\\[]*#_@~<>!?-+^&'\";:" + +-- | Generate a Pandoc ADT from a textile document +parseTextile :: GenParser Char ParserState Pandoc +parseTextile = do + many blankline + blocks <- parseBlocks + return $ Pandoc (Meta [Str ""] [[Str ""]] [Str ""]) blocks -- FIXME + +-- | Parse document blocks +parseBlocks :: GenParser Char ParserState [Block] +parseBlocks = manyTill block eof + +-- | Block parsers list tried in definition order +blockParsers :: [GenParser Char ParserState Block] +blockParsers = [ codeBlock + , header + , blockQuote + , anyList + , para + , nullBlock ] + +-- | Any block in the order of definition of blockParsers +block :: GenParser Char ParserState Block +block = choice blockParsers "block" + +-- | Code Blocks in Textile are between
 and 
+codeBlock :: GenParser Char ParserState Block +codeBlock = try $ do + htmlTag "pre" + content <- manyTill anyChar (try $ htmlEndTag "pre" >> blockBreak) + return $ CodeBlock ("",[],[]) content + +-- | Header of the form "hN. content" with N in 1..6 +header :: GenParser Char ParserState Block +header = try $ do + char 'h' + level <- oneOf "123456" >>= return . digitToInt + char '.' + whitespace + name <- manyTill inline blockBreak + return $ Header level (normalizeSpaces name) + +-- | Blockquote of the form "bq. content" +blockQuote :: GenParser Char ParserState Block +blockQuote = try $ do + string "bq." + whitespace + para >>= return . BlockQuote . (:[]) + +-- Lists handling + +-- | Can be a bullet list or an ordered list. This implementation is +-- strict in the nesting, sublist must start at exactly "parent depth +-- plus one" +anyList :: GenParser Char ParserState Block +anyList = try $ do + l <- anyListAtDepth 1 + blanklines + return l + +-- | This allow one type of list to be nested into an other type, +-- provided correct nesting +anyListAtDepth :: Int -> GenParser Char ParserState Block +anyListAtDepth depth = choice [ bulletListAtDepth depth, + orderedListAtDepth depth ] + +-- | Bullet List of given depth, depth being the number of leading '*' +bulletListAtDepth :: Int -> GenParser Char ParserState Block +bulletListAtDepth depth = try $ do + items <- many1 (bulletListItemAtDepth depth) + return (BulletList items) + +-- | Bullet List Item of given depth, depth being the number of +-- leading '*' +bulletListItemAtDepth :: Int -> GenParser Char ParserState [Block] +bulletListItemAtDepth depth = try $ do + count depth (char '*') + whitespace + p <- inlines >>= return . Plain + sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[])) + return (p:sublist) + +-- | Ordered List of given depth, depth being the number of +-- leadingĀ '#' +orderedListAtDepth :: Int -> GenParser Char ParserState Block +orderedListAtDepth depth = try $ do + items <- many1 (orderedListItemAtDepth depth) + return (OrderedList (1, DefaultStyle, DefaultDelim) items) + +-- | Ordered List Item of given depth, depth being the number of +-- leadingĀ '#' +orderedListItemAtDepth :: Int -> GenParser Char ParserState [Block] +orderedListItemAtDepth depth = try $ do + count depth (char '#') + whitespace + p <- inlines >>= return . Plain + sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[])) + return (p:sublist) + +-- | This terminates a block such as a paragraph. +blockBreak :: GenParser Char ParserState () +blockBreak = try $ newline >> blanklines >> return () + +-- | In textile, paragraphs are separated by blank lines. +para :: GenParser Char ParserState Block +para = try $ do + content <- manyTill inline blockBreak + return $ Para $ normalizeSpaces content + +-- | Any inline element +inline :: GenParser Char ParserState Inline +inline = choice inlineParsers "inline" + +-- | List of consecutive inlines before a newline +inlines :: GenParser Char ParserState [Inline] +inlines = manyTill inline newline + +-- | Inline parsers tried in order +inlineParsers :: [GenParser Char ParserState Inline] +inlineParsers = [ str + , whitespace + , endline + , code + , simpleInline (string "??") (Cite []) + , simpleInline (char '*') Strong + , simpleInline (char '_') Emph + , simpleInline (string "**") Strong + , simpleInline (string "__") Emph + , simpleInline (char '-') Strikeout + , simpleInline (char '+') Inserted + , simpleInline (char '^') Superscript + , simpleInline (char '~') Subscript + -- , link + -- , image + -- , math + -- , autoLink + , symbol + ] + +-- | Any string +str :: GenParser Char ParserState Inline +str = many1 (noneOf (specialChars ++ "\t\n ")) >>= return . Str + +-- | Some number of space chars +whitespace :: GenParser Char ParserState Inline +whitespace = many1 spaceChar >> return Space "whitespace" + +-- | In Textile, an endline character that can be treated as a space, +-- not a structural break +endline :: GenParser Char ParserState Inline +endline = try $ do + newline >> notFollowedBy blankline + return Space + +-- | Any special symbol defined in specialChars +symbol :: GenParser Char ParserState Inline +symbol = do + result <- oneOf specialChars + return $ Str [result] + +-- | Inline code +code :: GenParser Char ParserState Inline +code = surrounded (char '@') anyChar >>= + return . Code + +-- | Parses material surrounded by a parser. +surrounded :: GenParser Char st t -- ^ surrounding parser + -> GenParser Char st a -- ^ content parser (to be used repeatedly) + -> GenParser Char st [a] +surrounded border = enclosed border border + +-- | Inlines are most of the time of the same form +simpleInline :: GenParser Char ParserState t -- ^ surrounding parser + -> ([Inline] -> Inline) -- ^ Inline constructor + -> GenParser Char ParserState Inline -- ^ content parser (to be used repeatedly) +simpleInline border construct = surrounded border inline >>= + return . construct . normalizeSpaces + + +-- TODO +-- +-- - Pandoc Meta Information +-- - footnotes +-- - hyperlink "label":target +-- - tables +-- - doc +-- - tests +-- - Inserted inline handling in writers \ No newline at end of file diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 0fdaf42f3..006c11c63 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -377,6 +377,7 @@ inlineListToIdentifier = Str s -> s Emph lst -> concatMap extractText lst Strikeout lst -> concatMap extractText lst + Inserted lst -> concatMap extractText lst Superscript lst -> concatMap extractText lst SmallCaps lst -> concatMap extractText lst Subscript lst -> concatMap extractText lst diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index b8da4bec0..55403af42 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -454,6 +454,8 @@ inlineToHtml opts inline = (Code str) -> return $ thecode << str (Strikeout lst) -> inlineListToHtml opts lst >>= return . (thespan ! [thestyle "text-decoration: line-through;"]) + (Inserted lst) -> inlineListToHtml opts lst >>= + return . (thespan ! [thestyle "text-decoration: underline;"]) (SmallCaps lst) -> inlineListToHtml opts lst >>= return . (thespan ! [thestyle "font-variant: small-caps;"]) (Superscript lst) -> inlineListToHtml opts lst >>= return . sup diff --git a/src/pandoc.hs b/src/pandoc.hs index 4f5a1c32a..647649a3c 100644 --- a/src/pandoc.hs +++ b/src/pandoc.hs @@ -90,6 +90,7 @@ readers = [("native" , readPandoc) ,("markdown" , readMarkdown) ,("markdown+lhs" , readMarkdown) ,("rst" , readRST) + ,("textile" , readTextile) -- TODO : textile+lhs ,("rst+lhs" , readRST) ,("html" , readHtml) ,("latex" , readLaTeX)