Creole reader (#3994)

This is feature complete but not very thoroughly tested yet.
This commit is contained in:
Sascha Wilde 2017-10-27 01:19:28 +02:00 committed by John MacFarlane
parent 76886678a6
commit 66fd3247ea
5 changed files with 596 additions and 0 deletions

View file

@ -385,6 +385,7 @@ library
Text.Pandoc.Readers.LaTeX.Types,
Text.Pandoc.Readers.Markdown,
Text.Pandoc.Readers.CommonMark,
Text.Pandoc.Readers.Creole,
Text.Pandoc.Readers.MediaWiki,
Text.Pandoc.Readers.Vimwiki,
Text.Pandoc.Readers.RST,
@ -583,6 +584,7 @@ test-suite test-pandoc
Tests.Readers.Txt2Tags
Tests.Readers.EPUB
Tests.Readers.Muse
Tests.Readers.Creole
Tests.Writers.Native
Tests.Writers.ConTeXt
Tests.Writers.Docbook

View file

@ -45,6 +45,7 @@ module Text.Pandoc.Readers
, readOdt
, readMarkdown
, readCommonMark
, readCreole
, readMediaWiki
, readVimwiki
, readRST
@ -76,6 +77,7 @@ import Text.Pandoc.Error
import Text.Pandoc.Extensions
import Text.Pandoc.Options
import Text.Pandoc.Readers.CommonMark
import Text.Pandoc.Readers.Creole
import Text.Pandoc.Readers.DocBook
import Text.Pandoc.Readers.Docx
import Text.Pandoc.Readers.EPUB
@ -117,6 +119,7 @@ readers = [ ("native" , TextReader readNative)
,("markdown_github" , TextReader readMarkdown)
,("markdown_mmd", TextReader readMarkdown)
,("commonmark" , TextReader readCommonMark)
,("creole" , TextReader readCreole)
,("gfm" , TextReader readCommonMark)
,("rst" , TextReader readRST)
,("mediawiki" , TextReader readMediaWiki)

View file

@ -0,0 +1,316 @@
{-
Copyright (C) 2017 Sascha Wilde <wilde@sha-bang.de>
partly based on all the other readers, especialy the work by
John MacFarlane <jgm@berkeley.edu> and
Alexander Sulfrian <alexander.sulfrian@fu-berlin.de>
all bugs are solely created by me.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.Creole
Copyright : Copyright (C) 2017 Sascha Wilde
License : GNU GPL, version 2 or above
Maintainer : Sascha Wilde <wilde@sha-bang.de>
Stability : WIP
Portability : portable
Conversion of creole text to 'Pandoc' document.
-}
module Text.Pandoc.Readers.Creole ( readCreole
) where
import Control.Monad.Except (liftM2, throwError, guard)
import qualified Data.Foldable as F
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Class (PandocMonad(..))
import Text.Pandoc.Definition
import Text.Pandoc.Options
import Text.Pandoc.Parsing hiding (enclosed)
import Text.Pandoc.Shared (crFilter)
import Data.Maybe (fromMaybe)
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
-- | Read creole from an input string and return a Pandoc document.
readCreole :: PandocMonad m
=> ReaderOptions
-> Text
-> m Pandoc
readCreole opts s = do
res <- readWithM parseCreole def{ stateOptions = opts }
(T.unpack (crFilter s) ++ "\n\n")
case res of
Left e -> throwError e
Right d -> return d
type CRLParser = ParserT [Char] ParserState
--
-- Utility funcitons
--
(<+>) :: (Monad m, Monoid a) => m a -> m a -> m a
(<+>) = liftM2 (<>)
-- we have to redefine `enclosed' from Text.Pandoc.Parsing, because it
-- assumes, that there can't be a space after the start parser, but
-- with creole this is possible.
enclosed :: (Show end, PandocMonad m) => CRLParser m start -- ^ start parser
-> CRLParser m end -- ^ end parser
-> CRLParser m a -- ^ content parser (to be used repeatedly)
-> CRLParser m [a]
enclosed start end parser = try $ start >> many1Till parser end
--
-- main parser
--
specialChars :: [Char]
specialChars = "*/~{}\\|[]()<>\"'"
parseCreole :: PandocMonad m => CRLParser m Pandoc
parseCreole = do
bs <- mconcat <$> many block
spaces
eof
return $ B.doc bs
--
-- block parsers
--
block :: PandocMonad m => CRLParser m B.Blocks
block = do
res <- mempty <$ skipMany1 blankline
<|> nowiki
<|> header
<|> horizontalRule
<|> anyList 1
<|> table
<|> para
skipMany blankline
return res
nowiki :: PandocMonad m => CRLParser m B.Blocks
nowiki = try $ nowikiStart >> manyTill content nowikiEnd >>= return . B.codeBlock . mconcat
where
content = brackets <|> line
brackets = try $ option "" ((:[]) <$> newline)
<+> (char ' ' >> (many (char ' ') <+> string "}}}") <* eol)
line = option "" ((:[]) <$> newline) <+> manyTill anyChar eol
eol = lookAhead $ try $ nowikiEnd <|> newline
nowikiStart = optional newline >> string "{{{" >> skipMany spaceChar >> newline
nowikiEnd = try $ linebreak >> string "}}}" >> skipMany spaceChar >> newline
header :: PandocMonad m => CRLParser m B.Blocks
header = try $ do
skipSpaces
level <- many1 (char '=') >>= return . length
guard $ level <= 6
skipSpaces
content <- B.str <$> manyTill (noneOf "\n") headerEnd
return $ B.header level content
where
headerEnd = try $ skipSpaces >> many (char '=') >> skipSpaces >> newline
unorderedList :: PandocMonad m => Int -> CRLParser m B.Blocks
unorderedList = list '*' B.bulletList
orderedList :: PandocMonad m => Int -> CRLParser m B.Blocks
orderedList = list '#' B.orderedList
anyList :: PandocMonad m => Int -> CRLParser m B.Blocks
anyList n = unorderedList n <|> orderedList n
anyListItem :: PandocMonad m => Int -> CRLParser m B.Blocks
anyListItem n = listItem '*' n <|> listItem '#' n
list :: PandocMonad m => Char -> ([B.Blocks] -> B.Blocks) -> Int -> CRLParser m B.Blocks
list c f n = many1 (itemPlusSublist <|> listItem c n)
>>= return . f
where itemPlusSublist = try $ listItem c n <+> anyList (n+1)
listItem :: PandocMonad m => Char -> Int -> CRLParser m B.Blocks
listItem c n = (listStart >> many1Till inline itemEnd)
>>= return . B.plain . B.trimInlines .mconcat
where
listStart = try $ optional newline >> skipSpaces >> count n (char c)
>> (lookAhead $ noneOf [c]) >> skipSpaces
itemEnd = endOfParaElement <|> nextItem n
<|> if n < 3 then nextItem (n+1)
else nextItem (n+1) <|> nextItem (n-1)
nextItem x = lookAhead $ try $ blankline >> anyListItem x >> return mempty
table :: PandocMonad m => CRLParser m B.Blocks
table = try $ do
headers <- optionMaybe headerRow
rows <- many1 row
return $ B.simpleTable (fromMaybe [mempty] headers) rows
where
headerRow = try $ skipSpaces >> many1Till headerCell rowEnd
headerCell = B.plain . B.trimInlines . mconcat
<$> (string "|=" >> many1Till inline cellEnd)
row = try $ skipSpaces >> many1Till cell rowEnd
cell = B.plain . B.trimInlines . mconcat
<$> (char '|' >> many1Till inline cellEnd)
rowEnd = try $ optional (char '|') >> skipSpaces >> newline
cellEnd = lookAhead $ try $ char '|' <|> rowEnd
para :: PandocMonad m => CRLParser m B.Blocks
para = many1Till inline endOfParaElement >>= return . result . mconcat
where
result content = if F.all (==Space) content
then mempty
else B.para $ B.trimInlines content
endOfParaElement :: PandocMonad m => CRLParser m ()
endOfParaElement = lookAhead $ endOfInput <|> endOfPara
<|> startOfList <|> startOfTable
<|> startOfHeader <|> hr <|> startOfNowiki
where
endOfInput = try $ skipMany blankline >> skipSpaces >> eof
endOfPara = try $ blankline >> skipMany1 blankline
startOf :: PandocMonad m => CRLParser m a -> CRLParser m ()
startOf p = try $ blankline >> p >> return mempty
startOfList = startOf $ anyList 1
startOfTable = startOf $ table
startOfHeader = startOf header
startOfNowiki = startOf nowiki
hr = startOf horizontalRule
horizontalRule :: PandocMonad m => CRLParser m B.Blocks
horizontalRule = try $ skipSpaces >> string "----" >> skipSpaces >> newline
>> return B.horizontalRule
--
-- inline parsers
--
inline :: PandocMonad m => CRLParser m B.Inlines
inline = choice [ whitespace
, escapedLink
, escapedChar
, link
, inlineNowiki
, placeholder
, image
, forcedLinebreak
, bold
, finalBold
, italics
, finalItalics
, str
, symbol
] <?> "inline"
escapedChar :: PandocMonad m => CRLParser m B.Inlines
escapedChar = (try $ char '~' >> noneOf "\t\n ") >>= return . B.str . (:[])
escapedLink :: PandocMonad m => CRLParser m B.Inlines
escapedLink = try $ do
char '~'
(orig, _) <- uri
return $ B.str orig
image :: PandocMonad m => CRLParser m B.Inlines
image = try $ do
(orig, src) <- wikiImg
return $ B.image src "" (B.str $ orig)
where
linkSrc = many $ noneOf "|}\n\r\t"
linkDsc = char '|' >> many (noneOf "}\n\r\t")
wikiImg = try $ do
string "{{"
src <- linkSrc
dsc <- option "" linkDsc
string "}}"
return (dsc, src)
link :: PandocMonad m => CRLParser m B.Inlines
link = try $ do
(orig, src) <- uriLink <|> wikiLink
return $ B.link src "" orig
where
linkSrc = many $ noneOf "|]\n\r\t"
linkDsc :: PandocMonad m => String -> CRLParser m B.Inlines
linkDsc otxt = B.str
<$> (try $ option otxt
(char '|' >> many (noneOf "]\n\r\t")))
linkImg = try $ char '|' >> image
wikiLink = try $ do
string "[["
src <- linkSrc
dsc <- linkImg <|> linkDsc src
string "]]"
return (dsc, src)
uriLink = try $ do
(orig, src) <- uri
return (B.str orig, src)
inlineNowiki :: PandocMonad m => CRLParser m B.Inlines
inlineNowiki = B.code <$> (start >> manyTill (noneOf "\n\r") end)
where
start = try $ string "{{{"
end = try $ string "}}}" >> (lookAhead $ noneOf "}")
placeholder :: PandocMonad m => CRLParser m B.Inlines
-- The semantics of the placeholder is basicallly implementation
-- dependent, so there is no way to DTRT for all cases.
-- So for now we just drop them.
placeholder = B.text <$> (try $ string "<<<" >> manyTill anyChar (string ">>>")
>> return "")
whitespace :: PandocMonad m => CRLParser m B.Inlines
whitespace = (lb <|> regsp) >>= return
where lb = try $ skipMany spaceChar >> linebreak >> return B.space
regsp = try $ skipMany1 spaceChar >> return B.space
linebreak :: PandocMonad m => CRLParser m B.Inlines
linebreak = newline >> notFollowedBy newline >> (lastNewline <|> innerNewline)
where lastNewline = eof >> return mempty
innerNewline = return B.space
symbol :: PandocMonad m => CRLParser m B.Inlines
symbol = oneOf specialChars >>= return . B.str . (:[])
str :: PandocMonad m => CRLParser m B.Inlines
str = let strChar = noneOf ("\t\n " ++ specialChars) in
many1 strChar >>= return . B.str
bold :: PandocMonad m => CRLParser m B.Inlines
bold = B.strong . mconcat <$>
enclosed (string "**") (try $ string "**") inline
italics :: PandocMonad m => CRLParser m B.Inlines
italics = B.emph . mconcat <$>
enclosed (string "//") (try $ string "//") inline
finalBold :: PandocMonad m => CRLParser m B.Inlines
finalBold = B.strong . mconcat <$>
try (string "**" >> many1Till inline endOfParaElement)
finalItalics :: PandocMonad m => CRLParser m B.Inlines
finalItalics = B.emph . mconcat <$>
try (string "//" >> many1Till inline endOfParaElement)
forcedLinebreak :: PandocMonad m => CRLParser m B.Inlines
forcedLinebreak = try $ string "\\\\" >> return B.linebreak

View file

@ -0,0 +1,273 @@
{-# LANGUAGE OverloadedStrings #-}
module Tests.Readers.Creole (tests) where
import Data.Text (Text)
import qualified Data.Text as T
import Test.Tasty
import Tests.Helpers
import Text.Pandoc
import Text.Pandoc.Arbitrary ()
import Text.Pandoc.Builder
creole :: Text -> Pandoc
creole = purely $ readCreole def{ readerStandalone = True }
infix 4 =:
(=:) :: ToString c
=> String -> (Text, c) -> TestTree
(=:) = test creole
tests :: [TestTree]
tests = [
testGroup "Basic Text Formatting" [
"bold, single line, fully delimited" =:
"only **bold** is bold"
=?> para ("only " <> strong "bold" <> " is bold")
, "italics, single line, fully delimited" =:
"only //this// is in italics"
=?> para ("only " <> emph "this" <> " is in italics")
, "bold in italics, fully delimited" =:
"//**this**// is in bold italics"
=?> para (emph (strong "this") <> " is in bold italics")
, "italics in bold, fully delimited" =:
"**//this//** is in bold italics"
=?> para (strong (emph "this") <> " is in bold italics")
, "escape bold marker" =:
"~**not bold" =?> para "**not bold"
, "escape italics marker" =:
"~//not in italics" =?> para "//not in italics"
, "inline nowiki, simple" =:
"this is {{{**not** ~interpreted}}} at all"
=?> para ("this is " <> code "**not** ~interpreted" <> " at all")
, "inline nowiki, curly braces inside" =:
"this is {{{{{{//including// some `}' chars}}}}}}"
=?> para ("this is " <> code "{{{//including// some `}' chars}}}")
, "placeholder" =:
"foo <<<place holder>>> bar"
=?> para "foo bar"
, "placeholder escaped" =:
"foo ~<<<no place holder>>> bar"
=?> para "foo <<<no place holder>>> bar"
]
, testGroup "Headers" [
"header level 1, no space, no trailing =" =:
"= Top-Level Header"
=?> header 1 (str "Top-Level Header")
, "header level 1, leading space, trailing =" =:
" = Top-Level Header = "
=?> header 1 (str "Top-Level Header")
, "header level 2, no space, no trailing =" =:
"== Second Level"
=?> header 2 (str "Second Level")
, "header level 2, leading space, no trailing =" =:
" == Second Level"
=?> header 2 (str "Second Level")
, "header level 3, no space, no trailing =" =:
"=== Third"
=?> header 3 (str "Third")
, "header level 3, no space, > 3 trailing =" =:
"=== Third ======="
=?> header 3 (str "Third")
, "header level 4, no space, no trailing =" =:
"==== Fourth Level Heading"
=?> header 4 (str "Fourth Level Heading")
, "header level 4, no space, < 4 trailing =" =:
"==== Fourth Level Heading =="
=?> header 4 (str "Fourth Level Heading")
, "header level 5, no space, no trailing =" =:
"===== Fifth"
=?> header 5 (str "Fifth")
, "header level 6, no space, no trailing =" =:
"====== Sixth"
=?> header 6 (str "Sixth")
]
, testGroup "Paragraphs" [
"paragraphs: multiple, one line" =:
"first line\n\nanother line\n"
=?> para "first line" <> para "another line"
]
, testGroup "Lists" [
"unordered list, two entries, one separating space" =:
"* foo\n* bar"
=?> bulletList [ plain "foo", plain "bar" ]
, "unordered list, three entries, one separating space" =:
"* foo\n* bar\n* baz"
=?> bulletList [ plain "foo", plain "bar", plain "baz" ]
, "para followed by, unordered list, two entries, one separating space" =:
"blubber\n* foo\n* bar"
=?> para "blubber" <> bulletList [ plain "foo", plain "bar" ]
, "nested unordered list, one separating space" =:
"* foo\n** bar\n** baz\n* blubb"
=?> bulletList [ plain "foo"
<> bulletList [ plain "bar", plain "baz" ]
, plain "blubb" ]
, "nested many unordered lists, one separating space" =:
("* foo\n** bar\n*** third\n*** third two\n** baz\n*** third again\n"
<> "**** fourth\n***** fith\n* blubb")
=?> bulletList [ plain "foo"
<> bulletList [ plain "bar"
<> bulletList [ plain "third"
, plain "third two"]
, plain "baz"
<> bulletList [ plain "third again"
<> bulletList [
plain "fourth"
<> bulletList [
plain "fith"
]
]
]
]
, plain "blubb" ]
, "nested unordered list, mixed separating space" =:
"*foo\n ** bar\n **baz\n * blubb"
=?> bulletList [ plain "foo"
<> bulletList [ plain "bar", plain "baz" ]
, plain "blubb" ]
, "ordered list, two entries, one separating space" =:
"# foo\n# bar"
=?> orderedList [ plain "foo", plain "bar" ]
, "ordered list, three entries, one separating space" =:
"# foo\n# bar\n# baz"
=?> orderedList [ plain "foo", plain "bar", plain "baz" ]
, "para followed by, ordered list, two entries, one separating space" =:
"blubber\n# foo\n# bar"
=?> para "blubber" <> orderedList [ plain "foo", plain "bar" ]
, "nested ordered list, one separating space" =:
"# foo\n## bar\n## baz\n# blubb"
=?> orderedList [ plain "foo"
<> orderedList [ plain "bar", plain "baz" ]
, plain "blubb" ]
, "nested many ordered lists, one separating space" =:
("# foo\n## bar\n### third\n### third two\n## baz\n### third again\n"
<> "#### fourth\n##### fith\n# blubb")
=?> orderedList [ plain "foo"
<> orderedList [ plain "bar"
<> orderedList [ plain "third"
, plain "third two"]
, plain "baz"
<> orderedList [ plain "third again"
<> orderedList [
plain "fourth"
<> orderedList [
plain "fith"
]
]
]
]
, plain "blubb" ]
, "nested ordered list, mixed separating space" =:
"#foo\n ## bar\n ##baz\n # blubb"
=?> orderedList [ plain "foo"
<> orderedList [ plain "bar", plain "baz" ]
, plain "blubb" ]
, "mixed nested ordered and unordered lists, one separating space" =:
("# foo\n** bar\n### third\n### third two\n** baz\n### third again\n"
<> "#### fourth\n***** fith\n# blubb")
=?> orderedList [ plain "foo"
<> bulletList [ plain "bar"
<> orderedList [ plain "third"
, plain "third two"]
, plain "baz"
<> orderedList [ plain "third again"
<> orderedList [
plain "fourth"
<> bulletList [
plain "fith"
]
]
]
]
, plain "blubb" ]
]
, testGroup "NoWiki" [
"quoted block, simple" =:
"{{{\nfoo bar\n //baz//\n}}}"
=?> codeBlock "foo bar\n //baz//"
, "quoted block, curly bracket exception" =:
"{{{\nfoo bar\n }}}\nbaz\n }}}\n}}}"
=?> codeBlock "foo bar\n }}}\nbaz\n}}}"
, "forced line breaks" =:
"{{{no break!\\\\here}}} but a break\\\\here!"
=?> para (code "no break!\\\\here" <> " but a break"
<> linebreak <> "here!")
]
, testGroup "Images and Links" [
"image simple" =:
"{{foo.png}}" =?> para (image "foo.png" "" (str ""))
, "image with alt text" =:
"Image of a bar: {{/path/to/bar.png|A Bar}} look at it!"
=?> para ("Image of a bar: "
<> image "/path/to/bar.png" "" (str "A Bar") <> " look at it!")
, "auto link" =:
"foo http://foo.example.com/bar/baz.html bar"
=?> para ("foo "
<> link "http://foo.example.com/bar/baz.html" ""
(str "http://foo.example.com/bar/baz.html")
<> " bar")
, "escaped auto link" =:
"foo ~http://foo.example.com/bar/baz.html bar"
=?> para ("foo http://foo.example.com/bar/baz.html bar")
, "wiki link simple" =:
"foo [[http://foo.example.com/foo.png]] bar"
=?> para ("foo "
<> link "http://foo.example.com/foo.png" ""
(str "http://foo.example.com/foo.png")
<> " bar")
, "wiki link with name" =:
"foo [[http://foo.example.com/foo.png|my link]] bar"
=?> para ("foo "
<> link "http://foo.example.com/foo.png" ""
(str "my link")
<> " bar")
, "image link" =:
"[[http://foo.example.com/|{{foo.png}}]]"
=?> para (link "http://foo.example.com/" "" (image "foo.png" "" (str "")))
]
, testGroup "Table" [
"Table with Header" =:
T.unlines [ "|= Foo |= Bar |= Baz |"
, "| One | Two | Three |"
, "| 1 | 2 | 3 |"
, "| A | B | C |"
]
=?> simpleTable
[plain "Foo", plain "Bar" , plain "Baz"]
[[plain "One", plain "Two" , plain "Three"]
,[plain "1", plain "2" , plain "3"]
,[plain "A", plain "B" , plain "C"]]
, "Table without Header" =:
T.unlines [ "| One | Two | Three |"
, "| 1 | 2 | 3 |"
, "| A | B | C |"
]
=?> simpleTable [mempty]
[[plain "One", plain "Two" , plain "Three"]
,[plain "1", plain "2" , plain "3"]
,[plain "A", plain "B" , plain "C"]]
, "Table without Header, no markers at line ends" =:
T.unlines [ "| One | Two | Three"
, "| 1 | 2 | 3"
, "| A | B | C "
]
=?> simpleTable [mempty]
[[plain "One", plain "Two" , plain "Three"]
,[plain "1", plain "2" , plain "3"]
,[plain "A", plain "B" , plain "C"]]
, "Table with Header, with formatting" =:
T.unlines [ "|= **Foo** |= **Bar** |= **Baz** |"
, "|//one// element |//second// elt|Three |"
, "| {{{1}}} | {{{{}}}} | [[link]] |"
]
=?> simpleTable
[plain $ strong "Foo", plain $ strong "Bar" , plain $ strong "Baz"]
[[plain (emph "one" <> " element"), plain (emph "second" <> " elt")
,plain "Three"]
,[plain $ code "1", plain $ code "{}"
,plain $ link "link" "" (str "link")]]
]
]

View file

@ -17,6 +17,7 @@ import qualified Tests.Readers.Org
import qualified Tests.Readers.RST
import qualified Tests.Readers.Txt2Tags
import qualified Tests.Readers.Muse
import qualified Tests.Readers.Creole
import qualified Tests.Shared
import qualified Tests.Writers.AsciiDoc
import qualified Tests.Writers.ConTeXt
@ -63,6 +64,7 @@ tests = testGroup "pandoc tests" [ Tests.Command.tests
, testGroup "Txt2Tags" Tests.Readers.Txt2Tags.tests
, testGroup "EPUB" Tests.Readers.EPUB.tests
, testGroup "Muse" Tests.Readers.Muse.tests
, testGroup "Creole" Tests.Readers.Creole.tests
]
, testGroup "Lua filters" Tests.Lua.tests
]