Added basic mediawiki reader.

Text.Pandoc.Readers.MediaWiki module,
tests/mediawiki-reader.{txt,native}.
This commit is contained in:
John MacFarlane 2012-09-10 10:02:12 -07:00
parent 167012daf7
commit 3d361b2101
8 changed files with 435 additions and 7 deletions

View file

@ -16,10 +16,11 @@ Synopsis: Conversion between markup formats
Description: Pandoc is a Haskell library for converting from one markup Description: Pandoc is a Haskell library for converting from one markup
format to another, and a command-line tool that uses format to another, and a command-line tool that uses
this library. It can read markdown and (subsets of) HTML, this library. It can read markdown and (subsets of) HTML,
reStructuredText, LaTeX, DocBook, and Textile, and it can write reStructuredText, LaTeX, DocBook, MediaWiki markup,
markdown, reStructuredText, HTML, LaTeX, ConTeXt, Docbook, and Textile, and it can write markdown, reStructuredText,
OpenDocument, ODT, Word docx, RTF, MediaWiki, Textile, HTML, LaTeX, ConTeXt, Docbook, OpenDocument, ODT,
groff man pages, plain text, Emacs Org-Mode, AsciiDoc, EPUB, Word docx, RTF, MediaWiki, Textile, groff man pages,
plain text, Emacs Org-Mode, AsciiDoc, EPUB,
FictionBook2, and S5, Slidy and Slideous HTML slide shows. FictionBook2, and S5, Slidy and Slideous HTML slide shows.
. .
Pandoc extends standard markdown syntax with footnotes, Pandoc extends standard markdown syntax with footnotes,
@ -120,6 +121,8 @@ Extra-Source-Files:
tests/markdown-citations.mhra.txt, tests/markdown-citations.mhra.txt,
tests/markdown-citations.ieee.txt, tests/markdown-citations.ieee.txt,
tests/textile-reader.textile, tests/textile-reader.textile,
tests/mediawiki-reader.wiki,
tests/mediawiki-reader.native,
tests/rst-reader.native, tests/rst-reader.native,
tests/rst-reader.rst, tests/rst-reader.rst,
tests/s5.basic.html, tests/s5.basic.html,
@ -262,6 +265,7 @@ Library
Text.Pandoc.Readers.HTML, Text.Pandoc.Readers.HTML,
Text.Pandoc.Readers.LaTeX, Text.Pandoc.Readers.LaTeX,
Text.Pandoc.Readers.Markdown, Text.Pandoc.Readers.Markdown,
Text.Pandoc.Readers.MediaWiki,
Text.Pandoc.Readers.RST, Text.Pandoc.Readers.RST,
Text.Pandoc.Readers.DocBook, Text.Pandoc.Readers.DocBook,
Text.Pandoc.Readers.TeXMath, Text.Pandoc.Readers.TeXMath,

View file

@ -66,6 +66,7 @@ module Text.Pandoc
, writers , writers
-- * Readers: converting /to/ Pandoc format -- * Readers: converting /to/ Pandoc format
, readMarkdown , readMarkdown
, readMediaWiki
, readRST , readRST
, readLaTeX , readLaTeX
, readHtml , readHtml
@ -110,6 +111,7 @@ module Text.Pandoc
import Text.Pandoc.Definition import Text.Pandoc.Definition
import Text.Pandoc.Generic import Text.Pandoc.Generic
import Text.Pandoc.Readers.Markdown import Text.Pandoc.Readers.Markdown
import Text.Pandoc.Readers.MediaWiki
import Text.Pandoc.Readers.RST import Text.Pandoc.Readers.RST
import Text.Pandoc.Readers.DocBook import Text.Pandoc.Readers.DocBook
import Text.Pandoc.Readers.LaTeX import Text.Pandoc.Readers.LaTeX
@ -179,6 +181,7 @@ readers = [("native" , \_ -> readNative)
,("markdown_strict" , readMarkdown) ,("markdown_strict" , readMarkdown)
,("markdown" , readMarkdown) ,("markdown" , readMarkdown)
,("rst" , readRST) ,("rst" , readRST)
,("mediawiki" , readMediaWiki)
,("docbook" , readDocBook) ,("docbook" , readDocBook)
,("textile" , readTextile) -- TODO : textile+lhs ,("textile" , readTextile) -- TODO : textile+lhs
,("html" , readHtml) ,("html" , readHtml)

View file

@ -1,6 +1,4 @@
{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7 {-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7
{-# LANGUAGE FlexibleInstances, TypeSynonymInstances,
GeneralizedNewtypeDeriving #-}
{- {-
Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu>

View file

@ -0,0 +1,311 @@
{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7
{-
Copyright (C) 2012 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.MediaWiki
Copyright : Copyright (C) 2012 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Conversion of mediawiki text to 'Pandoc' document.
-}
{-
TODO:
_ fix pre parser -- it should use html tagsoup parsers,
then just strip out the text from text tags.
_ correctly handle skipped level in list, e.g. # to ###
_ tests for lists
_ support HTML lists
_ support list style attributes and start values in ol lists, also
value attribute on li
_ support <p> tags in lists (and out?)
_ support :, ::, etc. for indent (treat as list continuation paras?)
_ support preformatted text (lines starting with space)
_ support preformatted text blocks
_ code highlighting: http://www.mediawiki.org/wiki/Extension:SyntaxHighlight_GeSHi <syntaxhighlight lang="php"> (alternativel, <source...>)
if 'line' attribute present, number lines
if 'start' present, set starting line number
_ support internal links http://www.mediawiki.org/wiki/Help:Links
_ support external links
_ support automatic linkification of URLs
_ support images http://www.mediawiki.org/wiki/Help:Images
_ ignore gallery tag?
_ support tables http://www.mediawiki.org/wiki/Help:Tables
_ support <math> tag for latex math
_ templates or anything in {{}} -> handle as raw wikimedia, can be dealt with in
postprocessing
_ category links
_ tests for raw html inline
_ tests for sup, sub, del
_ tests for pre, haskell
_ tests for code, tt, hask
_ test for blockquote
-}
module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where
import Text.Pandoc.Definition
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>))
import Text.Pandoc.Options
import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced,
isInlineTag, isBlockTag, isTextTag, isCommentTag )
import Text.Pandoc.XML ( fromEntities )
import Text.Pandoc.Parsing
import Text.Pandoc.Shared ( stripTrailingNewlines )
import Data.Monoid (mconcat, mempty)
import Control.Applicative ((<$>), (<*), (*>), (<$))
import Control.Monad
import Data.List (intersperse)
import Text.HTML.TagSoup
-- | Read mediawiki from an input string and return a Pandoc document.
readMediaWiki :: ReaderOptions -- ^ Reader options
-> String -- ^ String to parse (assuming @'\n'@ line endings)
-> Pandoc
readMediaWiki opts s =
(readWith parseMediaWiki) def{ stateOptions = opts } (s ++ "\n\n")
type MWParser = Parser [Char] ParserState
--
-- auxiliary functions
--
specialChars :: [Char]
specialChars = "'[]<=&*"
spaceChars :: [Char]
spaceChars = " \n\t"
sym :: String -> MWParser ()
sym s = () <$ try (string s)
htmlComment :: MWParser ()
htmlComment = () <$ htmlTag isCommentTag
inlinesInTags :: String -> MWParser Inlines
inlinesInTags tag = trimInlines . mconcat <$> try
(htmlTag (~== TagOpen tag []) *>
manyTill inline (htmlTag (~== TagClose tag)))
blocksInTags :: String -> MWParser Blocks
blocksInTags tag = mconcat <$> try
(htmlTag (~== TagOpen tag []) *>
manyTill block (htmlTag (~== TagClose tag)))
charsInTags :: String -> MWParser [Char]
charsInTags tag = fromEntities <$> try
(htmlTag (~== TagOpen tag []) *>
manyTill anyChar (htmlTag (~== TagClose tag)))
--
-- main parser
--
parseMediaWiki :: MWParser Pandoc
parseMediaWiki = do
bs <- mconcat <$> many block
spaces
eof
return $ B.doc bs
--
-- block parsers
--
block :: MWParser Blocks
block = header
<|> hrule
<|> bulletList
<|> orderedList
<|> definitionList
<|> blockquote
<|> codeblock
<|> haskell
<|> mempty <$ skipMany1 blankline
<|> mempty <$ try (spaces *> htmlComment)
<|> para
para :: MWParser Blocks
para = B.para . trimInlines . mconcat <$> many1 inline
hrule :: MWParser Blocks
hrule = B.horizontalRule <$ try (string "----" *> many (char '-') *> newline)
blockquote :: MWParser Blocks
blockquote = B.blockQuote <$> blocksInTags "blockquote"
codeblock :: MWParser Blocks
codeblock = B.codeBlock . trimCode <$> charsInTags "pre"
trimCode :: String -> String
trimCode ('\n':xs) = stripTrailingNewlines xs
trimCode xs = stripTrailingNewlines xs
haskell :: MWParser Blocks
haskell = B.codeBlockWith ("",["haskell"],[]) . trimCode <$>
charsInTags "haskell"
header :: MWParser Blocks
header = try $ do
col <- sourceColumn <$> getPosition
guard $ col == 1 -- header must be at beginning of line
eqs <- many1 (char '=')
let lev = length eqs
guard $ lev <= 6
contents <- trimInlines . mconcat <$> manyTill inline (count lev $ char '=')
return $ B.header lev contents
bulletList :: MWParser Blocks
bulletList = B.bulletList <$> many1 (listItem '*')
orderedList :: MWParser Blocks
orderedList = B.orderedList <$> many1 (listItem '#')
definitionList :: MWParser Blocks
definitionList = B.definitionList <$> many1 defListItem
defListItem :: MWParser (Inlines, [Blocks])
defListItem = try $ do
terms <- mconcat . intersperse B.linebreak <$> many1 defListTerm
defs <- many1 $ listItem ':'
return (terms, defs)
defListTerm :: MWParser Inlines
defListTerm = char ';' >> skipMany spaceChar >> manyTill anyChar newline >>=
parseFromString (trimInlines . mconcat <$> many inline)
listStart :: Char -> MWParser ()
listStart c = char c *> notFollowedBy listStartChar
listStartChar :: MWParser Char
listStartChar = oneOf "*#;:"
anyListStart :: MWParser ()
anyListStart = listStart '*' <|> listStart '#' <|> listStart ';'
listItem :: Char -> MWParser Blocks
listItem c = try $ do
listStart c
first <- manyTill anyChar newline
rest <- many (try $ char c *> lookAhead listStartChar *>
manyTill anyChar newline)
parseFromString (mconcat <$> many1 block) $ unlines $ first : rest
--
-- inline parsers
--
inline :: MWParser Inlines
inline = whitespace
<|> url
<|> str
<|> strong
<|> emph
<|> nowiki
<|> linebreak
<|> externalLink
<|> strikeout
<|> subscript
<|> superscript
<|> code
<|> hask
<|> B.singleton <$> charRef
<|> inlineHtml
<|> special
str :: MWParser Inlines
str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars)
special :: MWParser Inlines
special = B.str <$> count 1 (notFollowedBy' (htmlTag isBlockTag) *>
oneOf specialChars)
inlineHtml :: MWParser Inlines
inlineHtml = B.rawInline "html" . snd <$> htmlTag isInlineTag
whitespace :: MWParser Inlines
whitespace = B.space <$ (skipMany1 spaceChar <|> endline <|> htmlComment)
endline :: MWParser ()
endline = () <$ try (newline <*
notFollowedBy blankline <*
notFollowedBy' hrule <*
notFollowedBy anyListStart)
linebreak :: MWParser Inlines
linebreak = B.linebreak <$
(htmlTag (~== TagOpen "br" []) *>
optional (htmlTag (~== TagClose "br")) *>
optional blankline)
externalLink :: MWParser Inlines
externalLink = try $ do
char '['
(orig, src) <- uri
skipMany1 spaceChar
lab <- manyTill inline (char ']')
let lab' = if null lab
then [B.str "1"] -- TODO generate sequentially from state
else lab
return $ B.link src "" $ trimInlines $ mconcat lab'
url :: MWParser Inlines
url = do
(_, src) <- uri
return $ B.link src "" (B.str orig)
nowiki :: MWParser Inlines
nowiki = B.text <$> charsInTags "nowiki"
strikeout :: MWParser Inlines
strikeout = B.strikeout <$> (inlinesInTags "strike" <|> inlinesInTags "del")
superscript :: MWParser Inlines
superscript = B.superscript <$> inlinesInTags "sup"
subscript :: MWParser Inlines
subscript = B.subscript <$> inlinesInTags "sub"
code :: MWParser Inlines
code = B.code <$> (charsInTags "code" <|> charsInTags "tt")
hask :: MWParser Inlines
hask = B.codeWith ("",["haskell"],[]) <$> charsInTags "hask"
-- | Parses a list of inlines between start and end delimiters.
inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines
inlinesBetween start end =
(trimInlines . mconcat) <$> try (start >> many1Till inner end)
where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline)
innerSpace = try $ whitespace >>~ notFollowedBy' end
emph :: MWParser Inlines
emph = B.emph <$> nested (inlinesBetween start end)
where start = sym "''" >> lookAhead nonspaceChar
end = try $ notFollowedBy' (() <$ strong) >> sym "''"
strong :: MWParser Inlines
strong = B.strong <$> nested (inlinesBetween start end)
where start = sym "'''" >> lookAhead nonspaceChar
end = try $ sym "'''"

View file

@ -721,6 +721,7 @@ defaultReaderName fallback (x:xs) =
".rst" -> "rst" ".rst" -> "rst"
".lhs" -> "markdown+lhs" ".lhs" -> "markdown+lhs"
".db" -> "docbook" ".db" -> "docbook"
".wiki" -> "mediawiki"
".textile" -> "textile" ".textile" -> "textile"
".native" -> "native" ".native" -> "native"
".json" -> "json" ".json" -> "json"

View file

@ -119,9 +119,14 @@ tests = [ testGroup "markdown"
, fb2WriterTest "math" [] "fb2.math.markdown" "fb2.math.fb2" , fb2WriterTest "math" [] "fb2.math.markdown" "fb2.math.fb2"
, fb2WriterTest "testsuite" [] "testsuite.native" "writer.fb2" , fb2WriterTest "testsuite" [] "testsuite.native" "writer.fb2"
] ]
, testGroup "mediawiki"
[ testGroup "writer" $ writerTests "mediawiki"
, test "reader" ["-r", "mediawiki", "-w", "native", "-s"]
"mediawiki-reader.wiki" "mediawiki-reader.native"
]
, testGroup "other writers" $ map (\f -> testGroup f $ writerTests f) , testGroup "other writers" $ map (\f -> testGroup f $ writerTests f)
[ "opendocument" , "context" , "texinfo" [ "opendocument" , "context" , "texinfo"
, "man" , "plain" , "mediawiki", "rtf", "org", "asciidoc" , "man" , "plain" , "rtf", "org", "asciidoc"
] ]
] ]

View file

@ -0,0 +1,35 @@
Pandoc (Meta {docTitle = [], docAuthors = [], docDate = []})
[Header 1 [Str "header"]
,Header 2 [Str "header",Space,Str "level",Space,Str "two"]
,Header 3 [Str "header",Space,Str "level",Space,Str "3"]
,Header 4 [Str "header",Space,Emph [Str "level"],Space,Str "four"]
,Header 5 [Str "header",Space,Str "level",Space,Str "5"]
,Header 6 [Str "header",Space,Str "level",Space,Str "6"]
,Para [Str "=======",Space,Str "not",Space,Str "a",Space,Str "header",Space,Str "========"]
,Para [Str "==",Space,Str "not",Space,Str "a",Space,Str "header",Space,Str "=="]
,Header 2 [Str "emph",Space,Str "and",Space,Str "strong"]
,Para [Emph [Str "emph"],Space,Strong [Str "strong"]]
,Para [Strong [Emph [Str "strong",Space,Str "and",Space,Str "emph"]]]
,Para [Strong [Emph [Str "emph",Space,Str "inside"],Space,Str "strong"]]
,Para [Strong [Str "strong",Space,Str "with",Space,Emph [Str "emph"]]]
,Para [Emph [Strong [Str "strong",Space,Str "inside"],Space,Str "emph"]]
,Header 2 [Str "horizontal",Space,Str "rule"]
,Para [Str "top"]
,HorizontalRule
,Para [Str "bottom"]
,HorizontalRule
,Header 2 [Str "nowiki"]
,Para [Str "''not",Space,Str "emph''"]
,Header 2 [Str "strikeout"]
,Para [Strikeout [Str "This",Space,Str "is",Space,Emph [Str "struck",Space,Str "out"]]]
,Header 2 [Str "entities"]
,Para [Str "hi",Space,Str "&",Space,Str "low"]
,Para [Str "hi",Space,Str "&",Space,Str "low"]
,Para [Str "G\246del"]
,Para [Str "\777\2730"]
,Header 2 [Str "comments"]
,Para [Str "inline",Space,Str "comment"]
,Para [Str "between",Space,Str "blocks"]
,Header 2 [Str "linebreaks"]
,Para [Str "hi",LineBreak,Str "there"]
,Para [Str "hi",LineBreak,Str "there"]]

View file

@ -0,0 +1,71 @@
= header =
== header level two ==
===header level 3===
====header ''level'' four====
===== header level 5 =====
====== header level 6 ======
======= not a header ========
== not a header ==
== emph and strong ==
''emph'' '''strong'''
'''''strong and emph'''''
'''''emph inside'' strong'''
'''strong with ''emph'''''
'''''strong inside''' emph''
== horizontal rule ==
top
----
bottom
----
== nowiki ==
<nowiki>''not emph''</nowiki>
== strikeout ==
<strike> This is ''struck out''</strike>
== entities ==
hi & low
hi &amp; low
G&ouml;del
&#777;&#xAAA;
== comments ==
inline<!-- secret --> comment
<!-- secret -->
between blocks
<!-- secret -->
== linebreaks ==
hi<br/>there
hi<br>
there