From 3d361b2101c097ecde343625b15da8c197d733eb Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 10 Sep 2012 10:02:12 -0700 Subject: [PATCH] Added basic mediawiki reader. Text.Pandoc.Readers.MediaWiki module, tests/mediawiki-reader.{txt,native}. --- pandoc.cabal | 12 +- src/Text/Pandoc.hs | 3 + src/Text/Pandoc/Readers/Markdown.hs | 2 - src/Text/Pandoc/Readers/MediaWiki.hs | 311 +++++++++++++++++++++++++++ src/pandoc.hs | 1 + tests/Tests/Old.hs | 7 +- tests/mediawiki-reader.native | 35 +++ tests/mediawiki-reader.wiki | 71 ++++++ 8 files changed, 435 insertions(+), 7 deletions(-) create mode 100644 src/Text/Pandoc/Readers/MediaWiki.hs create mode 100644 tests/mediawiki-reader.native create mode 100644 tests/mediawiki-reader.wiki diff --git a/pandoc.cabal b/pandoc.cabal index e999f1b80..0b234c52f 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -16,10 +16,11 @@ Synopsis: Conversion between markup formats Description: Pandoc is a Haskell library for converting from one markup format to another, and a command-line tool that uses this library. It can read markdown and (subsets of) HTML, - reStructuredText, LaTeX, DocBook, and Textile, and it can write - markdown, reStructuredText, HTML, LaTeX, ConTeXt, Docbook, - OpenDocument, ODT, Word docx, RTF, MediaWiki, Textile, - groff man pages, plain text, Emacs Org-Mode, AsciiDoc, EPUB, + reStructuredText, LaTeX, DocBook, MediaWiki markup, + and Textile, and it can write markdown, reStructuredText, + HTML, LaTeX, ConTeXt, Docbook, OpenDocument, ODT, + Word docx, RTF, MediaWiki, Textile, groff man pages, + plain text, Emacs Org-Mode, AsciiDoc, EPUB, FictionBook2, and S5, Slidy and Slideous HTML slide shows. . Pandoc extends standard markdown syntax with footnotes, @@ -120,6 +121,8 @@ Extra-Source-Files: tests/markdown-citations.mhra.txt, tests/markdown-citations.ieee.txt, tests/textile-reader.textile, + tests/mediawiki-reader.wiki, + tests/mediawiki-reader.native, tests/rst-reader.native, tests/rst-reader.rst, tests/s5.basic.html, @@ -262,6 +265,7 @@ Library Text.Pandoc.Readers.HTML, Text.Pandoc.Readers.LaTeX, Text.Pandoc.Readers.Markdown, + Text.Pandoc.Readers.MediaWiki, Text.Pandoc.Readers.RST, Text.Pandoc.Readers.DocBook, Text.Pandoc.Readers.TeXMath, diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs index 33706816e..1e6b1d010 100644 --- a/src/Text/Pandoc.hs +++ b/src/Text/Pandoc.hs @@ -66,6 +66,7 @@ module Text.Pandoc , writers -- * Readers: converting /to/ Pandoc format , readMarkdown + , readMediaWiki , readRST , readLaTeX , readHtml @@ -110,6 +111,7 @@ module Text.Pandoc import Text.Pandoc.Definition import Text.Pandoc.Generic import Text.Pandoc.Readers.Markdown +import Text.Pandoc.Readers.MediaWiki import Text.Pandoc.Readers.RST import Text.Pandoc.Readers.DocBook import Text.Pandoc.Readers.LaTeX @@ -179,6 +181,7 @@ readers = [("native" , \_ -> readNative) ,("markdown_strict" , readMarkdown) ,("markdown" , readMarkdown) ,("rst" , readRST) + ,("mediawiki" , readMediaWiki) ,("docbook" , readDocBook) ,("textile" , readTextile) -- TODO : textile+lhs ,("html" , readHtml) diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index d3d4e72ff..1c2cc12f1 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -1,6 +1,4 @@ {-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7 -{-# LANGUAGE FlexibleInstances, TypeSynonymInstances, - GeneralizedNewtypeDeriving #-} {- Copyright (C) 2006-2010 John MacFarlane diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs new file mode 100644 index 000000000..f3adbe72e --- /dev/null +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -0,0 +1,311 @@ +{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7 +{- +Copyright (C) 2012 John MacFarlane + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.MediaWiki + Copyright : Copyright (C) 2012 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +Conversion of mediawiki text to 'Pandoc' document. +-} +{- +TODO: +_ fix pre parser -- it should use html tagsoup parsers, + then just strip out the text from text tags. +_ correctly handle skipped level in list, e.g. # to ### +_ tests for lists +_ support HTML lists +_ support list style attributes and start values in ol lists, also + value attribute on li +_ support

tags in lists (and out?) +_ support :, ::, etc. for indent (treat as list continuation paras?) +_ support preformatted text (lines starting with space) +_ support preformatted text blocks +_ code highlighting: http://www.mediawiki.org/wiki/Extension:SyntaxHighlight_GeSHi (alternativel, ) + if 'line' attribute present, number lines + if 'start' present, set starting line number +_ support internal links http://www.mediawiki.org/wiki/Help:Links +_ support external links +_ support automatic linkification of URLs +_ support images http://www.mediawiki.org/wiki/Help:Images +_ ignore gallery tag? +_ support tables http://www.mediawiki.org/wiki/Help:Tables +_ support tag for latex math +_ templates or anything in {{}} -> handle as raw wikimedia, can be dealt with in + postprocessing +_ category links +_ tests for raw html inline +_ tests for sup, sub, del +_ tests for pre, haskell +_ tests for code, tt, hask +_ test for blockquote +-} +module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where + +import Text.Pandoc.Definition +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>)) +import Text.Pandoc.Options +import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, + isInlineTag, isBlockTag, isTextTag, isCommentTag ) +import Text.Pandoc.XML ( fromEntities ) +import Text.Pandoc.Parsing +import Text.Pandoc.Shared ( stripTrailingNewlines ) +import Data.Monoid (mconcat, mempty) +import Control.Applicative ((<$>), (<*), (*>), (<$)) +import Control.Monad +import Data.List (intersperse) +import Text.HTML.TagSoup + +-- | Read mediawiki from an input string and return a Pandoc document. +readMediaWiki :: ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> Pandoc +readMediaWiki opts s = + (readWith parseMediaWiki) def{ stateOptions = opts } (s ++ "\n\n") + +type MWParser = Parser [Char] ParserState + +-- +-- auxiliary functions +-- + +specialChars :: [Char] +specialChars = "'[]<=&*" + +spaceChars :: [Char] +spaceChars = " \n\t" + +sym :: String -> MWParser () +sym s = () <$ try (string s) + +htmlComment :: MWParser () +htmlComment = () <$ htmlTag isCommentTag + +inlinesInTags :: String -> MWParser Inlines +inlinesInTags tag = trimInlines . mconcat <$> try + (htmlTag (~== TagOpen tag []) *> + manyTill inline (htmlTag (~== TagClose tag))) + +blocksInTags :: String -> MWParser Blocks +blocksInTags tag = mconcat <$> try + (htmlTag (~== TagOpen tag []) *> + manyTill block (htmlTag (~== TagClose tag))) + +charsInTags :: String -> MWParser [Char] +charsInTags tag = fromEntities <$> try + (htmlTag (~== TagOpen tag []) *> + manyTill anyChar (htmlTag (~== TagClose tag))) + +-- +-- main parser +-- + +parseMediaWiki :: MWParser Pandoc +parseMediaWiki = do + bs <- mconcat <$> many block + spaces + eof + return $ B.doc bs + +-- +-- block parsers +-- + +block :: MWParser Blocks +block = header + <|> hrule + <|> bulletList + <|> orderedList + <|> definitionList + <|> blockquote + <|> codeblock + <|> haskell + <|> mempty <$ skipMany1 blankline + <|> mempty <$ try (spaces *> htmlComment) + <|> para + +para :: MWParser Blocks +para = B.para . trimInlines . mconcat <$> many1 inline + +hrule :: MWParser Blocks +hrule = B.horizontalRule <$ try (string "----" *> many (char '-') *> newline) + +blockquote :: MWParser Blocks +blockquote = B.blockQuote <$> blocksInTags "blockquote" + +codeblock :: MWParser Blocks +codeblock = B.codeBlock . trimCode <$> charsInTags "pre" + +trimCode :: String -> String +trimCode ('\n':xs) = stripTrailingNewlines xs +trimCode xs = stripTrailingNewlines xs + +haskell :: MWParser Blocks +haskell = B.codeBlockWith ("",["haskell"],[]) . trimCode <$> + charsInTags "haskell" + +header :: MWParser Blocks +header = try $ do + col <- sourceColumn <$> getPosition + guard $ col == 1 -- header must be at beginning of line + eqs <- many1 (char '=') + let lev = length eqs + guard $ lev <= 6 + contents <- trimInlines . mconcat <$> manyTill inline (count lev $ char '=') + return $ B.header lev contents + +bulletList :: MWParser Blocks +bulletList = B.bulletList <$> many1 (listItem '*') + +orderedList :: MWParser Blocks +orderedList = B.orderedList <$> many1 (listItem '#') + +definitionList :: MWParser Blocks +definitionList = B.definitionList <$> many1 defListItem + +defListItem :: MWParser (Inlines, [Blocks]) +defListItem = try $ do + terms <- mconcat . intersperse B.linebreak <$> many1 defListTerm + defs <- many1 $ listItem ':' + return (terms, defs) + +defListTerm :: MWParser Inlines +defListTerm = char ';' >> skipMany spaceChar >> manyTill anyChar newline >>= + parseFromString (trimInlines . mconcat <$> many inline) + +listStart :: Char -> MWParser () +listStart c = char c *> notFollowedBy listStartChar + +listStartChar :: MWParser Char +listStartChar = oneOf "*#;:" + +anyListStart :: MWParser () +anyListStart = listStart '*' <|> listStart '#' <|> listStart ';' + +listItem :: Char -> MWParser Blocks +listItem c = try $ do + listStart c + first <- manyTill anyChar newline + rest <- many (try $ char c *> lookAhead listStartChar *> + manyTill anyChar newline) + parseFromString (mconcat <$> many1 block) $ unlines $ first : rest + +-- +-- inline parsers +-- + +inline :: MWParser Inlines +inline = whitespace + <|> url + <|> str + <|> strong + <|> emph + <|> nowiki + <|> linebreak + <|> externalLink + <|> strikeout + <|> subscript + <|> superscript + <|> code + <|> hask + <|> B.singleton <$> charRef + <|> inlineHtml + <|> special + +str :: MWParser Inlines +str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars) + +special :: MWParser Inlines +special = B.str <$> count 1 (notFollowedBy' (htmlTag isBlockTag) *> + oneOf specialChars) + +inlineHtml :: MWParser Inlines +inlineHtml = B.rawInline "html" . snd <$> htmlTag isInlineTag + +whitespace :: MWParser Inlines +whitespace = B.space <$ (skipMany1 spaceChar <|> endline <|> htmlComment) + +endline :: MWParser () +endline = () <$ try (newline <* + notFollowedBy blankline <* + notFollowedBy' hrule <* + notFollowedBy anyListStart) + +linebreak :: MWParser Inlines +linebreak = B.linebreak <$ + (htmlTag (~== TagOpen "br" []) *> + optional (htmlTag (~== TagClose "br")) *> + optional blankline) + +externalLink :: MWParser Inlines +externalLink = try $ do + char '[' + (orig, src) <- uri + skipMany1 spaceChar + lab <- manyTill inline (char ']') + let lab' = if null lab + then [B.str "1"] -- TODO generate sequentially from state + else lab + return $ B.link src "" $ trimInlines $ mconcat lab' + +url :: MWParser Inlines +url = do + (_, src) <- uri + return $ B.link src "" (B.str orig) + +nowiki :: MWParser Inlines +nowiki = B.text <$> charsInTags "nowiki" + +strikeout :: MWParser Inlines +strikeout = B.strikeout <$> (inlinesInTags "strike" <|> inlinesInTags "del") + +superscript :: MWParser Inlines +superscript = B.superscript <$> inlinesInTags "sup" + +subscript :: MWParser Inlines +subscript = B.subscript <$> inlinesInTags "sub" + +code :: MWParser Inlines +code = B.code <$> (charsInTags "code" <|> charsInTags "tt") + +hask :: MWParser Inlines +hask = B.codeWith ("",["haskell"],[]) <$> charsInTags "hask" + +-- | Parses a list of inlines between start and end delimiters. +inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines +inlinesBetween start end = + (trimInlines . mconcat) <$> try (start >> many1Till inner end) + where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) + innerSpace = try $ whitespace >>~ notFollowedBy' end + +emph :: MWParser Inlines +emph = B.emph <$> nested (inlinesBetween start end) + where start = sym "''" >> lookAhead nonspaceChar + end = try $ notFollowedBy' (() <$ strong) >> sym "''" + +strong :: MWParser Inlines +strong = B.strong <$> nested (inlinesBetween start end) + where start = sym "'''" >> lookAhead nonspaceChar + end = try $ sym "'''" + diff --git a/src/pandoc.hs b/src/pandoc.hs index af7004352..cb561e817 100644 --- a/src/pandoc.hs +++ b/src/pandoc.hs @@ -721,6 +721,7 @@ defaultReaderName fallback (x:xs) = ".rst" -> "rst" ".lhs" -> "markdown+lhs" ".db" -> "docbook" + ".wiki" -> "mediawiki" ".textile" -> "textile" ".native" -> "native" ".json" -> "json" diff --git a/tests/Tests/Old.hs b/tests/Tests/Old.hs index 8899fef6f..5360126c2 100644 --- a/tests/Tests/Old.hs +++ b/tests/Tests/Old.hs @@ -119,9 +119,14 @@ tests = [ testGroup "markdown" , fb2WriterTest "math" [] "fb2.math.markdown" "fb2.math.fb2" , fb2WriterTest "testsuite" [] "testsuite.native" "writer.fb2" ] + , testGroup "mediawiki" + [ testGroup "writer" $ writerTests "mediawiki" + , test "reader" ["-r", "mediawiki", "-w", "native", "-s"] + "mediawiki-reader.wiki" "mediawiki-reader.native" + ] , testGroup "other writers" $ map (\f -> testGroup f $ writerTests f) [ "opendocument" , "context" , "texinfo" - , "man" , "plain" , "mediawiki", "rtf", "org", "asciidoc" + , "man" , "plain" , "rtf", "org", "asciidoc" ] ] diff --git a/tests/mediawiki-reader.native b/tests/mediawiki-reader.native new file mode 100644 index 000000000..8b3eedf42 --- /dev/null +++ b/tests/mediawiki-reader.native @@ -0,0 +1,35 @@ +Pandoc (Meta {docTitle = [], docAuthors = [], docDate = []}) +[Header 1 [Str "header"] +,Header 2 [Str "header",Space,Str "level",Space,Str "two"] +,Header 3 [Str "header",Space,Str "level",Space,Str "3"] +,Header 4 [Str "header",Space,Emph [Str "level"],Space,Str "four"] +,Header 5 [Str "header",Space,Str "level",Space,Str "5"] +,Header 6 [Str "header",Space,Str "level",Space,Str "6"] +,Para [Str "=======",Space,Str "not",Space,Str "a",Space,Str "header",Space,Str "========"] +,Para [Str "==",Space,Str "not",Space,Str "a",Space,Str "header",Space,Str "=="] +,Header 2 [Str "emph",Space,Str "and",Space,Str "strong"] +,Para [Emph [Str "emph"],Space,Strong [Str "strong"]] +,Para [Strong [Emph [Str "strong",Space,Str "and",Space,Str "emph"]]] +,Para [Strong [Emph [Str "emph",Space,Str "inside"],Space,Str "strong"]] +,Para [Strong [Str "strong",Space,Str "with",Space,Emph [Str "emph"]]] +,Para [Emph [Strong [Str "strong",Space,Str "inside"],Space,Str "emph"]] +,Header 2 [Str "horizontal",Space,Str "rule"] +,Para [Str "top"] +,HorizontalRule +,Para [Str "bottom"] +,HorizontalRule +,Header 2 [Str "nowiki"] +,Para [Str "''not",Space,Str "emph''"] +,Header 2 [Str "strikeout"] +,Para [Strikeout [Str "This",Space,Str "is",Space,Emph [Str "struck",Space,Str "out"]]] +,Header 2 [Str "entities"] +,Para [Str "hi",Space,Str "&",Space,Str "low"] +,Para [Str "hi",Space,Str "&",Space,Str "low"] +,Para [Str "G\246del"] +,Para [Str "\777\2730"] +,Header 2 [Str "comments"] +,Para [Str "inline",Space,Str "comment"] +,Para [Str "between",Space,Str "blocks"] +,Header 2 [Str "linebreaks"] +,Para [Str "hi",LineBreak,Str "there"] +,Para [Str "hi",LineBreak,Str "there"]] diff --git a/tests/mediawiki-reader.wiki b/tests/mediawiki-reader.wiki new file mode 100644 index 000000000..6e51f1544 --- /dev/null +++ b/tests/mediawiki-reader.wiki @@ -0,0 +1,71 @@ += header = + +== header level two == + +===header level 3=== + +====header ''level'' four==== + +===== header level 5 ===== + +====== header level 6 ====== + +======= not a header ======== + + == not a header == + +== emph and strong == + +''emph'' '''strong''' + +'''''strong and emph''''' + +'''''emph inside'' strong''' + +'''strong with ''emph''''' + +'''''strong inside''' emph'' + +== horizontal rule == + +top +---- +bottom + +---- + +== nowiki == + +''not emph'' + +== strikeout == + + This is ''struck out'' + +== entities == + +hi & low + +hi & low + +Gödel + +̉પ + +== comments == + +inline comment + + + +between blocks + + + +== linebreaks == + +hi
there + +hi
+there +