Added basic mediawiki reader.

Text.Pandoc.Readers.MediaWiki module, tests/mediawiki-reader.{txt,native}.
2012-09-10 10:02:12 -07:00 · 2012-09-10 10:02:12 -07:00 · 3d361b2101
commit 3d361b2101
parent 167012daf7
8 changed files with 435 additions and 7 deletions
--- a/pandoc.cabal
+++ b/pandoc.cabal
@ -16,10 +16,11 @@ Synopsis:        Conversion between markup formats
 Description:     Pandoc is a Haskell library for converting from one markup
                 format to another, and a command-line tool that uses
                 this library. It can read markdown and (subsets of) HTML,
-                 reStructuredText, LaTeX, DocBook, and Textile, and it can write
+                 reStructuredText, LaTeX, DocBook, MediaWiki markup,
-                 markdown, reStructuredText, HTML, LaTeX, ConTeXt, Docbook,
+                 and Textile, and it can write markdown, reStructuredText,
-                 OpenDocument, ODT, Word docx, RTF, MediaWiki, Textile,
+                 HTML, LaTeX, ConTeXt, Docbook, OpenDocument, ODT,
-                 groff man pages, plain text, Emacs Org-Mode, AsciiDoc, EPUB,
+                 Word docx, RTF, MediaWiki, Textile, groff man pages,
                 plain text, Emacs Org-Mode, AsciiDoc, EPUB,
                 FictionBook2, and S5, Slidy and Slideous HTML slide shows.
                 .
                 Pandoc extends standard markdown syntax with footnotes,
@ -120,6 +121,8 @@ Extra-Source-Files:
                 tests/markdown-citations.mhra.txt,
                 tests/markdown-citations.ieee.txt,
                 tests/textile-reader.textile,
                 tests/mediawiki-reader.wiki,
                 tests/mediawiki-reader.native,
                 tests/rst-reader.native,
                 tests/rst-reader.rst,
                 tests/s5.basic.html,
@ -262,6 +265,7 @@ Library
                   Text.Pandoc.Readers.HTML,
                   Text.Pandoc.Readers.LaTeX,
                   Text.Pandoc.Readers.Markdown,
                   Text.Pandoc.Readers.MediaWiki,
                   Text.Pandoc.Readers.RST,
                   Text.Pandoc.Readers.DocBook,
                   Text.Pandoc.Readers.TeXMath,
--- a/src/Text/Pandoc.hs
+++ b/src/Text/Pandoc.hs
@ -66,6 +66,7 @@ module Text.Pandoc
               , writers
               -- * Readers: converting /to/ Pandoc format
               , readMarkdown
               , readMediaWiki
               , readRST
               , readLaTeX
               , readHtml
@ -110,6 +111,7 @@ module Text.Pandoc
 import Text.Pandoc.Definition
 import Text.Pandoc.Generic
 import Text.Pandoc.Readers.Markdown
 import Text.Pandoc.Readers.MediaWiki
 import Text.Pandoc.Readers.RST
 import Text.Pandoc.Readers.DocBook
 import Text.Pandoc.Readers.LaTeX
@ -179,6 +181,7 @@ readers = [("native"       , \_ -> readNative)
          ,("markdown_strict" , readMarkdown)
          ,("markdown"     , readMarkdown)
          ,("rst"          , readRST)
          ,("mediawiki"    , readMediaWiki)
          ,("docbook"      , readDocBook)
          ,("textile"      , readTextile) -- TODO : textile+lhs
          ,("html"         , readHtml)
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@ -1,6 +1,4 @@
 {-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7
 {-# LANGUAGE FlexibleInstances, TypeSynonymInstances,
    GeneralizedNewtypeDeriving #-}
 {-
 Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu>
--- a/src/Text/Pandoc/Readers/MediaWiki.hs
+++ b/src/Text/Pandoc/Readers/MediaWiki.hs
@ -0,0 +1,311 @@
 {-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7
 {-
 Copyright (C) 2012 John MacFarlane <jgm@berkeley.edu>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 -}
 {- |
   Module      : Text.Pandoc.Readers.MediaWiki
   Copyright   : Copyright (C) 2012 John MacFarlane
   License     : GNU GPL, version 2 or above
   Maintainer  : John MacFarlane <jgm@berkeley.edu>
   Stability   : alpha
   Portability : portable
 Conversion of mediawiki text to 'Pandoc' document.
 -}
 {-
 TODO:
 _ fix pre parser -- it should use html tagsoup parsers,
  then just strip out the text from text tags.
 _ correctly handle skipped level in list, e.g. # to ###
 _ tests for lists
 _ support HTML lists
 _ support list style attributes and start values in ol lists, also
  value attribute on li
 _ support <p> tags in lists (and out?)
 _ support :, ::, etc. for indent (treat as list continuation paras?)
 _ support preformatted text (lines starting with space)
 _ support preformatted text blocks
 _ code highlighting: http://www.mediawiki.org/wiki/Extension:SyntaxHighlight_GeSHi <syntaxhighlight lang="php"> (alternativel, <source...>)
  if 'line' attribute present, number lines
  if 'start' present, set starting line number
 _ support internal links http://www.mediawiki.org/wiki/Help:Links
 _ support external links
 _ support automatic linkification of URLs
 _ support images http://www.mediawiki.org/wiki/Help:Images
 _ ignore gallery tag?
 _ support tables http://www.mediawiki.org/wiki/Help:Tables
 _ support <math> tag for latex math
 _ templates or anything in {{}} -> handle as raw wikimedia, can be dealt with in
  postprocessing
 _ category links
 _ tests for raw html inline
 _ tests for sup, sub, del
 _ tests for pre, haskell
 _ tests for code, tt, hask
 _ test for blockquote
 -}
 module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where
 import Text.Pandoc.Definition
 import qualified Text.Pandoc.Builder as B
 import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>))
 import Text.Pandoc.Options
 import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced,
           isInlineTag, isBlockTag, isTextTag, isCommentTag )
 import Text.Pandoc.XML ( fromEntities )
 import Text.Pandoc.Parsing
 import Text.Pandoc.Shared ( stripTrailingNewlines )
 import Data.Monoid (mconcat, mempty)
 import Control.Applicative ((<$>), (<*), (*>), (<$))
 import Control.Monad
 import Data.List (intersperse)
 import Text.HTML.TagSoup
 -- | Read mediawiki from an input string and return a Pandoc document.
 readMediaWiki :: ReaderOptions -- ^ Reader options
               -> String        -- ^ String to parse (assuming @'\n'@ line endings)
               -> Pandoc
 readMediaWiki opts s =
  (readWith parseMediaWiki) def{ stateOptions = opts } (s ++ "\n\n")
 type MWParser = Parser [Char] ParserState
 --
 -- auxiliary functions
 --
 specialChars :: [Char]
 specialChars = "'[]<=&*"
 spaceChars :: [Char]
 spaceChars = " \n\t"
 sym :: String -> MWParser ()
 sym s = () <$ try (string s)
 htmlComment :: MWParser ()
 htmlComment = () <$ htmlTag isCommentTag
 inlinesInTags :: String -> MWParser Inlines
 inlinesInTags tag = trimInlines . mconcat <$> try
  (htmlTag (~== TagOpen tag []) *>
   manyTill inline (htmlTag (~== TagClose tag)))
 blocksInTags :: String -> MWParser Blocks
 blocksInTags tag = mconcat <$> try
  (htmlTag (~== TagOpen tag []) *>
   manyTill block (htmlTag (~== TagClose tag)))
 charsInTags :: String -> MWParser [Char]
 charsInTags tag = fromEntities <$> try
  (htmlTag (~== TagOpen tag []) *>
   manyTill anyChar (htmlTag (~== TagClose tag)))
 --
 -- main parser
 --
 parseMediaWiki :: MWParser Pandoc
 parseMediaWiki = do
  bs <- mconcat <$> many block
  spaces
  eof
  return $ B.doc bs
 --
 -- block parsers
 --
 block :: MWParser Blocks
 block = header
     <|> hrule
     <|> bulletList
     <|> orderedList
     <|> definitionList
     <|> blockquote
     <|> codeblock
     <|> haskell
     <|> mempty <$ skipMany1 blankline
     <|> mempty <$ try (spaces *> htmlComment)
     <|> para
 para :: MWParser Blocks
 para = B.para . trimInlines . mconcat <$> many1 inline
 hrule :: MWParser Blocks
 hrule = B.horizontalRule <$ try (string "----" *> many (char '-') *> newline)
 blockquote :: MWParser Blocks
 blockquote = B.blockQuote <$> blocksInTags "blockquote"
 codeblock :: MWParser Blocks
 codeblock = B.codeBlock . trimCode <$> charsInTags "pre"
 trimCode :: String -> String
 trimCode ('\n':xs) = stripTrailingNewlines xs
 trimCode xs        = stripTrailingNewlines xs
 haskell :: MWParser Blocks
 haskell = B.codeBlockWith ("",["haskell"],[]) . trimCode <$>
             charsInTags "haskell"
 header :: MWParser Blocks
 header = try $ do
  col <- sourceColumn <$> getPosition
  guard $ col == 1  -- header must be at beginning of line
  eqs <- many1 (char '=')
  let lev = length eqs
  guard $ lev <= 6
  contents <- trimInlines . mconcat <$> manyTill inline (count lev $ char '=')
  return $ B.header lev contents
 bulletList :: MWParser Blocks
 bulletList = B.bulletList <$> many1 (listItem '*')
 orderedList :: MWParser Blocks
 orderedList = B.orderedList <$> many1 (listItem '#')
 definitionList :: MWParser Blocks
 definitionList = B.definitionList <$> many1 defListItem
 defListItem :: MWParser (Inlines, [Blocks])
 defListItem = try $ do
  terms <- mconcat . intersperse B.linebreak <$> many1 defListTerm
  defs  <- many1 $ listItem ':'
  return (terms, defs)
 defListTerm  :: MWParser Inlines
 defListTerm = char ';' >> skipMany spaceChar >> manyTill anyChar newline >>=
  parseFromString (trimInlines . mconcat <$> many inline)
 listStart :: Char -> MWParser ()
 listStart c = char c *> notFollowedBy listStartChar
 listStartChar :: MWParser Char
 listStartChar = oneOf "*#;:"
 anyListStart :: MWParser ()
 anyListStart = listStart '*' <|> listStart '#' <|> listStart ';'
 listItem :: Char -> MWParser Blocks
 listItem c = try $ do
  listStart c
  first <- manyTill anyChar newline
  rest <- many (try $ char c *> lookAhead listStartChar *>
                   manyTill anyChar newline)
  parseFromString (mconcat <$> many1 block) $ unlines $ first : rest
 --
 -- inline parsers
 --
 inline :: MWParser Inlines
 inline =  whitespace
      <|> url
      <|> str
      <|> strong
      <|> emph
      <|> nowiki
      <|> linebreak
      <|> externalLink
      <|> strikeout
      <|> subscript
      <|> superscript
      <|> code
      <|> hask
      <|> B.singleton <$> charRef
      <|> inlineHtml
      <|> special
 str :: MWParser Inlines
 str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars)
 special :: MWParser Inlines
 special = B.str <$> count 1 (notFollowedBy' (htmlTag isBlockTag) *>
                             oneOf specialChars)
 inlineHtml :: MWParser Inlines
 inlineHtml = B.rawInline "html" . snd <$> htmlTag isInlineTag
 whitespace :: MWParser Inlines
 whitespace = B.space <$ (skipMany1 spaceChar <|> endline <|> htmlComment)
 endline :: MWParser ()
 endline = () <$ try (newline <*
                     notFollowedBy blankline <*
                     notFollowedBy' hrule <*
                     notFollowedBy anyListStart)
 linebreak :: MWParser Inlines
 linebreak = B.linebreak <$
  (htmlTag (~== TagOpen "br" []) *>
   optional (htmlTag (~== TagClose "br")) *>
   optional blankline)
 externalLink :: MWParser Inlines
 externalLink = try $ do
  char '['
  (orig, src) <- uri
  skipMany1 spaceChar
  lab <- manyTill inline (char ']')
  let lab' = if null lab
                then [B.str "1"] -- TODO generate sequentially from state
                else lab
  return $ B.link src "" $ trimInlines $ mconcat lab'
 url :: MWParser Inlines
 url = do
  (_, src) <- uri
  return $ B.link src "" (B.str orig)
 nowiki :: MWParser Inlines
 nowiki = B.text <$> charsInTags "nowiki"
 strikeout :: MWParser Inlines
 strikeout = B.strikeout <$> (inlinesInTags "strike" <|> inlinesInTags "del")
 superscript :: MWParser Inlines
 superscript = B.superscript <$> inlinesInTags "sup"
 subscript :: MWParser Inlines
 subscript = B.subscript <$> inlinesInTags "sub"
 code :: MWParser Inlines
 code = B.code <$> (charsInTags "code" <|> charsInTags "tt")
 hask :: MWParser Inlines
 hask = B.codeWith ("",["haskell"],[]) <$> charsInTags "hask"
 -- | Parses a list of inlines between start and end delimiters.
 inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines
 inlinesBetween start end =
  (trimInlines . mconcat) <$> try (start >> many1Till inner end)
    where inner      = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline)
          innerSpace = try $ whitespace >>~ notFollowedBy' end
 emph :: MWParser Inlines
 emph = B.emph <$> nested (inlinesBetween start end)
    where start = sym "''" >> lookAhead nonspaceChar
          end   = try $ notFollowedBy' (() <$ strong) >> sym "''"
 strong :: MWParser Inlines
 strong = B.strong <$> nested (inlinesBetween start end)
    where start = sym "'''" >> lookAhead nonspaceChar
          end   = try $ sym "'''"
--- a/src/pandoc.hs
+++ b/src/pandoc.hs
@ -721,6 +721,7 @@ defaultReaderName fallback (x:xs) =
    ".rst"      -> "rst"
    ".lhs"      -> "markdown+lhs"
    ".db"       -> "docbook"
    ".wiki"     -> "mediawiki"
    ".textile"  -> "textile"
    ".native"   -> "native"
    ".json"     -> "json"
--- a/tests/Tests/Old.hs
+++ b/tests/Tests/Old.hs
@ -119,9 +119,14 @@ tests = [ testGroup "markdown"
          , fb2WriterTest "math" [] "fb2.math.markdown" "fb2.math.fb2"
          , fb2WriterTest "testsuite" [] "testsuite.native" "writer.fb2"
          ]
        , testGroup "mediawiki"
          [ testGroup "writer" $ writerTests "mediawiki"
          , test "reader" ["-r", "mediawiki", "-w", "native", "-s"]
            "mediawiki-reader.wiki" "mediawiki-reader.native"
          ]
        , testGroup "other writers" $ map (\f -> testGroup f $ writerTests f)
          [ "opendocument" , "context" , "texinfo"
-          , "man" , "plain" , "mediawiki", "rtf", "org", "asciidoc"
+          , "man" , "plain" , "rtf", "org", "asciidoc"
          ]
        ]
--- a/tests/mediawiki-reader.native
+++ b/tests/mediawiki-reader.native
@ -0,0 +1,35 @@
 Pandoc (Meta {docTitle = [], docAuthors = [], docDate = []})
 [Header 1 [Str "header"]
 ,Header 2 [Str "header",Space,Str "level",Space,Str "two"]
 ,Header 3 [Str "header",Space,Str "level",Space,Str "3"]
 ,Header 4 [Str "header",Space,Emph [Str "level"],Space,Str "four"]
 ,Header 5 [Str "header",Space,Str "level",Space,Str "5"]
 ,Header 6 [Str "header",Space,Str "level",Space,Str "6"]
 ,Para [Str "=======",Space,Str "not",Space,Str "a",Space,Str "header",Space,Str "========"]
 ,Para [Str "==",Space,Str "not",Space,Str "a",Space,Str "header",Space,Str "=="]
 ,Header 2 [Str "emph",Space,Str "and",Space,Str "strong"]
 ,Para [Emph [Str "emph"],Space,Strong [Str "strong"]]
 ,Para [Strong [Emph [Str "strong",Space,Str "and",Space,Str "emph"]]]
 ,Para [Strong [Emph [Str "emph",Space,Str "inside"],Space,Str "strong"]]
 ,Para [Strong [Str "strong",Space,Str "with",Space,Emph [Str "emph"]]]
 ,Para [Emph [Strong [Str "strong",Space,Str "inside"],Space,Str "emph"]]
 ,Header 2 [Str "horizontal",Space,Str "rule"]
 ,Para [Str "top"]
 ,HorizontalRule
 ,Para [Str "bottom"]
 ,HorizontalRule
 ,Header 2 [Str "nowiki"]
 ,Para [Str "''not",Space,Str "emph''"]
 ,Header 2 [Str "strikeout"]
 ,Para [Strikeout [Str "This",Space,Str "is",Space,Emph [Str "struck",Space,Str "out"]]]
 ,Header 2 [Str "entities"]
 ,Para [Str "hi",Space,Str "&",Space,Str "low"]
 ,Para [Str "hi",Space,Str "&",Space,Str "low"]
 ,Para [Str "G\246del"]
 ,Para [Str "\777\2730"]
 ,Header 2 [Str "comments"]
 ,Para [Str "inline",Space,Str "comment"]
 ,Para [Str "between",Space,Str "blocks"]
 ,Header 2 [Str "linebreaks"]
 ,Para [Str "hi",LineBreak,Str "there"]
 ,Para [Str "hi",LineBreak,Str "there"]]
--- a/tests/mediawiki-reader.wiki
+++ b/tests/mediawiki-reader.wiki
@ -0,0 +1,71 @@
 = header =
 == header level two ==
 ===header level 3===
 ====header ''level'' four====
 ===== header level 5 =====
 ====== header level 6 ======
 ======= not a header ========
 == not a header ==
 == emph and strong ==
 ''emph'' '''strong'''
 '''''strong and emph'''''
 '''''emph inside'' strong'''
 '''strong with ''emph'''''
 '''''strong inside''' emph''
 == horizontal rule ==
 top
 ----
 bottom
 ----
 == nowiki ==
 <nowiki>''not emph''</nowiki>
 == strikeout ==
 <strike> This is ''struck out''</strike>
 == entities ==
 hi & low
 hi &amp; low
 G&ouml;del
 &#777;&#xAAA;
 == comments ==
 inline<!-- secret --> comment
 <!-- secret -->
 between blocks
 <!-- secret -->
 == linebreaks ==
 hi<br/>there
 hi<br>
 there