Added CommonMark reader using cmark (libcmark bindings).

- Added commonmark as an input format.
- Added `Text.Pandoc.Readers.CommonMark.readCommonMark`.
- For now, we use the markdown writer to generate benchmark
  text for the CommonMark reader.  We can change this when we
  get a writer.
This commit is contained in:
John MacFarlane 2015-03-16 22:20:42 -07:00 committed by John MacFarlane
parent 5721a5d34b
commit e0d234e54d
5 changed files with 150 additions and 18 deletions

25
README
View file

@ -12,17 +12,18 @@ Description
Pandoc is a [Haskell] library for converting from one markup format to Pandoc is a [Haskell] library for converting from one markup format to
another, and a command-line tool that uses this library. It can read another, and a command-line tool that uses this library. It can read
[markdown] and (subsets of) [Textile], [reStructuredText], [HTML], [Markdown], [CommonMark], and (subsets of) [Textile],
[LaTeX], [MediaWiki markup], [TWiki markup], [Haddock markup], [OPML], [reStructuredText], [HTML], [LaTeX], [MediaWiki markup], [TWiki
[Emacs Org-mode], [DocBook], [txt2tags], [EPUB] and [Word docx]; and markup], [Haddock markup], [OPML], [Emacs Org-mode], [DocBook],
it can write plain text, [markdown], [reStructuredText], [XHTML], [txt2tags], [EPUB] and [Word docx]; and it can write plain text,
[HTML 5], [LaTeX] (including [beamer] slide shows), [ConTeXt], [RTF], [Markdown], [reStructuredText], [XHTML], [HTML 5], [LaTeX] (including
[OPML], [DocBook], [OpenDocument], [ODT], [Word docx], [GNU Texinfo], [beamer] slide shows), [ConTeXt], [RTF], [OPML], [DocBook],
[MediaWiki markup], [DokuWiki markup], [Haddock markup], [EPUB] (v2 or v3), [OpenDocument], [ODT], [Word docx], [GNU Texinfo], [MediaWiki markup],
[FictionBook2], [Textile], [groff man] pages, [Emacs Org-Mode], [AsciiDoc], [DokuWiki markup], [Haddock markup], [EPUB] (v2 or v3),
[InDesign ICML], and [Slidy], [Slideous], [DZSlides], [reveal.js] or [FictionBook2], [Textile], [groff man] pages, [Emacs Org-Mode],
[S5] HTML slide shows. It can also produce [PDF] output on systems where [AsciiDoc], [InDesign ICML], and [Slidy], [Slideous], [DZSlides],
LaTeX is installed. [reveal.js] or [S5] HTML slide shows. It can also produce [PDF] output
on systems where LaTeX is installed.
Pandoc's enhanced version of markdown includes syntax for footnotes, Pandoc's enhanced version of markdown includes syntax for footnotes,
tables, flexible ordered lists, definition lists, fenced code blocks, tables, flexible ordered lists, definition lists, fenced code blocks,
@ -159,6 +160,7 @@ General options
extended markdown), `markdown_strict` (original unextended markdown), extended markdown), `markdown_strict` (original unextended markdown),
`markdown_phpextra` (PHP Markdown Extra extended markdown), `markdown_phpextra` (PHP Markdown Extra extended markdown),
`markdown_github` (github extended markdown), `markdown_github` (github extended markdown),
`commonmark` (CommonMark markdown),
`textile` (Textile), `rst` (reStructuredText), `html` (HTML), `textile` (Textile), `rst` (reStructuredText), `html` (HTML),
`docbook` (DocBook), `t2t` (txt2tags), `docx` (docx), `epub` (EPUB), `docbook` (DocBook), `t2t` (txt2tags), `docx` (docx), `epub` (EPUB),
`opml` (OPML), `org` (Emacs Org-mode), `mediawiki` (MediaWiki markup), `opml` (OPML), `org` (Emacs Org-mode), `mediawiki` (MediaWiki markup),
@ -3238,3 +3240,4 @@ Rosenthal.
[EPUB]: http://idpf.org/epub [EPUB]: http://idpf.org/epub
[EPUBspine]: http://www.idpf.org/epub/301/spec/epub-publications.html#sec-spine-elem [EPUBspine]: http://www.idpf.org/epub/301/spec/epub-publications.html#sec-spine-elem
[KaTeX]: https://github.com/Khan/KaTeX [KaTeX]: https://github.com/Khan/KaTeX
[CommonMark]: http://commonmark.org

View file

@ -26,12 +26,18 @@ import Debug.Trace (trace)
readerBench :: Pandoc readerBench :: Pandoc
-> (String, ReaderOptions -> String -> IO Pandoc) -> (String, ReaderOptions -> String -> IO Pandoc)
-> Maybe Benchmark -> Maybe Benchmark
readerBench doc (name, reader) = case lookup name writers of readerBench doc (name, reader) =
Just (PureStringWriter writer) -> case lookup name writers of
let inp = writer def{ writerWrapText = True} doc Just (PureStringWriter writer) ->
in return $ bench (name ++ " reader") $ nfIO $ let inp = writer def{ writerWrapText = True} doc
(reader def{ readerSmart = True }) inp in return $ bench (name ++ " reader") $ nfIO $
_ -> trace ("\nCould not find writer for " ++ name ++ "\n") Nothing (reader def{ readerSmart = True }) inp
_ | name == "commonmark" ->
let inp = writeMarkdown def{ writerWrapText = True} doc
in return $ bench (name ++ " reader") $ nfIO $
(reader def{ readerSmart = True }) inp
| otherwise -> trace ("\nCould not find writer for " ++ name ++
"\n") Nothing
writerBench :: Pandoc writerBench :: Pandoc
-> (String, WriterOptions -> Pandoc -> String) -> (String, WriterOptions -> Pandoc -> String)

View file

@ -254,7 +254,8 @@ Library
old-time, old-time,
deepseq-generics >= 0.1 && < 0.2, deepseq-generics >= 0.1 && < 0.2,
JuicyPixels >= 3.1.6.1 && < 3.3, JuicyPixels >= 3.1.6.1 && < 3.3,
filemanip >= 0.3 && < 0.4 filemanip >= 0.3 && < 0.4,
cmark >= 0.3 && < 0.4
if flag(old-locale) if flag(old-locale)
Build-Depends: old-locale >= 1 && < 1.1, Build-Depends: old-locale >= 1 && < 1.1,
time >= 1.2 && < 1.5 time >= 1.2 && < 1.5
@ -292,6 +293,7 @@ Library
Text.Pandoc.Readers.HTML, Text.Pandoc.Readers.HTML,
Text.Pandoc.Readers.LaTeX, Text.Pandoc.Readers.LaTeX,
Text.Pandoc.Readers.Markdown, Text.Pandoc.Readers.Markdown,
Text.Pandoc.Readers.CommonMark,
Text.Pandoc.Readers.MediaWiki, Text.Pandoc.Readers.MediaWiki,
Text.Pandoc.Readers.RST, Text.Pandoc.Readers.RST,
Text.Pandoc.Readers.Org, Text.Pandoc.Readers.Org,

View file

@ -66,6 +66,7 @@ module Text.Pandoc
, mkStringReader , mkStringReader
, readDocx , readDocx
, readMarkdown , readMarkdown
, readCommonMark
, readMediaWiki , readMediaWiki
, readRST , readRST
, readOrg , readOrg
@ -124,6 +125,7 @@ import Text.Pandoc.Definition
import Text.Pandoc.Generic import Text.Pandoc.Generic
import Text.Pandoc.JSON import Text.Pandoc.JSON
import Text.Pandoc.Readers.Markdown import Text.Pandoc.Readers.Markdown
import Text.Pandoc.Readers.CommonMark
import Text.Pandoc.Readers.MediaWiki import Text.Pandoc.Readers.MediaWiki
import Text.Pandoc.Readers.RST import Text.Pandoc.Readers.RST
import Text.Pandoc.Readers.Org import Text.Pandoc.Readers.Org
@ -225,6 +227,7 @@ readers = [ ("native" , StringReader $ \_ s -> return $ readNative s)
,("markdown_phpextra" , mkStringReaderWithWarnings readMarkdownWithWarnings) ,("markdown_phpextra" , mkStringReaderWithWarnings readMarkdownWithWarnings)
,("markdown_github" , mkStringReaderWithWarnings readMarkdownWithWarnings) ,("markdown_github" , mkStringReaderWithWarnings readMarkdownWithWarnings)
,("markdown_mmd", mkStringReaderWithWarnings readMarkdownWithWarnings) ,("markdown_mmd", mkStringReaderWithWarnings readMarkdownWithWarnings)
,("commonmark" , mkStringReader readCommonMark)
,("rst" , mkStringReaderWithWarnings readRSTWithWarnings ) ,("rst" , mkStringReaderWithWarnings readRSTWithWarnings )
,("mediawiki" , mkStringReader readMediaWiki) ,("mediawiki" , mkStringReader readMediaWiki)
,("docbook" , mkStringReader readDocBook) ,("docbook" , mkStringReader readDocBook)

View file

@ -0,0 +1,118 @@
{-
Copyright (C) 2015 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.CommonMark
Copyright : Copyright (C) 2015 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Conversion of CommonMark-formatted plain text to 'Pandoc' document.
CommonMark is a strongly specified variant of Markdown: http://commonmark.org.
-}
module Text.Pandoc.Readers.CommonMark (readCommonMark)
where
import CMark
import Data.Text (unpack, pack)
import Data.List (groupBy)
import Text.Pandoc.Definition
import Text.Pandoc.Options
-- | Parse a CommonMark formatted string into a 'Pandoc' structure.
readCommonMark :: ReaderOptions -> String -> Pandoc
readCommonMark opts = nodeToPandoc . commonmarkToNode opts' . pack
where opts' = if readerSmart opts
then [optNormalize, optSmart]
else [optNormalize]
nodeToPandoc :: Node -> Pandoc
nodeToPandoc (Node _ DOCUMENT nodes) =
Pandoc nullMeta $ foldr addBlock [] nodes
nodeToPandoc n = -- shouldn't happen
Pandoc nullMeta $ foldr addBlock [] [n]
addBlocks :: [Node] -> [Block]
addBlocks = foldr addBlock []
addBlock :: Node -> [Block] -> [Block]
addBlock (Node _ PARAGRAPH nodes) =
(Para (addInlines nodes) :)
addBlock (Node _ HRULE _) =
(HorizontalRule :)
addBlock (Node _ BLOCK_QUOTE nodes) =
(BlockQuote (addBlocks nodes) :)
addBlock (Node _ (HTML t) _) =
(RawBlock (Format "html") (unpack t) :)
addBlock (Node _ (CODE_BLOCK info t) _) =
(CodeBlock ("", take 1 (words (unpack info)), []) (unpack t) :)
addBlock (Node _ (HEADER lev) nodes) =
(Header lev ("",[],[]) (addInlines nodes) :)
addBlock (Node _ (LIST listAttrs) nodes) =
(constructor (map (setTightness . addBlocks . children) nodes) :)
where constructor = case listType listAttrs of
BULLET_LIST -> BulletList
ORDERED_LIST -> OrderedList
(start, DefaultStyle, delim)
start = listStart listAttrs
setTightness = if listTight listAttrs
then map paraToPlain
else id
paraToPlain (Para xs) = Plain (xs)
paraToPlain x = x
delim = case listDelim listAttrs of
PERIOD_DELIM -> Period
PAREN_DELIM -> OneParen
addBlock (Node _ ITEM nodes) = id -- handled in LIST
addBlock _ = id
children :: Node -> [Node]
children (Node _ _ ns) = ns
addInlines :: [Node] -> [Inline]
addInlines = foldr addInline []
addInline :: Node -> [Inline] -> [Inline]
addInline (Node _ (TEXT t) _) = (map toinl clumps ++)
where raw = unpack t
clumps = groupBy samekind raw
samekind ' ' ' ' = True
samekind ' ' _ = False
samekind _ ' ' = False
samekind _ _ = True
toinl (' ':_) = Space
toinl xs = Str xs
addInline (Node _ LINEBREAK _) = (LineBreak :)
addInline (Node _ SOFTBREAK _) = (Space :)
addInline (Node _ (INLINE_HTML t) _) =
(RawInline (Format "html") (unpack t) :)
addInline (Node _ (CODE t) _) =
(Code ("",[],[]) (unpack t) :)
addInline (Node _ EMPH nodes) =
(Emph (addInlines nodes) :)
addInline (Node _ STRONG nodes) =
(Strong (addInlines nodes) :)
addInline (Node _ (LINK url title) nodes) =
(Link (addInlines nodes) (unpack url, unpack title) :)
addInline (Node _ (IMAGE url title) nodes) =
(Image (addInlines nodes) (unpack url, unpack title) :)
addInline _ = id