Add reader for Haddock markup based on Haddock's own lexer/parser.

This commit is contained in:
David Lazar 2013-03-28 14:53:10 -07:00
parent ee0fc19bc5
commit 18459b95ba
5 changed files with 395 additions and 1 deletions

View file

@ -223,6 +223,7 @@ Library
Build-Depends: base >= 4.2 && <5,
syb >= 0.1 && < 0.5,
containers >= 0.1 && < 0.6,
array >= 0.3 && < 0.5,
parsec >= 3.1 && < 3.2,
mtl >= 1.1 && < 2.2,
network >= 2 && < 2.5,
@ -287,6 +288,7 @@ Library
Text.Pandoc.Readers.TeXMath,
Text.Pandoc.Readers.Textile,
Text.Pandoc.Readers.Native,
Text.Pandoc.Readers.Haddock,
Text.Pandoc.Writers.Native,
Text.Pandoc.Writers.Docbook,
Text.Pandoc.Writers.OPML,
@ -313,7 +315,9 @@ Library
Text.Pandoc.XML,
Text.Pandoc.Biblio,
Text.Pandoc.SelfContained
Other-Modules: Text.Pandoc.MIME,
Other-Modules: Text.Pandoc.Readers.Haddock.Lex,
Text.Pandoc.MIME,
Text.Pandoc.Readers.Haddock.Parse,
Text.Pandoc.Parsing,
Text.Pandoc.UUID,
Text.Pandoc.ImageSize,

View file

@ -73,6 +73,7 @@ module Text.Pandoc
, readTextile
, readDocBook
, readOPML
, readHaddock
, readNative
-- * Writers: converting /from/ Pandoc format
, Writer (..)
@ -120,6 +121,7 @@ import Text.Pandoc.Readers.LaTeX
import Text.Pandoc.Readers.HTML
import Text.Pandoc.Readers.Textile
import Text.Pandoc.Readers.Native
import Text.Pandoc.Readers.Haddock
import Text.Pandoc.Writers.Native
import Text.Pandoc.Writers.Markdown
import Text.Pandoc.Writers.RST
@ -200,6 +202,7 @@ readers = [("native" , \_ s -> return $ readNative s)
,("textile" , \o s -> return $ readTextile o s) -- TODO : textile+lhs
,("html" , \o s -> return $ readHtml o s)
,("latex" , \o s -> return $ readLaTeX o s)
,("haddock" , \o s -> return $ readHaddock o s)
]
data Writer = PureStringWriter (WriterOptions -> Pandoc -> String)

View file

@ -0,0 +1,39 @@
{- |
Module : Text.Pandoc.Readers.Haddock
Copyright : Copyright (C) 2013 David Lazar
License : GNU GPL, version 2 or above
Maintainer : David Lazar <lazar6@illinois.edu>
Stability : alpha
Conversion of Haddock markup to 'Pandoc' document.
-}
module Text.Pandoc.Readers.Haddock
( readHaddock
) where
import Text.Pandoc.Builder
import Text.Pandoc.Options
import Text.Pandoc.Readers.Haddock.Lex
import Text.Pandoc.Readers.Haddock.Parse
-- | Parse Haddock markup and return a 'Pandoc' document.
readHaddock :: ReaderOptions -- ^ Reader options
-> String -- ^ String to parse
-> Pandoc
readHaddock _ s = Pandoc (Meta [] [] []) blocks
where
blocks = case parseParas (tokenise s (0,0)) of
Nothing -> []
Just x -> mergeLists (toList x)
-- similar to 'docAppend' in Haddock.Doc
mergeLists :: [Block] -> [Block]
mergeLists (BulletList xs : BulletList ys : blocks)
= mergeLists (BulletList (xs ++ ys) : blocks)
mergeLists (OrderedList _ xs : OrderedList a ys : blocks)
= mergeLists (OrderedList a (xs ++ ys) : blocks)
mergeLists (DefinitionList xs : DefinitionList ys : blocks)
= mergeLists (DefinitionList (xs ++ ys) : blocks)
mergeLists (x : blocks) = x : mergeLists blocks
mergeLists [] = []

View file

@ -0,0 +1,169 @@
--
-- Haddock - A Haskell Documentation Tool
--
-- (c) Simon Marlow 2002
--
-- This file was modified and integrated into GHC by David Waern 2006.
-- Then moved back into Haddock by Isaac Dupree in 2009 :-)
-- Then copied into Pandoc by David Lazar in 2013 :-D
{
{-# LANGUAGE BangPatterns #-} -- Generated by Alex
{-# OPTIONS -Wwarn -w #-}
-- The above warning supression flag is a temporary kludge.
-- While working on this module you are encouraged to remove it and fix
-- any warnings in the module. See
-- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
-- for details
module Text.Pandoc.Readers.Haddock.Lex (
Token(..),
LToken,
tokenise
) where
import Data.Char
import Numeric (readHex)
}
%wrapper "posn"
$ws = $white # \n
$digit = [0-9]
$hexdigit = [0-9a-fA-F]
$special = [\"\@]
$alphanum = [A-Za-z0-9]
$ident = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~\:]
:-
-- beginning of a paragraph
<0,para> {
$ws* \n ;
$ws* \> { begin birdtrack }
$ws* prop \> .* \n { strtoken TokProperty `andBegin` property}
$ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr }
$ws* [\*\-] { token TokBullet `andBegin` string }
$ws* \[ { token TokDefStart `andBegin` def }
$ws* \( $digit+ \) { token TokNumber `andBegin` string }
$ws* $digit+ \. { token TokNumber `andBegin` string }
$ws* { begin string }
}
-- beginning of a line
<line> {
$ws* \> { begin birdtrack }
$ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr }
$ws* \n { token TokPara `andBegin` para }
-- Here, we really want to be able to say
-- $ws* (\n | <eof>) { token TokPara `andBegin` para}
-- because otherwise a trailing line of whitespace will result in
-- a spurious TokString at the end of a docstring. We don't have <eof>,
-- though (NOW I realise what it was for :-). To get around this, we always
-- append \n to the end of a docstring.
() { begin string }
}
<birdtrack> .* \n? { strtokenNL TokBirdTrack `andBegin` line }
<property> () { token TokPara `andBegin` para }
<example> {
$ws* \n { token TokPara `andBegin` para }
$ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr }
() { begin exampleresult }
}
<exampleexpr> .* \n { strtokenNL TokExampleExpression `andBegin` example }
<exampleresult> .* \n { strtokenNL TokExampleResult `andBegin` example }
<string,def> {
$special { strtoken $ \s -> TokSpecial (head s) }
\<\< [^\>]* \>\> { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) }
\< [^\>]* \> { strtoken $ \s -> TokURL (init (tail s)) }
\# [^\#]* \# { strtoken $ \s -> TokAName (init (tail s)) }
\/ [^\/]* \/ { strtoken $ \s -> TokEmphasis (init (tail s)) }
[\'\`] $ident+ [\'\`] { strtoken $ \s -> TokIdent (init (tail s)) }
\\ . { strtoken (TokString . tail) }
"&#" $digit+ \; { strtoken $ \s -> TokString [chr (read (init (drop 2 s)))] }
"&#" [xX] $hexdigit+ \;
{ strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] -> TokString [chr n] }
-- allow special characters through if they don't fit one of the previous
-- patterns.
[\/\'\`\<\#\&\\] { strtoken TokString }
[^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line }
[^ $special \/ \< \# \n \'\` \& \\ \]]+ { strtoken TokString }
}
<def> {
\] { token TokDefEnd `andBegin` string }
}
-- ']' doesn't have any special meaning outside of the [...] at the beginning
-- of a definition paragraph.
<string> {
\] { strtoken TokString }
}
{
-- | A located token
type LToken = (Token, AlexPosn)
data Token
= TokPara
| TokNumber
| TokBullet
| TokDefStart
| TokDefEnd
| TokSpecial Char
| TokIdent String
| TokString String
| TokURL String
| TokPic String
| TokEmphasis String
| TokAName String
| TokBirdTrack String
| TokProperty String
| TokExamplePrompt String
| TokExampleExpression String
| TokExampleResult String
-- deriving Show
tokenPos :: LToken -> (Int, Int)
tokenPos t = let AlexPn _ line col = snd t in (line, col)
type StartCode = Int
type Action = AlexPosn -> String -> StartCode -> (StartCode -> [LToken]) -> [LToken]
tokenise :: String -> (Int, Int) -> [LToken]
tokenise str (line, col) = go (posn,'\n',[],eofHack str) para
where posn = AlexPn 0 line col
go inp@(pos,_,_,str) sc =
case alexScan inp sc of
AlexEOF -> []
AlexError _ -> []
AlexSkip inp' len -> go inp' sc
AlexToken inp' len act -> act pos (take len str) sc (\sc -> go inp' sc)
-- NB. we add a final \n to the string, (see comment in the beginning of line
-- production above).
eofHack str = str++"\n"
andBegin :: Action -> StartCode -> Action
andBegin act new_sc = \pos str _ cont -> act pos str new_sc cont
token :: Token -> Action
token t = \pos _ sc cont -> (t, pos) : cont sc
strtoken, strtokenNL :: (String -> Token) -> Action
strtoken t = \pos str sc cont -> (t str, pos) : cont sc
strtokenNL t = \pos str sc cont -> (t (filter (/= '\r') str), pos) : cont sc
-- ^ We only want LF line endings in our internal doc string format, so we
-- filter out all CRs.
begin :: StartCode -> Action
begin sc = \_ _ _ cont -> cont sc
}

View file

@ -0,0 +1,179 @@
-- This code was copied from the 'haddock' package, modified, and integrated
-- into Pandoc by David Lazar.
{
{-# LANGUAGE BangPatterns #-} -- required for versions of Happy before 1.18.6
{-# OPTIONS -Wwarn -w #-}
-- The above warning supression flag is a temporary kludge.
-- While working on this module you are encouraged to remove it and fix
-- any warnings in the module. See
-- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
-- for details
module Text.Pandoc.Readers.Haddock.Parse (parseString, parseParas) where
import Text.Pandoc.Readers.Haddock.Lex
import Text.Pandoc.Builder
import Data.Generics (everywhere, mkT)
import Data.Char (isSpace)
import Data.Maybe (fromMaybe)
import Data.List (stripPrefix)
import Data.Monoid (mempty)
}
%expect 0
%tokentype { LToken }
%token
'/' { (TokSpecial '/',_) }
'@' { (TokSpecial '@',_) }
'[' { (TokDefStart,_) }
']' { (TokDefEnd,_) }
DQUO { (TokSpecial '\"',_) }
URL { (TokURL $$,_) }
PIC { (TokPic $$,_) }
ANAME { (TokAName $$,_) }
'/../' { (TokEmphasis $$,_) }
'-' { (TokBullet,_) }
'(n)' { (TokNumber,_) }
'>..' { (TokBirdTrack $$,_) }
PROP { (TokProperty $$,_) }
PROMPT { (TokExamplePrompt $$,_) }
RESULT { (TokExampleResult $$,_) }
EXP { (TokExampleExpression $$,_) }
IDENT { (TokIdent $$,_) }
PARA { (TokPara,_) }
STRING { (TokString $$,_) }
%monad { Maybe }
%name parseParas doc
%name parseString seq
%%
doc :: { Blocks }
: apara PARA doc { $1 <> $3 }
| PARA doc { $2 }
| apara { $1 }
| {- empty -} { mempty }
apara :: { Blocks }
: ulpara { bulletList [$1] }
| olpara { orderedList [$1] }
| defpara { definitionList [$1] }
| para { $1 }
ulpara :: { Blocks }
: '-' para { $2 }
olpara :: { Blocks }
: '(n)' para { $2 }
defpara :: { (Inlines, [Blocks]) }
: '[' seq ']' seq { ($2, [plain $4]) }
para :: { Blocks }
: seq { para $1 }
| codepara { codeBlock $1 }
| property { $1 }
| examples { $1 }
codepara :: { String }
: '>..' codepara { $1 ++ $2 }
| '>..' { $1 }
property :: { Blocks }
: PROP { makeProperty $1 }
examples :: { Blocks }
: example examples { $1 <> $2 }
| example { $1 }
example :: { Blocks }
: PROMPT EXP result { makeExample $1 $2 (lines $3) }
| PROMPT EXP { makeExample $1 $2 [] }
result :: { String }
: RESULT result { $1 ++ $2 }
| RESULT { $1 }
seq :: { Inlines }
: elem seq { $1 <> $2 }
| elem { $1 }
elem :: { Inlines }
: elem1 { $1 }
| '@' seq1 '@' { monospace $2 }
seq1 :: { Inlines }
: PARA seq1 { linebreak <> $2 }
| elem1 seq1 { $1 <> $2 }
| elem1 { $1 }
elem1 :: { Inlines }
: STRING { str $1 }
| '/../' { emph (str $1) }
| URL { makeHyperlink $1 }
| PIC { image $1 $1 mempty }
| ANAME { mempty } -- TODO
| IDENT { code $1 }
| DQUO strings DQUO { code $2 }
strings :: { String }
: STRING { $1 }
| STRING strings { $1 ++ $2 }
{
happyError :: [LToken] -> Maybe a
happyError toks = Nothing
monospace :: Inlines -> Inlines
monospace = everywhere (mkT go)
where
go (Str s) = Code nullAttr s
go Space = Code nullAttr " "
go x = x
-- | Create a `Hyperlink` from given string.
--
-- A hyperlink consists of a URL and an optional label. The label is separated
-- from the url by one or more whitespace characters.
makeHyperlink :: String -> Inlines
makeHyperlink input = case break isSpace $ strip input of
(url, "") -> link url url (str url)
(url, lb) -> link url url (str label)
where label = dropWhile isSpace lb
makeProperty :: String -> Blocks
makeProperty s = case strip s of
'p':'r':'o':'p':'>':xs ->
codeBlockWith ([], ["property"], []) (dropWhile isSpace xs)
xs ->
error $ "makeProperty: invalid input " ++ show xs
-- | Create an 'Example', stripping superfluous characters as appropriate
makeExample :: String -> String -> [String] -> Blocks
makeExample prompt expression result =
para $ codeWith ([], ["expr"], []) (strip expression ++ "\n")
<> codeWith ([], ["result"], []) (unlines result')
where
-- 1. drop trailing whitespace from the prompt, remember the prefix
(prefix, _) = span isSpace prompt
-- 2. drop, if possible, the exact same sequence of whitespace
-- characters from each result line
--
-- 3. interpret lines that only contain the string "<BLANKLINE>" as an
-- empty line
result' = map (substituteBlankLine . tryStripPrefix prefix) result
where
tryStripPrefix xs ys = fromMaybe ys $ stripPrefix xs ys
substituteBlankLine "<BLANKLINE>" = ""
substituteBlankLine line = line
-- | Remove all leading and trailing whitespace
strip :: String -> String
strip = dropWhile isSpace . reverse . dropWhile isSpace . reverse
}