Add reader for Haddock markup based on Haddock's own lexer/parser.

2013-03-28 14:53:10 -07:00 · 2013-03-28 14:53:10 -07:00 · 18459b95ba
commit 18459b95ba
parent ee0fc19bc5
5 changed files with 395 additions and 1 deletions
--- a/pandoc.cabal
+++ b/pandoc.cabal
@ -223,6 +223,7 @@ Library
  Build-Depends: base >= 4.2 && <5,
                 syb >= 0.1 && < 0.5,
                 containers >= 0.1 && < 0.6,
+                 array >= 0.3 && < 0.5,
                 parsec >= 3.1 && < 3.2,
                 mtl >= 1.1 && < 2.2,
                 network >= 2 && < 2.5,
@ -287,6 +288,7 @@ Library
                   Text.Pandoc.Readers.TeXMath,
                   Text.Pandoc.Readers.Textile,
                   Text.Pandoc.Readers.Native,
+                   Text.Pandoc.Readers.Haddock,
                   Text.Pandoc.Writers.Native,
                   Text.Pandoc.Writers.Docbook,
                   Text.Pandoc.Writers.OPML,
@ -313,7 +315,9 @@ Library
                   Text.Pandoc.XML,
                   Text.Pandoc.Biblio,
                   Text.Pandoc.SelfContained
-  Other-Modules:   Text.Pandoc.MIME,
+  Other-Modules:   Text.Pandoc.Readers.Haddock.Lex,
+                   Text.Pandoc.MIME,
+                   Text.Pandoc.Readers.Haddock.Parse,
                   Text.Pandoc.Parsing,
                   Text.Pandoc.UUID,
                   Text.Pandoc.ImageSize,
--- a/src/Text/Pandoc.hs
+++ b/src/Text/Pandoc.hs
@ -73,6 +73,7 @@ module Text.Pandoc
               , readTextile
               , readDocBook
               , readOPML
+               , readHaddock
               , readNative
               -- * Writers: converting /from/ Pandoc format
               , Writer (..)
@ -120,6 +121,7 @@ import Text.Pandoc.Readers.LaTeX
 import Text.Pandoc.Readers.HTML
 import Text.Pandoc.Readers.Textile
 import Text.Pandoc.Readers.Native
+import Text.Pandoc.Readers.Haddock
 import Text.Pandoc.Writers.Native
 import Text.Pandoc.Writers.Markdown
 import Text.Pandoc.Writers.RST
@ -200,6 +202,7 @@ readers = [("native"       , \_ s -> return $ readNative s)
          ,("textile"      , \o s -> return $ readTextile o s) -- TODO : textile+lhs
          ,("html"         , \o s -> return $ readHtml o s)
          ,("latex"        , \o s -> return $ readLaTeX o s)
+          ,("haddock"      , \o s -> return $ readHaddock o s)
          ]

 data Writer = PureStringWriter   (WriterOptions -> Pandoc -> String)
--- a/src/Text/Pandoc/Readers/Haddock.hs
+++ b/src/Text/Pandoc/Readers/Haddock.hs
@ -0,0 +1,39 @@
+{- |
+   Module      : Text.Pandoc.Readers.Haddock
+   Copyright   : Copyright (C) 2013 David Lazar
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : David Lazar <lazar6@illinois.edu>
+   Stability   : alpha
+
+Conversion of Haddock markup to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.Haddock
+    ( readHaddock
+    ) where
+
+import Text.Pandoc.Builder
+import Text.Pandoc.Options
+import Text.Pandoc.Readers.Haddock.Lex
+import Text.Pandoc.Readers.Haddock.Parse
+
+-- | Parse Haddock markup and return a 'Pandoc' document.
+readHaddock :: ReaderOptions -- ^ Reader options
+            -> String        -- ^ String to parse
+            -> Pandoc
+readHaddock _ s = Pandoc (Meta [] [] []) blocks
+  where
+    blocks = case parseParas (tokenise s (0,0)) of
+        Nothing -> []
+        Just x -> mergeLists (toList x)
+
+-- similar to 'docAppend' in Haddock.Doc
+mergeLists :: [Block] -> [Block]
+mergeLists (BulletList xs : BulletList ys : blocks)
+    = mergeLists (BulletList (xs ++ ys) : blocks)
+mergeLists (OrderedList _ xs : OrderedList a ys : blocks) 
+    = mergeLists (OrderedList a (xs ++ ys) : blocks)
+mergeLists (DefinitionList xs : DefinitionList ys : blocks)
+    = mergeLists (DefinitionList (xs ++ ys) : blocks)
+mergeLists (x : blocks) = x : mergeLists blocks
+mergeLists [] = []
--- a/src/Text/Pandoc/Readers/Haddock/Lex.x
+++ b/src/Text/Pandoc/Readers/Haddock/Lex.x
@ -0,0 +1,169 @@
+--
+-- Haddock - A Haskell Documentation Tool
+--
+-- (c) Simon Marlow 2002
+--
+-- This file was modified and integrated into GHC by David Waern 2006.
+-- Then moved back into Haddock by Isaac Dupree in 2009 :-)
+-- Then copied into Pandoc by David Lazar in 2013 :-D
+
+{
+{-# LANGUAGE BangPatterns #-}   -- Generated by Alex
+{-# OPTIONS -Wwarn -w #-}
+-- The above warning supression flag is a temporary kludge.
+-- While working on this module you are encouraged to remove it and fix
+-- any warnings in the module. See
+--     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
+-- for details
+
+module Text.Pandoc.Readers.Haddock.Lex (
+    Token(..),
+    LToken,
+    tokenise
+ ) where
+
+import Data.Char
+import Numeric (readHex)
+}
+
+%wrapper "posn"
+
+$ws       = $white # \n
+$digit    = [0-9]
+$hexdigit = [0-9a-fA-F]
+$special  = [\"\@]
+$alphanum = [A-Za-z0-9]
+$ident    = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~\:]
+
+:-
+
+-- beginning of a paragraph
+<0,para> {
+  $ws* \n               ;
+  $ws* \>               { begin birdtrack }
+  $ws* prop \> .* \n    { strtoken TokProperty `andBegin` property}
+  $ws* \>\>\>           { strtoken TokExamplePrompt `andBegin` exampleexpr }
+  $ws* [\*\-]           { token TokBullet `andBegin` string }
+  $ws* \[               { token TokDefStart `andBegin` def }
+  $ws* \( $digit+ \)    { token TokNumber `andBegin` string }
+  $ws* $digit+ \.       { token TokNumber `andBegin` string }
+  $ws*                  { begin string }
+}
+
+-- beginning of a line
+<line> {
+  $ws* \>               { begin birdtrack }
+  $ws* \>\>\>           { strtoken TokExamplePrompt `andBegin` exampleexpr }
+  $ws* \n               { token TokPara `andBegin` para }
+
+  -- Here, we really want to be able to say
+  -- $ws* (\n | <eof>)  { token TokPara `andBegin` para}
+  -- because otherwise a trailing line of whitespace will result in
+  -- a spurious TokString at the end of a docstring.  We don't have <eof>,
+  -- though (NOW I realise what it was for :-).  To get around this, we always
+  -- append \n to the end of a docstring.
+  ()                    { begin string }
+}
+
+<birdtrack> .*  \n?     { strtokenNL TokBirdTrack `andBegin` line }
+
+<property> ()           { token TokPara `andBegin` para }
+
+<example> {
+  $ws*  \n              { token TokPara `andBegin` para }
+  $ws* \>\>\>           { strtoken TokExamplePrompt `andBegin` exampleexpr }
+  ()                    { begin exampleresult }
+}
+
+<exampleexpr> .* \n     { strtokenNL TokExampleExpression `andBegin` example }
+
+<exampleresult> .* \n   { strtokenNL TokExampleResult `andBegin` example }
+
+<string,def> {
+  $special              { strtoken $ \s -> TokSpecial (head s) }
+  \<\< [^\>]* \>\>      { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) }
+  \< [^\>]* \>          { strtoken $ \s -> TokURL (init (tail s)) }
+  \# [^\#]* \#          { strtoken $ \s -> TokAName (init (tail s)) }
+  \/ [^\/]* \/          { strtoken $ \s -> TokEmphasis (init (tail s)) }
+  [\'\`] $ident+ [\'\`] { strtoken $ \s -> TokIdent (init (tail s)) }
+  \\ .                  { strtoken (TokString . tail) }
+  "&#" $digit+ \;       { strtoken $ \s -> TokString [chr (read (init (drop 2 s)))] }
+  "&#" [xX] $hexdigit+ \;
+        { strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] -> TokString [chr n] }
+  -- allow special characters through if they don't fit one of the previous
+  -- patterns.
+  [\/\'\`\<\#\&\\]      { strtoken TokString }
+  [^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line }
+  [^ $special \/ \< \# \n \'\` \& \\ \]]+    { strtoken TokString }
+}
+
+<def> {
+  \]                    { token TokDefEnd `andBegin` string }
+}
+
+-- ']' doesn't have any special meaning outside of the [...] at the beginning
+-- of a definition paragraph.
+<string> {
+  \]                    { strtoken TokString }
+}
+
+{
+-- | A located token
+type LToken = (Token, AlexPosn)
+
+data Token
+  = TokPara
+  | TokNumber
+  | TokBullet
+  | TokDefStart
+  | TokDefEnd
+  | TokSpecial Char
+  | TokIdent String
+  | TokString String
+  | TokURL String
+  | TokPic String
+  | TokEmphasis String
+  | TokAName String
+  | TokBirdTrack String
+  | TokProperty String
+  | TokExamplePrompt String
+  | TokExampleExpression String
+  | TokExampleResult String
+--  deriving Show
+
+tokenPos :: LToken -> (Int, Int)
+tokenPos t = let AlexPn _ line col = snd t in (line, col)
+
+type StartCode = Int
+type Action = AlexPosn -> String -> StartCode -> (StartCode -> [LToken]) -> [LToken]
+
+tokenise :: String -> (Int, Int) -> [LToken]
+tokenise str (line, col) = go (posn,'\n',[],eofHack str) para
+  where posn = AlexPn 0 line col
+        go inp@(pos,_,_,str) sc =
+          case alexScan inp sc of
+                AlexEOF -> []
+                AlexError _ -> []
+                AlexSkip  inp' len     -> go inp' sc
+                AlexToken inp' len act -> act pos (take len str) sc (\sc -> go inp' sc)
+
+-- NB. we add a final \n to the string, (see comment in the beginning of line
+-- production above).
+eofHack str = str++"\n"
+
+andBegin :: Action -> StartCode -> Action
+andBegin act new_sc = \pos str _ cont -> act pos str new_sc cont
+
+token :: Token -> Action
+token t = \pos _ sc cont -> (t, pos) : cont sc
+
+strtoken, strtokenNL :: (String -> Token) -> Action
+strtoken t = \pos str sc cont -> (t str, pos) : cont sc
+strtokenNL t = \pos str sc cont -> (t (filter (/= '\r') str), pos) : cont sc
+-- ^ We only want LF line endings in our internal doc string format, so we
+-- filter out all CRs.
+
+begin :: StartCode -> Action
+begin sc = \_ _ _ cont -> cont sc
+
+}
--- a/src/Text/Pandoc/Readers/Haddock/Parse.y
+++ b/src/Text/Pandoc/Readers/Haddock/Parse.y
@ -0,0 +1,179 @@
+-- This code was copied from the 'haddock' package, modified, and integrated
+-- into Pandoc by David Lazar.
+{
+{-# LANGUAGE BangPatterns #-} -- required for versions of Happy before 1.18.6
+{-# OPTIONS -Wwarn -w #-}
+-- The above warning supression flag is a temporary kludge.
+-- While working on this module you are encouraged to remove it and fix
+-- any warnings in the module. See
+--     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
+-- for details
+
+module Text.Pandoc.Readers.Haddock.Parse (parseString, parseParas) where
+
+import Text.Pandoc.Readers.Haddock.Lex
+import Text.Pandoc.Builder
+import Data.Generics (everywhere, mkT)
+import Data.Char  (isSpace)
+import Data.Maybe (fromMaybe)
+import Data.List  (stripPrefix)
+import Data.Monoid (mempty)
+}
+
+%expect 0
+
+%tokentype  { LToken }
+
+%token
+    '/'     { (TokSpecial '/',_) }
+    '@'     { (TokSpecial '@',_) }
+    '['     { (TokDefStart,_) }
+    ']'     { (TokDefEnd,_) }
+    DQUO    { (TokSpecial '\"',_) }
+    URL     { (TokURL $$,_) }
+    PIC     { (TokPic $$,_) }
+    ANAME   { (TokAName $$,_) }
+    '/../'  { (TokEmphasis $$,_) }
+    '-'     { (TokBullet,_) }
+    '(n)'   { (TokNumber,_) }
+    '>..'   { (TokBirdTrack $$,_) }
+    PROP    { (TokProperty $$,_) }
+    PROMPT  { (TokExamplePrompt $$,_) }
+    RESULT  { (TokExampleResult $$,_) }
+    EXP     { (TokExampleExpression $$,_) }
+    IDENT   { (TokIdent $$,_) }
+    PARA    { (TokPara,_) }
+    STRING  { (TokString $$,_) }
+
+%monad { Maybe }
+
+%name parseParas doc
+%name parseString seq
+
+%%
+
+doc :: { Blocks }
+    : apara PARA doc    { $1 <> $3 }
+    | PARA doc          { $2 }
+    | apara             { $1 }
+    | {- empty -}       { mempty }
+
+apara :: { Blocks }
+    : ulpara            { bulletList [$1] }
+    | olpara            { orderedList [$1] }
+    | defpara           { definitionList [$1] }
+    | para              { $1 }
+
+ulpara :: { Blocks }
+    : '-' para          { $2 }
+
+olpara  :: { Blocks }
+    : '(n)' para        { $2 }
+
+defpara :: { (Inlines, [Blocks]) }
+    : '[' seq ']' seq   { ($2, [plain $4]) }
+
+para :: { Blocks }
+    : seq               { para $1 }
+    | codepara          { codeBlock $1 }
+    | property          { $1 }
+    | examples          { $1 }
+
+codepara :: { String }
+    : '>..' codepara    { $1 ++ $2 }
+    | '>..'             { $1 }
+
+property :: { Blocks }
+    : PROP              { makeProperty $1 }
+
+examples :: { Blocks }
+    : example examples  { $1 <> $2 }
+    | example           { $1 }
+
+example :: { Blocks }
+    : PROMPT EXP result { makeExample $1 $2 (lines $3) }
+    | PROMPT EXP        { makeExample $1 $2 [] }
+
+result :: { String }
+    : RESULT result     { $1 ++ $2 }
+    | RESULT            { $1 }
+
+seq :: { Inlines }
+    : elem seq          { $1 <> $2 }
+    | elem              { $1 }
+
+elem :: { Inlines }
+    : elem1             { $1 }
+    | '@' seq1 '@'      { monospace $2 }
+
+seq1 :: { Inlines }
+    : PARA seq1         { linebreak <> $2 }
+    | elem1 seq1        { $1 <> $2 }
+    | elem1             { $1 }
+
+elem1 :: { Inlines }
+    : STRING            { str $1 }
+    | '/../'            { emph (str $1) }
+    | URL               { makeHyperlink $1 }
+    | PIC               { image $1 $1 mempty }
+    | ANAME             { mempty } -- TODO
+    | IDENT             { code $1 }
+    | DQUO strings DQUO { code $2 }
+
+strings :: { String }
+    : STRING            { $1 }
+    | STRING strings    { $1 ++ $2 }
+
+{
+happyError :: [LToken] -> Maybe a
+happyError toks = Nothing
+
+monospace :: Inlines -> Inlines 
+monospace = everywhere (mkT go)
+  where
+    go (Str s) = Code nullAttr s
+    go Space = Code nullAttr " "
+    go x = x
+
+-- | Create a `Hyperlink` from given string.
+--
+-- A hyperlink consists of a URL and an optional label.  The label is separated
+-- from the url by one or more whitespace characters.
+makeHyperlink :: String -> Inlines
+makeHyperlink input = case break isSpace $ strip input of
+  (url, "") -> link url url (str url)
+  (url, lb) -> link url url (str label)
+    where label = dropWhile isSpace lb
+
+makeProperty :: String -> Blocks
+makeProperty s = case strip s of
+  'p':'r':'o':'p':'>':xs ->
+    codeBlockWith ([], ["property"], []) (dropWhile isSpace xs)
+  xs ->
+    error $ "makeProperty: invalid input " ++ show xs
+
+-- | Create an 'Example', stripping superfluous characters as appropriate
+makeExample :: String -> String -> [String] -> Blocks
+makeExample prompt expression result =
+    para $ codeWith ([], ["expr"], []) (strip expression ++ "\n")
+        <> codeWith ([], ["result"], []) (unlines result')
+  where
+    -- 1. drop trailing whitespace from the prompt, remember the prefix
+    (prefix, _) = span isSpace prompt
+
+    -- 2. drop, if possible, the exact same sequence of whitespace
+    -- characters from each result line
+    --
+    -- 3. interpret lines that only contain the string "<BLANKLINE>" as an
+    -- empty line
+    result' = map (substituteBlankLine . tryStripPrefix prefix) result
+      where
+        tryStripPrefix xs ys = fromMaybe ys $ stripPrefix xs ys
+
+        substituteBlankLine "<BLANKLINE>" = ""
+        substituteBlankLine line          = line
+
+-- | Remove all leading and trailing whitespace
+strip :: String -> String
+strip = dropWhile isSpace . reverse . dropWhile isSpace . reverse
+}