Merge pull request #1256 from tarleb/org-reader-improvements

Org reader improvements
This commit is contained in:
John MacFarlane 2014-04-19 20:35:41 -07:00
commit 6a2361c457
5 changed files with 641 additions and 252 deletions

View file

@ -55,6 +55,13 @@ Copyright (C) 2010 Paul Rivier
Released under the GPL.
----------------------------------------------------------------------
src/Text/Pandoc/Readers/Org.hs
tests/Tests/Readers/Org.hs
Copyright (C) 2014 Albert Krewinkel
Released under the GPL.
----------------------------------------------------------------------
src/Text/Pandoc/Biblio.hs
Copyright (C) 2008-2010 Andrea Rossato

View file

@ -861,22 +861,6 @@ definitionList = do
items <- fmap sequence $ many1 definitionListItem
return $ B.definitionList <$> fmap compactify'DL items
compactify'DL :: [(Inlines, [Blocks])] -> [(Inlines, [Blocks])]
compactify'DL items =
let defs = concatMap snd items
defBlocks = reverse $ concatMap B.toList defs
isPara (Para _) = True
isPara _ = False
in case defBlocks of
(Para x:_) -> if not $ any isPara (drop 1 defBlocks)
then let (t,ds) = last items
lastDef = B.toList $ last ds
ds' = init ds ++
[B.fromList $ init lastDef ++ [Plain x]]
in init items ++ [(t, ds')]
else items
_ -> items
--
-- paragraph block
--
@ -1892,4 +1876,3 @@ doubleQuoted = try $ do
(withQuoteContext InDoubleQuote $ doubleQuoteEnd >> return
(fmap B.doubleQuoted . trimInlinesF $ contents))
<|> (return $ return (B.str "\8220") <> contents)

View file

@ -1,4 +1,5 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-
Copyright (C) 2014 Albert Krewinkel <tarleb@moltkeplatz.de>
@ -24,26 +25,32 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Maintainer : Albert Krewinkel <tarleb@moltkeplatz.de>
Conversion of Org-Mode to 'Pandoc' document.
Conversion of org-mode formatted plain text to 'Pandoc' document.
-}
module Text.Pandoc.Readers.Org ( readOrg ) where
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>), HasMeta(..))
import Text.Pandoc.Builder ( Inlines, Blocks, HasMeta(..), (<>)
, trimInlines )
import Text.Pandoc.Definition
import Text.Pandoc.Options
import qualified Text.Pandoc.Parsing as P
import Text.Pandoc.Parsing hiding (newline, orderedListMarker, updateLastStrPos)
import Text.Pandoc.Shared (compactify')
import Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF
, newline, orderedListMarker
, parseFromString
, updateLastStrPos )
import Text.Pandoc.Shared (compactify', compactify'DL)
import Control.Applicative (pure, (<$>), (<$), (<*>), (<*), (*>), (<**>))
import Control.Arrow ((***))
import Control.Monad (guard, when)
import Control.Applicative ( Applicative, pure
, (<$>), (<$), (<*>), (<*), (*>), (<**>) )
import Control.Monad (foldM, guard, liftM, liftM2, when)
import Control.Monad.Reader (Reader, runReader, ask, asks)
import Data.Char (toLower)
import Data.Default
import Data.List (foldl', isPrefixOf, isSuffixOf)
import Data.Maybe (listToMaybe, fromMaybe)
import Data.Monoid (mconcat, mempty, mappend)
import Data.List (intersperse, isPrefixOf, isSuffixOf)
import qualified Data.Map as M
import Data.Maybe (listToMaybe, fromMaybe, isJust)
import Data.Monoid (Monoid, mconcat, mempty, mappend)
-- | Parse org-mode string and return a Pandoc document.
readOrg :: ReaderOptions -- ^ Reader options
@ -53,27 +60,35 @@ readOrg opts s = readWith parseOrg def{ orgStateOptions = opts } (s ++ "\n\n")
type OrgParser = Parser [Char] OrgParserState
parseOrg:: OrgParser Pandoc
parseOrg :: OrgParser Pandoc
parseOrg = do
blocks' <- B.toList <$> parseBlocks
blocks' <- parseBlocks
st <- getState
let meta = orgStateMeta st
return $ Pandoc meta $ filter (/= Null) blocks'
let meta = runF (orgStateMeta' st) st
return $ Pandoc meta $ filter (/= Null) (B.toList $ runF blocks' st)
--
-- Parser State for Org
--
type OrgNoteRecord = (String, F Blocks)
type OrgNoteTable = [OrgNoteRecord]
type OrgBlockAttributes = M.Map String String
-- | Org-mode parser state
data OrgParserState = OrgParserState
{ orgStateOptions :: ReaderOptions
, orgStateBlockAttributes :: OrgBlockAttributes
, orgStateEmphasisCharStack :: [Char]
, orgStateEmphasisNewlines :: Maybe Int
, orgStateLastForbiddenCharPos :: Maybe SourcePos
, orgStateLastPreCharPos :: Maybe SourcePos
, orgStateLastStrPos :: Maybe SourcePos
, orgStateMeta :: Meta
} deriving (Show)
, orgStateMeta' :: F Meta
, orgStateNotes' :: OrgNoteTable
}
instance HasReaderOptions OrgParserState where
extractReaderOptions = orgStateOptions
@ -90,14 +105,30 @@ instance Default OrgParserState where
defaultOrgParserState :: OrgParserState
defaultOrgParserState = OrgParserState
{ orgStateOptions = def
, orgStateBlockAttributes = M.empty
, orgStateEmphasisCharStack = []
, orgStateEmphasisNewlines = Nothing
, orgStateLastForbiddenCharPos = Nothing
, orgStateLastPreCharPos = Nothing
, orgStateLastStrPos = Nothing
, orgStateMeta = nullMeta
, orgStateMeta' = return nullMeta
, orgStateNotes' = []
}
addBlockAttribute :: String -> String -> OrgParser ()
addBlockAttribute key val = updateState $ \s ->
let attrs = orgStateBlockAttributes s
in s{ orgStateBlockAttributes = M.insert key val attrs }
lookupBlockAttribute :: String -> OrgParser (Maybe String)
lookupBlockAttribute key =
M.lookup key . orgStateBlockAttributes <$> getState
resetBlockAttributes :: OrgParser ()
resetBlockAttributes = updateState $ \s ->
s{ orgStateBlockAttributes = orgStateBlockAttributes def }
updateLastStrPos :: OrgParser ()
updateLastStrPos = getPosition >>= \p ->
updateState $ \s -> s{ orgStateLastStrPos = Just p }
@ -111,19 +142,19 @@ updateLastPreCharPos = getPosition >>= \p ->
updateState $ \s -> s{ orgStateLastPreCharPos = Just p}
pushToInlineCharStack :: Char -> OrgParser ()
pushToInlineCharStack c = updateState $ \st ->
st { orgStateEmphasisCharStack = c:orgStateEmphasisCharStack st }
pushToInlineCharStack c = updateState $ \s ->
s{ orgStateEmphasisCharStack = c:orgStateEmphasisCharStack s }
popInlineCharStack :: OrgParser ()
popInlineCharStack = updateState $ \st ->
st { orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ st }
popInlineCharStack = updateState $ \s ->
s{ orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ s }
surroundingEmphasisChar :: OrgParser [Char]
surroundingEmphasisChar = take 1 . drop 1 . orgStateEmphasisCharStack <$> getState
startEmphasisNewlinesCounting :: Int -> OrgParser ()
startEmphasisNewlinesCounting maxNewlines = updateState $ \s ->
s { orgStateEmphasisNewlines = Just maxNewlines }
s{ orgStateEmphasisNewlines = Just maxNewlines }
decEmphasisNewlinesCount :: OrgParser ()
decEmphasisNewlinesCount = updateState $ \s ->
@ -138,6 +169,48 @@ resetEmphasisNewlines :: OrgParser ()
resetEmphasisNewlines = updateState $ \s ->
s{ orgStateEmphasisNewlines = Nothing }
addToNotesTable :: OrgNoteRecord -> OrgParser ()
addToNotesTable note = do
oldnotes <- orgStateNotes' <$> getState
updateState $ \s -> s{ orgStateNotes' = note:oldnotes }
-- The version Text.Pandoc.Parsing cannot be used, as we need additional parts
-- of the state saved and restored.
parseFromString :: OrgParser a -> String -> OrgParser a
parseFromString parser str' = do
oldLastPreCharPos <- orgStateLastPreCharPos <$> getState
updateState $ \s -> s{ orgStateLastPreCharPos = Nothing }
result <- P.parseFromString parser str'
updateState $ \s -> s{ orgStateLastPreCharPos = oldLastPreCharPos }
return result
--
-- Adaptions and specializations of parsing utilities
--
newtype F a = F { unF :: Reader OrgParserState a
} deriving (Monad, Applicative, Functor)
runF :: F a -> OrgParserState -> a
runF = runReader . unF
askF :: F OrgParserState
askF = F ask
asksF :: (OrgParserState -> a) -> F a
asksF f = F $ asks f
instance Monoid a => Monoid (F a) where
mempty = return mempty
mappend = liftM2 mappend
mconcat = fmap mconcat . sequence
trimInlinesF :: F Inlines -> F Inlines
trimInlinesF = liftM trimInlines
-- | Like @Text.Parsec.Char.newline@, but causes additional state changes.
newline :: OrgParser Char
newline =
P.newline
@ -148,37 +221,83 @@ newline =
-- parsing blocks
--
parseBlocks :: OrgParser Blocks
parseBlocks :: OrgParser (F Blocks)
parseBlocks = mconcat <$> manyTill block eof
block :: OrgParser Blocks
block :: OrgParser (F Blocks)
block = choice [ mempty <$ blanklines
, orgBlock
, optionalAttributes $ choice
[ orgBlock
, figure
, table
]
, example
, drawer
, figure
, specialLine
, header
, hline
, return <$> hline
, list
, table
, latexFragment
, noteBlock
, paraOrPlain
] <?> "block"
optionalAttributes :: OrgParser (F Blocks) -> OrgParser (F Blocks)
optionalAttributes parser = try $
resetBlockAttributes *> parseBlockAttributes *> parser
parseBlockAttributes :: OrgParser ()
parseBlockAttributes = do
attrs <- many attribute
() <$ mapM (uncurry parseAndAddAttribute) attrs
where
attribute :: OrgParser (String, String)
attribute = try $ do
key <- metaLineStart *> many1Till (noneOf "\n\r") (char ':')
val <- skipSpaces *> anyLine
return (map toLower key, val)
parseAndAddAttribute :: String -> String -> OrgParser ()
parseAndAddAttribute key value = do
let key' = map toLower key
() <$ addBlockAttribute key' value
lookupInlinesAttr :: String -> OrgParser (Maybe (F Inlines))
lookupInlinesAttr attr = try $ do
val <- lookupBlockAttribute attr
maybe (return Nothing)
(fmap Just . parseFromString parseInlines)
val
--
-- Org Blocks (#+BEGIN_... / #+END_...)
--
orgBlock :: OrgParser Blocks
orgBlock :: OrgParser (F Blocks)
orgBlock = try $ do
(indent, blockType, args) <- blockHeader
blockStr <- rawBlockContent indent blockType
content <- rawBlockContent indent blockType
contentBlocks <- parseFromString parseBlocks (content ++ "\n")
let classArgs = [ translateLang . fromMaybe [] $ listToMaybe args ]
case blockType of
"comment" -> return mempty
"src" -> return $ B.codeBlockWith ("", classArgs, []) blockStr
_ -> B.divWith ("", [blockType], [])
<$> parseFromString parseBlocks blockStr
"html" -> returnF $ B.rawBlock "html" content
"latex" -> returnF $ B.rawBlock "latex" content
"ascii" -> returnF $ B.rawBlock "ascii" content
"example" -> returnF $ exampleCode content
"quote" -> return $ B.blockQuote <$> contentBlocks
"verse" -> parseVerse content
"src" -> codeBlockWithAttr classArgs content
_ -> return $ B.divWith ("", [blockType], []) <$> contentBlocks
where
returnF :: a -> OrgParser (F a)
returnF = return . return
parseVerse :: String -> OrgParser (F Blocks)
parseVerse cs =
fmap B.para . mconcat . intersperse (pure B.linebreak)
<$> mapM (parseFromString parseInlines) (lines cs)
blockHeader :: OrgParser (Int, String, [String])
blockHeader = (,,) <$> blockIndent
@ -188,6 +307,18 @@ blockHeader = (,,) <$> blockIndent
blockType = map toLower <$> (stringAnyCase "#+begin_" *> many letter)
blockArgs = manyTill (many nonspaceChar <* skipSpaces) newline
codeBlockWithAttr :: [String] -> String -> OrgParser (F Blocks)
codeBlockWithAttr classArgs content = do
identifier <- fromMaybe "" <$> lookupBlockAttribute "name"
caption <- lookupInlinesAttr "caption"
let codeBlck = B.codeBlockWith (identifier, classArgs, []) content
return $ maybe (pure codeBlck) (labelDiv codeBlck) caption
where
labelDiv blk value =
B.divWith nullAttr <$> (mappend <$> labelledBlock value
<*> pure blk)
labelledBlock = fmap (B.plain . B.spanWith ("", ["label"], []))
rawBlockContent :: Int -> String -> OrgParser String
rawBlockContent indent blockType =
unlines . map commaEscaped <$> manyTill indentedLine blockEnder
@ -222,15 +353,18 @@ commaEscaped (',':cs@('*':_)) = cs
commaEscaped (',':cs@('#':'+':_)) = cs
commaEscaped cs = cs
example :: OrgParser Blocks
example = try $
B.codeBlockWith ("", ["example"], []) . unlines <$> many1 exampleLine
example :: OrgParser (F Blocks)
example = try $ do
return . return . exampleCode =<< unlines <$> many1 exampleLine
exampleCode :: String -> Blocks
exampleCode = B.codeBlockWith ("", ["example"], [])
exampleLine :: OrgParser String
exampleLine = try $ string ": " *> anyLine
-- Drawers for properties or a logbook
drawer :: OrgParser Blocks
drawer :: OrgParser (F Blocks)
drawer = try $ do
drawerStart
manyTill drawerLine (try drawerEnd)
@ -256,41 +390,31 @@ drawerEnd = try $
--
-- Figures (Image on a line by itself, preceded by name and/or caption)
figure :: OrgParser Blocks
figure :: OrgParser (F Blocks)
figure = try $ do
(tit, cap) <- (maybe mempty withFigPrefix *** fromMaybe mempty)
<$> nameAndOrCaption
(cap, nam) <- nameAndCaption
src <- skipSpaces *> selfTarget <* skipSpaces <* newline
guard (isImageFilename src)
return . B.para $ B.image src tit cap
where withFigPrefix cs = if "fig:" `isPrefixOf` cs
then cs
else "fig:" ++ cs
nameAndOrCaption :: OrgParser (Maybe String, Maybe Inlines)
nameAndOrCaption = try $ nameFirst <|> captionFirst
return $ do
cap' <- cap
return $ B.para $ B.image src nam cap'
where
nameFirst = try $ do
n <- name
c <- optionMaybe caption
return (Just n, c)
captionFirst = try $ do
c <- caption
n <- optionMaybe name
return (n, Just c)
caption :: OrgParser Inlines
caption = try $ annotation "CAPTION" *> inlinesTillNewline
name :: OrgParser String
name = try $ annotation "NAME" *> skipSpaces *> manyTill anyChar newline
annotation :: String -> OrgParser String
annotation ann = try $ metaLineStart *> stringAnyCase ann <* char ':'
nameAndCaption =
do
maybeCap <- lookupInlinesAttr "caption"
maybeNam <- lookupBlockAttribute "name"
guard $ isJust maybeCap || isJust maybeNam
return ( fromMaybe mempty maybeCap
, maybe mempty withFigPrefix maybeNam )
withFigPrefix cs =
if "fig:" `isPrefixOf` cs
then cs
else "fig:" ++ cs
--
-- Comments, Options and Metadata
specialLine :: OrgParser Blocks
specialLine = try $ metaLine <|> commentLine
specialLine :: OrgParser (F Blocks)
specialLine = fmap return . try $ metaLine <|> commentLine
metaLine :: OrgParser Blocks
metaLine = try $ metaLineStart *> declarationLine
@ -308,29 +432,41 @@ commentLineStart = try $ mappend <$> many spaceChar <*> string "# "
declarationLine :: OrgParser Blocks
declarationLine = try $ do
meta' <- B.setMeta <$> metaKey <*> metaValue <*> pure nullMeta
updateState $ \st -> st { orgStateMeta = orgStateMeta st <> meta' }
key <- metaKey
inlinesF <- metaInlines
updateState $ \st ->
let meta' = B.setMeta <$> pure key <*> inlinesF <*> pure nullMeta
in st { orgStateMeta' = orgStateMeta' st <> meta' }
return mempty
metaValue :: OrgParser MetaValue
metaValue = MetaInlines . B.toList <$> inlinesTillNewline
metaInlines :: OrgParser (F MetaValue)
metaInlines = fmap (MetaInlines . B.toList) <$> inlinesTillNewline
metaKey :: OrgParser String
metaKey = map toLower <$> many1 (noneOf ": \n\r")
<* char ':'
<* skipSpaces
--
-- Headers
--
-- | Headers
header :: OrgParser Blocks
header = try $
B.header <$> headerStart
<*> inlinesTillNewline
header :: OrgParser (F Blocks)
header = try $ do
level <- headerStart
title <- inlinesTillNewline
return $ B.header level <$> title
headerStart :: OrgParser Int
headerStart = try $
(length <$> many1 (char '*')) <* many1 (char ' ')
-- Horizontal Line (five dashes or more)
-- Don't use (or need) the reader wrapper here, we want hline to be
-- @show@able. Otherwise we can't use it with @notFollowedBy'@.
-- | Horizontal Line (five -- dashes or more)
hline :: OrgParser Blocks
hline = try $ do
skipSpaces
@ -344,27 +480,30 @@ hline = try $ do
-- Tables
--
data OrgTableRow = OrgContentRow [Blocks]
data OrgTableRow = OrgContentRow (F [Blocks])
| OrgAlignRow [Alignment]
| OrgHlineRow
deriving (Eq, Show)
data OrgTable = OrgTable
{ orgTableColumns :: Int
, orgTableAlignments :: [Alignment]
, orgTableHeader :: [Blocks]
, orgTableRows :: [[Blocks]]
} deriving (Eq, Show)
}
table :: OrgParser Blocks
table :: OrgParser (F Blocks)
table = try $ do
lookAhead tableStart
orgToPandocTable . normalizeTable . rowsToTable <$> tableRows
do
rows <- tableRows
cptn <- fromMaybe (pure "") <$> lookupInlinesAttr "caption"
return $ (<$> cptn) . orgToPandocTable . normalizeTable =<< rowsToTable rows
orgToPandocTable :: OrgTable
-> Inlines
-> Blocks
orgToPandocTable (OrgTable _ aligns heads lns) =
B.table "" (zip aligns $ repeat 0) heads lns
orgToPandocTable (OrgTable _ aligns heads lns) caption =
B.table caption (zip aligns $ repeat 0) heads lns
tableStart :: OrgParser Char
tableStart = try $ skipSpaces *> char '|'
@ -374,11 +513,11 @@ tableRows = try $ many (tableAlignRow <|> tableHline <|> tableContentRow)
tableContentRow :: OrgParser OrgTableRow
tableContentRow = try $
OrgContentRow <$> (tableStart *> manyTill tableContentCell newline)
OrgContentRow . sequence <$> (tableStart *> manyTill tableContentCell newline)
tableContentCell :: OrgParser Blocks
tableContentCell :: OrgParser (F Blocks)
tableContentCell = try $
B.plain . trimInlines . mconcat <$> many1Till inline endOfCell
fmap B.plain . trimInlinesF . mconcat <$> many1Till inline endOfCell
endOfCell :: OrgParser Char
endOfCell = try $ char '|' <|> lookAhead newline
@ -410,8 +549,8 @@ tableHline = try $
OrgHlineRow <$ (tableStart *> char '-' *> anyLine)
rowsToTable :: [OrgTableRow]
-> OrgTable
rowsToTable = foldl' (flip rowToContent) zeroTable
-> F OrgTable
rowsToTable = foldM (flip rowToContent) zeroTable
where zeroTable = OrgTable 0 mempty mempty mempty
normalizeTable :: OrgTable
@ -430,57 +569,113 @@ normalizeTable (OrgTable cols aligns heads lns) =
-- line as a header. All other horizontal lines are discarded.
rowToContent :: OrgTableRow
-> OrgTable
-> OrgTable
rowToContent OrgHlineRow = maybeBodyToHeader
rowToContent (OrgContentRow rs) = setLongestRow rs . appendToBody rs
rowToContent (OrgAlignRow as) = setLongestRow as . setAligns as
-> F OrgTable
rowToContent OrgHlineRow t = maybeBodyToHeader t
rowToContent (OrgAlignRow as) t = setLongestRow as =<< setAligns as t
rowToContent (OrgContentRow rf) t = do
rs <- rf
setLongestRow rs =<< appendToBody rs t
setLongestRow :: [a]
-> OrgTable
-> OrgTable
setLongestRow rs t = t{ orgTableColumns = max (length rs) (orgTableColumns t) }
-> F OrgTable
setLongestRow rs t =
return t{ orgTableColumns = max (length rs) (orgTableColumns t) }
maybeBodyToHeader :: OrgTable
-> OrgTable
-> F OrgTable
maybeBodyToHeader t = case t of
OrgTable{ orgTableHeader = [], orgTableRows = b:[] } ->
t{ orgTableHeader = b , orgTableRows = [] }
_ -> t
return t{ orgTableHeader = b , orgTableRows = [] }
_ -> return t
appendToBody :: [Blocks]
-> OrgTable
-> OrgTable
appendToBody r t = t{ orgTableRows = orgTableRows t ++ [r] }
-> F OrgTable
appendToBody r t = return t{ orgTableRows = orgTableRows t ++ [r] }
setAligns :: [Alignment]
-> OrgTable
-> OrgTable
setAligns aligns t = t{ orgTableAlignments = aligns }
-> F OrgTable
setAligns aligns t = return $ t{ orgTableAlignments = aligns }
--
-- LaTeX fragments
--
latexFragment :: OrgParser (F Blocks)
latexFragment = try $ do
envName <- latexEnvStart
content <- mconcat <$> manyTill anyLineNewline (latexEnd envName)
return . return $ B.rawBlock "latex" (content `inLatexEnv` envName)
where
c `inLatexEnv` e = mconcat [ "\\begin{", e, "}\n"
, c
, "\\end{", e, "}\n"
]
latexEnvStart :: OrgParser String
latexEnvStart = try $ do
skipSpaces *> string "\\begin{"
*> latexEnvName
<* string "}"
<* blankline
latexEnd :: String -> OrgParser ()
latexEnd envName = try $
() <$ skipSpaces
<* string ("\\end{" ++ envName ++ "}")
<* blankline
-- | Parses a LaTeX environment name.
latexEnvName :: OrgParser String
latexEnvName = try $ do
mappend <$> many1 alphaNum
<*> option "" (string "*")
--
-- Footnote defintions
--
noteBlock :: OrgParser (F Blocks)
noteBlock = try $ do
ref <- noteMarker <* skipSpaces
content <- mconcat <$> blocksTillHeaderOrNote
addToNotesTable (ref, content)
return mempty
where
blocksTillHeaderOrNote =
many1Till block (eof <|> () <$ lookAhead noteMarker
<|> () <$ lookAhead headerStart)
-- Paragraphs or Plain text
paraOrPlain :: OrgParser Blocks
paraOrPlain :: OrgParser (F Blocks)
paraOrPlain = try $
parseInlines <**> option B.plain (try $ newline *> pure B.para)
parseInlines <**> (fmap <$> option B.plain (try $ newline *> pure B.para))
inlinesTillNewline :: OrgParser Inlines
inlinesTillNewline = trimInlines . mconcat <$> manyTill inline newline
inlinesTillNewline :: OrgParser (F Inlines)
inlinesTillNewline = trimInlinesF . mconcat <$> manyTill inline newline
--
-- list blocks
--
list :: OrgParser Blocks
list :: OrgParser (F Blocks)
list = choice [ definitionList, bulletList, orderedList ] <?> "list"
definitionList :: OrgParser Blocks
definitionList = B.definitionList <$> many1 (definitionListItem bulletListStart)
definitionList :: OrgParser (F Blocks)
definitionList = fmap B.definitionList . fmap compactify'DL . sequence
<$> many1 (definitionListItem bulletListStart)
bulletList :: OrgParser Blocks
bulletList = B.bulletList . compactify' <$> many1 (listItem bulletListStart)
bulletList :: OrgParser (F Blocks)
bulletList = fmap B.bulletList . fmap compactify' . sequence
<$> many1 (listItem bulletListStart)
orderedList :: OrgParser Blocks
orderedList = B.orderedList . compactify' <$> many1 (listItem orderedListStart)
orderedList :: OrgParser (F Blocks)
-- orderedList = B.orderedList . compactify' <$> many1 (listItem orderedListStart)
orderedList = fmap B.orderedList . fmap compactify' . sequence
<$> many1 (listItem orderedListStart)
genericListStart :: OrgParser String
-> OrgParser Int
@ -499,33 +694,36 @@ orderedListStart = genericListStart orderedListMarker
where orderedListMarker = mappend <$> many1 digit <*> (pure <$> oneOf ".)")
definitionListItem :: OrgParser Int
-> OrgParser (Inlines, [Blocks])
-> OrgParser (F (Inlines, [Blocks]))
definitionListItem parseMarkerGetLength = try $ do
markerLength <- parseMarkerGetLength
term <- manyTill (noneOf "\n\r") (try $ string "::")
first <- anyLineNewline
blank <- option "" ("\n" <$ blankline)
cont <- concat <$> many (listContinuation markerLength)
term' <- parseFromString inline term
contents' <- parseFromString parseBlocks $ first ++ cont
return (term', [contents'])
contents' <- parseFromString parseBlocks $ first ++ blank ++ cont
return $ (,) <$> term' <*> fmap (:[]) contents'
-- parse raw text for one list item, excluding start marker and continuations
listItem :: OrgParser Int
-> OrgParser Blocks
-> OrgParser (F Blocks)
listItem start = try $ do
markerLength <- try start
firstLine <- anyLineNewline
blank <- option "" ("\n" <$ blankline)
rest <- concat <$> many (listContinuation markerLength)
parseFromString parseBlocks $ firstLine ++ rest
parseFromString parseBlocks $ firstLine ++ blank ++ rest
-- continuation of a list item - indented and separated by blankline or endline.
-- Note: nested lists are parsed as continuations.
listContinuation :: Int
-> OrgParser String
listContinuation markerLength = try $
mappend <$> many blankline
<*> (concat <$> many1 listLine)
notFollowedBy' blankline
*> (mappend <$> (concat <$> many1 listLine)
<*> many blankline)
where listLine = try $ indentWith markerLength *> anyLineNewline
anyLineNewline :: OrgParser String
@ -536,11 +734,12 @@ anyLineNewline = (++ "\n") <$> anyLine
-- inline
--
inline :: OrgParser Inlines
inline :: OrgParser (F Inlines)
inline =
choice [ whitespace
, linebreak
, link
, footnote
, linkOrImage
, str
, endline
, emph
@ -557,67 +756,104 @@ inline =
] <* (guard =<< newlinesCountWithinLimits)
<?> "inline"
parseInlines :: OrgParser Inlines
parseInlines = trimInlines . mconcat <$> many1 inline
parseInlines :: OrgParser (F Inlines)
parseInlines = trimInlinesF . mconcat <$> many1 inline
-- treat these as potentially non-text when parsing inline:
specialChars :: [Char]
specialChars = "\"$'()*+-./:<=>[\\]^_{|}~"
whitespace :: OrgParser Inlines
whitespace = B.space <$ skipMany1 spaceChar
<* updateLastPreCharPos
<* updateLastForbiddenCharPos
whitespace :: OrgParser (F Inlines)
whitespace = pure B.space <$ skipMany1 spaceChar
<* updateLastPreCharPos
<* updateLastForbiddenCharPos
<?> "whitespace"
linebreak :: OrgParser Inlines
linebreak = try $ B.linebreak <$ string "\\\\" <* skipSpaces <* newline
linebreak :: OrgParser (F Inlines)
linebreak = try $ pure B.linebreak <$ string "\\\\" <* skipSpaces <* newline
str :: OrgParser Inlines
str = B.str <$> many1 (noneOf $ specialChars ++ "\n\r ")
<* updateLastStrPos
str :: OrgParser (F Inlines)
str = return . B.str <$> many1 (noneOf $ specialChars ++ "\n\r ")
<* updateLastStrPos
-- an endline character that can be treated as a space, not a structural break
endline :: OrgParser Inlines
-- | An endline character that can be treated as a space, not a structural
-- break. This should reflect the values of the Emacs variable
-- @org-element-pagaraph-separate@.
endline :: OrgParser (F Inlines)
endline = try $ do
newline
notFollowedBy blankline
notFollowedBy' exampleLine
notFollowedBy' hline
notFollowedBy' noteMarker
notFollowedBy' tableStart
notFollowedBy' drawerStart
notFollowedBy' headerStart
notFollowedBy' metaLineStart
notFollowedBy' latexEnvStart
notFollowedBy' commentLineStart
notFollowedBy' bulletListStart
notFollowedBy' orderedListStart
decEmphasisNewlinesCount
guard =<< newlinesCountWithinLimits
updateLastPreCharPos
return B.space
return . return $ B.space
link :: OrgParser Inlines
link = explicitOrImageLink <|> selflinkOrImage <?> "link"
footnote :: OrgParser (F Inlines)
footnote = try $ inlineNote <|> referencedNote
explicitOrImageLink :: OrgParser Inlines
inlineNote :: OrgParser (F Inlines)
inlineNote = try $ do
string "[fn:"
ref <- many alphaNum
char ':'
note <- fmap B.para . trimInlinesF . mconcat <$> many1Till inline (char ']')
when (not $ null ref) $
addToNotesTable ("fn:" ++ ref, note)
return $ B.note <$> note
referencedNote :: OrgParser (F Inlines)
referencedNote = try $ do
ref <- noteMarker
return $ do
notes <- asksF orgStateNotes'
case lookup ref notes of
Nothing -> return $ B.str $ "[" ++ ref ++ "]"
Just contents -> do
st <- askF
let contents' = runF contents st{ orgStateNotes' = [] }
return $ B.note contents'
noteMarker :: OrgParser String
noteMarker = try $ do
char '['
choice [ many1Till digit (char ']')
, (++) <$> string "fn:"
<*> many1Till (noneOf "\n\r\t ") (char ']')
]
linkOrImage :: OrgParser (F Inlines)
linkOrImage = explicitOrImageLink <|> selflinkOrImage <?> "link or image"
explicitOrImageLink :: OrgParser (F Inlines)
explicitOrImageLink = try $ do
char '['
src <- linkTarget
title <- enclosedRaw (char '[') (char ']')
title' <- parseFromString (mconcat <$> many inline) title
char ']'
return . B.link src ""
$ if isImageFilename src && isImageFilename title
then B.image title "" ""
else title'
return $ B.link src "" <$>
if isImageFilename src && isImageFilename title
then return $ B.image title mempty mempty
else title'
selflinkOrImage :: OrgParser Inlines
selflinkOrImage :: OrgParser (F Inlines)
selflinkOrImage = try $ do
src <- char '[' *> linkTarget <* char ']'
return $ if isImageFilename src
then B.image src "" ""
else B.link src "" (B.str src)
return . return $ if isImageFilename src
then B.image src "" ""
else B.link src "" (B.str src)
selfTarget :: OrgParser String
selfTarget = try $ char '[' *> linkTarget <* char ']'
@ -628,57 +864,56 @@ linkTarget = enclosed (char '[') (char ']') (noneOf "\n\r]")
isImageFilename :: String -> Bool
isImageFilename filename =
any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions &&
any (\x -> (x++":") `isPrefixOf` filename) protocols ||
':' `notElem` filename
(any (\x -> (x++":") `isPrefixOf` filename) protocols ||
':' `notElem` filename)
where
imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ]
protocols = [ "file", "http", "https" ]
emph :: OrgParser Inlines
emph = B.emph <$> emphasisBetween '/'
emph :: OrgParser (F Inlines)
emph = fmap B.emph <$> emphasisBetween '/'
strong :: OrgParser Inlines
strong = B.strong <$> emphasisBetween '*'
strong :: OrgParser (F Inlines)
strong = fmap B.strong <$> emphasisBetween '*'
strikeout :: OrgParser Inlines
strikeout = B.strikeout <$> emphasisBetween '+'
strikeout :: OrgParser (F Inlines)
strikeout = fmap B.strikeout <$> emphasisBetween '+'
-- There is no underline, so we use strong instead.
underline :: OrgParser Inlines
underline = B.strong <$> emphasisBetween '_'
underline :: OrgParser (F Inlines)
underline = fmap B.strong <$> emphasisBetween '_'
code :: OrgParser Inlines
code = B.code <$> verbatimBetween '='
code :: OrgParser (F Inlines)
code = return . B.code <$> verbatimBetween '='
verbatim :: OrgParser Inlines
verbatim = B.rawInline "" <$> verbatimBetween '~'
verbatim :: OrgParser (F Inlines)
verbatim = return . B.rawInline "" <$> verbatimBetween '~'
math :: OrgParser Inlines
math = B.math <$> choice [ math1CharBetween '$'
, mathStringBetween '$'
, rawMathBetween "\\(" "\\)"
]
subscript :: OrgParser (F Inlines)
subscript = fmap B.subscript <$> try (char '_' *> subOrSuperExpr)
displayMath :: OrgParser Inlines
displayMath = B.displayMath <$> choice [ rawMathBetween "\\[" "\\]"
, rawMathBetween "$$" "$$"
]
superscript :: OrgParser (F Inlines)
superscript = fmap B.superscript <$> try (char '^' *> subOrSuperExpr)
subscript :: OrgParser Inlines
subscript = B.subscript <$> try (char '_' *> subOrSuperExpr)
math :: OrgParser (F Inlines)
math = return . B.math <$> choice [ math1CharBetween '$'
, mathStringBetween '$'
, rawMathBetween "\\(" "\\)"
]
superscript :: OrgParser Inlines
superscript = B.superscript <$> try (char '^' *> subOrSuperExpr)
symbol :: OrgParser Inlines
symbol = B.str . (: "") <$> (oneOf specialChars >>= updatePositions)
displayMath :: OrgParser (F Inlines)
displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]"
, rawMathBetween "$$" "$$"
]
symbol :: OrgParser (F Inlines)
symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions)
where updatePositions c
| c `elem` emphasisPreChars = c <$ updateLastPreCharPos
| c `elem` emphasisForbiddenBorderChars = c <$ updateLastForbiddenCharPos
| otherwise = return c
emphasisBetween :: Char
-> OrgParser Inlines
-> OrgParser (F Inlines)
emphasisBetween c = try $ do
startEmphasisNewlinesCounting emphasisAllowedNewlines
res <- enclosedInlines (emphasisStart c) (emphasisEnd c)
@ -711,7 +946,7 @@ math1CharBetween c = try $ do
char c
res <- noneOf $ c:mathForbiddenBorderChars
char c
eof <|> lookAhead (oneOf mathPostChars) *> return ()
eof <|> () <$ lookAhead (oneOf mathPostChars)
return [res]
rawMathBetween :: String
@ -734,12 +969,12 @@ emphasisEnd :: Char -> OrgParser Char
emphasisEnd c = try $ do
guard =<< notAfterForbiddenBorderChar
char c
eof <|> lookAhead (surroundingEmphasisChar >>= \x ->
oneOf (x ++ emphasisPostChars))
*> return ()
eof <|> () <$ lookAhead acceptablePostChars
updateLastStrPos
popInlineCharStack
return c
where acceptablePostChars =
surroundingEmphasisChar >>= \x -> oneOf (x ++ emphasisPostChars)
mathStart :: Char -> OrgParser Char
mathStart c = try $
@ -749,15 +984,15 @@ mathEnd :: Char -> OrgParser Char
mathEnd c = try $ do
res <- noneOf (c:mathForbiddenBorderChars)
char c
eof <|> lookAhead (oneOf mathPostChars *> pure ())
eof <|> () <$ lookAhead (oneOf mathPostChars)
return res
enclosedInlines :: OrgParser a
-> OrgParser b
-> OrgParser Inlines
-> OrgParser (F Inlines)
enclosedInlines start end = try $
trimInlines . mconcat <$> enclosed start end inline
trimInlinesF . mconcat <$> enclosed start end inline
enclosedRaw :: OrgParser a
-> OrgParser b
@ -843,25 +1078,13 @@ notAfterForbiddenBorderChar = do
return $ lastFBCPos /= Just pos
-- | Read a sub- or superscript expression
subOrSuperExpr :: OrgParser Inlines
subOrSuperExpr = try $ do
choice [ balancedSexp '{' '}'
, balancedSexp '(' ')' >>= return . enclosing ('(', ')')
subOrSuperExpr :: OrgParser (F Inlines)
subOrSuperExpr = try $
choice [ id <$> charsInBalanced '{' '}' (noneOf "\n\r")
, enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r")
, simpleSubOrSuperString
] >>= parseFromString (mconcat <$> many inline)
-- | Read a balanced sexp
balancedSexp :: Char
-> Char
-> OrgParser String
balancedSexp l r = try $ do
char l
res <- concat <$> many ( many1 (noneOf ([l, r] ++ "\n\r"))
<|> try (string [l, r])
<|> enclosing (l, r) <$> balancedSexp l r
)
char r
return res
where enclosing (left, right) s = left : s ++ [right]
simpleSubOrSuperString :: OrgParser String
simpleSubOrSuperString = try $
@ -869,8 +1092,3 @@ simpleSubOrSuperString = try $
, mappend <$> option [] ((:[]) <$> oneOf "+-")
<*> many1 alphaNum
]
enclosing :: (a, a)
-> [a]
-> [a]
enclosing (left, right) s = left : s ++ [right]

View file

@ -56,6 +56,7 @@ module Text.Pandoc.Shared (
stringify,
compactify,
compactify',
compactify'DL,
Element (..),
hierarchicalize,
uniqueIdent,
@ -82,7 +83,7 @@ module Text.Pandoc.Shared (
import Text.Pandoc.Definition
import Text.Pandoc.Walk
import Text.Pandoc.Generic
import Text.Pandoc.Builder (Blocks, ToMetaValue(..))
import Text.Pandoc.Builder (Inlines, Blocks, ToMetaValue(..))
import qualified Text.Pandoc.Builder as B
import qualified Text.Pandoc.UTF8 as UTF8
import System.Environment (getProgName)
@ -435,6 +436,21 @@ compactify' items =
_ -> items
_ -> items
-- | Like @compactify'@, but akts on items of definition lists.
compactify'DL :: [(Inlines, [Blocks])] -> [(Inlines, [Blocks])]
compactify'DL items =
let defs = concatMap snd items
defBlocks = reverse $ concatMap B.toList defs
in case defBlocks of
(Para x:_) -> if not $ any isPara (drop 1 defBlocks)
then let (t,ds) = last items
lastDef = B.toList $ last ds
ds' = init ds ++
[B.fromList $ init lastDef ++ [Plain x]]
in init items ++ [(t, ds')]
else items
_ -> items
isPara :: Block -> Bool
isPara (Para _) = True
isPara _ = False
@ -698,5 +714,3 @@ safeRead s = case reads s of
(d,x):_
| all isSpace x -> return d
_ -> fail $ "Could not read `" ++ s ++ "'"

View file

@ -8,7 +8,7 @@ import Tests.Arbitrary()
import Text.Pandoc.Builder
import Text.Pandoc
import Data.List (intersperse)
import Data.Monoid (mempty, mconcat)
import Data.Monoid (mempty, mappend, mconcat)
org :: String -> Pandoc
org = readOrg def
@ -98,6 +98,10 @@ tests =
"line \\\\ \nbreak" =?>
para ("line" <> linebreak <> "break")
, "Inline note" =:
"[fn::Schreib mir eine E-Mail]" =?>
para (note $ para "Schreib mir eine E-Mail")
, "Markup-chars not occuring on word break are symbols" =:
unlines [ "this+that+ +so+on"
, "seven*eight* nine*"
@ -359,29 +363,6 @@ tests =
, "#+END_COMMENT"] =?>
(mempty::Blocks)
, "Source Block in Text" =:
unlines [ "Low German greeting"
, " #+BEGIN_SRC haskell"
, " main = putStrLn greeting"
, " where greeting = \"moin\""
, " #+END_SRC" ] =?>
let attr' = ("", ["haskell"], [])
code' = "main = putStrLn greeting\n" ++
" where greeting = \"moin\"\n"
in mconcat [ para $ spcSep [ "Low", "German", "greeting" ]
, codeBlockWith attr' code'
]
, "Source Block" =:
unlines [ " #+BEGIN_SRC haskell"
, " main = putStrLn greeting"
, " where greeting = \"moin\""
, " #+END_SRC" ] =?>
let attr' = ("", ["haskell"], [])
code' = "main = putStrLn greeting\n" ++
" where greeting = \"moin\"\n"
in codeBlockWith attr' code'
, "Figure" =:
unlines [ "#+caption: A very courageous man."
, "#+name: goodguy"
@ -402,6 +383,48 @@ tests =
] =?>
para (image "the-red-queen.jpg" "fig:redqueen"
"Used as a metapher in evolutionary biology.")
, "Footnote" =:
unlines [ "A footnote[1]"
, ""
, "[1] First paragraph"
, ""
, "second paragraph"
] =?>
para (mconcat
[ "A", space, "footnote"
, note $ mconcat [ para ("First" <> space <> "paragraph")
, para ("second" <> space <> "paragraph")
]
])
, "Two footnotes" =:
unlines [ "Footnotes[fn:1][fn:2]"
, ""
, "[fn:1] First note."
, ""
, "[fn:2] Second note."
] =?>
para (mconcat
[ "Footnotes"
, note $ para ("First" <> space <> "note.")
, note $ para ("Second" <> space <> "note.")
])
, "Footnote followed by header" =:
unlines [ "Another note[fn:yay]"
, ""
, "[fn:yay] This is great!"
, ""
, "** Headline"
] =?>
mconcat
[ para (mconcat
[ "Another", space, "note"
, note $ para ("This" <> space <> "is" <> space <> "great!")
])
, header 2 "Headline"
]
]
, testGroup "Lists" $
@ -537,13 +560,36 @@ tests =
, ("TTL", [ plain $ "transistor-transistor" <> space <>
"logic" ])
, ("PSK", [ mconcat
[ para $ "phase-shift" <> space <> "keying"
, plain $ spcSep [ "a", "digital"
, "modulation", "scheme" ]
[ para $ "phase-shift" <> space <> "keying"
, para $ spcSep [ "a", "digital"
, "modulation", "scheme" ]
]
]
)
])
]
, "Compact definition list" =:
unlines [ "- ATP :: adenosine 5' triphosphate"
, "- DNA :: deoxyribonucleic acid"
, "- PCR :: polymerase chain reaction"
, ""
] =?>
definitionList
[ ("ATP", [ plain $ spcSep [ "adenosine", "5'", "triphosphate" ] ])
, ("DNA", [ plain $ spcSep [ "deoxyribonucleic", "acid" ] ])
, ("PCR", [ plain $ spcSep [ "polymerase", "chain", "reaction" ] ])
]
, "Loose bullet list" =:
unlines [ "- apple"
, ""
, "- orange"
, ""
, "- peach"
] =?>
bulletList [ para "apple"
, para "orange"
, para "peach"
]
]
, testGroup "Tables"
@ -656,5 +702,126 @@ tests =
[ [ plain "1" , plain "One" , plain "foo" ]
, [ plain "2" , plain mempty , plain mempty ]
]
, "Table with caption" =:
unlines [ "#+CAPTION: Hitchhiker's Multiplication Table"
, "| x | 6 |"
, "| 9 | 42 |"
] =?>
table "Hitchhiker's Multiplication Table"
[(AlignDefault, 0), (AlignDefault, 0)]
[]
[ [ plain "x", plain "6" ]
, [ plain "9", plain "42" ]
]
]
, testGroup "Blocks and fragments"
[ "Source block" =:
unlines [ " #+BEGIN_SRC haskell"
, " main = putStrLn greeting"
, " where greeting = \"moin\""
, " #+END_SRC" ] =?>
let attr' = ("", ["haskell"], [])
code' = "main = putStrLn greeting\n" ++
" where greeting = \"moin\"\n"
in codeBlockWith attr' code'
, "Source block between paragraphs" =:
unlines [ "Low German greeting"
, " #+BEGIN_SRC haskell"
, " main = putStrLn greeting"
, " where greeting = \"Moin!\""
, " #+END_SRC" ] =?>
let attr' = ("", ["haskell"], [])
code' = "main = putStrLn greeting\n" ++
" where greeting = \"Moin!\"\n"
in mconcat [ para $ spcSep [ "Low", "German", "greeting" ]
, codeBlockWith attr' code'
]
, "Example block" =:
unlines [ "#+begin_example"
, "A chosen representation of"
, "a rule."
, "#+eND_exAMPle"
] =?>
codeBlockWith ("", ["example"], [])
"A chosen representation of\na rule.\n"
, "HTML block" =:
unlines [ "#+BEGIN_HTML"
, "<aside>HTML5 is pretty nice.</aside>"
, "#+END_HTML"
] =?>
rawBlock "html" "<aside>HTML5 is pretty nice.</aside>\n"
, "Quote block" =:
unlines [ "#+BEGIN_QUOTE"
, "/Niemand/ hat die Absicht, eine Mauer zu errichten!"
, "#+END_QUOTE"
] =?>
blockQuote (para (spcSep [ emph "Niemand", "hat", "die", "Absicht,"
, "eine", "Mauer", "zu", "errichten!"
]))
, "Verse block" =:
unlines [ "The first lines of Goethe's /Faust/:"
, "#+begin_verse"
, "Habe nun, ach! Philosophie,"
, "Juristerei und Medizin,"
, "Und leider auch Theologie!"
, "Durchaus studiert, mit heißem Bemühn."
, "#+end_verse"
] =?>
mconcat
[ para $ spcSep [ "The", "first", "lines", "of"
, "Goethe's", emph "Faust" <> ":"]
, para $ mconcat
[ spcSep [ "Habe", "nun,", "ach!", "Philosophie," ]
, linebreak
, spcSep [ "Juristerei", "und", "Medizin," ]
, linebreak
, spcSep [ "Und", "leider", "auch", "Theologie!" ]
, linebreak
, spcSep [ "Durchaus", "studiert,", "mit", "heißem", "Bemühn." ]
]
]
, "LaTeX fragment" =:
unlines [ "\\begin{equation}"
, "X_i = \\begin{cases}"
, " G_{\\alpha(i)} & \\text{if }\\alpha(i-1) = \\alpha(i)\\\\"
, " C_{\\alpha(i)} & \\text{otherwise}"
, " \\end{cases}"
, "\\end{equation}"
] =?>
rawBlock "latex"
(unlines [ "\\begin{equation}"
, "X_i = \\begin{cases}"
, " G_{\\alpha(i)} & \\text{if }\\alpha(i-1) =" ++
" \\alpha(i)\\\\"
, " C_{\\alpha(i)} & \\text{otherwise}"
, " \\end{cases}"
, "\\end{equation}"
])
, "Code block with caption" =:
unlines [ "#+CAPTION: Functor laws in Haskell"
, "#+NAME: functor-laws"
, "#+BEGIN_SRC haskell"
, "fmap id = id"
, "fmap (p . q) = (fmap p) . (fmap q)"
, "#+END_SRC"
] =?>
divWith
nullAttr
(mappend
(plain $ spanWith ("", ["label"], [])
(spcSep [ "Functor", "laws", "in", "Haskell" ]))
(codeBlockWith ("functor-laws", ["haskell"], [])
(unlines [ "fmap id = id"
, "fmap (p . q) = (fmap p) . (fmap q)"
])))
]
]