Org reader: extract inline parser to module

Inline parsing code is moved to a separate module.  Parsers for block
starts are extracted as well, as those are used in the `endline` parser.

This is part of the Org-mode reader cleanup effort.
This commit is contained in:
Albert Krewinkel 2016-05-22 16:52:06 +02:00
parent a340c7249f
commit 39e8b4276e
5 changed files with 889 additions and 756 deletions

View file

@ -393,6 +393,8 @@ Library
Text.Pandoc.Readers.Odt.Generic.XMLConverter,
Text.Pandoc.Readers.Odt.Arrows.State,
Text.Pandoc.Readers.Odt.Arrows.Utils,
Text.Pandoc.Readers.Org.BlockStarts,
Text.Pandoc.Readers.Org.Inlines,
Text.Pandoc.Readers.Org.ParserState,
Text.Pandoc.Readers.Org.Parsing,
Text.Pandoc.Writers.Shared,

View file

@ -1,5 +1,4 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-
Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de>
@ -29,6 +28,8 @@ Conversion of org-mode formatted plain text to 'Pandoc' document.
-}
module Text.Pandoc.Readers.Org ( readOrg ) where
import Text.Pandoc.Readers.Org.BlockStarts
import Text.Pandoc.Readers.Org.Inlines
import Text.Pandoc.Readers.Org.ParserState
import Text.Pandoc.Readers.Org.Parsing
@ -38,19 +39,16 @@ import Text.Pandoc.Definition
import Text.Pandoc.Compat.Monoid ((<>))
import Text.Pandoc.Error
import Text.Pandoc.Options
import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline)
import Text.Pandoc.Shared (compactify', compactify'DL)
import Text.TeXMath (readTeX, writePandoc, DisplayType(..))
import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap
import Text.Pandoc.Shared ( compactify', compactify'DL )
import Control.Arrow (first)
import Control.Monad (foldM, guard, mplus, mzero, when)
import Control.Arrow ( first )
import Control.Monad ( foldM, guard, mzero )
import Control.Monad.Reader ( runReader )
import Data.Char (isAlphaNum, isSpace, toLower, toUpper)
import Data.List ( foldl', intersperse, isPrefixOf, isSuffixOf )
import Data.Char ( toLower, toUpper)
import Data.List ( foldl', intersperse, isPrefixOf )
import qualified Data.Map as M
import Data.Maybe ( fromMaybe, isNothing )
import Network.HTTP (urlEncode)
import Network.HTTP ( urlEncode )
-- | Parse org-mode string and return a Pandoc document.
@ -59,54 +57,6 @@ readOrg :: ReaderOptions -- ^ Reader options
-> Either PandocError Pandoc
readOrg opts s = flip runReader def $ readWithM parseOrg def{ orgStateOptions = opts } (s ++ "\n\n")
--
-- Functions acting on the parser state
--
recordAnchorId :: String -> OrgParser ()
recordAnchorId i = updateState $ \s ->
s{ orgStateAnchorIds = i : (orgStateAnchorIds s) }
pushToInlineCharStack :: Char -> OrgParser ()
pushToInlineCharStack c = updateState $ \s ->
s{ orgStateEmphasisCharStack = c:orgStateEmphasisCharStack s }
popInlineCharStack :: OrgParser ()
popInlineCharStack = updateState $ \s ->
s{ orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ s }
surroundingEmphasisChar :: OrgParser [Char]
surroundingEmphasisChar =
take 1 . drop 1 . orgStateEmphasisCharStack <$> getState
startEmphasisNewlinesCounting :: Int -> OrgParser ()
startEmphasisNewlinesCounting maxNewlines = updateState $ \s ->
s{ orgStateEmphasisNewlines = Just maxNewlines }
decEmphasisNewlinesCount :: OrgParser ()
decEmphasisNewlinesCount = updateState $ \s ->
s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s }
newlinesCountWithinLimits :: OrgParser Bool
newlinesCountWithinLimits = do
st <- getState
return $ ((< 0) <$> orgStateEmphasisNewlines st) /= Just True
resetEmphasisNewlines :: OrgParser ()
resetEmphasisNewlines = updateState $ \s ->
s{ orgStateEmphasisNewlines = Nothing }
addLinkFormat :: String
-> (String -> String)
-> OrgParser ()
addLinkFormat key formatter = updateState $ \s ->
let fs = orgStateLinkFormatters s
in s{ orgStateLinkFormatters = M.insert key formatter fs }
addToNotesTable :: OrgNoteRecord -> OrgParser ()
addToNotesTable note = do
oldnotes <- orgStateNotes' <$> getState
updateState $ \s -> s{ orgStateNotes' = note:oldnotes }
--
-- Export Settings
--
@ -259,7 +209,7 @@ block = choice [ mempty <$ blanklines
, genericDrawer
, specialLine
, header
, return <$> hline
, horizontalRule
, list
, latexFragment
, noteBlock
@ -457,9 +407,6 @@ indentWith num = do
type SwitchOption = (Char, Maybe String)
orgArgWord :: OrgParser String
orgArgWord = many1 orgArgWordChar
-- | Parse code block arguments
-- TODO: We currently don't handle switches.
codeHeaderArgs :: OrgParser ([String], [(String, String)])
@ -474,7 +421,10 @@ codeHeaderArgs = try $ do
, map toRundocAttrib (("language", language) : parameters)
)
else ([ pandocLang ], parameters)
where hasRundocParameters = not . null
where
hasRundocParameters = not . null
toRundocAttrib = first ("rundoc-" ++)
switch :: OrgParser SwitchOption
switch = try $ simpleSwitch <|> lineNumbersSwitch
@ -508,17 +458,6 @@ blockOption = try $ do
paramValue <- option "yes" orgParamValue
return (argKey, paramValue)
inlineBlockOption :: OrgParser (String, String)
inlineBlockOption = try $ do
argKey <- orgArgKey
paramValue <- option "yes" orgInlineParamValue
return (argKey, paramValue)
orgArgKey :: OrgParser String
orgArgKey = try $
skipSpaces *> char ':'
*> many1 orgArgWordChar
orgParamValue :: OrgParser String
orgParamValue = try $
skipSpaces
@ -526,19 +465,6 @@ orgParamValue = try $
*> many1 (noneOf "\t\n\r ")
<* skipSpaces
orgInlineParamValue :: OrgParser String
orgInlineParamValue = try $
skipSpaces
*> notFollowedBy (char ':')
*> many1 (noneOf "\t\n\r ]")
<* skipSpaces
orgArgWordChar :: OrgParser Char
orgArgWordChar = alphaNum <|> oneOf "-_"
toRundocAttrib :: (String, String) -> (String, String)
toRundocAttrib = first ("rundoc-" ++)
commaEscaped :: String -> String
commaEscaped (',':cs@('*':_)) = cs
commaEscaped (',':cs@('#':'+':_)) = cs
@ -552,7 +478,10 @@ exampleCode :: String -> Blocks
exampleCode = B.codeBlockWith ("", ["example"], [])
exampleLine :: OrgParser String
exampleLine = try $ skipSpaces *> string ": " *> anyLine
exampleLine = try $ exampleLineStart *> anyLine
horizontalRule :: OrgParser (F Blocks)
horizontalRule = return B.horizontalRule <$ try hline
--
@ -582,11 +511,6 @@ genericDrawer = try $ do
drawerDiv :: String -> F Blocks -> F Blocks
drawerDiv drawerName = fmap $ B.divWith (mempty, [drawerName, "drawer"], mempty)
drawerStart :: OrgParser String
drawerStart = try $
skipSpaces *> drawerName <* skipSpaces <* newline
where drawerName = char ':' *> manyTill nonspaceChar (char ':')
drawerLine :: OrgParser String
drawerLine = anyLine
@ -639,31 +563,38 @@ figure = try $ do
let attr = (mempty, mempty, figKeyVals)
return $ (B.para . B.imageWith attr src (withFigPrefix figName) <$> figCaption)
where
withFigPrefix :: String -> String
withFigPrefix cs =
if "fig:" `isPrefixOf` cs
then cs
else "fig:" ++ cs
selfTarget :: OrgParser String
selfTarget = try $ char '[' *> linkTarget <* char ']'
--
-- Comments, Options and Metadata
--
addLinkFormat :: String
-> (String -> String)
-> OrgParser ()
addLinkFormat key formatter = updateState $ \s ->
let fs = orgStateLinkFormatters s
in s{ orgStateLinkFormatters = M.insert key formatter fs }
specialLine :: OrgParser (F Blocks)
specialLine = fmap return . try $ metaLine <|> commentLine
-- The order, in which blocks are tried, makes sure that we're not looking at
-- the beginning of a block, so we don't need to check for it
metaLine :: OrgParser Blocks
metaLine = mempty <$ metaLineStart <* (optionLine <|> declarationLine)
-- The order, in which blocks are tried, makes sure that we're not looking at
-- the beginning of a block, so we don't need to check for it
metaLineStart :: OrgParser ()
metaLineStart = try $ skipSpaces <* string "#+"
commentLine :: OrgParser Blocks
commentLine = commentLineStart *> anyLine *> pure mempty
commentLineStart :: OrgParser ()
commentLineStart = try $ skipSpaces <* string "# "
declarationLine :: OrgParser ()
declarationLine = try $ do
key <- metaKey
@ -741,23 +672,6 @@ header = try $ do
*> many1 tag
<* skipSpaces
headerStart :: OrgParser Int
headerStart = try $
(length <$> many1 (char '*')) <* many1 (char ' ') <* updateLastPreCharPos
-- Don't use (or need) the reader wrapper here, we want hline to be
-- @show@able. Otherwise we can't use it with @notFollowedBy'@.
-- | Horizontal Line (five -- dashes or more)
hline :: OrgParser Blocks
hline = try $ do
skipSpaces
string "-----"
many (char '-')
skipSpaces
newline
return B.horizontalRule
--
-- Tables
@ -793,9 +707,6 @@ orgToPandocTable :: OrgTable
orgToPandocTable (OrgTable aligns heads lns) caption =
B.table caption (zip aligns $ repeat 0) heads lns
tableStart :: OrgParser Char
tableStart = try $ skipSpaces *> char '|'
tableRows :: OrgParser [OrgTableRow]
tableRows = try $ many (tableAlignRow <|> tableHline <|> tableContentRow)
@ -899,25 +810,12 @@ latexFragment = try $ do
, "\\end{", e, "}\n"
]
latexEnvStart :: OrgParser String
latexEnvStart = try $ do
skipSpaces *> string "\\begin{"
*> latexEnvName
<* string "}"
<* blankline
latexEnd :: String -> OrgParser ()
latexEnd envName = try $
() <$ skipSpaces
<* string ("\\end{" ++ envName ++ "}")
<* blankline
-- | Parses a LaTeX environment name.
latexEnvName :: OrgParser String
latexEnvName = try $ do
mappend <$> many1 alphaNum
<*> option "" (string "*")
--
-- Footnote defintions
@ -942,7 +840,7 @@ paraOrPlain = try $ do
-- is directly followed by a list item, in which case the block is read as
-- plain text.
try (guard nl
*> notFollowedBy (inList *> (orderedListStart <|> bulletListStart))
*> notFollowedBy (inList *> (() <$ orderedListStart <|> bulletListStart))
*> return (B.para <$> ils))
<|> (return (B.plain <$> ils))
@ -971,38 +869,21 @@ orderedList :: OrgParser (F Blocks)
orderedList = fmap B.orderedList . fmap compactify' . sequence
<$> many1 (listItem orderedListStart)
genericListStart :: OrgParser String
-> OrgParser Int
genericListStart listMarker = try $
(+) <$> (length <$> many spaceChar)
<*> (length <$> listMarker <* many1 spaceChar)
-- parses bullet list marker. maybe we know the indent level
bulletListStart :: OrgParser Int
bulletListStart = bulletListStart' Nothing
bulletListStart' :: Maybe Int -> OrgParser Int
-- returns length of bulletList prefix, inclusive of marker
bulletListStart' Nothing = do ind <- length <$> many spaceChar
when (ind == 0) $ notFollowedBy (char '*')
oneOf bullets
many1 spaceChar
oneOf (bullets $ ind == 0)
skipSpaces1
return (ind + 1)
-- Unindented lists are legal, but they can't use '*' bullets
-- We return n to maintain compatibility with the generic listItem
bulletListStart' (Just n) = do count (n-1) spaceChar
when (n == 1) $ notFollowedBy (char '*')
oneOf bullets
oneOf (bullets $ n == 1)
many1 spaceChar
return n
bullets :: String
bullets = "*+-"
orderedListStart :: OrgParser Int
orderedListStart = genericListStart orderedListMarker
-- Ordered list markers allowed in org-mode
where orderedListMarker = mappend <$> many1 digit <*> (pure <$> oneOf ".)")
-- Unindented lists are legal, but they can't use '*' bullets.
-- We return n to maintain compatibility with the generic listItem.
bullets :: Bool -> String
bullets unindented = if unindented then "+-" else "*+-"
definitionListItem :: OrgParser Int
-> OrgParser (F (Inlines, [Blocks]))
@ -1040,602 +921,6 @@ listContinuation markerLength = try $
<*> many blankline)
where listLine = try $ indentWith markerLength *> anyLineNewline
-- | Parse any line, include the final newline in the output.
anyLineNewline :: OrgParser String
anyLineNewline = (++ "\n") <$> anyLine
--
-- inline
--
inline :: OrgParser (F Inlines)
inline =
choice [ whitespace
, linebreak
, cite
, footnote
, linkOrImage
, anchor
, inlineCodeBlock
, str
, endline
, emph
, strong
, strikeout
, underline
, code
, math
, displayMath
, verbatim
, subscript
, superscript
, inlineLaTeX
, smart
, symbol
] <* (guard =<< newlinesCountWithinLimits)
<?> "inline"
parseInlines :: OrgParser (F Inlines)
parseInlines = trimInlinesF . mconcat <$> many1 inline
-- treat these as potentially non-text when parsing inline:
specialChars :: [Char]
specialChars = "\"$'()*+-,./:<=>[\\]^_{|}~"
whitespace :: OrgParser (F Inlines)
whitespace = pure B.space <$ skipMany1 spaceChar
<* updateLastPreCharPos
<* updateLastForbiddenCharPos
<?> "whitespace"
linebreak :: OrgParser (F Inlines)
linebreak = try $ pure B.linebreak <$ string "\\\\" <* skipSpaces <* newline
str :: OrgParser (F Inlines)
str = return . B.str <$> many1 (noneOf $ specialChars ++ "\n\r ")
<* updateLastStrPos
-- | An endline character that can be treated as a space, not a structural
-- break. This should reflect the values of the Emacs variable
-- @org-element-pagaraph-separate@.
endline :: OrgParser (F Inlines)
endline = try $ do
newline
notFollowedBy blankline
notFollowedBy' exampleLine
notFollowedBy' hline
notFollowedBy' noteMarker
notFollowedBy' tableStart
notFollowedBy' drawerStart
notFollowedBy' headerStart
notFollowedBy' metaLineStart
notFollowedBy' latexEnvStart
notFollowedBy' commentLineStart
notFollowedBy' bulletListStart
notFollowedBy' orderedListStart
decEmphasisNewlinesCount
guard =<< newlinesCountWithinLimits
updateLastPreCharPos
return . return $ B.softbreak
cite :: OrgParser (F Inlines)
cite = try $ do
guardEnabled Ext_citations
(cs, raw) <- withRaw normalCite
return $ (flip B.cite (B.text raw)) <$> cs
normalCite :: OrgParser (F [Citation])
normalCite = try $ char '['
*> skipSpaces
*> citeList
<* skipSpaces
<* char ']'
citeList :: OrgParser (F [Citation])
citeList = sequence <$> sepBy1 citation (try $ char ';' *> skipSpaces)
citation :: OrgParser (F Citation)
citation = try $ do
pref <- prefix
(suppress_author, key) <- citeKey
suff <- suffix
return $ do
x <- pref
y <- suff
return $ Citation{ citationId = key
, citationPrefix = B.toList x
, citationSuffix = B.toList y
, citationMode = if suppress_author
then SuppressAuthor
else NormalCitation
, citationNoteNum = 0
, citationHash = 0
}
where
prefix = trimInlinesF . mconcat <$>
manyTill inline (char ']' <|> (']' <$ lookAhead citeKey))
suffix = try $ do
hasSpace <- option False (notFollowedBy nonspaceChar >> return True)
skipSpaces
rest <- trimInlinesF . mconcat <$>
many (notFollowedBy (oneOf ";]") *> inline)
return $ if hasSpace
then (B.space <>) <$> rest
else rest
footnote :: OrgParser (F Inlines)
footnote = try $ inlineNote <|> referencedNote
inlineNote :: OrgParser (F Inlines)
inlineNote = try $ do
string "[fn:"
ref <- many alphaNum
char ':'
note <- fmap B.para . trimInlinesF . mconcat <$> many1Till inline (char ']')
when (not $ null ref) $
addToNotesTable ("fn:" ++ ref, note)
return $ B.note <$> note
referencedNote :: OrgParser (F Inlines)
referencedNote = try $ do
ref <- noteMarker
return $ do
notes <- asksF orgStateNotes'
case lookup ref notes of
Nothing -> return $ B.str $ "[" ++ ref ++ "]"
Just contents -> do
st <- askF
let contents' = runF contents st{ orgStateNotes' = [] }
return $ B.note contents'
noteMarker :: OrgParser String
noteMarker = try $ do
char '['
choice [ many1Till digit (char ']')
, (++) <$> string "fn:"
<*> many1Till (noneOf "\n\r\t ") (char ']')
]
linkOrImage :: OrgParser (F Inlines)
linkOrImage = explicitOrImageLink
<|> selflinkOrImage
<|> angleLink
<|> plainLink
<?> "link or image"
explicitOrImageLink :: OrgParser (F Inlines)
explicitOrImageLink = try $ do
char '['
srcF <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget
title <- enclosedRaw (char '[') (char ']')
title' <- parseFromString (mconcat <$> many inline) title
char ']'
return $ do
src <- srcF
if isImageFilename title
then pure $ B.link src "" $ B.image title mempty mempty
else linkToInlinesF src =<< title'
selflinkOrImage :: OrgParser (F Inlines)
selflinkOrImage = try $ do
src <- char '[' *> linkTarget <* char ']'
return $ linkToInlinesF src (B.str src)
plainLink :: OrgParser (F Inlines)
plainLink = try $ do
(orig, src) <- uri
returnF $ B.link src "" (B.str orig)
angleLink :: OrgParser (F Inlines)
angleLink = try $ do
char '<'
link <- plainLink
char '>'
return link
selfTarget :: OrgParser String
selfTarget = try $ char '[' *> linkTarget <* char ']'
linkTarget :: OrgParser String
linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]")
possiblyEmptyLinkTarget :: OrgParser String
possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]")
applyCustomLinkFormat :: String -> OrgParser (F String)
applyCustomLinkFormat link = do
let (linkType, rest) = break (== ':') link
return $ do
formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters
return $ maybe link ($ drop 1 rest) formatter
-- | Take a link and return a function which produces new inlines when given
-- description inlines.
linkToInlinesF :: String -> Inlines -> F Inlines
linkToInlinesF linkStr =
case linkStr of
"" -> pure . B.link mempty "" -- wiki link (empty by convention)
('#':_) -> pure . B.link linkStr "" -- document-local fraction
_ -> case cleanLinkString linkStr of
(Just cleanedLink) -> if isImageFilename cleanedLink
then const . pure $ B.image cleanedLink "" ""
else pure . B.link cleanedLink ""
Nothing -> internalLink linkStr -- other internal link
-- | Cleanup and canonicalize a string describing a link. Return @Nothing@ if
-- the string does not appear to be a link.
cleanLinkString :: String -> Maybe String
cleanLinkString s =
case s of
'/':_ -> Just $ "file://" ++ s -- absolute path
'.':'/':_ -> Just s -- relative path
'.':'.':'/':_ -> Just s -- relative path
-- Relative path or URL (file schema)
'f':'i':'l':'e':':':s' -> Just $ if ("//" `isPrefixOf` s') then s else s'
_ | isUrl s -> Just s -- URL
_ -> Nothing
where
isUrl :: String -> Bool
isUrl cs =
let (scheme, path) = break (== ':') cs
in all (\c -> isAlphaNum c || c `elem` (".-"::String)) scheme
&& not (null path)
isImageFilename :: String -> Bool
isImageFilename filename =
any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions &&
(any (\x -> (x++":") `isPrefixOf` filename) protocols ||
':' `notElem` filename)
where
imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ]
protocols = [ "file", "http", "https" ]
internalLink :: String -> Inlines -> F Inlines
internalLink link title = do
anchorB <- (link `elem`) <$> asksF orgStateAnchorIds
if anchorB
then return $ B.link ('#':link) "" title
else return $ B.emph title
-- | Parse an anchor like @<<anchor-id>>@ and return an empty span with
-- @anchor-id@ set as id. Legal anchors in org-mode are defined through
-- @org-target-regexp@, which is fairly liberal. Since no link is created if
-- @anchor-id@ contains spaces, we are more restrictive in what is accepted as
-- an anchor.
anchor :: OrgParser (F Inlines)
anchor = try $ do
anchorId <- parseAnchor
recordAnchorId anchorId
returnF $ B.spanWith (solidify anchorId, [], []) mempty
where
parseAnchor = string "<<"
*> many1 (noneOf "\t\n\r<>\"' ")
<* string ">>"
<* skipSpaces
-- | Replace every char but [a-zA-Z0-9_.-:] with a hypen '-'. This mirrors
-- the org function @org-export-solidify-link-text@.
solidify :: String -> String
solidify = map replaceSpecialChar
where replaceSpecialChar c
| isAlphaNum c = c
| c `elem` ("_.-:" :: String) = c
| otherwise = '-'
-- | Parses an inline code block and marks it as an babel block.
inlineCodeBlock :: OrgParser (F Inlines)
inlineCodeBlock = try $ do
string "src_"
lang <- many1 orgArgWordChar
opts <- option [] $ enclosedByPair '[' ']' inlineBlockOption
inlineCode <- enclosedByPair '{' '}' (noneOf "\n\r")
let attrClasses = [translateLang lang, rundocBlockClass]
let attrKeyVal = map toRundocAttrib (("language", lang) : opts)
returnF $ B.codeWith ("", attrClasses, attrKeyVal) inlineCode
enclosedByPair :: Char -- ^ opening char
-> Char -- ^ closing char
-> OrgParser a -- ^ parser
-> OrgParser [a]
enclosedByPair s e p = char s *> many1Till p (char e)
emph :: OrgParser (F Inlines)
emph = fmap B.emph <$> emphasisBetween '/'
strong :: OrgParser (F Inlines)
strong = fmap B.strong <$> emphasisBetween '*'
strikeout :: OrgParser (F Inlines)
strikeout = fmap B.strikeout <$> emphasisBetween '+'
-- There is no underline, so we use strong instead.
underline :: OrgParser (F Inlines)
underline = fmap B.strong <$> emphasisBetween '_'
verbatim :: OrgParser (F Inlines)
verbatim = return . B.code <$> verbatimBetween '='
code :: OrgParser (F Inlines)
code = return . B.code <$> verbatimBetween '~'
subscript :: OrgParser (F Inlines)
subscript = fmap B.subscript <$> try (char '_' *> subOrSuperExpr)
superscript :: OrgParser (F Inlines)
superscript = fmap B.superscript <$> try (char '^' *> subOrSuperExpr)
math :: OrgParser (F Inlines)
math = return . B.math <$> choice [ math1CharBetween '$'
, mathStringBetween '$'
, rawMathBetween "\\(" "\\)"
]
displayMath :: OrgParser (F Inlines)
displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]"
, rawMathBetween "$$" "$$"
]
updatePositions :: Char
-> OrgParser (Char)
updatePositions c = do
when (c `elem` emphasisPreChars) updateLastPreCharPos
when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos
return c
symbol :: OrgParser (F Inlines)
symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions)
emphasisBetween :: Char
-> OrgParser (F Inlines)
emphasisBetween c = try $ do
startEmphasisNewlinesCounting emphasisAllowedNewlines
res <- enclosedInlines (emphasisStart c) (emphasisEnd c)
isTopLevelEmphasis <- null . orgStateEmphasisCharStack <$> getState
when isTopLevelEmphasis
resetEmphasisNewlines
return res
verbatimBetween :: Char
-> OrgParser String
verbatimBetween c = try $
emphasisStart c *>
many1TillNOrLessNewlines 1 (noneOf "\n\r") (emphasisEnd c)
-- | Parses a raw string delimited by @c@ using Org's math rules
mathStringBetween :: Char
-> OrgParser String
mathStringBetween c = try $ do
mathStart c
body <- many1TillNOrLessNewlines mathAllowedNewlines
(noneOf (c:"\n\r"))
(lookAhead $ mathEnd c)
final <- mathEnd c
return $ body ++ [final]
-- | Parse a single character between @c@ using math rules
math1CharBetween :: Char
-> OrgParser String
math1CharBetween c = try $ do
char c
res <- noneOf $ c:mathForbiddenBorderChars
char c
eof <|> () <$ lookAhead (oneOf mathPostChars)
return [res]
rawMathBetween :: String
-> String
-> OrgParser String
rawMathBetween s e = try $ string s *> manyTill anyChar (try $ string e)
-- | Parses the start (opening character) of emphasis
emphasisStart :: Char -> OrgParser Char
emphasisStart c = try $ do
guard =<< afterEmphasisPreChar
guard =<< notAfterString
char c
lookAhead (noneOf emphasisForbiddenBorderChars)
pushToInlineCharStack c
return c
-- | Parses the closing character of emphasis
emphasisEnd :: Char -> OrgParser Char
emphasisEnd c = try $ do
guard =<< notAfterForbiddenBorderChar
char c
eof <|> () <$ lookAhead acceptablePostChars
updateLastStrPos
popInlineCharStack
return c
where acceptablePostChars =
surroundingEmphasisChar >>= \x -> oneOf (x ++ emphasisPostChars)
mathStart :: Char -> OrgParser Char
mathStart c = try $
char c <* notFollowedBy' (oneOf (c:mathForbiddenBorderChars))
mathEnd :: Char -> OrgParser Char
mathEnd c = try $ do
res <- noneOf (c:mathForbiddenBorderChars)
char c
eof <|> () <$ lookAhead (oneOf mathPostChars)
return res
enclosedInlines :: OrgParser a
-> OrgParser b
-> OrgParser (F Inlines)
enclosedInlines start end = try $
trimInlinesF . mconcat <$> enclosed start end inline
enclosedRaw :: OrgParser a
-> OrgParser b
-> OrgParser String
enclosedRaw start end = try $
start *> (onSingleLine <|> spanningTwoLines)
where onSingleLine = try $ many1Till (noneOf "\n\r") end
spanningTwoLines = try $
anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine
-- | Like many1Till, but parses at most @n+1@ lines. @p@ must not consume
-- newlines.
many1TillNOrLessNewlines :: Int
-> OrgParser Char
-> OrgParser a
-> OrgParser String
many1TillNOrLessNewlines n p end = try $
nMoreLines (Just n) mempty >>= oneOrMore
where
nMoreLines Nothing cs = return cs
nMoreLines (Just 0) cs = try $ (cs ++) <$> finalLine
nMoreLines k cs = try $ (final k cs <|> rest k cs)
>>= uncurry nMoreLines
final _ cs = (\x -> (Nothing, cs ++ x)) <$> try finalLine
rest m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> try (manyTill p newline)
finalLine = try $ manyTill p end
minus1 k = k - 1
oneOrMore cs = guard (not $ null cs) *> return cs
-- Org allows customization of the way it reads emphasis. We use the defaults
-- here (see, e.g., the Emacs Lisp variable `org-emphasis-regexp-components`
-- for details).
-- | Chars allowed to occur before emphasis (spaces and newlines are ok, too)
emphasisPreChars :: [Char]
emphasisPreChars = "\t \"'({"
-- | Chars allowed at after emphasis
emphasisPostChars :: [Char]
emphasisPostChars = "\t\n !\"'),-.:;?\\}"
-- | Chars not allowed at the (inner) border of emphasis
emphasisForbiddenBorderChars :: [Char]
emphasisForbiddenBorderChars = "\t\n\r \"',"
-- | The maximum number of newlines within
emphasisAllowedNewlines :: Int
emphasisAllowedNewlines = 1
-- LaTeX-style math: see `org-latex-regexps` for details
-- | Chars allowed after an inline ($...$) math statement
mathPostChars :: [Char]
mathPostChars = "\t\n \"'),-.:;?"
-- | Chars not allowed at the (inner) border of math
mathForbiddenBorderChars :: [Char]
mathForbiddenBorderChars = "\t\n\r ,;.$"
-- | Maximum number of newlines in an inline math statement
mathAllowedNewlines :: Int
mathAllowedNewlines = 2
-- | Whether we are right behind a char allowed before emphasis
afterEmphasisPreChar :: OrgParser Bool
afterEmphasisPreChar = do
pos <- getPosition
lastPrePos <- orgStateLastPreCharPos <$> getState
return . fromMaybe True $ (== pos) <$> lastPrePos
-- | Whether the parser is right after a forbidden border char
notAfterForbiddenBorderChar :: OrgParser Bool
notAfterForbiddenBorderChar = do
pos <- getPosition
lastFBCPos <- orgStateLastForbiddenCharPos <$> getState
return $ lastFBCPos /= Just pos
-- | Read a sub- or superscript expression
subOrSuperExpr :: OrgParser (F Inlines)
subOrSuperExpr = try $
choice [ id <$> charsInBalanced '{' '}' (noneOf "\n\r")
, enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r")
, simpleSubOrSuperString
] >>= parseFromString (mconcat <$> many inline)
where enclosing (left, right) s = left : s ++ [right]
simpleSubOrSuperString :: OrgParser String
simpleSubOrSuperString = try $ do
state <- getState
guard . exportSubSuperscripts . orgStateExportSettings $ state
choice [ string "*"
, mappend <$> option [] ((:[]) <$> oneOf "+-")
<*> many1 alphaNum
]
inlineLaTeX :: OrgParser (F Inlines)
inlineLaTeX = try $ do
cmd <- inlineLaTeXCommand
maybe mzero returnF $
parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` parseAsInlineLaTeX cmd
where
parseAsMath :: String -> Maybe Inlines
parseAsMath cs = B.fromList <$> texMathToPandoc cs
parseAsInlineLaTeX :: String -> Maybe Inlines
parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs
parseAsMathMLSym :: String -> Maybe Inlines
parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs)
-- drop initial backslash and any trailing "{}"
where clean = dropWhileEnd (`elem` ("{}" :: String)) . drop 1
state :: ParserState
state = def{ stateOptions = def{ readerParseRaw = True }}
texMathToPandoc :: String -> Maybe [Inline]
texMathToPandoc cs = (maybeRight $ readTeX cs) >>= writePandoc DisplayInline
maybeRight :: Either a b -> Maybe b
maybeRight = either (const Nothing) Just
inlineLaTeXCommand :: OrgParser String
inlineLaTeXCommand = try $ do
rest <- getInput
case runParser rawLaTeXInline def "source" rest of
Right (RawInline _ cs) -> do
-- drop any trailing whitespace, those are not be part of the command as
-- far as org mode is concerned.
let cmdNoSpc = dropWhileEnd isSpace cs
let len = length cmdNoSpc
count len anyChar
return cmdNoSpc
_ -> mzero
-- Taken from Data.OldList.
dropWhileEnd :: (a -> Bool) -> [a] -> [a]
dropWhileEnd p = foldr (\x xs -> if p x && null xs then [] else x : xs) []
smart :: OrgParser (F Inlines)
smart = do
getOption readerSmart >>= guard
doubleQuoted <|> singleQuoted <|>
choice (map (return <$>) [orgApostrophe, orgDash, orgEllipses])
where
orgDash = dash <* updatePositions '-'
orgEllipses = ellipses <* updatePositions '.'
orgApostrophe =
(char '\'' <|> char '\8217') <* updateLastPreCharPos
<* updateLastForbiddenCharPos
*> return (B.str "\x2019")
singleQuoted :: OrgParser (F Inlines)
singleQuoted = try $ do
singleQuoteStart
updatePositions '\''
withQuoteContext InSingleQuote $
fmap B.singleQuoted . trimInlinesF . mconcat <$>
many1Till inline (singleQuoteEnd <* updatePositions '\'')
-- doubleQuoted will handle regular double-quoted sections, as well
-- as dialogues with an open double-quote without a close double-quote
-- in the same paragraph.
doubleQuoted :: OrgParser (F Inlines)
doubleQuoted = try $ do
doubleQuoteStart
updatePositions '"'
contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline)
(withQuoteContext InDoubleQuote $ (doubleQuoteEnd <* updateLastForbiddenCharPos) >> return
(fmap B.doubleQuoted . trimInlinesF $ contents))
<|> (return $ return (B.str "\8220") <> contents)

View file

@ -0,0 +1,112 @@
{-
Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.Org.Options
Copyright : Copyright (C) 2014-2016 Albert Krewinkel
License : GNU GPL, version 2 or above
Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de>
Parsers for Org-mode inline elements.
-}
module Text.Pandoc.Readers.Org.BlockStarts
( exampleLineStart
, hline
, noteMarker
, tableStart
, drawerStart
, headerStart
, metaLineStart
, latexEnvStart
, commentLineStart
, bulletListStart
, orderedListStart
) where
import Text.Pandoc.Readers.Org.Parsing
-- | Horizontal Line (five -- dashes or more)
hline :: OrgParser ()
hline = try $ do
skipSpaces
string "-----"
many (char '-')
skipSpaces
newline
return ()
-- | Read the start of a header line, return the header level
headerStart :: OrgParser Int
headerStart = try $
(length <$> many1 (char '*')) <* many1 (char ' ') <* updateLastPreCharPos
tableStart :: OrgParser Char
tableStart = try $ skipSpaces *> char '|'
latexEnvStart :: OrgParser String
latexEnvStart = try $ do
skipSpaces *> string "\\begin{"
*> latexEnvName
<* string "}"
<* blankline
where
latexEnvName :: OrgParser String
latexEnvName = try $ mappend <$> many1 alphaNum <*> option "" (string "*")
-- | Parses bullet list marker.
bulletListStart :: OrgParser ()
bulletListStart = try $
choice
[ () <$ skipSpaces <* oneOf "+-" <* skipSpaces1
, () <$ skipSpaces1 <* char '*' <* skipSpaces1
]
genericListStart :: OrgParser String
-> OrgParser Int
genericListStart listMarker = try $
(+) <$> (length <$> many spaceChar)
<*> (length <$> listMarker <* many1 spaceChar)
orderedListStart :: OrgParser Int
orderedListStart = genericListStart orderedListMarker
-- Ordered list markers allowed in org-mode
where orderedListMarker = mappend <$> many1 digit <*> (pure <$> oneOf ".)")
drawerStart :: OrgParser String
drawerStart = try $
skipSpaces *> drawerName <* skipSpaces <* newline
where drawerName = char ':' *> manyTill nonspaceChar (char ':')
metaLineStart :: OrgParser ()
metaLineStart = try $ skipSpaces <* string "#+"
commentLineStart :: OrgParser ()
commentLineStart = try $ skipSpaces <* string "# "
exampleLineStart :: OrgParser ()
exampleLineStart = () <$ try (skipSpaces *> string ": ")
noteMarker :: OrgParser String
noteMarker = try $ do
char '['
choice [ many1Till digit (char ']')
, (++) <$> string "fn:"
<*> many1Till (noneOf "\n\r\t ") (char ']')
]

View file

@ -0,0 +1,715 @@
{-# LANGUAGE OverloadedStrings #-}
{-
Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.Org.Options
Copyright : Copyright (C) 2014-2016 Albert Krewinkel
License : GNU GPL, version 2 or above
Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de>
Parsers for Org-mode inline elements.
-}
module Text.Pandoc.Readers.Org.Inlines
( inline
, addToNotesTable
, parseInlines
, isImageFilename
, linkTarget
) where
import Text.Pandoc.Readers.Org.BlockStarts
import Text.Pandoc.Readers.Org.ParserState
import Text.Pandoc.Readers.Org.Parsing
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Builder ( Inlines )
import Text.Pandoc.Definition
import Text.Pandoc.Compat.Monoid ( (<>) )
import Text.Pandoc.Options
import Text.Pandoc.Readers.LaTeX ( inlineCommand, rawLaTeXInline )
import Text.TeXMath ( readTeX, writePandoc, DisplayType(..) )
import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap
import Control.Arrow ( first )
import Control.Monad ( guard, mplus, mzero, when )
import Data.Char ( isAlphaNum, isSpace )
import Data.List ( isPrefixOf, isSuffixOf )
import Data.Maybe ( fromMaybe )
import qualified Data.Map as M
-- | Prefix used for Rundoc classes and arguments.
rundocPrefix :: String
rundocPrefix = "rundoc-"
-- | The class-name used to mark rundoc blocks.
rundocBlockClass :: String
rundocBlockClass = rundocPrefix ++ "block"
toRundocAttrib :: (String, String) -> (String, String)
toRundocAttrib = first ("rundoc-" ++)
translateLang :: String -> String
translateLang "C" = "c"
translateLang "C++" = "cpp"
translateLang "emacs-lisp" = "commonlisp" -- emacs lisp is not supported
translateLang "js" = "javascript"
translateLang "lisp" = "commonlisp"
translateLang "R" = "r"
translateLang "sh" = "bash"
translateLang "sqlite" = "sql"
translateLang cs = cs
--
-- Functions acting on the parser state
--
recordAnchorId :: String -> OrgParser ()
recordAnchorId i = updateState $ \s ->
s{ orgStateAnchorIds = i : (orgStateAnchorIds s) }
pushToInlineCharStack :: Char -> OrgParser ()
pushToInlineCharStack c = updateState $ \s ->
s{ orgStateEmphasisCharStack = c:orgStateEmphasisCharStack s }
popInlineCharStack :: OrgParser ()
popInlineCharStack = updateState $ \s ->
s{ orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ s }
surroundingEmphasisChar :: OrgParser [Char]
surroundingEmphasisChar =
take 1 . drop 1 . orgStateEmphasisCharStack <$> getState
startEmphasisNewlinesCounting :: Int -> OrgParser ()
startEmphasisNewlinesCounting maxNewlines = updateState $ \s ->
s{ orgStateEmphasisNewlines = Just maxNewlines }
decEmphasisNewlinesCount :: OrgParser ()
decEmphasisNewlinesCount = updateState $ \s ->
s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s }
newlinesCountWithinLimits :: OrgParser Bool
newlinesCountWithinLimits = do
st <- getState
return $ ((< 0) <$> orgStateEmphasisNewlines st) /= Just True
resetEmphasisNewlines :: OrgParser ()
resetEmphasisNewlines = updateState $ \s ->
s{ orgStateEmphasisNewlines = Nothing }
addToNotesTable :: OrgNoteRecord -> OrgParser ()
addToNotesTable note = do
oldnotes <- orgStateNotes' <$> getState
updateState $ \s -> s{ orgStateNotes' = note:oldnotes }
-- | Parse a single Org-mode inline element
inline :: OrgParser (F Inlines)
inline =
choice [ whitespace
, linebreak
, cite
, footnote
, linkOrImage
, anchor
, inlineCodeBlock
, str
, endline
, emph
, strong
, strikeout
, underline
, code
, math
, displayMath
, verbatim
, subscript
, superscript
, inlineLaTeX
, smart
, symbol
] <* (guard =<< newlinesCountWithinLimits)
<?> "inline"
parseInlines :: OrgParser (F Inlines)
parseInlines = trimInlinesF . mconcat <$> many1 inline
-- treat these as potentially non-text when parsing inline:
specialChars :: [Char]
specialChars = "\"$'()*+-,./:<=>[\\]^_{|}~"
whitespace :: OrgParser (F Inlines)
whitespace = pure B.space <$ skipMany1 spaceChar
<* updateLastPreCharPos
<* updateLastForbiddenCharPos
<?> "whitespace"
linebreak :: OrgParser (F Inlines)
linebreak = try $ pure B.linebreak <$ string "\\\\" <* skipSpaces <* newline
str :: OrgParser (F Inlines)
str = return . B.str <$> many1 (noneOf $ specialChars ++ "\n\r ")
<* updateLastStrPos
-- | An endline character that can be treated as a space, not a structural
-- break. This should reflect the values of the Emacs variable
-- @org-element-pagaraph-separate@.
endline :: OrgParser (F Inlines)
endline = try $ do
newline
notFollowedBy blankline
notFollowedBy' exampleLineStart
notFollowedBy' hline
notFollowedBy' noteMarker
notFollowedBy' tableStart
notFollowedBy' drawerStart
notFollowedBy' headerStart
notFollowedBy' metaLineStart
notFollowedBy' latexEnvStart
notFollowedBy' commentLineStart
notFollowedBy' bulletListStart
notFollowedBy' orderedListStart
decEmphasisNewlinesCount
guard =<< newlinesCountWithinLimits
updateLastPreCharPos
return . return $ B.softbreak
cite :: OrgParser (F Inlines)
cite = try $ do
guardEnabled Ext_citations
(cs, raw) <- withRaw normalCite
return $ (flip B.cite (B.text raw)) <$> cs
normalCite :: OrgParser (F [Citation])
normalCite = try $ char '['
*> skipSpaces
*> citeList
<* skipSpaces
<* char ']'
citeList :: OrgParser (F [Citation])
citeList = sequence <$> sepBy1 citation (try $ char ';' *> skipSpaces)
citation :: OrgParser (F Citation)
citation = try $ do
pref <- prefix
(suppress_author, key) <- citeKey
suff <- suffix
return $ do
x <- pref
y <- suff
return $ Citation{ citationId = key
, citationPrefix = B.toList x
, citationSuffix = B.toList y
, citationMode = if suppress_author
then SuppressAuthor
else NormalCitation
, citationNoteNum = 0
, citationHash = 0
}
where
prefix = trimInlinesF . mconcat <$>
manyTill inline (char ']' <|> (']' <$ lookAhead citeKey))
suffix = try $ do
hasSpace <- option False (notFollowedBy nonspaceChar >> return True)
skipSpaces
rest <- trimInlinesF . mconcat <$>
many (notFollowedBy (oneOf ";]") *> inline)
return $ if hasSpace
then (B.space <>) <$> rest
else rest
footnote :: OrgParser (F Inlines)
footnote = try $ inlineNote <|> referencedNote
inlineNote :: OrgParser (F Inlines)
inlineNote = try $ do
string "[fn:"
ref <- many alphaNum
char ':'
note <- fmap B.para . trimInlinesF . mconcat <$> many1Till inline (char ']')
when (not $ null ref) $
addToNotesTable ("fn:" ++ ref, note)
return $ B.note <$> note
referencedNote :: OrgParser (F Inlines)
referencedNote = try $ do
ref <- noteMarker
return $ do
notes <- asksF orgStateNotes'
case lookup ref notes of
Nothing -> return $ B.str $ "[" ++ ref ++ "]"
Just contents -> do
st <- askF
let contents' = runF contents st{ orgStateNotes' = [] }
return $ B.note contents'
linkOrImage :: OrgParser (F Inlines)
linkOrImage = explicitOrImageLink
<|> selflinkOrImage
<|> angleLink
<|> plainLink
<?> "link or image"
explicitOrImageLink :: OrgParser (F Inlines)
explicitOrImageLink = try $ do
char '['
srcF <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget
title <- enclosedRaw (char '[') (char ']')
title' <- parseFromString (mconcat <$> many inline) title
char ']'
return $ do
src <- srcF
if isImageFilename title
then pure $ B.link src "" $ B.image title mempty mempty
else linkToInlinesF src =<< title'
selflinkOrImage :: OrgParser (F Inlines)
selflinkOrImage = try $ do
src <- char '[' *> linkTarget <* char ']'
return $ linkToInlinesF src (B.str src)
plainLink :: OrgParser (F Inlines)
plainLink = try $ do
(orig, src) <- uri
returnF $ B.link src "" (B.str orig)
angleLink :: OrgParser (F Inlines)
angleLink = try $ do
char '<'
link <- plainLink
char '>'
return link
linkTarget :: OrgParser String
linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]")
possiblyEmptyLinkTarget :: OrgParser String
possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]")
applyCustomLinkFormat :: String -> OrgParser (F String)
applyCustomLinkFormat link = do
let (linkType, rest) = break (== ':') link
return $ do
formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters
return $ maybe link ($ drop 1 rest) formatter
-- | Take a link and return a function which produces new inlines when given
-- description inlines.
linkToInlinesF :: String -> Inlines -> F Inlines
linkToInlinesF linkStr =
case linkStr of
"" -> pure . B.link mempty "" -- wiki link (empty by convention)
('#':_) -> pure . B.link linkStr "" -- document-local fraction
_ -> case cleanLinkString linkStr of
(Just cleanedLink) -> if isImageFilename cleanedLink
then const . pure $ B.image cleanedLink "" ""
else pure . B.link cleanedLink ""
Nothing -> internalLink linkStr -- other internal link
-- | Cleanup and canonicalize a string describing a link. Return @Nothing@ if
-- the string does not appear to be a link.
cleanLinkString :: String -> Maybe String
cleanLinkString s =
case s of
'/':_ -> Just $ "file://" ++ s -- absolute path
'.':'/':_ -> Just s -- relative path
'.':'.':'/':_ -> Just s -- relative path
-- Relative path or URL (file schema)
'f':'i':'l':'e':':':s' -> Just $ if ("//" `isPrefixOf` s') then s else s'
_ | isUrl s -> Just s -- URL
_ -> Nothing
where
isUrl :: String -> Bool
isUrl cs =
let (scheme, path) = break (== ':') cs
in all (\c -> isAlphaNum c || c `elem` (".-"::String)) scheme
&& not (null path)
isImageFilename :: String -> Bool
isImageFilename filename =
any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions &&
(any (\x -> (x++":") `isPrefixOf` filename) protocols ||
':' `notElem` filename)
where
imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ]
protocols = [ "file", "http", "https" ]
internalLink :: String -> Inlines -> F Inlines
internalLink link title = do
anchorB <- (link `elem`) <$> asksF orgStateAnchorIds
if anchorB
then return $ B.link ('#':link) "" title
else return $ B.emph title
-- | Parse an anchor like @<<anchor-id>>@ and return an empty span with
-- @anchor-id@ set as id. Legal anchors in org-mode are defined through
-- @org-target-regexp@, which is fairly liberal. Since no link is created if
-- @anchor-id@ contains spaces, we are more restrictive in what is accepted as
-- an anchor.
anchor :: OrgParser (F Inlines)
anchor = try $ do
anchorId <- parseAnchor
recordAnchorId anchorId
returnF $ B.spanWith (solidify anchorId, [], []) mempty
where
parseAnchor = string "<<"
*> many1 (noneOf "\t\n\r<>\"' ")
<* string ">>"
<* skipSpaces
-- | Replace every char but [a-zA-Z0-9_.-:] with a hypen '-'. This mirrors
-- the org function @org-export-solidify-link-text@.
solidify :: String -> String
solidify = map replaceSpecialChar
where replaceSpecialChar c
| isAlphaNum c = c
| c `elem` ("_.-:" :: String) = c
| otherwise = '-'
-- | Parses an inline code block and marks it as an babel block.
inlineCodeBlock :: OrgParser (F Inlines)
inlineCodeBlock = try $ do
string "src_"
lang <- many1 orgArgWordChar
opts <- option [] $ enclosedByPair '[' ']' inlineBlockOption
inlineCode <- enclosedByPair '{' '}' (noneOf "\n\r")
let attrClasses = [translateLang lang, rundocBlockClass]
let attrKeyVal = map toRundocAttrib (("language", lang) : opts)
returnF $ B.codeWith ("", attrClasses, attrKeyVal) inlineCode
where
inlineBlockOption :: OrgParser (String, String)
inlineBlockOption = try $ do
argKey <- orgArgKey
paramValue <- option "yes" orgInlineParamValue
return (argKey, paramValue)
orgInlineParamValue :: OrgParser String
orgInlineParamValue = try $
skipSpaces
*> notFollowedBy (char ':')
*> many1 (noneOf "\t\n\r ]")
<* skipSpaces
enclosedByPair :: Char -- ^ opening char
-> Char -- ^ closing char
-> OrgParser a -- ^ parser
-> OrgParser [a]
enclosedByPair s e p = char s *> many1Till p (char e)
emph :: OrgParser (F Inlines)
emph = fmap B.emph <$> emphasisBetween '/'
strong :: OrgParser (F Inlines)
strong = fmap B.strong <$> emphasisBetween '*'
strikeout :: OrgParser (F Inlines)
strikeout = fmap B.strikeout <$> emphasisBetween '+'
-- There is no underline, so we use strong instead.
underline :: OrgParser (F Inlines)
underline = fmap B.strong <$> emphasisBetween '_'
verbatim :: OrgParser (F Inlines)
verbatim = return . B.code <$> verbatimBetween '='
code :: OrgParser (F Inlines)
code = return . B.code <$> verbatimBetween '~'
subscript :: OrgParser (F Inlines)
subscript = fmap B.subscript <$> try (char '_' *> subOrSuperExpr)
superscript :: OrgParser (F Inlines)
superscript = fmap B.superscript <$> try (char '^' *> subOrSuperExpr)
math :: OrgParser (F Inlines)
math = return . B.math <$> choice [ math1CharBetween '$'
, mathStringBetween '$'
, rawMathBetween "\\(" "\\)"
]
displayMath :: OrgParser (F Inlines)
displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]"
, rawMathBetween "$$" "$$"
]
updatePositions :: Char
-> OrgParser (Char)
updatePositions c = do
when (c `elem` emphasisPreChars) updateLastPreCharPos
when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos
return c
symbol :: OrgParser (F Inlines)
symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions)
emphasisBetween :: Char
-> OrgParser (F Inlines)
emphasisBetween c = try $ do
startEmphasisNewlinesCounting emphasisAllowedNewlines
res <- enclosedInlines (emphasisStart c) (emphasisEnd c)
isTopLevelEmphasis <- null . orgStateEmphasisCharStack <$> getState
when isTopLevelEmphasis
resetEmphasisNewlines
return res
verbatimBetween :: Char
-> OrgParser String
verbatimBetween c = try $
emphasisStart c *>
many1TillNOrLessNewlines 1 (noneOf "\n\r") (emphasisEnd c)
-- | Parses a raw string delimited by @c@ using Org's math rules
mathStringBetween :: Char
-> OrgParser String
mathStringBetween c = try $ do
mathStart c
body <- many1TillNOrLessNewlines mathAllowedNewlines
(noneOf (c:"\n\r"))
(lookAhead $ mathEnd c)
final <- mathEnd c
return $ body ++ [final]
-- | Parse a single character between @c@ using math rules
math1CharBetween :: Char
-> OrgParser String
math1CharBetween c = try $ do
char c
res <- noneOf $ c:mathForbiddenBorderChars
char c
eof <|> () <$ lookAhead (oneOf mathPostChars)
return [res]
rawMathBetween :: String
-> String
-> OrgParser String
rawMathBetween s e = try $ string s *> manyTill anyChar (try $ string e)
-- | Parses the start (opening character) of emphasis
emphasisStart :: Char -> OrgParser Char
emphasisStart c = try $ do
guard =<< afterEmphasisPreChar
guard =<< notAfterString
char c
lookAhead (noneOf emphasisForbiddenBorderChars)
pushToInlineCharStack c
return c
-- | Parses the closing character of emphasis
emphasisEnd :: Char -> OrgParser Char
emphasisEnd c = try $ do
guard =<< notAfterForbiddenBorderChar
char c
eof <|> () <$ lookAhead acceptablePostChars
updateLastStrPos
popInlineCharStack
return c
where acceptablePostChars =
surroundingEmphasisChar >>= \x -> oneOf (x ++ emphasisPostChars)
mathStart :: Char -> OrgParser Char
mathStart c = try $
char c <* notFollowedBy' (oneOf (c:mathForbiddenBorderChars))
mathEnd :: Char -> OrgParser Char
mathEnd c = try $ do
res <- noneOf (c:mathForbiddenBorderChars)
char c
eof <|> () <$ lookAhead (oneOf mathPostChars)
return res
enclosedInlines :: OrgParser a
-> OrgParser b
-> OrgParser (F Inlines)
enclosedInlines start end = try $
trimInlinesF . mconcat <$> enclosed start end inline
enclosedRaw :: OrgParser a
-> OrgParser b
-> OrgParser String
enclosedRaw start end = try $
start *> (onSingleLine <|> spanningTwoLines)
where onSingleLine = try $ many1Till (noneOf "\n\r") end
spanningTwoLines = try $
anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine
-- | Like many1Till, but parses at most @n+1@ lines. @p@ must not consume
-- newlines.
many1TillNOrLessNewlines :: Int
-> OrgParser Char
-> OrgParser a
-> OrgParser String
many1TillNOrLessNewlines n p end = try $
nMoreLines (Just n) mempty >>= oneOrMore
where
nMoreLines Nothing cs = return cs
nMoreLines (Just 0) cs = try $ (cs ++) <$> finalLine
nMoreLines k cs = try $ (final k cs <|> rest k cs)
>>= uncurry nMoreLines
final _ cs = (\x -> (Nothing, cs ++ x)) <$> try finalLine
rest m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> try (manyTill p newline)
finalLine = try $ manyTill p end
minus1 k = k - 1
oneOrMore cs = guard (not $ null cs) *> return cs
-- Org allows customization of the way it reads emphasis. We use the defaults
-- here (see, e.g., the Emacs Lisp variable `org-emphasis-regexp-components`
-- for details).
-- | Chars allowed to occur before emphasis (spaces and newlines are ok, too)
emphasisPreChars :: [Char]
emphasisPreChars = "\t \"'({"
-- | Chars allowed at after emphasis
emphasisPostChars :: [Char]
emphasisPostChars = "\t\n !\"'),-.:;?\\}"
-- | Chars not allowed at the (inner) border of emphasis
emphasisForbiddenBorderChars :: [Char]
emphasisForbiddenBorderChars = "\t\n\r \"',"
-- | The maximum number of newlines within
emphasisAllowedNewlines :: Int
emphasisAllowedNewlines = 1
-- LaTeX-style math: see `org-latex-regexps` for details
-- | Chars allowed after an inline ($...$) math statement
mathPostChars :: [Char]
mathPostChars = "\t\n \"'),-.:;?"
-- | Chars not allowed at the (inner) border of math
mathForbiddenBorderChars :: [Char]
mathForbiddenBorderChars = "\t\n\r ,;.$"
-- | Maximum number of newlines in an inline math statement
mathAllowedNewlines :: Int
mathAllowedNewlines = 2
-- | Whether we are right behind a char allowed before emphasis
afterEmphasisPreChar :: OrgParser Bool
afterEmphasisPreChar = do
pos <- getPosition
lastPrePos <- orgStateLastPreCharPos <$> getState
return . fromMaybe True $ (== pos) <$> lastPrePos
-- | Whether the parser is right after a forbidden border char
notAfterForbiddenBorderChar :: OrgParser Bool
notAfterForbiddenBorderChar = do
pos <- getPosition
lastFBCPos <- orgStateLastForbiddenCharPos <$> getState
return $ lastFBCPos /= Just pos
-- | Read a sub- or superscript expression
subOrSuperExpr :: OrgParser (F Inlines)
subOrSuperExpr = try $
choice [ id <$> charsInBalanced '{' '}' (noneOf "\n\r")
, enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r")
, simpleSubOrSuperString
] >>= parseFromString (mconcat <$> many inline)
where enclosing (left, right) s = left : s ++ [right]
simpleSubOrSuperString :: OrgParser String
simpleSubOrSuperString = try $ do
state <- getState
guard . exportSubSuperscripts . orgStateExportSettings $ state
choice [ string "*"
, mappend <$> option [] ((:[]) <$> oneOf "+-")
<*> many1 alphaNum
]
inlineLaTeX :: OrgParser (F Inlines)
inlineLaTeX = try $ do
cmd <- inlineLaTeXCommand
maybe mzero returnF $
parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` parseAsInlineLaTeX cmd
where
parseAsMath :: String -> Maybe Inlines
parseAsMath cs = B.fromList <$> texMathToPandoc cs
parseAsInlineLaTeX :: String -> Maybe Inlines
parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs
parseAsMathMLSym :: String -> Maybe Inlines
parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs)
-- drop initial backslash and any trailing "{}"
where clean = dropWhileEnd (`elem` ("{}" :: String)) . drop 1
state :: ParserState
state = def{ stateOptions = def{ readerParseRaw = True }}
texMathToPandoc :: String -> Maybe [Inline]
texMathToPandoc cs = (maybeRight $ readTeX cs) >>= writePandoc DisplayInline
maybeRight :: Either a b -> Maybe b
maybeRight = either (const Nothing) Just
inlineLaTeXCommand :: OrgParser String
inlineLaTeXCommand = try $ do
rest <- getInput
case runParser rawLaTeXInline def "source" rest of
Right (RawInline _ cs) -> do
-- drop any trailing whitespace, those are not be part of the command as
-- far as org mode is concerned.
let cmdNoSpc = dropWhileEnd isSpace cs
let len = length cmdNoSpc
count len anyChar
return cmdNoSpc
_ -> mzero
-- Taken from Data.OldList.
dropWhileEnd :: (a -> Bool) -> [a] -> [a]
dropWhileEnd p = foldr (\x xs -> if p x && null xs then [] else x : xs) []
smart :: OrgParser (F Inlines)
smart = do
getOption readerSmart >>= guard
doubleQuoted <|> singleQuoted <|>
choice (map (return <$>) [orgApostrophe, orgDash, orgEllipses])
where
orgDash = dash <* updatePositions '-'
orgEllipses = ellipses <* updatePositions '.'
orgApostrophe =
(char '\'' <|> char '\8217') <* updateLastPreCharPos
<* updateLastForbiddenCharPos
*> return (B.str "\x2019")
singleQuoted :: OrgParser (F Inlines)
singleQuoted = try $ do
singleQuoteStart
updatePositions '\''
withQuoteContext InSingleQuote $
fmap B.singleQuoted . trimInlinesF . mconcat <$>
many1Till inline (singleQuoteEnd <* updatePositions '\'')
-- doubleQuoted will handle regular double-quoted sections, as well
-- as dialogues with an open double-quote without a close double-quote
-- in the same paragraph.
doubleQuoted :: OrgParser (F Inlines)
doubleQuoted = try $ do
doubleQuoteStart
updatePositions '"'
contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline)
(withQuoteContext InDoubleQuote $ (doubleQuoteEnd <* updateLastForbiddenCharPos) >> return
(fmap B.doubleQuoted . trimInlinesF $ contents))
<|> (return $ return (B.str "\8220") <> contents)

View file

@ -34,10 +34,14 @@ module Text.Pandoc.Readers.Org.Parsing
, blanklines
, newline
, parseFromString
, skipSpaces1
, inList
, withContext
, updateLastForbiddenCharPos
, updateLastPreCharPos
, orgArgKey
, orgArgWord
, orgArgWordChar
-- * Re-exports from Text.Pandoc.Parser
, ParserContext (..)
, many1Till
@ -133,6 +137,10 @@ parseFromString parser str' = do
updateState $ \s -> s{ orgStateLastPreCharPos = oldLastPreCharPos }
return result
-- | Skip one or more tab or space characters.
skipSpaces1 :: OrgParser ()
skipSpaces1 = skipMany1 spaceChar
-- | Like @Text.Parsec.Char.newline@, but causes additional state changes.
newline :: OrgParser Char
newline =
@ -180,3 +188,14 @@ updateLastForbiddenCharPos = getPosition >>= \p ->
updateLastPreCharPos :: OrgParser ()
updateLastPreCharPos = getPosition >>= \p ->
updateState $ \s -> s{ orgStateLastPreCharPos = Just p}
orgArgKey :: OrgParser String
orgArgKey = try $
skipSpaces *> char ':'
*> many1 orgArgWordChar
orgArgWord :: OrgParser String
orgArgWord = many1 orgArgWordChar
orgArgWordChar :: OrgParser Char
orgArgWordChar = alphaNum <|> oneOf "-_"