Merge pull request #1226 from tarleb/org-emphasis-reader

Org reader: Precise rules for the recognition of markup
This commit is contained in:
John MacFarlane 2014-04-09 09:34:44 -07:00
commit 54e33a132b
2 changed files with 284 additions and 123 deletions

View file

@ -32,11 +32,12 @@ import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>), HasMeta(..)) import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>), HasMeta(..))
import Text.Pandoc.Definition import Text.Pandoc.Definition
import Text.Pandoc.Options import Text.Pandoc.Options
import Text.Pandoc.Parsing hiding (orderedListMarker, updateLastStrPos) import qualified Text.Pandoc.Parsing as P
import Text.Pandoc.Parsing hiding (newline, orderedListMarker, updateLastStrPos)
import Text.Pandoc.Shared (compactify') import Text.Pandoc.Shared (compactify')
import Control.Applicative (pure, (<$>), (<$), (<*>), (<*), (*>), (<**>)) import Control.Applicative (pure, (<$>), (<$), (<*>), (<*), (*>), (<**>))
import Control.Monad (guard, mzero) import Control.Monad (guard, when)
import Data.Char (toLower) import Data.Char (toLower)
import Data.Default import Data.Default
import Data.List (foldl', isPrefixOf, isSuffixOf) import Data.List (foldl', isPrefixOf, isSuffixOf)
@ -47,49 +48,100 @@ import Data.Monoid (mconcat, mempty, mappend)
readOrg :: ReaderOptions -- ^ Reader options readOrg :: ReaderOptions -- ^ Reader options
-> String -- ^ String to parse (assuming @'\n'@ line endings) -> String -- ^ String to parse (assuming @'\n'@ line endings)
-> Pandoc -> Pandoc
readOrg opts s = (readWith parseOrg) def{ orgOptions = opts } (s ++ "\n\n") readOrg opts s = (readWith parseOrg) def{ orgStateOptions = opts } (s ++ "\n\n")
type OrgParser = Parser [Char] OrgParserState type OrgParser = Parser [Char] OrgParserState
parseOrg:: OrgParser Pandoc
parseOrg = do
blocks' <- B.toList <$> parseBlocks
st <- getState
let meta = orgStateMeta st
return $ Pandoc meta $ filter (/= Null) blocks'
--
-- Parser State for Org
--
-- | Org-mode parser state -- | Org-mode parser state
data OrgParserState = OrgParserState data OrgParserState = OrgParserState
{ orgOptions :: ReaderOptions { orgStateOptions :: ReaderOptions
, orgInlineCharStack :: [Char] , orgStateEmphasisCharStack :: [Char]
, orgLastStrPos :: Maybe SourcePos , orgStateEmphasisNewlines :: Maybe Int
, orgMeta :: Meta , orgStateLastForbiddenCharPos :: Maybe SourcePos
, orgStateLastPreCharPos :: Maybe SourcePos
, orgStateLastStrPos :: Maybe SourcePos
, orgStateMeta :: Meta
} deriving (Show) } deriving (Show)
instance HasReaderOptions OrgParserState where instance HasReaderOptions OrgParserState where
extractReaderOptions = orgOptions extractReaderOptions = orgStateOptions
instance HasMeta OrgParserState where instance HasMeta OrgParserState where
setMeta field val st = setMeta field val st =
st{ orgMeta = setMeta field val $ orgMeta st } st{ orgStateMeta = setMeta field val $ orgStateMeta st }
deleteMeta field st = deleteMeta field st =
st{ orgMeta = deleteMeta field $ orgMeta st } st{ orgStateMeta = deleteMeta field $ orgStateMeta st }
instance Default OrgParserState where instance Default OrgParserState where
def = defaultOrgParserState def = defaultOrgParserState
defaultOrgParserState :: OrgParserState defaultOrgParserState :: OrgParserState
defaultOrgParserState = OrgParserState defaultOrgParserState = OrgParserState
{ orgOptions = def { orgStateOptions = def
, orgInlineCharStack = [] , orgStateEmphasisCharStack = []
, orgLastStrPos = Nothing , orgStateEmphasisNewlines = Nothing
, orgMeta = nullMeta , orgStateLastForbiddenCharPos = Nothing
, orgStateLastPreCharPos = Nothing
, orgStateLastStrPos = Nothing
, orgStateMeta = nullMeta
} }
updateLastStrPos :: OrgParser () updateLastStrPos :: OrgParser ()
updateLastStrPos = getPosition >>= \p -> updateLastStrPos = getPosition >>= \p ->
updateState $ \s -> s{ orgLastStrPos = Just p } updateState $ \s -> s{ orgStateLastStrPos = Just p }
updateLastForbiddenCharPos :: OrgParser ()
updateLastForbiddenCharPos = getPosition >>= \p ->
updateState $ \s -> s{ orgStateLastForbiddenCharPos = Just p}
parseOrg:: OrgParser Pandoc updateLastPreCharPos :: OrgParser ()
parseOrg = do updateLastPreCharPos = getPosition >>= \p ->
blocks' <- B.toList <$> parseBlocks updateState $ \s -> s{ orgStateLastPreCharPos = Just p}
pushToInlineCharStack :: Char -> OrgParser ()
pushToInlineCharStack c = updateState $ \st ->
st { orgStateEmphasisCharStack = c:(orgStateEmphasisCharStack st) }
popInlineCharStack :: OrgParser ()
popInlineCharStack = updateState $ \st ->
st { orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ st }
surroundingEmphasisChar :: OrgParser [Char]
surroundingEmphasisChar = take 1 . drop 1 . orgStateEmphasisCharStack <$> getState
startEmphasisNewlinesCounting :: Int -> OrgParser ()
startEmphasisNewlinesCounting maxNewlines = updateState $ \s ->
s { orgStateEmphasisNewlines = Just maxNewlines }
decEmphasisNewlinesCount :: OrgParser ()
decEmphasisNewlinesCount = updateState $ \s ->
s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s }
newlinesCountWithinLimits :: OrgParser Bool
newlinesCountWithinLimits = do
st <- getState st <- getState
let meta = orgMeta st return $ ((< 0) <$> orgStateEmphasisNewlines st) /= Just True
return $ Pandoc meta $ filter (/= Null) blocks'
resetEmphasisNewlines :: OrgParser ()
resetEmphasisNewlines = updateState $ \s ->
s{ orgStateEmphasisNewlines = Nothing }
newline :: OrgParser Char
newline =
P.newline
<* updateLastPreCharPos
<* updateLastForbiddenCharPos
-- --
-- parsing blocks -- parsing blocks
@ -218,7 +270,7 @@ commentLineStart = try $ mappend <$> many spaceChar <*> string "# "
declarationLine :: OrgParser Blocks declarationLine :: OrgParser Blocks
declarationLine = try $ do declarationLine = try $ do
meta' <- B.setMeta <$> metaKey <*> metaValue <*> pure nullMeta meta' <- B.setMeta <$> metaKey <*> metaValue <*> pure nullMeta
updateState $ \st -> st { orgMeta = orgMeta st <> meta' } updateState $ \st -> st { orgStateMeta = orgStateMeta st <> meta' }
return mempty return mempty
metaValue :: OrgParser MetaValue metaValue :: OrgParser MetaValue
@ -449,22 +501,24 @@ anyLineNewline = (++ "\n") <$> anyLine
-- --
inline :: OrgParser Inlines inline :: OrgParser Inlines
inline = choice inlineParsers <?> "inline" inline =
where inlineParsers = [ whitespace choice [ whitespace
, link , link
, str , str
, endline , endline
, emph , emph
, strong , strong
, strikeout , strikeout
, underline , underline
, code , code
, math , math
, verbatim , verbatim
, subscript , subscript
, superscript , superscript
, symbol , symbol
] ] <* (guard =<< newlinesCountWithinLimits)
<?> "inline"
-- treat these as potentially non-text when parsing inline: -- treat these as potentially non-text when parsing inline:
specialChars :: [Char] specialChars :: [Char]
@ -472,7 +526,10 @@ specialChars = "\"$'()*+-./:<=>[\\]^_{|}~"
whitespace :: OrgParser Inlines whitespace :: OrgParser Inlines
whitespace = B.space <$ skipMany1 spaceChar <?> "whitespace" whitespace = B.space <$ skipMany1 spaceChar
<* updateLastPreCharPos
<* updateLastForbiddenCharPos
<?> "whitespace"
str :: OrgParser Inlines str :: OrgParser Inlines
str = B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") str = B.str <$> many1 (noneOf $ specialChars ++ "\n\r ")
@ -492,6 +549,9 @@ endline = try $ do
notFollowedBy' commentLineStart notFollowedBy' commentLineStart
notFollowedBy' bulletListStart notFollowedBy' bulletListStart
notFollowedBy' orderedListStart notFollowedBy' orderedListStart
decEmphasisNewlinesCount
guard =<< newlinesCountWithinLimits
updateLastPreCharPos
return B.space return B.space
link :: OrgParser Inlines link :: OrgParser Inlines
@ -500,42 +560,54 @@ link = explicitOrImageLink <|> selflinkOrImage <?> "link"
explicitOrImageLink :: OrgParser Inlines explicitOrImageLink :: OrgParser Inlines
explicitOrImageLink = try $ do explicitOrImageLink = try $ do
char '[' char '['
src <- enclosedRaw (char '[') (char ']') src <- linkTarget
title <- enclosedRaw (char '[') (char ']') title <- enclosedRaw (char '[') (char ']')
title' <- parseFromString (mconcat <$> many inline) title title' <- parseFromString (mconcat <$> many inline) title
char ']' char ']'
return $ if (isImage src) && (isImage title) return $ if (isImageFilename src) && (isImageFilename title)
then B.link src "" (B.image title "" "") then B.link src "" (B.image title "" "")
else B.link src "" title' else B.link src "" title'
selflinkOrImage :: OrgParser Inlines selflinkOrImage :: OrgParser Inlines
selflinkOrImage = try $ do selflinkOrImage = try $ do
src <- enclosedRaw (string "[[") (string "]]") src <- (char '[') *> linkTarget <* char ']'
return $ if isImage src return $ if isImageFilename src
then B.image src "" "" then B.image src "" ""
else B.link src "" (B.str src) else B.link src "" (B.str src)
linkTarget :: OrgParser String
linkTarget = enclosed (char '[') (char ']') (noneOf "\n\r]")
isImageFilename :: String -> Bool
isImageFilename filename =
any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions &&
any (\x -> (x++":") `isPrefixOf` filename) protocols ||
':' `notElem` filename
where
imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ]
protocols = [ "file", "http", "https" ]
emph :: OrgParser Inlines emph :: OrgParser Inlines
emph = B.emph <$> inlinesEnclosedBy '/' emph = B.emph <$> emphasisBetween '/'
strong :: OrgParser Inlines strong :: OrgParser Inlines
strong = B.strong <$> inlinesEnclosedBy '*' strong = B.strong <$> emphasisBetween '*'
strikeout :: OrgParser Inlines strikeout :: OrgParser Inlines
strikeout = B.strikeout <$> inlinesEnclosedBy '+' strikeout = B.strikeout <$> emphasisBetween '+'
-- There is no underline, so we use strong instead. -- There is no underline, so we use strong instead.
underline :: OrgParser Inlines underline :: OrgParser Inlines
underline = B.strong <$> inlinesEnclosedBy '_' underline = B.strong <$> emphasisBetween '_'
code :: OrgParser Inlines code :: OrgParser Inlines
code = B.code <$> rawEnclosedBy '=' code = B.code <$> verbatimBetween '='
math :: OrgParser Inlines
math = B.math <$> rawEnclosedBy '$'
verbatim :: OrgParser Inlines verbatim :: OrgParser Inlines
verbatim = B.rawInline "" <$> rawEnclosedBy '~' verbatim = B.rawInline "" <$> verbatimBetween '~'
math :: OrgParser Inlines
math = B.math <$> mathStringBetween '$'
subscript :: OrgParser Inlines subscript :: OrgParser Inlines
subscript = B.subscript <$> (try $ char '_' *> maybeGroupedByBraces) subscript = B.subscript <$> (try $ char '_' *> maybeGroupedByBraces)
@ -550,7 +622,72 @@ maybeGroupedByBraces = try $
] ]
symbol :: OrgParser Inlines symbol :: OrgParser Inlines
symbol = B.str . (: "") <$> oneOf specialChars symbol = B.str . (: "") <$> (oneOf specialChars >>= updatePositions)
where updatePositions c
| c `elem` emphasisPreChars = c <$ updateLastPreCharPos
| c `elem` emphasisForbiddenBorderChars = c <$ updateLastForbiddenCharPos
| otherwise = return c
emphasisBetween :: Char
-> OrgParser Inlines
emphasisBetween c = try $ do
startEmphasisNewlinesCounting emphasisAllowedNewlines
res <- enclosedInlines (emphasisStart c) (emphasisEnd c)
isTopLevelEmphasis <- null . orgStateEmphasisCharStack <$> getState
when isTopLevelEmphasis
resetEmphasisNewlines
return res
verbatimBetween :: Char
-> OrgParser String
verbatimBetween c = try $
emphasisStart c *>
many1TillNOrLessNewlines 1 (noneOf "\n\r") (emphasisEnd c)
-- | Parses a raw string delimited by @c@ using Org's math rules
mathStringBetween :: Char
-> OrgParser String
mathStringBetween c = try $ do
mathStart c
body <- many1TillNOrLessNewlines mathAllowedNewlines
(noneOf (c:"\n\r"))
(lookAhead $ mathEnd c)
final <- mathEnd c
return $ body ++ [final]
-- | Parses the start (opening character) of emphasis
emphasisStart :: Char -> OrgParser Char
emphasisStart c = try $ do
guard =<< afterEmphasisPreChar
guard =<< notAfterString
char c
lookAhead (noneOf emphasisForbiddenBorderChars)
pushToInlineCharStack c
return c
-- | Parses the closing character of emphasis
emphasisEnd :: Char -> OrgParser Char
emphasisEnd c = try $ do
guard =<< notAfterForbiddenBorderChar
char c
eof <|> lookAhead (surroundingEmphasisChar >>= \x ->
oneOf (x ++ emphasisPostChars))
*> return ()
updateLastStrPos
popInlineCharStack
return c
mathStart :: Char -> OrgParser Char
mathStart c = try $ do
char c <* notFollowedBy' (oneOf (c:mathForbiddenBorderChars))
mathEnd :: Char -> OrgParser Char
mathEnd c = try $ do
res <- noneOf (c:mathForbiddenBorderChars)
char c
eof <|> (lookAhead $ oneOf mathPostChars *> pure ())
return res
enclosedInlines :: OrgParser a enclosedInlines :: OrgParser a
-> OrgParser b -> OrgParser b
@ -558,16 +695,6 @@ enclosedInlines :: OrgParser a
enclosedInlines start end = try $ enclosedInlines start end = try $
trimInlines . mconcat <$> enclosed start end inline trimInlines . mconcat <$> enclosed start end inline
-- FIXME: This is a hack
inlinesEnclosedBy :: Char
-> OrgParser Inlines
inlinesEnclosedBy c = try $ do
updateState $ \st -> st { orgInlineCharStack = c:(orgInlineCharStack st) }
res <- enclosedInlines (atStart (char c) <* endsOnThisOrNextLine c)
(atEnd $ char c)
updateState $ \st -> st { orgInlineCharStack = drop 1 . orgInlineCharStack $ st }
return res
enclosedRaw :: OrgParser a enclosedRaw :: OrgParser a
-> OrgParser b -> OrgParser b
-> OrgParser String -> OrgParser String
@ -577,63 +704,76 @@ enclosedRaw start end = try $
spanningTwoLines = try $ spanningTwoLines = try $
anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine
rawEnclosedBy :: Char -- | Like many1Till, but parses at most @n+1@ lines. @p@ must not consume
-> OrgParser String -- newlines.
rawEnclosedBy c = enclosedRaw (atStart $ char c) (atEnd $ char c) many1TillNOrLessNewlines :: Int
-> OrgParser Char
-- succeeds only if we're not right after a str (ie. in middle of word) -> OrgParser a
atStart :: OrgParser a -> OrgParser a -> OrgParser String
atStart p = do many1TillNOrLessNewlines n p end = try $
guard =<< not <$> isRightAfterString nMoreLines (Just n) mempty >>= oneOrMore
p
-- | succeeds only if we're at the end of a word
atEnd :: OrgParser a -> OrgParser a
atEnd p = try $ do
p <* lookingAtEndOfWord
where lookingAtEndOfWord =
eof <|> const (return ()) =<< lookAhead . oneOf =<< postWordChars
isRightAfterString :: OrgParser Bool
isRightAfterString = do
pos <- getPosition
st <- getState
-- the position `Nothing` isn't after a String, either, hence the double
-- negation
return $ not $ orgLastStrPos st /= Just pos
postWordChars :: OrgParser [Char]
postWordChars = do
st <- getState
return $ "\t\n\r !\"'),-.:?}" ++ (take 1 . drop 1 . orgInlineCharStack $ st)
-- FIXME: These functions are hacks and should be replaced
endsOnThisOrNextLine :: Char
-> OrgParser ()
endsOnThisOrNextLine c = do
inp <- getInput
let doOtherwise = \rest -> endsOnThisLine rest c (const mzero)
endsOnThisLine inp c doOtherwise
endsOnThisLine :: [Char]
-> Char
-> ([Char] -> OrgParser ())
-> OrgParser ()
endsOnThisLine input c doOnOtherLines = do
postWordChars' <- postWordChars
case break (`elem` c:"\n") input of
(_,'\n':rest) -> doOnOtherLines rest
(_,_:[]) -> return ()
(_,_:rest@(n:_)) -> if n `elem` postWordChars'
then return ()
else endsOnThisLine rest c doOnOtherLines
_ -> mzero
isImage :: String -> Bool
isImage filename =
any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions &&
any (\x -> (x++":") `isPrefixOf` filename) protocols ||
':' `notElem` filename
where where
imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ] nMoreLines Nothing cs = return cs
protocols = [ "file", "http", "https" ] nMoreLines (Just 0) cs = try $ (cs ++) <$> finalLine
nMoreLines k cs = try $ (final k cs <|> rest k cs)
>>= uncurry nMoreLines
final _ cs = (\x -> (Nothing, cs ++ x)) <$> (try finalLine)
rest m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> (try $ manyTill p P.newline)
finalLine = try $ manyTill p end
minus1 k = k - 1
oneOrMore cs = guard (not $ null cs) *> return cs
-- Org allows customization of the way it reads emphasis. We use the defaults
-- here (see, e.g., the Emacs Lisp variable `org-emphasis-regexp-components`
-- for details).
-- | Chars allowed to occur before emphasis (spaces and newlines are ok, too)
emphasisPreChars :: [Char]
emphasisPreChars = "\t \"'({"
-- | Chars allowed at after emphasis
emphasisPostChars :: [Char]
emphasisPostChars = "\t\n !\"'),-.:;?\\}"
-- | Chars not allowed at the (inner) border of emphasis
emphasisForbiddenBorderChars :: [Char]
emphasisForbiddenBorderChars = "\t\n\r \"',"
-- | The maximum number of newlines within
emphasisAllowedNewlines :: Int
emphasisAllowedNewlines = 1
-- LaTeX-style math: see `org-latex-regexps` for details
-- | Chars allowed after an inline ($...$) math statement
mathPostChars :: [Char]
mathPostChars = "\t\n \"',-.:;?"
-- | Chars not allowed at the (inner) border of math
mathForbiddenBorderChars :: [Char]
mathForbiddenBorderChars = "\t\n\r ,;.$"
-- | Maximum number of newlines in an inline math statement
mathAllowedNewlines :: Int
mathAllowedNewlines = 2
-- | Whether we are right behind a char allowed before emphasis
afterEmphasisPreChar :: OrgParser Bool
afterEmphasisPreChar = do
pos <- getPosition
lastPrePos <- orgStateLastPreCharPos <$> getState
return $ lastPrePos == Nothing || lastPrePos == Just pos
-- | Whether we are right after the end of a string
notAfterString :: OrgParser Bool
notAfterString = do
pos <- getPosition
lastStrPos <- orgStateLastStrPos <$> getState
return $ lastStrPos /= Just pos
-- | Whether the parser is right after a forbidden border char
notAfterForbiddenBorderChar :: OrgParser Bool
notAfterForbiddenBorderChar = do
pos <- getPosition
lastFBCPos <- orgStateLastForbiddenCharPos <$> getState
return $ lastFBCPos /= Just pos

View file

@ -86,16 +86,37 @@ tests =
unlines [ "this+that+ +so+on" unlines [ "this+that+ +so+on"
, "seven*eight* nine*" , "seven*eight* nine*"
, "+not+funny+" , "+not+funny+"
, "this == self"
] =?> ] =?>
para (spcSep [ "this+that+", "+so+on" para (spcSep [ "this+that+", "+so+on"
, "seven*eight*", "nine*" , "seven*eight*", "nine*"
, strikeout "not+funny" , strikeout "not+funny"
, "this" <> space <> "==" <> space <> "self"
]) ])
, "Adherence to Org's rules for markup borders" =:
"/t/& a/ / ./r/ (*l*) /e/! /b/." =?>
para (spcSep [ emph $ "t/&" <> space <> "a"
, "/"
, "./r/"
, "(" <> (strong "l") <> ")"
, (emph "e") <> "!"
, (emph "b") <> "."
])
, "Inline math must stay within three lines" =:
unlines [ "$a", "b", "c$", "$d", "e", "f", "g$" ] =?>
para ((math "a\nb\nc") <> space <>
spcSep [ "$d", "e", "f", "g$" ])
, "Markup may not span more than two lines" =: , "Markup may not span more than two lines" =:
unlines [ "/this *is", "not*", "emph/" ] =?> unlines [ "/this *is +totally", "nice+ not*", "emph/" ] =?>
para (spcSep [ "/this" para (spcSep [ "/this"
, (strong ("is" <> space <> "not")) , (strong (spcSep
[ "is"
, (strikeout ("totally" <> space <> "nice"))
, "not"
]))
, "emph/" ]) , "emph/" ])
, "Image" =: , "Image" =: