From 9df589b9c5a4f2dcb19445239dfae41b54625330 Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Wed, 14 May 2014 14:45:37 +0200 Subject: [PATCH 1/3] Introduce class HasLastStrPosition, generalize functions Both `ParserState` and `OrgParserState` keep track of the parser position at which the last string ended. This patch introduces a new class `HasLastStrPosition` and makes the above types instances of that class. This enables the generalization of functions updating the state or checking if one is right after a string. --- src/Text/Pandoc/Parsing.hs | 32 +++++++++++++++++++++-------- src/Text/Pandoc/Readers/Markdown.hs | 11 +++------- src/Text/Pandoc/Readers/Org.hs | 11 ++++------ 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index d1e55cbc4..344f6c7ba 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -54,7 +54,6 @@ module Text.Pandoc.Parsing ( (>>~), withRaw, escaped, characterReference, - updateLastStrPos, anyOrderedListMarker, orderedListMarker, charRef, @@ -66,11 +65,14 @@ module Text.Pandoc.Parsing ( (>>~), testStringWith, guardEnabled, guardDisabled, + updateLastStrPos, + notAfterString, ParserState (..), HasReaderOptions (..), HasHeaderMap (..), HasIdentifierList (..), HasMacros (..), + HasLastStrPosition (..), defaultParserState, HeaderType (..), ParserContext (..), @@ -904,6 +906,14 @@ instance HasMacros ParserState where extractMacros = stateMacros updateMacros f st = st{ stateMacros = f $ stateMacros st } +class HasLastStrPosition st where + setLastStrPos :: SourcePos -> st -> st + getLastStrPos :: st -> Maybe SourcePos + +instance HasLastStrPosition ParserState where + setLastStrPos pos st = st{ stateLastStrPos = Just pos } + getLastStrPos st = stateLastStrPos st + defaultParserState :: ParserState defaultParserState = ParserState { stateOptions = def, @@ -938,6 +948,17 @@ guardEnabled ext = getOption readerExtensions >>= guard . Set.member ext guardDisabled :: HasReaderOptions st => Extension -> Parser s st () guardDisabled ext = getOption readerExtensions >>= guard . not . Set.member ext +-- | Update the position on which the last string ended. +updateLastStrPos :: HasLastStrPosition st => Parser s st () +updateLastStrPos = getPosition >>= updateState . setLastStrPos + +-- | Whether we are right after the end of a string. +notAfterString :: HasLastStrPosition st => Parser s st Bool +notAfterString = do + pos <- getPosition + st <- getState + return $ getLastStrPos st /= Just pos + data HeaderType = SingleHeader Char -- ^ Single line of characters underneath | DoubleHeader Char -- ^ Lines of characters above and below @@ -1049,17 +1070,11 @@ charOrRef cs = guard (c `elem` cs) return c) -updateLastStrPos :: Parser [Char] ParserState () -updateLastStrPos = getPosition >>= \p -> - updateState $ \s -> s{ stateLastStrPos = Just p } - singleQuoteStart :: Parser [Char] ParserState () singleQuoteStart = do failIfInQuoteContext InSingleQuote - pos <- getPosition - st <- getState -- single quote start can't be right after str - guard $ stateLastStrPos st /= Just pos + guard =<< notAfterString () <$ charOrRef "'\8216\145" singleQuoteEnd :: Parser [Char] st () @@ -1156,4 +1171,3 @@ applyMacros' target = do then do macros <- extractMacros `fmap` getState return $ applyMacros macros target else return target - diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index d1637b701..1ac98e94c 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -1474,9 +1474,7 @@ strongOrEmph = enclosure '*' <|> (checkIntraword >> enclosure '_') where checkIntraword = do exts <- getOption readerExtensions when (Ext_intraword_underscores `Set.member` exts) $ do - pos <- getPosition - lastStrPos <- stateLastStrPos <$> getState - guard $ lastStrPos /= Just pos + guard =<< notAfterString -- | Parses a list of inlines between start and end delimiters. inlinesBetween :: (Show b) @@ -1518,8 +1516,7 @@ nonEndline = satisfy (/='\n') str :: MarkdownParser (F Inlines) str = do result <- many1 alphaNum - pos <- getPosition - updateState $ \s -> s{ stateLastStrPos = Just pos } + updateLastStrPos let spacesToNbr = map (\c -> if c == ' ' then '\160' else c) isSmart <- getOption readerSmart if isSmart @@ -1821,9 +1818,7 @@ citeKey :: MarkdownParser (Bool, String) citeKey = try $ do -- make sure we're not right after an alphanumeric, -- since foo@bar.baz is probably an email address - lastStrPos <- stateLastStrPos <$> getState - pos <- getPosition - guard $ lastStrPos /= Just pos + guard =<< notAfterString suppress_author <- option False (char '-' >> return True) char '@' first <- letter <|> char '_' diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index 2e4a29beb..5dbcaee98 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -105,6 +105,10 @@ instance HasMeta OrgParserState where deleteMeta field st = st{ orgStateMeta = deleteMeta field $ orgStateMeta st } +instance HasLastStrPosition OrgParserState where + getLastStrPos = orgStateLastStrPos + setLastStrPos pos st = st{ orgStateLastStrPos = Just pos } + instance Default OrgParserState where def = defaultOrgParserState @@ -1274,13 +1278,6 @@ afterEmphasisPreChar = do lastPrePos <- orgStateLastPreCharPos <$> getState return . fromMaybe True $ (== pos) <$> lastPrePos --- | Whether we are right after the end of a string -notAfterString :: OrgParser Bool -notAfterString = do - pos <- getPosition - lastStrPos <- orgStateLastStrPos <$> getState - return $ lastStrPos /= Just pos - -- | Whether the parser is right after a forbidden border char notAfterForbiddenBorderChar :: OrgParser Bool notAfterForbiddenBorderChar = do From 2423f9e6b180bc6b04d222a4b574de995d296f80 Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Wed, 14 May 2014 14:58:05 +0200 Subject: [PATCH 2/3] Move `citeKey` from Readers.Markdown to Parsing The function can be used by other readers, so it is made accessible for all parsers. --- src/Text/Pandoc/Parsing.hs | 13 +++++++++++++ src/Text/Pandoc/Readers/Markdown.hs | 14 -------------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 344f6c7ba..4cd6591c0 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -94,6 +94,7 @@ module Text.Pandoc.Parsing ( (>>~), apostrophe, dash, nested, + citeKey, macro, applyMacros', Parser, @@ -1144,6 +1145,18 @@ nested p = do updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } return res +citeKey :: HasLastStrPosition st => Parser [Char] st (Bool, String) +citeKey = try $ do + guard =<< notAfterString + suppress_author <- option False (char '-' *> return True) + char '@' + firstChar <- letter <|> char '_' + let regchar = satisfy (\c -> isAlphaNum c || c == '_') + let internal p = try $ p <* lookAhead regchar + rest <- many $ regchar <|> internal (oneOf ":.#$%&-+?<>~/") + let key = firstChar:rest + return (suppress_author, key) + -- -- Macros -- diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 1ac98e94c..5129bc2e3 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -1814,20 +1814,6 @@ normalCite = try $ do char ']' return citations -citeKey :: MarkdownParser (Bool, String) -citeKey = try $ do - -- make sure we're not right after an alphanumeric, - -- since foo@bar.baz is probably an email address - guard =<< notAfterString - suppress_author <- option False (char '-' >> return True) - char '@' - first <- letter <|> char '_' - let regchar = satisfy (\c -> isAlphaNum c || c == '_') - let internal p = try $ p >>~ lookAhead regchar - rest <- many $ regchar <|> internal (oneOf ":.#$%&-+?<>~/") - let key = first:rest - return (suppress_author, key) - suffix :: MarkdownParser (F Inlines) suffix = try $ do hasSpace <- option False (notFollowedBy nonspaceChar >> return True) From ceeb701c254c6dc4c054e10dd151d9ef6f751ad7 Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Wed, 14 May 2014 14:49:30 +0200 Subject: [PATCH 3/3] Org reader: support Pandocs citation extension Citations are defined via the "normal citation" syntax used in markdown, with the sole difference that newlines are not allowed between "[...]". This is for consistency, as org-mode generally disallows newlines between square brackets. The extension is turned on by default and can be turned off via the default syntax-extension mechanism, i.e. by specifying "org-citation" as the input format. Move `citeKey` from Readers.Markdown into Parsing The function can be used by other readers, so it is made accessible for all parsers. --- src/Text/Pandoc.hs | 2 +- src/Text/Pandoc/Readers/Org.hs | 55 ++++++++++++++++++++++++++++++++-- tests/Tests/Readers/Org.hs | 22 ++++++++++++++ 3 files changed, 76 insertions(+), 3 deletions(-) diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs index dd5bc18f6..130338f0e 100644 --- a/src/Text/Pandoc.hs +++ b/src/Text/Pandoc.hs @@ -275,6 +275,7 @@ getDefaultExtensions "markdown_mmd" = multimarkdownExtensions getDefaultExtensions "markdown_github" = githubMarkdownExtensions getDefaultExtensions "markdown" = pandocExtensions getDefaultExtensions "plain" = pandocExtensions +getDefaultExtensions "org" = Set.fromList [Ext_citations] getDefaultExtensions "textile" = Set.fromList [Ext_auto_identifiers, Ext_raw_tex] getDefaultExtensions _ = Set.fromList [Ext_auto_identifiers] @@ -319,4 +320,3 @@ readJSON _ = either error id . eitherDecode' . UTF8.fromStringLazy writeJSON :: WriterOptions -> Pandoc -> String writeJSON _ = UTF8.toStringLazy . encode - diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index 5dbcaee98..86dda2732 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -869,6 +869,7 @@ inline :: OrgParser (F Inlines) inline = choice [ whitespace , linebreak + , cite , footnote , linkOrImage , anchor @@ -933,6 +934,51 @@ endline = try $ do updateLastPreCharPos return . return $ B.space +cite :: OrgParser (F Inlines) +cite = try $ do + guardEnabled Ext_citations + (cs, raw) <- withRaw normalCite + return $ (flip B.cite (B.text raw)) <$> cs + +normalCite :: OrgParser (F [Citation]) +normalCite = try $ char '[' + *> skipSpaces + *> citeList + <* skipSpaces + <* char ']' + +citeList :: OrgParser (F [Citation]) +citeList = sequence <$> sepBy1 citation (try $ char ';' *> skipSpaces) + +citation :: OrgParser (F Citation) +citation = try $ do + pref <- prefix + (suppress_author, key) <- citeKey + suff <- suffix + return $ do + x <- pref + y <- suff + return $ Citation{ citationId = key + , citationPrefix = B.toList x + , citationSuffix = B.toList y + , citationMode = if suppress_author + then SuppressAuthor + else NormalCitation + , citationNoteNum = 0 + , citationHash = 0 + } + where + prefix = trimInlinesF . mconcat <$> + manyTill inline (char ']' <|> (']' <$ lookAhead citeKey)) + suffix = try $ do + hasSpace <- option False (notFollowedBy nonspaceChar >> return True) + skipSpaces + rest <- trimInlinesF . mconcat <$> + many (notFollowedBy (oneOf ";]") *> inline) + return $ if hasSpace + then (B.space <>) <$> rest + else rest + footnote :: OrgParser (F Inlines) footnote = try $ inlineNote <|> referencedNote @@ -1007,7 +1053,7 @@ selfTarget :: OrgParser String selfTarget = try $ char '[' *> linkTarget <* char ']' linkTarget :: OrgParser String -linkTarget = enclosed (char '[') (char ']') (noneOf "\n\r[]") +linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]") applyCustomLinkFormat :: String -> OrgParser (F String) applyCustomLinkFormat link = do @@ -1083,7 +1129,12 @@ inlineCodeBlock = try $ do let attrClasses = [translateLang lang, rundocBlockClass] let attrKeyVal = map toRundocAttrib (("language", lang) : opts) returnF $ B.codeWith ("", attrClasses, attrKeyVal) inlineCode - where enclosedByPair s e p = char s *> many1Till p (char e) + +enclosedByPair :: Char -- ^ opening char + -> Char -- ^ closing char + -> OrgParser a -- ^ parser + -> OrgParser [a] +enclosedByPair s e p = char s *> many1Till p (char e) emph :: OrgParser (F Inlines) emph = fmap B.emph <$> emphasisBetween '/' diff --git a/tests/Tests/Readers/Org.hs b/tests/Tests/Readers/Org.hs index 4ef7a7731..ca97ba348 100644 --- a/tests/Tests/Readers/Org.hs +++ b/tests/Tests/Readers/Org.hs @@ -225,6 +225,28 @@ tests = ] ) "echo 'Hello, World'") + + , "Citation" =: + "[@nonexistent]" =?> + let citation = Citation + { citationId = "nonexistent" + , citationPrefix = [] + , citationSuffix = [] + , citationMode = NormalCitation + , citationNoteNum = 0 + , citationHash = 0} + in (para $ cite [citation] "[@nonexistent]") + + , "Citation containing text" =: + "[see @item1 p. 34-35]" =?> + let citation = Citation + { citationId = "item1" + , citationPrefix = [Str "see"] + , citationSuffix = [Space ,Str "p.",Space,Str "34-35"] + , citationMode = NormalCitation + , citationNoteNum = 0 + , citationHash = 0} + in (para $ cite [citation] "[see @item1 p. 34-35]") ] , testGroup "Meta Information" $