Org reader: fix parsing problem for colons in headline

Fixed a problem where words surrounded by colons could causing parse failures in some cases when they occurred in headers. Fixes: #5993
2019-12-19 07:09:43 +01:00 · 2019-12-19 07:09:43 +01:00 · 0a3cc7260c
commit 0a3cc7260c
parent 5bd2d28b19
3 changed files with 37 additions and 11 deletions
--- a/src/Text/Pandoc/Readers/Org/DocumentTree.hs
+++ b/src/Text/Pandoc/Readers/Org/DocumentTree.hs
@ -1,6 +1,7 @@
 {-# LANGUAGE FlexibleContexts  #-}
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE TupleSections     #-}
 {- |
   Module      : Text.Pandoc.Readers.Org.DocumentTree
   Copyright   : Copyright (C) 2014-2019 Albert Krewinkel
@ -16,8 +17,8 @@ module Text.Pandoc.Readers.Org.DocumentTree
  ) where

 import Prelude
-import Control.Arrow ((***))
-import Control.Monad (guard, void)
+import Control.Arrow ((***), first)
+import Control.Monad (guard)
 import Data.List (intersperse)
 import Data.Maybe (mapMaybe)
 import Data.Text (Text)
@ -110,15 +111,13 @@ headline blocks inline lvl = try $ do
  level <- headerStart
  guard (lvl <= level)
  todoKw <- optionMaybe todoKeyword
-  title <- trimInlinesF . mconcat <$> manyTill inline endOfTitle
-  tags  <- option [] headerTags
-  newline
+  (title, tags) <- manyThen inline endOfTitle
  planning   <- option emptyPlanning planningInfo
  properties <- option mempty propertiesDrawer
  contents   <- blocks
  children   <- many (headline blocks inline (level + 1))
  return $ do
-    title'    <- title
+    title'    <- trimInlinesF (mconcat title)
    contents' <- contents
    children' <- sequence children
    return Headline
@ -132,13 +131,29 @@ headline blocks inline lvl = try $ do
      , headlineChildren = children'
      }
 where
-   endOfTitle :: Monad m => OrgParser m ()
-   endOfTitle = void . lookAhead $ optional headerTags *> newline
+   endOfTitle :: Monad m => OrgParser m [Tag]
+   endOfTitle = try $ do
+     skipSpaces
+     tags <- option [] (headerTags <* skipSpaces)
+     newline
+     return tags

   headerTags :: Monad m => OrgParser m [Tag]
-   headerTags = try $
-     let tag = orgTagWord <* char ':'
-     in map toTag <$> (skipSpaces *> char ':' *> many1 tag <* skipSpaces)
+   headerTags = try $ do
+     char ':'
+     endBy1 (toTag <$> orgTagWord) (char ':')
+
+   manyThen :: Monad m
+            => OrgParser m a
+            -> OrgParser m b
+            -> OrgParser m ([a], b)
+   manyThen p end = (([],) <$> try end) <|> do
+     x <- p
+     first (x:) <$> manyThen p end
+
+   -- titleFollowedByTags :: Monad m => OrgParser m (Inlines, [Tag])
+   -- titleFollowedByTags = do
+

 unprunedHeadlineToBlocks :: Monad m => Headline -> OrgParserState -> OrgParser m [Block]
 unprunedHeadlineToBlocks hdln st =
--- a/src/Text/Pandoc/Readers/Org/Parsing.hs
+++ b/src/Text/Pandoc/Readers/Org/Parsing.hs
@ -94,6 +94,7 @@ module Text.Pandoc.Readers.Org.Parsing
  , sepBy
  , sepBy1
  , sepEndBy1
+  , endBy1
  , option
  , optional
  , optionMaybe
--- a/test/Tests/Readers/Org/Block/Header.hs
+++ b/test/Tests/Readers/Org/Block/Header.hs
@ -142,6 +142,16 @@ tests =
      "* This: is not: tagged" =?>
      headerWith ("this-is-not-tagged", [], []) 1 "This: is not: tagged"

+  , "Untagged header time followed by colon" =:
+      "** Meeting at 5:23: free food" =?>
+      let attr = ("meeting-at-523-free-food", [], [])
+      in headerWith attr 2 "Meeting at 5:23: free food"
+
+  , "tag followed by text" =:
+      "*** Looks like a :tag: but isn't" =?>
+      let attr = ("looks-like-a-tag-but-isnt", [], [])
+      in headerWith attr 3 "Looks like a :tag: but isn't"
+
  , "Header starting with strokeout text" =:
      T.unlines [ "foo"
                , ""