Org reader: move parser state into separate module

The org reader code has become large and confusing.  Extracting smaller
parts into submodules should help to clean things up.
This commit is contained in:
Albert Krewinkel 2016-05-11 12:26:54 +02:00
parent fd9ec835ec
commit 7a0729ea09
3 changed files with 234 additions and 159 deletions

View file

@ -393,6 +393,7 @@ Library
Text.Pandoc.Readers.Odt.Generic.XMLConverter,
Text.Pandoc.Readers.Odt.Arrows.State,
Text.Pandoc.Readers.Odt.Arrows.Utils,
Text.Pandoc.Readers.Org.ParserState,
Text.Pandoc.Writers.Shared,
Text.Pandoc.Asciify,
Text.Pandoc.MIME,

View file

@ -1,6 +1,5 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE MultiParamTypeClasses, FlexibleContexts, FlexibleInstances #-}
{-
Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de>
@ -31,8 +30,7 @@ Conversion of org-mode formatted plain text to 'Pandoc' document.
module Text.Pandoc.Readers.Org ( readOrg ) where
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Builder ( Inlines, Blocks, HasMeta(..),
trimInlines )
import Text.Pandoc.Builder ( Inlines, Blocks )
import Text.Pandoc.Definition
import Text.Pandoc.Compat.Monoid ((<>))
import Text.Pandoc.Error
@ -43,153 +41,33 @@ import Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF
, parseFromString, blanklines
)
import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline)
import Text.Pandoc.Readers.Org.ParserState
import Text.Pandoc.Shared (compactify', compactify'DL)
import Text.TeXMath (readTeX, writePandoc, DisplayType(..))
import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap
import Control.Arrow (first)
import Control.Monad (foldM, guard, liftM, liftM2, mplus, mzero, when)
import Control.Monad.Reader (Reader, runReader, ask, asks, local)
import Control.Monad (foldM, guard, mplus, mzero, when)
import Control.Monad.Reader ( Reader, runReader )
import Data.Char (isAlphaNum, isSpace, toLower)
import Data.Default
import Data.List (intersperse, isPrefixOf, isSuffixOf)
import qualified Data.Map as M
import qualified Data.Set as Set
import Data.Maybe (fromMaybe, isJust)
import Network.HTTP (urlEncode)
-- | Parse org-mode string and return a Pandoc document.
readOrg :: ReaderOptions -- ^ Reader options
-> String -- ^ String to parse (assuming @'\n'@ line endings)
-> Either PandocError Pandoc
readOrg opts s = flip runReader def $ readWithM parseOrg def{ orgStateOptions = opts } (s ++ "\n\n")
data OrgParserLocal = OrgParserLocal { orgLocalQuoteContext :: QuoteContext }
-- | The parser used to read org files.
type OrgParser = ParserT [Char] OrgParserState (Reader OrgParserLocal)
instance HasIdentifierList OrgParserState where
extractIdentifierList = orgStateIdentifiers
updateIdentifierList f s = s{ orgStateIdentifiers = f (orgStateIdentifiers s) }
instance HasHeaderMap OrgParserState where
extractHeaderMap = orgStateHeaderMap
updateHeaderMap f s = s{ orgStateHeaderMap = f (orgStateHeaderMap s) }
parseOrg :: OrgParser Pandoc
parseOrg = do
blocks' <- parseBlocks
st <- getState
let meta = runF (orgStateMeta' st) st
let removeUnwantedBlocks = dropCommentTrees . filter (/= Null)
return $ Pandoc meta $ removeUnwantedBlocks (B.toList $ runF blocks' st)
-- | Drop COMMENT headers and the document tree below those headers.
dropCommentTrees :: [Block] -> [Block]
dropCommentTrees [] = []
dropCommentTrees (b:bs) =
maybe (b:dropCommentTrees bs)
(dropCommentTrees . flip dropUntilHeaderAboveLevel bs)
(commentHeaderLevel b)
-- | Return the level of a header starting a comment or :noexport: tree and
-- Nothing otherwise.
commentHeaderLevel :: Block -> Maybe Int
commentHeaderLevel blk =
case blk of
(Header level _ ((Str "COMMENT"):_)) -> Just level
(Header level _ title) | hasNoExportTag title -> Just level
_ -> Nothing
where
hasNoExportTag :: [Inline] -> Bool
hasNoExportTag = any isNoExportTag
isNoExportTag :: Inline -> Bool
isNoExportTag (Span ("", ["tag"], [("data-tag-name", "noexport")]) []) = True
isNoExportTag _ = False
-- | Drop blocks until a header on or above the given level is seen
dropUntilHeaderAboveLevel :: Int -> [Block] -> [Block]
dropUntilHeaderAboveLevel n = dropWhile (not . isHeaderLevelLowerEq n)
isHeaderLevelLowerEq :: Int -> Block -> Bool
isHeaderLevelLowerEq n blk =
case blk of
(Header level _ _) -> n >= level
_ -> False
--
-- Parser State for Org
-- Functions acting on the parser state
--
type OrgNoteRecord = (String, F Blocks)
type OrgNoteTable = [OrgNoteRecord]
type OrgBlockAttributes = M.Map String String
type OrgLinkFormatters = M.Map String (String -> String)
-- | Org-mode parser state
data OrgParserState = OrgParserState
{ orgStateOptions :: ReaderOptions
, orgStateAnchorIds :: [String]
, orgStateBlockAttributes :: OrgBlockAttributes
, orgStateEmphasisCharStack :: [Char]
, orgStateEmphasisNewlines :: Maybe Int
, orgStateLastForbiddenCharPos :: Maybe SourcePos
, orgStateLastPreCharPos :: Maybe SourcePos
, orgStateLastStrPos :: Maybe SourcePos
, orgStateLinkFormatters :: OrgLinkFormatters
, orgStateMeta :: Meta
, orgStateMeta' :: F Meta
, orgStateNotes' :: OrgNoteTable
, orgStateParserContext :: ParserContext
, orgStateIdentifiers :: Set.Set String
, orgStateHeaderMap :: M.Map Inlines String
}
instance Default OrgParserLocal where
def = OrgParserLocal NoQuote
instance HasReaderOptions OrgParserState where
extractReaderOptions = orgStateOptions
instance HasMeta OrgParserState where
setMeta field val st =
st{ orgStateMeta = setMeta field val $ orgStateMeta st }
deleteMeta field st =
st{ orgStateMeta = deleteMeta field $ orgStateMeta st }
instance HasLastStrPosition OrgParserState where
getLastStrPos = orgStateLastStrPos
setLastStrPos pos st = st{ orgStateLastStrPos = Just pos }
instance HasQuoteContext st (Reader OrgParserLocal) where
getQuoteContext = asks orgLocalQuoteContext
withQuoteContext q = local (\s -> s{orgLocalQuoteContext = q})
instance Default OrgParserState where
def = defaultOrgParserState
defaultOrgParserState :: OrgParserState
defaultOrgParserState = OrgParserState
{ orgStateOptions = def
, orgStateAnchorIds = []
, orgStateBlockAttributes = M.empty
, orgStateEmphasisCharStack = []
, orgStateEmphasisNewlines = Nothing
, orgStateLastForbiddenCharPos = Nothing
, orgStateLastPreCharPos = Nothing
, orgStateLastStrPos = Nothing
, orgStateLinkFormatters = M.empty
, orgStateMeta = nullMeta
, orgStateMeta' = return nullMeta
, orgStateNotes' = []
, orgStateParserContext = NullState
, orgStateIdentifiers = Set.empty
, orgStateHeaderMap = M.empty
}
recordAnchorId :: String -> OrgParser ()
recordAnchorId i = updateState $ \s ->
s{ orgStateAnchorIds = i : (orgStateAnchorIds s) }
@ -243,6 +121,56 @@ addToNotesTable note = do
oldnotes <- orgStateNotes' <$> getState
updateState $ \s -> s{ orgStateNotes' = note:oldnotes }
--
-- Parser
--
parseOrg :: OrgParser Pandoc
parseOrg = do
blocks' <- parseBlocks
st <- getState
let meta = runF (orgStateMeta' st) st
let removeUnwantedBlocks = dropCommentTrees . filter (/= Null)
return $ Pandoc meta $ removeUnwantedBlocks (B.toList $ runF blocks' st)
-- | Drop COMMENT headers and the document tree below those headers.
dropCommentTrees :: [Block] -> [Block]
dropCommentTrees [] = []
dropCommentTrees (b:bs) =
maybe (b:dropCommentTrees bs)
(dropCommentTrees . flip dropUntilHeaderAboveLevel bs)
(commentHeaderLevel b)
-- | Return the level of a header starting a comment or :noexport: tree and
-- Nothing otherwise.
commentHeaderLevel :: Block -> Maybe Int
commentHeaderLevel blk =
case blk of
(Header level _ ((Str "COMMENT"):_)) -> Just level
(Header level _ title) | hasNoExportTag title -> Just level
_ -> Nothing
where
hasNoExportTag :: [Inline] -> Bool
hasNoExportTag = any isNoExportTag
isNoExportTag :: Inline -> Bool
isNoExportTag (Span ("", ["tag"], [("data-tag-name", "noexport")]) []) = True
isNoExportTag _ = False
-- | Drop blocks until a header on or above the given level is seen
dropUntilHeaderAboveLevel :: Int -> [Block] -> [Block]
dropUntilHeaderAboveLevel n = dropWhile (not . isHeaderLevelLowerEq n)
isHeaderLevelLowerEq :: Int -> Block -> Bool
isHeaderLevelLowerEq n blk =
case blk of
(Header level _ _) -> n >= level
_ -> False
--
-- Adaptions and specializations of parsing utilities
--
-- The version Text.Pandoc.Parsing cannot be used, as we need additional parts
-- of the state saved and restored.
parseFromString :: OrgParser a -> String -> OrgParser a
@ -253,35 +181,6 @@ parseFromString parser str' = do
updateState $ \s -> s{ orgStateLastPreCharPos = oldLastPreCharPos }
return result
--
-- Adaptions and specializations of parsing utilities
--
newtype F a = F { unF :: Reader OrgParserState a
} deriving (Monad, Applicative, Functor)
runF :: F a -> OrgParserState -> a
runF = runReader . unF
askF :: F OrgParserState
askF = F ask
asksF :: (OrgParserState -> a) -> F a
asksF f = F $ asks f
instance Monoid a => Monoid (F a) where
mempty = return mempty
mappend = liftM2 mappend
mconcat = fmap mconcat . sequence
trimInlinesF :: F Inlines -> F Inlines
trimInlinesF = liftM trimInlines
returnF :: a -> OrgParser (F a)
returnF = return . return
-- | Like @Text.Parsec.Char.newline@, but causes additional state changes.
newline :: OrgParser Char
newline =

View file

@ -0,0 +1,175 @@
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-
Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.Org.Options
Copyright : Copyright (C) 2014-2016 Albert Krewinkel
License : GNU GPL, version 2 or above
Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de>
Define the Org-mode parser state.
-}
module Text.Pandoc.Readers.Org.ParserState
( OrgParserState(..)
, OrgParserLocal(..)
, OrgNoteRecord
, F(..)
, askF
, asksF
, trimInlinesF
, runF
, returnF
) where
import Control.Monad (liftM, liftM2)
import Control.Monad.Reader (Reader, runReader, ask, asks, local)
import Data.Default (Default(..))
import qualified Data.Map as M
import qualified Data.Set as Set
import Text.Pandoc.Builder ( Inlines, Blocks, HasMeta(..),
trimInlines )
import Text.Pandoc.Definition ( Meta(..), nullMeta )
import Text.Pandoc.Options ( ReaderOptions(..) )
import Text.Pandoc.Parsing ( HasHeaderMap(..)
, HasIdentifierList(..)
, HasLastStrPosition(..)
, HasQuoteContext(..)
, HasReaderOptions(..)
, ParserContext(..)
, QuoteContext(..)
, SourcePos )
-- | An inline note / footnote containing the note key and its (inline) value.
type OrgNoteRecord = (String, F Blocks)
-- | Table of footnotes
type OrgNoteTable = [OrgNoteRecord]
-- | Map of org block attributes (e.g. LABEL, CAPTION, NAME, etc)
type OrgBlockAttributes = M.Map String String
-- | Map of functions for link transformations. The map key is refers to the
-- link-type, the corresponding function transforms the given link string.
type OrgLinkFormatters = M.Map String (String -> String)
-- | Org-mode parser state
data OrgParserState = OrgParserState
{ orgStateOptions :: ReaderOptions
, orgStateAnchorIds :: [String]
, orgStateBlockAttributes :: OrgBlockAttributes
, orgStateEmphasisCharStack :: [Char]
, orgStateEmphasisNewlines :: Maybe Int
, orgStateLastForbiddenCharPos :: Maybe SourcePos
, orgStateLastPreCharPos :: Maybe SourcePos
, orgStateLastStrPos :: Maybe SourcePos
, orgStateLinkFormatters :: OrgLinkFormatters
, orgStateMeta :: Meta
, orgStateMeta' :: F Meta
, orgStateNotes' :: OrgNoteTable
, orgStateParserContext :: ParserContext
, orgStateIdentifiers :: Set.Set String
, orgStateHeaderMap :: M.Map Inlines String
}
data OrgParserLocal = OrgParserLocal { orgLocalQuoteContext :: QuoteContext }
instance Default OrgParserLocal where
def = OrgParserLocal NoQuote
instance HasReaderOptions OrgParserState where
extractReaderOptions = orgStateOptions
instance HasMeta OrgParserState where
setMeta field val st =
st{ orgStateMeta = setMeta field val $ orgStateMeta st }
deleteMeta field st =
st{ orgStateMeta = deleteMeta field $ orgStateMeta st }
instance HasLastStrPosition OrgParserState where
getLastStrPos = orgStateLastStrPos
setLastStrPos pos st = st{ orgStateLastStrPos = Just pos }
instance HasQuoteContext st (Reader OrgParserLocal) where
getQuoteContext = asks orgLocalQuoteContext
withQuoteContext q = local (\s -> s{orgLocalQuoteContext = q})
instance HasIdentifierList OrgParserState where
extractIdentifierList = orgStateIdentifiers
updateIdentifierList f s = s{ orgStateIdentifiers = f (orgStateIdentifiers s) }
instance HasHeaderMap OrgParserState where
extractHeaderMap = orgStateHeaderMap
updateHeaderMap f s = s{ orgStateHeaderMap = f (orgStateHeaderMap s) }
instance Default OrgParserState where
def = defaultOrgParserState
defaultOrgParserState :: OrgParserState
defaultOrgParserState = OrgParserState
{ orgStateOptions = def
, orgStateAnchorIds = []
, orgStateBlockAttributes = M.empty
, orgStateEmphasisCharStack = []
, orgStateEmphasisNewlines = Nothing
, orgStateLastForbiddenCharPos = Nothing
, orgStateLastPreCharPos = Nothing
, orgStateLastStrPos = Nothing
, orgStateLinkFormatters = M.empty
, orgStateMeta = nullMeta
, orgStateMeta' = return nullMeta
, orgStateNotes' = []
, orgStateParserContext = NullState
, orgStateIdentifiers = Set.empty
, orgStateHeaderMap = M.empty
}
--
-- Parser state reader
--
-- | Reader monad wrapping the parser state. This is used to delay evaluation
-- until all relevant information has been parsed and made available in the
-- parser state. See also the newtype of the same name in
-- Text.Pandoc.Parsing.
newtype F a = F { unF :: Reader OrgParserState a
} deriving (Functor, Applicative, Monad)
instance Monoid a => Monoid (F a) where
mempty = return mempty
mappend = liftM2 mappend
mconcat = fmap mconcat . sequence
runF :: F a -> OrgParserState -> a
runF = runReader . unF
askF :: F OrgParserState
askF = F ask
asksF :: (OrgParserState -> a) -> F a
asksF f = F $ asks f
trimInlinesF :: F Inlines -> F Inlines
trimInlinesF = liftM trimInlines
returnF :: Monad m => a -> m (F a)
returnF = return . return