pandoc/src/Text/Pandoc/Shared.hs

418 lines
20 KiB
Haskell
Raw Normal View History

-- | Utility functions and definitions used by the various Pandoc modules.
module Text.Pandoc.Shared (
-- * Text processing
gsub,
joinWithSep,
tabsToSpaces,
backslashEscape,
escapePreservingRegex,
endsWith,
stripTrailingNewlines,
removeLeadingTrailingSpace,
removeLeadingSpace,
removeTrailingSpace,
-- * Parsing
readWith,
testStringWith,
HeaderType (..),
ParserContext (..),
ParserState (..),
defaultParserState,
-- * Native format prettyprinting
prettyPandoc,
-- * Pandoc block list processing
consolidateList,
isNoteBlock,
splitBySpace,
normalizeSpaces,
compactify,
generateReference,
WriterOptions (..),
KeyTable,
keyTable,
lookupKeySrc,
refsMatch,
replaceReferenceLinks,
replaceRefLinksBlockList
) where
import Text.Pandoc.Definition
import Text.ParserCombinators.Parsec
import Text.Pandoc.HtmlEntities ( decodeEntities )
import Text.Regex ( matchRegexAll, mkRegex, subRegex, Regex )
import Char ( toLower )
import List ( find, groupBy )
-- | Parse a string with a given parser and state.
readWith :: GenParser Char ParserState a -- ^ parser
-> ParserState -- ^ initial state
-> String -- ^ input string
-> a
readWith parser state input =
case runParser parser state "source" input of
Left err -> error $ "\nError:\n" ++ show err
Right result -> result
-- | Parse a string with @parser@ (for testing).
testStringWith :: (Show a) =>
GenParser Char ParserState a
-> String
-> IO ()
testStringWith parser str = putStrLn $ show $ readWith parser defaultParserState str
-- | Parser state
data HeaderType
= SingleHeader Char -- ^ Single line of characters underneath
| DoubleHeader Char -- ^ Lines of characters above and below
deriving (Eq, Show)
data ParserContext
= BlockQuoteState -- ^ Used when running parser on contents of blockquote
| ListItemState -- ^ Used when running parser on list item contents
| NullState -- ^ Default state
deriving (Eq, Show)
data ParserState = ParserState
{ stateParseRaw :: Bool, -- ^ Parse untranslatable HTML and LaTeX?
stateParserContext :: ParserContext, -- ^ What are we parsing?
stateKeyBlocks :: [Block], -- ^ List of reference key blocks
stateKeysUsed :: [[Inline]], -- ^ List of references used so far
stateNoteBlocks :: [Block], -- ^ List of note blocks
stateTabStop :: Int, -- ^ Tab stop
stateStandalone :: Bool, -- ^ If @True@, parse bibliographic info
stateTitle :: [Inline], -- ^ Title of document
stateAuthors :: [String], -- ^ Authors of document
stateDate :: String, -- ^ Date of document
stateHeaderTable :: [HeaderType] } -- ^ List of header types used, in what order (for reStructuredText only)
deriving Show
defaultParserState :: ParserState
defaultParserState =
ParserState { stateParseRaw = False,
stateParserContext = NullState,
stateKeyBlocks = [],
stateKeysUsed = [],
stateNoteBlocks = [],
stateTabStop = 4,
stateStandalone = False,
stateTitle = [],
stateAuthors = [],
stateDate = [],
stateHeaderTable = [] }
-- | Consolidate @Str@s and @Space@s in an inline list into one big @Str@.
-- Collapse adjacent @Space@s.
consolidateList :: [Inline] -> [Inline]
consolidateList ((Str a):(Str b):rest) = consolidateList ((Str (a ++ b)):rest)
consolidateList ((Str a):Space:rest) = consolidateList ((Str (a ++ " ")):rest)
consolidateList (Space:(Str a):rest) = consolidateList ((Str (" " ++ a)):rest)
consolidateList (Space:Space:rest) = consolidateList ((Str " "):rest)
consolidateList (inline:rest) = inline:(consolidateList rest)
consolidateList [] = []
-- | Indent string as a block.
indentBy :: Int -- ^ Number of spaces to indent the block
-> Int -- ^ Number of spaces to indent first line, relative to block
-> String -- ^ Contents of block to indent
-> String
indentBy num first [] = ""
indentBy num first str =
let (firstLine:restLines) = lines str
firstLineIndent = num + first in
(replicate firstLineIndent ' ') ++ firstLine ++ "\n" ++ (joinWithSep "\n" $ map (\line -> (replicate num ' ') ++ line) restLines)
-- | Prettyprint list of Pandoc blocks elements.
prettyBlockList :: Int -- ^ Number of spaces to indent list of blocks
-> [Block] -- ^ List of blocks
-> String
prettyBlockList indent [] = indentBy indent 0 "[]"
prettyBlockList indent blocks = indentBy indent (-2) $ "[ " ++ (joinWithSep "\n, " (map prettyBlock blocks)) ++ " ]"
-- | Prettyprint Pandoc block element.
prettyBlock :: Block -> String
prettyBlock (BlockQuote blocks) = "BlockQuote\n " ++ (prettyBlockList 2 blocks)
prettyBlock (Note ref blocks) = "Note " ++ (show ref) ++ "\n " ++ (prettyBlockList 2 blocks)
prettyBlock (OrderedList blockLists) = "OrderedList\n" ++ indentBy 2 0 ("[ " ++ (joinWithSep ", " (map (\blocks -> prettyBlockList 2 blocks) blockLists))) ++ " ]"
prettyBlock (BulletList blockLists) = "BulletList\n" ++ indentBy 2 0 ("[ " ++ (joinWithSep ", " (map (\blocks -> prettyBlockList 2 blocks) blockLists))) ++ " ]"
prettyBlock block = show block
-- | Prettyprint Pandoc document.
prettyPandoc :: Pandoc -> String
prettyPandoc (Pandoc meta blocks) = "Pandoc " ++ "(" ++ (show meta) ++ ")\n" ++ (prettyBlockList 0 blocks)
-- | Convert tabs to spaces (with adjustable tab stop).
tabsToSpaces :: Int -- ^ Tabstop
-> String -- ^ String to convert
-> String
tabsToSpaces tabstop str =
unlines (map (tabsInLine tabstop tabstop) (lines str))
-- | Convert tabs to spaces in one line.
tabsInLine :: Int -- ^ Number of spaces to next tab stop
-> Int -- ^ Tabstop
-> String -- ^ Line to convert
-> String
tabsInLine num tabstop "" = ""
tabsInLine num tabstop (c:cs) =
let replacement = (if (c == '\t') then (replicate num ' ') else [c]) in
let nextnumraw = (num - (length replacement)) in
let nextnum = if (nextnumraw < 1) then (nextnumraw + tabstop) else nextnumraw in
replacement ++ (tabsInLine nextnum tabstop cs)
-- | Substitute string for every occurrence of regular expression.
gsub :: String -- ^ Regular expression (as string) to substitute for
-> String -- ^ String to substitute for the regex
-> String -- ^ String to be substituted in
-> String
gsub regex replacement str = subRegex (mkRegex regex) str replacement
-- | Escape designated characters with backslash.
backslashEscape :: [Char] -- ^ list of special characters to escape
-> String -- ^ string input
-> String
backslashEscape special [] = []
backslashEscape special (x:xs) = if x `elem` special then
'\\':x:(backslashEscape special xs)
else
x:(backslashEscape special xs)
-- | Escape string by applying a function, but don't touch anything that matches regex.
escapePreservingRegex :: (String -> String) -- ^ Escaping function
-> Regex -- ^ Regular expression
-> String -- ^ String to be escaped
-> String
escapePreservingRegex escapeFunction regex str =
case (matchRegexAll regex str) of
Nothing -> escapeFunction str
Just (before, matched, after, _) ->
(escapeFunction before) ++ matched ++
(escapePreservingRegex escapeFunction regex after)
-- | Returns @True@ if string ends with given character.
endsWith :: Char -> [Char] -> Bool
endsWith char [] = False
endsWith char str = (char == last str)
-- | Returns @True@ if block is a @Note@ block
isNoteBlock :: Block -> Bool
isNoteBlock (Note ref blocks) = True
isNoteBlock _ = False
-- | Joins a list of lists, separated by another list.
joinWithSep :: [a] -- ^ List to use as separator
-> [[a]] -- ^ Lists to join
-> [a]
joinWithSep sep [] = []
joinWithSep sep lst = foldr1 (\a b -> a ++ sep ++ b) lst
-- | Strip trailing newlines from string.
stripTrailingNewlines :: String -> String
stripTrailingNewlines "" = ""
stripTrailingNewlines str =
if (last str) == '\n' then
stripTrailingNewlines (init str)
else
str
-- | Remove leading and trailing space (including newlines) from string.
removeLeadingTrailingSpace :: String -> String
removeLeadingTrailingSpace = removeLeadingSpace . removeTrailingSpace
-- | Remove leading space (including newlines) from string.
removeLeadingSpace :: String -> String
removeLeadingSpace = dropWhile (\x -> (x == ' ') || (x == '\n') || (x == '\t'))
-- | Remove trailing space (including newlines) from string.
removeTrailingSpace :: String -> String
removeTrailingSpace = reverse . removeLeadingSpace . reverse
-- | Split list of inlines into groups separated by a space.
splitBySpace :: [Inline] -> [[Inline]]
splitBySpace lst = filter (\a -> (/= Space) (head a))
(groupBy (\a b -> (/= Space) a && (/= Space) b) lst)
-- | Normalize a list of inline elements: remove leading and trailing
-- @Space@ elements, and collapse double @Space@s into singles.
normalizeSpaces :: [Inline] -> [Inline]
normalizeSpaces [] = []
normalizeSpaces list =
let removeDoubles [] = []
removeDoubles (Space:Space:rest) = removeDoubles (Space:rest)
removeDoubles (x:rest) = x:(removeDoubles rest) in
let removeLeading [] = []
removeLeading lst = if ((head lst) == Space) then tail lst else lst in
let removeTrailing [] = []
removeTrailing lst = if ((last lst) == Space) then init lst else lst in
removeLeading $ removeTrailing $ removeDoubles list
-- | Change final list item from @Para@ to @Plain@ if the list should be compact.
compactify :: [[Block]] -- ^ List of list items (each a list of blocks)
-> [[Block]]
compactify [] = []
compactify items =
let final = last items
others = init items in
case final of
[Para a] -> if any containsPara others then items else others ++ [[Plain a]]
otherwise -> items
containsPara :: [Block] -> Bool
containsPara [] = False
containsPara ((Para a):rest) = True
containsPara ((BulletList items):rest) = (any containsPara items) || (containsPara rest)
containsPara ((OrderedList items):rest) = (any containsPara items) || (containsPara rest)
containsPara (x:rest) = containsPara rest
-- | Options for writers
data WriterOptions = WriterOptions
{ writerStandalone :: Bool -- ^ If @True@, writer header and footer
, writerTitlePrefix :: String -- ^ Prefix for HTML titles
, writerHeader :: String -- ^ Header for the document
, writerIncludeBefore :: String -- ^ String to include before the document body
, writerIncludeAfter :: String -- ^ String to include after the document body
, writerSmartypants :: Bool -- ^ If @True@, use smart quotes, dashes, and ellipses
, writerS5 :: Bool -- ^ @True@ if we're writing S5 instead of normal HTML
, writerIncremental :: Bool -- ^ If @True@, display S5 lists incrementally
, writerNumberSections :: Bool -- ^ If @True@, number sections in LaTeX
, writerTabStop :: Int } -- ^ Tabstop for conversion between spaces and tabs
deriving Show
--
-- Functions for constructing lists of reference keys
--
-- | Returns @Just@ numerical key reference if there's already a key
-- for the specified target in the list of blocks, otherwise @Nothing@.
keyFoundIn :: [Block] -- ^ List of key blocks to search
-> Target -- ^ Target to search for
-> Maybe String
keyFoundIn [] src = Nothing
keyFoundIn ((Key [Str num] src1):rest) src = if (src1 == src) then
Just num
else
keyFoundIn rest src
keyFoundIn (_:rest) src = keyFoundIn rest src
-- | Return next unique numerical key, given keyList
nextUniqueKey :: [[Inline]] -> String
nextUniqueKey keys =
let nums = [1..10000]
notAKey n = not (any (== [Str (show n)]) keys) in
case (find notAKey nums) of
Just x -> show x
Nothing -> error "Could not find unique key for reference link"
-- | Generate a reference for a URL (either an existing reference, if
-- there is one, or a new one, if there isn't) and update parser state.
generateReference :: String -- ^ URL
-> String -- ^ Title
-> GenParser tok ParserState Target
generateReference url title = do
let src = Src (decodeEntities url) (decodeEntities title)
state <- getState
let keyBlocks = stateKeyBlocks state
let keysUsed = stateKeysUsed state
case (keyFoundIn keyBlocks src) of
Just num -> return (Ref [Str num])
Nothing -> do
let nextNum = nextUniqueKey keysUsed
updateState (\st -> st {stateKeyBlocks = (Key [Str nextNum] src):keyBlocks,
stateKeysUsed = [Str nextNum]:keysUsed})
return (Ref [Str nextNum])
--
-- code to replace reference links with real links and remove unneeded key blocks
--
type KeyTable = [([Inline], Target)]
-- | Returns @True@ if block is a Key block
isRefBlock :: Block -> Bool
isRefBlock (Key _ _) = True
isRefBlock _ = False
-- | Returns a pair of a list of pairs of keys and associated sources, and a new
-- list of blocks with the included key blocks deleted.
keyTable :: [Block] -> (KeyTable, [Block])
keyTable [] = ([],[])
keyTable ((Key ref target):lst) = (((ref, target):table), rest)
where (table, rest) = keyTable lst
keyTable (Null:lst) = keyTable lst -- get rid of Nulls
keyTable (Blank:lst) = keyTable lst -- get rid of Blanks
keyTable ((BlockQuote blocks):lst) = ((table1 ++ table2), ((BlockQuote rest1):rest2))
where (table1, rest1) = keyTable blocks
(table2, rest2) = keyTable lst
keyTable ((Note ref blocks):lst) = ((table1 ++ table2), ((Note ref rest1):rest2))
where (table1, rest1) = keyTable blocks
(table2, rest2) = keyTable lst
keyTable ((OrderedList blockLists):lst) = ((table1 ++ table2), ((OrderedList rest1):rest2))
where results = map keyTable blockLists
rest1 = map snd results
table1 = concatMap fst results
(table2, rest2) = keyTable lst
keyTable ((BulletList blockLists):lst) = ((table1 ++ table2), ((BulletList rest1):rest2))
where results = map keyTable blockLists
rest1 = map snd results
table1 = concatMap fst results
(table2, rest2) = keyTable lst
keyTable (other:lst) = (table, (other:rest))
where (table, rest) = keyTable lst
-- | Look up key in key table and return target object.
lookupKeySrc :: KeyTable -- ^ Key table
-> [Inline] -- ^ Key
-> Maybe Target
lookupKeySrc table key = case table of
[] -> Nothing
(k, src):rest -> if (refsMatch k key) then Just src else lookupKeySrc rest key
-- | Returns @True@ if keys match (case insensitive).
refsMatch :: [Inline] -> [Inline] -> Bool
refsMatch ((Str x):restx) ((Str y):resty) = ((map toLower x) == (map toLower y)) && refsMatch restx resty
refsMatch ((Code x):restx) ((Code y):resty) = ((map toLower x) == (map toLower y)) && refsMatch restx resty
refsMatch ((TeX x):restx) ((TeX y):resty) = ((map toLower x) == (map toLower y)) && refsMatch restx resty
refsMatch ((HtmlInline x):restx) ((HtmlInline y):resty) = ((map toLower x) == (map toLower y)) && refsMatch restx resty
refsMatch ((NoteRef x):restx) ((NoteRef y):resty) = ((map toLower x) == (map toLower y)) && refsMatch restx resty
refsMatch ((Emph x):restx) ((Emph y):resty) = refsMatch x y && refsMatch restx resty
refsMatch ((Strong x):restx) ((Strong y):resty) = refsMatch x y && refsMatch restx resty
refsMatch (x:restx) (y:resty) = (x == y) && refsMatch restx resty
refsMatch [] x = null x
refsMatch x [] = null x
-- | Replace reference links with explicit links in list of blocks, removing key blocks.
replaceReferenceLinks :: [Block] -> [Block]
replaceReferenceLinks blocks =
let (keytable, purged) = keyTable blocks in
replaceRefLinksBlockList keytable purged
-- | Use key table to replace reference links with explicit links in a list of blocks
replaceRefLinksBlockList :: KeyTable -> [Block] -> [Block]
replaceRefLinksBlockList keytable lst = map (replaceRefLinksBlock keytable) lst
-- | Use key table to replace reference links with explicit links in a block
replaceRefLinksBlock :: KeyTable -> Block -> Block
replaceRefLinksBlock keytable (Plain lst) = Plain (map (replaceRefLinksInline keytable) lst)
replaceRefLinksBlock keytable (Para lst) = Para (map (replaceRefLinksInline keytable) lst)
replaceRefLinksBlock keytable (Header lvl lst) = Header lvl (map (replaceRefLinksInline keytable) lst)
replaceRefLinksBlock keytable (BlockQuote lst) = BlockQuote (map (replaceRefLinksBlock keytable) lst)
replaceRefLinksBlock keytable (Note ref lst) = Note ref (map (replaceRefLinksBlock keytable) lst)
replaceRefLinksBlock keytable (OrderedList lst) = OrderedList (map (replaceRefLinksBlockList keytable) lst)
replaceRefLinksBlock keytable (BulletList lst) = BulletList (map (replaceRefLinksBlockList keytable) lst)
replaceRefLinksBlock keytable other = other
-- | Use key table to replace reference links with explicit links in an inline element.
replaceRefLinksInline :: KeyTable -> Inline -> Inline
replaceRefLinksInline keytable (Link text (Ref ref)) = (Link newText newRef)
where newRef = case lookupKeySrc keytable (if (null ref) then text else ref) of
Nothing -> (Ref ref)
Just src -> src
newText = map (replaceRefLinksInline keytable) text
replaceRefLinksInline keytable (Image text (Ref ref)) = (Image newText newRef)
where newRef = case lookupKeySrc keytable (if (null ref) then text else ref) of
Nothing -> (Ref ref)
Just src -> src
newText = map (replaceRefLinksInline keytable) text
replaceRefLinksInline keytable (Emph lst) = Emph (map (replaceRefLinksInline keytable) lst)
replaceRefLinksInline keytable (Strong lst) = Strong (map (replaceRefLinksInline keytable) lst)
replaceRefLinksInline keytable other = other