pandoc/Text/Pandoc/Readers/LaTeX.hs

{-
Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-}

{- |
   Module      : Text.Pandoc.Readers.LaTeX
   Copyright   : Copyright (C) 2006-8 John MacFarlane
   License     : GNU GPL, version 2 or above

   Maintainer  : John MacFarlane <jgm@berkeley.edu>
   Stability   : alpha
   Portability : portable

Conversion of LaTeX to 'Pandoc' document.
-}
module Text.Pandoc.Readers.LaTeX (
                                  readLaTeX,
                                  rawLaTeXInline,
                                  rawLaTeXEnvironment
                                 ) where

import Text.ParserCombinators.Parsec
import Text.Pandoc.Definition
import Text.Pandoc.Shared
import Data.Maybe ( fromMaybe )
import Data.Char ( chr )
import Data.List ( isPrefixOf, isSuffixOf )

-- | Parse LaTeX from string and return 'Pandoc' document.
readLaTeX :: ParserState   -- ^ Parser state, including options for parser
          -> String        -- ^ String to parse
          -> Pandoc
readLaTeX = readWith parseLaTeX

-- characters with special meaning
specialChars :: [Char]
specialChars = "\\`$%^&_~#{}\n \t|<>'\"-"

--
-- utility functions
--

-- | Returns text between brackets and its matching pair.
bracketedText :: Char -> Char -> GenParser Char st [Char]
bracketedText openB closeB = do
  result <- charsInBalanced' openB closeB
  return $ [openB] ++ result ++ [closeB]

-- | Returns an option or argument of a LaTeX command.
optOrArg :: GenParser Char st [Char]
optOrArg = bracketedText '{' '}' <|> bracketedText '[' ']'

-- | True if the string begins with '{'.
isArg :: [Char] -> Bool
isArg ('{':_) = True
isArg _       = False

-- | Returns list of options and arguments of a LaTeX command.
commandArgs :: GenParser Char st [[Char]]
commandArgs = many optOrArg

-- | Parses LaTeX command, returns (name, star, list of options or arguments).
command :: GenParser Char st ([Char], [Char], [[Char]])
command = do
  char '\\'
  name <- many1 letter
  star <- option "" (string "*")  -- some commands have starred versions
  args <- commandArgs
  return (name, star, args)

begin :: [Char] -> GenParser Char st [Char]
begin name = try $ do
  string $ "\\begin{" ++ name ++ "}"
  optional commandArgs
  spaces
  return name

end :: [Char] -> GenParser Char st [Char]
end name = try $ do
  string $ "\\end{" ++ name ++ "}"
  spaces
  return name

-- | Returns a list of block elements containing the contents of an
-- environment.
environment name = try $ begin name >> spaces >> manyTill block (end name)

anyEnvironment =  try $ do
  string "\\begin{"
  name <- many letter
  star <- option "" (string "*") -- some environments have starred variants
  char '}'
  optional commandArgs
  spaces
  contents <- manyTill block (end (name ++ star))
  return $ BlockQuote contents

--
-- parsing documents
--

-- | Process LaTeX preamble, extracting metadata.
processLaTeXPreamble :: GenParser Char ParserState ()
processLaTeXPreamble = try $ manyTill
  (choice [bibliographic, comment, unknownCommand, nullBlock])
  (try (string "\\begin{document}")) >>
  spaces

-- | Parse LaTeX and return 'Pandoc'.
parseLaTeX :: GenParser Char ParserState Pandoc
parseLaTeX = do
  optional processLaTeXPreamble -- preamble might not be present (fragment)
  spaces
  blocks <- parseBlocks
  spaces
  optional $ try (string "\\end{document}" >> many anyChar)
  -- might not be present (fragment)
  spaces
  eof
  state <- getState
  let blocks' = filter (/= Null) blocks
  let title' = stateTitle state
  let authors' = stateAuthors state
  let date' = stateDate state
  return $ Pandoc (Meta title' authors' date')  blocks'

--
-- parsing blocks
--

parseBlocks = spaces >> many block

block = choice [ hrule
               , codeBlock
               , header
               , list
               , blockQuote
               , mathBlock
               , comment
               , bibliographic
               , para
               , specialEnvironment
               , itemBlock
               , unknownEnvironment
               , unknownCommand ] <?> "block"

--
-- header blocks
--

header = try $ do
  char '\\'
  subs <- many (try (string "sub"))
  string "section"
  optional (char '*')
  char '{'
  title' <- manyTill inline (char '}')
  spaces
  return $ Header (length subs + 1) (normalizeSpaces title')

--
-- hrule block
--

hrule :: GenParser Char st Block
hrule = oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n",
                       "\\newpage" ] >> spaces >> return HorizontalRule

--
-- code blocks
--

codeBlock :: GenParser Char st Block
codeBlock = codeBlock1 <|> codeBlock2

codeBlock1 :: GenParser Char st Block
codeBlock1 = try $ do
  string "\\begin{verbatim}"  -- don't use begin function because it
                              -- gobbles whitespace
  optional blanklines         -- we want to gobble blank lines, but not
                              -- leading space
  contents <- manyTill anyChar (try (string "\\end{verbatim}"))
  spaces
  return $ CodeBlock ("",[],[]) (stripTrailingNewlines contents)

codeBlock2 :: GenParser Char st Block
codeBlock2 = try $ do
  string "\\begin{Verbatim}"  -- used by fancyvrb package
  optional blanklines
  contents <- manyTill anyChar (try (string "\\end{Verbatim}"))
  spaces
  return $ CodeBlock ("",[],[]) (stripTrailingNewlines contents)

--
-- block quotes
--

blockQuote = (environment "quote" <|> environment "quotation") >>~ spaces >>=
             return . BlockQuote

--
-- math block
--

mathBlock :: GenParser Char st Block
mathBlock = mathBlockWith (begin "equation") (end "equation") <|>
            mathBlockWith (begin "displaymath") (end "displaymath") <|>
            mathBlockWith (try $ string "\\[") (try $ string "\\]") <?>
            "math block"

mathBlockWith :: GenParser Char st t
              -> GenParser Char st end
              -> GenParser Char st Block
mathBlockWith start end' = try $ do
  start
  spaces
  result <- manyTill anyChar end'
  spaces
  return $ BlockQuote [Para [Math result]]

--
-- list blocks
--

list = bulletList <|> orderedList <|> definitionList <?> "list"

listItem = try $ do
  ("item", _, args) <- command
  spaces
  state <- getState
  let oldParserContext = stateParserContext state
  updateState (\s -> s {stateParserContext = ListItemState})
  blocks <- many block
  updateState (\s -> s {stateParserContext = oldParserContext})
  opt <- case args of
           ([x]) | "[" `isPrefixOf` x && "]" `isSuffixOf` x ->
                       parseFromString (many inline) $ tail $ init x
           _        -> return []
  return (opt, blocks)

orderedList = try $ do
  string "\\begin{enumerate}"
  (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) $
                              try $ do failIfStrict
                                       char '['
                                       res <- anyOrderedListMarker
                                       char ']'
                                       return res
  spaces
  option "" $ try $ do string "\\setlength{\\itemindent}"
                       char '{'
                       manyTill anyChar (char '}')
  spaces
  start <- option 1 $ try $ do failIfStrict
                               string "\\setcounter{enum"
                               many1 (oneOf "iv")
                               string "}{"
                               num <- many1 digit
                               char '}'
                               spaces
                               return $ (read num) + 1
  items <- many listItem
  end "enumerate"
  spaces
  return $ OrderedList (start, style, delim) $ map snd items

bulletList = try $ do
  begin "itemize"
  spaces
  items <- many listItem
  end "itemize"
  spaces
  return (BulletList $ map snd items)

definitionList = try $ do
  begin "description"
  spaces
  items <- many listItem
  end "description"
  spaces
  return (DefinitionList items)

--
-- paragraph block
--

para = many1 inline >>~ spaces >>= return . Para . normalizeSpaces

--
-- title authors date
--

bibliographic = choice [ maketitle, title, authors, date ]

maketitle :: GenParser Char st Block
maketitle = try (string "\\maketitle") >> spaces >> return Null

title = try $ do
  string "\\title{"
  tit <- manyTill inline (char '}')
  spaces
  updateState (\state -> state { stateTitle = tit })
  return Null

authors :: GenParser Char ParserState Block
authors = try $ do
  string "\\author{"
  authors' <- manyTill anyChar (char '}')
  spaces
  let authors'' = map removeLeadingTrailingSpace $ lines $
                  substitute "\\\\" "\n" authors'
  updateState (\s -> s { stateAuthors = authors'' })
  return Null

date :: GenParser Char ParserState Block
date = try $ do
  string "\\date{"
  date' <- manyTill anyChar (char '}')
  spaces
  updateState (\state -> state { stateDate = date' })
  return Null

--
-- item block
-- for use in unknown environments that aren't being parsed as raw latex
--

-- this forces items to be parsed in different blocks
itemBlock :: GenParser Char ParserState Block
itemBlock = try $ do
  ("item", _, args) <- command
  state <- getState
  if (stateParserContext state == ListItemState)
     then fail "item should be handled by list block"
     else if null args
             then return Null
             else return $ Plain [Str (stripFirstAndLast (head args))]

--
-- raw LaTeX
--

specialEnvironment :: GenParser Char st Block
specialEnvironment = do  -- these are always parsed as raw
  lookAhead (choice (map (\name -> begin name)  ["tabular", "figure",
              "tabbing", "eqnarry", "picture", "table", "verse", "theorem"]))
  rawLaTeXEnvironment

-- | Parse any LaTeX environment and return a Para block containing
-- the whole literal environment as raw TeX.
rawLaTeXEnvironment :: GenParser Char st Block
rawLaTeXEnvironment = try $ do
  string "\\begin{"
  name <- many1 letter
  star <- option "" (string "*") -- for starred variants
  let name' = name ++ star
  char '}'
  args <- option [] commandArgs
  let argStr = concat args
  contents <- manyTill (choice [ (many1 (noneOf "\\")),
                                 (do
                                    (Para [TeX s]) <- rawLaTeXEnvironment
                                    return s),
                                 string "\\" ])
                       (end name')
  spaces
  return $ Para [TeX $ "\\begin{" ++ name' ++ "}" ++ argStr ++
                 concat contents ++ "\\end{" ++ name' ++ "}"]

unknownEnvironment = try $ do
  state <- getState
  result <- if stateParseRaw state -- check whether we should include raw TeX
               then rawLaTeXEnvironment -- if so, get whole raw environment
               else anyEnvironment      -- otherwise just the contents
  return result

unknownCommand :: GenParser Char ParserState Block
unknownCommand = try $ do
  notFollowedBy' $ choice $ map end ["itemize", "enumerate", "description",
                                     "document"]
  (name, star, args) <- command
  spaces
  let argStr = concat args
  state <- getState
  if name == "item" && (stateParserContext state) == ListItemState
     then fail "should not be parsed as raw"
     else return ""
  if stateParseRaw state
     then return $ Plain [TeX ("\\" ++ name ++ star ++ argStr)]
     else return $ Plain [Str (joinWithSep " " args)]

-- latex comment
comment :: GenParser Char st Block
comment = try $ char '%' >> manyTill anyChar newline >> spaces >> return Null

--
-- inline
--

inline :: GenParser Char ParserState Inline
inline =  choice [ str
                 , endline
                 , whitespace
                 , quoted
                 , apostrophe
                 , spacer
                 , strong
                 , math
                 , ellipses
                 , emDash
                 , enDash
                 , hyphen
                 , emph
                 , strikeout
                 , superscript
                 , subscript
                 , ref
                 , lab
                 , code
                 , url
                 , link
                 , image
                 , footnote
                 , linebreak
                 , accentedChar
                 , specialChar
                 , rawLaTeXInline
                 , escapedChar
                 , unescapedChar
                 ] <?> "inline"

accentedChar = normalAccentedChar <|> specialAccentedChar

normalAccentedChar = try $ do
  char '\\'
  accent <- oneOf "'`^\"~"
  character <- (try $ char '{' >> letter >>~ char '}') <|> letter
  let table = fromMaybe [] $ lookup character accentTable
  let result = case lookup accent table of
                 Just num  -> chr num
                 Nothing   -> '?'
  return $ Str [result]

-- an association list of letters and association list of accents
-- and decimal character numbers.
accentTable =
  [ ('A', [('`', 192), ('\'', 193), ('^', 194), ('~', 195), ('"', 196)]),
    ('E', [('`', 200), ('\'', 201), ('^', 202), ('"', 203)]),
    ('I', [('`', 204), ('\'', 205), ('^', 206), ('"', 207)]),
    ('N', [('~', 209)]),
    ('O', [('`', 210), ('\'', 211), ('^', 212), ('~', 213), ('"', 214)]),
    ('U', [('`', 217), ('\'', 218), ('^', 219), ('"', 220)]),
    ('a', [('`', 224), ('\'', 225), ('^', 227), ('"', 228)]),
    ('e', [('`', 232), ('\'', 233), ('^', 234), ('"', 235)]),
    ('i', [('`', 236), ('\'', 237), ('^', 238), ('"', 239)]),
    ('n', [('~', 241)]),
    ('o', [('`', 242), ('\'', 243), ('^', 244), ('~', 245), ('"', 246)]),
    ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ]

specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig,
                               oslash, pound, euro, copyright, sect ]

ccedil = try $ do
  char '\\'
  letter <- oneOfStrings ["cc", "cC"]
  let num = if letter == "cc" then 231 else 199
  return $ Str [chr num]

aring = try $ do
  char '\\'
  letter <- oneOfStrings ["aa", "AA"]
  let num = if letter == "aa" then 229 else 197
  return $ Str [chr num]

iuml = try (string "\\\"") >> oneOfStrings ["\\i", "{\\i}"] >>
       return (Str [chr 239])

icirc = try (string "\\^") >> oneOfStrings ["\\i", "{\\i}"] >>
        return (Str [chr 238])

szlig = try (string "\\ss") >> return (Str [chr 223])

oslash = try $ do
  char '\\'
  letter <- choice [char 'o', char 'O']
  let num = if letter == 'o' then 248 else 216
  return $ Str [chr num]

aelig = try $ do
  char '\\'
  letter <- oneOfStrings ["ae", "AE"]
  let num = if letter == "ae" then 230 else 198
  return $ Str [chr num]

pound = try (string "\\pounds") >> return (Str [chr 163])

euro = try (string "\\euro") >> return (Str [chr 8364])

copyright = try (string "\\copyright") >> return (Str [chr 169])

sect = try (string "\\S") >> return (Str [chr 167])

escapedChar = do
  result <- escaped (oneOf " $%&_#{}\n")
  return $ if result == Str "\n" then Str " " else result

-- ignore standalone, nonescaped special characters
unescapedChar = oneOf "`$^&_#{}|<>" >> return (Str "")

specialChar = choice [ backslash, tilde, caret, bar, lt, gt, doubleQuote ]

backslash = try (string "\\textbackslash") >> return (Str "\\")

tilde = try (string "\\ensuremath{\\sim}") >> return (Str "~")

caret = try (string "\\^{}") >> return (Str "^")

bar = try (string "\\textbar") >> return (Str "\\")

lt = try (string "\\textless") >> return (Str "<")

gt = try (string "\\textgreater") >> return (Str ">")

doubleQuote = char '"' >> return (Str "\"")

code = code1 <|> code2

code1 = try $ do
  string "\\verb"
  marker <- anyChar
  result <- manyTill anyChar (char marker)
  return $ Code $ removeLeadingTrailingSpace result

code2 = try $ do
  string "\\texttt{"
  result <- manyTill (noneOf "\\\n~$%^&{}") (char '}')
  return $ Code result

emph :: GenParser Char ParserState Inline
emph = try $ oneOfStrings [ "\\emph{", "\\textit{" ] >>
             manyTill inline (char '}') >>= return . Emph

strikeout :: GenParser Char ParserState Inline
strikeout = try $ string "\\sout{" >> manyTill inline (char '}') >>=
                  return . Strikeout

superscript :: GenParser Char ParserState Inline
superscript = try $ string "\\textsuperscript{" >>
                    manyTill inline (char '}') >>= return . Superscript

-- note: \textsubscript isn't a standard latex command, but we use
-- a defined version in pandoc.
subscript :: GenParser Char ParserState Inline
subscript = try $ string "\\textsubscript{" >> manyTill inline (char '}') >>=
                  return . Subscript

apostrophe :: GenParser Char ParserState Inline
apostrophe = char '\'' >> return Apostrophe

quoted :: GenParser Char ParserState Inline
quoted = doubleQuoted <|> singleQuoted

singleQuoted :: GenParser Char ParserState Inline
singleQuoted = enclosed singleQuoteStart singleQuoteEnd inline >>=
               return . Quoted SingleQuote . normalizeSpaces

doubleQuoted :: GenParser Char ParserState Inline
doubleQuoted = enclosed doubleQuoteStart doubleQuoteEnd inline >>=
               return . Quoted DoubleQuote . normalizeSpaces

singleQuoteStart :: GenParser Char st Char
singleQuoteStart = char '`'

singleQuoteEnd = try $ char '\'' >> notFollowedBy alphaNum

doubleQuoteStart = string "``"

doubleQuoteEnd = try $ string "''"

ellipses = try $ string "\\ldots" >> optional (try (string "{}")) >>
                 return Ellipses

enDash = try (string "--") >> return EnDash

emDash = try (string "---") >> return EmDash

hyphen = char '-' >> return (Str "-")

lab = try $ do
  string "\\label{"
  result <- manyTill anyChar (char '}')
  return $ Str $ "(" ++ result ++ ")"

ref = try (string "\\ref{") >> manyTill anyChar (char '}') >>= return . Str

strong :: GenParser Char ParserState Inline
strong = try (string "\\textbf{") >> manyTill inline (char '}') >>=
         return . Strong

whitespace = many1 (oneOf "~ \t") >> return Space

-- hard line break
linebreak = try (string "\\\\") >> return LineBreak

spacer = try (string "\\,") >> return (Str "")

str = many1 (noneOf specialChars) >>= return . Str

-- endline internal to paragraph
endline = try $ newline >> notFollowedBy blankline >> return Space

-- math
math = math1 <|> math2 <?> "math"

math1 = try $ do
  char '$'
  result <- many (noneOf "$")
  char '$'
  return $ Math result

math2 = try $ do
  string "\\("
  result <- many (noneOf "$")
  string "\\)"
  return $ Math result

--
-- links and images
--

url = try $ do
  string "\\url"
  url <- charsInBalanced '{' '}'
  return $ Link [Code url] (url, "")

link = try $ do
  string "\\href{"
  url <- manyTill anyChar (char '}')
  char '{'
  label <- manyTill inline (char '}')
  return $ Link (normalizeSpaces label) (url, "")

image = try $ do
  ("includegraphics", _, args) <- command
  let args' = filter isArg args -- filter out options
  let src = if null args' then
              ("", "")
            else
              (stripFirstAndLast (head args'), "")
  return $ Image [Str "image"] src

footnote = try $ do
  (name, _, (contents:[])) <- command
  if ((name == "footnote") || (name == "thanks"))
     then string ""
     else fail "not a footnote or thanks command"
  let contents' = stripFirstAndLast contents
  -- parse the extracted block, which may contain various block elements:
  rest <- getInput
  setInput $ contents'
  blocks <- parseBlocks
  setInput rest
  return $ Note blocks

-- | Parse any LaTeX command and return it in a raw TeX inline element.
rawLaTeXInline :: GenParser Char ParserState Inline
rawLaTeXInline = try $ do
  (name, star, args) <- command
  state <- getState
  if ((name == "begin") || (name == "end") || (name == "item"))
     then fail "not an inline command"
     else string ""
  if stateParseRaw state
     then return $ TeX ("\\" ++ name ++ star ++ concat args)
     else return $ Str (joinWithSep " " args)