pandoc/src/Text/Pandoc/Readers/LaTeX.hs

{-
Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-}

{- |
   Module      : Text.Pandoc.Readers.LaTeX
   Copyright   : Copyright (C) 2006-7 John MacFarlane
   License     : GNU GPL, version 2 or above 

   Maintainer  : John MacFarlane <jgm@berkeley.edu>
   Stability   : alpha
   Portability : portable

Conversion of LaTeX to 'Pandoc' document.
-}
module Text.Pandoc.Readers.LaTeX ( 
                                  readLaTeX,
                                  rawLaTeXInline,
                                  rawLaTeXEnvironment
                                 ) where

import Text.ParserCombinators.Parsec
import Text.Pandoc.ParserCombinators
import Text.Pandoc.Definition
import Text.Pandoc.Shared 
import Data.Maybe ( fromMaybe )
import Data.Char ( chr )

-- | Parse LaTeX from string and return 'Pandoc' document.
readLaTeX :: ParserState   -- ^ Parser state, including options for parser
          -> String        -- ^ String to parse
          -> Pandoc
readLaTeX = readWith parseLaTeX

-- for testing
testString = testStringWith parseLaTeX

-- characters with special meaning
specialChars = "\\$%&^&_~#{}\n \t|<>'\"-"

--
-- utility functions
--

-- | Returns text between brackets and its matching pair.
bracketedText openB closeB = try (do
  char openB
  result <- many (choice [ oneOfStrings [ ['\\', openB], ['\\', closeB] ],
                           count 1 (noneOf [openB, closeB]),
                           bracketedText openB closeB ])
  char closeB
  return ([openB] ++ (concat result) ++ [closeB]))

-- | Returns an option or argument of a LaTeX command.
optOrArg = choice [ (bracketedText '{' '}'), (bracketedText '[' ']') ]

-- | True if the string begins with '{'.
isArg ('{':rest) = True
isArg other = False

-- | Returns list of options and arguments of a LaTeX command.
commandArgs = many optOrArg

-- | Parses LaTeX command, returns (name, star, list of options or arguments).
command = try (do
  char '\\'
  name <- many1 alphaNum
  star <- option "" (string "*")  -- some commands have starred versions
  args <- commandArgs
  return (name, star, args))

begin name = try (do
  string "\\begin{"
  string name
  char '}'
  option [] commandArgs
  spaces
  return name)

end name = try (do
  string "\\end{"
  string name
  char '}'
  spaces
  return name)

-- | Returns a list of block elements containing the contents of an
-- environment.
environment name = try (do
  begin name
  spaces
  contents <- manyTill block (end name)
  return contents)

anyEnvironment =  try (do
  string "\\begin{"
  name <- many alphaNum 
  star <- option "" (string "*") -- some environments have starred variants
  char '}'
  option [] commandArgs
  spaces
  contents <- manyTill block (end (name ++ star))
  return (BlockQuote contents))

--
-- parsing documents
--

-- | Process LaTeX preamble, extracting metadata.
processLaTeXPreamble = try (do
  manyTill (choice [bibliographic, comment, unknownCommand, nullBlock]) 
           (try (string "\\begin{document}"))
  spaces)

-- | Parse LaTeX and return 'Pandoc'.
parseLaTeX = do
  option () processLaTeXPreamble -- preamble might not be present (fragment)
  blocks <- parseBlocks
  spaces
  option "" (try (string "\\end{document}")) -- might not be present (in fragment)
  spaces
  eof
  state <- getState
  let blocks' = filter (/= Null) blocks
  let title' = stateTitle state
  let authors' = stateAuthors state
  let date' = stateDate state
  return (Pandoc (Meta title' authors' date')  blocks')

--
-- parsing blocks
--

parseBlocks = do
  spaces
  result <- many block
  return result

block = choice [ hrule, codeBlock, header, list, blockQuote, mathBlock,
                 comment, bibliographic, para, specialEnvironment,
                 itemBlock, unknownEnvironment, unknownCommand ] <?>
                 "block"

--
-- header blocks
--

header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"

headerLevel n = try (do
  let subs = concat $ replicate (n - 1) "sub"
  string ("\\" ++ subs ++ "section")
  option ' ' (char '*')
  char '{'
  title <- manyTill inline (char '}')
  spaces
  return (Header n (normalizeSpaces title)))

--
-- hrule block
--

hrule = try (do
  oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n", 
                 "\\newpage" ]
  spaces
  return HorizontalRule)

--
-- code blocks
--

codeBlock = try (do
  string "\\begin{verbatim}"  -- don't use begin function because it 
                              -- gobbles whitespace
  option "" blanklines        -- we want to gobble blank lines, but not 
                              -- leading space
  contents <- manyTill anyChar (try (string "\\end{verbatim}"))
  spaces
  return (CodeBlock (stripTrailingNewlines contents)))

--
-- block quotes
--

blockQuote = choice [ blockQuote1, blockQuote2 ] <?> "blockquote"

blockQuote1 = try (do
  blocks <- environment "quote"
  spaces
  return (BlockQuote blocks))

blockQuote2 = try (do
  blocks <- environment "quotation"
  spaces
  return (BlockQuote blocks))

--
-- math block
--

mathBlock = mathBlockWith (begin "equation") (end "equation") <|> 
            mathBlockWith (begin "displaymath") (end "displaymath") <|>
            mathBlockWith (string "\\[") (string "\\]") <?> "math block"

mathBlockWith start end = try (do
  start
  spaces
  result <- manyTill anyChar end
  spaces
  return (BlockQuote [Para [TeX ("$" ++ result ++ "$")]]))

--
-- list blocks
--

list = bulletList <|> orderedList <?> "list"

listItem = try (do
    ("item", _, _) <- command
    spaces
    state <- getState
    let oldParserContext = stateParserContext state
    updateState (\state -> state {stateParserContext = ListItemState})
    blocks <- many block
    updateState (\state -> state {stateParserContext = oldParserContext})
    return blocks)

orderedList = try (do
    begin "enumerate"
    spaces
    items <- many listItem
    end "enumerate"
    spaces
    return (OrderedList items))

bulletList = try (do
    begin "itemize"
    spaces
    items <- many listItem
    end "itemize"
    spaces
    return (BulletList items))

--
-- paragraph block
--

para = try (do 
  result <- many1 inline
  spaces
  return (Para (normalizeSpaces result)))

--
-- title authors date
--

bibliographic = choice [ maketitle, title, authors, date ]

maketitle = try (do
  string "\\maketitle"
  spaces
  return Null)

title = try (do
  string "\\title{"
  tit <- manyTill inline (char '}')
  spaces
  updateState (\state -> state { stateTitle = tit })
  return Null)

authors = try (do
  string "\\author{"
  authors <- manyTill anyChar (char '}')
  spaces
  let authors' = map removeLeadingTrailingSpace $ lines $
                 substitute "\\\\" "\n" authors
  updateState (\state -> state { stateAuthors = authors' })
  return Null)

date = try (do
  string "\\date{"
  date' <- manyTill anyChar (char '}')
  spaces
  updateState (\state -> state { stateDate = date' })
  return Null)

--
-- item block
-- for use in unknown environments that aren't being parsed as raw latex
--

-- this forces items to be parsed in different blocks
itemBlock = try (do
  ("item", _, args) <- command
  state <- getState
  if (stateParserContext state == ListItemState)
     then fail "item should be handled by list block"
     else if null args 
             then return Null
             else return (Plain [Str (stripFirstAndLast (head args))]))

--
-- raw LaTeX 
--

specialEnvironment = do  -- these are always parsed as raw
  lookAhead (choice (map (\name -> begin name)  ["tabular", "figure",
              "tabbing", "eqnarry", "picture", "table", "verse", "theorem"]))
  rawLaTeXEnvironment

-- | Parse any LaTeX environment and return a Para block containing
-- the whole literal environment as raw TeX.
rawLaTeXEnvironment :: GenParser Char st Block
rawLaTeXEnvironment = try (do
  string "\\begin"
  char '{'
  name <- many1 alphaNum
  star <- option "" (string "*") -- for starred variants
  let name' = name ++ star
  char '}'
  args <- option [] commandArgs
  let argStr = concat args
  contents <- manyTill (choice [(many1 (noneOf "\\")), 
                                 (do 
                                    (Para [TeX str]) <- rawLaTeXEnvironment
                                    return str),
                                 string "\\" ]) 
                       (end name')
  spaces
  return (Para [TeX ("\\begin{" ++ name' ++ "}" ++ argStr ++ 
          (concat contents) ++ "\\end{" ++ name' ++ "}")]))

unknownEnvironment = try (do
  state <- getState
  result <- if stateParseRaw state -- check whether we should include raw TeX 
               then rawLaTeXEnvironment -- if so, get whole raw environment
               else anyEnvironment      -- otherwise just the contents
  return result)

unknownCommand = try (do
  notFollowedBy' (string "\\end{itemize}")
  notFollowedBy' (string "\\end{enumerate}")
  notFollowedBy' (string "\\end{document}")
  (name, star, args) <- command
  spaces
  let argStr = concat args
  state <- getState
  if (name == "item") && ((stateParserContext state) == ListItemState)
     then fail "should not be parsed as raw"
     else string ""
  if stateParseRaw state
     then return (Plain [TeX ("\\" ++ name ++ star ++ argStr)])
     else return (Plain [Str (joinWithSep " " args)]))

-- latex comment
comment = try (do
  char '%'
  result <- manyTill anyChar newline
  spaces
  return Null)

-- 
-- inline
--

inline =  choice [ strong, emph, strikeout, ref, lab, code, linebreak, spacer,
                   math, ellipses, emDash, enDash, hyphen, quoted, apostrophe,
                   accentedChar, specialChar, specialInline, escapedChar,
                   unescapedChar, str, endline, whitespace ] <?> "inline"

specialInline = choice [ link, image, footnote, rawLaTeXInline ] 
                <?> "link, raw TeX, note, or image"

accentedChar = normalAccentedChar <|> specialAccentedChar

normalAccentedChar = try (do
  char '\\'
  accent <- oneOf "'`^\"~"
  character <- choice [ between (char '{') (char '}') anyChar, anyChar ]
  let table = fromMaybe [] $ lookup character accentTable 
  let result = case lookup accent table of
                 Just num  -> chr num
                 Nothing   -> '?'
  return (Str [result]))

-- an association list of letters and association list of accents
-- and decimal character numbers.
accentTable = 
  [ ('A', [('`', 192), ('\'', 193), ('^', 194), ('~', 195), ('"', 196)]),
    ('E', [('`', 200), ('\'', 201), ('^', 202), ('"', 203)]),
    ('I', [('`', 204), ('\'', 205), ('^', 206), ('"', 207)]),
    ('N', [('~', 209)]),
    ('O', [('`', 210), ('\'', 211), ('^', 212), ('~', 213), ('"', 214)]),
    ('U', [('`', 217), ('\'', 218), ('^', 219), ('"', 220)]),
    ('a', [('`', 224), ('\'', 225), ('^', 227), ('"', 228)]),
    ('e', [('`', 232), ('\'', 233), ('^', 234), ('"', 235)]),
    ('i', [('`', 236), ('\'', 237), ('^', 238), ('"', 239)]),
    ('n', [('~', 241)]),
    ('o', [('`', 242), ('\'', 243), ('^', 244), ('~', 245), ('"', 246)]),
    ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ]

specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig,
                              oslash, pound, euro, copyright, sect ]

ccedil = try (do
  char '\\'
  letter <- oneOfStrings ["cc", "cC"]
  let num = if letter == "cc" then 231 else 199
  return (Str [chr num]))

aring = try (do
  char '\\'
  letter <- oneOfStrings ["aa", "AA"]
  let num = if letter == "aa" then 229 else 197
  return (Str [chr num]))

iuml = try (do
  string "\\\""
  oneOfStrings ["\\i", "{\\i}"]
  return (Str [chr 239]))

icirc = try (do
  string "\\^"
  oneOfStrings ["\\i", "{\\i}"]
  return (Str [chr 238]))

szlig = try (do
  string "\\ss"
  return (Str [chr 223]))

oslash = try (do
  char '\\'
  letter <- choice [char 'o', char 'O']
  let num = if letter == 'o' then 248 else 216
  return (Str [chr num]))

aelig = try (do
  char '\\'
  letter <- oneOfStrings ["ae", "AE"]
  let num = if letter == "ae" then 230 else 198
  return (Str [chr num]))

pound = try (do 
  string "\\pounds"
  return (Str [chr 163]))

euro = try (do
  string "\\euro"
  return (Str [chr 8364]))

copyright = try (do
  string "\\copyright"
  return (Str [chr 169]))

sect = try (do
  string "\\S"
  return (Str [chr 167]))

escapedChar = do
  result <- escaped (oneOf " $%&_#{}\n")
  return (if result == Str "\n" then Str " " else result)

unescapedChar = do  -- ignore standalone, nonescaped special characters
  oneOf "$^&_#{}|<>"
  return (Str "")

specialChar = choice [ backslash, tilde, caret, bar, lt, gt ]

backslash = try (do 
  string "\\textbackslash"
  return (Str "\\"))

tilde = try (do 
  string "\\ensuremath{\\sim}"
  return (Str "~"))

caret = try (do 
  string "\\^{}"
  return (Str "^"))

bar = try (do
  string "\\textbar"
  return (Str "\\"))

lt = try (do
  string "\\textless"
  return (Str "<"))

gt = try (do
  string "\\textgreater"
  return (Str ">"))

code = try (do 
  string "\\verb"
  marker <- anyChar
  result <- manyTill anyChar (char marker)
  let result' = removeLeadingTrailingSpace result
  return (Code result'))

emph = try (do 
  oneOfStrings [ "\\emph{", "\\textit{" ]
  result <- manyTill inline (char '}')
  return (Emph result))

strikeout = try $ do 
  string "\\sout{"
  result <- manyTill inline (char '}')
  return (Strikeout result)

apostrophe = do
  char '\''
  return Apostrophe

quoted = do
  doubleQuoted <|> singleQuoted

singleQuoted = try (do
  result <- enclosed singleQuoteStart singleQuoteEnd inline
  return $ Quoted SingleQuote $ normalizeSpaces result)

doubleQuoted = try (do
  result <- enclosed doubleQuoteStart doubleQuoteEnd inline
  return $ Quoted DoubleQuote $ normalizeSpaces result)

singleQuoteStart = char '`'

singleQuoteEnd = try (do
  char '\''
  notFollowedBy alphaNum)

doubleQuoteStart = try (string "``")

doubleQuoteEnd = try (string "''")

ellipses = try (do
  string "\\ldots"
  option "" (try (string "{}"))
  return Ellipses)

enDash = try (do
  string "--"
  notFollowedBy (char '-')
  return EnDash) 

emDash = try (do
  string "---"
  return EmDash)

hyphen = do
  char '-'
  return (Str "-")

lab = try (do
  string "\\label{"
  result <- manyTill anyChar (char '}')
  return (Str ("(" ++ result ++ ")")))

ref = try (do
  string "\\ref{"
  result <- manyTill anyChar (char '}')
  return (Str (result)))

strong = try (do 
  string "\\textbf{"
  result <- manyTill inline (char '}')
  return (Strong result))

whitespace = do
  many1 (oneOf "~ \t")
  return Space

-- hard line break
linebreak = try (do
  string "\\\\"
  return LineBreak)

spacer = try $ do
  string "\\," 
  return (Str "")

str = do 
  result <- many1 (noneOf specialChars)
  return (Str result)

-- endline internal to paragraph
endline = try (do
  newline
  notFollowedBy blankline
  return Space)

-- math
math = math1 <|> math2 <?> "math"

math1 = try (do
  char '$'
  result <- many (noneOf "$")
  char '$'
  return (TeX ("$" ++ result ++ "$")))

math2 = try (do
  string "\\("
  result <- many (noneOf "$")
  string "\\)"
  return (TeX ("$" ++ result ++ "$")))

--
-- links and images
--

link = try (do
  string "\\href{"
  url <- manyTill anyChar (char '}')
  char '{'
  label <- manyTill inline (char '}') 
  return (Link (normalizeSpaces label) (url, "")))

image = try (do
  ("includegraphics", _, args) <- command
  let args' = filter isArg args -- filter out options
  let src = if null args' then
              ("", "")
            else
              (stripFirstAndLast (head args'), "")
  return (Image [Str "image"] src))

footnote = try (do
  (name, _, (contents:[])) <- command
  if ((name == "footnote") || (name == "thanks"))
     then string ""
     else fail "not a footnote or thanks command"
  let contents' = stripFirstAndLast contents
  -- parse the extracted block, which may contain various block elements:
  rest <- getInput
  setInput $ contents'
  blocks <- parseBlocks
  setInput rest
  return (Note blocks))

-- | Parse any LaTeX command and return it in a raw TeX inline element.
rawLaTeXInline :: GenParser Char ParserState Inline
rawLaTeXInline = try (do
  (name, star, args) <- command
  let argStr = concat args
  state <- getState
  if ((name == "begin") || (name == "end") || (name == "item"))
     then fail "not an inline command" 
     else string ""
  return (TeX ("\\" ++ name ++ star ++ argStr)))