Added Text.Pandoc.PDF module, 'pdf' as new output option.

The module calls pdflatex to produce the PDF, and is basically shell
scripting in haskell. But this is better than the existing markdown2pdf
script, which is limited to POSIX.


git-svn-id: https://pandoc.googlecode.com/svn/trunk@1394 788f1e2b-df1e-0410-8736-df70ead52e1b
This commit is contained in:
fiddlosopher 2008-08-10 17:33:20 +00:00
parent f435d14f28
commit 0f4bdc6b7d
4 changed files with 176 additions and 160 deletions

106
Main.hs
View file

@ -20,10 +20,10 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Main
Copyright : Copyright (C) 2006-8 John MacFarlane
License : GNU GPL, version 2 or above
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley@edu>
Stability : alpha
Stability : alpha
Portability : portable
Parses command-line options and calls the appropriate readers and
@ -32,6 +32,7 @@ writers.
module Main where
import Text.Pandoc
import Text.Pandoc.ODT
import Text.Pandoc.PDF
import Text.Pandoc.Shared ( joinWithSep, HTMLMathMethod (..) )
import Text.Pandoc.Highlighting ( languages )
import System.Environment ( getArgs, getProgName, getEnvironment )
@ -98,7 +99,7 @@ readers = [("native" , readPandoc)
-- | Reader for native Pandoc format.
readPandoc :: ParserState -> String -> Pandoc
readPandoc _ input = read input
-- | Association list of formats and pairs of writers and default headers.
writers :: [ ( String, ( WriterOptions -> Pandoc -> String, String ) ) ]
writers = [("native" , (writeDoc, ""))
@ -108,6 +109,7 @@ writers = [("native" , (writeDoc, ""))
,("opendocument" , (writeOpenDocument, defaultOpenDocumentHeader))
,("odt" , (writeOpenDocument, defaultOpenDocumentHeader))
,("latex" , (writeLaTeX, defaultLaTeXHeader))
,("pdf" , (writeLaTeX, defaultLaTeXHeader))
,("context" , (writeConTeXt, defaultConTeXtHeader))
,("texinfo" , (writeTexinfo, ""))
,("man" , (writeMan, ""))
@ -118,8 +120,7 @@ writers = [("native" , (writeDoc, ""))
]
isNonTextOutput :: String -> Bool
isNonTextOutput "odt" = True
isNonTextOutput _ = False
isNonTextOutput = (`elem` ["odt", "pdf"])
-- | Writer for Pandoc native format.
writeDoc :: WriterOptions -> Pandoc -> String
@ -205,7 +206,7 @@ options =
(\arg opt -> return opt { optWriter = map toLower arg })
"FORMAT")
"" -- ("(" ++ (joinWithSep ", " $ map fst writers) ++ ")")
, Option "s" ["standalone"]
(NoArg
(\opt -> return opt { optStandalone = True }))
@ -250,7 +251,7 @@ options =
, Option "m" ["asciimathml"]
(OptArg
(\arg opt -> return opt { optHTMLMathMethod =
(\arg opt -> return opt { optHTMLMathMethod =
ASCIIMathML arg })
"URL")
"" -- "Use ASCIIMathML script in html output"
@ -290,13 +291,13 @@ options =
, Option "" ["toc", "table-of-contents"]
(NoArg
(\opt -> return opt { optTableOfContents = True }))
"" -- "Include table of contents"
"" -- "Include table of contents"
, Option "c" ["css"]
(ReqArg
(\arg opt -> do
(\arg opt -> do
let old = optCSS opt
return opt { optCSS = old ++ [arg],
return opt { optCSS = old ++ [arg],
optStandalone = True })
"CSS")
"" -- "Link to CSS style sheet"
@ -340,17 +341,17 @@ options =
, Option "T" ["title-prefix"]
(ReqArg
(\arg opt -> return opt { optTitlePrefix = arg,
(\arg opt -> return opt { optTitlePrefix = arg,
optStandalone = True })
"STRING")
"" -- "String to prefix to HTML window title"
, Option "D" ["print-default-header"]
(ReqArg
(\arg _ -> do
let header = case (lookup arg writers) of
Just (_, h) -> h
Nothing -> error ("Unknown reader: " ++ arg)
Nothing -> error ("Unknown reader: " ++ arg)
hPutStr stdout header
exitWith ExitSuccess)
"FORMAT")
@ -376,7 +377,7 @@ options =
(NoArg
(\opt -> return opt { optIgnoreArgs = True }))
"" -- "Ignore command-line arguments."
, Option "v" ["version"]
(NoArg
(\_ -> do
@ -398,15 +399,15 @@ options =
-- Returns usage message
usageMessage :: String -> [OptDescr (Opt -> IO Opt)] -> String
usageMessage programName opts = usageInfo
(programName ++ " [OPTIONS] [FILES]" ++ "\nInput formats: " ++
(joinWithSep ", " $ map fst readers) ++ "\nOutput formats: " ++
(programName ++ " [OPTIONS] [FILES]" ++ "\nInput formats: " ++
(joinWithSep ", " $ map fst readers) ++ "\nOutput formats: " ++
(joinWithSep ", " $ map fst writers) ++ "\nOptions:")
opts
-- Determine default reader based on source file extensions
defaultReaderName :: [FilePath] -> String
defaultReaderName [] = "markdown"
defaultReaderName (x:xs) =
defaultReaderName (x:xs) =
case takeExtension (map toLower x) of
".xhtml" -> "html"
".html" -> "html"
@ -441,6 +442,7 @@ defaultWriterName x =
".texinfo" -> "texinfo"
".db" -> "docbook"
".odt" -> "odt"
".pdf" -> "pdf"
['.',y] | y `elem` ['1'..'9'] -> "man"
_ -> "html"
@ -464,7 +466,7 @@ main = do
else
return ()
let defaultOpts' = if compatMode
let defaultOpts' = if compatMode
then defaultOpts { optReader = "markdown"
, optWriter = "html"
, optStrict = True }
@ -513,11 +515,11 @@ main = do
let sources = if ignoreArgs then [] else args
-- assign reader and writer based on options and filenames
let readerName' = if null readerName
let readerName' = if null readerName
then defaultReaderName sources
else readerName
let writerName' = if null writerName
let writerName' = if null writerName
then defaultWriterName outputFile
else writerName
@ -539,74 +541,76 @@ main = do
-- remove DOS line endings
tabFilter _ ('\r':'\n':xs) = '\n':(tabFilter tabStop xs)
tabFilter _ ('\r':xs) = '\n':(tabFilter tabStop xs)
tabFilter spsToNextStop ('\t':xs) =
tabFilter spsToNextStop ('\t':xs) =
if preserveTabs
then '\t':(tabFilter tabStop xs)
else replicate spsToNextStop ' ' ++ tabFilter tabStop xs
tabFilter 1 (x:xs) =
then '\t':(tabFilter tabStop xs)
else replicate spsToNextStop ' ' ++ tabFilter tabStop xs
tabFilter 1 (x:xs) =
x:(tabFilter tabStop xs)
tabFilter spsToNextStop (x:xs) =
tabFilter spsToNextStop (x:xs) =
x:(tabFilter (spsToNextStop - 1) xs)
let standalone' = (standalone && not strict) || writerName' == "odt"
let standalone' = (standalone && not strict) || isNonTextOutput writerName'
#ifdef _CITEPROC
refs <- if null modsFile then return [] else readModsColletionFile modsFile
#endif
let startParserState =
let startParserState =
defaultParserState { stateParseRaw = parseRaw,
stateTabStop = tabStop,
stateTabStop = tabStop,
stateSanitizeHTML = sanitize,
stateStandalone = standalone',
#ifdef _CITEPROC
stateCitations = map citeKey refs,
#endif
stateSmart = smart || writerName' `elem`
stateSmart = smart || writerName' `elem`
["latex", "context"],
stateColumns = columns,
stateStrict = strict }
let csslink = if null css
then ""
else concatMap
else concatMap
(\f -> "<link rel=\"stylesheet\" href=\"" ++
f ++ "\" type=\"text/css\" media=\"all\" />\n")
css
let header = (if (customHeader == "DEFAULT")
let header = (if (customHeader == "DEFAULT")
then defaultHeader
else customHeader) ++ csslink ++ includeHeader
let writerOptions = WriterOptions { writerStandalone = standalone',
writerHeader = header,
writerHeader = header,
writerTitlePrefix = titlePrefix,
writerTabStop = tabStop,
writerTabStop = tabStop,
writerTableOfContents = toc &&
(not strict) &&
writerName' /= "s5",
writerHTMLMathMethod = mathMethod,
writerS5 = (writerName' == "s5"),
writerIgnoreNotes = False,
writerIncremental = incremental,
writerIncremental = incremental,
writerNumberSections = numberSections,
writerIncludeBefore = includeBefore,
writerIncludeBefore = includeBefore,
writerIncludeAfter = includeAfter,
writerStrictMarkdown = strict,
writerReferenceLinks = referenceLinks,
writerWrapText = wrap }
let writeOutput = if writerName' == "odt"
then if outputFile == "-"
then \_ -> do
hPutStrLn stderr ("Error: Cannot write " ++ writerName ++
" output to stdout.\n" ++
"Specify an output file using the -o option.")
exitWith $ ExitFailure 5
else let sourceDirRelative = if null sources
then ""
else takeDirectory (head sources)
in saveOpenDocumentAsODT outputFile sourceDirRelative
else if outputFile == "-"
then putStrLn
else writeFile outputFile . (++ "\n")
if isNonTextOutput writerName' && outputFile == "-"
then do hPutStrLn stderr ("Error: Cannot write " ++ writerName ++ " output to stdout.\n" ++
"Specify an output file using the -o option.")
exitWith $ ExitFailure 5
else return ()
let sourceDirRelative = if null sources
then ""
else takeDirectory (head sources)
let writeOutput = case writerName' of
"odt" -> saveOpenDocumentAsODT outputFile sourceDirRelative
"pdf" -> saveLaTeXAsPDF outputFile sourceDirRelative
_ -> if outputFile == "-"
then putStrLn
else writeFile outputFile . (++ "\n")
fmap (reader startParserState . tabFilter tabStop . joinWithSep "\n")
(readSources sources) >>=
@ -615,7 +619,7 @@ main = do
#endif
writeOutput . writer writerOptions
where
where
readSources [] = mapM readSource ["-"]
readSources sources = mapM readSource sources
readSource "-" = getContents

27
README
View file

@ -82,11 +82,28 @@ To convert `hello.html` from html to markdown:
pandoc -f html -t markdown hello.html
Supported output formats include `markdown`, `latex`, `context`
(ConTeXt), `html`, `rtf` (rich text format), `rst` (reStructuredText),
`docbook` (DocBook XML), `opendocument` (OpenDocument XML), `odt`
(OpenOffice text document), `texinfo`, (GNU Texinfo), `mediawiki`
(MediaWiki markup), `man` (groff man), and `s5` (which produces an
HTML file that acts like powerpoint).
(ConTeXt), `pdf`, `html`, `rtf` (rich text format), `rst`
(reStructuredText), `docbook` (DocBook XML), `opendocument`
(OpenDocument XML), `odt` (OpenOffice text document), `texinfo`, (GNU
Texinfo), `mediawiki` (MediaWiki markup), `man` (groff man), and `s5`
(which produces an HTML file that acts like powerpoint).
Notes:
- For `odt` output, you must have `zip` in the path. If you
don't have it installed, you can get the free [Info-ZIP].
- For `pdf` output, you must have `pdflatex` and `bibtex` in the
path. You should also have the following LaTeX packages installed:
`unicode`, `fancyhdr` (if you have verbatim text in footnotes),
`graphicx` (if you use images), `array` (if you use tables), and
`ulem` (if you use strikeout text). If they are not already included
in your LaTeX distribution, you can get them from [CTAN]. A full
[TeX Live] or [MacTeX] distribution will have all of these packages.
[Info-ZIP]: http://www.info-zip.org/Zip.html
[TeX Live]: http://www.tug.org/texlive/
[MacTeX]: http://www.tug.org/mactex/
Supported input formats include `markdown`, `html`, `latex`, and `rst`.
Note that the `rst` reader only parses a subset of reStructuredText

View file

@ -1,4 +1,4 @@
{-# LANGUAGE TemplateHaskell, CPP #-}
{-# LANGUAGE CPP #-}
{-
Copyright (C) 2008 John MacFarlane <jgm@berkeley.edu>
@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.ODT
Module : Text.Pandoc.PDF
Copyright : Copyright (C) 2006-7 John MacFarlane
License : GNU GPL, version 2 or above
@ -26,119 +26,113 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Stability : alpha
Portability : portable
Functions for producing an ODT file from OpenDocument XML.
Functions for producing a PDF file from LaTeX.
-}
module Text.Pandoc.ODT ( saveOpenDocumentAsODT ) where
import Text.Pandoc.TH ( binaryContentsOf )
import Data.Maybe ( fromJust )
import Data.List ( partition, intersperse )
module Text.Pandoc.PDF ( saveLaTeXAsPDF ) where
import Data.List ( isInfixOf )
import System.Directory
import System.FilePath ( (</>), takeDirectory, takeFileName, splitDirectories )
import System.FilePath ( (</>), (<.>), takeBaseName )
import System.Process ( runProcess, waitForProcess )
import System.Exit
import Text.XML.Light
import Text.XML.Light.Cursor
import System.Environment ( getEnvironment )
import Text.Pandoc.Shared ( withTempDir )
import Network.URI ( isURI )
import qualified Data.ByteString as B ( writeFile, pack )
import Data.ByteString.Internal ( c2w )
import Prelude hiding ( writeFile, readFile )
import System.IO ( stderr )
import Prelude hiding ( writeFile, readFile, putStrLn )
import System.IO ( stderr, openFile, IOMode (..) )
#ifdef _UTF8STRING
import System.IO.UTF8
#else
import Text.Pandoc.UTF8
#endif
-- | Produce an ODT file from OpenDocument XML.
saveOpenDocumentAsODT :: FilePath -- ^ Pathname of ODT file to be produced.
-> FilePath -- ^ Relative directory of source file.
-> String -- ^ OpenDocument XML contents.
-> IO ()
saveOpenDocumentAsODT destinationODTPath sourceDirRelative xml = do
let zipCmd = "zip"
-- check for zip in path:
zipPathMaybe <- findExecutable zipCmd
let zipPath = case zipPathMaybe of
Nothing -> error $ "The '" ++ zipCmd ++
"' command, which is needed to build an ODT file, was not found.\n" ++
"It can be obtained from http://www.info-zip.org/Zip.html\n" ++
"Debian (and Debian-based) linux: apt-get install zip\n" ++
"Windows: See http://gnuwin32.sourceforge.net/packages/zip.htm"
-- | Produce an PDF file from LaTeX.
saveLaTeXAsPDF :: FilePath -- ^ Pathname of PDF file to be produced.
-> FilePath -- ^ Relative directory of source file.
-> String -- ^ LaTeX document.
-> IO ()
saveLaTeXAsPDF destinationPDFPath sourceDirRelative latex = do
-- check for pdflatex and bibtex in path:
latexPathMaybe <- findExecutable "pdflatex"
bibtexPathMaybe <- findExecutable "bibtex"
let latexPath = case latexPathMaybe of
Nothing -> error $ "The 'pdflatex' command, which is needed to build an PDF file, was not found."
Just x -> x
withTempDir "pandoc-odt" $ \tempDir -> do
let tempODT = tempDir </> "reference.odt"
B.writeFile tempODT $ B.pack $ map c2w $(binaryContentsOf $ "odt-styles" </> "reference.odt")
xml' <- handlePictures tempODT sourceDirRelative xml
writeFile (tempDir </> "content.xml") xml'
ph <- runProcess zipPath ["-9", "-q", "-r", tempODT, "content.xml", "Pictures"]
(Just tempDir) Nothing Nothing Nothing (Just stderr)
ec <- waitForProcess ph -- requires compilation with -threaded
case ec of
ExitSuccess -> copyFile tempODT destinationODTPath
_ -> error "Error creating ODT." >> exitWith ec
let bibtexPath = case bibtexPathMaybe of
Nothing -> error $ "The 'bibtex' command, which is needed to build an PDF file, was not found."
Just x -> x
sourceDirAbsolute <- getCurrentDirectory >>= return . (</> sourceDirRelative) >>= canonicalizePath
withTempDir "pandoc-pdf" $ \tempDir -> do
env <- getEnvironment
let env' = ("TEXINPUTS", ".:" ++ sourceDirAbsolute ++ ":") : env
let baseName = "input"
writeFile (tempDir </> baseName <.> "tex") latex
let runLatex = runProgram latexPath ["-interaction=nonstopmode", baseName] tempDir env'
let runBibtex = runProgram bibtexPath [baseName] tempDir env'
messages1 <- runLatex
let logPath = tempDir </> baseName <.> "log"
tocExists <- doesFileExist (tempDir </> baseName <.> "toc")
logContents <- readFile logPath
let undefinedRefs = "There were undefined references" `isInfixOf` logContents
let needsBibtex = "itation" `isInfixOf` logContents
if needsBibtex
then runBibtex >>= hPutStr stderr . unlines
else return ()
if tocExists || undefinedRefs
then do
messages2 <- runLatex
logContents' <- readFile logPath
let stillUndefinedRefs = "There were undefined references" `isInfixOf` logContents'
if stillUndefinedRefs
then runLatex >>= hPutStr stderr . unlines
else hPutStr stderr $ unlines messages2
else
hPutStr stderr $ unlines messages1
let pdfPath = tempDir </> baseName <.> "pdf"
pdfExists <- doesFileExist pdfPath
if pdfExists
then copyFile pdfPath destinationPDFPath
else error "The PDF could not be created."
-- | Find <draw:image ... /> elements and copy the file (xlink:href attribute) into Pictures/ in
-- the zip file. If filename is a URL, attempt to download it. Modify xlink:href attributes
-- to point to the new locations in Pictures/. Return modified XML.
handlePictures :: FilePath -- ^ Path of ODT file in temp directory
-> FilePath -- ^ Directory (relative) containing source file
-> String -- ^ OpenDocument XML string
-> IO String -- ^ Modified XML
handlePictures tempODT sourceDirRelative xml = do
let parsed = case parseXMLDoc xml of
Nothing -> error "Could not parse OpenDocument XML."
Just x -> x
let cursor = case (fromForest $ elContent parsed) of
Nothing -> error "ODT appears empty"
Just x -> x
cursor' <- scanPictures tempODT sourceDirRelative cursor
let modified = parsed { elContent = toForest $ root cursor' }
return $ showTopElement modified
scanPictures :: FilePath -> FilePath -> Cursor -> IO Cursor
scanPictures tempODT sourceDirRelative cursor = do
cursor' <- handleTree tempODT sourceDirRelative cursor
case right cursor' of
Just n -> scanPictures tempODT sourceDirRelative n
Nothing -> return cursor'
handleTree :: FilePath -> FilePath -> Cursor -> IO Cursor
handleTree tempODT sourceDirRelative cursor = do
case firstChild cursor of
Nothing -> modifyContentM (handleContent tempODT sourceDirRelative) cursor
Just n -> scanPictures tempODT sourceDirRelative n >>= return . fromJust . parent
-- | If content is an image link, handle it appropriately.
-- Otherwise, handle children if any.
handleContent :: FilePath -> FilePath -> Content -> IO Content
handleContent tempODT sourceDirRelative content@(Elem el) = do
if qName (elName el) == "image"
then do
let (hrefs, rest) = partition (\a -> qName (attrKey a) == "href") $ elAttribs el
let href = case hrefs of
[] -> error $ "No href found in " ++ show el
[x] -> x
_ -> error $ "Multiple hrefs found in " ++ show el
if isURI $ attrVal href
then return content
else do -- treat as filename
let oldLoc = sourceDirRelative </> attrVal href
fileExists <- doesFileExist oldLoc
if fileExists
then do
let pref = take 230 $ concat $ intersperse "_" $
splitDirectories $ takeDirectory $ attrVal href
let newLoc = "Pictures" </> pref ++ "_" ++ (takeFileName $ attrVal href)
let tempDir = takeDirectory tempODT
createDirectoryIfMissing False $ tempDir </> takeDirectory newLoc
copyFile oldLoc $ tempDir </> newLoc
let newAttrs = (href { attrVal = newLoc }) : rest
return $ Elem (el { elAttribs = newAttrs })
else do
hPutStrLn stderr $ "Warning: Unable to find image at " ++ oldLoc ++ " - ignoring."
return content
else return content
handleContent _ _ c = return c -- not Element
runProgram :: FilePath -- ^ pathname of executable
-> [String] -- ^ arguments
-> FilePath -- ^ working directory
-> [(String, String)] -- ^ environment
-> IO [String]
runProgram cmdPath arguments workingDir env = do
let runOutputPath = workingDir </> "output" <.> "tmp"
runOutput <- openFile runOutputPath WriteMode
ph <- runProcess cmdPath arguments (Just workingDir) (Just env) Nothing (Just runOutput) (Just runOutput)
ec <- waitForProcess ph -- requires compilation with -threaded
case ec of
ExitSuccess -> return []
_ -> do
output <- readFile runOutputPath
if (takeBaseName cmdPath) == "bibtex"
then return $! lines output
else do
return $!
(if "`ucs.sty' not found" `isInfixOf` output
then ["Please install the 'unicode' package from CTAN:",
" http://www.ctan.org/tex-archive/macros/latex/contrib/unicode/"]
else []) ++
(if "`ulem.sty' not found" `isInfixOf` output
then ["Please install the 'ulem' package from CTAN:",
" http://www.ctan.org/tex-archive/macros/latex/contrib/misc/"]
else []) ++
(if "`graphicx.sty' not found" `isInfixOf` output
then ["Please install the 'graphicx' package from CTAN:",
" http://www.ctan.org/tex-archive/macros/latex/required/graphics/"]
else []) ++
(if "`fancyhdr.sty' not found" `isInfixOf` output
then ["Please install the 'fancyhdr' package from CTAN:",
" http://www.ctan.org/tex-archive/macros/latex/contrib/fancyhdr/"]
else []) ++
(if "`array.sty' not found" `isInfixOf` output
then ["Please install the 'array' package from CTAN:",
" http://www.ctan.org/tex-archive/macros/latex/required/tools/"]
else []) ++
(filter isUseful $ lines output)
where isUseful ln = take 1 ln == "!" ||
take 2 ln == "l." ||
"Error" `isInfixOf` ln ||
"error" `isInfixOf` ln

View file

@ -165,6 +165,7 @@ Library
Text.Pandoc.CharacterReferences,
Text.Pandoc.Shared,
Text.Pandoc.ODT,
Text.Pandoc.PDF,
Text.Pandoc.ASCIIMathML,
Text.Pandoc.DefaultHeaders,
Text.Pandoc.Highlighting,