Added an EPUB writer.

+ New writer module Text.Pandoc.Writers.EPUB
+ Stylesheet in epub.css
+ --epub-stylesheet command-line option.
+ New utility module Text.Pandoc.UUID to generate
  random UUIDs for EPUBs.
This commit is contained in:
John MacFarlane 2010-07-02 22:07:00 -07:00
parent ff4d94e054
commit f1114733a6
6 changed files with 382 additions and 4 deletions

30
epub.css Normal file
View file

@ -0,0 +1,30 @@
/* This defines styles and classes used in the book */
body { margin-left: 5%; margin-right: 5%; margin-top: 5%; margin-bottom: 5%; text-align: justify; font-size: medium; }
code { font-family: monospace; }
h1 { text-align: center; }
h2 { text-align: center; }
h3 { text-align: center; }
h4 { text-align: center; }
h5 { text-align: center; }
h6 { text-align: center; }
/* for syntax highlighting produced by highlighting-kate */
table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode, table.sourceCode pre
{ margin: 0; padding: 0; border: 0; vertical-align: baseline; border: none; }
td.lineNumbers { text-align: right; background-color: #EBEBEB; color: black; padding-right: 5px; padding-left: 5px; }
td.sourceCode { padding-left: 5px; }
pre.sourceCode { }
pre.sourceCode span.Normal { }
pre.sourceCode span.Keyword { font-weight: bold; }
pre.sourceCode span.DataType { color: #800000; }
pre.sourceCode span.DecVal { color: #0000FF; }
pre.sourceCode span.BaseN { color: #0000FF; }
pre.sourceCode span.Float { color: #800080; }
pre.sourceCode span.Char { color: #FF00FF; }
pre.sourceCode span.String { color: #DD0000; }
pre.sourceCode span.Comment { color: #808080; font-style: italic; }
pre.sourceCode span.Others { }
pre.sourceCode span.Alert { color: green; font-weight: bold; }
pre.sourceCode span.Function { color: #000080; }
pre.sourceCode span.RegionMarker { }
pre.sourceCode span.Error { color: red; font-weight: bold; }

View file

@ -18,8 +18,8 @@ Description: Pandoc is a Haskell library for converting from one markup
this library. It can read markdown and (subsets of)
reStructuredText, HTML, and LaTeX, and it can write
markdown, reStructuredText, HTML, LaTeX, ConTeXt, Docbook,
OpenDocument, ODT, RTF, MediaWiki, groff man pages, and
S5 HTML slide shows.
OpenDocument, ODT, RTF, MediaWiki, groff man pages, EPUB,
and S5 HTML slide shows.
.
Pandoc extends standard markdown syntax with footnotes,
embedded LaTeX, definition lists, tables, and other
@ -44,6 +44,8 @@ Data-Files:
templates/mediawiki.template, templates/rtf.template,
-- data for ODT writer
reference.odt,
-- stylesheet for EPUB writer
epub.css,
-- data for LaTeXMathML writer
data/LaTeXMathML.js,
data/MathMLinHTML.js,
@ -146,7 +148,7 @@ Library
bytestring >= 0.9, zip-archive >= 0.1.1.4,
utf8-string >= 0.3, old-time >= 1,
HTTP >= 4000.0.5, texmath, xml >= 1.3.5 && < 1.4,
extensible-exceptions
random, extensible-exceptions
if impl(ghc >= 6.10)
Build-depends: base >= 4 && < 5, syb
else
@ -183,9 +185,11 @@ Library
Text.Pandoc.Writers.RTF,
Text.Pandoc.Writers.S5,
Text.Pandoc.Writers.ODT,
Text.Pandoc.Writers.EPUB,
Text.Pandoc.Templates
Other-Modules: Text.Pandoc.XML,
Text.Pandoc.UTF8,
Text.Pandoc.UUID,
Paths_pandoc
Extensions: CPP
if impl(ghc >= 6.12)

View file

@ -87,6 +87,7 @@ module Text.Pandoc
, writeMediaWiki
, writeRTF
, writeODT
, writeEPUB
, prettyPandoc
-- * Writer options used in writers
, WriterOptions (..)
@ -111,6 +112,7 @@ import Text.Pandoc.Writers.Texinfo
import Text.Pandoc.Writers.HTML
import Text.Pandoc.Writers.S5
import Text.Pandoc.Writers.ODT
import Text.Pandoc.Writers.EPUB
import Text.Pandoc.Writers.Docbook
import Text.Pandoc.Writers.OpenDocument
import Text.Pandoc.Writers.Man

77
src/Text/Pandoc/UUID.hs Normal file
View file

@ -0,0 +1,77 @@
{-
Copyright (C) 2010 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.UUID
Copyright : Copyright (C) 2010 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
UUID generation using Version 4 (random method) described
in RFC4122. See http://tools.ietf.org/html/rfc4122
-}
module Text.Pandoc.UUID ( UUID, getRandomUUID ) where
import Text.Printf ( printf )
import System.Random ( randomIO )
import Data.Word
import Data.Bits ( setBit, clearBit )
import Control.Monad ( liftM )
data UUID = UUID Word8 Word8 Word8 Word8 Word8 Word8 Word8 Word8
Word8 Word8 Word8 Word8 Word8 Word8 Word8 Word8
instance Show UUID where
show (UUID a b c d e f g h i j k l m n o p) =
"urn:uuid:" ++
printf "%02x" a ++
printf "%02x" b ++
printf "%02x" c ++
printf "%02x" d ++
"-" ++
printf "%02x" e ++
printf "%02x" f ++
"-" ++
printf "%02x" g ++
printf "%02x" h ++
"-" ++
printf "%02x" i ++
printf "%02x" j ++
"-" ++
printf "%02x" k ++
printf "%02x" l ++
printf "%02x" m ++
printf "%02x" n ++
printf "%02x" o ++
printf "%02x" p
getRandomUUID :: IO UUID
getRandomUUID = do
let getRN :: a -> IO Word8
getRN _ = liftM fromIntegral (randomIO :: IO Int)
[a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p] <- mapM getRN ([1..16] :: [Int])
-- set variant
let i' = i `setBit` 7 `clearBit` 6
-- set version (0100 for random)
let g' = g `clearBit` 7 `setBit` 6 `clearBit` 5 `clearBit` 4
return $ UUID a b c d e f g' h i' j k l m n o p

View file

@ -0,0 +1,247 @@
{-
Copyright (C) 2010 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Writers.EPUB
Copyright : Copyright (C) 2010 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Conversion of 'Pandoc' documents to EPUB.
-}
module Text.Pandoc.Writers.EPUB ( writeEPUB ) where
import Data.IORef
import Data.Maybe ( fromMaybe, isNothing )
import Data.List ( findIndices, isPrefixOf )
import System.Environment ( getEnv )
import System.FilePath ( (</>), takeBaseName, takeExtension )
import qualified Data.ByteString.Lazy as B
import Data.ByteString.Lazy.UTF8 ( fromString )
import Codec.Archive.Zip
import System.Time
import Text.Pandoc.Shared hiding ( Element )
import Text.Pandoc.Definition
import Control.Monad (liftM)
import Text.XML.Light hiding (ppTopElement)
import Text.Pandoc.UUID
import Text.Pandoc.Writers.HTML
import Text.Pandoc.Writers.Markdown ( writePlain )
import Data.Char ( toLower )
-- | Produce an EPUB file from a Pandoc document.
writeEPUB :: FilePath -- ^ Relative directory of source file
-> String -- ^ EPUB stylesheet
-> WriterOptions -- ^ Writer options
-> Pandoc -- ^ Document to convert
-> IO B.ByteString
writeEPUB sourceDir stylesheet opts doc = do
(TOD epochtime _) <- getClockTime
let opts' = opts{ writerEmailObfuscation = NoObfuscation
, writerStandalone = True
, writerWrapText = False }
-- mimetype
let mimetypeEntry = toEntry "mimetype" epochtime $ fromString "application/epub+zip"
-- container.xml
let containerData = fromString $ ppTopElement $
unode "container" ! [("version","1.0")
,("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")] $
unode "rootfiles" $
unode "rootfile" ! [("full-path","content.opf")
,("media-type","application/oebps-package+xml")] $ ()
let containerEntry = toEntry "META-INF/container.xml" epochtime containerData
-- stylesheet
let stylesheetEntry = toEntry "stylesheet.css" epochtime $
fromString stylesheet
-- title page
let vars = writerVariables opts'
let tpContent = fromString $
writeHtmlString opts'{writerTemplate = pageTemplate
,writerVariables = ("titlepage","yes"):vars} doc
let tpEntry = toEntry "title_page.xhtml" epochtime tpContent
-- handle pictures
picEntriesRef <- newIORef ([] :: [Entry])
Pandoc meta blocks <- liftM (processWith transformBlock) $
processWithM (transformInline (writerHTMLMathMethod opts)
sourceDir picEntriesRef) doc
picEntries <- readIORef picEntriesRef
-- body pages
let isH1 (Header 1 _) = True
isH1 _ = False
let chunks = splitByIndices (dropWhile (==0) $ findIndices isH1 blocks) blocks
let titleize (Header 1 xs : ys) = Pandoc meta{docTitle = xs} ys
titleize xs = Pandoc meta xs
let chapToHtml = writeHtmlString opts'{ writerTemplate = pageTemplate
, writerHTMLMathMethod = PlainMath}
let chapters = map titleize chunks
let chapterToEntry :: Int -> Pandoc -> Entry
chapterToEntry num chap = toEntry ("ch" ++ show num ++ ".xhtml")
epochtime $ fromString $ chapToHtml chap
let chapterEntries = zipWith chapterToEntry [1..] chapters
-- contents.opf
uuid <- getRandomUUID
let chapterNode ent = unode "item" !
[("id", takeBaseName $ eRelativePath ent),
("href", eRelativePath ent),
("media-type", "application/xhtml+xml")] $ ()
let chapterRefNode ent = unode "itemref" !
[("idref", takeBaseName $ eRelativePath ent)] $ ()
let pictureNode ent = unode "item" !
[("id", takeBaseName $ eRelativePath ent),
("href", eRelativePath ent),
("media-type", fromMaybe "application/octet-stream"
$ imageTypeOf $ eRelativePath ent)] $ ()
let plainify t = removeTrailingSpace $
writePlain opts'{ writerStandalone = False } $
Pandoc meta [Plain t]
let plainTitle = plainify $ docTitle meta
let plainAuthors = map plainify $ docAuthors meta
lang <- catch (liftM (takeWhile (/='.')) $ getEnv "lang")
(\_ -> return "en-US")
let contentsData = fromString $ ppTopElement $
unode "package" ! [("version","2.0")
,("xmlns","http://www.idpf.org/2007/opf")
,("unique-identifier","BookId")] $
[ unode "metadata" ! [("xmlns:dc","http://purl.org/dc/elements/1.1/")
,("xmlns:opf","http://www.idpf.org/2007/opf")] $
[ unode "dc:title" plainTitle
, unode "dc:language" lang
, unode "dc:identifier" ! [("id","BookId")] $ show uuid
] ++
map (unode "dc:creator" ! [("opf:role","aut")]) plainAuthors
, unode "manifest" $
[ unode "item" ! [("id","ncx"), ("href","toc.ncx")
,("media-type","application/x-dtbncx+xml")] $ ()
, unode "item" ! [("id","style"), ("href","stylesheet.css")
,("media-type","text/css")] $ ()
] ++
map chapterNode (tpEntry : chapterEntries) ++
map pictureNode picEntries
, unode "spine" ! [("toc","ncx")] $
map chapterRefNode (tpEntry : chapterEntries)
]
let contentsEntry = toEntry "content.opf" epochtime contentsData
-- toc.ncx
let navPointNode ent n tit = unode "navPoint" !
[("id", "navPoint-" ++ show n)
,("playOrder", show n)] $
[ unode "navLabel" $ unode "text" tit
, unode "content" ! [("src",
eRelativePath ent)] $ ()
]
let tocData = fromString $ ppTopElement $
unode "ncx" ! [("version","2005-1")
,("xmlns","http://www.daisy.org/z3986/2005/ncx/")] $
[ unode "head"
[ unode "meta" ! [("name","dtb:uid")
,("content", show uuid)] $ ()
, unode "meta" ! [("name","dtb:depth")
,("content", "1")] $ ()
, unode "meta" ! [("name","dtb:totalPageCount")
,("content", "0")] $ ()
, unode "meta" ! [("name","dtb:maxPageNumber")
,("content", "0")] $ ()
]
, unode "docTitle" $ unode "text" $ plainTitle
, unode "navMap" $ zipWith3 navPointNode (tpEntry : chapterEntries)
[1..(length chapterEntries + 1)]
("Title Page" : map (\(Pandoc m _) ->
plainify $ docTitle m) chapters)
]
let tocEntry = toEntry "toc.ncx" epochtime tocData
-- construct archive
let archive = foldr addEntryToArchive emptyArchive
(mimetypeEntry : containerEntry : stylesheetEntry : tpEntry :
contentsEntry : tocEntry : (picEntries ++ chapterEntries) )
return $ fromArchive archive
transformInline :: HTMLMathMethod
-> FilePath
-> IORef [Entry]
-> Inline
-> IO Inline
transformInline _ _ _ (Image lab (src,_)) | isNothing (imageTypeOf src) =
return (Emph lab)
transformInline _ sourceDir picsRef (Image lab (src,tit)) = do
entries <- readIORef picsRef
let newsrc = "images/img" ++ show (length entries) ++ takeExtension src
catch (readEntry [] (sourceDir </> src) >>= \entry ->
modifyIORef picsRef (entry{ eRelativePath = newsrc } :) >>
return (Image lab (newsrc, tit)))
(\_ -> return (Emph lab))
transformInline (MathML _) _ _ x@(Math _ _) = do
let writeHtmlInline opts z = removeTrailingSpace $
writeHtmlString opts $ Pandoc (Meta [] [] []) [Plain [z]]
mathml = writeHtmlInline defaultWriterOptions{
writerHTMLMathMethod = MathML Nothing } x
fallback = writeHtmlInline defaultWriterOptions{
writerHTMLMathMethod = PlainMath } x
inOps = "<ops:switch xmlns:ops=\"http://www.idpf.org/2007/ops\">" ++
"<ops:case required-namespace=\"http://www.w3.org/1998/Math/MathML\">" ++
mathml ++ "</ops:case><ops:default>" ++ fallback ++ "</ops:default>" ++
"</ops:switch>"
return $ HtmlInline $ if "<math" `isPrefixOf` mathml then inOps else mathml
transformInline _ _ _ (HtmlInline _) = return $ Str ""
transformInline _ _ _ x = return x
transformBlock :: Block -> Block
transformBlock (RawHtml _) = Null
transformBlock x = x
(!) :: Node t => (t -> Element) -> [(String, String)] -> t -> Element
(!) f attrs n = add_attrs (map (\(k,v) -> Attr (unqual k) v) attrs) (f n)
-- | Version of 'ppTopElement' that specifies UTF-8 encoding.
ppTopElement :: Element -> String
ppTopElement = ("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" ++) . ppElement
imageTypeOf :: FilePath -> Maybe String
imageTypeOf x = case drop 1 (map toLower (takeExtension x)) of
"jpg" -> Just "image/jpeg"
"jpeg" -> Just "image/jpeg"
"jfif" -> Just "image/jpeg"
"png" -> Just "image/png"
"gif" -> Just "image/gif"
"svg" -> Just "image/svg+xml"
_ -> Nothing
pageTemplate :: String
pageTemplate = unlines
[ "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
, "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">"
, "<html xmlns=\"http://www.w3.org/1999/xhtml\">"
, "<head>"
, "<title>$title$</title>"
, "<link href=\"stylesheet.css\" type=\"text/css\" rel=\"stylesheet\" />"
, "</head>"
, "<body>"
, "$if(titlepage)$"
, "<h1 class=\"title\">$title$</h1>"
, "$for(author)$"
, "<h2 class=\"author\">$author$</h2>"
, "$endfor$"
, "$else$"
, "<h1>$title$</h1>"
, "$body$"
, "$endif$"
, "</body>"
, "</html>"
]

View file

@ -109,6 +109,7 @@ writers = [("native" , writeDoc)
,("docbook" , writeDocbook)
,("opendocument" , writeOpenDocument)
,("odt" , \_ _ -> "")
,("epub" , \_ _ -> "")
,("latex" , writeLaTeX)
,("latex+lhs" , writeLaTeX)
,("context" , writeConTeXt)
@ -124,7 +125,7 @@ writers = [("native" , writeDoc)
]
isNonTextOutput :: String -> Bool
isNonTextOutput = (`elem` ["odt"])
isNonTextOutput = (`elem` ["odt","epub"])
-- | Writer for Pandoc native format.
writeDoc :: WriterOptions -> Pandoc -> String
@ -157,6 +158,7 @@ data Opt = Opt
, optSmart :: Bool -- ^ Use smart typography
, optHTMLMathMethod :: HTMLMathMethod -- ^ Method to print HTML math
, optReferenceODT :: Maybe FilePath -- ^ Path of reference.odt
, optEPUBStylesheet :: Maybe String -- ^ EPUB stylesheet
, optDumpArgs :: Bool -- ^ Output command-line arguments
, optIgnoreArgs :: Bool -- ^ Ignore command-line arguments
, optStrict :: Bool -- ^ Use strict markdown syntax
@ -197,6 +199,7 @@ defaultOpts = Opt
, optSmart = False
, optHTMLMathMethod = PlainMath
, optReferenceODT = Nothing
, optEPUBStylesheet = Nothing
, optDumpArgs = False
, optIgnoreArgs = False
, optStrict = False
@ -467,6 +470,14 @@ options =
"FILENAME")
"" -- "Path of custom reference.odt"
, Option "" ["epub-stylesheet"]
(ReqArg
(\arg opt -> do
text <- UTF8.readFile arg
return opt { optEPUBStylesheet = Just text })
"FILENAME")
"" -- "Path of epub.css"
, Option "D" ["print-default-template"]
(ReqArg
(\arg _ -> do
@ -580,6 +591,7 @@ defaultWriterName x =
".texinfo" -> "texinfo"
".db" -> "docbook"
".odt" -> "odt"
".epub" -> "epub"
['.',y] | y `elem` ['1'..'9'] -> "man"
_ -> "html"
@ -628,6 +640,7 @@ main = do
, optSmart = smart
, optHTMLMathMethod = mathMethod
, optReferenceODT = referenceODT
, optEPUBStylesheet = epubStylesheet
, optDumpArgs = dumpArgs
, optIgnoreArgs = ignoreArgs
, optStrict = strict
@ -686,6 +699,11 @@ main = do
Nothing -> error ("Unknown reader: " ++ readerName')
writer <- case (lookup writerName' writers) of
Just _ | writerName' == "epub" -> do
epubstyle <- case epubStylesheet of
Just s -> return s
Nothing -> readDataFile datadir "epub.css"
return (writeEPUB sourceDirRelative epubstyle)
Just _ | writerName' == "odt" -> return
(writeODT datadir sourceDirRelative referenceODT)
Just r -> return $ \o d ->