pandoc/server/PandocServer.hs
John MacFarlane 87daaa37a9 Server: allow setting Accept:'application/octet-stream'...
to receive raw bytes instead of base64 encoded binary output,
e.g. docx.
2022-08-15 09:20:15 -07:00

300 lines
12 KiB
Haskell

{-# LANGUAGE DataKinds #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
module PandocServer
( app
, Params(..)
) where
import Data.Aeson
import Data.Aeson.TH
import Network.Wai
import Servant
import Text.DocTemplates as DocTemplates
import Text.Pandoc
import Text.Pandoc.Citeproc (processCitations)
import Text.Pandoc.Highlighting (lookupHighlightingStyle)
import qualified Text.Pandoc.UTF8 as UTF8
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TLE
import Data.Maybe (fromMaybe)
import Data.Char (isAlphaNum)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as BL
import Data.ByteString.Base64 (decodeBase64, encodeBase64)
import Data.Default
import Data.Set (Set)
import Skylighting (defaultSyntaxMap)
newtype Blob = Blob BL.ByteString
deriving (Show, Eq)
instance ToJSON Blob where
toJSON (Blob bs) = toJSON (encodeBase64 $ BL.toStrict bs)
instance FromJSON Blob where
parseJSON = withText "Blob" $ \t -> do
let inp = UTF8.fromText t
case decodeBase64 inp of
Right bs -> return $ Blob $ BL.fromStrict bs
Left _ -> -- treat as regular text
return $ Blob $ BL.fromStrict inp
-- This is the data to be supplied by the JSON payload
-- of requests. Maybe values may be omitted and will be
-- given default values.
data Params = Params
{ text :: Text
, from :: Maybe Text
, to :: Maybe Text
, wrapText :: Maybe WrapOption
, columns :: Maybe Int
, standalone :: Maybe Bool
, template :: Maybe Text
, tabStop :: Maybe Int
, indentedCodeClasses :: Maybe [Text]
, abbreviations :: Maybe (Set Text)
, defaultImageExtension :: Maybe Text
, trackChanges :: Maybe TrackChanges
, stripComments :: Maybe Bool
, citeproc :: Maybe Bool
, variables :: Maybe (DocTemplates.Context Text)
, tableOfContents :: Maybe Bool
, incremental :: Maybe Bool
, htmlMathMethod :: Maybe HTMLMathMethod
, numberSections :: Maybe Bool
, numberOffset :: Maybe [Int]
, sectionDivs :: Maybe Bool
, referenceLinks :: Maybe Bool
, dpi :: Maybe Int
, emailObfuscation :: Maybe ObfuscationMethod
, identifierPrefix :: Maybe Text
, citeMethod :: Maybe CiteMethod
, htmlQTags :: Maybe Bool
, slideLevel :: Maybe Int
, topLevelDivision :: Maybe TopLevelDivision
, listings :: Maybe Bool
, highlightStyle :: Maybe Text
, setextHeaders :: Maybe Bool
, epubSubdirectory :: Maybe Text
, epubFonts :: Maybe [FilePath]
, epubMetadata :: Maybe Text
, epubChapterLevel :: Maybe Int
, tocDepth :: Maybe Int
, referenceDoc :: Maybe FilePath
, referenceLocation :: Maybe ReferenceLocation
, preferAscii :: Maybe Bool
, files :: Maybe [(FilePath, Blob)]
} deriving (Show)
instance Default Params where
def = Params
{ text = ""
, from = Nothing
, to = Nothing
, wrapText = Nothing
, columns = Nothing
, standalone = Nothing
, template = Nothing
, tabStop = Nothing
, indentedCodeClasses = Nothing
, abbreviations = Nothing
, defaultImageExtension = Nothing
, trackChanges = Nothing
, stripComments = Nothing
, citeproc = Nothing
, variables = Nothing
, tableOfContents = Nothing
, incremental = Nothing
, htmlMathMethod = Nothing
, numberSections = Nothing
, numberOffset = Nothing
, sectionDivs = Nothing
, referenceLinks = Nothing
, dpi = Nothing
, emailObfuscation = Nothing
, identifierPrefix = Nothing
, citeMethod = Nothing
, htmlQTags = Nothing
, slideLevel = Nothing
, topLevelDivision = Nothing
, listings = Nothing
, highlightStyle = Nothing
, setextHeaders = Nothing
, epubSubdirectory = Nothing
, epubMetadata = Nothing
, epubChapterLevel = Nothing
, epubFonts = Nothing
, tocDepth = Nothing
, referenceDoc = Nothing
, referenceLocation = Nothing
, preferAscii = Nothing
, files = Nothing
}
-- TODO:
-- shiftHeadingLevelBy
-- metadata
-- selfContained
-- embedResources
-- epubCoverImage
-- stripEmptyParagraphs
-- titlePrefix
-- ipynbOutput
-- eol
-- csl
-- bibliography
-- citationAbbreviations
-- Automatically derive code to convert to/from JSON.
$(deriveJSON defaultOptions ''Params)
-- This is the API. The "/convert" endpoint takes a request body
-- consisting of a JSON-encoded Params structure and responds to
-- Get requests with either plain text or JSON, depending on the
-- Accept header.
type API =
ReqBody '[JSON] Params :> Post '[PlainText, JSON] Text
:<|>
ReqBody '[JSON] Params :> Post '[OctetStream] BS.ByteString
:<|>
"batch" :> ReqBody '[JSON] [Params] :> Post '[JSON] [Text]
:<|>
"babelmark" :> QueryParam' '[Required] "text" Text :> QueryParam "from" Text :> QueryParam "to" Text :> QueryFlag "standalone" :> Get '[JSON] Value
:<|>
"version" :> Get '[PlainText, JSON] Text
app :: Application
app = serve api server
api :: Proxy API
api = Proxy
server :: Server API
server = convert
:<|> convertBytes
:<|> mapM convert
:<|> babelmark -- for babelmark which expects {"html": "", "version": ""}
:<|> pure pandocVersion
where
babelmark text' from' to' standalone' = do
res <- convert def{ text = text',
from = from', to = to',
standalone = Just standalone' }
return $ toJSON $ object [ "html" .= res, "version" .= pandocVersion ]
-- We use runPure for the pandoc conversions, which ensures that
-- they will do no IO. This makes the server safe to use. However,
-- it will mean that features requiring IO, like RST includes, will not work.
-- Changing this to
-- handleErr =<< liftIO (runIO (convert' params))
-- will allow the IO operations.
convert params = handleErr $
runPure (convert' id (encodeBase64 . BL.toStrict) params)
convertBytes params = handleErr $
runPure (convert' UTF8.fromText BL.toStrict params)
convert' :: PandocMonad m
=> (Text -> a) -> (BL.ByteString -> a) -> Params -> m a
convert' textHandler bsHandler params = do
let readerFormat = fromMaybe "markdown" $ from params
let writerFormat = fromMaybe "html" $ to params
(readerSpec, readerExts) <- getReader readerFormat
(writerSpec, writerExts) <- getWriter writerFormat
let binaryOutput = case writerSpec of
ByteStringWriter{} -> True
_ -> False
let isStandalone = fromMaybe binaryOutput (standalone params)
let toformat = T.toLower $ T.takeWhile isAlphaNum $ writerFormat
hlStyle <- traverse (lookupHighlightingStyle . T.unpack)
$ highlightStyle params
mbTemplate <- if isStandalone
then case template params of
Nothing -> Just <$>
compileDefaultTemplate toformat
Just t -> Just <$>
compileCustomTemplate toformat t
else return Nothing
let readeropts = def{ readerExtensions = readerExts
, readerStandalone = isStandalone
, readerTabStop = fromMaybe 4 (tabStop params)
, readerIndentedCodeClasses = fromMaybe []
(indentedCodeClasses params)
, readerAbbreviations =
fromMaybe mempty (abbreviations params)
, readerDefaultImageExtension =
fromMaybe mempty (defaultImageExtension params)
, readerTrackChanges =
fromMaybe AcceptChanges (trackChanges params)
, readerStripComments =
fromMaybe False (stripComments params)
}
let writeropts =
def{ writerExtensions = writerExts
, writerTabStop = fromMaybe 4 (tabStop params)
, writerWrapText = fromMaybe WrapAuto (wrapText params)
, writerColumns = fromMaybe 72 (columns params)
, writerTemplate = mbTemplate
, writerSyntaxMap = defaultSyntaxMap
, writerVariables = fromMaybe mempty (variables params)
, writerTableOfContents = fromMaybe False (tableOfContents params)
, writerIncremental = fromMaybe False (incremental params)
, writerHTMLMathMethod =
fromMaybe PlainMath (htmlMathMethod params)
, writerNumberSections = fromMaybe False (numberSections params)
, writerNumberOffset = fromMaybe [] (numberOffset params)
, writerSectionDivs = fromMaybe False (sectionDivs params)
, writerReferenceLinks = fromMaybe False (referenceLinks params)
, writerDpi = fromMaybe 96 (dpi params)
, writerEmailObfuscation =
fromMaybe NoObfuscation (emailObfuscation params)
, writerIdentifierPrefix =
fromMaybe mempty (identifierPrefix params)
, writerCiteMethod = fromMaybe Citeproc (citeMethod params)
, writerHtmlQTags = fromMaybe False (htmlQTags params)
, writerSlideLevel = slideLevel params
, writerTopLevelDivision =
fromMaybe TopLevelDefault (topLevelDivision params)
, writerListings = fromMaybe False (listings params)
, writerHighlightStyle = hlStyle
, writerSetextHeaders = fromMaybe False (setextHeaders params)
, writerEpubSubdirectory =
fromMaybe "EPUB" (epubSubdirectory params)
, writerEpubMetadata = epubMetadata params
, writerEpubFonts = fromMaybe [] (epubFonts params)
, writerEpubChapterLevel = fromMaybe 1 (epubChapterLevel params)
, writerTOCDepth = fromMaybe 3 (tocDepth params)
, writerReferenceDoc = referenceDoc params
, writerReferenceLocation =
fromMaybe EndOfDocument (referenceLocation params)
, writerPreferAscii = fromMaybe False (preferAscii params)
}
let reader = case readerSpec of
TextReader r -> r readeropts
ByteStringReader r -> \t -> do
let eitherbs = decodeBase64 $ UTF8.fromText t
case eitherbs of
Left errt -> throwError $ PandocSomeError errt
Right bs -> r readeropts $ BL.fromStrict bs
let writer = case writerSpec of
TextWriter w -> fmap textHandler . w writeropts
ByteStringWriter w -> fmap bsHandler . w writeropts
reader (text params) >>=
(if citeproc params == Just True
then processCitations
else return) >>=
writer
handleErr (Right t) = return t
handleErr (Left err) = throwError $
err500 { errBody = TLE.encodeUtf8 $ TL.fromStrict $ renderError err }
compileCustomTemplate toformat t = do
res <- runWithPartials $ compileTemplate ("custom." <> T.unpack toformat) t
case res of
Left e -> throwError $ PandocTemplateError (T.pack e)
Right tpl -> return tpl