Add interface for custom readers written in Lua. (#7671)

New module Text.Pandoc.Readers.Custom, exporting
readCustom [API change].

Users can now do `-f myreader.lua` and pandoc will treat the
script myreader.lua as a custom reader, which parses an input
string to a pandoc AST, using the pandoc module defined for
Lua filters.

A sample custom reader can be found in data/reader.lua.

Closes #7669.
This commit is contained in:
John MacFarlane 2021-11-05 22:10:29 -07:00 committed by GitHub
parent bac6ae9607
commit ee2f0021f9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 135 additions and 14 deletions

View file

@ -266,6 +266,7 @@ header when requesting a document from a URL:
- `tikiwiki` ([TikiWiki markup])
- `twiki` ([TWiki markup])
- `vimwiki` ([Vimwiki])
- the path of a custom Lua reader, see [Custom readers and writers] below
:::
Extensions can be individually enabled or disabled by
@ -338,7 +339,7 @@ header when requesting a document from a URL:
- `tei` ([TEI Simple])
- `xwiki` ([XWiki markup])
- `zimwiki` ([ZimWiki markup])
- the path of a custom Lua writer, see [Custom writers] below
- the path of a custom Lua writer, see [Custom readers and writers] below
:::
Note that `odt`, `docx`, `epub`, and `pdf` output will not be directed
@ -6574,19 +6575,35 @@ With these custom styles, you can use your input document as a
reference-doc while creating docx output (see below), and maintain the
same styles in your input and output files.
# Custom writers
# Custom readers and writers
Pandoc can be extended with custom writers written in [Lua]. (Pandoc
includes a Lua interpreter, so Lua need not be installed separately.)
Pandoc can be extended with custom readers and writers written
in [Lua]. (Pandoc includes a Lua interpreter, so Lua need not
be installed separately.)
To use a custom writer, simply specify the path to the Lua script
in place of the output format. For example:
To use a custom reader or writer, simply specify the path to the
Lua script in place of the input or output format. For example:
pandoc -t data/sample.lua
pandoc -f my_custom_markup_language.lua -t latex -s
Creating a custom writer requires writing a Lua function for each
possible element in a pandoc document. To get a documented example
which you can modify according to your needs, do
A custom reader is a Lua script that defines one function,
Reader, which takes a string as input and returns a Pandoc
AST. See the [Lua filters documentation] for documentation
of the functions that are available for creating pandoc
AST elements. For parsing, the [lpeg] parsing library
is available by default. To see a sample custom reader:
pandoc --print-default-data-file reader.lua
Reader options are available via the global variable
`PANDOC_READER_OPTIONS`, as expalined in the [Lua filters
documentation].
A custom writer is a Lua script that defines a function
that specifies how to render each element in a Pandoc AST.
To see a documented example which you can modify according
to your needs:
pandoc --print-default-data-file sample.lua
@ -6598,6 +6615,7 @@ default template with the name
subdirectory of your user data directory (see [Templates]).
[Lua]: https://www.lua.org
[lpeg]: http://www.inf.puc-rio.br/~roberto/lpeg/
# Reproducible builds

44
data/reader.lua Normal file
View file

@ -0,0 +1,44 @@
-- A sample custom reader for a very simple markup language.
-- This parses a document into paragraphs separated by blank lines.
-- This is _{italic} and this is *{boldface}
-- This is an escaped special character: \_, \*, \{, \}
-- == text makes a level-2 heading
-- That's it!
-- For better performance we put these functions in local variables:
local P, S, R, Cf, Cc, Ct, V, Cs, Cg, Cb, B =
lpeg.P, lpeg.S, lpeg.R, lpeg.Cf, lpeg.Cc, lpeg.Ct, lpeg.V,
lpeg.Cs, lpeg.Cg, lpeg.Cb, lpeg.B
local whitespacechar = S(" \t\r\n")
local specialchar = S("_*{}\\")
local escapedchar = P"\\" * specialchar
/ function (x) return string.sub(x,2) end
local wordchar = (P(1) - (whitespacechar + specialchar)) + escapedchar
local spacechar = S(" \t")
local newline = P"\r"^-1 * P"\n"
local blanklines = newline * spacechar^0 * newline^1
local endline = newline - blanklines
-- Grammar
G = P{ "Pandoc",
Pandoc = blanklines^-1 * Ct(V"Block"^0) / pandoc.Pandoc;
Block = V"Header" + V"Para";
Para = Ct(V"Inline"^1) * blanklines^-1 / pandoc.Para;
Header = Ct(Cg(P("=")^1 / function(x) return #x end, "length")
* spacechar^1
* Cg(Ct(V"Inline"^0), "contents")
* blanklines^-1) /
function(res) return pandoc.Header(res.length, res.contents) end;
Inline = V"Emph" + V"Str" + V"Space" + V"SoftBreak" + V"Special" ;
Str = wordchar^1 / pandoc.Str;
Space = spacechar^1 / pandoc.Space;
SoftBreak = endline / pandoc.SoftBreak;
Emph = Ct(P"_{" * Cg(Ct((V"Inline" - P"}")^1), "contents") * P"}") /
function(res) return pandoc.Emph(res.contents) end;
Special = specialchar / pandoc.Str;
}
function Reader(input)
return lpeg.match(G, input)
end

View file

@ -642,6 +642,7 @@ library
Text.Pandoc.Readers.Ipynb,
Text.Pandoc.Readers.CSV,
Text.Pandoc.Readers.RTF,
Text.Pandoc.Readers.Custom,
Text.Pandoc.Writers,
Text.Pandoc.Writers.Native,
Text.Pandoc.Writers.Docbook,

View file

@ -68,6 +68,7 @@ import Text.Pandoc.Shared (eastAsianLineBreakFilter, stripEmptyParagraphs,
defaultUserDataDir, tshow)
import Text.Pandoc.Writers.Shared (lookupMetaString)
import Text.Pandoc.Readers.Markdown (yamlToMeta)
import Text.Pandoc.Readers.Custom (readCustom)
import qualified Text.Pandoc.UTF8 as UTF8
#ifndef _WINDOWS
import System.Posix.IO (stdOutput)
@ -154,11 +155,13 @@ convertWithOpts opts = do
-> ByteStringReader $ \o t -> sandbox files (r o t)
(reader, readerExts) <-
if optSandbox opts
then case runPure (getReader readerName) of
Left e -> throwError e
Right (r, rexts) -> return (makeSandboxed r, rexts)
else getReader readerName
if ".lua" `T.isSuffixOf` readerName
then return (TextReader (readCustom (T.unpack readerName)), mempty)
else if optSandbox opts
then case runPure (getReader readerName) of
Left e -> throwError e
Right (r, rexts) -> return (makeSandboxed r, rexts)
else getReader readerName
outputSettings <- optToOutputSettings opts
let format = outputFormat outputSettings

View file

@ -0,0 +1,55 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{- |
Module : Text.Pandoc.Readers.Custom
Copyright : Copyright (C) 2021 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Supports custom parsers written in Lua which produce a Pandoc AST.
-}
module Text.Pandoc.Readers.Custom ( readCustom ) where
import Control.Exception
import Control.Monad (when)
import Data.Text (Text)
import HsLua as Lua hiding (Operation (Div), render)
import HsLua.Class.Peekable (PeekError)
import Control.Monad.IO.Class (MonadIO)
import Text.Pandoc.Definition
import Text.Pandoc.Lua (Global (..), runLua, setGlobals)
import Text.Pandoc.Lua.Util (dofileWithTraceback)
import Text.Pandoc.Options
import Text.Pandoc.Class (PandocMonad)
import Text.Pandoc.Sources (ToSources(..), sourcesToText)
-- | Convert custom markup to Pandoc.
readCustom :: (PandocMonad m, MonadIO m, ToSources s)
=> FilePath -> ReaderOptions -> s -> m Pandoc
readCustom luaFile opts sources = do
let input = sourcesToText $ toSources sources
let globals = [ PANDOC_SCRIPT_FILE luaFile
, PANDOC_READER_OPTIONS opts
]
res <- runLua $ do
setGlobals globals
stat <- dofileWithTraceback luaFile
-- check for error in lua script (later we'll change the return type
-- to handle this more gracefully):
when (stat /= Lua.OK)
Lua.throwErrorAsException
parseCustom input
case res of
Left msg -> throw msg
Right doc -> return doc
parseCustom :: forall e. PeekError e
=> Text
-> LuaE e Pandoc
parseCustom = invoke @e "Reader"