+ Removed the convenience symlinks (which don't work on Windows under
  Cygwin, due to Windows' lack of true symbolic links).
+ Modified the wrappers to use 'pandoc' instead of the symlinks.
+ Modified the Makefile to remove all references to the symlinks.
+ Removed code from Main.hs that made pandoc's behavior depend on the
  name of the calling program.
+ Added code to Main.hs that sets default reader and writer based on
  extensions of input and output filenames (if provided).  (Thanks to
  roktas for the idea.)
+ Modified README and man pages accordingly.
+ Removed WINDOWS-README target from Makefile.  It is no longer needed
  now that we don't have the symlinks.


git-svn-id: https://pandoc.googlecode.com/svn/trunk@295 788f1e2b-df1e-0410-8736-df70ead52e1b
This commit is contained in:
fiddlosopher 2006-12-28 02:20:09 +00:00
parent a1539d9ab8
commit a1a30d69bd
16 changed files with 198 additions and 231 deletions

View file

@ -26,8 +26,6 @@ EXECSBASE := $(shell sed -ne 's/^[Ee]xecutable:[[:space:]]*//p' $(CABAL).in)
# Install targets # Install targets
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
WRAPPERS := web2markdown markdown2pdf WRAPPERS := web2markdown markdown2pdf
SYMLINKS := markdown2html markdown2latex markdown2s5 markdown2rst \
markdown2rtf html2markdown latex2markdown rst2markdown
# Add .exe extensions if we're running Windows/Cygwin. # Add .exe extensions if we're running Windows/Cygwin.
EXTENSION := $(shell uname | tr '[:upper:]' '[:lower:]' | \ EXTENSION := $(shell uname | tr '[:upper:]' '[:lower:]' | \
sed -ne 's/^cygwin.*$$/\.exe/p') sed -ne 's/^cygwin.*$$/\.exe/p')
@ -96,12 +94,6 @@ all: build-program
templates: $(SRCDIR)/templates templates: $(SRCDIR)/templates
$(MAKE) -C $(SRCDIR)/templates $(MAKE) -C $(SRCDIR)/templates
.PHONY: symlinks
cleanup_files+=$(SYMLINKS)
symlinks: $(SYMLINKS)
$(SYMLINKS): $(MAIN)
ln -sf ./$(MAIN) $@
define generate-shell-script define generate-shell-script
echo "Generating $@..."; \ echo "Generating $@..."; \
awk ' \ awk ' \
@ -141,7 +133,7 @@ build: configure
$(BUILDCMD) build $(BUILDCMD) build
.PHONY: build-exec .PHONY: build-exec
build-exec: $(PROGS) $(SYMLINKS) build-exec: $(PROGS)
cleanup_files+=$(EXECS) cleanup_files+=$(EXECS)
$(EXECS): build $(EXECS): build
for f in $@; do \ for f in $@; do \
@ -201,9 +193,8 @@ install-exec: build-exec
fi; \ fi; \
$(INSTALL_PROGRAM) $$f $(BINPATH)/; \ $(INSTALL_PROGRAM) $$f $(BINPATH)/; \
done done
cd $(BINPATH); for f in $(SYMLINKS); do ln -sf $(MAIN) $$f; done
uninstall-exec: uninstall-exec:
-for f in $(notdir $(PROGS) $(SYMLINKS)); do rm -f $(BINPATH)/$$f; done ; -for f in $(notdir $(PROGS)); do rm -f $(BINPATH)/$$f; done ;
# Program + user documents installation. # Program + user documents installation.
.PHONY: install-program uninstall-program .PHONY: install-program uninstall-program
@ -295,15 +286,11 @@ $(osx_dmg_name): $(osx_pkg_name)
.PHONY: win-pkg .PHONY: win-pkg
win_pkg_name:=$(RELNAME).zip win_pkg_name:=$(RELNAME).zip
win_docs:=COPYING.txt COPYRIGHT.txt BUGS.txt README-WINDOWS.txt README-WINDOWS.html win_docs:=COPYING.txt COPYRIGHT.txt BUGS.txt README.txt README.html
cleanup_files+=$(win_pkg_name) $(win_docs) cleanup_files+=$(win_pkg_name) $(win_docs)
win-pkg: $(win_pkg_name) win-pkg: $(win_pkg_name)
$(win_pkg_name): $(THIS).exe $(win_docs) $(win_pkg_name): $(THIS).exe $(win_docs)
zip -r $(win_pkg_name) $(THIS).exe $(win_docs) zip -r $(win_pkg_name) $(THIS).exe $(win_docs)
cleanup_files+=README-WINDOWS
README-WINDOWS: README
sed -e '/^Requirements/,/^\[fancyvrb\]:/ d' \
-e '/^Character encodings/,/mysite.com$$/ d' $< > $@
.PHONY: test test-markdown .PHONY: test test-markdown
test: $(MAIN) test: $(MAIN)

139
README
View file

@ -36,14 +36,11 @@ Requirements
============ ============
The `pandoc` program itself does not depend on any external libraries The `pandoc` program itself does not depend on any external libraries
or programs. The convenience programs `markdown2html`, `markdown2latex`, or programs.
`markdown2rst`, `markdown2rtf`, `markdown2s5`, `html2markdown`,
`latex2markdown`, and `rst2markdown` are implemented as symbolic links to
`pandoc`.
The wrapper script `web2markdown` requires The wrapper script `web2markdown` requires
- `html2markdown` (included with Pandoc) - `pandoc` (which must be in the PATH)
- a POSIX-compliant shell (installed by default on all linux and unix - a POSIX-compliant shell (installed by default on all linux and unix
systems, including Mac OS X, and in [Cygwin] for Windows), systems, including Mac OS X, and in [Cygwin] for Windows),
- `HTML Tidy` - `HTML Tidy`
@ -56,7 +53,7 @@ The wrapper script `web2markdown` requires
The wrapper script `markdown2pdf` requires The wrapper script `markdown2pdf` requires
- `markdown2latex` (included with Pandoc) - `pandoc` (which must be in the PATH)
- a POSIX-compliant shell - a POSIX-compliant shell
- `pdflatex`, which should be part of any [LaTeX] distribution - `pdflatex`, which should be part of any [LaTeX] distribution
- the [unicode] and [fancyvrb] LaTeX packages, which are included - the [unicode] and [fancyvrb] LaTeX packages, which are included
@ -80,47 +77,11 @@ Using Pandoc
If you run `pandoc` without arguments, it will accept input from If you run `pandoc` without arguments, it will accept input from
STDIN. If you run it with file names as arguments, it will take input STDIN. If you run it with file names as arguments, it will take input
from those files. It accepts several command-line options. For a from those files. By default, `pandoc` writes its output to STDOUT.
list, type If you want to write to a file, use the `-o` option:
pandoc -h
The most important options specify the format of the source file and
the output. The default reader is markdown; the default writer is
HTML. So if you don't specify a reader or writer, `pandoc` will
convert markdown to HTML. For example,
pandoc hello.txt
will convert `hello.txt` from markdown to HTML. For other conversions,
you must specify a reader and/or a writer using the `-r` and `-w`
flags. To convert markdown to LaTeX, you would write:
pandoc -w latex hello.txt
To convert html to markdown:
pandoc -r html -w markdown hello.txt
Supported writers include `markdown`, `latex`, `html`, `rtf` (rich text
format), `rst` (reStructuredText), and `s5` (which produces an HTML
file that acts like powerpoint). Supported readers include `markdown`,
`html`, `latex`, and `rst`. Note that the `rst` reader only parses
a subset of reStructuredText syntax. For example, it doesn't handle
tables, definition lists, option lists, or footnotes. It handles only the
constructs expressible in unextended markdown. But for simple documents
it should be adequate. The `latex` and `html` readers are also limited
in what they can do. Because the `html` reader is picky about the HTML
it parses, it is recommended that you pipe HTML through [HTML Tidy] before
sending it to `pandoc`, or use the `web2markdown` script described below.
By default, `pandoc` writes its output to STDOUT. If you want to
write to a file, use the `-o` option or shell redirection:
pandoc -o hello.html hello.txt pandoc -o hello.html hello.txt
pandoc hello.txt > hello.html
Note that you can specify multiple input files on the command line. Note that you can specify multiple input files on the command line.
`pandoc` will concatenate them all (with blank lines between them) `pandoc` will concatenate them all (with blank lines between them)
before parsing: before parsing:
@ -131,6 +92,44 @@ before parsing:
with a proper header, rather than a fragment. For more details on this with a proper header, rather than a fragment. For more details on this
and many other command-line options, see below.) and many other command-line options, see below.)
The format of the input and output can be specified explicitly using
command-line options. The input format can be specified using the
`-r/--read` or `-f/--from` options, the output format using the
`-w/--write` or `-t/--to` options. Thus, to convert `hello.txt` from
markdown to LaTeX, you could type:
pandoc -f markdown -t latex hello.txt
To convert `hello.html` from html to markdown:
pandoc -f html -t markdown hello.html
Supported output formats include `markdown`, `latex`, `html`, `rtf`
(rich text format), `rst` (reStructuredText), and `s5` (which produces
an HTML file that acts like powerpoint). Supported input formats
include `markdown`, `html`, `latex`, and `rst`. Note that the `rst`
reader only parses a subset of reStructuredText syntax. For example,
it doesn't handle tables, definition lists, option lists, or footnotes.
It handles only the constructs expressible in unextended markdown.
But for simple documents it should be adequate. The `latex` and `html`
readers are also limited in what they can do. Because the `html`
reader is picky about the HTML it parses, it is recommended that you
pipe HTML through [HTML Tidy] before sending it to `pandoc`, or use the
`web2markdown` script described below.
If you don't specify a reader or writer explicitly, `pandoc` will
try to determine the input and output format from the extensions of
the input and output filenames. Thus, for example,
pandoc -o hello.tex hello.txt
will convert `hello.txt` from markdown to LaTeX. If no output file
is specified (so that output goes to STDOUT), or if the output file's
extension is unknown, the output format will default to HTML. (An
output filename with no extension at all is written as markdown.)
If no input file is specified (so that input comes from STDIN), or
if the input files' extensions are unknown, the input format will
be assumed to be markdown unless explicitly specified.
Character encodings Character encodings
------------------- -------------------
@ -150,31 +149,16 @@ The shell scripts (described below) automatically convert the input
from the local encoding to UTF-8 before running them through `pandoc`, from the local encoding to UTF-8 before running them through `pandoc`,
then convert the output back to the local encoding. then convert the output back to the local encoding.
Convenience programs and wrapper scripts `markdown2pdf` and `web2markdown`
======================================== =================================
For convenience, eight variant programs are included with Pandoc: Two shell scripts, `markdown2pdf` and `web2markdown`, are included in
`markdown2html` (which is equivalent to `pandoc -w html`), the standard Pandoc installation. (They are not included in the Windows
`markdown2latex` (equivalent to `pandoc -w latex`), `markdown2rst` binary package, as they require a POSIX shell, but they may be used
(equivalent to `pandoc -w rst`), `markdown2rtf` (equivalent to in Windows under Cygwin.)
`pandoc -w rtf`), `markdown2s5` (equivalent to `pandoc -w s5`),
`html2markdown` (equivalent to `pandoc -r html -w markdown`),
`latex2markdown` (equivalent to `pandoc -r latex -w markdown`), and
`rst2markdown` (equivalent to `pandoc -r rst -w markdown`). These
programs take an appropriately restricted subset of `pandoc`'s
options. (Run them with the `-h` flag for a full list of allowed
options.)
Like `pandoc`, all of these programs produce fragments by default.
If you want to produce a standalone file, complete with a header
and footer appropriate to the format, use the `-s` option:
markdown2latex -s sample.txt > sample.tex
Two shell scripts have also been included:
1. `markdown2pdf` produces a PDF file from markdown-formatted 1. `markdown2pdf` produces a PDF file from markdown-formatted
text, using `markdown2latex` and `pdflatex`. The default text, using `pandoc` and `pdflatex`. The default
behavior of `markdown2pdf` is to create a file with the same behavior of `markdown2pdf` is to create a file with the same
base name as the first argument and the extension `pdf`; thus, base name as the first argument and the extension `pdf`; thus,
for example, for example,
@ -190,7 +174,7 @@ Two shell scripts have also been included:
If no input file is specified, input will be taken from STDIN. If no input file is specified, input will be taken from STDIN.
2. `web2markdown` grabs a web page from a file or URL and converts 2. `web2markdown` grabs a web page from a file or URL and converts
it to markdown-formatted text, using `tidy` and `html2markdown`. it to markdown-formatted text, using `tidy` and `pandoc`.
Unless input is from STDIN, an attempt is made to determine the Unless input is from STDIN, an attempt is made to determine the
character encoding of the page from the "Content-type" meta tag. character encoding of the page from the "Content-type" meta tag.
If this is not present, UTF-8 is assumed. Alternatively, a character If this is not present, UTF-8 is assumed. Alternatively, a character
@ -207,9 +191,20 @@ Command-line options
==================== ====================
Various command-line options can be used to customize the output. Various command-line options can be used to customize the output.
For a complete list, type
pandoc --help `-f`, `--from`, `-r`, or `--read` can be used to specify the input
format -- the format Pandoc will be converting *from*. Available
formats are `native`, `markdown`, `rst`, `html`, and `latex`.
`-t`, `--to`, `-w`, or `--write` can be used to specify the output
format -- the format Pandoc will be converting *to*. Available formats
are `native`, `html`, `s5`, `latex`, `markdown`, `rst`, and `rtf`.
`-s` or `--standalone` indicates that a standalone document is to be
produced (with appropriate headers and footers), rather than a fragment.
`-o` or `--output` specifies the name of the output file. If no output
filename is given, output will be sent to STDOUT.
`-p` or `--preserve-tabs` causes tabs in the source text to be `-p` or `--preserve-tabs` causes tabs in the source text to be
preserved, rather than converted to spaces (the default). preserved, rather than converted to spaces (the default).
@ -225,12 +220,6 @@ untranslatable HTML codes and LaTeX environments. (The LaTeX reader
does pass through untranslatable LaTeX commands, even if `-R` is not does pass through untranslatable LaTeX commands, even if `-R` is not
specified.) specified.)
`-s` or `--standalone` causes `pandoc` to produce a standalone file,
complete with appropriate document headers. By default, `pandoc`
produces a fragment.
`-o` or `--output-file` can be used to specify an output file.
`-C` or `--custom-header` can be used to specify a custom document `-C` or `--custom-header` can be used to specify a custom document
header. To see the headers used by default, use the `-D` option: header. To see the headers used by default, use the `-D` option:
for example, `pandoc -D html` prints the default HTML header. for example, `pandoc -D html` prints the default HTML header.

View file

@ -1 +0,0 @@
.so man1/pandoc.1

View file

@ -1 +0,0 @@
.so man1/pandoc.1

View file

@ -1 +0,0 @@
.so man1/pandoc.1

View file

@ -1 +0,0 @@
.so man1/pandoc.1

View file

@ -6,14 +6,13 @@ markdown2pdf \- converts markdown-formatted text to PDF, using pdflatex
.SH DESCRIPTION .SH DESCRIPTION
\fBmarkdown2pdf\fR converts \fIinput\-file\fR (or text from standard \fBmarkdown2pdf\fR converts \fIinput\-file\fR (or text from standard
input) from markdown\-formatted plain text to PDF, using \fBpdflatex\fR. input) from markdown\-formatted plain text to PDF, using \fBpdflatex\fR.
If no output filename is specified, the name of the output file is If no output filename is specified (using the \fB\-o\fR option),
derived from the input file; thus, for example, if the input file the name of the output file is derived from the input file; thus, for
is \fIhello.txt\fR, the output file will be \fIhello.pdf\fR. If example, if the input file is \fIhello.txt\fR, the output file will be
the input is read from STDIN and no output filename is \fIhello.pdf\fR. If the input is read from STDIN and no output filename
specified, the output file will be named \fIstdin.pdf\fR. If is specified, the output file will be named \fIstdin.pdf\fR. If multiple
multiple input files are specified, they will be concatenated before input files are specified, they will be concatenated before conversion,
conversion, and the name of the output file will be derived from and the name of the output file will be derived from the first input file.
the first input file.
.PP .PP
Input is assumed to be in the UTF\-8 character encoding. If your Input is assumed to be in the UTF\-8 character encoding. If your
local character encoding is not UTF\-8, you should pipe input and local character encoding is not UTF\-8, you should pipe input and
@ -21,11 +20,11 @@ output through \fBiconv\fR:
.IP .IP
.B iconv \-t utf\-8 input.txt | pandoc | iconv \-f utf\-8 .B iconv \-t utf\-8 input.txt | pandoc | iconv \-f utf\-8
.PP .PP
\fBmarkdown2pdf\fR assumes that the 'unicode' package \fBmarkdown2pdf\fR assumes that the 'unicode' and 'fancyvrb' packages
is in latex's search path. If this package is not included in your are in latex's search path. If these packages are not included in your
latex setup, it can be obtained from <http://ctan.org>. latex setup, they can be obtained from <http://ctan.org>.
.PP .PP
\fBmarkdown2pdf\fR is a wrapper around \fBmarkdown2latex\fR. \fBmarkdown2pdf\fR is a wrapper around \fBpandoc\fR.
.SH OPTIONS .SH OPTIONS
.TP .TP
.B \-o FILE, \-\-output=FILE .B \-o FILE, \-\-output=FILE
@ -37,10 +36,6 @@ Preserve tabs instead of converting them to spaces.
.B \-\-tab-stop=\fITABSTOP\fB .B \-\-tab-stop=\fITABSTOP\fB
Specify tab stop (default is 4). Specify tab stop (default is 4).
.TP .TP
.B \-R, \-\-parse-raw
Parse untranslatable LaTeX environments as raw LaTeX,
instead of ignoring them.
.TP
.B \-N, \-\-number-sections .B \-N, \-\-number-sections
Number section headings in LaTeX output. (Default is not to number them.) Number section headings in LaTeX output. (Default is not to number them.)
.TP .TP

View file

@ -1 +0,0 @@
.so man1/pandoc.1

View file

@ -1 +0,0 @@
.so man1/pandoc.1

View file

@ -1 +0,0 @@
.so man1/pandoc.1

View file

@ -1,8 +1,6 @@
.TH PANDOC 1 "December 15, 2006" Pandoc "User Manuals" .TH PANDOC 1 "December 15, 2006" Pandoc "User Manuals"
.SH NAME .SH NAME
pandoc, markdown2html, markdown2latex, markdown2rst, markdown2rtf, pandoc \- general markup converter
markdown2s5, html2markdown2, latex2markdown, rst2markdown \- general
markup converter
.SH SYNOPSIS .SH SYNOPSIS
\fBpandoc\fR [\fIoptions\fR] [\fIinput\-file\fR]... \fBpandoc\fR [\fIoptions\fR] [\fIinput\-file\fR]...
.SH DESCRIPTION .SH DESCRIPTION
@ -13,41 +11,37 @@ slide shows.
.PP .PP
If no \fIinput\-file\fR is specified, input is read from STDIN. If no \fIinput\-file\fR is specified, input is read from STDIN.
Otherwise, the \fIinput\-files\fR are concatenated (with a blank Otherwise, the \fIinput\-files\fR are concatenated (with a blank
line between each) and used as input. Output goes to standard line between each) and used as input. Output goes to STDOUT by
output. If you want output to a file, use the \fB\-o\fR option or default. For output to a file, use the \fB\-o\fR option:
shell redirection:
.IP .IP
.B pandoc \-o output.html input.txt .B pandoc \-o output.html input.txt
.PP
The input and output formats may be specified using command-line options
(see \fBOPTIONS\fR, below, for details). If these formats are not
specified explicitly, \fIPandoc\fR will attempt to determine them
from the extensions of the input and output filenames. If input comes
from STDIN or from a file with an unknown extension, the input is assumed
to be markdown. If no output filename is specified using the \fB\-o\fR
option, or if a filename is specified but its extension is unknown,
the output will default to HTML. Thus, for example,
.IP .IP
.B pandoc input.txt > output.html .B pandoc -o chap1.tex chap1.txt
.PP .PP
The default behavior of \fIPandoc\fR is to convert the input from converts \fIchap1.txt\fR from markdown to LaTeX. And
markdown\-formatted plain text to HTML. Different input and output
formats can be specified using command\-line options. For example,
.IP .IP
.B pandoc \-f latex \-t markdown chap1.tex > chap1.txt .B pandoc README
.PP .PP
converts \fIchap1.tex\fR from LaTeX to markdown\-formatted plain text. converts \fIREADME\fR from markdown to HTML.
See below for a detailed list of command\-line options.
.PP
For convenience, eight variant programs are available:
\fBmarkdown2html\fR (same as \fBpandoc \-w html\fR),
\fBmarkdown2latex\fR (same as \fBpandoc \-w latex\fR),
\fBmarkdown2rst\fR (same as \fBpandoc \-w rst\fR),
\fBmarkdown2rtf\fR (same as \fBpandoc \-w rtf\fR),
\fBmarkdown2s5\fR (same as \fBpandoc \-w s5\fR),
\fBhtml2markdown\fR (same as \fBpandoc \-r html \-w markdown\fR),
\fBlatex2markdown\fR (same as \fBpandoc \-r latex \-w markdown\fR),
and \fBrst2markdown\fR (same as \fBpandoc \-r rst \-w markdown\fR).
These programs take an appropriately restricted subset of \fBpandoc\fR's
options. (Run them with the \fB-h\fR flag for a full list of allowed
options.)
.PP .PP
\fIPandoc\fR uses the UTF\-8 character encoding for both input and output. \fIPandoc\fR uses the UTF\-8 character encoding for both input and output.
If your local character encoding is not UTF\-8, you should pipe input If your local character encoding is not UTF\-8, you should pipe input
and output through \fBiconv\fR: and output through \fBiconv\fR:
.IP .IP
.B iconv \-t utf\-8 input.txt | pandoc | iconv \-f utf\-8 .B iconv \-t utf\-8 input.txt | pandoc | iconv \-f utf\-8
.PP
\fIPandoc\fR's HTML parser is not very forgiving. If your input is
HTML, consider running it through \fBtidy\fR(1) before passing it
to Pandoc. Or use \fBweb2markdown\fR(1), a wrapper around \fBpandoc\fR.
.SH OPTIONS .SH OPTIONS
.TP .TP
@ -158,9 +152,7 @@ Show usage message.
.SH "SEE ALSO" .SH "SEE ALSO"
\fBweb2markdown\fR(1), \fBweb2markdown\fR(1),
\fBmarkdown2pdf\fR(1), \fBmarkdown2pdf\fR(1).
\fBiconv\fR(1)
The The
.I README .I README
file distributed with Pandoc contains full documentation. file distributed with Pandoc contains full documentation.

View file

@ -1 +0,0 @@
.so man1/pandoc.1

View file

@ -16,7 +16,7 @@ option.
from STDIN, UTF-8 is assumed. A character encoding may be specified from STDIN, UTF-8 is assumed. A character encoding may be specified
explicitly using the \fB\-e\fR option. explicitly using the \fB\-e\fR option.
.PP .PP
\fBweb2markdown\fR is a wrapper for \fBhtml2markdown\fR. \fBweb2markdown\fR is a wrapper for \fBpandoc\fR.
.SH OPTIONS .SH OPTIONS
.TP .TP
.B \-s, \-\-standalone .B \-s, \-\-standalone
@ -76,7 +76,6 @@ web2markdown \-g 'wget \-\-user=foo \-\-password=bar' mysite.com
.SH "SEE ALSO" .SH "SEE ALSO"
\fBpandoc\fR(1), \fBpandoc\fR(1),
\fBhtml2markdown\fR(1),
\fBiconv\fR(1) \fBiconv\fR(1)
.SH AUTHOR .SH AUTHOR
John MacFarlane and Recai Oktas John MacFarlane and Recai Oktas

View file

@ -45,7 +45,7 @@ import Text.Pandoc.Writers.DefaultHeaders ( defaultHtmlHeader,
defaultRTFHeader, defaultS5Header, defaultLaTeXHeader ) defaultRTFHeader, defaultS5Header, defaultLaTeXHeader )
import Text.Pandoc.Definition import Text.Pandoc.Definition
import Text.Pandoc.Shared import Text.Pandoc.Shared
import Text.Regex ( mkRegex, splitRegex ) import Text.Regex ( mkRegex, matchRegex )
import System ( exitWith, getArgs, getProgName ) import System ( exitWith, getArgs, getProgName )
import System.Exit import System.Exit
import System.Console.GetOpt import System.Console.GetOpt
@ -94,8 +94,8 @@ data Opt = Opt
{ optPreserveTabs :: Bool -- ^ If @False@, convert tabs to spaces { optPreserveTabs :: Bool -- ^ If @False@, convert tabs to spaces
, optTabStop :: Int -- ^ Number of spaces per tab , optTabStop :: Int -- ^ Number of spaces per tab
, optStandalone :: Bool -- ^ If @True@, include header, footer , optStandalone :: Bool -- ^ If @True@, include header, footer
, optReader :: ParserState -> String -> Pandoc -- ^ Read format , optReader :: String -- ^ Reader format
, optWriter :: WriterOptions -> Pandoc -> String -- ^ Write fmt , optWriter :: String -- ^ Writer format
, optParseRaw :: Bool -- ^ If @True@, parse unconvertable , optParseRaw :: Bool -- ^ If @True@, parse unconvertable
-- HTML and TeX -- HTML and TeX
, optCSS :: String -- ^ CSS file to link to , optCSS :: String -- ^ CSS file to link to
@ -103,64 +103,55 @@ data Opt = Opt
, optIncludeBeforeBody :: String -- ^ File to include at top of body , optIncludeBeforeBody :: String -- ^ File to include at top of body
, optIncludeAfterBody :: String -- ^ File to include at end of body , optIncludeAfterBody :: String -- ^ File to include at end of body
, optCustomHeader :: String -- ^ Custom header to use, or "DEFAULT" , optCustomHeader :: String -- ^ Custom header to use, or "DEFAULT"
, optDefaultHeader :: String -- ^ Default header
, optTitlePrefix :: String -- ^ Optional prefix for HTML title , optTitlePrefix :: String -- ^ Optional prefix for HTML title
, optOutputFile :: String -- ^ Name of output file , optOutputFile :: String -- ^ Name of output file
, optNumberSections :: Bool -- ^ If @True@, number sections in LaTeX , optNumberSections :: Bool -- ^ If @True@, number sections in LaTeX
, optIncremental :: Bool -- ^ If @True@, incremental lists in S5 , optIncremental :: Bool -- ^ If @True@, incremental lists in S5
, optSmart :: Bool -- ^ If @True@, use smart typography , optSmart :: Bool -- ^ If @True@, use smart typography
, optASCIIMathML :: Bool -- ^ If @True@, use ASCIIMathML in HTML , optASCIIMathML :: Bool -- ^ If @True@, use ASCIIMathML in HTML
, optShowUsage :: Bool -- ^ If @True@, show usage message
, optDebug :: Bool -- ^ If @True@, output debug messages , optDebug :: Bool -- ^ If @True@, output debug messages
} }
-- | Defaults for command-line options. -- | Defaults for command-line options.
startOpt :: Opt defaultOpts :: Opt
startOpt = Opt defaultOpts = Opt
{ optPreserveTabs = False { optPreserveTabs = False
, optTabStop = 4 , optTabStop = 4
, optStandalone = False , optStandalone = False
, optReader = readMarkdown , optReader = "" -- null for default reader
, optWriter = writeHtml , optWriter = "" -- null for default writer
, optParseRaw = False , optParseRaw = False
, optCSS = "" , optCSS = ""
, optIncludeInHeader = "" , optIncludeInHeader = ""
, optIncludeBeforeBody = "" , optIncludeBeforeBody = ""
, optIncludeAfterBody = "" , optIncludeAfterBody = ""
, optCustomHeader = "DEFAULT" , optCustomHeader = "DEFAULT"
, optDefaultHeader = defaultHtmlHeader
, optTitlePrefix = "" , optTitlePrefix = ""
, optOutputFile = "" -- null for stdout , optOutputFile = "" -- null for stdout
, optNumberSections = False , optNumberSections = False
, optIncremental = False , optIncremental = False
, optSmart = False , optSmart = False
, optASCIIMathML = False , optASCIIMathML = False
, optShowUsage = False
, optDebug = False , optDebug = False
} }
-- | A list of functions, each transforming the options data structure in response -- | A list of functions, each transforming the options data structure
-- to a command-line option. -- in response to a command-line option.
allOptions :: [OptDescr (Opt -> IO Opt)] options :: [OptDescr (Opt -> IO Opt)]
allOptions = options =
[ Option "fr" ["from","read"] [ Option "fr" ["from","read"]
(ReqArg (ReqArg
(\arg opt -> case (lookup (map toLower arg) readers) of (\arg opt -> return opt { optReader = map toLower arg })
Just reader -> return opt { optReader = reader }
Nothing -> error ("Unknown reader: " ++ arg) )
"FORMAT") "FORMAT")
("Source format (" ++ ("Input format (" ++ (joinWithSep ", " (map fst readers)) ++
(concatMap (\(name, fn) -> " " ++ name) readers) ++ " )") ")")
, Option "tw" ["to","write"] , Option "tw" ["to","write"]
(ReqArg (ReqArg
(\arg opt -> case (lookup (map toLower arg) writers) of (\arg opt -> return opt { optWriter = map toLower arg })
Just (writer, defaultHeader) ->
return opt { optWriter = writer,
optDefaultHeader = defaultHeader }
Nothing -> error ("Unknown writer: " ++ arg) )
"FORMAT") "FORMAT")
("Output format (" ++ (concatMap (\(name, fn) -> " " ++ name) writers) ++ " )") ("Output format (" ++ (joinWithSep ", " (map fst writers)) ++
")")
, Option "s" ["standalone"] , Option "s" ["standalone"]
(NoArg (NoArg
@ -169,8 +160,7 @@ allOptions =
, Option "o" ["output"] , Option "o" ["output"]
(ReqArg (ReqArg
(\arg opt -> do (\arg opt -> return opt { optOutputFile = arg })
return opt { optOutputFile = arg })
"FILENAME") "FILENAME")
"Name of output file" "Name of output file"
@ -286,57 +276,66 @@ allOptions =
, Option "h" ["help"] , Option "h" ["help"]
(NoArg (NoArg
(\opt -> return opt { optShowUsage = True })) (\_ -> do
prg <- getProgName
hPutStr stderr (reformatUsageInfo $
usageInfo (prg ++ " [OPTIONS] [FILES]") options)
exitWith $ ExitFailure 2))
"Show help" "Show help"
] ]
-- parse name of calling program and return default reader and writer descriptions
parseProgName name =
case (splitRegex (mkRegex "2") (map toLower name)) of
[from, to] -> (from, to)
_ -> ("markdown", "html")
-- set default options based on reader and writer descriptions; start is starting options
setDefaultOpts from to start =
case ((lookup from readers), (lookup to writers)) of
(Just reader, Just (writer, header)) -> start {optReader = reader,
optWriter = writer,
optDefaultHeader = header}
_ -> start
-- True if single-letter option is in option list
inOptList :: [Char] -> OptDescr (Opt -> IO Opt) -> Bool
inOptList list desc =
let (Option letters _ _ _) = desc in
any (\x -> x `elem` list) letters
-- Reformat usage message so it doesn't wrap illegibly -- Reformat usage message so it doesn't wrap illegibly
reformatUsageInfo :: String -> String
reformatUsageInfo = gsub " *--" " --" . reformatUsageInfo = gsub " *--" " --" .
gsub "(-[A-Za-z0-9]) *--" "\\1, --" . gsub "(-[A-Za-z0-9]) *--" "\\1, --" .
gsub " *([^- ])" "\n\t\\1" gsub " *([^- ])" "\n\t\\1"
-- Determine default reader based on source file extensions.
--
-- The sources are examined in order.  A source whose file name has no
-- extension is skipped; the first source that does have an extension
-- decides the result (an unrecognized extension means markdown).  If
-- there are no sources, or none of them has an extension, the reader is
-- markdown.
--
-- Only the final path component is inspected, so a dot that belongs to a
-- directory name (as in "./README" or "docs.d/notes") is not mistaken
-- for the start of an extension.  Matching is case-insensitive.
defaultReaderName :: [String] -> String
defaultReaderName [] = "markdown"
defaultReaderName (x:xs) =
  case extension (map toLower x) of
    Nothing        -> defaultReaderName xs -- no extension
    Just "xhtml"   -> "html"
    Just "html"    -> "html"
    Just "htm"     -> "html"
    Just "tex"     -> "latex"
    Just "latex"   -> "latex"
    Just "ltx"     -> "latex"
    Just "rst"     -> "rst"
    Just "native"  -> "native"
    Just _         -> "markdown"
  where
    -- Text after the last '.' of the final path component, or Nothing
    -- if that component contains no '.' at all.
    extension path =
      let fileName = reverse (takeWhile (`notElem` "/\\") (reverse path))
      in  case break (== '.') (reverse fileName) of
            (_, [])     -> Nothing
            (revExt, _) -> Just (reverse revExt)
-- Determine default writer based on output file extension.
--
-- An empty name means no output file was given (output goes to stdout)
-- and selects html.  A file name with no extension, or with an empty one
-- ("out."), selects markdown.  Recognized extensions select the matching
-- writer; any other extension selects html.
--
-- Only the final path component is inspected, so a dot that belongs to a
-- directory name (as in "./out" or "build.d/out") is not mistaken for
-- the start of an extension.  Matching is case-insensitive.
defaultWriterName :: String -> String
defaultWriterName "" = "html" -- no output file
defaultWriterName x =
  case extension (map toLower x) of
    Nothing          -> "markdown" -- no extension
    Just ""          -> "markdown" -- empty extension
    Just "tex"       -> "latex"
    Just "latex"     -> "latex"
    Just "ltx"       -> "latex"
    Just "rtf"       -> "rtf"
    Just "rst"       -> "rst"
    Just "s5"        -> "s5"
    Just "native"    -> "native"
    Just "txt"       -> "markdown"
    Just "text"      -> "markdown"
    Just "md"        -> "markdown"
    Just "markdown"  -> "markdown"
    Just _           -> "html"
  where
    -- Text after the last '.' of the final path component, or Nothing
    -- if that component contains no '.' at all.
    extension path =
      let fileName = reverse (takeWhile (`notElem` "/\\") (reverse path))
      in  case break (== '.') (reverse fileName) of
            (_, [])     -> Nothing
            (revExt, _) -> Just (reverse revExt)
main = do main = do
name <- getProgName
let (from, to) = parseProgName name
let irrelevantOptions = if not ('2' `elem` name)
then ""
else "frtwD" ++
(if (to /= "html" && to /= "s5") then "SmcT" else "") ++
(if (to /= "latex") then "N" else "") ++
(if (to /= "s5") then "i" else "") ++
(if (from /= "html" && from /= "latex") then "R" else "")
let options = filter (not . inOptList irrelevantOptions) allOptions
let defaultOpts = setDefaultOpts from to startOpt
args <- getArgs args <- getArgs
let (actions, sources, errors) = getOpt Permute options args let (actions, sources, errors) = getOpt Permute options args
if (not (null errors)) if (not (null errors))
then do then do
name <- getProgName
mapM (\e -> hPutStrLn stderr e) errors mapM (\e -> hPutStrLn stderr e) errors
hPutStrLn stderr (reformatUsageInfo $ hPutStrLn stderr (reformatUsageInfo $
usageInfo (name ++ " [OPTIONS] [FILES]") options) usageInfo (name ++ " [OPTIONS] [FILES]") options)
@ -350,30 +349,39 @@ main = do
let Opt { optPreserveTabs = preserveTabs let Opt { optPreserveTabs = preserveTabs
, optTabStop = tabStop , optTabStop = tabStop
, optStandalone = standalone , optStandalone = standalone
, optReader = reader , optReader = readerName
, optWriter = writer , optWriter = writerName
, optParseRaw = parseRaw , optParseRaw = parseRaw
, optCSS = css , optCSS = css
, optIncludeInHeader = includeHeader , optIncludeInHeader = includeHeader
, optIncludeBeforeBody = includeBefore , optIncludeBeforeBody = includeBefore
, optIncludeAfterBody = includeAfter , optIncludeAfterBody = includeAfter
, optCustomHeader = customHeader , optCustomHeader = customHeader
, optDefaultHeader = defaultHeader
, optTitlePrefix = titlePrefix , optTitlePrefix = titlePrefix
, optOutputFile = outputFile , optOutputFile = outputFile
, optNumberSections = numberSections , optNumberSections = numberSections
, optIncremental = incremental , optIncremental = incremental
, optSmart = smart , optSmart = smart
, optASCIIMathML = asciiMathML , optASCIIMathML = asciiMathML
, optShowUsage = showUsage
, optDebug = debug , optDebug = debug
} = opts } = opts
if showUsage -- assign reader and writer based on options and filenames
then do let readerName' = if null readerName
hPutStr stderr (reformatUsageInfo $ usageInfo (name ++ " [OPTIONS] [FILES]") options) then defaultReaderName sources
exitWith $ ExitFailure 2 else readerName
else return ()
let writerName' = if null writerName
then defaultWriterName outputFile
else writerName
reader <- case (lookup readerName' readers) of
Just r -> return r
Nothing -> error ("Unknown reader: " ++ readerName')
(writer, defaultHeader) <- case (lookup writerName' writers) of
Just (w,h) -> return (w, h)
Nothing -> error ("Unknown writer: " ++ writerName')
output <- if ((null outputFile) || debug) output <- if ((null outputFile) || debug)
then return stdout then return stdout
@ -385,7 +393,6 @@ main = do
hPutStr stderr $ concatMap (\s -> "INPUT=" ++ s ++ "\n") sources hPutStr stderr $ concatMap (\s -> "INPUT=" ++ s ++ "\n") sources
else return () else return ()
let writingS5 = (defaultHeader == defaultS5Header)
let tabFilter = if preserveTabs then id else (tabsToSpaces tabStop) let tabFilter = if preserveTabs then id else (tabsToSpaces tabStop)
let addBlank str = str ++ "\n\n" let addBlank str = str ++ "\n\n"
let removeCRs str = filter (/= '\r') str -- remove DOS-style line endings let removeCRs str = filter (/= '\r') str -- remove DOS-style line endings
@ -407,7 +414,7 @@ main = do
writerTitlePrefix = titlePrefix, writerTitlePrefix = titlePrefix,
writerSmart = smart, writerSmart = smart,
writerTabStop = tabStop, writerTabStop = tabStop,
writerS5 = writingS5, writerS5 = (writerName=="s5"),
writerIncremental = incremental, writerIncremental = incremental,
writerNumberSections = numberSections, writerNumberSections = numberSections,
writerIncludeBefore = includeBefore, writerIncludeBefore = includeBefore,

View file

@ -1,6 +1,6 @@
#!/bin/sh -e #!/bin/sh -e
REQUIRED="markdown2latex pdflatex" REQUIRED="pdflatex"
### common.sh ### common.sh
@ -9,9 +9,12 @@ REQUIRED="markdown2latex pdflatex"
texname=output texname=output
logfile=$THIS_TEMPDIR/log logfile=$THIS_TEMPDIR/log
if ! markdown2latex -s -d "$@" >$THIS_TEMPDIR/$texname.tex 2>$logfile; then if ! pandoc -s -d -r markdown -w latex "$@" >$THIS_TEMPDIR/$texname.tex \
[ -f $logfile ] && sed -e 's/markdown2latex/markdown2pdf/g' \ 2>$logfile; then
-e '/^INPUT=/d' -e '/^OUTPUT=/d' $logfile >&2 [ -f $logfile ] && sed -e 's/^pandoc/markdown2pdf/g' \
-e '/^INPUT=/d' -e '/^OUTPUT=/d' \
-e '/^[[:space:]]*\(-f\|-t\|-s\|-R\|-S\|-m\|-i\|-c\|-T\|-D\|-d\)/,/./d'\
-e 's/(implies -s)//g' $logfile >&2
exit 1 exit 1
fi fi

View file

@ -2,7 +2,7 @@
# converts HTML from a URL, file, or stdin to markdown # converts HTML from a URL, file, or stdin to markdown
# uses an available program to fetch URL and tidy to normalize it first # uses an available program to fetch URL and tidy to normalize it first
REQUIRED="tidy html2markdown" REQUIRED="tidy"
### common.sh ### common.sh
@ -72,14 +72,16 @@ grabber=
while [ $# -gt 0 ]; do while [ $# -gt 0 ]; do
case "$1" in case "$1" in
-h|--help) -h|--help)
html2markdown -h 2>&1 | sed -e 's/html2markdown/web2markdown/' 1>&2 pandoc -h 2>&1 | sed -e 's/pandoc/web2markdown/' \
-e '/^[[:space:]]*\(-f\|-t\|-S\|-N\|-m\|-i\|-c\|-T\|-D\|-d\)/,/./d'\
1>&2
err " -e ENCODING, --encoding=ENCODING" err " -e ENCODING, --encoding=ENCODING"
err " Specify character encoding of input" err " Specify character encoding of input"
err " -g COMMAND, --grabber=COMMAND" err " -g COMMAND, --grabber=COMMAND"
err " Specify command to be used to grab contents of URL" err " Specify command to be used to grab contents of URL"
exit 0 ;; exit 0 ;;
-v|--version) -v|--version)
html2markdown -v pandoc -v 2>&1 | sed -e 's/pandoc/web2markdown/' 1>&2
exit 0 ;; exit 0 ;;
-e) -e)
shift shift
@ -112,7 +114,7 @@ while [ $# -gt 0 ]; do
shift shift
done done
# Unpack options. Now "$@" will hold the html2markdown options. # Unpack options. Now "$@" will hold the pandoc options.
oldifs="$IFS"; IFS="$NEWLINE"; set -- $options; IFS="$oldifs" oldifs="$IFS"; IFS="$NEWLINE"; set -- $options; IFS="$oldifs"
inurl= inurl=
@ -162,10 +164,11 @@ else # assume UTF-8
fi fi
if [ -z "$argument" ]; then if [ -z "$argument" ]; then
tidy -utf8 2>/dev/null | html2markdown "$@" tidy -utf8 2>/dev/null | pandoc -r html -w markdown "$@"
else else
if [ -f "$argument" ]; then if [ -f "$argument" ]; then
to_utf8 "$argument" | tidy -utf8 2>/dev/null | html2markdown "$@" to_utf8 "$argument" |
tidy -utf8 2>/dev/null | pandoc -r html -w markdown "$@"
else else
err "File '$argument' not found." err "File '$argument' not found."
exit 1 exit 1