Merged changes from branches/wrappers since r177.

Summary of main changes:
+ Added -o/--output and -d/--debug options to pandoc.
+ Modified pandoc to behave differently depending on the name
  of the program.  For example, if the program name is 'html2latex',
  the default reader will be html and the default writer latex. 
+ Removed most of the old wrappers, replacing them with symlinks
  to pandoc.
+ Rewrote markdown2pdf and created a new wrapper web2markdown,
  with the functionality of the old html2markdown script.  These
  new scripts exploit pandoc's -d option to avoid having to do
  complex command-line parsing.
+ Revised man pages and documentation appropriately.


git-svn-id: https://pandoc.googlecode.com/svn/trunk@279 788f1e2b-df1e-0410-8736-df70ead52e1b
This commit is contained in:
fiddlosopher 2006-12-22 20:16:03 +00:00
parent cfaf0c178c
commit d829c4820a
27 changed files with 713 additions and 799 deletions

View file

@ -23,15 +23,16 @@ EXECSBASE := $(shell sed -ne 's/^[Ee]xecutable:[[:space:]]*//p' $(CABAL).in)
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
# Install targets # Install targets
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
WRAPPERS := web2markdown markdown2pdf
SYMLINKS := markdown2html markdown2latex markdown2s5 markdown2rst \
markdown2rtf html2markdown latex2markdown rst2markdown
PROGS := $(EXECS) $(WRAPPERS)
# Add .exe extensions if we're running Windows/Cygwin. # Add .exe extensions if we're running Windows/Cygwin.
EXTENSION := $(shell uname | tr '[:upper:]' '[:lower:]' | \ EXTENSION := $(shell uname | tr '[:upper:]' '[:lower:]' | \
sed -ne 's/^cygwin.*$$/\.exe/p') sed -ne 's/^cygwin.*$$/\.exe/p')
EXECS := $(addsuffix $(EXTENSION),$(EXECSBASE)) EXECS := $(addsuffix $(EXTENSION),$(EXECSBASE))
# First entry in Cabal's executable stanza is the main executable. # First entry in Cabal's executable stanza is the main executable.
MAIN := $(firstword $(EXECS)) MAIN := $(firstword $(EXECS))
WRAPPERS := html2markdown latex2markdown markdown2html \
markdown2latex markdown2pdf
PROGS := $(EXECS) $(WRAPPERS)
DOCS := README.html README BUGS DOCS := README.html README BUGS
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
@ -92,6 +93,12 @@ all: build-program
templates: $(SRCDIR)/templates templates: $(SRCDIR)/templates
$(MAKE) -C $(SRCDIR)/templates $(MAKE) -C $(SRCDIR)/templates
.PHONY: symlinks
cleanup_files+=$(SYMLINKS)
symlinks: $(SYMLINKS)
$(SYMLINKS): $(MAIN)
ln -sf ./$(MAIN) $@
define generate-shell-script define generate-shell-script
echo "Generating $@..."; \ echo "Generating $@..."; \
awk ' \ awk ' \
@ -131,7 +138,7 @@ build: configure
$(BUILDCMD) build $(BUILDCMD) build
.PHONY: build-exec .PHONY: build-exec
build-exec: $(PROGS) build-exec: $(PROGS) $(SYMLINKS)
cleanup_files+=$(EXECS) cleanup_files+=$(EXECS)
$(EXECS): build $(EXECS): build
for f in $@; do \ for f in $@; do \
@ -191,8 +198,9 @@ install-exec: build-exec
fi; \ fi; \
$(INSTALL_PROGRAM) $$f $(BINPATH)/; \ $(INSTALL_PROGRAM) $$f $(BINPATH)/; \
done done
cd $(BINPATH); for f in $(SYMLINKS); do ln -sf $(MAIN) $$f; done
uninstall-exec: uninstall-exec:
-for f in $(notdir $(PROGS)); do rm -f $(BINPATH)/$$f; done -for f in $(notdir $(PROGS) $(SYMLINKS)); do rm -f $(BINPATH)/$$f; done ;
# Program + user documents installation. # Program + user documents installation.
.PHONY: install-program uninstall-program .PHONY: install-program uninstall-program
@ -277,15 +285,11 @@ osx-dmg: ../$(osx_dmg_name)
-rm -f $(osx_dmg_name) -rm -f $(osx_dmg_name)
mv $(osx_udzo_name) ../$(osx_dmg_name) mv $(osx_udzo_name) ../$(osx_dmg_name)
.PHONY: test test-markdown test-wrapper .PHONY: test test-markdown
test: $(MAIN) test: $(MAIN)
@cd $(TESTDIR) && perl runtests.pl -s $(PWD)/$(MAIN) @cd $(TESTDIR) && perl runtests.pl -s $(PWD)/$(MAIN)
test-markdown: $(MAIN) test-markdown: $(MAIN)
@cd $(TESTDIR)/MarkdownTest_1.0.3 && perl MarkdownTest.pl -s $(PWD)/$(MAIN) -tidy @cd $(TESTDIR)/MarkdownTest_1.0.3 && perl MarkdownTest.pl -s $(PWD)/$(MAIN) -tidy
cleanup_files+=testwrapper
test-wrappers: testwrapper
@echo "Running $<..."
@sh testwrapper
# Stolen and slightly improved from a GPLed Makefile. Credits to John Meacham. # Stolen and slightly improved from a GPLed Makefile. Credits to John Meacham.
src_all:=$(shell find $(SRCDIR) -type f -name '*hs' | egrep -v '^\./(_darcs|lib|test)/') src_all:=$(shell find $(SRCDIR) -type f -name '*hs' | egrep -v '^\./(_darcs|lib|test)/')

251
README
View file

@ -20,7 +20,7 @@ or output format requires only adding a reader or writer.
[reStructuredText]: http://docutils.sourceforge.net/docs/ref/rst/introduction.html [reStructuredText]: http://docutils.sourceforge.net/docs/ref/rst/introduction.html
[S5]: http://meyerweb.com/eric/tools/s5/ [S5]: http://meyerweb.com/eric/tools/s5/
[HTML]: http://www.w3.org/TR/html40/ [HTML]: http://www.w3.org/TR/html40/
[LaTeX]: http://www.latex-project.org/ [LaTeX]: http://www.latex-project.org/
[RTF]: http://en.wikipedia.org/wiki/Rich_Text_Format [RTF]: http://en.wikipedia.org/wiki/Rich_Text_Format
[Haskell]: http://www.haskell.org/ [Haskell]: http://www.haskell.org/
@ -30,9 +30,53 @@ any kind. (See COPYRIGHT for full copyright and warranty notices.)
Recai Oktaş (roktas at debian dot org) deserves credit for the build Recai Oktaş (roktas at debian dot org) deserves credit for the build
system, the debian package, and the robust wrapper scripts. system, the debian package, and the robust wrapper scripts.
[GPL]: http://www.gnu.org/copyleft/gpl.html [GPL]: http://www.gnu.org/copyleft/gpl.html "GNU General Public License"
# Using Pandoc Requirements
============
The `pandoc` program itself does not depend on any external libraries
or programs. The convenience programs `markdown2html`, `markdown2latex`,
`markdown2rst`, `markdown2rtf`, `markdown2s5`, `html2markdown`,
`latex2markdown`, and `rst2markdown` are implemented as symbolic links to
`pandoc`.
The wrapper script `web2markdown` requires
- `html2markdown` (included with Pandoc)
- a POSIX-compliant shell (installed by default on all linux and unix
systems, including Mac OS X, and in [Cygwin] for Windows),
- `HTML Tidy`
- `iconv` (for character encoding conversion). (If `iconv` is absent,
`web2markdown` will still work, but it will treat everything as UTF-8.)
[Cygwin]: http://www.cygwin.com/
[HTML Tidy]: http://tidy.sourceforge.net/
[`iconv`]: http://www.gnu.org/software/libiconv/
The wrapper script `markdown2pdf` requires
- `markdown2latex` (included with Pandoc)
- a POSIX-compliant shell
- `pdflatex`, which should be part of any [LaTeX] distribution
- the [unicode] and [fancyvrb] LaTeX packages, which are included
in many LaTeX distributions. The [unicode] package allows LaTeX to
process UTF-8 characters. [fancyvrb] allows code blocks and verbatim
text to be used within footnotes. If your installation of LaTeX
does not include these packages, you will get an error (complaining
about missing `ucs.sty` or `fancyvrb.sty`) when you try to compile
a LaTeX file produced by Pandoc, or when you use the `markdown2pdf`
script (described below). If this happens, install the [unicode] and
[fancyvrb] packages from [CTAN]. (Get the zip file from CTAN
and unpack it into `~/texmf/tex/latex/`. You may also need to run
`mktexlsr` or `texhash` before the files can be found by TeX.)
[CTAN]: http://www.ctan.org "Comprehensive TeX Archive Network"
[unicode]: http://www.ctan.org/tex-archive/macros/latex/contrib/unicode/
[fancyvrb]: http://www.ctan.org/tex-archive/macros/latex/contrib/fancyvrb/
Using Pandoc
============
If you run `pandoc` without arguments, it will accept input from If you run `pandoc` without arguments, it will accept input from
STDIN. If you run it with file names as arguments, it will take input STDIN. If you run it with file names as arguments, it will take input
@ -66,10 +110,14 @@ a subset of reStructuredText syntax. For example, it doesn't handle
tables, definition lists, option lists, or footnotes. It handles only the tables, definition lists, option lists, or footnotes. It handles only the
constructs expressible in unextended markdown. But for simple documents constructs expressible in unextended markdown. But for simple documents
it should be adequate. The `latex` and `html` readers are also limited it should be adequate. The `latex` and `html` readers are also limited
in what they can do. in what they can do. Because the `html` reader is picky about the HTML
it parses, it is recommended that you pipe HTML through [HTML Tidy] before
sending it to `pandoc`, or use the `web2markdown` script described below.
`pandoc` writes its output to STDOUT. If you want to write to a file, By default, `pandoc` writes its output to STDOUT. If you want to
use redirection: write to a file, use the `-o` option or shell redirection:
pandoc -o hello.html hello.txt
pandoc hello.txt > hello.html pandoc hello.txt > hello.html
@ -77,13 +125,14 @@ Note that you can specify multiple input files on the command line.
`pandoc` will concatenate them all (with blank lines between them) `pandoc` will concatenate them all (with blank lines between them)
before parsing: before parsing:
pandoc -s chapter1.txt chapter2.txt chapter3.txt references.txt > book.html pandoc -s chapter1.txt chapter2.txt references.txt > book.html
(The `-s` option here tells `pandoc` to produce a standalone HTML file, (The `-s` option here tells `pandoc` to produce a standalone HTML file,
with a proper header, rather than a fragment. For more details on this with a proper header, rather than a fragment. For more details on this
and many other command-line options, see below.) and many other command-line options, see below.)
# Character encodings Character encodings
-------------------
Unfortunately, due to limitations in GHC, `pandoc` does not automatically Unfortunately, due to limitations in GHC, `pandoc` does not automatically
detect the system's local character encoding. Hence, all input and detect the system's local character encoding. Hence, all input and
@ -97,92 +146,65 @@ will convert `source.txt` from the local encoding to UTF-8, then
convert it to HTML, then convert back to the local encoding, convert it to HTML, then convert back to the local encoding,
putting the output in `output.html`. putting the output in `output.html`.
[`iconv`]: http://www.gnu.org/software/libiconv/
The shell scripts (described below) automatically convert the input The shell scripts (described below) automatically convert the input
from the local encoding to UTF-8 before running them through `pandoc`, from the local encoding to UTF-8 before running them through `pandoc`,
then convert the output back to the local encoding. then convert the output back to the local encoding.
## LaTeX and UTF-8 Convenience programs and wrapper scripts
========================================
LaTeX sources produced by Pandoc use `ucs.sty`, which is included in many For convenience, eight variant programs are included with Pandoc:
LaTeX distributions. This allows LaTeX to process UTF-8 characters. `markdown2html` (which is equivalent to `pandoc -w html`),
If your installation of LaTeX does not include `ucs.sty`, you will get an `markdown2latex` (equivalent to `pandoc -w latex`), `markdown2rst`
error when you try to compile a LaTeX file produced by Pandoc, or when (equivalent to `pandoc -w rst`), `markdown2rtf` (equivalent to
you use the `markdown2pdf` script (described below). If this happens, `pandoc -w rtf`), `markdown2s5` (equivalent to `pandoc -w s5`),
install the [unicode] package from [CTAN]. (Get the `unicode.zip` `html2markdown` (equivalent to `pandoc -r html -w markdown`),
file from CTAN, unpack it, and copy the whole `unicode` directory into `latex2markdown` (equivalent to `pandoc -r latex -w markdown`), and
`~/texmf/tex/latex/`. You may also need to run `mktexlsr` or `texhash` `rst2markdown` (equivalent to `pandoc -r rst -w markdown`). These
before the files can be found by TeX.) programs take an appropriately restricted subset of `pandoc`'s
options. (Run them with the `-h` flag for a full list of allowed
options.)
[CTAN]: http://www.ctan.org Like `pandoc`, all of these programs produce fragments by default.
[unicode]: http://www.ctan.org/tex-archive/macros/latex/contrib/unicode/ If you want to produce a standalone file, complete with a header
and footer appropriate to the format, use the `-s` option:
# The shell scripts markdown2latex -s sample.txt > sample.tex
Five shell scripts have been included that make it easy to run Two shell scripts have also been included:
`pandoc` without worrying about character encodings, and without
remembering all the command-line options:
- `markdown2html` converts markdown-formatted text to HTML 1. `markdown2pdf` produces a PDF file from markdown-formatted
- `markdown2latex` converts markdown-formatted text to LaTeX text, using `markdown2latex` and `pdflatex`. The default
- `markdown2pdf` produces a PDF file from markdown-formatted behavior of `markdown2pdf` is to create a file with the same
text, using `pdflatex`. base name as the first argument and the extension `pdf`; thus,
- `html2markdown` converts HTML to markdown-formatted text for example,
- `latex2markdown` converts LaTeX to markdown-formatted text
All of the scripts use `iconv` (if available) to convert to and from markdown2pdf sample.txt endnotes.txt
the local character encoding. All of the scripts presuppose that
`pandoc` is in the path, and some have additional requirements. (For
example, `html2markdown` uses `tidy`, and `markdown2pdf` uses
`pdflatex`.)
When no arguments are specified, text will be read from standard will produce `sample.pdf`. (If `sample.pdf` exists already,
input. Arguments specify input files (limited to one in the case of it will be backed up before being overwritten.) An output file
`latex2markdown` and `html2markdown`; the other scripts accept any number name can be specified explicitly using the `-o` option:
of arguments). `html2markdown` may take a URL as argument instead of
a filename; in this case, `curl`, `wget`, or an available text-based
browser will be used to fetch the contents of the URL. (The `-n` option
inhibits this behavior; the `-g` option allows the user to specify a
custom command that will be used to fetch from a URL.)
With the exception of `markdown2pdf`, the scripts write to standard output. markdown2pdf -o "My Book.pdf" chap1.txt chap2.txt chap3.txt
Output can be sent to a file using shell output redirection:
latex2markdown sample.tex > sample.txt If no input file is specified, input will be taken from STDIN.
The default behavior of `markdown2pdf` is to create a file with the same 2. `web2markdown` grabs a web page from a file or URL and converts
base name as the first argument and the extension `pdf`; thus, for example, it to markdown-formatted text, using `tidy` and `html2markdown`.
Unless input is from STDIN, an attempt is made to determine the
character encoding of the page from the "Content-type" meta tag.
If this is not present, UTF-8 is assumed. Alternatively, a character
encoding may be specified explicitly using the `-e` option.
markdown2pdf sample.txt endnotes.txt `web2markdown` searches for an available program (`wget`, `curl`,
or a text-mode browser) to fetch the contents of a URL.
Optionally, the `-g` option may be used to specify the command
to be used:
will produce `sample.pdf`. (If `sample.pdf` exists already, it will be web2markdown -g 'wget --user=foo --password=bar' mysite.com
backed up before being overwritten.) An output file name can be specified
explicitly using the `-o` option:
markdown2pdf -o "My Book.pdf" chap1.txt chap2.txt chap3.txt Command-line options
====================
Options specific to the scripts, like `-o`, `-g`, and `-n`, must
be specified *before* any command-line arguments (file names or URLs).
Any options specified *after* the command-line arguments will be
passed directly to `pandoc`. For example,
markdown2html tusks.txt -S -T Elephants
will convert `tusks.txt` to `tusks.html` using smart quotes, ellipses,
and dashes, with "Elephants" as the page title prefix. (For a
complete list of `pandoc` options, see below.) When there are no
command-line arguments (because input is from STDIN), `pandoc`
options must be preceded by ` -- `:
cat tusks.txt | markdown2html -- -S -T Elephants
The ` -- ` separator may optionally be used when there are command-line
arguments:
markdown2html -- tusks.txt -S -T Elephants
# Command-line options
Various command-line options can be used to customize the output. Various command-line options can be used to customize the output.
For a complete list, type For a complete list, type
@ -207,9 +229,11 @@ specified.)
complete with appropriate document headers. By default, `pandoc` complete with appropriate document headers. By default, `pandoc`
produces a fragment. produces a fragment.
`--custom-header` can be used to specify a custom document header. To `-o` or `--output` can be used to specify an output file.
see the headers used by default, use the `-D` option: for example,
`pandoc -D html` prints the default HTML header. `-C` or `--custom-header` can be used to specify a custom document
header. To see the headers used by default, use the `-D` option:
for example, `pandoc -D html` prints the default HTML header.
`-c` or `--css` allows the user to specify a custom stylesheet that `-c` or `--css` allows the user to specify a custom stylesheet that
will be linked to in HTML and S5 output. will be linked to in HTML and S5 output.
@ -253,15 +277,38 @@ is for lists to be displayed all at once.
`-N` or `--number-sections` causes sections to be numbered in LaTeX `-N` or `--number-sections` causes sections to be numbered in LaTeX
output. By default, sections are not numbered. output. By default, sections are not numbered.
# Pandoc's markdown vs. standard markdown `-d` or `--debug` causes a debugging message to be written to STDERR.
The format of the message is as follows:
OUTPUT=foo
INPUT=bar
INPUT=Foo Baz
Here `OUTPUT=` is followed by the name of the output file specified
using `-o`, if any. If no output file was specified, `OUTPUT=`
will appear with nothing following it. Lines beginning `INPUT=`
specify input files. If there are no input files, no `INPUT=` lines
will be printed. The `-d` option forces output to be written to
STDOUT, even if an output file was specified using the `-o` option.
(This option is provided to make it easier to write wrappers for
`pandoc`.)
`-v` or `--version` prints the version number to STDERR.
`-h` or `--help` prints a usage message to STDERR.
Pandoc's markdown vs. standard markdown
=======================================
In parsing markdown, Pandoc departs from and extends [standard markdown] In parsing markdown, Pandoc departs from and extends [standard markdown]
in a few respects. (To run Pandoc on the official in a few respects. (To run Pandoc on the official
markdown test suite, type `make test-markdown`.) markdown test suite, type `make test-markdown`.)
[standard markdown]: http://daringfireball.net/projects/markdown/syntax [standard markdown]: http://daringfireball.net/projects/markdown/syntax
"Markdown syntax description"
## Section Headings Section Headings
----------------
Pandoc creates an invisible anchor in front of every HTML section Pandoc creates an invisible anchor in front of every HTML section
heading. The ID of this anchor is derived from the section heading heading. The ID of this anchor is derived from the section heading
@ -281,7 +328,8 @@ example, just insert:
[Back to Aristotle](#Aristotle's_De_Anima) [Back to Aristotle](#Aristotle's_De_Anima)
## Lists Lists
-----
Pandoc behaves differently from standard markdown on some "edge Pandoc behaves differently from standard markdown on some "edge
cases" involving lists. Consider this source: cases" involving lists. Consider this source:
@ -332,7 +380,8 @@ the example above:
B) Fie B) Fie
C) Third C) Third
## Literal quotes in titles Literal quotes in titles
------------------------
Standard markdown allows unescaped literal quotes in titles, as Standard markdown allows unescaped literal quotes in titles, as
in in
@ -343,7 +392,8 @@ Pandoc requires all quotes within titles to be escaped:
[foo]: "bar \"embedded\" baz" [foo]: "bar \"embedded\" baz"
## Reference links Reference links
---------------
Pandoc allows implicit reference links in either of two styles: Pandoc allows implicit reference links in either of two styles:
@ -357,7 +407,8 @@ will appear as regular bracketed text. Note: even `[link][]` will
appear as `[link]` if there's no reference for `link`. If you want appear as `[link]` if there's no reference for `link`. If you want
`[link][]`, use a backslash escape: `\[link]\[]`. `[link][]`, use a backslash escape: `\[link]\[]`.
## Footnotes Footnotes
---------
Pandoc's markdown allows footnotes, using the following syntax: Pandoc's markdown allows footnotes, using the following syntax:
@ -394,7 +445,8 @@ they cannot contain multiple paragraphs). The syntax is as follows:
Inline and regular footnotes may be mixed freely. Inline and regular footnotes may be mixed freely.
## Embedded HTML Embedded HTML
-------------
Pandoc treats embedded HTML in markdown a bit differently than Pandoc treats embedded HTML in markdown a bit differently than
Markdown 1.0. While Markdown 1.0 leaves HTML blocks exactly as they Markdown 1.0. While Markdown 1.0 leaves HTML blocks exactly as they
@ -427,7 +479,8 @@ markdown with HTML block elements. For example, one can surround
a block of markdown text with `<div>` tags without preventing it a block of markdown text with `<div>` tags without preventing it
from being interpreted as markdown. from being interpreted as markdown.
## Title blocks Title blocks
------------
If the file begins with a title block If the file begins with a title block
@ -460,7 +513,8 @@ If a title prefix is specified with `-T` and no title block appears
in the document, the title prefix will be used by itself as the in the document, the title prefix will be used by itself as the
HTML title. HTML title.
## Box-style blockquotes Box-style blockquotes
---------------------
Pandoc supports emacs-style boxquote block quotes, in addition to Pandoc supports emacs-style boxquote block quotes, in addition to
standard markdown (email-style) boxquotes: standard markdown (email-style) boxquotes:
@ -469,7 +523,8 @@ standard markdown (email-style) boxquotes:
| They look like this. | They look like this.
`---- `----
## Inline LaTeX Inline LaTeX
------------
Anything between two $ characters will be parsed as LaTeX math. The Anything between two $ characters will be parsed as LaTeX math. The
opening $ must have a character immediately to its right, while the opening $ must have a character immediately to its right, while the
@ -501,7 +556,8 @@ You can also use LaTeX environments. For example,
Note, however, that material between the begin and end tags will Note, however, that material between the begin and end tags will
be interpreted as raw LaTeX, not as markdown. be interpreted as raw LaTeX, not as markdown.
## Custom headers Custom headers
--------------
When run with the "standalone" option (`-s`), `pandoc` creates a When run with the "standalone" option (`-s`), `pandoc` creates a
standalone file, complete with an appropriate header. To see the standalone file, complete with an appropriate header. To see the
@ -516,13 +572,14 @@ it and specify it on the command line as follows:
pandoc --custom-header=MyHeaderFile pandoc --custom-header=MyHeaderFile
# Producing S5 with Pandoc Producing S5 with Pandoc
========================
Producing an [S5] slide show with Pandoc is easy. A title page is Producing an [S5] web-based slide show with Pandoc is easy. A title
constructed automatically from the document's title block (see above). page is constructed automatically from the document's title block (see
Each section (with a level-one header) produces a single slide. (Note above). Each section (with a level-one header) produces a single slide.
that if the section is too big, the slide will not fit on the page; S5 (Note that if the section is too big, the slide will not fit on the page;
is not smart enough to produce multiple pages.) S5 is not smart enough to produce multiple pages.)
Here's the markdown source for a simple slide show, `eating.txt`: Here's the markdown source for a simple slide show, `eating.txt`:

2
debian/changelog vendored
View file

@ -14,6 +14,8 @@ pandoc (0.22) unstable; urgency=low
* Refactored template processing (fillTemplates.pl). * Refactored template processing (fillTemplates.pl).
* Modified wrapper scripts to make them more robust.
* Modified wrapper scripts to make them more robust and portable. * Modified wrapper scripts to make them more robust and portable.
To avoid code duplication and ensure consistency, wrappers are To avoid code duplication and ensure consistency, wrappers are
generated via a templating system from templates in src/wrappers. generated via a templating system from templates in src/wrappers.

View file

@ -1,60 +1 @@
.TH HTML2MARKDOWN 1 "November 21, 2006" Pandoc "User Manuals" .so man1/pandoc.1
.SH NAME
html2markdown \- converts HTML to markdown-formatted text
.SH SYNOPSIS
\fBhtml2markdown\fR [\fIoptions\fR] [\fIinput\-file\fR or \fIURL\fR]
[\fB\-\-\fR] [\fIpandoc\-opts\fR]
.SH DESCRIPTION
\fBhtml2markdown\fR converts \fIinput\-file\fR or \fIURL\fR (or text
from STDIN) from HTML to markdown\-formatted plain text.
If a URL is specified, \fBhtml2markdown\fR uses an available program
(e.g. wget, w3m, lynx or curl) to fetch its contents. Output is sent
to STDOUT.
.PP
\fBhtml2markdown\fR is a wrapper for \fBpandoc\fR.
.SH OPTIONS
.TP
.B \-h
Show usage message.
.TP
.B \-e \fIencoding\fR
Assume the character encoding \fIencoding\fR in reading the HTML.
(Note: \fIencoding\fR will be passed to \fBiconv\fR; a list of
available encodings may be obtained using `\fBiconv \-l\fR'.)
If the \fB\-e\fR option is not specified, the encoding will be
determined as follows: If input is from STDIN, the local encoding
will be assumed. Otherwise, \fBhtml2markdown\fR will try to
extract the character encoding from the "Content-type" meta tag.
If no character encoding is specified in this way, UTF-8 will be
assumed for a URL argument, and the local encoding will be assumed
for a file argument.
.TP
.B \-g \fIcommand\fR
Use \fIcommand\fR to fetch the contents of a URL. (By default,
\fBhtml2markdown\fR searches for an available program or text-based
browser to fetch the contents of a URL.) For example:
.IP
html2markdown \-g 'wget \-\-user=foo \-\-password=bar' mysite.com
.TP
.B \-n
Disable automatic fetching of contents when URLs are specified as
arguments.
.TP
.I pandoc\-opts
Any options appearing after \fIinput\-file\fR or \fIURL\fR on the
command line will be passed directly to \fBpandoc\fR. If no
\fIinput-file\fR or \fIURL\fR is specified, these options must
be preceded by ` \fB\-\-\fR '. (In other cases, ` \fB\-\-\fR ' is
optional.) See \fBpandoc\fR(1) for a list of options that may be used.
Example:
.IP
html2markdown input.txt \-\- \-R
.SH "SEE ALSO"
\fBpandoc\fR(1),
\fBmarkdown2html\fR(1),
\fBmarkdown2latex\fR(1),
\fBlatex2markdown\fR(1),
\fBmarkdown2pdf\fR(1),
\fBiconv\fR(1)
.SH AUTHOR
John MacFarlane and Recai Oktas

View file

@ -1,33 +1 @@
.TH LATEX2MARKDOWN 1 "November 21, 2006" Pandoc "User Manuals" .so man1/pandoc.1
.SH NAME
latex2markdown \- converts LaTeX to markdown\-formatted text
.SH SYNOPSIS
\fBlatex2markdown\fR [\fIoptions\fR] [\fIinput\-file\fR]
[\fB\-\-\fR] [\fIpandoc\-opts\fR]
.SH DESCRIPTION
\fBlatex2markdown\fR converts \fIinput\-file\fR
(or text from STDIN) from LaTeX to markdown\-formatted plain text.
Output is sent to STDOUT.
.PP
\fBlatex2markdown\fR is a wrapper for \fBpandoc\fR.
.SH OPTIONS
.TP
.B \-h
Show usage message.
.TP
.I pandoc\-opts
Any options appearing after \fIinput\-file\fR on the command line
will be passed directly to \fBpandoc\fR. If no \fIinput-file\fR
is specified, these options must be preceded by ` \fB\-\-\fR '.
(In other cases, ` \fB\-\-\fR ' is optional.) See \fBpandoc\fR(1)
for a list of options that may be used. Example:
.IP
latex2markdown input.txt \-\- \-R
.SH "SEE ALSO"
\fBpandoc\fR(1),
\fBmarkdown2html\fR(1),
\fBhtml2markdown\fR(1),
\fBmarkdown2latex\fR(1),
\fBmarkdown2pdf\fR(1)
.SH AUTHOR
John MacFarlane and Recai Oktas

View file

@ -1,34 +1 @@
.TH MARKDOWN2HTML 1 "November 21, 2006" Pandoc "User Manuals" .so man1/pandoc.1
.SH NAME
markdown2html \- converts markdown\-formatted text to HTML
.SH SYNOPSIS
\fBmarkdown2html\fR [\fIoptions\fR] [\fIinput\-file\fR]...
[\fB\-\-\fR] [\fIpandoc\-opts\fR]
.SH DESCRIPTION
\fBmarkdown2html\fR converts \fIinput\-file\fR
(or text from STDIN) from markdown\-formatted plain text to HTML.
If multiple files are specified, they will be combined to make a single
HTML document. Output is sent to STDOUT.
.PP
\fBmarkdown2html\fR is a wrapper for \fBpandoc\fR.
.SH OPTIONS
.TP
.B \-h
Show usage message.
.TP
.I pandoc\-opts
Any options appearing after \fIinput\-file\fR... on the command line
will be passed directly to \fBpandoc\fR. If no \fIinput-file\fR
is specified, these options must be preceded by ` \fB\-\-\fR '.
(In other cases, ` \fB\-\-\fR ' is optional.) See \fBpandoc\fR(1)
for a list of options that may be used. Example:
.IP
markdown2html input.txt \-\- \-\-css=main.css \-S
.SH "SEE ALSO"
\fBpandoc\fR(1),
\fBhtml2markdown\fR(1),
\fBmarkdown2latex\fR(1),
\fBlatex2markdown\fR(1),
\fBmarkdown2pdf\fR(1)
.SH AUTHOR
John MacFarlane and Recai Oktas

View file

@ -1,34 +1 @@
.TH MARKDOWN2LATEX 1 "November 21, 2006" Pandoc "User Manuals" .so man1/pandoc.1
.SH NAME
markdown2latex \- converts markdown-formatted text to LaTeX
.SH SYNOPSIS
\fBmarkdown2latex\fR [\fIoptions\fR] [\fIinput\-file\fR]...
[\fB\-\-\fR] [\fIpandoc\-opts\fR]
.SH DESCRIPTION
\fBmarkdown2latex\fR converts \fIinput\-file\fR (or text from STDIN)
from markdown\-formatted plain text to LaTeX. If multiple files are
specified, they will be combined to make a single LaTeX document.
Output is sent to STDOUT.
.PP
\fBmarkdown2latex\fR is a wrapper for \fBpandoc\fR.
.SH OPTIONS
.TP
.B \-h
Show usage message.
.TP
.I pandoc\-opts
Any options appearing after \fIinput\-file\fR... on the command line
will be passed directly to \fBpandoc\fR. If no \fIinput-file\fR
is specified, these options must be preceded by ` \fB\-\-\fR '.
(In other cases, ` \fB\-\-\fR ' is optional.) See \fBpandoc\fR(1)
for a list of options that may be used. Example:
.IP
markdown2latex input.txt \-\- \-\-custom\-header=letterhead.tex
.SH "SEE ALSO"
\fBpandoc\fR(1),
\fBmarkdown2html\fR(1),
\fBhtml2markdown\fR(1),
\fBlatex2markdown\fR(1),
\fBmarkdown2pdf\fR(1)
.SH AUTHOR
John MacFarlane and Recai Oktas

View file

@ -1,43 +1,71 @@
.TH MARKDOWN2PDF 1 "November 21, 2006" Pandoc "User Manuals" .TH MARKDOWN2PDF 1 "December 15, 2006" Pandoc "User Manuals"
.SH NAME .SH NAME
markdown2pdf \- converts markdown-formatted text to PDF, using pdflatex markdown2pdf \- converts markdown-formatted text to PDF, using pdflatex
.SH SYNOPSIS .SH SYNOPSIS
\fBmarkdown2pdf\fR [\fIoptions\fR] [\fB\-o\fR \fIoutput-file\fR] \fBmarkdown2pdf\fR [\fIoptions\fR] [\fIinput-file\fR]...
[\fIinput-file\fR]... [\fB\-\-\fR] [\fIpandoc\-opts\fR]
.SH DESCRIPTION .SH DESCRIPTION
\fBmarkdown2pdf\fR converts \fIinput\-file\fR (or text from STDIN) from \fBmarkdown2pdf\fR converts \fIinput\-file\fR (or text from standard
markdown\-formatted plain text to PDF, using \fBpdflatex\fR. If no output input) from markdown\-formatted plain text to PDF, using \fBpdflatex\fR.
filename is specified, the name of the output file is derived from the If no output filename is specified, the name of the output file is
input file; thus, for example, if the input file is \fIhello.txt\fR, derived from the input file; thus, for example, if the input file
the output file will be \fIhello.pdf\fR. If the input is read from STDIN is \fIhello.txt\fR, the output file will be \fIhello.pdf\fR. If
and no output filename is specified, the output file will be named the input is read from STDIN and no output filename is
\fIstdin.pdf\fR. If multiple input files are specified, they will be specified, the output file will be named \fIstdin.pdf\fR. If
concatenated before conversion, and the name of the output file will be multiple input files are specified, they will be concatenated before
derived from the first input file. conversion, and the name of the output file will be derived from
the first input file.
.PP .PP
\fBmarkdown2pdf\fR is a wrapper for \fBpandoc\fR. Input is assumed to be in the UTF\-8 character encoding. If your
local character encoding is not UTF\-8, you should pipe input and
output through \fBiconv\fR:
.IP
.B iconv \-t utf\-8 input.txt | pandoc | iconv \-f utf\-8
.PP
\fBmarkdown2pdf\fR assumes that the 'unicode' package
is in latex's search path. If this package is not included in your
latex setup, it can be obtained from <http://ctan.org>.
.PP
\fBmarkdown2pdf\fR is a wrapper around \fBmarkdown2latex\fR.
.SH OPTIONS .SH OPTIONS
.TP .TP
.B \-h .B \-o FILE, \-\-output=FILE
Write output to \fIFILE\fR.
.TP
.B \-p, \-\-preserve-tabs
Preserve tabs instead of converting them to spaces.
.TP
.B \-\-tab-stop=\fITABSTOP\fB
Specify tab stop (default is 4).
.TP
.B \-R, \-\-parse-raw
Parse untranslatable LaTeX environments as raw LaTeX,
instead of ignoring them.
.TP
.B \-N, \-\-number-sections
Number section headings in LaTeX output. (Default is not to number them.)
.TP
.B \-H \fIFILE\fB, \-\-include-in-header=\fIFILE\fB
Include (LaTeX) contents of \fIFILE\fR at the end of the header. Implies
\fB\-s\fR.
.TP
.B \-B \fIFILE\fB, \-\-include-before-body=\fIFILE\fB
Include (LaTeX) contents of \fIFILE\fR at the beginning of the document body.
.TP
.B \-A \fIFILE\fB, \-\-include-after-body=\fIFILE\fB
Include (LaTeX) contents of \fIFILE\fR at the end of the document body.
.TP
.B \-C \fIFILE\fB, \-\-custom-header=\fIFILE\fB
Use contents of \fIFILE\fR
as the LaTeX document header (overriding the default header, which can be
printed using '\fBpandoc \-D latex\fR'). Implies \fB-s\fR.
.TP
.B \-v, \-\-version
Print version.
.TP
.B \-h, \-\-help
Show usage message. Show usage message.
.TP
.B \-o \fIoutput-file\fR
Specify name of output (PDF) file.
.TP
.I pandoc\-opts
Any options appearing after \fIinput\-file\fR... on the command line
will be passed directly to \fBpandoc\fR. If no \fIinput-file\fR
is specified, these options must be preceded by ` \fB\-\-\fR '.
(In other cases, ` \fB\-\-\fR ' is optional.) See \fBpandoc\fR(1)
for a list of options that may be used. Example:
.IP
markdown2pdf input.txt \-\- \-\-custom\-header=letterhead.tex
.SH "SEE ALSO" .SH "SEE ALSO"
\fBpandoc\fR(1), \fBpandoc\fR(1),
\fBmarkdown2html\fR(1),
\fBhtml2markdown\fR(1),
\fBmarkdown2latex\fR(1),
\fBlatex2markdown\fR(1),
\fBpdflatex\fR(1) \fBpdflatex\fR(1)
.SH AUTHOR .SH AUTHOR
John MacFarlane and Recai Oktas John MacFarlane and Recai Oktas

1
man/man1/markdown2rst.1 Normal file
View file

@ -0,0 +1 @@
.so man1/pandoc.1

1
man/man1/markdown2rtf.1 Normal file
View file

@ -0,0 +1 @@
.so man1/pandoc.1

1
man/man1/markdown2s5.1 Normal file
View file

@ -0,0 +1 @@
.so man1/pandoc.1

View file

@ -1,18 +1,23 @@
.TH PANDOC 1 "November 21, 2006" Pandoc "User Manuals" .TH PANDOC 1 "December 15, 2006" Pandoc "User Manuals"
.SH NAME .SH NAME
pandoc \- general markup converter pandoc, markdown2html, markdown2latex, markdown2rst, markdown2rtf,
markdown2s5, html2markdown, latex2markdown, rst2markdown \- general
markup converter
.SH SYNOPSIS .SH SYNOPSIS
\fBpandoc\fR [\fIoptions\fR] [\fIinput\-file\fR]... \fBpandoc\fR [\fIoptions\fR] [\fIinput\-file\fR]...
.SH DESCRIPTION .SH DESCRIPTION
\fIPandoc\fR converts files from one markup format to another. It can \fBPandoc\fR converts files from one markup format to another. It can
read markdown and (subsets of) reStructuredText, HTML, and LaTeX, and read markdown and (subsets of) reStructuredText, HTML, and LaTeX, and
it can write markdown, reStructuredText, HTML, LaTeX, RTF, and S5 HTML it can write markdown, reStructuredText, HTML, LaTeX, RTF, and S5 HTML
slide shows. slide shows.
.PP .PP
If no \fIinput\-file\fR is specified, input is read from STDIN. Otherwise, If no \fIinput\-file\fR is specified, input is read from STDIN.
the \fIinput\-files\fR are concatenated (with a blank line between each) Otherwise, the \fIinput\-files\fR are concatenated (with a blank
and used as input. Output goes to STDOUT. If you want output to a file, line between each) and used as input. Output goes to standard
use shell redirection: output. If you want output to a file, use the \fB\-o\fR option or
shell redirection:
.IP
.B pandoc \-o output.html input.txt
.IP .IP
.B pandoc input.txt > output.html .B pandoc input.txt > output.html
.PP .PP
@ -25,6 +30,19 @@ formats can be specified using command\-line options. For example,
converts \fIchap1.tex\fR from LaTeX to markdown\-formatted plain text. converts \fIchap1.tex\fR from LaTeX to markdown\-formatted plain text.
See below for a detailed list of command\-line options. See below for a detailed list of command\-line options.
.PP .PP
For convenience, eight variant programs are available:
\fBmarkdown2html\fR (same as \fBpandoc \-w html\fR),
\fBmarkdown2latex\fR (same as \fBpandoc \-w latex\fR),
\fBmarkdown2rst\fR (same as \fBpandoc \-w rst\fR),
\fBmarkdown2rtf\fR (same as \fBpandoc \-w rtf\fR),
\fBmarkdown2s5\fR (same as \fBpandoc \-w s5\fR),
\fBhtml2markdown\fR (same as \fBpandoc \-r html \-w markdown\fR),
\fBlatex2markdown\fR (same as \fBpandoc \-r latex \-w markdown\fR),
and \fBrst2markdown\fR (same as \fBpandoc \-r rst \-w markdown\fR).
These programs take an appropriately restricted subset of \fBpandoc\fR's
options. (Run them with the \fB-h\fR flag for a full list of allowed
options.)
.PP
\fIPandoc\fR uses the UTF\-8 character encoding for both input and output. \fIPandoc\fR uses the UTF\-8 character encoding for both input and output.
If your local character encoding is not UTF\-8, you should pipe input If your local character encoding is not UTF\-8, you should pipe input
and output through \fBiconv\fR: and output through \fBiconv\fR:
@ -33,61 +51,58 @@ and output through \fBiconv\fR:
.SH OPTIONS .SH OPTIONS
.TP .TP
.B \-v, \-\-version .B \-f \fIFORMAT\fB, \-r \fIFORMAT\fB, \-\-from=\fIFORMAT\fB, \-\-read=\fIFORMAT\fB
Print version.
.TP
.B \-h, \-\-help
Show usage message.
.TP
.B \-f FORMAT, \-r FORMAT, \-\-from=FORMAT, \-\-read=FORMAT
Specify input format. Specify input format.
.I FORMAT .I FORMAT
can be can be
.I native .B native
(native Haskell), (native Haskell),
.I markdown .B markdown
(markdown or plain text), (markdown or plain text),
.I rst .B rst
(reStructuredText), (reStructuredText),
.I html .B html
(HTML), (HTML),
or or
.I latex .B latex
(LaTeX). (LaTeX).
.TP .TP
.B \-t FORMAT, \-w FORMAT, \-\-to=FORMAT, \-\-write=FORMAT .B \-t \fIFORMAT\fB, \-w \fIFORMAT\fB, \-\-to=\fIFORMAT\fB, \-\-write=\fIFORMAT\fB
Specify output format. Specify output format.
.I FORMAT .I FORMAT
can be can be
.I native .B native
(native Haskell), (native Haskell),
.I markdown .B markdown
(markdown or plain text), (markdown or plain text),
.I rst .B rst
(reStructuredText), (reStructuredText),
.I html .B html
(HTML), (HTML),
.I latex .B latex
(LaTeX), (LaTeX),
.I s5 .B s5
(S5 HTML and javascript slide show), (S5 HTML and javascript slide show),
or or
.I rtf .B rtf
(rich text format). (rich text format).
.TP .TP
.B \-s, \-\-standalone .B \-s, \-\-standalone
Produce output with an appropriate header and footer (e.g. a Produce output with an appropriate header and footer (e.g. a
standalone HTML, LaTeX, or RTF file, not a fragment). standalone HTML, LaTeX, or RTF file, not a fragment).
.TP .TP
.B \-o FILE, \-\-output=FILE
Write output to \fIFILE\fR instead of STDOUT.
.TP
.B \-p, \-\-preserve-tabs .B \-p, \-\-preserve-tabs
Preserve tabs instead of converting them to spaces. Preserve tabs instead of converting them to spaces.
.TP .TP
.B \-\-tab-stop=TABSTOP .B \-\-tab-stop=\fITABSTOP\fB
Specify tab stop (default is 4). Specify tab stop (default is 4).
.TP .TP
.B \-R, \-\-parse-raw .B \-R, \-\-parse-raw
Parse untranslatable HTML codes and LaTeX environments as raw HTML or Parse untranslatable HTML codes and LaTeX environments as raw HTML
LaTeX, instead of ignoring them. or LaTeX, instead of ignoring them.
.TP .TP
.B \-S, \-\-smartypants .B \-S, \-\-smartypants
Use smart quotes, dashes, and ellipses in HTML output. Use smart quotes, dashes, and ellipses in HTML output.
@ -99,41 +114,50 @@ Use ASCIIMathML to display embedded LaTeX math in HTML output.
Make list items in S5 display incrementally (one by one). Make list items in S5 display incrementally (one by one).
.TP .TP
.B \-N, \-\-number-sections .B \-N, \-\-number-sections
Number section headings in LaTeX output. (Default is not to number them.) Number section headings in LaTeX output. (Default is not to number
them.)
.TP .TP
.B \-c CSS, \-\-css=CSS .B \-c \fICSS\fB, \-\-css=\fICSS\fB
Link to a CSS style sheet. Link to a CSS style sheet.
.I CSS .I CSS
is the pathname of the style sheet. is the pathname of the style sheet.
.TP .TP
.B \-H FILENAME, \-\-include-in-header=FILENAME .B \-H \fIFILE\fB, \-\-include-in-header=\fIFILE\fB
Include contents of \fIFILENAME\fR at the end of the header. Implies Include contents of \fIFILE\fR at the end of the header. Implies
\fB\-s\fR. \fB\-s\fR.
.TP .TP
.B \-B FILENAME, \-\-include-before-body=FILENAME .B \-B \fIFILE\fB, \-\-include-before-body=\fIFILE\fB
Include contents of \fIFILENAME\fR at the beginning of the document body. Include contents of \fIFILE\fR at the beginning of the document
body.
.TP .TP
.B \-A FILENAME, \-\-include-after-body=FILENAME .B \-A \fIFILE\fB, \-\-include-after-body=\fIFILE\fB
Include contents of \fIFILENAME\fR at the end of the document body. Include contents of \fIFILE\fR at the end of the document body.
.TP .TP
.B \-\-custom-header=FILENAME .B \-C \fIFILE\fB, \-\-custom-header=\fIFILE\fB
Use contents of \fIFILENAME\fR Use contents of \fIFILE\fR as the document header (overriding the
as the document header (overriding the default header, which can be default header, which can be printed by using the \fB\-D\fR option).
printed by using the \fB\-D\fR option). Implies Implies \fB-s\fR.
\fB-s\fR.
.TP .TP
.B \-D FORMAT, \-\-print-default-header=FORMAT .B \-D \fIFORMAT\fB, \-\-print-default-header=\fIFORMAT\fB
Print the default header for \fIFORMAT\fR Print the default header for \fIFORMAT\fR (\fIhtml, s5, latex,
(\fIhtml, s5, latex, markdown, rst, rtf\fR). markdown, rst, rtf\fR).
.TP .TP
.B \-T STRING, \-\-title-prefix=STRING .B \-T \fISTRING\fB, \-\-title-prefix=\fISTRING\fB
Specify \fISTRING\fR as a prefix to the HTML window title. Specify \fISTRING\fR as a prefix to the HTML window title.
.TP
.B \-d, \-\-debug
Print debugging information (names of input and output files) to
STDERR. Write output to STDOUT, even if an output file was specified
using the \fB\-o\fR option.
.TP
.B \-v, \-\-version
Print version.
.TP
.B \-h, \-\-help
Show usage message.
.SH "SEE ALSO" .SH "SEE ALSO"
\fBmarkdown2html\fR(1), \fBweb2markdown\fR(1),
\fBhtml2markdown\fR(1),
\fBmarkdown2latex\fR(1),
\fBlatex2markdown\fR(1),
\fBmarkdown2pdf\fR(1), \fBmarkdown2pdf\fR(1),
\fBiconv\fR(1) \fBiconv\fR(1)

1
man/man1/rst2markdown.1 Normal file
View file

@ -0,0 +1 @@
.so man1/pandoc.1

82
man/man1/web2markdown.1 Normal file
View file

@ -0,0 +1,82 @@
.TH WEB2MARKDOWN 1 "December 15, 2006" Pandoc "User Manuals"
.SH NAME
web2markdown \- converts HTML to markdown-formatted text
.SH SYNOPSIS
\fBweb2markdown\fR [\fIoptions\fR] [\fIinput\-file\fR or \fIURL\fR]
.SH DESCRIPTION
\fBweb2markdown\fR converts \fIinput\-file\fR or \fIURL\fR (or text
from STDIN) from HTML to markdown\-formatted plain text.
If a URL is specified, \fBweb2markdown\fR uses an available program
(e.g. wget, w3m, lynx or curl) to fetch its contents. Output is sent
to STDOUT unless an output file is specified using the \fB\-o\fR
option.
.PP
\fBweb2markdown\fR uses the character encoding specified in the
"Content-type" meta tag. If this is not present, or if input comes
from STDIN, UTF-8 is assumed. A character encoding may be specified
explicitly using the \fB\-e\fR option.
.PP
\fBweb2markdown\fR is a wrapper for \fBhtml2markdown\fR.
.SH OPTIONS
.TP
.B \-s, \-\-standalone
Include title, author, and date information (if present) at the
top of markdown output.
.TP
.B \-o \fIFILE\fB, \-\-output=\fIFILE\fB
Write output to \fIFILE\fR instead of STDOUT.
.TP
.B \-p, \-\-preserve-tabs
Preserve tabs instead of converting them to spaces.
.TP
.B \-\-tab-stop=\fITABSTOP\fB
Specify tab stop (default is 4).
.TP
.B \-R, \-\-parse-raw
Parse untranslatable HTML codes as raw HTML.
.TP
.B \-H \fIFILE\fB, \-\-include-in-header=\fIFILE\fB
Include contents of \fIFILE\fR at the end of the header. Implies
\fB\-s\fR.
.TP
.B \-B \fIFILE\fB, \-\-include-before-body=\fIFILE\fB
Include contents of \fIFILE\fR at the beginning of the document body.
.TP
.B \-A \fIFILE\fB, \-\-include-after-body=\fIFILE\fB
Include contents of \fIFILE\fR at the end of the document body.
.TP
.B \-C \fIFILE\fB, \-\-custom-header=\fIFILE\fB
Use contents of \fIFILE\fR
as the document header (overriding the default header, which can be
printed using '\fBpandoc \-D markdown\fR'). Implies
\fB\-s\fR.
.TP
.B \-v, \-\-version
Print version.
.TP
.B \-h, \-\-help
Show usage message.
.TP
.B \-e \fIencoding\fR
Assume the character encoding \fIencoding\fR in reading HTML.
(Note: \fIencoding\fR will be passed to \fBiconv\fR; a list of
available encodings may be obtained using `\fBiconv \-l\fR'.)
If the \fB\-e\fR option is not specified and input is not from
STDIN, \fBweb2markdown\fR will try to extract the character encoding
from the "Content-type" meta tag. If no character encoding is
specified in this way, or if input is from STDIN, UTF-8 will be
assumed.
.TP
.B \-g \fIcommand\fR
Use \fIcommand\fR to fetch the contents of a URL. (By default,
\fBweb2markdown\fR searches for an available program or text-based
browser to fetch the contents of a URL.) For example:
.IP
web2markdown \-g 'wget \-\-user=foo \-\-password=bar' mysite.com
.SH "SEE ALSO"
\fBpandoc\fR(1),
\fBhtml2markdown\fR(1),
\fBiconv\fR(1)
.SH AUTHOR
John MacFarlane and Recai Oktas

View file

@ -45,6 +45,7 @@ import Text.Pandoc.Writers.DefaultHeaders ( defaultHtmlHeader,
defaultRTFHeader, defaultS5Header, defaultLaTeXHeader ) defaultRTFHeader, defaultS5Header, defaultLaTeXHeader )
import Text.Pandoc.Definition import Text.Pandoc.Definition
import Text.Pandoc.Shared import Text.Pandoc.Shared
import Text.Regex ( mkRegex, splitRegex )
import System ( exitWith, getArgs, getProgName ) import System ( exitWith, getArgs, getProgName )
import System.Exit import System.Exit
import System.Console.GetOpt import System.Console.GetOpt
@ -57,6 +58,9 @@ import Control.Monad ( (>>=) )
version :: String version :: String
version = "0.3" version = "0.3"
copyrightMessage :: String
copyrightMessage = "\nCopyright (C) 2006 John MacFarlane\nWeb: http://sophos.berkeley.edu/macfarlane/pandoc\nThis is free software; see the source for copying conditions. There is no\nwarranty, not even for merchantability or fitness for a particular purpose."
-- | Association list of formats and readers. -- | Association list of formats and readers.
readers :: [(String, ParserState -> String -> Pandoc)] readers :: [(String, ParserState -> String -> Pandoc)]
readers = [("native" , readPandoc) readers = [("native" , readPandoc)
@ -101,10 +105,13 @@ data Opt = Opt
, optCustomHeader :: String -- ^ Custom header to use, or "DEFAULT" , optCustomHeader :: String -- ^ Custom header to use, or "DEFAULT"
, optDefaultHeader :: String -- ^ Default header , optDefaultHeader :: String -- ^ Default header
, optTitlePrefix :: String -- ^ Optional prefix for HTML title , optTitlePrefix :: String -- ^ Optional prefix for HTML title
, optOutputFile :: String -- ^ Name of output file
, optNumberSections :: Bool -- ^ If @True@, number sections in LaTeX , optNumberSections :: Bool -- ^ If @True@, number sections in LaTeX
, optIncremental :: Bool -- ^ If @True@, incremental lists in S5 , optIncremental :: Bool -- ^ If @True@, incremental lists in S5
, optSmart :: Bool -- ^ If @True@, use smart typography , optSmart :: Bool -- ^ If @True@, use smart typography
, optASCIIMathML :: Bool -- ^ If @True@, use ASCIIMathML in HTML , optASCIIMathML :: Bool -- ^ If @True@, use ASCIIMathML in HTML
, optShowUsage :: Bool -- ^ If @True@, show usage message
, optDebug :: Bool -- ^ If @True@, output debug messages
} }
-- | Defaults for command-line options. -- | Defaults for command-line options.
@ -123,32 +130,20 @@ startOpt = Opt
, optCustomHeader = "DEFAULT" , optCustomHeader = "DEFAULT"
, optDefaultHeader = defaultHtmlHeader , optDefaultHeader = defaultHtmlHeader
, optTitlePrefix = "" , optTitlePrefix = ""
, optOutputFile = "" -- null for stdout
, optNumberSections = False , optNumberSections = False
, optIncremental = False , optIncremental = False
, optSmart = False , optSmart = False
, optASCIIMathML = False , optASCIIMathML = False
, optShowUsage = False
, optDebug = False
} }
-- | A list of functions, each transforming the options data structure in response -- | A list of functions, each transforming the options data structure in response
-- to a command-line option. -- to a command-line option.
options :: [OptDescr (Opt -> IO Opt)] allOptions :: [OptDescr (Opt -> IO Opt)]
options = allOptions =
[ Option "v" ["version"] [ Option "fr" ["from","read"]
(NoArg
(\_ -> do
hPutStrLn stderr ("Version " ++ version)
exitWith ExitSuccess))
"Print version"
, Option "h" ["help"]
(NoArg
(\_ -> do
prg <- getProgName
hPutStrLn stderr (usageInfo (prg ++ " [OPTIONS] [FILES] - convert FILES from one markup format to another\nIf no OPTIONS specified, converts from markdown to html.\nIf no FILES specified, input is read from STDIN.\nOptions:") options)
exitWith ExitSuccess))
"Show help"
, Option "fr" ["from","read"]
(ReqArg (ReqArg
(\arg opt -> case (lookup (map toLower arg) readers) of (\arg opt -> case (lookup (map toLower arg) readers) of
Just reader -> return opt { optReader = reader } Just reader -> return opt { optReader = reader }
@ -172,6 +167,13 @@ options =
(\opt -> return opt { optStandalone = True })) (\opt -> return opt { optStandalone = True }))
"Include needed header and footer on output" "Include needed header and footer on output"
, Option "o" ["output"]
(ReqArg
(\arg opt -> do
return opt { optOutputFile = arg })
"FILENAME")
"Name of output file"
, Option "p" ["preserve-tabs"] , Option "p" ["preserve-tabs"]
(NoArg (NoArg
(\opt -> return opt { optPreserveTabs = True })) (\opt -> return opt { optPreserveTabs = True }))
@ -241,7 +243,7 @@ options =
"FILENAME") "FILENAME")
"File to include after document body" "File to include after document body"
, Option "" ["custom-header"] , Option "C" ["custom-header"]
(ReqArg (ReqArg
(\arg opt -> do (\arg opt -> do
text <- readFile arg text <- readFile arg
@ -263,18 +265,87 @@ options =
let header = case (lookup arg writers) of let header = case (lookup arg writers) of
Just (writer, head) -> head Just (writer, head) -> head
Nothing -> error ("Unknown reader: " ++ arg) Nothing -> error ("Unknown reader: " ++ arg)
hPutStrLn stdout header hPutStr stdout header
exitWith ExitSuccess) exitWith ExitSuccess)
"FORMAT") "FORMAT")
"Print default header for FORMAT" "Print default header for FORMAT"
, Option "d" ["debug"]
(NoArg
(\opt -> return opt { optDebug = True }))
"Print debug messages to stderr, output to stdout"
, Option "v" ["version"]
(NoArg
(\_ -> do
prg <- getProgName
hPutStrLn stderr (prg ++ " " ++ version ++
copyrightMessage)
exitWith $ ExitFailure 2))
"Print version"
, Option "h" ["help"]
(NoArg
(\opt -> return opt { optShowUsage = True }))
"Show help"
] ]
-- | Derive default reader and writer descriptions from the name of the
-- calling program.  The lower-cased name is split at every '2'; when that
-- yields exactly two pieces (as in "html2latex") they are returned as
-- (reader, writer).  Any other shape falls back to ("markdown", "html").
parseProgName name =
  let pieces = splitRegex (mkRegex "2") (map toLower name)
  in  case pieces of
        [from, to] -> (from, to)
        _          -> ("markdown", "html")
-- | Set default options from reader and writer descriptions.  @start@ is the
-- starting option record.  It is returned unchanged unless @from@ is found in
-- 'readers' AND @to@ is found in 'writers'; in that case the reader, the
-- writer and the writer's default header are filled in.
setDefaultOpts from to start =
  case lookup from readers of
    Nothing     -> start
    Just reader ->
      case lookup to writers of
        Nothing               -> start
        Just (writer, header) -> start { optReader        = reader
                                       , optWriter        = writer
                                       , optDefaultHeader = header
                                       }
-- | True if any of the option's single-letter flags occurs in @list@.
-- Long option names, the argument descriptor and the help text are ignored.
inOptList :: [Char] -> OptDescr (Opt -> IO Opt) -> Bool
inOptList list (Option letters _ _ _) = any (`elem` list) letters
-- Reformat usage message so it doesn't wrap illegibly
-- Built from three gsub rewrites joined with (.), so they are applied in
-- bottom-to-top order: the last one listed runs first on the usage text.
-- NOTE(review): gsub is not defined in this chunk; presumably it takes a
-- regex, a replacement (with \1 back-references) and the input string --
-- confirm against its definition before changing any of the patterns.
reformatUsageInfo = gsub " *--" " --" .
gsub "(-[A-Za-z0-9]) *--" "\\1, --" .
gsub " *([^- ])" "\n\t\\1"
main = do main = do
name <- getProgName
let (from, to) = parseProgName name
let irrelevantOptions = if not ('2' `elem` name)
then ""
else "frtwD" ++
(if (to /= "html" && to /= "s5") then "SmcT" else "") ++
(if (to /= "latex") then "N" else "") ++
(if (to /= "s5") then "i" else "") ++
(if (from /= "html" && from /= "latex") then "R" else "")
let options = filter (not . inOptList irrelevantOptions) allOptions
let defaultOpts = setDefaultOpts from to startOpt
args <- getArgs args <- getArgs
let (actions, sources, errors) = getOpt RequireOrder options args let (actions, sources, errors) = getOpt Permute options args
if (not (null errors))
then do
mapM (\e -> hPutStrLn stderr e) errors
hPutStrLn stderr (reformatUsageInfo $
usageInfo (name ++ " [OPTIONS] [FILES]") options)
exitWith $ ExitFailure 2
else
return ()
-- thread option data structure through all supplied option actions -- thread option data structure through all supplied option actions
opts <- foldl (>>=) (return startOpt) actions opts <- foldl (>>=) (return defaultOpts) actions
let Opt { optPreserveTabs = preserveTabs let Opt { optPreserveTabs = preserveTabs
, optTabStop = tabStop , optTabStop = tabStop
@ -289,12 +360,31 @@ main = do
, optCustomHeader = customHeader , optCustomHeader = customHeader
, optDefaultHeader = defaultHeader , optDefaultHeader = defaultHeader
, optTitlePrefix = titlePrefix , optTitlePrefix = titlePrefix
, optOutputFile = outputFile
, optNumberSections = numberSections , optNumberSections = numberSections
, optIncremental = incremental , optIncremental = incremental
, optSmart = smart , optSmart = smart
, optASCIIMathML = asciiMathML , optASCIIMathML = asciiMathML
, optShowUsage = showUsage
, optDebug = debug
} = opts } = opts
if showUsage
then do
hPutStr stderr (reformatUsageInfo $ usageInfo (name ++ " [OPTIONS] [FILES]") options)
exitWith $ ExitFailure 2
else return ()
output <- if ((null outputFile) || debug)
then return stdout
else openFile outputFile WriteMode
if debug
then do
hPutStrLn stderr ("OUTPUT=" ++ outputFile)
hPutStr stderr $ concatMap (\s -> "INPUT=" ++ s ++ "\n") sources
else return ()
let writingS5 = (defaultHeader == defaultS5Header) let writingS5 = (defaultHeader == defaultS5Header)
let tabFilter = if preserveTabs then id else (tabsToSpaces tabStop) let tabFilter = if preserveTabs then id else (tabsToSpaces tabStop)
let addBlank str = str ++ "\n\n" let addBlank str = str ++ "\n\n"
@ -323,13 +413,13 @@ main = do
writerIncludeBefore = includeBefore, writerIncludeBefore = includeBefore,
writerIncludeAfter = includeAfter } writerIncludeAfter = includeAfter }
(readSources sources) >>= (putStr . encodeUTF8 . (writer writerOptions) . (readSources sources) >>= (hPutStr output . encodeUTF8 .
(writer writerOptions) .
(reader startParserState) . filter . (reader startParserState) . filter .
decodeUTF8 . (joinWithSep "\n")) decodeUTF8 . (joinWithSep "\n")) >> hClose output
where where
readSources [] = mapM readSource ["-"] readSources [] = mapM readSource ["-"]
readSources sources = mapM readSource sources readSources sources = mapM readSource sources
readSource "-" = getContents readSource "-" = getContents
readSource source = readFile source readSource source = readFile source

View file

@ -1,7 +0,0 @@
# Verify that every non-empty positional argument names an existing regular
# file; report the first one that does not and exit with status 1.
for f in "$@"; do
    # Empty arguments and existing files need no further attention.
    if [ -z "$f" ] || [ -f "$f" ]; then
        continue
    fi
    err "File '$f' not found."
    exit 1
done

View file

@ -8,22 +8,6 @@ WRAPPEE_ARGS=
err () { echo "$*" | fold -s -w ${COLUMNS:-110} >&2; } err () { echo "$*" | fold -s -w ${COLUMNS:-110} >&2; }
errn () { printf "$*" | fold -s -w ${COLUMNS:-110} >&2; } errn () { printf "$*" | fold -s -w ${COLUMNS:-110} >&2; }
# Print a two-line usage message through err (which writes to stderr).
# Arguments: the synopsis text; all arguments are stored together in one
# string and shown after the program name taken from $THIS.
usage () {
synopsis="$@"
err "Usage: $THIS $synopsis"
err "See $THIS(1) man file for details."
}
# Run pandoc with the options packed in WRAPPEE_ARGS (if any) placed in front
# of this function's own arguments.
# NOTE(review): NEWLINE is defined outside this block; presumably it holds a
# single newline character -- confirm.
runpandoc () {
if [ -n "$WRAPPEE_ARGS" ]; then
# Unpack arguments that will be passed to pandoc.
# IFS is switched to NEWLINE only for the unquoted expansion of WRAPPEE_ARGS,
# then restored, so packed arguments containing spaces or tabs stay whole.
oldifs="$IFS"; IFS="$NEWLINE"; set -- $WRAPPEE_ARGS "$@"; IFS="$oldifs"
case "$1" in --) shift;; esac # tolerate the existence of a leading '--'
fi
pandoc "$@"
}
# Portable which(1). # Portable which(1).
pathfind () { pathfind () {
oldifs="$IFS"; IFS=':' oldifs="$IFS"; IFS=':'
@ -37,17 +21,6 @@ pathfind () {
return 1 return 1
} }
HAVE_ICONV=
if pathfind iconv; then
HAVE_ICONV=1
alias to_utf8='iconv -t utf-8'
alias from_utf8='iconv -f utf-8'
else
err "Warning: iconv not present. Assuming UTF-8 character encoding."
alias to_utf8='cat'
alias from_utf8='cat'
fi
for p in pandoc $REQUIRED; do for p in pandoc $REQUIRED; do
pathfind $p || { pathfind $p || {
err "You need '$p' to use this program!" err "You need '$p' to use this program!"

View file

@ -1,12 +0,0 @@
# Default option handling: the only option recognised here is -h.
# SYNOPSIS is used as given when already set; otherwise a default is built,
# with "..." appended unless THIS_NARG is non-empty.
if [ -z "$SYNOPSIS" ]; then
SYNOPSIS="[-h] [input_file]"
[ -n "$THIS_NARG" ] || SYNOPSIS="${SYNOPSIS}..."
fi
while getopts h opt; do
case $opt in
# Either -h or the '?' that getopts stores in opt for an unrecognised option:
# print the usage message and exit with status 2.
h|?) usage "$SYNOPSIS"; exit 2 ;;
esac
done
# Drop the parsed options so that "$@" starts at the first non-option argument.
shift $(($OPTIND - 1))

View file

@ -1,134 +0,0 @@
#!/bin/sh -e
# converts html to markdown
# uses an available program to fetch URL and tidy to normalize it first
REQUIRED=tidy
### common.sh
# Fetch the contents of a URL and write them to stdout.
#   $1     URL to fetch (required).
#   $2...  Optional grabber command line: the program followed by its
#          options, e.g. 'wget --user=foo'.  When empty, the first available
#          program from a built-in list is used.
# Progress and error messages go through err/errn.  Returns 1 when no usable
# program can be found; otherwise returns the grabber's own exit status.
grab_url_with () {
    url="${1:?internal error: grab_url_with: url required}"
    shift
    cmdline="$@"

    prog=
    if [ -n "$cmdline" ]; then
        # Let the shell parse the command line exactly once, so that quoting
        # inside it (e.g. --user='foo bar') is honoured.
        eval "set -- $cmdline"
    else
        set --
    fi
    # A command line consisting only of whitespace parses to nothing; fall
    # back to auto-detection instead of failing on 'shift'.
    if [ $# -gt 0 ]; then
        prog=$1
        shift
    fi
    # From here on "$@" holds the user's grabber options, one per parameter.
    # They are deliberately NOT flattened into a string and re-evaluated,
    # which would lose their quoting.

    if [ -z "$prog" ]; then
        # Locate a sensible web grabber (note the order).
        for p in wget lynx w3m curl links w3c; do
            if pathfind $p; then
                prog=$p
                break
            fi
        done
        [ -n "$prog" ] || {
            errn "$THIS: Couldn't find a program to fetch the file from URL "
            err "(e.g. wget, w3m, lynx, w3c, or curl)."
            return 1
        }
    else
        pathfind "$prog" || {
            err "$THIS: No such web grabber '$prog' found; aborting."
            return 1
        }
    fi

    # Setup proper base options for known grabbers.
    base_opts=
    case "$prog" in
        wget)  base_opts="-O-" ;;
        lynx)  base_opts="-source" ;;
        w3m)   base_opts="-dump_source" ;;
        curl)  base_opts="" ;;
        links) base_opts="-source" ;;
        w3c)   base_opts="-n -get" ;;
        *)     err "$THIS: unhandled web grabber '$prog'; hope it succeeds." ;;
    esac

    err "$THIS: invoking '$prog $base_opts $* $url'..."
    # base_opts only ever holds fixed, space-separated words, so it is split
    # unquoted; the user's options are kept intact behind it.
    set -- $base_opts "$@"
    "$prog" "$@" "$url"
}
encoding=
grabber=
nograb=
while getopts e:g:nh opt; do
case $opt in
e) encoding="$OPTARG" ;;
g) grabber="$OPTARG" ;;
n) nograb=1 ;;
h|?)
usage "[-e encoding] [-g grabber_command] [-n] [-h] [input_file|url]"
exit 2 ;;
esac
done
shift $(($OPTIND - 1))
### postopts.sh
### singlearg.sh
inurl=
if [ -n "$1" ] && ! [ -f "$1" ]; then
if [ -n "$nograb" ]; then
err "'$1' not found; refusing to treat input as URL."
exit 1
fi
# Treat given argument as an URL.
inurl="$1"
fi
if [ -n "$inurl" ]; then
err "Attempting to fetch file from '$inurl'..."
### tempdir.sh
grabber_out=$THIS_TEMPDIR/grabber.out
grabber_log=$THIS_TEMPDIR/grabber.log
if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out \
2>$grabber_log; then
errn "grab_url_with failed"
if [ -f $grabber_log ]; then
err " with the following error log."
err
cat >&2 $grabber_log
else
err .
fi
exit 1
fi
set -- $grabber_out
fi
if [ -z "$encoding" ] && [ "x$@" != "x" ]; then
# Try to determine character encoding unless not specified
# and input is STDIN.
encoding=$(
head "$@" |
LC_ALL=C tr 'A-Z' 'a-z' |
sed -ne '/<meta .*content-type.*charset=/ {
s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p
}'
)
fi
if [ -n "$encoding" ] && [ -n "$HAVE_ICONV" ]; then
alias to_utf8='iconv -f "$encoding" -t utf-8'
elif [ -n "$inurl" ]; then # assume web pages are UTF-8
alias to_utf8='cat'
fi # else just use local encoding
to_utf8 "$@" | tidy -utf8 2>/dev/null |
runpandoc -r html -w markdown -s | from_utf8

View file

@ -1,14 +0,0 @@
#!/bin/sh -e
# runs pandoc to convert latex to markdown
### common.sh
### getopts.sh
### postopts.sh
### singlearg.sh
### checkin.sh
to_utf8 "$@" | runpandoc -r latex -w markdown -s | from_utf8

View file

@ -1,12 +0,0 @@
#!/bin/sh -e
# converts markdown to HTML
### common.sh
### getopts.sh
### postopts.sh
### checkin.sh
to_utf8 "$@" | runpandoc | from_utf8

View file

@ -1,12 +0,0 @@
#!/bin/sh -e
# converts markdown to latex
### common.sh
### getopts.sh
### postopts.sh
### checkin.sh
to_utf8 "$@" | runpandoc -w latex -s | from_utf8

View file

@ -1,64 +1,54 @@
#!/bin/sh -e #!/bin/sh -e
# converts markdown to latex, then uses latex to make a PDF
REQUIRED=pdflatex REQUIRED="markdown2latex pdflatex"
### common.sh ### common.sh
outfile=
while getopts o:h opt; do
case $opt in
o) outfile="$OPTARG" ;;
h|?) usage "[-o output_file] [-h] [input_file]..."; exit 2 ;;
esac
done
shift $(($OPTIND - 1))
### postopts.sh
### checkin.sh
if [ -z "$outfile" ]; then
if [ -n "$1" ]; then
outfile="${1%.*}"
else
outfile="stdin" # input is STDIN, since no argument given
fi
fi
case "$outfile" in
*.*) ;; # skip appending extension if one is already present
*) outfile="${outfile%.*}.pdf";;
esac
### tempdir.sh ### tempdir.sh
# We should use a filename without white spaces for pdflatex. texname=output
TEXNAME=$THIS logfile=$THIS_TEMPDIR/log
if ! markdown2latex -s -d "$@" >$THIS_TEMPDIR/$texname.tex 2>$logfile; then
[ -f $logfile ] && sed -e 's/markdown2latex/markdown2pdf/g' \
-e '/^INPUT=/d' -e '/^OUTPUT=/d' $logfile >&2
exit 1
fi
outfile="$(sed -ne 's/^OUTPUT=//p' $logfile)"
IFS="$NEWLINE"
set -- $(sed -ne 's/^INPUT=//p' $logfile)
firstinfilebase="${1%.*}"
defaultdest="${firstinfilebase:-stdin}.pdf"
destname="${outfile:-$defaultdest}"
to_utf8 "$@" | runpandoc -w latex -s >$THIS_TEMPDIR/$TEXNAME.tex
( (
cd $THIS_TEMPDIR cd $THIS_TEMPDIR
if ! pdflatex -interaction=batchmode $TEXNAME.tex >/dev/null 2>&1; then if ! pdflatex -interaction=batchmode $texname.tex >/dev/null 2>&1; then
err "LaTeX errors:" err "LaTeX errors:"
from_utf8 $TEXNAME.log | sed -ne '/^!/,/^ *$/p' >&2 sed -ne '/^!/,/^ *$/p' $texname.log >&2
if grep -q "File \`ucs.sty' not found" $TEXNAME.log; then if grep -q "File \`ucs.sty' not found" $texname.log; then
err "Please install the 'unicode' package from ctan.org." err "Please install the 'unicode' package from CTAN:"
err "http://www.ctan.org/tex-archive/macros/latex/contrib/unicode/"
fi
if grep -q "File \`fancyvrb.sty' not found" $texname.log; then
err "Please install the 'fancyvrb' package from CTAN:"
err "http://www.ctan.org/tex-archive/macros/latex/contrib/fancyvrb/"
fi fi
exit 1 exit 1
fi fi
) ) || exit $?
is_target_exists= is_target_exists=
if [ -f "$outfile" ]; then if [ -f "$destname" ]; then
is_target_exists=1 is_target_exists=1
mv -f "$outfile" "$outfile~" mv "$destname" "$destname~"
fi fi
mv -f $THIS_TEMPDIR/$TEXNAME.pdf "$outfile" mv -f $THIS_TEMPDIR/$texname.pdf "$destname"
errn "Created '$outfile'" errn "Created $destname"
[ -z "$is_target_exists" ] || { [ -z "$is_target_exists" ] || {
errn " (previous file has been backed up as '$outfile~')" errn " (previous file has been backed up as $destname~)"
} }
err . err .

View file

@ -1,17 +0,0 @@
# Split the command line into wrapper arguments (every word before the
# first one that starts with '-') and wrappee (pandoc) arguments (that word
# and everything after it).  Arguments may contain space or tab characters,
# so each list is packed into a single NEWLINE-separated string.
# Limitation: an argument that itself contains a newline is still split.
pick="WRAPPER_ARGS"
while [ $# -gt 0 ]; do
    if [ "$pick" = "WRAPPER_ARGS" ]; then
        case "$1" in
        -*) pick="WRAPPEE_ARGS" ;;
        esac
    fi
    # Append NEWLINE plus the argument to the picked variable.
    # Only the variable *name* is expanded before eval runs; the references
    # to the old value, NEWLINE and $1 are escaped so that eval expands them
    # inside double quotes.  This way quotes, '$' or backquotes occurring in
    # an argument are kept as data and never parsed as shell code.
    eval "$pick=\"\${$pick}\${NEWLINE}\${1}\""
    shift
done
# Unpack filename arguments.  Now "$@" will hold the filenames.
oldifs="$IFS"; IFS="$NEWLINE"; set -- $WRAPPER_ARGS; IFS="$oldifs"

View file

@ -1,7 +0,0 @@
# Ensure to work with a single argument: keep the first one, warn about
# the others and drop them.
if [ $# -gt 1 ]; then
    first_arg="$1"
    shift
    # "$*" joins the extra arguments into one word for the message.
    err "Warning: extra arguments '$*' will be ignored."
    # Quote the saved argument so that a name containing spaces is restored
    # as one positional parameter instead of being split again.
    set -- "$first_arg"
fi

View file

@ -1,141 +0,0 @@
#!/bin/sh
# Test driver: feeds the wrapper option-parsing code to several shells and
# compares what each shell reports against the expected output.
# First command-line argument.  NOTE(review): THIS is not referenced again
# in the visible part of this script -- confirm whether it is still needed.
THIS=$1
# Command lines used to start each candidate shell; the -s option makes the
# shell read its script from standard input.
# NOTE(review): ASH and POSH are defined here but are not in the list of
# shells iterated over at the bottom of the script.
ASH="ash -s"
BASH="bash --posix -s"
DASH="dash -s"
KSH="ksh -s"
POSH="posh -s"
ZSH="zsh -s"
# NOTE(review): ERROR is not referenced in the visible part of this script.
ERROR=""
# Run the option-parsing code under the shell command line held in $SH
# (set by the loop at the bottom of this script), passing the function's
# arguments through after '--'.  The script text is the quoted here-document
# below, which the shell reads from standard input.
# The here-document parses a -o option with getopts, prints the option and
# the remaining wrapper arguments, defines a stub 'pandoc' function that
# prints the arguments it receives, and finally calls 'runpandoc'.
# NOTE(review): 'runpandoc' is not defined in the visible text; presumably
# the '### common.sh' / '### postopts.sh' marker lines are replaced by the
# contents of those files before this script runs -- confirm.
wrapper () {
$SH -- "$@" <<-'EOF'
### common.sh
outfile=
while getopts o: opt; do
case $opt in
o) outfile="$OPTARG" ;;
esac
done
shift $(($OPTIND - 1))
### postopts.sh
echo "Options passed to wrapper:"
[ -z "$outfile" ] || echo "|$outfile|"
echo "Arguments passed to wrapper:"
for arg; do
echo "|$arg|"
done
pandoc () {
echo "Arguments passed to wrappee:"
for arg; do
echo "|$arg|"
done
}
runpandoc
EOF
}
# Stand-in for which(1) that prints nothing: succeeds (status 0) when the
# given name is an executable regular file in one of the $PATH directories,
# fails (status 1) otherwise.  IFS is restored before returning.
pathfind () {
    oldifs="$IFS"
    IFS=':'
    for _p in $PATH; do
        # Skip directories where the candidate is missing or not a plain
        # file, and those where it is not executable.
        [ -f "$_p/$*" ] || continue
        [ -x "$_p/$*" ] || continue
        IFS="$oldifs"
        return 0
    done
    IFS="$oldifs"
    return 1
}
# Compare an actual result ($1) with the expected one ($2).
# On a match, print "ok" to stderr and return 0.  On a mismatch, print
# "failed" followed by a report showing the command line ($3), the expected
# text and the actual text to stderr, then return 1.
check_results () {
    if [ "$1" != "$2" ]; then
        echo >&2 failed
        sed "s/^/\t/" >&2 <<EOF
Command line: '$3'
===> Expected:
$2
<=== Got:
$1
EOF
        return 1
    fi
    echo >&2 ok
    return 0
}
# Run the three option-parsing test cases under every listed shell that is
# installed; shells that cannot be found are skipped with a warning.
# Because of 'set -e', the first failing check_results aborts the script
# with a non-zero status.
for SH in "$BASH" "$DASH" "$KSH" "$ZSH"; do
    # The program name is the first word of the shell command line.
    CMD=${SH%% *}
    echo >&2 " Testing with $CMD..."
    if pathfind "$CMD"; then
        if [ "$CMD" = "zsh" ]; then
            # Zsh needs to be called as 'sh' to enable POSIX mode.
            # Install the cleanup trap before creating the link so that an
            # early signal cannot leave the link behind.
            trap 'err=$?; rm -f ./sh; exit $err' 0 1 2 3 13 15
            # 'command -v' is the POSIX way to locate a program; the result
            # is quoted so a path containing spaces stays a single word.
            ln -s "$(command -v zsh)" ./sh
            SH="./sh ${SH#* }"
        fi
        set -e
        # Test 1
        printf >&2 " test case 1... "
        actual=$(wrapper -o "output file" "foo bar" -A "quux baz" -B)
        expected=$(cat <<'EOF'
Options passed to wrapper:
|output file|
Arguments passed to wrapper:
|foo bar|
Arguments passed to wrappee:
|-A|
|quux baz|
|-B|
EOF
)
        check_results "$actual" "$expected" \
            'wrapper -o "output file" "foo bar" -A "quux baz" -B'
        # Test 2
        printf >&2 " test case 2... "
        actual=$(wrapper -- -A "foo bar")
        expected=$(cat <<'EOF'
Options passed to wrapper:
Arguments passed to wrapper:
Arguments passed to wrappee:
|-A|
|foo bar|
EOF
)
        check_results "$actual" "$expected" 'wrapper -- -A "foo bar"'
        # Test 3 (Test 1 with a redundant '--')
        printf >&2 " test case 3... "
        actual=$(wrapper -o "output file" "foo bar" -- -A "quux baz" -B)
        expected=$(cat <<'EOF'
Options passed to wrapper:
|output file|
Arguments passed to wrapper:
|foo bar|
Arguments passed to wrappee:
|-A|
|quux baz|
|-B|
EOF
)
        check_results "$actual" "$expected" \
            'wrapper -o "output file" "foo bar" -- -A "quux baz" -B'
    else
        echo >&2 "Warning: cannot verify correctness with $CMD; shell not available"
    fi
done
exit 0

View file

@ -0,0 +1,173 @@
#!/bin/sh -e
# web2markdown: converts HTML from a URL, a local file, or stdin to markdown.
# A URL is fetched with whichever supported grabber program is available;
# the HTML is normalized with tidy before being handed to html2markdown.
# Programs this script depends on.  NOTE(review): presumably checked by the
# code substituted for the common.sh marker below -- confirm.
REQUIRED="tidy html2markdown"
### common.sh
# Fetch a URL with a web grabber program.
#   $1    URL to fetch (required; the call aborts with an internal-error
#         message when it is missing).
#   $2... optional grabber command line: program name followed by extra
#         options.  When empty, the first program found among wget, lynx,
#         w3m, curl, links and w3c is used.
# Runs the grabber with its base options, the extra options and the URL;
# the caller in this script captures its stdout and stderr in files.
# Returns 1 when no usable grabber is found; otherwise the status is that
# of the grabber itself.
grab_url_with () {
url="${1:?internal error: grab_url_with: url required}"
shift
cmdline="$@"
prog=
prog_opts=
if [ -n "$cmdline" ]; then
# Re-parse the command line with shell quoting rules so that the first
# word becomes the program and the remaining words its options.
eval "set -- $cmdline"
prog=$1
shift
prog_opts="$@"
fi
if [ -z "$prog" ]; then
# Locate a sensible web grabber (note the order).
for p in wget lynx w3m curl links w3c; do
if pathfind $p; then
prog=$p
break
fi
done
# NOTE(review): the message below does not mention 'links', although it is
# one of the programs searched for above.
[ -n "$prog" ] || {
errn "$THIS: Couldn't find a program to fetch the file from URL "
err "(e.g. wget, w3m, lynx, w3c, or curl)."
return 1
}
else
# An explicitly requested grabber must exist on the PATH.
pathfind "$prog" || {
err "$THIS: No such web grabber '$prog' found; aborting."
return 1
}
fi
# Setup proper base options for known grabbers.
# An unknown program only triggers a warning and is run without base options.
base_opts=
case "$prog" in
wget) base_opts="-O-" ;;
lynx) base_opts="-source" ;;
w3m) base_opts="-dump_source" ;;
curl) base_opts="" ;;
links) base_opts="-source" ;;
w3c) base_opts="-n -get" ;;
*) err "$THIS: unhandled web grabber '$prog'; hope it succeeds."
esac
err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
# Rebuild the positional parameters from both option strings, again with
# shell quoting rules, then run the grabber with the URL as last argument.
eval "set -- $base_opts $prog_opts"
$prog "$@" "$url"
}
# Remember one more word to pass on to html2markdown: the 'options' list
# grows by a NEWLINE separator followed by the given word.
add_option () {
    options="${options}${NEWLINE}${1}"
}
# State filled in by the argument parser below:
#   options   NEWLINE-separated words to pass through to html2markdown
#   argument  the input file name or URL (empty means stdin)
#   encoding  character encoding given with -e/--encoding
#   grabber   grabber command line given with -g/--grabber
options=
argument=
encoding=
grabber=
# Abort with a diagnostic unless the option in $1 is followed by a value.
# $2 is the number of command-line words left, counting the option itself.
# Without this check a trailing option would make a later 'shift' fail
# with an unhelpful shell error.
require_value () {
    [ "$2" -ge 2 ] || {
        err "Option '$1' requires an argument."
        exit 1
    }
}
# Parse command-line arguments
while [ $# -gt 0 ]; do
    case "$1" in
    -h|--help)
        # Show html2markdown's help under this script's name, then append
        # the options that only web2markdown understands.
        html2markdown -h 2>&1 | sed -e 's/html2markdown/web2markdown/' 1>&2
        err " -e ENCODING, --encoding=ENCODING"
        err " Specify character encoding of input"
        err " -g COMMAND, --grabber=COMMAND"
        err " Specify command to be used to grab contents of URL"
        exit 0 ;;
    -v|--version)
        html2markdown -v
        exit 0 ;;
    -e)
        require_value "$1" $#
        shift
        encoding=$1 ;;
    --encoding=*)
        # extract encoding from after =
        encoding=${1#*=} ;;
    -g)
        require_value "$1" $#
        shift
        grabber=$1 ;;
    --grabber=*)
        # extract grabber command from after =
        grabber=${1#*=} ;;
    -o|--output|-b|--tab-stop|-H|--include-in-header| \
    -A|--include-after-body|-B|--include-before-body| \
    -C|--custom-header|-T|--title-prefix)
        # Pass-through options that take a value: forward both words.
        # They are quoted so that a value containing spaces (for example
        # a file name) is recorded whole instead of cut at the first space.
        require_value "$1" $#
        add_option "$1"
        shift
        add_option "$1" ;;
    -*) add_option "$1" ;;
    *)
        # The first non-option word is the input; later ones are ignored.
        if [ -z "$argument" ]; then
            argument=$1
        else
            err "Warning: extra argument '$1' will be ignored."
        fi ;;
    esac
    shift
done
# Unpack options.  Now "$@" will hold the html2markdown options.
oldifs="$IFS"; IFS="$NEWLINE"; set -- $options; IFS="$oldifs"
inurl=
if [ -n "$argument" ] && ! [ -f "$argument" ]; then
    # Treat given argument as an URL.
    inurl="$argument"
fi
if [ -n "$inurl" ]; then
    err "Attempting to fetch file from '$inurl'..."
### tempdir.sh
    # Grabber output and diagnostics are kept in separate temporary files;
    # the paths are quoted in case the temporary directory contains spaces.
    grabber_out=$THIS_TEMPDIR/grabber.out
    grabber_log=$THIS_TEMPDIR/grabber.log
    if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" 2>"$grabber_log"; then
        errn "grab_url_with failed"
        if [ -f "$grabber_log" ]; then
            err " with the following error log."
            err
            cat >&2 "$grabber_log"
        else
            err .
        fi
        exit 1
    fi
    # From here on the downloaded copy is the input file.
    argument="$grabber_out"
fi
if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then
    # Try to determine character encoding if not specified
    # and input is not STDIN.  Only the first lines of the file are
    # searched for a <meta ... content-type ... charset=...> declaration.
    encoding=$(
        head "$argument" |
        LC_ALL=C tr 'A-Z' 'a-z' |
        sed -ne '/<meta .*content-type.*charset=/ {
            s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p
        }'
    )
fi
# Define to_utf8 as a function rather than an alias: aliases are not
# reliably expanded in non-interactive shells.  With no arguments it
# converts standard input; with file arguments it converts those files.
if [ -n "$encoding" ] && pathfind iconv; then
    to_utf8 () { iconv -f "$encoding" -t utf-8 "$@"; }
else # assume UTF-8
    to_utf8 () { cat "$@"; }
fi
if [ -z "$argument" ]; then
    # Input comes from stdin; convert it as well so that an encoding
    # given with -e/--encoding is honored here too.
    to_utf8 | tidy -utf8 2>/dev/null | html2markdown "$@"
else
    if [ -f "$argument" ]; then
        to_utf8 "$argument" | tidy -utf8 2>/dev/null | html2markdown "$@"
    else
        err "File '$argument' not found."
        exit 1
    fi
fi