Modified html2markdown. Previously, html2markdown piped all input
through HTML Tidy before passing it to pandoc.  This causes problems
on certain sites (e.g. daringfireball.net/projects/markdown) which have
well-formed XHTML that causes tidy to choke.  The solution is to try
pandoc on the original HTML first, and to run it through tidy only if
that fails.

This means that a temp file is now always used, even when input comes
from a local file or standard input.


git-svn-id: https://pandoc.googlecode.com/svn/trunk@1039 788f1e2b-df1e-0410-8736-df70ead52e1b
This commit is contained in:
fiddlosopher 2007-10-02 02:08:52 +00:00
parent 5f64258a4e
commit a1ad3b4e5f

View file

@@ -104,11 +104,11 @@ if [ -n "$argument" ] && ! [ -f "$argument" ]; then
inurl="$argument"
fi
### tempdir.sh
if [ -n "$inurl" ]; then
err "Attempting to fetch file from '$inurl'..."
### tempdir.sh
grabber_out=$THIS_TEMPDIR/grabber.out
grabber_log=$THIS_TEMPDIR/grabber.log
if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out 2>$grabber_log; then
@@ -144,14 +144,19 @@ else # assume UTF-8
alias to_utf8='cat'
fi
htmlinput=$THIS_TEMPDIR/htmlinput
if [ -z "$argument" ]; then
tidy -asxhtml -utf8 2>/dev/null | pandoc --ignore-args -r html -w markdown "$@"
to_utf8 > $htmlinput # read from STDIN
elif [ -f "$argument" ]; then
to_utf8 "$argument" > $htmlinput # read from file
else
if [ -f "$argument" ]; then
to_utf8 "$argument" |
tidy -asxhtml -utf8 2>/dev/null | pandoc --ignore-args -r html -w markdown "$@"
else
err "File '$argument' not found."
exit 1
fi
err "File '$argument' not found."
exit 1
fi
if ! cat $htmlinput | pandoc --ignore-args -r html -w markdown "$@" ; then
err "Failed to parse HTML. Trying again with tidy..."
tidy -q -asxhtml -utf8 $htmlinput | \
pandoc --ignore-args -r html -w markdown "$@"
fi