pandoc/html2markdown

222 lines
5.3 KiB
Text
Raw Normal View History

#!/bin/sh
# html2markdown - converts HTML from a URL, file, or stdin to markdown.
# Uses an available program to fetch a URL and, when pandoc cannot parse the
# HTML as given, tidy to normalize it before a second attempt.

# Abort on the first unhandled command failure.  This is set here rather than
# on the shebang line so that it also applies when the script is started as
# "sh html2markdown", in which case shebang options are never seen.
set -e

# Helper programs (in addition to pandoc) that must be present in PATH.
REQUIRED="tidy"
# One-line description shown by usage().
SYNOPSIS="converts HTML from a URL, file, or STDIN to markdown-formatted text."
# Name this script was invoked as, with any leading directory removed.
THIS=${0##*/}
# A single newline character; used as IFS to split pandoc's argument dump.
NEWLINE='
'
# Print the arguments (joined via "$*") to stderr as one line, wrapped at
# word boundaries to $COLUMNS columns (110 when COLUMNS is unset or empty).
# printf is used instead of echo so that a message such as "-n", or one that
# contains backslashes, is printed literally by every sh implementation.
err () { printf '%s\n' "$*" | fold -s -w "${COLUMNS:-110}" >&2; }
# Like err, but without a trailing newline, so that a following err call can
# finish the line.  The message is passed as printf data, never as the format
# string, so a "%" in it (for example from the script name) is printed as is.
errn () { printf '%s' "$*" | fold -s -w "${COLUMNS:-110}" >&2; }
# Report a short description of the program and point to its manual page.
# Both lines are emitted through err.
#   $1 - program name
#   $2 - short description of the program
usage () {
    # Build both messages first, then emit them in order.
    set -- "$1 - $2" "See the $1(1) man page for usage."
    err "$1"
    err "$2"
}
# Portable stand-in for which(1).
# Returns 0 when a regular, executable file named by the arguments exists in
# one of the directories listed in $PATH, and 1 otherwise.  Prints nothing.
# IFS is switched to ":" for the search and put back before returning.
pathfind () {
    oldifs=$IFS
    IFS=:
    _pf_status=1
    for _p in $PATH; do
        if [ -f "$_p/$*" ] && [ -x "$_p/$*" ]; then
            _pf_status=0
            break
        fi
    done
    IFS=$oldifs
    return $_pf_status
}
# Make sure pandoc and every helper listed in $REQUIRED can be found in PATH.
for p in pandoc $REQUIRED; do
    pathfind "$p" || {
        err "You need '$p' to use this program!"
        exit 1
    }
done

# Let pandoc process the command line (--dump-args) and capture what it
# prints, stderr included.  If pandoc fails, its output is forwarded to
# stderr with everything from the "pandoc [OPTIONS] [FILES]" line onward
# removed; exit status 2 additionally shows our own usage hint.  The script
# then exits with pandoc's status.
# printf is used rather than echo so that backslashes in the captured
# arguments are never interpreted as escape sequences.
CONF=$(pandoc --dump-args "$@" 2>&1) || {
    errcode=$?
    printf '%s\n' "$CONF" | sed -e '/^pandoc \[OPTIONS\] \[FILES\]/,$d' >&2
    [ "$errcode" -eq 2 ] && usage "$THIS" "$SYNOPSIS"
    exit "$errcode"
}
# The first line of the dump is kept in OUTPUT; the remaining lines are kept
# in ARGS and are later split on newlines into individual arguments.
OUTPUT=$(printf '%s\n' "$CONF" | sed -ne '1p')
ARGS=$(printf '%s\n' "$CONF" | sed -e '1d')
# Fetch a URL with a command-line web client and write the page to stdout.
#   $1    - URL to fetch (required; the shell aborts with an error if missing)
#   $2... - optional grabber command line: a program name followed by extra
#           options, e.g. "wget --no-check-certificate".  When empty, the
#           first of wget, lynx, w3m, curl, links, w3c found in PATH is used.
# Diagnostics go to stderr through err/errn.  Returns 1 when no usable
# grabber is found; otherwise returns the grabber's own exit status.
# Sets the globals url, cmdline, prog, prog_opts and base_opts (and p when
# it has to search for a grabber).
grab_url_with () {
    url="${1:?internal error: grab_url_with: url required}"
    shift
    cmdline="$*"
    prog=
    prog_opts=
    if [ -n "$cmdline" ]; then
        # NOTE(security): the grabber string is parsed by the shell so that
        # quoted options survive as single words.  It comes from the user's
        # own command line; never feed text from an untrusted source here.
        eval "set -- $cmdline"
        if [ $# -gt 0 ]; then
            prog=$1
            shift
        fi
        # Flattened copy of the options, used only for the log message below.
        prog_opts="$*"
    else
        # Drop the empty grabber argument so "$@" holds no options.
        set --
    fi
    if [ -z "$prog" ]; then
        # Locate a sensible web grabber (note the order).
        for p in wget lynx w3m curl links w3c; do
            if pathfind "$p"; then
                prog=$p
                break
            fi
        done
        [ -n "$prog" ] || {
            errn "$THIS: Couldn't find a program to fetch the file from URL "
            err "(e.g. wget, w3m, lynx, w3c, or curl)."
            return 1
        }
    else
        pathfind "$prog" || {
            err "$THIS: No such web grabber '$prog' found; aborting."
            return 1
        }
    fi
    # Setup proper base options for known grabbers.
    base_opts=
    case "$prog" in
        wget) base_opts="-O-" ;;
        lynx) base_opts="-source" ;;
        w3m) base_opts="-dump_source" ;;
        curl) base_opts="" ;;
        links) base_opts="-source" ;;
        w3c) base_opts="-n -get" ;;
        *) err "$THIS: unhandled web grabber '$prog'; hope it succeeds." ;;
    esac
    err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
    # base_opts holds only the fixed option words chosen above, so plain word
    # splitting is safe for it.  The user's options are still in "$@" exactly
    # as parsed the first time: they are not evaluated again, and an option
    # that contains spaces stays a single argument.
    set -- $base_opts "$@"
    "$prog" "$@" "$url"
}
# Parse command-line arguments.
#   -e ENC | --encoding ENC | -encoding ENC | --encoding=ENC  sets $encoding
#   -g CMD | --grabber CMD  | -grabber CMD  | --grabber=CMD   sets $grabber
# The first argument that is none of the above is stored in $argument; any
# further ones are reported with a warning and ignored.  Results are left in
# the globals encoding, grabber and argument, which the caller must clear
# beforehand (an already non-empty $argument counts as "first argument seen").
parse_arguments () {
    while [ $# -gt 0 ]; do
        case "$1" in
            --encoding=*)
                # extract encoding from after =
                encoding="${1#*=}" ;;
            -e|--encoding|-encoding)
                shift
                encoding="${1-}" ;;
            --grabber=*)
                # Extract grabber command from after =.  It is stored as is,
                # without extra quote characters, so that "--grabber=CMD" and
                # "-g CMD" yield the same value and a grabber given with
                # options (e.g. --grabber="wget -q") is split into program
                # and options when it is used.
                grabber="${1#*=}" ;;
            -g|--grabber|-grabber)
                shift
                grabber="${1-}" ;;
            *)
                if [ -z "$argument" ]; then
                    argument="$1"
                else
                    err "Warning: extra argument '$1' will be ignored."
                fi ;;
        esac
        # An option given without its value leaves nothing more to shift.
        [ $# -eq 0 ] || shift
    done
}
# Run the option parser over the arguments dumped by pandoc (one per line
# in $ARGS).
argument=
encoding=
grabber=
oldifs="$IFS"
IFS=$NEWLINE
# $ARGS is deliberately left unquoted so that it is split on newlines.
# Pathname expansion is switched off meanwhile, so that an argument containing
# *, ? or [ (a URL with a query string, for example) is passed on literally
# instead of being replaced by the names of matching files.
set -f
parse_arguments $ARGS
set +f
IFS="$oldifs"

# A non-empty argument that does not name an existing regular file is taken
# to be a URL.
inurl=
if [ -n "$argument" ] && ! [ -f "$argument" ]; then
    # Treat given argument as an URL.
    inurl="$argument"
fi
# As a security measure refuse to proceed if mktemp is not available.
pathfind mktemp || { err "Couldn't find 'mktemp'; aborting."; exit 1; }

# Avoid issues with /tmp directory on Windows/Cygwin: when uname reports a
# name starting with CYGWIN, temporary files go to the current directory.
cygwin=$(uname | sed -ne '/^CYGWIN/p')
if [ -n "$cygwin" ]; then
    TMPDIR=.
    export TMPDIR
fi

# Private scratch directory for all intermediate files.  The template is
# quoted so that a script name containing blanks or glob characters still
# yields a single mktemp argument.
THIS_TEMPDIR="$(mktemp -d -t "$THIS.XXXXXXXX")" || exit 1
readonly THIS_TEMPDIR

# Remove the scratch directory on exit (0) and on signals 1, 2, 3, 13 and 15
# (HUP, INT, QUIT, PIPE, TERM), then exit with the status that was current
# when the trap fired.
trap 'exitcode=$?
[ -z "$THIS_TEMPDIR" ] || rm -rf "$THIS_TEMPDIR"
exit $exitcode' 0 1 2 3 13 15
# When the input is a URL, download it into the scratch directory and use the
# downloaded file as the input from here on.
if [ -n "$inurl" ]; then
    err "Attempting to fetch file from '$inurl'..."
    grabber_out=$THIS_TEMPDIR/grabber.out
    grabber_log=$THIS_TEMPDIR/grabber.log
    # Redirect targets are quoted: the scratch path may contain blanks.
    if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" 2>"$grabber_log"; then
        errn "grab_url_with failed"
        # The redirection above always creates the log file, so check that it
        # actually has content (-s) rather than that it merely exists.
        if [ -s "$grabber_log" ]; then
            err " with the following error log."
            err
            cat >&2 "$grabber_log"
        else
            err .
        fi
        exit 1
    fi
    argument="$grabber_out"
fi
if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then
    # Try to determine character encoding if not specified
    # and input is not STDIN.  Only the lines printed by head are examined:
    # they are lower-cased, and the charset value of a
    # <meta ... content-type ... charset=...> line is extracted.
    encoding=$(
        head "$argument" |
        LC_ALL=C tr 'A-Z' 'a-z' |
        sed -ne '/<meta .*content-type.*charset=/ {
            s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p
        }'
    )
fi

# to_utf8 [FILE...] - copy the given files (or stdin) to stdout as UTF-8.
# A function is used rather than an alias because bash expands aliases in
# scripts only when it runs in POSIX mode, whereas functions work in every sh.
if [ -n "$encoding" ] && pathfind iconv; then
    # $encoding is read when the function is called.
    to_utf8 () { iconv -f "$encoding" -t utf-8 "$@"; }
else # assume UTF-8
    to_utf8 () { cat "$@"; }
fi
# Store a UTF-8 copy of the input in the scratch directory.
htmlinput=$THIS_TEMPDIR/htmlinput
if [ -z "$argument" ]; then
    to_utf8 > "$htmlinput" # read from STDIN
elif [ -f "$argument" ]; then
    to_utf8 "$argument" > "$htmlinput" # read from file
else
    err "File '$argument' not found."
    exit 1
fi

# Convert with pandoc, reading the copy on stdin.  If that fails, normalize
# the HTML to XHTML with tidy and try once more.  The original command line
# is passed along together with --ignore-args.
# NOTE(review): presumably --ignore-args makes pandoc skip the file
# arguments in "$@" and read stdin instead - confirm against the pandoc
# version in use.
if ! pandoc --ignore-args -r html -w markdown "$@" < "$htmlinput"; then
    err "Failed to parse HTML. Trying again with tidy..."
    tidy -q -asxhtml -utf8 "$htmlinput" |
        pandoc --ignore-args -r html -w markdown "$@"
fi