2006-12-22 21:16:03 +01:00
|
|
|
#!/bin/sh -e
|
|
|
|
# converts HTML from a URL, file, or stdin to markdown
|
|
|
|
# uses an available program to fetch the URL; if pandoc cannot parse the HTML, retries after normalizing it with tidy
|
|
|
|
|
2006-12-28 03:20:09 +01:00
|
|
|
# External programs this script depends on.
# NOTE(review): presumably consumed by the code spliced in at the
# "### common.sh" marker below -- confirm against that file.
REQUIRED="tidy"
|
2007-01-08 20:55:34 +01:00
|
|
|
# One-line description of what this script does.
# NOTE(review): presumably printed in usage/help output by the code spliced
# in at the "### common.sh" marker -- confirm against that file.
SYNOPSIS="converts HTML from a URL, file, or STDIN to markdown-formatted text."
|
2006-12-22 21:16:03 +01:00
|
|
|
|
|
|
|
### common.sh
|
|
|
|
|
|
|
|
# grab_url_with URL [GRABBER_COMMAND_LINE]
#
# Fetches URL with a command-line web grabber and writes the document to
# stdout.
#
#   $1     URL to fetch (required; ${1:?} aborts with an error if missing)
#   $2...  optional grabber command line such as 'wget -q'.  When empty, the
#          first program found among wget, lynx, w3m, curl, links and w3c
#          is used.
#
# Returns 1 when no usable grabber can be located; otherwise the exit status
# of the grabber itself.
#
# Relies on pathfind, err, errn and $THIS, none of which are defined in this
# block (presumably they come from the code included at "### common.sh").
# POSIX sh has no 'local', so url, cmdline, prog, prog_opts, base_opts and p
# are left behind as globals.
grab_url_with () {
    url="${1:?internal error: grab_url_with: url required}"
    shift
    # "$*" (not "$@") is the well-defined way to flatten the remaining
    # arguments into one string.
    cmdline="$*"

    prog=
    prog_opts=
    if [ -n "$cmdline" ]; then
        # eval re-splits the grabber command line so that quoting inside it
        # is honoured.  NOTE(review): this also means the string is
        # interpreted by the shell -- it must only ever come from the user
        # running the script.
        eval "set -- $cmdline"
        prog=$1
        shift
        prog_opts="$*"
    fi

    if [ -z "$prog" ]; then
        # Locate a sensible web grabber (note the order).
        for p in wget lynx w3m curl links w3c; do
            if pathfind "$p"; then
                prog=$p
                break
            fi
        done

        [ -n "$prog" ] || {
            errn "$THIS: Couldn't find a program to fetch the file from URL "
            err "(e.g. wget, w3m, lynx, w3c, or curl)."
            return 1
        }
    else
        pathfind "$prog" || {
            err "$THIS: No such web grabber '$prog' found; aborting."
            return 1
        }
    fi

    # Setup proper base options for known grabbers: each one must dump the
    # raw document source to stdout.
    base_opts=
    case "$prog" in
        wget)  base_opts="-O-" ;;
        lynx)  base_opts="-source" ;;
        w3m)   base_opts="-dump_source" ;;
        # curl already writes to stdout, but unlike the other grabbers it
        # does not follow HTTP redirects unless told to.
        curl)  base_opts="-L" ;;
        links) base_opts="-source" ;;
        w3c)   base_opts="-n -get" ;;
        *)     err "$THIS: unhandled web grabber '$prog'; hope it succeeds." ;;
    esac

    err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
    # Rebuild the option list as separate words before running the grabber.
    eval "set -- $base_opts $prog_opts"
    "$prog" "$@" "$url"
}
|
|
|
|
|
|
|
|
# Parse command-line arguments
|
2007-01-08 20:55:34 +01:00
|
|
|
# parse_arguments ARG...
#
# Scans the wrapper's own command-line arguments and stores the results in
# the global variables encoding, grabber and argument:
#
#   --encoding=ENC | -e ENC | --encoding ENC | -encoding ENC
#   --grabber=CMD  | -g CMD | --grabber CMD  | -grabber CMD
#   anything else  -> the input (file name or URL); only the first one is
#                     kept, later ones produce a warning.
#
# Returns 1 (after printing a message) when an option that needs a value is
# the last argument.  Uses err, which is defined outside this block.
parse_arguments () {
    while [ $# -gt 0 ]; do
        case "$1" in
            --encoding=*)
                # extract encoding from after =
                encoding="${1#*=}" ;;
            -e|--encoding|-encoding)
                if [ $# -lt 2 ]; then
                    err "Error: option '$1' requires an argument."
                    return 1
                fi
                shift
                encoding="$1" ;;
            --grabber=*)
                # extract grabber command line from after =
                # Stored verbatim, exactly like the '-g CMD' form, so that
                # a value such as 'wget -q' is later split into program and
                # options instead of being treated as one program name.
                grabber="${1#*=}" ;;
            -g|--grabber|-grabber)
                if [ $# -lt 2 ]; then
                    err "Error: option '$1' requires an argument."
                    return 1
                fi
                shift
                grabber="$1" ;;
            *)
                if [ -z "$argument" ]; then
                    argument="$1"
                else
                    err "Warning: extra argument '$1' will be ignored."
                fi ;;
        esac
        shift
    done
}
|
2006-12-22 21:16:03 +01:00
|
|
|
|
2007-01-08 22:24:31 +01:00
|
|
|
# Results of argument parsing; filled in by parse_arguments.
argument=
encoding=
grabber=

# $ARGS is deliberately expanded unquoted so that it is split into separate
# arguments on $NEWLINE (both are defined outside this block).  Pathname
# expansion is switched off for that expansion: URLs routinely contain
# '?', '*' or '[', which must never be matched against files in the
# current directory.
oldifs="$IFS"
case $- in
    *f*) noglob_was_set=yes ;;
    *)   noglob_was_set= ;;
esac
IFS=$NEWLINE
set -f
parse_arguments $ARGS
# Re-enable globbing only if it was enabled before.
[ -n "$noglob_was_set" ] || set +f
IFS="$oldifs"
|
2006-12-22 21:16:03 +01:00
|
|
|
|
|
|
|
# Decide whether the input has to be downloaded: a non-empty argument that
# does not name an existing regular file is treated as a URL.  inurl stays
# empty for a local file or when reading from STDIN.
inurl=
if [ -n "$argument" ] && [ ! -f "$argument" ]; then
    # Treat given argument as an URL.
    inurl="$argument"
fi
|
|
|
|
|
2007-10-02 04:08:52 +02:00
|
|
|
### tempdir.sh
|
|
|
|
|
2006-12-22 21:16:03 +01:00
|
|
|
# Download the document when the input is a URL.  The grabber's stdout is
# saved to grabber.out and its stderr to grabber.log, both under
# $THIS_TEMPDIR (defined outside this block); on success the saved file
# becomes the input for the rest of the script.
if [ -n "$inurl" ]; then
    err "Attempting to fetch file from '$inurl'..."

    grabber_out="$THIS_TEMPDIR/grabber.out"
    grabber_log="$THIS_TEMPDIR/grabber.log"
    if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" 2>"$grabber_log"; then
        errn "grab_url_with failed"
        # The log file always exists (the redirection above creates it),
        # so test for a non-empty log rather than for mere existence.
        if [ -s "$grabber_log" ]; then
            err " with the following error log."
            err
            cat >&2 "$grabber_log"
        else
            err .
        fi
        exit 1
    fi

    argument="$grabber_out"
fi
|
|
|
|
|
|
|
|
# detect_html_charset FILE
#
# Prints the lower-cased charset declared by a <meta ...charset=...> tag
# found in the first lines of FILE (as many as 'head' prints by default),
# or nothing when no such tag is found.  Handles both
# <meta http-equiv="Content-Type" content="...; charset=X"> and
# <meta charset="X">.  Only the first matching line is used, and the name
# may contain letters, digits, '-', '_', '.' and ':' (e.g. shift_jis).
detect_html_charset () {
    head "$1" |
    LC_ALL=C tr 'A-Z' 'a-z' |
    sed -n -e '/<meta [^>]*charset=/ {
        s/.*charset=["'\'']*\([-_.:a-zA-Z0-9]*\).*/\1/p
        q
    }'
}

if [ -z "$encoding" ] && [ -n "$argument" ]; then
    # Try to determine character encoding if not specified
    # and input is not STDIN.
    encoding=$(detect_html_charset "$argument")
fi

# to_utf8 [FILE...]: copies FILE (or stdin) to stdout, converting it to
# UTF-8 when the source encoding is known and iconv is available.  Defined
# as a function rather than an alias so that it also works in shells that
# do not expand aliases in scripts.  pathfind is defined outside this block.
if [ -n "$encoding" ] && pathfind iconv; then
    to_utf8 () { iconv -f "$encoding" -t utf-8 "$@"; }
else # assume UTF-8
    to_utf8 () { cat "$@"; }
fi
|
|
|
|
|
2007-10-02 04:08:52 +02:00
|
|
|
# Stage the input, run through to_utf8, in a temporary file under
# $THIS_TEMPDIR so that it can be read more than once later on.  The path is
# quoted everywhere in case the temporary directory contains whitespace or
# glob characters.  to_utf8, err and $THIS_TEMPDIR are defined outside this
# block.
htmlinput="$THIS_TEMPDIR/htmlinput"

if [ -z "$argument" ]; then
    to_utf8 > "$htmlinput"              # read from STDIN
elif [ -f "$argument" ]; then
    to_utf8 "$argument" > "$htmlinput"  # read from file
else
    err "File '$argument' not found."
    exit 1
fi
|
|
|
|
|
|
|
|
# Convert the staged HTML to markdown.  If pandoc rejects the raw input, try
# once more after tidy has normalized it to XHTML.  "$@" holds the script's
# positional parameters, which are handed to pandoc unchanged.
# NOTE(review): presumably these are the pandoc options separated out by the
# included common.sh -- confirm.
if ! pandoc --ignore-args -r html -w markdown "$@" < "$htmlinput"; then
    err "Failed to parse HTML. Trying again with tidy..."
    tidy -q -asxhtml -utf8 "$htmlinput" |
        pandoc --ignore-args -r html -w markdown "$@"
fi
|