79fdbcea69
git-svn-id: https://pandoc.googlecode.com/svn/trunk@101 788f1e2b-df1e-0410-8736-df70ead52e1b
69 lines
1.7 KiB
Bash
69 lines
1.7 KiB
Bash
#!/bin/sh -e
|
|
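# Converts HTML to markdown.
# The input is a local file, standard input (when no input is named), or a
# URL, which is fetched with the first available download program. The HTML
# is normalized with tidy before it is handed to pandoc.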
|
|
|
|
# pathfind NAME
#
# Portable which(1) replacement, adapted from the Debian Developer's
# Reference. Searches the directories listed in $PATH for an executable
# regular file called NAME.
#
# Arguments: $1 - bare program name to look for
# Outputs:   nothing
# Returns:   0 if NAME was found, 1 otherwise (including an empty NAME)
pathfind () {
  # An empty name would make the test below match the directory itself.
  [ -n "${1:-}" ] || return 1

  _pf_found=1
  # Remember whether IFS was set at all: restoring an unset IFS as the empty
  # string would silently turn off field splitting for the caller.
  _pf_ifs_state=${IFS+set}
  _pf_ifs_saved=${IFS-}

  IFS=:
  for _p in $PATH; do
    # A zero-length $PATH component means the current directory. Require a
    # regular file so that a searchable directory named like the program
    # does not count as a hit.
    if [ -f "${_p:-.}/$1" ] && [ -x "${_p:-.}/$1" ]; then
      _pf_found=0
      break
    fi
  done

  if [ -n "$_pf_ifs_state" ]; then
    IFS=$_pf_ifs_saved
  else
    unset IFS
  fi
  return "$_pf_found"
}
|
|
|
|
# Abort early, with a readable message, unless every external program that
# the conversion pipeline runs unconditionally can be found on $PATH.
# iconv is included because both conversion pipelines end in it.
for p in pandoc tidy iconv; do
  if ! pathfind "$p"; then
    echo >&2 "You need '$p' to use this program!"
    exit 1
  fi
done
|
|
|
|
# parse_args [INPUT] [EXTRA...] [-- PANDOC-OPTION...]
#
# Splits the command line at the first '--' argument.
#
# Sets:
#   infile      - first argument before '--' (file name or URL); empty when
#                 none was given
#   REST        - arguments after '--', joined with single spaces
#   PANDOC_OPTS - REST if non-empty, otherwise the value PANDOC_OPTS already
#                 had (e.g. from the environment)
# Outputs: a warning on stderr when more than one argument precedes '--'
# Returns: 0
#
# Arguments are walked one at a time rather than being re-split from "$*",
# so an input name containing spaces or glob characters stays intact, and
# calling with no arguments at all is valid.
parse_args () {
  infile=
  REST=
  _pa_extra=
  _pa_state=input               # input -> extra -> opts
  for _pa_arg in "$@"; do
    case $_pa_state in
      opts)
        # Everything after the first '--' belongs to pandoc, including any
        # later '--'.
        REST="${REST:+$REST }$_pa_arg"
        ;;
      *)
        if [ "$_pa_arg" = "--" ]; then
          _pa_state=opts
        elif [ "$_pa_state" = input ]; then
          infile=$_pa_arg
          _pa_state=extra
        else
          _pa_extra="${_pa_extra:+$_pa_extra }$_pa_arg"
        fi
        ;;
    esac
  done

  PANDOC_OPTS=${REST:-${PANDOC_OPTS:-}}

  if [ -n "$_pa_extra" ]; then
    echo >&2 "Warning: extra arguments '$_pa_extra' will be ignored!"
  fi
}

parse_args "$@"
|
|
|
|
# html_to_markdown [TIDY-ARG...]
#
# Runs tidy (UTF-8 mode) on the file named in the arguments, or on standard
# input when called without arguments, converts the result to standalone
# markdown with pandoc, and passes that through 'iconv -f utf-8'.
# tidy's diagnostics are discarded. PANDOC_OPTS is deliberately left
# unquoted so that it splits into separate options.
html_to_markdown () {
  tidy -utf8 "$@" 2>/dev/null |
    pandoc $PANDOC_OPTS -r html -w markdown -s |
    iconv -f utf-8
}

if [ -z "$infile" ]; then
  # No input named: convert standard input.
  html_to_markdown
elif [ -f "$infile" ]; then
  html_to_markdown "$infile"
else
  # Treat the given argument as a URL. Locate a
  # sensible text based browser (note the order).
  DUMPER=                       # do not trust a DUMPER inherited from the environment
  for p in wget lynx w3m curl links w3c; do
    if pathfind "$p"; then
      DUMPER=$p
      break
    fi
  done

  # Set up the option that makes the chosen program write the raw page
  # source to standard output.
  case "$DUMPER" in
    wget)  OPT="-O-" ;;
    lynx)  OPT="-source" ;;
    w3m)   OPT="-dump_source" ;;
    curl)  OPT="" ;;
    links) OPT="-source" ;;
    w3c)   OPT="-n -get" ;;
    *)
      printf "Needs a program to fetch the URL " >&2
      printf "(e.g. wget, w3m, lynx, w3c, or curl).\n" >&2
      exit 1
      ;;
  esac

  # Fetch and feed to pandoc. OPT is left unquoted on purpose: it may hold
  # several options ("-n -get") or none at all.
  $DUMPER $OPT "$infile" 2>/dev/null | html_to_markdown
fi
|