#!/bin/sh
# Converts HTML to markdown.
# Uses an available program to fetch a URL, and tidy to normalize the HTML
# before it is handed to pandoc.
#
# Usage: <this script> [FILE|URL] [-- PANDOC_OPTIONS...]
#   FILE|URL  An existing regular file is read directly; any other non-empty
#             value is treated as a URL and fetched.  With no argument (or an
#             empty one) HTML is read from standard input.
#   Everything after a standalone `--` is passed to pandoc and takes
#   precedence over the PANDOC_OPTS environment variable.

# Same effect as a `-e` flag on the shebang line, but also honoured when the
# script is started as `sh <script>`.
set -e

# Succeeds (status 0) if the program named by $1 can be found by the shell,
# fails otherwise.  Prints nothing.
pathfind () {
    command -v "$1" >/dev/null 2>&1
}

# Normalizes HTML with tidy and converts it to markdown with pandoc.
# Arguments, if any, are passed to tidy as input files; with none, tidy
# reads standard input.  The result is written to standard output.
to_markdown () {
    # The status of this pipeline is that of iconv only; tidy's diagnostics
    # are discarded and its exit status is deliberately not checked.
    # $PANDOC_OPTS is intentionally unquoted so that it splits into
    # separate pandoc options.
    tidy -utf8 "$@" 2>/dev/null |
        pandoc $PANDOC_OPTS -r html -w markdown -s |
        iconv -f utf-8
}

for p in pandoc tidy; do
    pathfind "$p" || {
        printf >&2 "You need '%s' to use this program!\n" "$p"
        exit 1
    }
done

# Split the command line into: the input (first argument before `--`),
# surplus arguments before `--` (ignored with a warning), and pandoc
# options (everything after the first `--`).  Arguments are examined one
# by one so that file names and URLs containing spaces or glob characters
# are kept intact.
infile=
have_infile=no
extra=
cli_opts=
in_opts=no
for arg in "$@"; do
    if [ "$in_opts" = yes ]; then
        cli_opts="${cli_opts:+$cli_opts }$arg"
    elif [ "$arg" = "--" ]; then
        in_opts=yes
    elif [ "$have_infile" = no ]; then
        infile=$arg
        have_infile=yes
    else
        extra="${extra:+$extra }$arg"
    fi
done

# Options given on the command line win over the environment.
PANDOC_OPTS=${cli_opts:-$PANDOC_OPTS}

if [ -n "$extra" ]; then
    printf >&2 "Warning: extra arguments '%s' will be ignored!\n" "$extra"
fi

if [ -z "$infile" ]; then
    # No input named: convert standard input.
    to_markdown
elif [ -f "$infile" ]; then
    to_markdown "$infile"
else
    # Treat given argument as an URL.  Locate a
    # sensible text based browser (note the order).
    # Start from empty values so nothing is inherited from the environment.
    DUMPER=
    OPT=
    for p in wget lynx w3m curl links w3c; do
        if pathfind "$p"; then
            DUMPER=$p
            break
        fi
    done

    # Setup proper options.
    case "$DUMPER" in
        wget)  OPT="-O-" ;;
        lynx)  OPT="-source" ;;
        w3m)   OPT="-dump_source" ;;
        curl)  OPT="" ;;
        links) OPT="-source" ;;
        w3c)   OPT="-n -get" ;;
        "")
            printf >&2 '%s\n' \
                "Needs a program to fetch the URL (e.g. wget, w3m, lynx, w3c, or curl)."
            exit 1
            ;;
    esac

    # Fetch and feed to pandoc.  $OPT is intentionally unquoted: it may hold
    # several options (w3c) or none at all (curl).
    $DUMPER $OPT "$infile" 2>/dev/null | to_markdown
fi