From a1ad3b4e5fde28e74787dd4af609ea7aaf5b8005 Mon Sep 17 00:00:00 2001
From: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>
Date: Tue, 2 Oct 2007 02:08:52 +0000
Subject: [PATCH] Modified html2markdown. Previously html2markdown piped all
 input through HTML Tidy before passing it to pandoc.  This caused problems on
 certain sites (e.g. daringfireball.net/projects/markdown) which have
 well-formed XHTML that causes tidy to choke.  The solution is to try pandoc
 on the original HTML, and run it through tidy only if that fails.

This means that a temp file is now always used, even when input comes
from a local file or standard input.


git-svn-id: https://pandoc.googlecode.com/svn/trunk@1039 788f1e2b-df1e-0410-8736-df70ead52e1b
---
 src/wrappers/html2markdown.in | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/src/wrappers/html2markdown.in b/src/wrappers/html2markdown.in
index ad026c24e..0f4297128 100644
--- a/src/wrappers/html2markdown.in
+++ b/src/wrappers/html2markdown.in
@@ -104,11 +104,11 @@ if [ -n "$argument" ] && ! [ -f "$argument" ]; then
     inurl="$argument"
 fi
 
+### tempdir.sh
+
 if [ -n "$inurl" ]; then
     err "Attempting to fetch file from '$inurl'..."
 
-    ### tempdir.sh
-
     grabber_out=$THIS_TEMPDIR/grabber.out
     grabber_log=$THIS_TEMPDIR/grabber.log
     if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out 2>$grabber_log; then
@@ -144,14 +144,19 @@ else # assume UTF-8
     alias to_utf8='cat'
 fi 
 
+htmlinput=$THIS_TEMPDIR/htmlinput
+
 if [ -z "$argument" ]; then
-    tidy -asxhtml -utf8 2>/dev/null | pandoc --ignore-args -r html -w markdown "$@"
+    to_utf8 > $htmlinput                # read from STDIN
+elif [ -f "$argument" ]; then
+    to_utf8 "$argument" > $htmlinput    # read from file
 else
-    if [ -f "$argument" ]; then
-        to_utf8 "$argument" | 
-        tidy -asxhtml -utf8 2>/dev/null | pandoc --ignore-args -r html -w markdown "$@"
-    else
-        err "File '$argument' not found."
-        exit 1
-    fi
+    err "File '$argument' not found."
+    exit 1
+fi
+
+if ! cat $htmlinput | pandoc --ignore-args -r html -w markdown "$@" ; then
+     err "Failed to parse HTML.  Trying again with tidy..."
+     tidy -q -asxhtml -utf8 $htmlinput | \
+        pandoc --ignore-args -r html -w markdown "$@"
 fi