Fixed URL regex in html2x.pl, and added a command to the pipe to truncate input pages to 100K.


git-svn-id: https://pandoc.googlecode.com/svn/trunk@1019 788f1e2b-df1e-0410-8736-df70ead52e1b
This commit is contained in:
fiddlosopher 2007-09-15 21:30:31 +00:00
parent bf100f8276
commit 9ed11f4500

View file

@ -19,14 +19,16 @@ if ($format =~ /^markdown\+$/) {
} }
# Validate URL and format # Validate URL and format
unless ($url =~ /^(https?:\/\/)?[\w#?_-]+(\.[\w#?_-]+)+[\w\/#?_.-]*$/) { unless ($url =~ /^(https?:\/\/)?[\w#_-]+(\.[\w#_-]+)+[\w\/#=?_.-]*$/) {
die "Illegal URL: $url\n" ; die "Illegal URL: $url\n" ;
} }
unless ($format =~ /^markdown\+?|rst|latex|context|rtf|man|docbook$/) { unless ($format =~ /^markdown\+?|rst|latex|context|rtf|man|docbook$/) {
die "Illegal format: $format\n"; die "Illegal format: $format\n";
} }
my $output = `wget -O- $url | tidy -asxhtml -utf8 | pandoc -w $format $options`; # Note - pass through head to truncate file to 100K if greater.
# This should prevent certain kinds of DoS attacks.
my $output = `wget -O- $url | head -c100000 | tidy -asxhtml -utf8 | pandoc -w $format $options`;
if ($output =~ /^\s*$/) { if ($output =~ /^\s*$/) {
print start_html, print start_html,
h1("No output"), h1("No output"),