#!/usr/bin/perl
#
#	Turns the HTML files from the Asimov site into
#	text files, suitable to go onto a mobile device
#
# Only tested with the Charles Stross stories, may need slight tweaking
#  for others
#
# Note - this script strips out all the copyright notices etc., so
#        you probably can't redistribute the files you make with
#        it
#
# This script is covered by the GNU General Public License (GPL), details
#  can be found at http://www.gnu.org/copyleft/gpl.html
#
#				v0.02	21/08/2003

use strict;
use warnings;

# The single command-line argument is the HTML file to convert.
my $file = shift;

unless($file) {
	die("Usage:\n  html-to-text.pl <filename>\n");
}

# The output name is the input name with a trailing .html or .shtml
# replaced by .txt.
my $outfile = $file;
$outfile =~ s/\.s?html$/.txt/;

# If neither extension was present the name is unchanged; stop here rather
# than open the input file for writing and truncate it.
if($file eq $outfile) {
	die("In and out the same!");
}

# Three-argument open keeps characters such as '>' or '|' in a file name
# from being interpreted as an open mode, and a failed open now stops the
# script instead of letting it carry on with an unopened handle.  The
# bareword handles IN and OUT are kept because the rest of the script
# refers to them by name.
open(IN, '<', $file) or die("Can't read $file: $!\n");
open(OUT, '>', $outfile) or die("Can't write $outfile: $!\n");

# Pull the paragraph text out of one line of HTML.
#
#   $line   - one chomped line of the page
#   returns - the raw inner HTML of the paragraph, a single space for an
#             empty <p></p>, or undef when the line holds no paragraph
sub _extract_paragraph {
	my ($line) = @_;
	my $text;

	# The checks run in this order on purpose: a later match overrides
	# an earlier one.
	$text = " " if $line =~ /<p.*?><\/p>/;
	$text = $1  if $line =~ /<p.*?>(.+)<\/p>/;

	# A paragraph cut off by <br> gets a "</font>" appended, presumably so
	# that the font-stripping step in _clean_text() finds a closing tag
	# for a <font> opened inside the paragraph.
	$text = $1."</font>" if $line =~ /<p.*?>(.+)<br>/;

	return $text;
}

# Turn the HTML entities and inline markup of one paragraph into plain text.
#
#   $text   - raw inner HTML as returned by _extract_paragraph()
#   returns - the cleaned text (may be empty)
sub _clean_text {
	my ($text) = @_;

	# Note - Nokia 3650 uses ISO-8859-1
	#  As such, we convert accents into that
	$text =~ s/&agrave;/\xe0/g;
	$text =~ s/&egrave;/\xe8/g;
	$text =~ s/&igrave;/\xec/g;
	$text =~ s/&ograve;/\xf2/g;
	$text =~ s/&aacute;/\xe1/g;
	$text =~ s/&eacute;/\xe9/g;
	$text =~ s/&iacute;/\xed/g;
	$text =~ s/&oacute;/\xf3/g;

	$text =~ s/&ntilde;/\xf1/g;
	$text =~ s/&iquest;/\xbf/g;
	$text =~ s/&iexcl;/\xa1/g;

	# Quotes and dashes become their plain ASCII look-alikes.  &amp; is
	# handled last so that the '&' it produces is never mistaken for the
	# start of another entity.
	$text =~ s/&quot;/"/g;
	$text =~ s/&\#145;/`/g;
	$text =~ s/&\#146;/'/g;
	$text =~ s/&\#150;/-/g;
	$text =~ s/&amp;/&/g;

	# Italics are rendered as _underscores_.
	$text =~ s/<\/i>\s?<i>/ /gi;          # join back-to-back italic runs
	$text =~ s/<i>(.*?)<\/i>/_$1_/gi;     # <i>x</i>  ->  _x_
	# A closing </i> that is still left has no opening tag in this text;
	# mark everything from the start of the text up to it.
	$text =~ s/^(.*?)<\/i>/_$1_/gi;
	# Remove any "_ _" sequence (what e.g. "<i> </i>" has just become).
	$text =~ s/_ _//g;

	# Drop the first <font ...> tag and the last </font>, keeping what
	# lies between them.
	$text =~ s/<font.*?>(.*)<\/font>/$1/;

	return $text;
}

# Read the HTML page from $in and write the plain-text story to $out.
#
#   $in  - handle open for reading on the HTML page
#   $out - handle open for writing on the text file
#
# Nothing is written until the title cell is found; conversion stops at
# the first </table> seen after that.
sub _convert_story {
	my ($in, $out) = @_;

	# 0 = still looking for the title cell
	# 1 = title has just been printed (lasts for the title line only)
	# 2 = inside the story body
	my $state = 0;

	LINE:
	while (my $line = <$in>) {
		chomp $line;

		if ($state == 0) {
			# Some title cells carry an extra height attribute; remove
			# it so that the pattern below matches either form.
			$line =~ s/\sheight=\"1\%\"//;
			if ($line =~ /<td colspan="3" align="left" bgcolor="#.{6}"><font face="Arial,Helvetica" color="white"><b>(.*)<\/b>(.*)<\/font/) {
				my ($name, $author) = ($1, $2);
				$name =~ s/\s+$//;
				$name =~ s/<\/b>.*<b>//;
				$author =~ s/\s+$//;
				my $title = $name." - ".$author;
				my $underline = "-" x length($title);

				print {$out} "    $title\n";
				print {$out} "    $underline\n\n";
				$state = 1;
			}
		}

		# Deliberately not an "else": the line that carried the title is
		# also run through the paragraph handling below.
		next LINE unless $state == 1 || $state == 2;

		my $text = _extract_paragraph($line);

		# Test with defined/length rather than plain truth so that a
		# paragraph consisting only of "0" is not thrown away.
		if (defined $text) {
			$text = _clean_text($text);
			print {$out} $text."\n\n" if length $text;
		}

		if ($state == 1) {
			print {$out} "\n";
			$state = 2;
		}

		# The story ends with the enclosing table; nothing after it is
		# converted, so there is no point in reading any further.
		last LINE if $line =~ /<\/table\>/;
	}

	return;
}

# Convert the page using the handles opened earlier in the script.
_convert_story(\*IN, \*OUT);
# The input handle was only read from; nothing is lost if closing it fails.
close IN;

# Buffered output is flushed when the handle is closed, so a write error
# (full disk, quota, ...) may only be reported here.  Stop with a message
# rather than leave a silently truncated text file behind.
close(OUT) or die("Error closing $outfile: $!\n");
