#!/usr/local/bin/perl -Tw

#NAME
#  httpcat - catenate web pages to standard output

#SYNOPSIS
#  httpcat [ URL | path ]...

#REQUIRES
#  perl5 and the following  modules,  which  should  be  found  in  the  same
#  directory  where  you found this program.  Change @INC to include whatever
#  directory you put them into.

	push @INC,"$ENV{HOME}/sh",'sh','.';
	require "Vopt.pm";		#Verbose output.
	require "HTTPcon.pm";	#Makes HTTP connection, sends GET.
	require "HTMLdir.pm";	#Produces HTML listing of directory.
	require "URLopen.pm";	#Parses URL and returns file handle.

#DESCRIPTION
#  This is a web version of the Unix cat(1) command.

#  Given a list of URLs, this program reads them one at a  time,  and  writes
#  their contents, catenated into one long string, to standard output.  Local
#  file names may be used instead of  URLs.   Directories  are  output  in  a
#  simplified HTML format.

#  If you want to learn how to do this stuff, you can study this program.  It
#  is  useful as a starting point for writing other simple web clients.  It's
#  not nearly as difficult as people would like  you  to  believe.   But  the
#  socket  stuff  uses  several  magical  incantations that "you just have to
#  know"; see the required perl module files for this socket magic.

#OPTIONS
#  The default setup is to deliver only the data  (contents)  of  a  URL  and
#  discard  the  header  and  tracing  information.   Here is the list of our
#  current options.  The options may be combined into  a  single  string,  as
#  usual,  with  the  qualification  that options which have an arg (O and P)
#  must be the last in the string.  Options may be in any order, and apply to
#  all subsequent URLs unless canceled by another option.

#  +B
#    Binary data; don't alter white space.

#  -B
#    Non-binary; reduce trailing white space to single \n char.

#  +D
#    Output the data [default].

#  -D
#    Don't output the data.

#  -E
#    Don't extract text; deliver the data as-is. (default)
#  +E
#    Extract the text.  This is a simple conversion from HTML to plain  text.
#    Assorted  line-end  tags  are  converted  to  \n or \n\n as appropriate.
#    Hyperlinks are converted to <URL:...>, which is recognized by a  lot  of
#    non-HTML software, including ABCbot.

#	+H
#    Include the HTTP header info in the output.

#  -H
#    Don't include the HTTP header info in the output [default].


#  -I
#    Don't send agent identification (default).
#  +I"agentid"
#    Send the quoted string as the agent identification. Some web sites won't
#    talk to you unless you pretend to be an acceptable browser.  If there is
#    no string, we send the following string, which seems  to  convince  most
#    servers  that  we're  either  netscape  or  IE  (which  pretends  to  be
#    netscape), plus a bogus sytem name to confuse those servers  that  won't
#    serve  to linux or BSD users.  Feel free to use your own AgentID string,
#    but you should know that there is server software out  there  that  will
#    refuse service to any client that doesn't call itself "Mozilla".

	$dflagentid = "Mozilla/4.5 [en] (compatible; I; Mumble 5.7)";
#	$dflagentid = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)";

#  +M
#    Macintosh kludge: convert \r to \n (if not followed by \n).  This may be
#    obsolescent,  now  that  we  have  code to automatically split the input
#    apart on \r, \n or \r\n.

#  -O<file>
#    Write the data to <file>.  Default is stdout.

#  +P<proxy>
#    Proxy gateway.  If you are hidden behind a proxy, put the proxy hostname
#    (and  optionally  :port)  in  a  -P  option, and we will try to indirect
#    through the proxy server.

#  -P<noproxy>
#    Proxy exception. The <noproxy> string should be a perl pattern. If a URL
#    matches this pattern, the proxy gateway isn't used.

#  -R
#    Ignore redirects (default).
#  +R
#    Follow HTTP "Location:" redirects.

#  +T
#    Enable WWW tracing. This sets a global flag that causes various routines
#    to produce lines of the form:
#      <!--name:  ...  some message ...  -->
#    These look like both HTTP header lines and HTML comments. Some WWW tools
#    (such as the "H" html viewer) can show these to tell you which stage  of
#    a GET operation we have reached.

#  -T
#    Disable WWW tracing [default].

#  -T<n>
#    Timeout of <n> seconds.  The default is no  timeout,  meaning  that  the
#    underlying system's connect() will determine the timeout, if any.

#  +T<n>
#    Retry <N> times on dropped connections. This is to handle a failure mode
#    seen  on some servers:  The server accepts the TCP connection, and drops
#    the connection after receiving the GET request, without even sending  an
#    error code. We retry such connections $tries time, with a default of 10,
#    with 1-second delays between tries.  This  option  selects  a  different
#    limit to the number of retries.  (See home1.swipnet.se for examples.)

#  -V<version>
#    The HTTP version to use. The default is -V1.0. For -V1.1, an extra Host:
#    header is sent, since some servers require it.

#  +X
#    Exit abruptly on timeout. The default is -X, which means to just abandon
#    the  URL.   On some systems, there is a bug in the connect() system call
#    that can result in hanging indefinitely; this option  is  a  last-resort
#    "solution" that "works" if you are only trying to get one URL.

#ENVIRONMENT
#  We use the following from the environment:

#  W3PROXY
#    The name (or address) and an optional :port for a proxy  gateway.   URLs
#    that  don't  match  the  W3NOPROXY  will  be  fetched indirectly via the
#    proxy's web  server.   If  not  defined,  we  will  attempt  direct  TCP
#    connections for all URLs.

#  W3NOPROXY
#    A pattern which is applied to URLs, and if they match, no proxy is used.
#    That is, any URL that matches W3NOPROXY is considered local, and we will
#    access it directly. If not defined, we will use W3PROXY for all URLs (if
#    it exists).

#LIMITATIONS
#  So far only the http://  protocol  is  implemented;  ftp://,  file://  and
#  others  may appear if I need them.  If someone feels like adding FTP code,
#  you might send me a copy.

#  HTTP  "redirection"  (the  "Location:" HTTP header) is implemented now via
#  the +R option.  By default, it is disabled and  must  be  handled  by  the
#  caller, if desired.  This mainly means that if you omit the final '/' on a
#  directory name, we will fail.  This is not  considered  a  bug,  so  it'll
#  probably never be fixed.

#DEBUGGING
#  You can use "perl -dw", of course. Or you can do the following:
#    setenv V_httpcat 5/tmp/httpcat.out	# csh or tcsh users.
#    export V_httpcat=5/tmp/httpcat.out	# ksh or bash users.
#  This will turn on the "print V" lines for $V in the range 0-5,  and  write
#  the verbose output to /tmp/httpcat.out.

#BUGS
#  Despite many attempts to detect failure, we still don't  optimally  handle
#  all  the myriad things that can go wrong.  In particular, on some systems,
#  the connect() system call can hang indefinitely and can't be killed by  an
#  ALARM. There does not appear to be any known solution to this problem. (No
#  amount of clever code will help if your process doesn't get any cpu time.)

#AUTHOR
#  <a href="mailto:jc@trillian.mit.edu">John Chambers</a>
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

$| = 1;					# Don't buffer output.
$exitstat =   0;		# Set this to get a failure exit status.
($me = $0) =~ s"^.*/"";
&Vopt($ENV{"V_$me"} || $ENV{"D_$me"} || $ENV{"T_$me"} || '1');
print V "$me: Started ", `date` if $V>1;

#bufsiz =    10;		# Small for testing.
$bufsiz = 10000;		# Large for routine use.

$W3hdrs = 0;			# Whether to output header lines.
$W3data = 1;			# Whether to output data.
$kludge1_1_404 = 0;	# HTTP/1.1 GET required.

# That's all the config stuff you should have to worry about.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here's our table for converting HTML's symbolic entities to characters. The
# value  is  the  decimal code for the character in 8859-1.  How to deal with
# other character sets is still not very clearly defined.

%htmlsym = (
	'lt'     =>  60,	# Less than
	'gt',    =>  62,	# Greater than
	'quot'   =>  34,	# (Double) quote
	'nbsp'   => 160,	# no-break space = non-breaking space, U+00A0 ISOnum
	'iexcl'  => 161,	# cent sign, U+00A2 ISOnum
	'pound'  => 163,	# currency sign, U+00A4 ISOnum
	'yen'    => 165,	# broken bar = broken vertical bar,
	'sect'   => 167,	# diaeresis = spacing diaeresis,
	'copy'   => 169,	# feminine ordinal indicator, U+00AA ISOnum
	'laquo'  => 171,	# left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum
	'not'    => 172,	# soft hyphen = discretionary hyphen,
	'reg'    => 174,	# registered sign = registered trade mark sign, U+00AE ISOnum
	'macr'   => 175,	# macron = spacing macron = overline = APL overbar, U+00AF ISOdia
	'deg'    => 176,	# plus-minus sign = plus-or-minus sign,
	'sup2'   => 178,	# superscript two = superscript digit two = squared, U+00B2 ISOnum
	'sup3'   => 179,	# superscript three = superscript digit three = cubed, U+00B3 ISOnum
	'acute'  => 180,	# acute accent = spacing acute, U+00B4 ISOdia
	'micro'  => 181,	# pilcrow sign = paragraph sign,
	'middot' => 183,	# middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum
	'cedil'  => 184,	# superscript one = superscript digit one,
	'ordm'   => 186,	# masculine ordinal indicator, U+00BA ISOnum
	'raquo'  => 187,	# right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum
	'frac14' => 188,	# vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum
	'frac12' => 189,	# vulgar fraction one half = fraction one half, U+00BD ISOnum
	'frac34' => 190,	# vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum
	'iquest' => 191,	# inverted question mark = turned question mark, U+00BF ISOnum
	'Agrave' => 192,	# latin capital letter A with grave = latin capital letter A grave,
	'Aacute' => 193,	# latin capital letter A with acute, U+00C1 ISOlat1
	'Acirc'  => 194,	# latin capital letter A with circumflex, U+00C2 ISOlat1
	'Atilde' => 195,	# latin capital letter A with tilde, U+00C3 ISOlat1
	'Auml'   => 196,	# latin capital letter A with diaeresis, U+00C4 ISOlat1
	'Aring'  => 197,	# latin capital letter A with ring above = latin capital letter A ring,
	'AElig'  => 198,	# latin capital letter AE = latin capital ligature AE,
	'Ccedil' => 199,	# latin capital letter C with cedilla, U+00C7 ISOlat1
	'Egrave' => 200,	# latin capital letter E with grave, U+00C8 ISOlat1
	'Eacute' => 201,	# latin capital letter E with acute, U+00C9 ISOlat1
	'Ecirc'  => 202,	# latin capital letter E with circumflex, U+00CA ISOlat1
	'Euml'   => 203,	# latin capital letter E with diaeresis, U+00CB ISOlat1
	'Igrave' => 204,	# latin capital letter I with grave, U+00CC ISOlat1
	'Iacute' => 205,	# latin capital letter I with acute, U+00CD ISOlat1
	'Icirc'  => 206,	# latin capital letter I with circumflex, U+00CE ISOlat1
	'Iuml'   => 207,	# latin capital letter I with diaeresis, U+00CF ISOlat1
	'ETH'    => 208,	# latin capital letter N with tilde,
	'Ograve' => 210,	# latin capital letter O with grave, U+00D2 ISOlat1
	'Oacute' => 211,	# latin capital letter O with acute, U+00D3 ISOlat1
	'Ocirc'  => 212,	# latin capital letter O with circumflex, U+00D4 ISOlat1
	'Otilde' => 213,	# latin capital letter O with tilde, U+00D5 ISOlat1
	'Ouml'   => 214,	# latin capital letter O with diaeresis, U+00D6 ISOlat1
	'times'  => 215,	# latin capital letter O with stroke
	'Ugrave' => 217,	# latin capital letter U with grave, U+00D9 ISOlat1
	'Uacute' => 218,	# latin capital letter U with acute, U+00DA ISOlat1
	'Ucirc'  => 219,	# latin capital letter U with circumflex, U+00DB ISOlat1
	'Uuml'   => 220,	# latin capital letter U with diaeresis, U+00DC ISOlat1
	'Yacute' => 221,	# latin capital letter Y with acute, U+00DD ISOlat1
	'THORN'  => 222,	# latin capital letter THORN, U+00DE ISOlat1
	'szlig'  => 223,	# latin small letter sharp s = ess-zed, U+00DF ISOlat1
	'agrave' => 224,	# latin small letter a with grave = latin small letter a grave,
	'aacute' => 225,	# latin small letter a with acute, U+00E1 ISOlat1
	'acirc'  => 226,	# latin small letter a with circumflex, U+00E2 ISOlat1
	'atilde' => 227,	# latin small letter a with tilde, U+00E3 ISOlat1
	'auml'   => 228,	# latin small letter a with diaeresis, U+00E4 ISOlat1
	'aring'  => 229,	# latin small letter a with ring above = latin small letter a ring,
	'aelig'  => 230,	# latin small letter ae = latin small ligature ae, U+00E6 ISOlat1
	'ccedil' => 231,	# latin small letter c with cedilla, U+00E7 ISOlat1
	'egrave' => 232,	# latin small letter e with grave, U+00E8 ISOlat1
	'eacute' => 233,	# latin small letter e with acute, U+00E9 ISOlat1
	'ecirc'  => 234,	# latin small letter e with circumflex, U+00EA ISOlat1
	'euml'   => 235,	# latin small letter e with diaeresis, U+00EB ISOlat1
	'igrave' => 236,	# latin small letter i with grave, U+00EC ISOlat1
	'iacute' => 237,	# latin small letter i with acute, U+00ED ISOlat1
	'icirc'  => 238,	# latin small letter i with circumflex, U+00EE ISOlat1
	'iuml'   => 239,	# latin small letter i with diaeresis, U+00EF ISOlat1
	'eth'    => 240,	# latin small letter n with tilde,
	'ograve' => 242,	# latin small letter o with grave, U+00F2 ISOlat1
	'oacute' => 243,	# latin small letter o with acute, U+00F3 ISOlat1
	'ocirc'  => 244,	# latin small letter o with circumflex, U+00F4 ISOlat1
	'otilde' => 245,	# latin small letter o with tilde, U+00F5 ISOlat1
	'ouml'   => 246,	# latin small letter o with diaeresis, U+00F6 ISOlat1
	'divide' => 247,	# latin small letter o with stroke,
	'ugrave' => 249,	# latin small letter u with grave, U+00F9 ISOlat1
	'uacute' => 250,	# latin small letter u with acute, U+00FA ISOlat1
	'ucirc'  => 251,	# latin small letter u with circumflex, U+00FB ISOlat1
	'uuml'   => 252,	# latin small letter u with diaeresis, U+00FC ISOlat1
	'yacute' => 253,	# latin small letter y with acute, U+00FD ISOlat1
	'thorn'  => 254,	# latin small letter thorn, U+00FE ISOlat1
	'yuml'   => 255,	# latin small letter y with diaeresis, U+00FF ISOlat1
);

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

arg:
for $u (@ARGV) {
	$moved = 0;
	$URLerr = "Don't know why";				# Set by URLopen when failures.
	if (($pfx,$opt) = ($u =~ /^([-+])(.*)/)) {
		while ($opt) {						# Each time MUST remove at least one char.
			if ($opt =~ s/^B//i) {			# +B or -B (binary/nonbinary mode)
				$binary = ($pfx eq '+') ? 1 : 0;
				print V ($binary ? "Binary" : "Text"), " data.\n" if $V>1;
			} elsif ($opt =~ s/^D//i) {			# +D or -D (whether to produce data)
				$W3data = ($pfx eq '+') ? 1 : 0;
				print V ($W3data ? "Do" : "Don't"), " produce data.\n" if $V>1;
			} elsif ($opt =~ s/^E//i) {			# +E or -E (whether to extract text)
				$Etext = ($pfx eq '+') ? 1 : 0;
				print V ($Etext ? "Do" : "Don't"), " extract text.\n" if $V>1;
			} elsif ($opt =~ s/^H//i) {		# +H or -H (whether to produce headers)
				$W3hdrs = ($pfx eq '+') ? 1 : 0;
				print V ($W3hdrs ? "Do" : "Don't"), " produce headers.\n" if $V>1;
			} elsif ($opt =~ s/^I//i) {		# +I<agentid> or -P
				if ($pfx eq '-') {
					$W3agentid = '';
					print V "$me: No agent identification." if $V>1;
				} else {
					$W3agentid = $opt || $dflagentid;	# Rest of string is id.
					print V "$me: Agent \"$W3agentid\"\n" if $V>1;
					$opt = '';
				}
			} elsif ($opt =~ s/^M//i) {      # +M or -M (Macintosh kludge)
				$MACfl = ($pfx eq '+') ? 1 : 0;
				print V "$me: Mac kludge " . ($MACfl?'on':'off') . ".\n" if $V>1;
			} elsif ($opt =~ s/^O//i) {		# +O or -O (output file)
				$outfile = $opt;			# Rest of string is file name.
				$opt = '';
				print V "$me: Output to \"$outfile\"\n" if $V>1;
			} elsif ($opt =~ s/^P//i) {		# +P<proxy> or -P<noproxy>
				if ($pfx eq '-') {
					$W3nopxy = $opt;		# Rest of string is pattern.
					print V "$me: Proxy exceptions are /$W3nopxy/\n" if $V>1;
				} else {
					$W3proxy = $opt;		# Rest of string is proxy host.
					print V "$me: Proxy server is $W3nopxy.\n" if $V>1;
				}
				$opt = '';
			} elsif ($opt =~ s/^R//i) {		# +R or -R (whether to follow 302 redirects)
				$W302 = ($pfx eq '+') ? 1 : 0;
				print V ($W302 ? "Do" : "Don't"), " follow 302 redirects.\n" if $V>1;
			} elsif ($opt =~ s/^T//i) {		# +T or -T (WWW tracing) or -T<n> (timeout)
				if ($opt =~ s/^(\d+)//) {	# T with number:
					if ($pfx eq '+') {
						$tries = $1;		# +T\d+ is limit to tries.
						print V "tries = $tries.\n" if $V>1;
					} else {
						$HTTPtimeout = $1;	# -T\d+ is timeout.
						print V "HTTPtimeout = $HTTPtimeout sec.\n" if $V>1;
					}
				} else {					# T without number:
					if ($pfx eq '+') {
						$W3trace = 1;		# +T enables tracing.
						print V "Do produce WWW tracing.\n" if $V>1;
					} else {
						$W3trace = 0;		# -T disables tracing.
						print V "Don't produce WWW tracing.\n" if $V>1;
					}
				}
			} elsif ($opt =~ s/^V//i) {		# +V<version>
				$HTTPvopt = $HTTPversion = $opt;
				$opt = '';
				print V "$me: HTTP version '$HTTPvopt'\n" if $V>1;
			} elsif ($opt =~ s/^X//i) {		# +X
				$HTTPtimexit = ($pfx eq '+') ? 1 : 0;
				print V ($W302 ? "Do" : "Don't"), " exit on timeout.\n" if $V>1;
			} else {
				print V "$me: unknown option \"$opt\" ignored.\n";
				$opt =~ s/.//;				# Discard this option char.
			}
		}
		next arg;
	}
	$try = $hdrlines = 0;		# Count the opens and header lines.
	$tries = 10 unless $tries;
try:
	while ($try++ < $tries && $hdrlines == 0) {
		print V "$me: Try $try\n" if $V>0 && $try>1;
		sleep 1 if $try>1;	# Don't hit the server too hard.
		unless (&URLopen(*U,$u)) {
			print V "$me: Can't open \"$u\" ($URLerr)\n" if $V>1;
			$exitstat = 1;
			next arg;
		}
		print V "$me: Opened \"$u\"\n" if $V>1;
		if ($HTTPtimeout > 0) {
			alarm $HTTPtimeout;
			$savsig = $SIG{ALRM};
			$SIG{ALRM} = 'READalarm';
			print V "$me: Set alarm after $HTTPtimeout sec.\n" if $V>2;
		}
		$inTEXT  = $URLhdr ? 1 : 0;
		$inHTML  =		# Is it HTML?
		$inPRE   =		# Within a <pre>...</pre> section?
		$statmax = 0;	# Max status code seen.
		$Ubuf    = 		# Input buffer.
		$staterr = '';	# Last error message.
		# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
		# Here's where we read the data from one URL and write it  to #
		# standard output.  If you want to do something else with the #
		# data, you should rewrite this loop:                         #
		# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
		if ($outfile && !$outopen) {	# Do we need to open the output?
			print V "$me: Open \"$outfile\"\n" if $V>1;
			if (open(O,">$outfile")) {	# Try to open it for writing.
				print V "$me: Writing \"$outfile\" [$!]\n" if $V>1;
			} else {
				print V "$me: Can't write \"$outfile\" [$!]\n" if $V>0;
				$outfile = '';
			}
			$outopen = 1;
		}
		print V "$me: Headers (URLhdr=$URLhdr) ...\n" if $V>1;
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# In all cases, we must first read through the HTTP headers.  We look #
# for a few of them, and set global variables to match what we see.   #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
hdr:	while ($URLhdr && ($b = &Uline())) {
			$inTEXT = 1;
			$b =~ s/\s+$//;				# Discard trailing white stuff
			if ($W3hdrs) {if ($outfile) {print O "$b\n"} else {print "$b\n"}}
			if (length($b) > 0) {		# Anything left in the buffer?
				++$hdrlines;
				if (($httpvrs,$statcode,$statmsg) = ($b =~ /^HTTP\/([\d.]+)\s+(\d+)\s+(.*)/)) {
					print V "<!--$me: httpvrs=$httpvrs statcode=$statcode -->\n" if $V>1;
					if ($statcode > $statmax) {$statmax = $statcode; $staterr = $statmsg}
					if ($W302 && $statcode == 302) {
						print V "<!--$me: MOVED -->\n" if $V>1;
						$moved = 1;
					}
					if ($httpvrs eq '1.1' && $statcode >= 400) {
						if ($HTTPversion ne '1.1') {
							print V "<!--$me: HTTP/$HTTPversion => HTTP/1.1 404 kludge -->\n" if $W3trace;
							$kludge1_1_404 = 1;
							$HTTPversion = '1.1';
							redo arg;
						} else {
							print V "<!--$me: HTTP/1.1 $statcode $statmsg (probably real) \"$u\" -->\n" if $W3trace;
						}
					}
				} elsif ($Etext && ($b =~ m"^Transfer-Encoding:\s*(.*)$"i)) {
					if ($1 eq 'chunked') {
						print V "<!--$me: CAN'T HANDLE \"$Etext\" -->\n" if $V>0;
						close U;	# Hang up now
					}
				} elsif ($Etext && ($b =~ m"^Content-type:\s*(.*)/(.*)$"i)) {
					$doctype = lc($1);
					$subtype = lc($2);
					print V "<!--$me: Content-type: \"$doctype/$subtype\" -->\n" if $V>1;
					if ($doctype eq 'text') {
						$inTEXT = 1;;
						$inHTML = ($subtype eq 'html') ? 1 : 0;
					} else {
						$inTEXT = $inHTML = 0;
					}
					print V "<!--$me: inTEXT=$inTEXT inHTML=$inHTML -->\n" if $V>1;
				} elsif ($moved && ($b =~ /^Location:\s*(.*)$/)) {
					print V "<!--$me: New URL \"$1\" -->\n" if $V>1;
					$u = $1;
					redo arg;
				}
				if ($statcode >= 400) {
					print V "<!--$me: Fail with HTTP err $statcode ($statmsg)  \"$u\"-->\n" if $W3trace;
				}
			} else {
				$URLhdr = 0;		# Blank line ends headers
			}
		}
	}
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# We should now be positioned just after the double  CRLF  that  ends #
# HTTP header lines.  The rest should be the contents of the file.    #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	if ($statmax >= 400) {	# Did we get a fatal error code?
		exit 1;
	}
	$/ = undef if $binary;
	if ($HTTPtimeout > 0) {alarm $HTTPtimeout}
	print V "$me: Headers done.\n" if $V>1;
	if (!$W3data) {			# Data not wanted?
		print V "$me: Data not wanted, quitting.\n" if $V>1;
		close(U);			# Close this connection.
		next arg;			# Go on to next URL.
	}
#	if ($W3hdrs) {
#		print V "$me: Writing NL\n" if $V>1;
#		if ($outfile) {print O "\n"} else {print "\n"}
#	}
# Now we copy the data, doing any needed processing to each line.
data:
	while ($b = &Uline()) {
		print V "$me: Got: \"$b\"\n" if $V>5;
		if (!$W3hdrs && $URLhdr) {	# Suppressing header lines.
			print V "$me: HDR check in \"$b\"\n" if $V>1;
			if ($b =~ s/^.*\r\n\r\n//s) {
				$URLhdr = 0;		# Found \n\n separator.
			} else {
				next;				# No separator, discard it all.
			}
		}
		if ($MACfl) {$b =~ s"\r\n?"\n"g}
		if ($outfile) {print O $b} else {print $b}
		if ($HTTPtimeout) {alarm $HTTPtimeout}
	}
	unless (defined $b) {
		print V "\n$me: Error reading \"$u\" (Reason: $!)\n" if $V>0;
		$exitstat = $?;
	}
}

exit $exitstat;

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub READalarm {
	my $t = time - $HTTPopentime;
	print V "<!--$me: ALARM after $t sec -->\n" if $W3trace;
	exit -1;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Convert an HTML tag to its plain-text equivalent.
sub Htag {
	local($t) = @_;
	print V "Htag: HTML tag \"$t\"\n" if $V>2;
	if ($t =~ /a\s.*href\s*=\s*"*\s*([^\s>"]+)\s*"*\s*$/i) {
		return "<URL:\"$1\">\n";
	}
	$t = lc($t);
#	$t =~ s/"\s+/"/g;
	if ($t eq 'br')               {return "\n"}
	if ($t =~ m"^/*p\b")            {return "\n\n"}
	if ($t =~ m"^/*t[dr]\b")      {return "\n"}
	if ($t =~ m"^/*H\d+\b")       {return "\n\n"}
	if ($t =~ m"^/*[bod]l\b")     {return "\n"}
	if ($t =~ m"^li\b")           {return "\n"}
	if ($t =~ m"^hr\b")           {return "\n"}
	if ($t =~ m"^d[td]\b")        {return "\n"}
	if ($t =~ m"^/*center\b")     {return "\n"}
	if ($t =~ m"^/*blockquote\b") {return "\n"}
	if ($t eq 'pre')              {$inPRE++; return "\n"}
	if ($t eq '/pre')             {$inPRE--; return "\n"}
	if ($t =~ '^textarea\b')      {$inPRE++; return "\n"}
	if ($t =~ '^/textarea\b')     {$inPRE--; return "\n\n"}
	if ($t =~ m"^/*head\b")       {return "\n\n"}
	if ($t =~ m"^/*body\b")       {return "\n\n"}
	return '';		# Discard all others.
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Return one "line" from the URL file U. Depending on global flags, we may or #
# may  not trim away trailing white space, rewrite HTML tags, etc.  Note that #
# while in the HTTP headers ($URLhdr true), we treat the  data  as  text  and #
# return  it line at a time, regardless of settings.  When we reach the data, #
# we then start looking at how we are to process the data.                    #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub Uline {
	local($b,$c,$h,$C,$s,$n);
	local($F) = 'Uline';
loop: while (1) {
		if ($binary && !$URLhdr) {
			$h .= $Ubuf;	# Anything left over from last time?
			$Ubuf = '';		# Use it all up
			print V "$F HTML \"$h\"\n" if $V>6;
			return $h if length($h)>0;
		} elsif ($Etext && $inHTML && !$URLhdr) {
			print V "$F HTML \"$Ubuf\"\n" if $V>5;
			if ($h =~ /Notation /) {
				print V "$F ########\n" if $V>1;
			}
			if ($Ubuf =~ s/^([\s\r]+)//) {		# White space => space
				$h .= ($inPRE > 0) ? $1 : ' ';
				print V "$F HTML \"$h\"\n" if $V>5;
			} elsif ($Ubuf =~ s/^\&//) {		# HTML entity
				if ($Ubuf =~ s/^(\w+);//) {	# HTML symbolic entity
					$h .= &Hsym($1);
				} elsif ($Ubuf =~ s/^#(\d+);//) {	# One HTML numeric entity
					$h .= chr($1);
				} else {
					$h .= '&';					# False alarm
				}
			} elsif ($Ubuf =~ s/^<//) {			# One HTML tag
				if ($Ubuf =~ s/^([^>]+)>\s*//) {
					print V "$F HTML tag \"$1\"\n" if $V>2;
					$c = &Htag($1);
					if ($c eq "\n" || $c eq "\n\n") {
						$C = ($c eq "\n") ? '\n' : ($c eq "\n\n") ? '\n\n' : '###';
						print V "$F HTML \"$h\" + $C\n" if $V>2;
						return $h . $c;
					} else {
						$h .= $c;
					}
				} else {
					print V "$F Incomplete tag.\n" if $V>5;
				}
			} elsif ($Ubuf =~ s/^([^\s\r<&]+)//) {	# Up to white space or < or &
				$h .= $1;
				print V "$F HTML \"$h\"\n" if $V>5;
			} else {
				$h .= $Ubuf;
				$Ubuf = '';
				print V "$F HTML \"$h\"\n" if $V>5;
			}
			print V "$F h=\"$h\"\n" if $V>3;
			print V "$F HTML Need more ...\n" if $V>5;
			next loop if $Ubuf;
		} elsif ($Ubuf =~ s/^([^\r\n]*)([\r\n])//) {
			$s = $1;
			$c = $2;
			$C = ($c eq "\n") ? '\n' : ($c eq "\r") ? '\r' : $c;
			print V "$F Line $s$C\n" if $V>3;
			if (($c eq "\r") && $inTEXT)  {		# DOS/Macintosh kludge
				print V "$F TEXT map $C => \\n\n" if $V>4;
				unless ($Ubuf) {if (($n = read(U,$b,$bufsiz)) > 0) {$Ubuf .= $b}}
				$Ubuf =~ s/^\n//;	# remove \r\n
				$c = "\n";
			}
			return "$s$c";
		}
		$n = read(U,$b,$bufsiz);	# Read in the next chunk of data
		if ($n <= 0) {
			print V "$F Read failed; n=$n ($!)\n" if $V>1;
			$staterr =  "$!";		# Remember the error message
			$b = $h . $Ubuf;		# Whatever is left
			$Ubuf = ($n < 0) ? undef : '';
			return $b;				# Return remaining data
		}
		$Ubuf .= $b;
		print V "$F Ubuf=\"$Ubuf\"\n" if $V>6;
		print V "$F ================================\n" if $V>6;
	}
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here's the routine to convert &foo; to the corresponding 8859-1  character. #
# If it's not found, we just return the "&foo;" encoding.  Eventually we will #
# have to deal with the Unicodization of the unix world.                      #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub Hsym {
	local($s) = @_;
	local($c) = $htmlsym{$s};
	return(chr($c)) if defined $c;
	return("&$s;");
}

