#!/usr/pkg/bin/perl
#
# @(#)proc-httpd-access.pl,v 1.9 2006/06/04 21:16:46 kim Exp
#
# Output TOP 10 browsers and, for each site, TOP 10 requested web documents
#
# 1996-07-28  Kimmo Suominen
#
# First tell the daemon to switch to a new file
#
#if (-r '/var/run/httpd.pid') {
#    open(DPIDF, '</var/run/httpd.pid');
#    $dpid = <DPIDF>;
#    kill 'HUP', $dpid;
#    close(DPIDF);
#}
#
# Process the log
#
# The common log format is: "%h %l %u %t \"%r\" %s %b"
#
# We use an extended form of the combined log format to include
# the virtual site.
#
# "[%v] %h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\""
#
# This script should be processing all three of the common, combined
# and vhost formats just fine.
#
while (<>) {
    if (m!^(\[([^\s]+)\]\s+)?.*\s+"?(GET|HEAD)\s+(.*?)(\s+HTTP/[.0-9]+)?"?\s+([-0-9]+)\s+(([0-9]+)((\s+"([^"]+)")(\s+"([^"]+))?)?)?!) {
	$sitenm = $2;
	$rawurl = $4;
	$stcode = $6;
	next if (($stcode != 200) && ($stcode != 304));
	$sitenm = "" if (! $sitenm);
	$site{$sitenm}++;
	$bytes{$sitenm} += $8 if defined($8);
	$referer = $11 if defined($11);
	$rawbrowser = $13 if defined($13);
	$url = "";
	while ($rawurl =~ m!(.*?)%([0-9a-fA-F]{2})(.*)!) {
	    $url .= $1 . chr(hex($2));
	    $rawurl = $3;
	}
	$url .= $rawurl;
	# strip GET parameters
	$url =~ s/\?.*//;
	# strip language codes from URL
	$url =~ s/\.(de|en|es|fi|fr|sv)(\.[^\/]+)?$/\2/;
	# strip default index file if explicitly addressed
	$url =~ s!/(default|index)\.(cgi|s?html?|php|pl)$!/!;
	# skip audio files, graphics, executables, style sheets, proxy confs
	next if ($url =~ m!\.(au|css|exe|gif|jpe?g|midi?|pac|png|x[bp]m)$!i);
	# skip FrontPage authoring URLs
	next if ($url =~ m!/_vti_!i);
	# count the rest
	$doc{$sitenm}{$url}++;
	# We only want major version number for each browser
	# Some browsers try hard to look like Mozilla -- try to detect
	if ($rawbrowser =~ m!\s*(([^/ 0-9]+)[/ ]?([0-9.]+)?)(.*compatible;\s*([^/ 0-9]+)[/ ]?((\w|\.)*)?)?!) {
	    if (defined($5)) {
		if (defined($6)) {
		    $browser{$5 . ' ' . $6}++;
		} else {
		    $browser{$5}++;
		}
	    } else {
		if (defined($3)) {
		    $browser{$2 . ' ' . $3}++
		} else {
		    $browser{$2}++;
		}
	    }
	}
    }
}
#
# Output TOP 10 browsers
#
$k = 1;
printf "%10s %-s\n", "TOP 10", "browsers by successful accesses";
printf "%10s %-s\n", "--------", "------------------------------------------------------------";
foreach $i (sort {$browser{$b} <=> $browser{$a}} (keys %browser)) {
    printf "%10d %-s\n", $browser{$i}, $i;
    last if (++$k > 10);
}
printf "\n";
#
# Output TOP 10 documents
#
foreach $j (sort (keys %site)) {
    $k = 1;
    printf "%10d accesses on %-s (%.1f kB)\n", $site{$j},
	($j eq "" ? "default server" : $j), $bytes{$j} / 1024;
    printf "%10s %-s\n", "--------", "------------------------------------------------------------";
    foreach $i (sort {$doc{$j}{$b} <=> $doc{$j}{$a}} (keys %{$doc{$j}})) {
	printf "%10d %-s\n", $doc{$j}{$i}, $i;
	last if (++$k > 10);
    }
    printf "\n";
}
