#! /usr/bin/perl -w
#
# topweb - rank the hosts behind a Squid proxy by bytes fetched directly
#
# david landgren 24-apr-2001

use strict;
use warnings;

# tally_log($fh, \%domain)
#
# Read log lines from the open handle $fh. For every line whose ninth
# whitespace-separated field has the form DIRECT/<host>, add the fifth field
# (the size) to $domain->{<host>}{SIZE} and bump $domain->{<host>}{HITS}.
# Every other line is ignored.
#
# Returns the number of bytes added by this handle.
sub tally_log {
    my ($fh, $domain) = @_;
    my $bytes = 0;

    while (my $line = <$fh>) {
        chomp $line;
        my ($size, $command) = (split ' ', $line)[4, 8];

        # Blank or truncated lines have no ninth field; skip them rather
        # than matching against undef.
        next unless defined $command;

        # Only transfers fetched straight from the origin server count.
        my ($dom) = $command =~ m{^DIRECT/(.*)} or next;

        $bytes                 += $size;
        $domain->{$dom}{SIZE}  += $size;
        $domain->{$dom}{HITS}++;
    }

    return $bytes;
}

# report_lines(\%domain, $total_size)
#
# Build the report from the tallies produced by tally_log(). Hosts are
# ordered by descending SIZE; hosts with equal SIZE are ordered by name so
# that the output does not depend on hash ordering.
#
# Returns a list of newline-terminated, tab-delimited lines:
#   rank, hits, bytes, percent, cumulative percent, host
sub report_lines {
    my ($domain, $total_size) = @_;

    my @ranked = sort {
        $domain->{$b}{SIZE} <=> $domain->{$a}{SIZE}
            or $a cmp $b
    } keys %$domain;

    my @lines;
    my $count       = 0;
    my $cum_percent = 0;

    foreach my $d (@ranked) {
        ++$count;

        # A total of zero means every transfer logged zero bytes; report 0%
        # instead of dividing by zero.
        my $percent = $total_size
            ? $domain->{$d}{SIZE} * 100 / $total_size
            : 0;
        $cum_percent += $percent;

        my $percent_rounded     = sprintf '%0.3f%%', $percent;
        my $cum_percent_rounded = sprintf '%0.3f%%', $cum_percent;

        push @lines, join("\t",
            $count,
            $domain->{$d}{HITS},
            $domain->{$d}{SIZE},
            $percent_rounded,
            $cum_percent_rounded,
            $d,
        ) . "\n";
    }

    return @lines;
}

# main(@files)
#
# Tally every log file named in @files and print the ranked report on
# STDOUT. Dies if a file cannot be opened. File names are taken literally
# (three-argument open), so shell-style names such as "cmd |" are not
# interpreted.
sub main {
    my @files = @_;

    my %domain;
    my $total_size = 0;

    foreach my $file (@files) {
        open my $fh, '<', $file
            or die "Cannot open $file for input: $!\n";
        $total_size += tally_log($fh, \%domain);
        close $fh;
    }

    print report_lines(\%domain, $total_size);
    return;
}

# Produce the report only when run as a program. When the file is loaded
# with require/do, only the subroutines above are defined.
main(@ARGV) unless caller;

=head1 NAME

topweb - Determine biggest targets of inbound HTTP traffic

=head1 SYNOPSIS

B<topweb> filespec [filespec...]

=head1 DESCRIPTION

Generate a snapshot of direct web traffic recorded by a Squid proxy.
Scan the Squid access logs specified on the command line looking for
DIRECT connections. Accumulate the number of hits and bytes transferred
for each FQDN. Sort and print the results based on bytes transferred.

The goal is to see how much real traffic is coming in due to cache misses.

=head1 OUTPUT

This program outputs a tab-delimited text file. The fields are as follows:

=over 4

=item *

rank -- from 1 to n, the rank in terms of bytes transferred for the domain.

=item *

hits -- the number of separate transfers logged.

=item *

bytes -- the total number of bytes transferred from the above hits.

=item *

percent -- the percentage that this site represents in terms of the
total traffic.

=item *

cumulative percent -- the percentage that this site and all busier sites
represent in terms of the total traffic.

=item *

fqdn -- the fully qualified domain name of the host, or numeric IP address
if the address does not resolve.

=back

Here is a sample output, which indicates, among other things, that the
four most demanded sites in this data sample represent 10% of incoming
traffic:

  1  25226  106606531  2.877%   2.877%  www.cadremploi.fr
  2  15996  104380579  2.817%   5.693%  mailv2.voila.fr
  3  24842   97149410  2.621%   8.315%  www.apec.asso.fr
  4  16861   81954034  2.211%  10.526%  www.voila.fr

=head1 EXAMPLES

C<topweb access.log E<gt>topweb.yyyymmdd>

=head1 SEE ALSO

topwebdiff - A report tool to analyse the day to day changes of the
output from topweb.

=head1 COPYRIGHT

Copyright (c) 2001 David Landgren.

This script is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=head1 AUTHOR

David "grinder" Landgren

grinder on perlmonks (http://www.perlmonks.org/)

eval {join chr(64) => qw[landgren bpinet.com]}

=cut