#
# http://www.fabiankeil.de/sourcecode/privoxy-log-parser/
#
-# $Id: privoxy-log-parser.pl,v 1.86 2010/07/26 11:28:52 fabiankeil Exp $
+# $Id: privoxy-log-parser.pl,v 1.103 2010/11/08 17:53:29 fabiankeil Exp $
#
# TODO:
# - LOG_LEVEL_CGI, LOG_LEVEL_ERROR, LOG_LEVEL_WRITE content highlighting
# - Handle incomplete input without Perl warning about undefined variables.
# - Use generic highlighting function that takes a regex and the
# hash key as input.
+# - Add --compress and --decompress options.
#
# Copyright (c) 2007-2010 Fabian Keil <fk@fabiankeil.de>
#
CLI_OPTION_SHOW_INEFFECTIVE_FILTERS => 0,
CLI_OPTION_ACCEPT_UNKNOWN_MESSAGES => 0,
CLI_OPTION_STATISTICS => 0,
+ CLI_OPTION_URL_STATISTICS_THRESHOLD => 0,
+ CLI_OPTION_HOST_STATISTICS_THRESHOLD => 0,
SUPPRESS_SUCCEEDED_FILTER_ADDITIONS => 1,
SHOW_SCAN_INTRO => 0,
'action-bits-update' => 'light_red',
'configuration-line' => 'red',
'content-type' => 'yellow',
+ 'HOST' => HEADER_DEFAULT_COLOUR,
);
%h_colours = %h;
update_header_highlight_regex($header);
}
- } elsif ($c =~ m/^scan: ((\w+) (.+) (HTTP\/\d\.\d))/) {
+ } elsif ($c =~ m/^(scan: )(\w+ .+ HTTP\/\d\.\d)/) {
- # Client request line
- # Save for statistics (XXX: Not implemented yet)
- $req{$t}{'method'} = $2;
- $req{$t}{'destination'} = $3;
- $req{$t}{'http-version'} = $4;
-
- $c = highlight_request_line($1);
+ # scan: GET http://p.p/ HTTP/1.1
+ $c = $1 . highlight_request_line($2);
} elsif ($c =~ m/^(scan: )((?:HTTP\/\d\.\d|ICY) (\d+) (.*))/) {
- # Server response line
+ # scan: HTTP/1.1 200 OK
$req{$t}{'response_line'} = $2;
$req{$t}{'status_code'} = $3;
$req{$t}{'status_message'} = $4;
or $c =~ m/^Removing 'Connection: close' to imply keep-alive./
or $c =~ m/^keep-alive support is disabled/
or $c =~ m/^Continue hack in da house/
+ or $c =~ m/^Merged multiple header lines to:/
)
{
# XXX: Some of these may need highlighting
# Removing 'Connection: close' to imply keep-alive.
# keep-alive support is disabled. Crunching: Keep-Alive: 300.
# Continue hack in da house.
+ # Merged multiple header lines to: 'X-FORWARDED-PROTO: http X-HOST: 127.0.0.1'
} elsif ($c =~ m/^scanning headers for:/) {
$c =~ s@(?<= from )(\d+)@$h{'Number'}$1$h{'Standard'}@;
$c =~ s@(?<= to )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+ } elsif ($c =~ m/^Killed all-caps Host header line: HOST:/) {
+
+ # Killed all-caps Host header line: HOST: bestproxydb.com
+ $c = highlight_matched_host($c, '(?<=HOST: )[^\s]+');
+ $c = highlight_matched_pattern($c, 'HOST', 'HOST');
+
} else {
found_unknown_content($c);
$c =~ s@(?<=set to )(\d+)@$h{'Number'}$1$h{'Standard'}@;
$c =~ s@(?<=reading )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+ } elsif ($c =~ m/^Reducing expected bytes to /) {
+
+ # Reducing expected bytes to 0. Marking the server socket tainted after throwing 4 bytes away.
+ $c =~ s@(?<=bytes to )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+ $c =~ s@(?<=after throwing )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+
} elsif ($c =~ m/^Waiting for up to /) {
# Waiting for up to 4999 bytes from the client.
$stats{requests}++;
$stats{crunches}++;
+
+ if ($c =~ m/^Redirected:/) {
+ # Redirected: http://www.example.org/http://p.p/
+ $stats{'fast-redirections'}++;
+
+ } elsif ($c =~ m/^Blocked:/) {
+ # Blocked: blogger.googleusercontent.com:443
+ $stats{'blocked'}++;
+ }
}
# A HTTP/1.1 response without Connection header implies keep-alive.
# Keeping the server header 'Connection: keep-alive' around.
$stats{'server-keep-alive'}++;
+
+ } elsif ($c =~ m/^scan: ((\w+) (.+) (HTTP\/\d\.\d))/) {
+
+ # scan: GET http://p.p/ HTTP/1.1
+ $stats{'method'}{$2}++;
+ $stats{'ressource'}{$3}++;
+ $stats{'http-version'}{$4}++;
+
+ } elsif ($c =~ m/^scan: Host: ([^\s]+)/) {
+
+ # scan: Host: p.p
+ $stats{'hosts'}{$1}++;
}
}
'empty-responses' => 0,
'empty-responses-on-new-connections' => 0,
'empty-responses-on-reused-connections' => 0,
+ 'fast-redirections' => 0,
+ 'blocked' => 0,
+ 'reused-connections' => 0,
+ 'server-keep-alive' => 0,
);
}
sub print_stats () {
our %stats;
+ our %cli_options;
my $new_connections = $stats{requests} - $stats{crunches} - $stats{'reused-connections'};
my $outgoing_requests = $stats{requests} - $stats{crunches};
print "Client requests total: " . $stats{requests} . "\n";
print "Crunches: " . $stats{crunches} . " (" .
get_percentage($stats{requests}, $stats{crunches}) . ")\n";
+ print "Blocks: " . $stats{'blocked'} . " (" .
+ get_percentage($stats{requests}, $stats{'blocked'}) . ")\n";
+ print "Fast redirections: " . $stats{'fast-redirections'} . " (" .
+ get_percentage($stats{requests}, $stats{'fast-redirections'}) . ")\n";
print "Outgoing requests: " . $outgoing_requests . " (" .
get_percentage($stats{requests}, $outgoing_requests) . ")\n";
print "Server keep-alive offers: " . $stats{'server-keep-alive'} . " (" .
print "New outgoing connections: " . $new_connections . " (" .
get_percentage($stats{requests}, $new_connections) . ")\n";
print "Reused connections: " . $stats{'reused-connections'} . " (" .
- get_percentage($stats{requests}, $stats{'reused-connections'}) . ")\n";
+ get_percentage($stats{requests}, $stats{'reused-connections'}) .
+ "; server offers accepted: " .
+ get_percentage($stats{'server-keep-alive'}, $stats{'reused-connections'}) . ")\n";
print "Empty responses: " . $stats{'empty-responses'} . " (" .
get_percentage($stats{requests}, $stats{'empty-responses'}) . ")\n";
print "Empty responses on new connections: "
$stats{'empty-responses-on-reused-connections'} . " (" .
get_percentage($stats{requests}, $stats{'empty-responses-on-reused-connections'}) .
")\n";
+
+    # Request-line statistics only exist once at least one
+    # "scan: METHOD URL HTTP/x.y" line has been counted. Until then
+    # $stats{'method'} is not a (non-empty) hash reference, so stop here
+    # instead of printing empty distributions. Comparing the value
+    # with "eq 0" never matched: it is either undef or a hash reference.
+    if (ref($stats{'method'}) ne 'HASH' or not keys %{$stats{'method'}}) {
+        print "No request lines parsed yet.\n";
+        return;
+    }
+
+    # Request methods, most frequent first.
+    print "Method distribution:\n";
+    foreach my $method (sort {$stats{'method'}{$b} <=> $stats{'method'}{$a}} keys %{$stats{'method'}}) {
+        printf "%8d : %-8s\n", $stats{'method'}{$method}, $method;
+    }
+
+    # HTTP versions used in client request lines, most frequent first.
+    print "Client HTTP versions:\n";
+    foreach my $http_version (sort {$stats{'http-version'}{$b} <=> $stats{'http-version'}{$a}} keys %{$stats{'http-version'}}) {
+        printf "%d : %s\n", $stats{'http-version'}{$http_version}, $http_version;
+    }
+
+    # Per-URL request counts. A threshold of 0 disables the listing.
+    if ($cli_options{'url-statistics-threshold'} == 0) {
+        print "URL statistics are disabled. Increase --url-statistics-threshold to enable them.\n";
+    } else {
+        print "Requested URLs:\n";
+        foreach my $ressource (sort {$stats{'ressource'}{$b} <=> $stats{'ressource'}{$a}} keys %{$stats{'ressource'}}) {
+            # The list is sorted by descending count, so the first entry
+            # below the threshold means all remaining ones are below it too.
+            if ($stats{'ressource'}{$ressource} < $cli_options{'url-statistics-threshold'}) {
+                print "Skipped statistics for URLs below the threshold.\n";
+                last;
+            }
+            printf "%d : %s\n", $stats{'ressource'}{$ressource}, $ressource;
+        }
+    }
+
+    # Per-host request counts. A threshold of 0 disables the listing.
+    if ($cli_options{'host-statistics-threshold'} == 0) {
+        print "Host statistics are disabled. Increase --host-statistics-threshold to enable them.\n";
+    } else {
+        print "Requested Hosts:\n";
+        foreach my $host (sort {$stats{'hosts'}{$b} <=> $stats{'hosts'}{$a}} keys %{$stats{'hosts'}}) {
+            # Sorted by descending count: stop at the first host below the threshold.
+            if ($stats{'hosts'}{$host} < $cli_options{'host-statistics-threshold'}) {
+                print "Skipped statistics for Hosts below the threshold.\n";
+                last;
+            }
+            printf "%d : %s\n", $stats{'hosts'}{$host}, $host;
+        }
+    }
}
'Error' => \&gather_loglevel_error_stats,
'Fatal error' => \&handle_loglevel_ignore,
'Writing' => \&handle_loglevel_ignore,
+ 'Received' => \&handle_loglevel_ignore,
'Unknown log level' => \&handle_loglevel_ignore
);
'show-ineffective-filters' => CLI_OPTION_SHOW_INEFFECTIVE_FILTERS,
'accept-unknown-messages' => CLI_OPTION_ACCEPT_UNKNOWN_MESSAGES,
'statistics' => CLI_OPTION_STATISTICS,
+ 'url-statistics-threshold' => CLI_OPTION_URL_STATISTICS_THRESHOLD,
+ 'host-statistics-threshold'=> CLI_OPTION_HOST_STATISTICS_THRESHOLD,
);
GetOptions (
'show-ineffective-filters' => \$cli_options{'show-ineffective-filters'},
'accept-unknown-messages' => \$cli_options{'accept-unknown-messages'},
'statistics' => \$cli_options{'statistics'},
+ 'url-statistics-threshold=s'=> \$cli_options{'url-statistics-threshold'},
+ 'host-statistics-threshold=s'=> \$cli_options{'host-statistics-threshold'},
'version' => sub { VersionMessage && exit(0) },
'help' => \&help,
) or exit(1);
Options and their default values if they have any:
[--accept-unknown-messages]
+ [--host-statistics-threshold $cli_options{'host-statistics-threshold'}]
[--html-output]
[--no-embedded-css]
[--no-msecs]
[--shorten-thread-ids]
[--show-ineffective-filters]
[--statistics]
+ [--url-statistics-threshold $cli_options{'url-statistics-threshold'}]
[--title $cli_options{'title'}]
[--version]
see "perldoc $0" for more information
B<privoxy-log-parser> [B<--accept-unknown-messages>] [B<--html-output>]
[B<--no-msecs>] [B<--no-syntax-highlighting>] [B<--statistics>]
-[B<--shorten-thread-ids>] [B<--show-ineffective-filters>] [B<--version>]
+[B<--shorten-thread-ids>] [B<--show-ineffective-filters>]
+[B<--host-statistics-threshold>] [B<--url-statistics-threshold>] [B<--version>]
=head1 DESCRIPTION
[B<--accept-unknown-messages>] Don't print warnings in case of unknown messages,
just don't highlight them.
+[B<--host-statistics-threshold>] Only show the request count for a host
+if it's above or equal to the given threshold. If the threshold is 0, host
+statistics are disabled.
+
[B<--html-output>] Use HTML and CSS for the syntax highlighting. If this option is
omitted, ANSI escape sequences are used unless B<--no-syntax-highlighting> is active.
This option is only intended to make embedding log excerpts in web pages easier.
they very well might be. Also note that the results are pretty much guaranteed
to be incorrect if Privoxy and Privoxy-Log-Parser aren't in sync.
+[B<--url-statistics-threshold>] Only show the request count for a resource
+if it's above or equal to the given threshold. If the threshold is 0, URL
+statistics are disabled.
+
[B<--version>] Print version and exit.
=head1 EXAMPLES