#
# http://www.fabiankeil.de/sourcecode/privoxy-log-parser/
#
-# $Id: privoxy-log-parser.pl,v 1.88 2010/08/28 12:55:47 fabiankeil Exp $
+# $Id: privoxy-log-parser.pl,v 1.107 2010/12/11 15:40:29 fabiankeil Exp $
#
# TODO:
# - LOG_LEVEL_CGI, LOG_LEVEL_ERROR, LOG_LEVEL_WRITE content highlighting
use Getopt::Long;
use constant {
- PRIVOXY_LOG_PARSER_VERSION => '0.6',
+ PRIVOXY_LOG_PARSER_VERSION => '0.7',
# Feel free to mess with these ...
DEFAULT_BACKGROUND => 'black', # Choose registered colour (like 'black')
DEFAULT_TEXT_COLOUR => 'white', # Choose registered colour (like 'black')
CLI_OPTION_SHOW_INEFFECTIVE_FILTERS => 0,
CLI_OPTION_ACCEPT_UNKNOWN_MESSAGES => 0,
CLI_OPTION_STATISTICS => 0,
+ CLI_OPTION_UNBREAK_LINES_ONLY => 0,
+ CLI_OPTION_URL_STATISTICS_THRESHOLD => 0,
+ CLI_OPTION_HOST_STATISTICS_THRESHOLD => 0,
SUPPRESS_SUCCEEDED_FILTER_ADDITIONS => 1,
SHOW_SCAN_INTRO => 0,
'action-bits-update' => 'light_red',
'configuration-line' => 'red',
'content-type' => 'yellow',
+ 'HOST' => HEADER_DEFAULT_COLOUR,
);
%h_colours = %h;
my $message = shift;
if (LOG_UNPARSED_LINES_TO_EXTRA_FILE) {
- open(ERRORLOG, ">>" . ERROR_LOG_FILE) || die "Writing " . ERROR_LOG_FILE . " failed";
- print ERRORLOG $message;
- close(ERRORLOG);
+ open(my $errorlog_fd, ">>" . ERROR_LOG_FILE) || die "Writing " . ERROR_LOG_FILE . " failed";
+ print $errorlog_fd $message;
+ close($errorlog_fd);
}
}
} elsif ($c =~ m/^(scan: )(\w+ .+ HTTP\/\d\.\d)/) {
- # scan: HTTP/1.1 200 OK
+ # scan: GET http://p.p/ HTTP/1.1
$c = $1 . highlight_request_line($2);
} elsif ($c =~ m/^(scan: )((?:HTTP\/\d\.\d|ICY) (\d+) (.*))/) {
- # Server response line
+ # scan: HTTP/1.1 200 OK
$req{$t}{'response_line'} = $2;
$req{$t}{'status_code'} = $3;
$req{$t}{'status_message'} = $4;
or $c =~ m/^Removing 'Connection: close' to imply keep-alive./
or $c =~ m/^keep-alive support is disabled/
or $c =~ m/^Continue hack in da house/
+ or $c =~ m/^Merged multiple header lines to:/
)
{
# XXX: Some of these may need highlighting
# Removing 'Connection: close' to imply keep-alive.
# keep-alive support is disabled. Crunching: Keep-Alive: 300.
# Continue hack in da house.
+ # Merged multiple header lines to: 'X-FORWARDED-PROTO: http X-HOST: 127.0.0.1'
} elsif ($c =~ m/^scanning headers for:/) {
$c =~ s@(?<= from )(\d+)@$h{'Number'}$1$h{'Standard'}@;
$c =~ s@(?<= to )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+ } elsif ($c =~ m/^Killed all-caps Host header line: HOST:/) {
+
+ # Killed all-caps Host header line: HOST: bestproxydb.com
+ $c = highlight_matched_host($c, '(?<=HOST: )[^\s]+');
+ $c = highlight_matched_pattern($c, 'HOST', 'HOST');
+
} else {
found_unknown_content($c);
$c =~ s@(?<=set to )(\d+)@$h{'Number'}$1$h{'Standard'}@;
$c =~ s@(?<=reading )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+ } elsif ($c =~ m/^Reducing expected bytes to /) {
+
+ # Reducing expected bytes to 0. Marking the server socket tainted after throwing 4 bytes away.
+ $c =~ s@(?<=bytes to )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+ $c =~ s@(?<=after throwing )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+
} elsif ($c =~ m/^Waiting for up to /) {
# Waiting for up to 4999 bytes from the client.
$stats{requests}++;
$stats{crunches}++;
+
+ if ($c =~ m/^Redirected:/) {
+ # Redirected: http://www.example.org/http://p.p/
+ $stats{'fast-redirections'}++;
+
+ } elsif ($c =~ m/^Blocked:/) {
+ # Blocked: blogger.googleusercontent.com:443
+ $stats{'blocked'}++;
+ }
}
# A HTTP/1.1 response without Connection header implies keep-alive.
# Keeping the server header 'Connection: keep-alive' around.
$stats{'server-keep-alive'}++;
+
+ } elsif ($c =~ m/^scan: ((\w+) (.+) (HTTP\/\d\.\d))/) {
+
+ # scan: GET http://p.p/ HTTP/1.1
+ $stats{'method'}{$2}++;
+ $stats{'ressource'}{$3}++;
+ $stats{'http-version'}{$4}++;
+
+ } elsif ($c =~ m/^scan: Host: ([^\s]+)/) {
+
+ # scan: Host: p.p
+ $stats{'hosts'}{$1}++;
}
}
'empty-responses' => 0,
'empty-responses-on-new-connections' => 0,
'empty-responses-on-reused-connections' => 0,
+ 'fast-redirections' => 0,
+ 'blocked' => 0,
+ 'reused-connections' => 0,
+ 'server-keep-alive' => 0,
);
}
sub print_stats () {
our %stats;
+ our %cli_options;
my $new_connections = $stats{requests} - $stats{crunches} - $stats{'reused-connections'};
my $outgoing_requests = $stats{requests} - $stats{crunches};
print "Client requests total: " . $stats{requests} . "\n";
print "Crunches: " . $stats{crunches} . " (" .
get_percentage($stats{requests}, $stats{crunches}) . ")\n";
+ print "Blocks: " . $stats{'blocked'} . " (" .
+ get_percentage($stats{requests}, $stats{'blocked'}) . ")\n";
+ print "Fast redirections: " . $stats{'fast-redirections'} . " (" .
+ get_percentage($stats{requests}, $stats{'fast-redirections'}) . ")\n";
print "Outgoing requests: " . $outgoing_requests . " (" .
get_percentage($stats{requests}, $outgoing_requests) . ")\n";
print "Server keep-alive offers: " . $stats{'server-keep-alive'} . " (" .
print "New outgoing connections: " . $new_connections . " (" .
get_percentage($stats{requests}, $new_connections) . ")\n";
print "Reused connections: " . $stats{'reused-connections'} . " (" .
- get_percentage($stats{requests}, $stats{'reused-connections'}) . ")\n";
+ get_percentage($stats{requests}, $stats{'reused-connections'}) .
+ "; server offers accepted: " .
+ get_percentage($stats{'server-keep-alive'}, $stats{'reused-connections'}) . ")\n";
print "Empty responses: " . $stats{'empty-responses'} . " (" .
get_percentage($stats{requests}, $stats{'empty-responses'}) . ")\n";
print "Empty responses on new connections: "
$stats{'empty-responses-on-reused-connections'} . " (" .
get_percentage($stats{requests}, $stats{'empty-responses-on-reused-connections'}) .
")\n";
+
+ if ($stats{method} eq 0) {
+ print "No response lines parsed yet yet.\n";
+ return;
+ }
+ print "Method distribution:\n";
+ foreach my $method (sort {$stats{'method'}{$b} <=> $stats{'method'}{$a}} keys %{$stats{'method'}}) {
+ printf "%8d : %-8s\n", $stats{'method'}{$method}, $method;
+ }
+ print "Client HTTP versions:\n";
+ foreach my $http_version (sort {$stats{'http-version'}{$b} <=> $stats{'http-version'}{$a}} keys %{$stats{'http-version'}}) {
+ printf "%d : %s\n", $stats{'http-version'}{$http_version}, $http_version;
+ }
+
+ if ($cli_options{'url-statistics-threshold'} == 0) {
+ print "URL statistics are disabled. Increase --url-statistics-threshold to enable them.\n";
+ } else {
+ print "Requested URLs:\n";
+ foreach my $ressource (sort {$stats{'ressource'}{$b} <=> $stats{'ressource'}{$a}} keys %{$stats{'ressource'}}) {
+ if ($stats{'ressource'}{$ressource} < $cli_options{'url-statistics-threshold'}) {
+ print "Skipped statistics for URLs below the treshold.\n";
+ last;
+ }
+ printf "%d : %s\n", $stats{'ressource'}{$ressource}, $ressource;
+ }
+ }
+
+ if ($cli_options{'host-statistics-threshold'} == 0) {
+ print "Host statistics are disabled. Increase --host-statistics-threshold to enable them.\n";
+ } else {
+ print "Requested Hosts:\n";
+ foreach my $host (sort {$stats{'hosts'}{$b} <=> $stats{'hosts'}{$a}} keys %{$stats{'hosts'}}) {
+ if ($stats{'hosts'}{$host} < $cli_options{'host-statistics-threshold'}) {
+ print "Skipped statistics for Hosts below the treshold.\n";
+ last;
+ }
+ printf "%d : %s\n", $stats{'hosts'}{$host}, $host;
+ }
+ }
}
'Error' => \&gather_loglevel_error_stats,
'Fatal error' => \&handle_loglevel_ignore,
'Writing' => \&handle_loglevel_ignore,
+ 'Received' => \&handle_loglevel_ignore,
'Unknown log level' => \&handle_loglevel_ignore
);
}
+sub unbreak_lines_only_loop() { # Reads lines from <>, re-joins wrapped log messages and prints the result; takes no arguments.
+ my $log_messages_reached = 0; # Stays 0 until the first line that looks like the start of a log message.
+ while (<>) {
+ chomp;
+ # The input newline is gone now; all line breaks in the output are printed explicitly below.
+ # Does the line start a message with a log level other than LOG_LEVEL_CLF (timestamp, optional "Privoxy(...)" part, "level: text")?
+ if (m/^(\w{3} \d{2}) (\d\d:\d\d:\d\d)\.?(\d+)? (?:Privoxy\()?([^\)\s]*)[\)]? ([\w -]*): (.*?)\r?$/ or
+ # Or does it start a LOG_LEVEL_CLF message (address - - [date] "request" status size)?
+ m/^((?:\d+\.\d+\.\d+\.\d+|[:\d]+)) - - \[(.*)\] "(.*)" (\d+) (\d+)/) {
+ $log_messages_reached = 1;
+ print "\n";
+ # The newline above ends the previously printed message; before the very first message it produces an extra empty line.
+ } else {
+ # Wrapped message: nothing is printed first, so this text is appended directly (without a separator) to the previous output line.
+ }
+ s@<BR>$@@; # Strip a trailing "<BR>" (presumably left behind by an HTML mail client; see the POD for --unbreak-lines-only).
+ print;
+ print "\n" unless $log_messages_reached; # Lines before the first recognized message keep their own line breaks.
+ }
+}
+
sub VersionMessage {
my $version_message;
'show-ineffective-filters' => CLI_OPTION_SHOW_INEFFECTIVE_FILTERS,
'accept-unknown-messages' => CLI_OPTION_ACCEPT_UNKNOWN_MESSAGES,
'statistics' => CLI_OPTION_STATISTICS,
+ 'url-statistics-threshold' => CLI_OPTION_URL_STATISTICS_THRESHOLD,
+ 'unbreak-lines-only' => CLI_OPTION_UNBREAK_LINES_ONLY,
+ 'host-statistics-threshold'=> CLI_OPTION_HOST_STATISTICS_THRESHOLD,
);
GetOptions (
'show-ineffective-filters' => \$cli_options{'show-ineffective-filters'},
'accept-unknown-messages' => \$cli_options{'accept-unknown-messages'},
'statistics' => \$cli_options{'statistics'},
+ 'unbreak-lines-only' => \$cli_options{'unbreak-lines-only'},
+ 'url-statistics-threshold=s'=> \$cli_options{'url-statistics-threshold'},
+ 'host-statistics-threshold=s'=> \$cli_options{'host-statistics-threshold'},
'version' => sub { VersionMessage && exit(0) },
'help' => \&help,
) or exit(1);
Options and their default values if they have any:
[--accept-unknown-messages]
+ [--host-statistics-threshold $cli_options{'host-statistics-threshold'}]
[--html-output]
[--no-embedded-css]
[--no-msecs]
[--shorten-thread-ids]
[--show-ineffective-filters]
[--statistics]
+ [--unbreak-lines-only]
+ [--url-statistics-threshold $cli_options{'url-statistics-threshold'}]
[--title $cli_options{'title'}]
[--version]
see "perldoc $0" for more information
print_intro();
- if (cli_option_is_set('statistics')) {
+ # XXX: should explicitly reject incompatible argument combinations
+ if (cli_option_is_set('unbreak-lines-only')) {
+ unbreak_lines_only_loop();
+ } elsif (cli_option_is_set('statistics')) {
stats_loop();
} else {
parse_loop();
B<privoxy-log-parser> [B<--accept-unknown-messages>] [B<--html-output>]
[B<--no-msecs>] [B<--no-syntax-highlighting>] [B<--statistics>]
-[B<--shorten-thread-ids>] [B<--show-ineffective-filters>] [B<--version>]
+[B<--shorten-thread-ids>] [B<--show-ineffective-filters>]
+[B<--host-statistics-threshold>] [B<--unbreak-lines-only>]
+[B<--url-statistics-threshold>] [B<--version>]
=head1 DESCRIPTION
[B<--accept-unknown-messages>] Don't print warnings in case of unknown messages,
just don't highlight them.
+[B<--host-statistics-threshold>] Only show the request count for a host
+if it's above or equal to the given threshold. If the threshold is 0, host
+statistics are disabled.
+
[B<--html-output>] Use HTML and CSS for the syntax highlighting. If this option is
omitted, ANSI escape sequences are used unless B<--no-syntax-highlighting> is active.
This option is only intended to make embedding log excerpts in web pages easier.
they very well might be. Also note that the results are pretty much guaranteed
to be incorrect if Privoxy and Privoxy-Log-Parser aren't in sync.
+[B<--unbreak-lines-only>] Tries to fix lines that got messed up by a broken or
+interestingly configured mail client and thus are no longer recognized properly.
+Only fixes some breakage, but may be good enough or at least better than nothing.
+Doesn't do anything else, so you probably want to pipe the output into
+B<privoxy-log-parser> again.
+
+[B<--url-statistics-threshold>] Only show the request count for a resource
+if it's above or equal to the given threshold. If the threshold is 0, URL
+statistics are disabled.
+
[B<--version>] Print version and exit.
=head1 EXAMPLES