# A parser for Privoxy log messages. For incomplete documentation run
# perldoc privoxy-log-parser(.pl), for fancy screenshots see:
#
-# http://www.fabiankeil.de/sourcecode/privoxy-log-parser/
+# https://www.fabiankeil.de/sourcecode/privoxy-log-parser/
#
-# $Id: privoxy-log-parser.pl,v 1.139 2012/10/21 12:30:51 fabiankeil Exp $
+# $Id: privoxy-log-parser.pl,v 1.170 2017/03/03 17:43:35 fabiankeil Exp $
#
# TODO:
# - LOG_LEVEL_CGI, LOG_LEVEL_ERROR, LOG_LEVEL_WRITE content highlighting
# hash key as input.
# - Add --compress and --decompress options.
#
-# Copyright (c) 2007-2010 Fabian Keil <fk@fabiankeil.de>
+# Copyright (c) 2007-2017 Fabian Keil <fk@fabiankeil.de>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
use Getopt::Long;
use constant {
- PRIVOXY_LOG_PARSER_VERSION => '0.7',
+ PRIVOXY_LOG_PARSER_VERSION => '0.9',
# Feel free to mess with these ...
DEFAULT_BACKGROUND => 'black', # Choose registered colour (like 'black')
DEFAULT_TEXT_COLOUR => 'white', # Choose registered colour (like 'black')
CLI_OPTION_NO_SYNTAX_HIGHLIGHTING => 0,
CLI_OPTION_SHORTEN_THREAD_IDS => 0,
CLI_OPTION_SHOW_INEFFECTIVE_FILTERS => 0,
- CLI_OPTION_ACCEPT_UNKNOWN_MESSAGES => 0,
CLI_OPTION_STATISTICS => 0,
+ CLI_OPTION_STRICT_CHECKS => 0,
CLI_OPTION_UNBREAK_LINES_ONLY => 0,
CLI_OPTION_URL_STATISTICS_THRESHOLD => 0,
CLI_OPTION_HOST_STATISTICS_THRESHOLD => 0,
+ CLI_OPTION_SHOW_COMPLETE_REQUEST_DISTRIBUTION => 0,
SUPPRESS_SUCCEEDED_FILTER_ADDITIONS => 1,
SHOW_SCAN_INTRO => 0,
my $unknown = shift;
my $message;
- return if cli_option_is_set('accept-unknown-messages');
+ return unless cli_option_is_set('strict-checks');
return if ($unknown =~ /\[too long, truncated\]$/);
$rl = h('invalid-request') . $rl . h('Standard');
- } elsif ($rl =~ m/^([-\w]+) (.*) (HTTP\/\d\.\d)/) {
+ } elsif ($rl =~ m/^([-\w]+) (.*) (HTTP\/\d+\.\d+)/) {
# XXX: might not match in case of HTTP method fuzzing.
# XXX: save these: ($method, $path, $http_version) = ($1, $2, $3);
or $c =~ m/^Merged multiple header lines to:/
or $c =~ m/^Added header: /
or $c =~ m/^Enlisting (?:sorted|left-over) header/
+ or $c =~ m/^Multiple Content-Type headers detected. Removing and ignoring: Content-Type:/
)
{
# XXX: Some of these may need highlighting
# Added header: Content-Encoding: deflate
# Enlisting sorted header User-Agent: Mozilla/5.0 (X11; SunOS i86pc; rv:10.0.3) Gecko/20100101 Firefox/10.0.3
# Enlisting left-over header Connection: close
+ # Multiple Content-Type headers detected. Removing and ignoring: Content-Type: text/html
} elsif ($c =~ m/^scanning headers for:/) {
# Rejecting connection from 178.63.152.227. Maximum number of connections reached.
$c =~ s@(?<=onnection from )((?:\d+\.?){3}\d+)@$h{'Number'}$1$h{'Standard'}@;
- } elsif ($c =~ m/^(?:Reusing|Closing) server socket \d./ or
+ } elsif ($c =~ m/^(?:Reusing|Closing) server socket / or
$c =~ m/^No additional client request/) {
# Reusing server socket 4. Opened for 10.0.0.1.
# No additional client request received in time. \
# Closing server socket 4, initially opened for 10.0.0.1.
# No additional client request received in time on socket 29.
+ # Privoxy 3.0.20 and later
+ # Reusing server socket 7 connected to www.privoxy.org. Total requests: 2.
+ # Closing server socket 6 connected to d.asset.soup.io. Keep-alive: 0.\
+ # Tainted: 1. Socket alive: 1. Timeout: 60. Configuration file change detected: 0.
$c =~ s@(?<= socket )(\d+)@$h{'Number'}$1$h{'Standard'}@;
- $c = highlight_matched_host($c, '(?<=for )[^\s]+(?=\.$)');
+ $c = highlight_matched_host($c, '(?<=for )[^\s]+(?=\.)');
+ $c = highlight_matched_host($c, '(?<=connected to )[^\s]+(?=\.)');
+ for my $number_pattern ('requests', 'Keep-alive', 'Tainted', ' alive', 'Timeout', 'detected') {
+ $c = highlight_matched_pattern($c, 'Number', '(?<='. $number_pattern . ': )\d+');
+ }
} elsif ($c =~ m/^Connected to /) {
} elsif ($c =~ m/^Waiting for the next client request/ or
$c =~ m/^The connection on server socket/ or
- $c =~ m/^Client request arrived in time /) {
+ $c =~ m/^Client request (?:\d+ )?(?:arrived in time|has been pipelined) /) {
# Waiting for the next client request on socket 3. Keeping the server \
# socket 12 to a.fsdn.com open.
# The connection on server socket 6 to upload.wikimedia.org isn't reusable. Closing.
- # Used by Privoxy 3.0.18 and later:
+ # Privoxy 3.0.20 and later:
+ # Client request 4 arrived in time on socket 7.
+ # Used by Privoxy 3.0.18 and 3.0.19:
# Client request arrived in time on socket 21.
# Used by earlier version:
# Client request arrived in time or the client closed the connection on socket 12.
+ # Client request 8 has been pipelined on socket 7 and the socket is still alive.
+ $c =~ s@(?<=request )(\d+)@$h{'Number'}$1$h{'Standard'}@;
$c =~ s@(?<=on socket )(\d+)@$h{'Number'}$1$h{'Standard'}@;
$c =~ s@(?<=server socket )(\d+)@$h{'Number'}$1$h{'Standard'}@;
$c = highlight_matched_host($c, '(?<=to )[^\s]+');
# Waiting for up to 4999 bytes from the client.
$c =~ s@(?<=up to )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+ } elsif ($c =~ m/^Optimistically sending /) {
+
+ # Optimistically sending 318 bytes of client headers intended for www.privoxy.org
+ $c =~ s@(?<=sending )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+ $c = highlight_matched_host($c, '(?<=for )[^\s]+');
+
} elsif ($c =~ m/^Stopping to watch the client socket/) {
# Stopping to watch the client socket. There's already another request waiting.
# Stopping to watch the client socket 5. There's already another request waiting.
$c =~ s@(?<=client socket )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+ } elsif ($c =~ m/^Drained \d+ bytes before closing/) {
+
+ # Drained 180 bytes before closing socket 6
+ $c =~ s@(?<=Drained )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+ $c =~ s@(?<=socket )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+
+ } elsif ($c =~ m/^Tainting client socket/ or
+ $c =~ m/^Failed to shutdown socket/) {
+
+ # Tainting client socket 7 due to unread data.
+ # Failed to shutdown socket 11: Connection reset by peer
+
+ $c =~ s@(?<=socket )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+
+ } elsif ($c =~ m/^Shifting \d+ pipelined bytes/) {
+
+ # Shifting 360 pipelined bytes by 360 bytes
+ $c =~ s@(?<=Shifting )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+ $c =~ s@(?<=by )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+
} elsif ($c =~ m/^Looks like we / or
$c =~ m/^Unsetting keep-alive flag/ or
$c =~ m/^No connections to wait/ or
$c =~ m/^The server still wants to talk, but the client hung up on us./ or
$c =~ m/^The server didn't specify how long the connection will stay open/ or
$c =~ m/^There might be a request body. The connection will not be kept alive/ or
+ $c =~ m/^There better be a request body./ or
$c =~ m/^Done reading from the client\.$/) {
# Looks like we reached the end of the last chunk. We better stop reading.
# The server still wants to talk, but the client hung up on us.
# The server didn't specify how long the connection will stay open. Assume it's only a second.
# There might be a request body. The connection will not be kept alive.
- # Done reading from the client\.
+ # Privoxy 3.0.20 and later
+ # There better be a request body.
+ # Done reading from the client.
} else {
# Reloading configuration file '/usr/local/etc/privoxy/config'
$c =~ s@(?<=loading configuration file \')([^\']*)@$h{'file'}$1$h{'Standard'}@;
- } elsif ($c =~ m/^Loading (actions|filter) file: /) {
+ } elsif ($c =~ m/^Loading (actions|filter|trust) file: /) {
# Loading actions file: /usr/local/etc/privoxy/default.action
# Loading filter file: /usr/local/etc/privoxy/default.filter
+ # Loading trust file: /usr/local/etc/privoxy/trust
+
$c =~ s@(?<= file: )(.*)$@$h{'file'}$1$h{'Standard'}@;
} elsif ($c =~ m/^exiting by signal/) {
} elsif ($c =~ m/^Blocked:/) {
# Blocked: blogger.googleusercontent.com:443
$stats{'blocked'}++;
+
+ } elsif ($c =~ m/^Connection timeout:/) {
+ # Connection timeout: http://c.tile.openstreetmap.org/18/136116/87842.png
+ $stats{'connection-timeout'}++;
+
+ } elsif ($c =~ m/^Connection failure:/) {
+ # Connection failure: http://127.0.0.1:8080/
+ $stats{'connection-failure'}++;
}
}
$thread_data{$thread}{'new_connection'} = 0;
$stats{'reused-connections'}++;
+
+ } elsif ($c =~ m/^Closing client socket \d+. .* Requests received: (\d+)\.$/) {
+
+ # Closing client socket 12. Keep-alive: 1. Socket alive: 1. Data available: 0. \
+ # Configuration file change detected: 0. Requests received: 14.
+
+ $stats{'client-requests-on-connection'}{$1}++;
+ $stats{'closed-client-connections'}++;
}
}
my ($c, $thread) = @_;
our %stats;
+ our %cli_options;
if ($c =~ m/^A HTTP\/1\.1 response without/ or
$c =~ m/^Keeping the server header 'Connection: keep-alive' around./)
# scan: HTTP/1.1 200 OK
$stats{'method'}{$2}++;
- $stats{'resource'}{$3}++;
+ if ($cli_options{'url-statistics-threshold'} != 0) {
+ $stats{'resource'}{$3}++;
+ }
$stats{'http-version'}{$4}++;
- } elsif ($c =~ m/^scan: Host: ([^\s]+)/) {
+ } elsif ($cli_options{'host-statistics-threshold'} != 0 and
+ $c =~ m/^scan: Host: ([^\s]+)/) {
# scan: Host: p.p
$stats{'hosts'}{$1}++;
'empty-responses-on-reused-connections' => 0,
'fast-redirections' => 0,
'blocked' => 0,
+ 'connection-failure' => 0,
+ 'connection-timeout' => 0,
'reused-connections' => 0,
'server-keep-alive' => 0,
+ 'closed-client-connections' => 0,
);
+ $stats{'client-requests-on-connection'}{1} = 0;
}
sub get_percentage ($$) {
our %cli_options;
my $new_connections = $stats{requests} - $stats{crunches} - $stats{'reused-connections'};
my $outgoing_requests = $stats{requests} - $stats{crunches};
+ my $client_requests_checksum = 0;
if ($stats{requests} eq 0) {
print "No requests yet.\n";
get_percentage($stats{requests}, $stats{'blocked'}) . ")\n";
print "Fast redirections: " . $stats{'fast-redirections'} . " (" .
get_percentage($stats{requests}, $stats{'fast-redirections'}) . ")\n";
+ print "Connection timeouts: " . $stats{'connection-timeout'} . " (" .
+ get_percentage($stats{requests}, $stats{'connection-timeout'}) . ")\n";
+ print "Connection failures: " . $stats{'connection-failure'} . " (" .
+ get_percentage($stats{requests}, $stats{'connection-failure'}) . ")\n";
print "Outgoing requests: " . $outgoing_requests . " (" .
get_percentage($stats{requests}, $outgoing_requests) . ")\n";
print "Server keep-alive offers: " . $stats{'server-keep-alive'} . " (" .
$stats{'empty-responses-on-reused-connections'} . " (" .
get_percentage($stats{requests}, $stats{'empty-responses-on-reused-connections'}) .
")\n";
-
- if ($stats{method} eq 0) {
- print "No response lines parsed yet yet.\n";
- return;
+ print "Client connections: " . $stats{'closed-client-connections'} . "\n";
+
+ my $lines_printed = 0;
+ print "Client requests per connection distribution:\n";
+ foreach my $client_requests (sort {
+ $stats{'client-requests-on-connection'}{$b} <=> $stats{'client-requests-on-connection'}{$a}}
+ keys %{$stats{'client-requests-on-connection'}
+ })
+ {
+ my $count = $stats{'client-requests-on-connection'}{$client_requests};
+ $client_requests_checksum += $count * $client_requests;
+ if ($cli_options{'show-complete-request-distribution'} or ($lines_printed < 10)) {
+ printf "%8d: %d\n", $count, $client_requests;
+ $lines_printed++;
+ }
}
- print "Method distribution:\n";
- foreach my $method (sort {$stats{'method'}{$b} <=> $stats{'method'}{$a}} keys %{$stats{'method'}}) {
- printf "%8d : %-8s\n", $stats{'method'}{$method}, $method;
+ unless ($cli_options{'show-complete-request-distribution'}) {
+ printf "Enable --show-complete-request-distribution to get less common numbers as well.\n";
+ }
+ # Due to log rotation we may not have a complete picture for all the requests
+ printf "Improperly accounted requests: ~%d\n", abs($stats{requests} - $client_requests_checksum);
+
+ if (exists $stats{method}) {
+ print "Method distribution:\n";
+ foreach my $method (sort {$stats{'method'}{$b} <=> $stats{'method'}{$a}} keys %{$stats{'method'}}) {
+ printf "%8d : %-8s\n", $stats{'method'}{$method}, $method;
+ }
+ } else {
+ print "Method distribution unknown. No response headers parsed yet. Is 'debug 8' enabled?\n";
}
print "Client HTTP versions:\n";
foreach my $http_version (sort {$stats{'http-version'}{$b} <=> $stats{'http-version'}{$a}} keys %{$stats{'http-version'}}) {
sub stats_loop () {
my ($day, $time_stamp, $msecs, $thread, $log_level, $content);
+ my $strict_checks = cli_option_is_set('strict-checks');
my %log_level_handlers = (
- 'Re-Filter' => \&handle_loglevel_ignore,
- 'Header' => \&gather_loglevel_header_stats,
- 'Connect' => \&gather_loglevel_connect_stats,
- 'Redirect' => \&handle_loglevel_ignore,
- 'Request' => \&gather_loglevel_request_stats,
- 'Crunch' => \&gather_loglevel_crunch_stats,
- 'Gif-Deanimate' => \&handle_loglevel_ignore,
- 'Info' => \&handle_loglevel_ignore,
- 'CGI' => \&handle_loglevel_ignore,
- 'Force' => \&handle_loglevel_ignore,
- 'Error' => \&gather_loglevel_error_stats,
- 'Fatal error' => \&handle_loglevel_ignore,
- 'Writing' => \&handle_loglevel_ignore,
- 'Received' => \&handle_loglevel_ignore,
- 'Actions' => \&handle_loglevel_ignore,
- 'Unknown log level' => \&handle_loglevel_ignore
+ 'Connect:' => \&gather_loglevel_connect_stats,
+ 'Crunch:' => \&gather_loglevel_crunch_stats,
+ 'Error:' => \&gather_loglevel_error_stats,
+ 'Header:' => \&gather_loglevel_header_stats,
+ 'Request:' => \&gather_loglevel_request_stats,
+ );
+ my %ignored_log_levels = (
+ 'Actions:' => \&handle_loglevel_ignore,
+ 'CGI:' => \&handle_loglevel_ignore,
+ 'Fatal error:' => \&handle_loglevel_ignore,
+ 'Force:' => \&handle_loglevel_ignore,
+ 'Gif-Deanimate:' => \&handle_loglevel_ignore,
+ 'Info:' => \&handle_loglevel_ignore,
+ 'Re-Filter:' => \&handle_loglevel_ignore,
+ 'Received:' => \&handle_loglevel_ignore,
+ 'Redirect:' => \&handle_loglevel_ignore,
+ 'Unknown log level:' => \&handle_loglevel_ignore,
+ 'Writing:' => \&handle_loglevel_ignore,
);
while (<>) {
- if (m/^(\d{4}-\d{2}-\d{2}|\w{3} \d{2}) (\d\d:\d\d:\d\d)\.?(\d+)? (?:Privoxy\()?([^\)\s]*)[\)]? ([\w -]*): (.*?)\r?$/) {
- $day = $1;
- $time_stamp = $2;
- $msecs = $3 ? $3 : 0;
- $thread = $4;
- $log_level = $5;
- $content = $6;
+ (undef, $time_stamp, $thread, $log_level, $content) = split(/ /, $_, 5);
- if (defined($log_level_handlers{$log_level})) {
+ # Skip LOG_LEVEL_CLF
+ next if (not defined($log_level) or $time_stamp eq "-");
- $content = $log_level_handlers{$log_level}($content, $thread);
+ if (defined($log_level_handlers{$log_level})) {
- } else {
+ $content = $log_level_handlers{$log_level}($content, $thread);
- die "No handler found for log level \"$log_level\"\n";
+ } elsif ($strict_checks and not defined($ignored_log_levels{$log_level})) {
- }
+ die "No handler found for: $_";
}
}
my $version_message;
$version_message .= 'Privoxy-Log-Parser ' . PRIVOXY_LOG_PARSER_VERSION . "\n";
- $version_message .= 'Copyright (C) 2007-2010 Fabian Keil <fk@fabiankeil.de>' . "\n";
- $version_message .= 'http://www.fabiankeil.de/sourcecode/privoxy-log-parser/' . "\n";
+ $version_message .= 'https://www.fabiankeil.de/sourcecode/privoxy-log-parser/' . "\n";
print $version_message;
}
'no-msecs' => CLI_OPTION_NO_MSECS,
'shorten-thread-ids' => CLI_OPTION_SHORTEN_THREAD_IDS,
'show-ineffective-filters' => CLI_OPTION_SHOW_INEFFECTIVE_FILTERS,
- 'accept-unknown-messages' => CLI_OPTION_ACCEPT_UNKNOWN_MESSAGES,
'statistics' => CLI_OPTION_STATISTICS,
+ 'strict-checks' => CLI_OPTION_STRICT_CHECKS,
'url-statistics-threshold' => CLI_OPTION_URL_STATISTICS_THRESHOLD,
'unbreak-lines-only' => CLI_OPTION_UNBREAK_LINES_ONLY,
'host-statistics-threshold'=> CLI_OPTION_HOST_STATISTICS_THRESHOLD,
+ 'show-complete-request-distribution' => CLI_OPTION_SHOW_COMPLETE_REQUEST_DISTRIBUTION,
);
GetOptions (
'no-msecs' => \$cli_options{'no-msecs'},
'shorten-thread-ids' => \$cli_options{'shorten-thread-ids'},
'show-ineffective-filters' => \$cli_options{'show-ineffective-filters'},
- 'accept-unknown-messages' => \$cli_options{'accept-unknown-messages'},
'statistics' => \$cli_options{'statistics'},
+ 'strict-checks' => \$cli_options{'strict-checks'},
'unbreak-lines-only' => \$cli_options{'unbreak-lines-only'},
'url-statistics-threshold=i'=> \$cli_options{'url-statistics-threshold'},
'host-statistics-threshold=i'=> \$cli_options{'host-statistics-threshold'},
+ 'show-complete-request-distribution' => \$cli_options{'show-complete-request-distribution'},
'version' => sub { VersionMessage && exit(0) },
'help' => \&help,
) or exit(1);
print << " EOF"
Options and their default values if they have any:
- [--accept-unknown-messages]
[--host-statistics-threshold $cli_options{'host-statistics-threshold'}]
[--html-output]
[--no-embedded-css]
[--no-syntax-highlighting]
[--shorten-thread-ids]
[--show-ineffective-filters]
+ [--show-complete-request-distribution]
[--statistics]
[--unbreak-lines-only]
[--url-statistics-threshold $cli_options{'url-statistics-threshold'}]
=head1 SYNOPSIS
-B<privoxy-log-parser> [B<--accept-unknown-messages>] [B<--html-output>]
+B<privoxy-log-parser> [B<--html-output>]
[B<--no-msecs>] [B<--no-syntax-higlighting>] [B<--statistics>]
[B<--shorten-thread-ids>] [B<--show-ineffective-filters>]
[B<--url-statistics-threshold>] [B<--version>]
=head1 OPTIONS
-[B<--accept-unknown-messages>] Don't print warnings in case of unknown messages,
-just don't highlight them.
-
[B<--host-statistics-threshold>] Only show the request count for a host
if it's above or equal to the given threshold. If the threshold is 0, host
statistics are disabled.
[B<--show-ineffective-filters>] Don't suppress log lines for filters
that didn't modify the content.
+[B<--show-complete-request-distribution>] Show the complete client request
+distribution in the B<--statistics> output. Without this option only the
+ten most common numbers are shown.
+
[B<--statistics>] Gather various statistics instead of syntax highlighting
log messages. This is an experimental feature, if the results look wrong
they very well might be. Also note that the results are pretty much guaranteed
to be incorrect if Privoxy and Privoxy-Log-Parser aren't in sync.
+[B<--strict-checks>] When generating statistics, look more careful at the
+input data and abort if it is unexpected, even if it doesn't affect the
+results. Significantly slows the parsing down and is not expected to catch
+any problems that matter.
+When highlighting, print warnings in case of unknown messages which can't be
+properly highlighted.
+
[B<--unbreak-lines-only>] Tries to fix lines that got messed up by a broken or
interestingly configured mail client and thus are no longer recognized properly.
Only fixes some breakage, but may be good enough or at least better than nothing.