# A parser for Privoxy log messages. For incomplete documentation run
# perldoc privoxy-log-parser(.pl), for fancy screenshots see:
#
-# http://www.fabiankeil.de/sourcecode/privoxy-log-parser/
-#
-# $Id: privoxy-log-parser.pl,v 1.150 2012/12/24 15:37:39 fabiankeil Exp $
+# https://www.fabiankeil.de/sourcecode/privoxy-log-parser/
#
# TODO:
# - LOG_LEVEL_CGI, LOG_LEVEL_ERROR, LOG_LEVEL_WRITE content highlighting
# hash key as input.
# - Add --compress and --decompress options.
#
-# Copyright (c) 2007-2012 Fabian Keil <fk@fabiankeil.de>
+# Copyright (c) 2007-2017 Fabian Keil <fk@fabiankeil.de>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
use Getopt::Long;
use constant {
- PRIVOXY_LOG_PARSER_VERSION => '0.7',
+ PRIVOXY_LOG_PARSER_VERSION => '0.9',
# Feel free to mess with these ...
DEFAULT_BACKGROUND => 'black', # Choose registered colour (like 'black')
DEFAULT_TEXT_COLOUR => 'white', # Choose registered colour (like 'black')
CLI_OPTION_NO_SYNTAX_HIGHLIGHTING => 0,
CLI_OPTION_SHORTEN_THREAD_IDS => 0,
CLI_OPTION_SHOW_INEFFECTIVE_FILTERS => 0,
- CLI_OPTION_ACCEPT_UNKNOWN_MESSAGES => 0,
CLI_OPTION_STATISTICS => 0,
CLI_OPTION_STRICT_CHECKS => 0,
CLI_OPTION_UNBREAK_LINES_ONLY => 0,
CLI_OPTION_URL_STATISTICS_THRESHOLD => 0,
CLI_OPTION_HOST_STATISTICS_THRESHOLD => 0,
+ CLI_OPTION_SHOW_COMPLETE_REQUEST_DISTRIBUTION => 0,
SUPPRESS_SUCCEEDED_FILTER_ADDITIONS => 1,
SHOW_SCAN_INTRO => 0,
my $unknown = shift;
my $message;
- return if cli_option_is_set('accept-unknown-messages');
+ return unless cli_option_is_set('strict-checks');
return if ($unknown =~ /\[too long, truncated\]$/);
$rl = h('invalid-request') . $rl . h('Standard');
- } elsif ($rl =~ m/^([-\w]+) (.*) (HTTP\/\d\.\d)/) {
+ } elsif ($rl =~ m/^([-\w]+) (.*) (HTTP\/\d+\.\d+)/) {
# XXX: might not match in case of HTTP method fuzzing.
# XXX: save these: ($method, $path, $http_version) = ($1, $2, $3);
or $c =~ m/^Merged multiple header lines to:/
or $c =~ m/^Added header: /
or $c =~ m/^Enlisting (?:sorted|left-over) header/
+ or $c =~ m/^Multiple Content-Type headers detected. Removing and ignoring: Content-Type:/
)
{
# XXX: Some of these may need highlighting
# Added header: Content-Encoding: deflate
# Enlisting sorted header User-Agent: Mozilla/5.0 (X11; SunOS i86pc; rv:10.0.3) Gecko/20100101 Firefox/10.0.3
# Enlisting left-over header Connection: close
+ # Multiple Content-Type headers detected. Removing and ignoring: Content-Type: text/html
} elsif ($c =~ m/^scanning headers for:/) {
$c =~ s@(?<=Drained )(\d+)@$h{'Number'}$1$h{'Standard'}@;
$c =~ s@(?<=socket )(\d+)@$h{'Number'}$1$h{'Standard'}@;
- } elsif ($c =~ m/^Tainting client socket/) {
+ } elsif ($c =~ m/^Tainting client socket/ or
+ $c =~ m/^Failed to shutdown socket/) {
# Tainting client socket 7 due to unread data.
+ # Failed to shutdown socket 11: Connection reset by peer
+
$c =~ s@(?<=socket )(\d+)@$h{'Number'}$1$h{'Standard'}@;
} elsif ($c =~ m/^Shifting \d+ pipelined bytes/) {
# Reloading configuration file '/usr/local/etc/privoxy/config'
$c =~ s@(?<=loading configuration file \')([^\']*)@$h{'file'}$1$h{'Standard'}@;
- } elsif ($c =~ m/^Loading (actions|filter) file: /) {
+ } elsif ($c =~ m/^Loading (actions|filter|trust) file: /) {
# Loading actions file: /usr/local/etc/privoxy/default.action
# Loading filter file: /usr/local/etc/privoxy/default.filter
+ # Loading trust file: /usr/local/etc/privoxy/trust
+
$c =~ s@(?<= file: )(.*)$@$h{'file'}$1$h{'Standard'}@;
} elsif ($c =~ m/^exiting by signal/) {
return shift;
}
+sub gather_loglevel_clf_stats ($) {
+
+ my $content = shift;
+ my ($method, $resource, $http_version, $status_code, $size);
+ our %stats;
+ our %cli_options;
+
+ # +0200] "GET https://www.youtube.com/watch?v=JmcA9LIIXWw HTTP/1.1" 200 68004
+ $content =~ m/^[+-]\d{4}\] "(\w+) (.+) (HTTP\/\d\.\d)" (\d+) (\d+)/;
+ $method = $1;
+ $resource = $2;
+ $http_version = $3;
+ $status_code = $4;
+ $size = $5;
+
+ unless (defined $method) {
+ print("Failed to parse: $content\n");
+ return;
+ }
+ $stats{'method'}{$method}++;
+ if ($cli_options{'url-statistics-threshold'} != 0) {
+ $stats{'resource'}{$resource}++;
+ }
+ $stats{'http-version'}{$http_version}++;
+
+ if ($cli_options{'host-statistics-threshold'} != 0) {
+ $resource =~ m@(?:http[s]://)([^/]+)/?@;
+ $stats{'hosts'}{$1}++;
+ }
+}
+
sub gather_loglevel_request_stats ($$) {
my $c = shift;
my $thread = shift;
my $thread = shift;
our %stats;
- $stats{requests}++;
$stats{crunches}++;
if ($c =~ m/^Redirected:/) {
} elsif ($c =~ m/^Blocked:/) {
# Blocked: blogger.googleusercontent.com:443
$stats{'blocked'}++;
+
+ } elsif ($c =~ m/^Connection timeout:/) {
+ # Connection timeout: http://c.tile.openstreetmap.org/18/136116/87842.png
+ $stats{'connection-timeout'}++;
+
+ } elsif ($c =~ m/^Connection failure:/) {
+ # Connection failure: http://127.0.0.1:8080/
+ $stats{'connection-failure'}++;
}
}
$thread_data{$thread}{'new_connection'} = 0;
$stats{'reused-connections'}++;
+
+ } elsif ($c =~ m/^Closing client socket \d+. .* Requests received: (\d+)\.$/) {
+
+ # Closing client socket 12. Keep-alive: 1. Socket alive: 1. Data available: 0. \
+ # Configuration file change detected: 0. Requests received: 14.
+
+ $stats{'client-requests-on-connection'}{$1}++;
+ $stats{'closed-client-connections'}++;
}
}
my ($c, $thread) = @_;
our %stats;
+ our %cli_options;
if ($c =~ m/^A HTTP\/1\.1 response without/ or
$c =~ m/^Keeping the server header 'Connection: keep-alive' around./)
# A HTTP/1.1 response without Connection header implies keep-alive.
# Keeping the server header 'Connection: keep-alive' around.
$stats{'server-keep-alive'}++;
-
- } elsif ($c =~ m/^scan: ((\w+) (.+) (HTTP\/\d\.\d))/) {
-
- # scan: HTTP/1.1 200 OK
- $stats{'method'}{$2}++;
- $stats{'resource'}{$3}++;
- $stats{'http-version'}{$4}++;
-
- } elsif ($c =~ m/^scan: Host: ([^\s]+)/) {
-
- # scan: Host: p.p
- $stats{'hosts'}{$1}++;
}
}
'empty-responses-on-reused-connections' => 0,
'fast-redirections' => 0,
'blocked' => 0,
+ 'connection-failure' => 0,
+ 'connection-timeout' => 0,
'reused-connections' => 0,
'server-keep-alive' => 0,
+ 'closed-client-connections' => 0,
);
+ $stats{'client-requests-on-connection'}{1} = 0;
}
sub get_percentage ($$) {
our %cli_options;
my $new_connections = $stats{requests} - $stats{crunches} - $stats{'reused-connections'};
my $outgoing_requests = $stats{requests} - $stats{crunches};
+ my $client_requests_checksum = 0;
if ($stats{requests} eq 0) {
print "No requests yet.\n";
get_percentage($stats{requests}, $stats{'blocked'}) . ")\n";
print "Fast redirections: " . $stats{'fast-redirections'} . " (" .
get_percentage($stats{requests}, $stats{'fast-redirections'}) . ")\n";
+ print "Connection timeouts: " . $stats{'connection-timeout'} . " (" .
+ get_percentage($stats{requests}, $stats{'connection-timeout'}) . ")\n";
+ print "Connection failures: " . $stats{'connection-failure'} . " (" .
+ get_percentage($stats{requests}, $stats{'connection-failure'}) . ")\n";
print "Outgoing requests: " . $outgoing_requests . " (" .
get_percentage($stats{requests}, $outgoing_requests) . ")\n";
print "Server keep-alive offers: " . $stats{'server-keep-alive'} . " (" .
$stats{'empty-responses-on-reused-connections'} . " (" .
get_percentage($stats{requests}, $stats{'empty-responses-on-reused-connections'}) .
")\n";
-
- if ($stats{method} eq 0) {
- print "No response lines parsed yet yet.\n";
- return;
+ print "Client connections: " . $stats{'closed-client-connections'} . "\n";
+
+ my $lines_printed = 0;
+ print "Client requests per connection distribution:\n";
+ foreach my $client_requests (sort {
+ $stats{'client-requests-on-connection'}{$b} <=> $stats{'client-requests-on-connection'}{$a}}
+ keys %{$stats{'client-requests-on-connection'}
+ })
+ {
+ my $count = $stats{'client-requests-on-connection'}{$client_requests};
+ $client_requests_checksum += $count * $client_requests;
+ if ($cli_options{'show-complete-request-distribution'} or ($lines_printed < 10)) {
+ printf "%8d: %d\n", $count, $client_requests;
+ $lines_printed++;
+ }
+ }
+ unless ($cli_options{'show-complete-request-distribution'}) {
+ printf "Enable --show-complete-request-distribution to get less common numbers as well.\n";
}
- print "Method distribution:\n";
- foreach my $method (sort {$stats{'method'}{$b} <=> $stats{'method'}{$a}} keys %{$stats{'method'}}) {
- printf "%8d : %-8s\n", $stats{'method'}{$method}, $method;
+ # Due to log rotation we may not have a complete picture for all the requests
+ printf "Improperly accounted requests: ~%d\n", abs($stats{requests} - $client_requests_checksum);
+
+ if (exists $stats{method}) {
+ print "Method distribution:\n";
+ foreach my $method (sort {$stats{'method'}{$b} <=> $stats{'method'}{$a}} keys %{$stats{'method'}}) {
+ printf "%8d : %-8s\n", $stats{'method'}{$method}, $method;
+ }
+ } else {
+ print "Method distribution unknown. No CLF message parsed yet. Is 'debug 512' enabled?\n";
}
print "Client HTTP versions:\n";
foreach my $http_version (sort {$stats{'http-version'}{$b} <=> $stats{'http-version'}{$a}} keys %{$stats{'http-version'}}) {
while (<>) {
(undef, $time_stamp, $thread, $log_level, $content) = split(/ /, $_, 5);
- # Skip LOG_LEVEL_CLF
- next if ($time_stamp eq "-");
- if (defined($log_level_handlers{$log_level})) {
+ next if (not defined($log_level));
+
+ if ($time_stamp eq "-") {
+
+ gather_loglevel_clf_stats($content);
+
+ } elsif (defined($log_level_handlers{$log_level})) {
$content = $log_level_handlers{$log_level}($content, $thread);
my $version_message;
$version_message .= 'Privoxy-Log-Parser ' . PRIVOXY_LOG_PARSER_VERSION . "\n";
- $version_message .= 'Copyright (C) 2007-2010 Fabian Keil <fk@fabiankeil.de>' . "\n";
- $version_message .= 'http://www.fabiankeil.de/sourcecode/privoxy-log-parser/' . "\n";
+ $version_message .= 'https://www.fabiankeil.de/sourcecode/privoxy-log-parser/' . "\n";
print $version_message;
}
'no-msecs' => CLI_OPTION_NO_MSECS,
'shorten-thread-ids' => CLI_OPTION_SHORTEN_THREAD_IDS,
'show-ineffective-filters' => CLI_OPTION_SHOW_INEFFECTIVE_FILTERS,
- 'accept-unknown-messages' => CLI_OPTION_ACCEPT_UNKNOWN_MESSAGES,
'statistics' => CLI_OPTION_STATISTICS,
'strict-checks' => CLI_OPTION_STRICT_CHECKS,
'url-statistics-threshold' => CLI_OPTION_URL_STATISTICS_THRESHOLD,
'unbreak-lines-only' => CLI_OPTION_UNBREAK_LINES_ONLY,
'host-statistics-threshold'=> CLI_OPTION_HOST_STATISTICS_THRESHOLD,
+ 'show-complete-request-distribution' => CLI_OPTION_SHOW_COMPLETE_REQUEST_DISTRIBUTION,
);
GetOptions (
'no-msecs' => \$cli_options{'no-msecs'},
'shorten-thread-ids' => \$cli_options{'shorten-thread-ids'},
'show-ineffective-filters' => \$cli_options{'show-ineffective-filters'},
- 'accept-unknown-messages' => \$cli_options{'accept-unknown-messages'},
'statistics' => \$cli_options{'statistics'},
'strict-checks' => \$cli_options{'strict-checks'},
'unbreak-lines-only' => \$cli_options{'unbreak-lines-only'},
'url-statistics-threshold=i'=> \$cli_options{'url-statistics-threshold'},
'host-statistics-threshold=i'=> \$cli_options{'host-statistics-threshold'},
+ 'show-complete-request-distribution' => \$cli_options{'show-complete-request-distribution'},
'version' => sub { VersionMessage && exit(0) },
'help' => \&help,
) or exit(1);
print << " EOF"
Options and their default values if they have any:
- [--accept-unknown-messages]
[--host-statistics-threshold $cli_options{'host-statistics-threshold'}]
[--html-output]
[--no-embedded-css]
[--no-syntax-highlighting]
[--shorten-thread-ids]
[--show-ineffective-filters]
+ [--show-complete-request-distribution]
[--statistics]
[--unbreak-lines-only]
[--url-statistics-threshold $cli_options{'url-statistics-threshold'}]
=head1 SYNOPSIS
-B<privoxy-log-parser> [B<--accept-unknown-messages>] [B<--html-output>]
+B<privoxy-log-parser> [B<--html-output>]
[B<--no-msecs>] [B<--no-syntax-higlighting>] [B<--statistics>]
[B<--shorten-thread-ids>] [B<--show-ineffective-filters>]
[B<--url-statistics-threshold>] [B<--version>]
=head1 OPTIONS
-[B<--accept-unknown-messages>] Don't print warnings in case of unknown messages,
-just don't highlight them.
-
[B<--host-statistics-threshold>] Only show the request count for a host
if it's above or equal to the given threshold. If the threshold is 0, host
statistics are disabled.
[B<--show-ineffective-filters>] Don't suppress log lines for filters
that didn't modify the content.
+[B<--show-complete-request-distribution>] Show the complete client request
+distribution in the B<--statistics> output. Without this option only the
+ten most common numbers are shown.
+
[B<--statistics>] Gather various statistics instead of syntax highlighting
log messages. This is an experimental feature, if the results look wrong
they very well might be. Also note that the results are pretty much guaranteed
input data and abort if it is unexpected, even if it doesn't affect the
results. Significantly slows the parsing down and is not expected to catch
any problems that matter.
+When highlighting, print warnings in case of unknown messages which can't be
+properly highlighted.
[B<--unbreak-lines-only>] Tries to fix lines that got messed up by a broken or
interestingly configured mail client and thus are no longer recognized properly.