X-Git-Url: http://www.privoxy.org/gitweb/?p=privoxy.git;a=blobdiff_plain;f=tools%2Fprivoxy-log-parser.pl;h=bf2dbdcde4b2d2083f48419043cae9cb60ca9592;hp=1c2a5be359404c4148a4e95fdd95f2ebd4cbd8ef;hb=c24e1b4673a77246936ec9ce9cfb15fd72ad4f7d;hpb=9ada65bf0c07f7c40c10e53060204f6fec84298a diff --git a/tools/privoxy-log-parser.pl b/tools/privoxy-log-parser.pl index 1c2a5be3..bf2dbdcd 100755 --- a/tools/privoxy-log-parser.pl +++ b/tools/privoxy-log-parser.pl @@ -23,7 +23,7 @@ # hash key as input. # - Add --compress and --decompress options. # -# Copyright (c) 2007-2021 Fabian Keil +# Copyright (c) 2007-2022 Fabian Keil # # Permission to use, copy, modify, and distribute this software for any # purpose with or without fee is hereby granted, provided that the above @@ -43,15 +43,17 @@ use warnings; use Getopt::Long; use constant { - PRIVOXY_LOG_PARSER_VERSION => '0.9.3', + PRIVOXY_LOG_PARSER_VERSION => '0.9.4', # Feel free to mess with these ... DEFAULT_BACKGROUND => 'black', # Choose registered colour (like 'black') DEFAULT_TEXT_COLOUR => 'white', # Choose registered colour (like 'black') HEADER_DEFAULT_COLOUR => 'yellow', REGISTER_HEADERS_WITH_THE_SAME_COLOUR => 1, + CLI_OPTION_DETECT_INACTIVITY => 0, CLI_OPTION_DEFAULT_TO_HTML_OUTPUT => 0, CLI_OPTION_TITLE => 'Privoxy-Log-Parser in da house', + CLI_OPTION_INACTIVITY_THRESHOLD => 100, CLI_OPTION_KEEP_DATE => 0, CLI_OPTION_NO_EMBEDDED_CSS => 0, CLI_OPTION_NO_MSECS => 0, @@ -62,6 +64,7 @@ use constant { CLI_OPTION_STRICT_CHECKS => 0, CLI_OPTION_UNBREAK_LINES_ONLY => 0, CLI_OPTION_URL_STATISTICS_THRESHOLD => 0, + CLI_OPTION_PASSED_REQUEST_STATISTICS_THRESHOLD => 0, CLI_OPTION_HOST_STATISTICS_THRESHOLD => 0, CLI_OPTION_SHOW_COMPLETE_REQUEST_DISTRIBUTION => 0, @@ -1622,9 +1625,10 @@ sub handle_loglevel_connect($) { $c = highlight_matched_host($c, '(?<=for )[^\s]+'); $c =~ s@(?<=in slot )(\d+)@$h{'Number'}$1$h{'Standard'}@; - } elsif ($c =~ m/^Socket/) { + } elsif ($c =~ m/^Socket \d+ (already|closed)/) { # Socket 16 already forgotten or never remembered. + # Socket 9 closed while waiting for client headers $c =~ s@(?<=Socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; } elsif ($c =~ m/^The connection to/) { @@ -1641,7 +1645,7 @@ sub handle_loglevel_connect($) { $c =~ s@(?<=Assumed latency: )(\d+)@$h{'Number'}$1$h{'Standard'}@; } elsif ($c =~ m/^Stopped waiting for the request line/ or - $c =~ m/^No request line on socket \d received in time/ or + $c =~ m/^No request line on socket \d+ received in time/ or $c =~ m/^The client side of the connection on socket \d/) { # Stopped waiting for the request line. Timeout: 121. @@ -1900,6 +1904,27 @@ sub handle_loglevel_connect($) { # Client socket 7 is no longer usable. The server socket has been closed. $c =~ s@(?<=socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; + } elsif ($c =~ m/^Socket timeout \d+ reached/) { + + # Socket timeout 3 reached: http://127.0.0.1:20000/no-filter/chunked-content/36 + $c =~ s@(?<=timeout )(\d+)@$h{'Number'}$1$h{'Standard'}@; + $c = highlight_matched_url($c, "(?<=reached: ).*") + + } elsif ($c =~ m/^Prepared to read up to /) { + + # Prepared to read up to 157 bytes of encrypted request body from the client. + $c =~ s@(?<=up to )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^Forwarding \d+ bytes /) { + + # Forwarding 157 bytes of encrypted request body. + $c =~ s@(?<=Forwarding )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^Buffering encrypted client body/) { + + # Buffering encrypted client body. Prepared to read up to 2236 bytes. + $c =~ s@(?<=up to )(\d+)@$h{'Number'}$1$h{'Standard'}@; + } elsif ($c =~ m/^Looks like we / or $c =~ m/^Unsetting keep-alive flag/ or $c =~ m/^No connections to wait/ or @@ -2146,6 +2171,10 @@ sub handle_loglevel_error($) { $c =~ s@(?<=digested )(\d+)@$h{'Number'}$1$h{'Standard'}@; $c =~ s@(?<=of )(\d+)@$h{'Number'}$1$h{'Standard'}@; + } elsif ($c =~ m/^The socks connection timed out after/) { + + # The socks connection timed out after 60 seconds. + $c =~ s@(?<=after )(\d+)@$h{'Number'}$1$h{'Standard'}@; } # XXX: There are probably more messages that deserve highlighting. @@ -2204,17 +2233,25 @@ sub gather_loglevel_clf_stats($) { } sub gather_loglevel_request_stats($$) { - my $c = shift; + my $request_url = shift; my $thread = shift; our %stats; + our %cli_options; $stats{requests}++; + if ($cli_options{'passed-request-statistics-threshold'} != 0) { + # If the request get blocked we'll decrement + # in gather_loglevel_crunch_stats() + chomp $request_url; + $stats{'passed-request-url'}{$request_url}++; + } } sub gather_loglevel_crunch_stats($$) { my $c = shift; my $thread = shift; our %stats; + our %cli_options; $stats{crunches}++; @@ -2234,6 +2271,15 @@ sub gather_loglevel_crunch_stats($$) { # Connection failure: http://127.0.0.1:8080/ $stats{'connection-failure'}++; } + if ($cli_options{'passed-request-statistics-threshold'} != 0) { + $c =~ m/^[^:]+: (.*)/; + if ($stats{'passed-request-url'}{$1}) { + $stats{'passed-request-url'}{$1}-- ; + if ($stats{'passed-request-url'}{$1} == 0) { + delete($stats{'passed-request-url'}{$1}); + } + } + } } @@ -2475,6 +2521,19 @@ sub print_stats() { } } + if ($cli_options{'passed-request-statistics-threshold'} == 0) { + print "Passed request statistics are disabled. Increase --passed-request-statistics-threshold to enable them.\n"; + } else { + print "Requested requests that were passed:\n"; + foreach my $passed_url (sort {$stats{'passed-request-url'}{$b} <=> $stats{'passed-request-url'}{$a}} + keys %{$stats{'passed-request-url'}}) { + if ($stats{'passed-request-url'}{$passed_url} < $cli_options{'passed-request-statistics-threshold'}) { + print "Skipped statistics for passed URLs below the treshold.\n"; + last; + } + printf "%d : %s\n", $stats{'passed-request-url'}{$passed_url}, $passed_url; + } + } if ($cli_options{'host-statistics-threshold'} == 0) { print "Host statistics are disabled. Increase --host-statistics-threshold to enable them.\n"; } else { @@ -2708,6 +2767,75 @@ sub stats_loop() { } +# Convert a timestamp like 18:07:28.733 into milliseconds +sub time_stamp_to_msecs($) { + my $time_stamp = shift; + + if ($time_stamp =~ /(\d\d):(\d\d):(\d\d)\.(\d{3})/) { + my ($hours, $minutes, $seconds, $msecs) = ($1, $2, $3, $4); + + $msecs += $seconds * 1000; + $msecs += $minutes * 1000 * 60; + $msecs += $hours * 1000 * 60 * 60; + + return $msecs; + } + return undef; +} + +sub inactivity_detection_loop() { + + our %cli_options; + my ($date, $time_stamp, $thread, $log_level, $content); + my ($msecs, $previous_msecs, $inactivity); + my $inactivity_threshold = $cli_options{'inactivity-threshold'}; + my $previous_date; + my $log_messages_out_of_order = 0; + + while (<>) { + ($date, $time_stamp, $thread, $log_level, $content) = split(/ /, $_, 5); + + next if (not defined($log_level)); + next if ($time_stamp eq "-"); + $msecs = time_stamp_to_msecs($time_stamp); + unless (defined $msecs) { + print "Failed to convert $time_stamp into milliseconds\n"; + print "$_"; + next; + } + unless (defined $previous_msecs) { + $previous_msecs = $msecs; + $previous_date = $date; + print "$_"; + next; + } + $inactivity = $msecs - $previous_msecs; + if ($inactivity < 0) { + # This can happen if there's a high load in which case + # a Privoxy thread may be moved off schedule between + # getting the timestamp for the log message and actually + # writing it. + $log_messages_out_of_order++; + } + if ($inactivity > $inactivity_threshold) { + if ($previous_date eq $date) { + print "Detected inactivity: $inactivity msecs\n"; + } else { + # While we could include the date in the timestamp + # we currently don't. + print "Detected date change. Timestamp difference ignored.\n"; + } + } + print "$_"; + $previous_msecs = $msecs; + $previous_date = $date; + } + if ($log_messages_out_of_order) { + print "At least $log_messages_out_of_order messages were written out of the chronological order.\n"; + print "This can result in false positives. Consider sorting the log first.\n"; + } +} + sub unbreak_lines_only_loop() { my $log_messages_reached = 0; while (<>) { @@ -2744,6 +2872,8 @@ sub VersionMessage { sub get_cli_options() { our %cli_options = ( + 'detect-inactivity' => CLI_OPTION_DETECT_INACTIVITY, + 'inactivity-threshold' => CLI_OPTION_INACTIVITY_THRESHOLD, 'html-output' => CLI_OPTION_DEFAULT_TO_HTML_OUTPUT, 'title' => CLI_OPTION_TITLE, 'keep-date' => CLI_OPTION_KEEP_DATE, @@ -2757,10 +2887,13 @@ sub get_cli_options() { 'url-statistics-threshold' => CLI_OPTION_URL_STATISTICS_THRESHOLD, 'unbreak-lines-only' => CLI_OPTION_UNBREAK_LINES_ONLY, 'host-statistics-threshold'=> CLI_OPTION_HOST_STATISTICS_THRESHOLD, + 'passed-request-statistics-threshold' => CLI_OPTION_PASSED_REQUEST_STATISTICS_THRESHOLD, 'show-complete-request-distribution' => CLI_OPTION_SHOW_COMPLETE_REQUEST_DISTRIBUTION, ); GetOptions ( + 'detect-inactivity' => \$cli_options{'detect-inactivity'}, + 'inactivity-threshold=i' => \$cli_options{'inactivity-threshold'}, 'html-output' => \$cli_options{'html-output'}, 'title' => \$cli_options{'title'}, 'keep-date' => \$cli_options{'keep-date'}, @@ -2774,6 +2907,7 @@ sub get_cli_options() { 'unbreak-lines-only' => \$cli_options{'unbreak-lines-only'}, 'url-statistics-threshold=i'=> \$cli_options{'url-statistics-threshold'}, 'host-statistics-threshold=i'=> \$cli_options{'host-statistics-threshold'}, + 'passed-request-statistics-threshold=i' => \$cli_options{'passed-request-statistics-threshold'}, 'show-complete-request-distribution' => \$cli_options{'show-complete-request-distribution'}, 'version' => sub { VersionMessage && exit(0) }, 'help' => \&help, @@ -2795,6 +2929,8 @@ sub help() { print << " EOF" Options and their default values if they have any: + [--detect-innactivity] + [--inactivity-threshold $cli_options{'inactivity-threshold'}] [--host-statistics-threshold $cli_options{'host-statistics-threshold'}] [--html-output] [--no-embedded-css] @@ -2806,6 +2942,7 @@ Options and their default values if they have any: [--statistics] [--unbreak-lines-only] [--url-statistics-threshold $cli_options{'url-statistics-threshold'}] + [--passed-request-statistics-threshold $cli_options{'passed-request-statistics-threshold'}] [--title $cli_options{'title'}] [--version] see "perldoc $0" for more information @@ -2823,18 +2960,18 @@ sub main() { set_background(DEFAULT_BACKGROUND); prepare_our_stuff(); - print_intro(); - # XXX: should explicitly reject incompatible argument combinations if (cli_option_is_set('unbreak-lines-only')) { unbreak_lines_only_loop(); } elsif (cli_option_is_set('statistics')) { stats_loop(); + } elsif (cli_option_is_set('detect-inactivity')) { + inactivity_detection_loop(); } else { + print_intro(); parse_loop(); + print_outro(); } - - print_outro(); } main(); @@ -2845,7 +2982,8 @@ B - A parser and syntax-highlighter for Privoxy log messages =head1 SYNOPSIS -B [B<--html-output>] +B [B<--detect-inactivity>] [B<--inactivity-threshold msecs>] +[B<--html-output>] [B<--no-msecs>] [B<--no-syntax-higlighting>] [B<--statistics>] [B<--shorten-thread-ids>] [B<--show-ineffective-filters>] [B<--url-statistics-threshold>] [B<--version>] @@ -2872,15 +3010,23 @@ will hide the "filter foo caused 0 hits" message. =head1 OPTIONS +[B<--detect-inactivity>] Instead of syntax highlighting, detect periods +of log inactivity of more than the amount of milliseconds specified with +the B<--inactivity-threshold> option. Mainly useful for debugging. + [B<--host-statistics-threshold>] Only show the request count for a host if it's above or equal to the given threshold. If the threshold is 0, host statistics are disabled. -[B<--html-output>] Use HTML and CSS for the syntax highlighting. If this option is +[B<--html-output>] Use HTML and CSS when syntax highlighting. If this option is omitted, ANSI escape sequences are used unless B<--no-syntax-highlighting> is active. This option is only intended to make embedding log excerpts in web pages easier. It does not escape any input! +[B<--inactivity-threshold msecs>] Specifies the number of milliseconds between +log messages to consider inactivity when running in [B<--detect-inactivity>] +mode. + [B<--keep-date>] Don't remove the date when printing highlighted log messages. Useful when parsing multiple log files at once. @@ -2891,6 +3037,10 @@ the filtered output is piped into less in which case the ANSI control codes don't work, or if the terminal itself doesn't support the control codes. +[B<--passed-request-statistics-threshold>] Only show the request count for +a passed requests if it's above or equal to the given threshold. If the +threshold is 0, passed request statistics are disabled. + [B<--shorten-thread-ids>] Shorten the thread ids to a three-digit decimal number. Note that the mapping from thread ids to shortened ids is created at run-time and thus varies with the input.