privoxy-log-parser: Highlight 'Buffering encrypted client body. Prepared to read...
[privoxy.git] / tools / privoxy-log-parser.pl
index 376607c..bf2dbdc 100755 (executable)
@@ -23,7 +23,7 @@
 #         hash key as input.
 #       - Add --compress and --decompress options.
 #
 #         hash key as input.
 #       - Add --compress and --decompress options.
 #
-# Copyright (c) 2007-2021 Fabian Keil <fk@fabiankeil.de>
+# Copyright (c) 2007-2022 Fabian Keil <fk@fabiankeil.de>
 #
 # Permission to use, copy, modify, and distribute this software for any
 # purpose with or without fee is hereby granted, provided that the above
 #
 # Permission to use, copy, modify, and distribute this software for any
 # purpose with or without fee is hereby granted, provided that the above
@@ -43,15 +43,17 @@ use warnings;
 use Getopt::Long;
 
 use constant {
 use Getopt::Long;
 
 use constant {
-    PRIVOXY_LOG_PARSER_VERSION => '0.9.3',
+    PRIVOXY_LOG_PARSER_VERSION => '0.9.4',
     # Feel free to mess with these ...
     DEFAULT_BACKGROUND => 'black',  # Choose registered colour (like 'black')
     DEFAULT_TEXT_COLOUR => 'white', # Choose registered colour (like 'black')
     HEADER_DEFAULT_COLOUR => 'yellow',
     REGISTER_HEADERS_WITH_THE_SAME_COLOUR => 1,
 
     # Feel free to mess with these ...
     DEFAULT_BACKGROUND => 'black',  # Choose registered colour (like 'black')
     DEFAULT_TEXT_COLOUR => 'white', # Choose registered colour (like 'black')
     HEADER_DEFAULT_COLOUR => 'yellow',
     REGISTER_HEADERS_WITH_THE_SAME_COLOUR => 1,
 
+    CLI_OPTION_DETECT_INACTIVITY => 0,
     CLI_OPTION_DEFAULT_TO_HTML_OUTPUT => 0,
     CLI_OPTION_TITLE => 'Privoxy-Log-Parser in da house',
     CLI_OPTION_DEFAULT_TO_HTML_OUTPUT => 0,
     CLI_OPTION_TITLE => 'Privoxy-Log-Parser in da house',
+    CLI_OPTION_INACTIVITY_THRESHOLD => 100,
     CLI_OPTION_KEEP_DATE => 0,
     CLI_OPTION_NO_EMBEDDED_CSS => 0,
     CLI_OPTION_NO_MSECS => 0,
     CLI_OPTION_KEEP_DATE => 0,
     CLI_OPTION_NO_EMBEDDED_CSS => 0,
     CLI_OPTION_NO_MSECS => 0,
@@ -62,6 +64,7 @@ use constant {
     CLI_OPTION_STRICT_CHECKS => 0,
     CLI_OPTION_UNBREAK_LINES_ONLY => 0,
     CLI_OPTION_URL_STATISTICS_THRESHOLD => 0,
     CLI_OPTION_STRICT_CHECKS => 0,
     CLI_OPTION_UNBREAK_LINES_ONLY => 0,
     CLI_OPTION_URL_STATISTICS_THRESHOLD => 0,
+    CLI_OPTION_PASSED_REQUEST_STATISTICS_THRESHOLD => 0,
     CLI_OPTION_HOST_STATISTICS_THRESHOLD => 0,
     CLI_OPTION_SHOW_COMPLETE_REQUEST_DISTRIBUTION => 0,
 
     CLI_OPTION_HOST_STATISTICS_THRESHOLD => 0,
     CLI_OPTION_SHOW_COMPLETE_REQUEST_DISTRIBUTION => 0,
 
@@ -1622,9 +1625,10 @@ sub handle_loglevel_connect($) {
         $c = highlight_matched_host($c, '(?<=for )[^\s]+');
         $c =~ s@(?<=in slot )(\d+)@$h{'Number'}$1$h{'Standard'}@;
 
         $c = highlight_matched_host($c, '(?<=for )[^\s]+');
         $c =~ s@(?<=in slot )(\d+)@$h{'Number'}$1$h{'Standard'}@;
 
-    } elsif ($c =~ m/^Socket/) {
+    } elsif ($c =~ m/^Socket \d+ (already|closed)/) {
 
         # Socket 16 already forgotten or never remembered.
 
         # Socket 16 already forgotten or never remembered.
+        # Socket 9 closed while waiting for client headers
         $c =~ s@(?<=Socket )(\d+)@$h{'Number'}$1$h{'Standard'}@;
 
     } elsif ($c =~ m/^The connection to/) {
         $c =~ s@(?<=Socket )(\d+)@$h{'Number'}$1$h{'Standard'}@;
 
     } elsif ($c =~ m/^The connection to/) {
@@ -1641,7 +1645,7 @@ sub handle_loglevel_connect($) {
         $c =~ s@(?<=Assumed latency: )(\d+)@$h{'Number'}$1$h{'Standard'}@;
 
     } elsif ($c =~ m/^Stopped waiting for the request line/ or
         $c =~ s@(?<=Assumed latency: )(\d+)@$h{'Number'}$1$h{'Standard'}@;
 
     } elsif ($c =~ m/^Stopped waiting for the request line/ or
-             $c =~ m/^No request line on socket \d received in time/ or
+             $c =~ m/^No request line on socket \d+ received in time/ or
              $c =~ m/^The client side of the connection on socket \d/) {
 
         # Stopped waiting for the request line. Timeout: 121.
              $c =~ m/^The client side of the connection on socket \d/) {
 
         # Stopped waiting for the request line. Timeout: 121.
@@ -1900,6 +1904,27 @@ sub handle_loglevel_connect($) {
         # Client socket 7 is no longer usable. The server socket has been closed.
         $c =~ s@(?<=socket )(\d+)@$h{'Number'}$1$h{'Standard'}@;
 
         # Client socket 7 is no longer usable. The server socket has been closed.
         $c =~ s@(?<=socket )(\d+)@$h{'Number'}$1$h{'Standard'}@;
 
+    } elsif ($c =~ m/^Socket timeout \d+ reached/) {
+
+        # Socket timeout 3 reached: http://127.0.0.1:20000/no-filter/chunked-content/36
+        $c =~ s@(?<=timeout )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+        $c = highlight_matched_url($c, "(?<=reached: ).*")
+
+    } elsif ($c =~ m/^Prepared to read up to /) {
+
+        # Prepared to read up to 157 bytes of encrypted request body from the client.
+        $c =~ s@(?<=up to )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+
+    } elsif ($c =~ m/^Forwarding \d+ bytes /) {
+
+        # Forwarding 157 bytes of encrypted request body.
+        $c =~ s@(?<=Forwarding )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+
+    } elsif ($c =~ m/^Buffering encrypted client body/) {
+
+        # Buffering encrypted client body. Prepared to read up to 2236 bytes.
+        $c =~ s@(?<=up to )(\d+)@$h{'Number'}$1$h{'Standard'}@;
+
     } elsif ($c =~ m/^Looks like we / or
              $c =~ m/^Unsetting keep-alive flag/ or
              $c =~ m/^No connections to wait/ or
     } elsif ($c =~ m/^Looks like we / or
              $c =~ m/^Unsetting keep-alive flag/ or
              $c =~ m/^No connections to wait/ or
@@ -2146,6 +2171,10 @@ sub handle_loglevel_error($) {
         $c =~ s@(?<=digested )(\d+)@$h{'Number'}$1$h{'Standard'}@;
         $c =~ s@(?<=of )(\d+)@$h{'Number'}$1$h{'Standard'}@;
 
         $c =~ s@(?<=digested )(\d+)@$h{'Number'}$1$h{'Standard'}@;
         $c =~ s@(?<=of )(\d+)@$h{'Number'}$1$h{'Standard'}@;
 
+    } elsif ($c =~ m/^The socks connection timed out after/) {
+
+        # The socks connection timed out after 60 seconds.
+        $c =~ s@(?<=after )(\d+)@$h{'Number'}$1$h{'Standard'}@;
     }
 
     # XXX: There are probably more messages that deserve highlighting.
     }
 
     # XXX: There are probably more messages that deserve highlighting.
@@ -2204,17 +2233,25 @@ sub gather_loglevel_clf_stats($) {
 }
 
 sub gather_loglevel_request_stats($$) {
 }
 
 sub gather_loglevel_request_stats($$) {
-    my $c = shift;
+    my $request_url = shift;
     my $thread = shift;
     our %stats;
     my $thread = shift;
     our %stats;
+    our %cli_options;
 
     $stats{requests}++;
 
     $stats{requests}++;
+    if ($cli_options{'passed-request-statistics-threshold'} != 0) {
+        # If the request get blocked we'll decrement
+        # in gather_loglevel_crunch_stats()
+        chomp $request_url;
+        $stats{'passed-request-url'}{$request_url}++;
+    }
 }
 
 sub gather_loglevel_crunch_stats($$) {
     my $c = shift;
     my $thread = shift;
     our %stats;
 }
 
 sub gather_loglevel_crunch_stats($$) {
     my $c = shift;
     my $thread = shift;
     our %stats;
+    our %cli_options;
 
     $stats{crunches}++;
 
 
     $stats{crunches}++;
 
@@ -2234,6 +2271,15 @@ sub gather_loglevel_crunch_stats($$) {
         # Connection failure: http://127.0.0.1:8080/
         $stats{'connection-failure'}++;
     }
         # Connection failure: http://127.0.0.1:8080/
         $stats{'connection-failure'}++;
     }
+    if ($cli_options{'passed-request-statistics-threshold'} != 0) {
+        $c =~ m/^[^:]+: (.*)/;
+        if ($stats{'passed-request-url'}{$1}) {
+            $stats{'passed-request-url'}{$1}-- ;
+            if ($stats{'passed-request-url'}{$1} == 0) {
+                delete($stats{'passed-request-url'}{$1});
+            }
+        }
+    }
 }
 
 
 }
 
 
@@ -2475,6 +2521,19 @@ sub print_stats() {
         }
     }
 
         }
     }
 
+    if ($cli_options{'passed-request-statistics-threshold'} == 0) {
+        print "Passed request statistics are disabled. Increase --passed-request-statistics-threshold to enable them.\n";
+    } else {
+        print "Requested requests that were passed:\n";
+        foreach my $passed_url (sort {$stats{'passed-request-url'}{$b} <=> $stats{'passed-request-url'}{$a}}
+                                keys %{$stats{'passed-request-url'}}) {
+            if ($stats{'passed-request-url'}{$passed_url} < $cli_options{'passed-request-statistics-threshold'}) {
+                print "Skipped statistics for passed URLs below the treshold.\n";
+                last;
+            }
+            printf "%d : %s\n", $stats{'passed-request-url'}{$passed_url}, $passed_url;
+        }
+    }
     if ($cli_options{'host-statistics-threshold'} == 0) {
         print "Host statistics are disabled. Increase --host-statistics-threshold to enable them.\n";
     } else {
     if ($cli_options{'host-statistics-threshold'} == 0) {
         print "Host statistics are disabled. Increase --host-statistics-threshold to enable them.\n";
     } else {
@@ -2708,6 +2767,75 @@ sub stats_loop() {
 
 }
 
 
 }
 
+# Convert a timestamp like 18:07:28.733 into milliseconds
+sub time_stamp_to_msecs($) {
+    my $time_stamp = shift;
+
+    if ($time_stamp =~ /(\d\d):(\d\d):(\d\d)\.(\d{3})/) {
+        my ($hours, $minutes, $seconds, $msecs) = ($1, $2, $3, $4);
+
+        $msecs += $seconds * 1000;
+        $msecs += $minutes * 1000 * 60;
+        $msecs += $hours   * 1000 * 60 * 60;
+
+        return $msecs;
+    }
+    return undef;
+}
+
+sub inactivity_detection_loop() {
+
+    our %cli_options;
+    my ($date, $time_stamp, $thread, $log_level, $content);
+    my ($msecs, $previous_msecs, $inactivity);
+    my $inactivity_threshold = $cli_options{'inactivity-threshold'};
+    my $previous_date;
+    my $log_messages_out_of_order = 0;
+
+    while (<>) {
+        ($date, $time_stamp, $thread, $log_level, $content) = split(/ /, $_, 5);
+
+        next if (not defined($log_level));
+        next if ($time_stamp eq "-");
+        $msecs = time_stamp_to_msecs($time_stamp);
+        unless (defined $msecs) {
+            print "Failed to convert $time_stamp into milliseconds\n";
+            print "$_";
+            next;
+        }
+        unless (defined $previous_msecs) {
+            $previous_msecs = $msecs;
+            $previous_date = $date;
+            print "$_";
+            next;
+        }
+        $inactivity = $msecs - $previous_msecs;
+        if ($inactivity < 0) {
+            # This can happen if there's a high load in which case
+            # a Privoxy thread may be moved off schedule between
+            # getting the timestamp for the log message and actually
+            # writing it.
+            $log_messages_out_of_order++;
+        }
+        if ($inactivity > $inactivity_threshold) {
+            if ($previous_date eq $date) {
+                print "Detected inactivity: $inactivity msecs\n";
+            } else {
+                # While we could include the date in the timestamp
+                # we currently don't.
+                print "Detected date change. Timestamp difference ignored.\n";
+            }
+        }
+        print "$_";
+        $previous_msecs = $msecs;
+        $previous_date = $date;
+    }
+    if ($log_messages_out_of_order) {
+        print "At least $log_messages_out_of_order messages were written out of the chronological order.\n";
+        print "This can result in false positives. Consider sorting the log first.\n";
+    }
+}
+
 sub unbreak_lines_only_loop() {
     my $log_messages_reached = 0;
     while (<>) {
 sub unbreak_lines_only_loop() {
     my $log_messages_reached = 0;
     while (<>) {
@@ -2744,6 +2872,8 @@ sub VersionMessage {
 sub get_cli_options() {
 
     our %cli_options = (
 sub get_cli_options() {
 
     our %cli_options = (
+        'detect-inactivity'        => CLI_OPTION_DETECT_INACTIVITY,
+        'inactivity-threshold'     => CLI_OPTION_INACTIVITY_THRESHOLD,
         'html-output'              => CLI_OPTION_DEFAULT_TO_HTML_OUTPUT,
         'title'                    => CLI_OPTION_TITLE,
         'keep-date'                => CLI_OPTION_KEEP_DATE,
         'html-output'              => CLI_OPTION_DEFAULT_TO_HTML_OUTPUT,
         'title'                    => CLI_OPTION_TITLE,
         'keep-date'                => CLI_OPTION_KEEP_DATE,
@@ -2757,10 +2887,13 @@ sub get_cli_options() {
         'url-statistics-threshold' => CLI_OPTION_URL_STATISTICS_THRESHOLD,
         'unbreak-lines-only'       => CLI_OPTION_UNBREAK_LINES_ONLY,
         'host-statistics-threshold'=> CLI_OPTION_HOST_STATISTICS_THRESHOLD,
         'url-statistics-threshold' => CLI_OPTION_URL_STATISTICS_THRESHOLD,
         'unbreak-lines-only'       => CLI_OPTION_UNBREAK_LINES_ONLY,
         'host-statistics-threshold'=> CLI_OPTION_HOST_STATISTICS_THRESHOLD,
+        'passed-request-statistics-threshold' => CLI_OPTION_PASSED_REQUEST_STATISTICS_THRESHOLD,
         'show-complete-request-distribution' => CLI_OPTION_SHOW_COMPLETE_REQUEST_DISTRIBUTION,
     );
 
     GetOptions (
         'show-complete-request-distribution' => CLI_OPTION_SHOW_COMPLETE_REQUEST_DISTRIBUTION,
     );
 
     GetOptions (
+        'detect-inactivity'        => \$cli_options{'detect-inactivity'},
+        'inactivity-threshold=i'   => \$cli_options{'inactivity-threshold'},
         'html-output'              => \$cli_options{'html-output'},
         'title'                    => \$cli_options{'title'},
         'keep-date'                => \$cli_options{'keep-date'},
         'html-output'              => \$cli_options{'html-output'},
         'title'                    => \$cli_options{'title'},
         'keep-date'                => \$cli_options{'keep-date'},
@@ -2774,6 +2907,7 @@ sub get_cli_options() {
         'unbreak-lines-only'       => \$cli_options{'unbreak-lines-only'},
         'url-statistics-threshold=i'=> \$cli_options{'url-statistics-threshold'},
         'host-statistics-threshold=i'=> \$cli_options{'host-statistics-threshold'},
         'unbreak-lines-only'       => \$cli_options{'unbreak-lines-only'},
         'url-statistics-threshold=i'=> \$cli_options{'url-statistics-threshold'},
         'host-statistics-threshold=i'=> \$cli_options{'host-statistics-threshold'},
+        'passed-request-statistics-threshold=i' => \$cli_options{'passed-request-statistics-threshold'},
         'show-complete-request-distribution' => \$cli_options{'show-complete-request-distribution'},
         'version'                  => sub { VersionMessage && exit(0) },
         'help'                     => \&help,
         'show-complete-request-distribution' => \$cli_options{'show-complete-request-distribution'},
         'version'                  => sub { VersionMessage && exit(0) },
         'help'                     => \&help,
@@ -2795,6 +2929,8 @@ sub help() {
     print << "    EOF"
 
 Options and their default values if they have any:
     print << "    EOF"
 
 Options and their default values if they have any:
+    [--detect-innactivity]
+    [--inactivity-threshold $cli_options{'inactivity-threshold'}]
     [--host-statistics-threshold $cli_options{'host-statistics-threshold'}]
     [--html-output]
     [--no-embedded-css]
     [--host-statistics-threshold $cli_options{'host-statistics-threshold'}]
     [--html-output]
     [--no-embedded-css]
@@ -2806,6 +2942,7 @@ Options and their default values if they have any:
     [--statistics]
     [--unbreak-lines-only]
     [--url-statistics-threshold $cli_options{'url-statistics-threshold'}]
     [--statistics]
     [--unbreak-lines-only]
     [--url-statistics-threshold $cli_options{'url-statistics-threshold'}]
+    [--passed-request-statistics-threshold $cli_options{'passed-request-statistics-threshold'}]
     [--title $cli_options{'title'}]
     [--version]
 see "perldoc $0" for more information
     [--title $cli_options{'title'}]
     [--version]
 see "perldoc $0" for more information
@@ -2823,18 +2960,18 @@ sub main() {
     set_background(DEFAULT_BACKGROUND);
     prepare_our_stuff();
 
     set_background(DEFAULT_BACKGROUND);
     prepare_our_stuff();
 
-    print_intro();
-
     # XXX: should explicitly reject incompatible argument combinations
     if (cli_option_is_set('unbreak-lines-only')) {
         unbreak_lines_only_loop();
     } elsif (cli_option_is_set('statistics')) {
         stats_loop();
     # XXX: should explicitly reject incompatible argument combinations
     if (cli_option_is_set('unbreak-lines-only')) {
         unbreak_lines_only_loop();
     } elsif (cli_option_is_set('statistics')) {
         stats_loop();
+    } elsif (cli_option_is_set('detect-inactivity')) {
+        inactivity_detection_loop();
     } else {
     } else {
+        print_intro();
         parse_loop();
         parse_loop();
+        print_outro();
     }
     }
-
-    print_outro();
 }
 
 main();
 }
 
 main();
@@ -2845,7 +2982,8 @@ B<privoxy-log-parser> - A parser and syntax-highlighter for Privoxy log messages
 
 =head1 SYNOPSIS
 
 
 =head1 SYNOPSIS
 
-B<privoxy-log-parser> [B<--html-output>]
+B<privoxy-log-parser> [B<--detect-inactivity>] [B<--inactivity-threshold msecs>]
+[B<--html-output>]
 [B<--no-msecs>] [B<--no-syntax-higlighting>] [B<--statistics>]
 [B<--shorten-thread-ids>] [B<--show-ineffective-filters>]
 [B<--url-statistics-threshold>] [B<--version>]
 [B<--no-msecs>] [B<--no-syntax-higlighting>] [B<--statistics>]
 [B<--shorten-thread-ids>] [B<--show-ineffective-filters>]
 [B<--url-statistics-threshold>] [B<--version>]
@@ -2872,6 +3010,10 @@ will hide the "filter foo caused 0 hits" message.
 
 =head1 OPTIONS
 
 
 =head1 OPTIONS
 
+[B<--detect-inactivity>] Instead of syntax highlighting, detect periods
+of log inactivity of more than the amount of milliseconds specified with
+the B<--inactivity-threshold> option. Mainly useful for debugging.
+
 [B<--host-statistics-threshold>] Only show the request count for a host
 if it's above or equal to the given threshold. If the threshold is 0, host
 statistics are disabled.
 [B<--host-statistics-threshold>] Only show the request count for a host
 if it's above or equal to the given threshold. If the threshold is 0, host
 statistics are disabled.
@@ -2881,6 +3023,10 @@ omitted, ANSI escape sequences are used unless B<--no-syntax-highlighting> is ac
 This option is only intended to make embedding log excerpts in web pages easier.
 It does not escape any input!
 
 This option is only intended to make embedding log excerpts in web pages easier.
 It does not escape any input!
 
+[B<--inactivity-threshold msecs>] Specifies the number of milliseconds between
+log messages to consider inactivity when running in [B<--detect-inactivity>]
+mode.
+
 [B<--keep-date>] Don't remove the date when printing highlighted log messages.
 Useful when parsing multiple log files at once.
 
 [B<--keep-date>] Don't remove the date when printing highlighted log messages.
 Useful when parsing multiple log files at once.
 
@@ -2891,6 +3037,10 @@ the filtered output is piped into less in which case the ANSI control
 codes don't work, or if the terminal itself doesn't support the control
 codes.
 
 codes don't work, or if the terminal itself doesn't support the control
 codes.
 
+[B<--passed-request-statistics-threshold>] Only show the request count for
+a passed requests if it's above or equal to the given threshold. If the
+threshold is 0, passed request statistics are disabled.
+
 [B<--shorten-thread-ids>] Shorten the thread ids to a three-digit decimal number.
 Note that the mapping from thread ids to shortened ids is created at run-time
 and thus varies with the input.
 [B<--shorten-thread-ids>] Shorten the thread ids to a three-digit decimal number.
 Note that the mapping from thread ids to shortened ids is created at run-time
 and thus varies with the input.