X-Git-Url: http://www.privoxy.org/gitweb/?p=privoxy.git;a=blobdiff_plain;f=tools%2Fprivoxy-log-parser.pl;h=f27188c42f59c51801a177cc8cabae8adbf0297e;hp=78e7aa536474ff9847f760d5dd5e97176cdfed55;hb=HEAD;hpb=babd5dc0098b2c1703e1c1395c5262ec293c283e diff --git a/tools/privoxy-log-parser.pl b/tools/privoxy-log-parser.pl index 78e7aa53..ae18c57e 100755 --- a/tools/privoxy-log-parser.pl +++ b/tools/privoxy-log-parser.pl @@ -23,7 +23,7 @@ # hash key as input. # - Add --compress and --decompress options. # -# Copyright (c) 2007-2021 Fabian Keil +# Copyright (c) 2007-2024 Fabian Keil # # Permission to use, copy, modify, and distribute this software for any # purpose with or without fee is hereby granted, provided that the above @@ -43,15 +43,17 @@ use warnings; use Getopt::Long; use constant { - PRIVOXY_LOG_PARSER_VERSION => '0.9.3', + PRIVOXY_LOG_PARSER_VERSION => '0.9.5', # Feel free to mess with these ... DEFAULT_BACKGROUND => 'black', # Choose registered colour (like 'black') DEFAULT_TEXT_COLOUR => 'white', # Choose registered colour (like 'black') HEADER_DEFAULT_COLOUR => 'yellow', REGISTER_HEADERS_WITH_THE_SAME_COLOUR => 1, + CLI_OPTION_DETECT_INACTIVITY => 0, CLI_OPTION_DEFAULT_TO_HTML_OUTPUT => 0, CLI_OPTION_TITLE => 'Privoxy-Log-Parser in da house', + CLI_OPTION_INACTIVITY_THRESHOLD => 100, CLI_OPTION_KEEP_DATE => 0, CLI_OPTION_NO_EMBEDDED_CSS => 0, CLI_OPTION_NO_MSECS => 0, @@ -62,6 +64,7 @@ use constant { CLI_OPTION_STRICT_CHECKS => 0, CLI_OPTION_UNBREAK_LINES_ONLY => 0, CLI_OPTION_URL_STATISTICS_THRESHOLD => 0, + CLI_OPTION_PASSED_REQUEST_STATISTICS_THRESHOLD => 0, CLI_OPTION_HOST_STATISTICS_THRESHOLD => 0, CLI_OPTION_SHOW_COMPLETE_REQUEST_DISTRIBUTION => 0, @@ -183,6 +186,8 @@ sub prepare_our_stuff() { 'configuration-line' => 'red', 'content-type' => 'yellow', 'HOST' => HEADER_DEFAULT_COLOUR, + 'tls-version' => 'pink', + 'cipher-suite' => 'light_cyan', ); %h_colours = %h; @@ -1455,7 +1460,8 @@ sub handle_loglevel_crunch($) { # Highlight crunch reason foreach my $reason (keys %reason_colours) { - $content =~ s@($reason)@$reason_colours{$reason}$1$h{'Standard'}@g; + # Crunch: Blocked: https://capture.condenastdigital.com/track?_o=cne&[...]&dim2=%7B%22adBlocked%[...] + $content =~ s@($reason)@$reason_colours{$reason}$1$h{'Standard'}@; } if ($content =~ m/\[too long, truncated\]$/) { @@ -1568,6 +1574,11 @@ sub handle_loglevel_connect($) { $c =~ s@(?<=accept failed: )(.*)@$h{'Error'}$1$h{'Standard'}@; + } elsif ($c =~ m/^Failed to accept\(\) incoming connection:/) { + + # Failed to accept() incoming connection: Software caused connection abort + $c =~ s@(?<=connection: )(.*)@$h{'Error'}$1$h{'Standard'}@; + } elsif ($c =~ m/^Overriding forwarding settings/) { # Overriding forwarding settings based on 'forward 10.0.0.1:8123' @@ -1622,9 +1633,10 @@ sub handle_loglevel_connect($) { $c = highlight_matched_host($c, '(?<=for )[^\s]+'); $c =~ s@(?<=in slot )(\d+)@$h{'Number'}$1$h{'Standard'}@; - } elsif ($c =~ m/^Socket/) { + } elsif ($c =~ m/^Socket \d+ (already|closed)/) { # Socket 16 already forgotten or never remembered. + # Socket 9 closed while waiting for client headers $c =~ s@(?<=Socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; } elsif ($c =~ m/^The connection to/) { @@ -1641,7 +1653,7 @@ sub handle_loglevel_connect($) { $c =~ s@(?<=Assumed latency: )(\d+)@$h{'Number'}$1$h{'Standard'}@; } elsif ($c =~ m/^Stopped waiting for the request line/ or - $c =~ m/^No request line on socket \d received in time/ or + $c =~ m/^No request line on socket \d+ received in time/ or $c =~ m/^The client side of the connection on socket \d/) { # Stopped waiting for the request line. Timeout: 121. @@ -1798,8 +1810,13 @@ sub handle_loglevel_connect($) { } elsif ($c =~ m/^Optimistically sending /) { # Optimistically sending 318 bytes of client headers intended for www.privoxy.org + # Optimistically sending 318 bytes of client headers intended for www.privoxy.org. $c =~ s@(?<=sending )(\d+)@$h{'Number'}$1$h{'Standard'}@; - $c = highlight_matched_host($c, '(?<=for )[^\s]+'); + if ($c =~ /\.$/) { + $c = highlight_matched_host($c, '[^\s]+(?=\.)'); + } else { + $c = highlight_matched_host($c, '(?<=for )[^\s]+'); + } } elsif ($c =~ m/^Stopping to watch the client socket/) { @@ -1895,6 +1912,75 @@ sub handle_loglevel_connect($) { # The peer notified us that the connection on socket 11 is going to be closed $c =~ s@(?<=socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; + } elsif ($c =~ m/^Client socket \d is no longer usable/) { + + # Client socket 7 is no longer usable. The server socket has been closed. + $c =~ s@(?<=socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^Socket timeout \d+ reached/) { + + # Socket timeout 3 reached: http://127.0.0.1:20000/no-filter/chunked-content/36 + $c =~ s@(?<=timeout )(\d+)@$h{'Number'}$1$h{'Standard'}@; + $c = highlight_matched_url($c, "(?<=reached: ).*") + + } elsif ($c =~ m/^Prepared to read up to /) { + + # Prepared to read up to 157 bytes of encrypted request body from the client. + $c =~ s@(?<=up to )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^Forwarding \d+ bytes /) { + + # Forwarding 157 bytes of encrypted request body. + $c =~ s@(?<=Forwarding )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^Buffering encrypted client body/) { + + # Buffering encrypted client body. Prepared to read up to 2236 bytes. + $c =~ s@(?<=up to )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^The last \d+ bytes of the encrypted request body have been read/) { + + # The last 6945 bytes of the encrypted request body have been read. + $c =~ s@(?<=The last )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^Reducing the chunk offset from/) { + + # Reducing the chunk offset from 1096654 to 32704 after discarding 1063950 bytes to make room in the buffer. + # Reducing the chunk offset from 16219 to 128 after flushing 16091 bytes. + $c =~ s@(?<=\d to )(\d+)@$h{'Number'}$1$h{'Standard'}@; + $c =~ s@(?<=offset from )(\d+)@$h{'Number'}$1$h{'Standard'}@; + $c =~ s@(?<=after discarding )(\d+)@$h{'Number'}$1$h{'Standard'}@; + $c =~ s@(?<=after flushing )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^Client socket \d+ is no longer usable/) { + + # Client socket 21 is no longer usable. The server socket has been closed. + $c =~ s@(?<=Client socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^(Client|Server) successfully connected over/) { + + # Server successfully connected over TLSv1.3 (TLS_AES_256_GCM_SHA384). + # Client successfully connected over TLSv1.3 (TLS_AES_128_GCM_SHA256). + $c =~ s@(?<=connected over )(TLSv\d\.\d)@$h{'tls-version'}$1$h{'Standard'}@; + $c =~ s@(?<=\()([^)]+)@$h{'cipher-suite'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^Couldn't deliver the error message for/) { + + # Couldn't deliver the error message for https://m.media-amazon.com/[...] through client socket 18 using TLS/SSL + $c = highlight_matched_url($c, "(?<=error message for )[^ ]*"); + $c =~ s@(?<=client socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^Keeping chunk offset at/) { + + # Keeping chunk offset at 0 despite flushing 31 bytes. + $c =~ s@(?<=offset at )(\d+)@$h{'Number'}$1$h{'Standard'}@; + $c =~ s@(?<=flushing )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^Not shutting down client connection on/) { + + # Not shutting down client connection on socket 8. The socket is no longer alive. + $c =~ s@(?<=socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; + } elsif ($c =~ m/^Looks like we / or $c =~ m/^Unsetting keep-alive flag/ or $c =~ m/^No connections to wait/ or @@ -2141,6 +2227,10 @@ sub handle_loglevel_error($) { $c =~ s@(?<=digested )(\d+)@$h{'Number'}$1$h{'Standard'}@; $c =~ s@(?<=of )(\d+)@$h{'Number'}$1$h{'Standard'}@; + } elsif ($c =~ m/^The socks connection timed out after/) { + + # The socks connection timed out after 60 seconds. + $c =~ s@(?<=after )(\d+)@$h{'Number'}$1$h{'Standard'}@; } # XXX: There are probably more messages that deserve highlighting. @@ -2148,6 +2238,41 @@ sub handle_loglevel_error($) { return $c; } +sub handle_loglevel_received($) { + + my $c = shift; + + if ($c =~ m/^TLS from socket/) { + # TLS from socket 3: \x16\xda\xe2\xa2;\x0d\x0a + + $c =~ s@(?<=TLS from socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^from socket/) { + # from socket 3: HEAD http://p.p/ HTTP/1.1\x0d\x0aHost: p.p\x0d\x0aUser-Agent: curl/7.85.0\x0d\x0aAccept: */*\x0d\x0aProxy-Connection: Keep-Alive\x0d\x0a\x0d\x0a + + $c =~ s@(?<=from socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; + } + + return $c; +} + +sub handle_loglevel_writing($) { + + my $c = shift; + + if ($c =~ m/^to socket/) { + # to socket 11: HTTP/1.1 200 Connection established\x0d\x0a\x0d\x0a + + $c =~ s@(?<=to socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^TLS on socket /) { + # TLS on socket 9: o~\xfcS[\xfa\x8f\xd6\x96\xe6_\xc7$\x1b[...] + + $c =~ s@(?<=TLS on socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; + } + + return $c; +} sub handle_loglevel_ignore($) { return shift; @@ -2199,17 +2324,25 @@ sub gather_loglevel_clf_stats($) { } sub gather_loglevel_request_stats($$) { - my $c = shift; + my $request_url = shift; my $thread = shift; our %stats; + our %cli_options; $stats{requests}++; + if ($cli_options{'passed-request-statistics-threshold'} != 0) { + # If the request get blocked we'll decrement + # in gather_loglevel_crunch_stats() + chomp $request_url; + $stats{'passed-request-url'}{$request_url}++; + } } sub gather_loglevel_crunch_stats($$) { my $c = shift; my $thread = shift; our %stats; + our %cli_options; $stats{crunches}++; @@ -2229,6 +2362,15 @@ sub gather_loglevel_crunch_stats($$) { # Connection failure: http://127.0.0.1:8080/ $stats{'connection-failure'}++; } + if ($cli_options{'passed-request-statistics-threshold'} != 0) { + $c =~ m/^[^:]+: (.*)/; + if ($stats{'passed-request-url'}{$1}) { + $stats{'passed-request-url'}{$1}-- ; + if ($stats{'passed-request-url'}{$1} == 0) { + delete($stats{'passed-request-url'}{$1}); + } + } + } } @@ -2410,7 +2552,7 @@ sub print_stats() { ")\n"; print "Client connections: " . $stats{'closed-client-connections'} . "\n"; if ($stats{'content-size-total'}) { - print "Bytes of content transfered to the client: " . $stats{'content-size-total'} . "\n"; + print "Bytes of content transferred to the client: " . $stats{'content-size-total'} . "\n"; } my $lines_printed = 0; print "Client requests per connection distribution:\n"; @@ -2449,7 +2591,7 @@ sub print_stats() { print "HTTP version distribution unknown. No CLF message parsed yet. Is 'debug 512' enabled?\n"; } if (exists $stats{'status-code'}) { - print "HTTP status codes:\n"; + print "HTTP status codes according to 'debug 512' (status codes sent by the server may differ):\n"; foreach my $status_code (sort {$stats{'status-code'}{$b} <=> $stats{'status-code'}{$a}} keys %{$stats{'status-code'}}) { printf "%8d : %-8d\n", $stats{'status-code'}{$status_code}, $status_code; } @@ -2470,6 +2612,19 @@ sub print_stats() { } } + if ($cli_options{'passed-request-statistics-threshold'} == 0) { + print "Passed request statistics are disabled. Increase --passed-request-statistics-threshold to enable them.\n"; + } else { + print "Requested requests that were passed:\n"; + foreach my $passed_url (sort {$stats{'passed-request-url'}{$b} <=> $stats{'passed-request-url'}{$a}} + keys %{$stats{'passed-request-url'}}) { + if ($stats{'passed-request-url'}{$passed_url} < $cli_options{'passed-request-statistics-threshold'}) { + print "Skipped statistics for passed URLs below the treshold.\n"; + last; + } + printf "%d : %s\n", $stats{'passed-request-url'}{$passed_url}, $passed_url; + } + } if ($cli_options{'host-statistics-threshold'} == 0) { print "Host statistics are disabled. Increase --host-statistics-threshold to enable them.\n"; } else { @@ -2576,8 +2731,8 @@ sub parse_loop() { 'Force' => \&handle_loglevel_force, 'Error' => \&handle_loglevel_error, 'Fatal error' => \&handle_loglevel_ignore, - 'Writing' => \&handle_loglevel_ignore, - 'Received' => \&handle_loglevel_ignore, + 'Writing' => \&handle_loglevel_writing, + 'Received' => \&handle_loglevel_received, 'Tagging' => \&handle_loglevel_tagging, 'Actions' => \&handle_loglevel_ignore, 'Unknown log level' => \&handle_loglevel_ignore, @@ -2655,7 +2810,7 @@ sub parse_loop() { sub stats_loop() { - my ($day, $time_stamp, $msecs, $thread, $log_level, $content); + my ($day, $time_stamp, $thread, $log_level, $content); my $strict_checks = cli_option_is_set('strict-checks'); my %log_level_handlers = ( 'Connect:' => \&gather_loglevel_connect_stats, @@ -2703,6 +2858,75 @@ sub stats_loop() { } +# Convert a timestamp like 18:07:28.733 into milliseconds +sub time_stamp_to_msecs($) { + my $time_stamp = shift; + + if ($time_stamp =~ /(\d\d):(\d\d):(\d\d)\.(\d{3})/) { + my ($hours, $minutes, $seconds, $msecs) = ($1, $2, $3, $4); + + $msecs += $seconds * 1000; + $msecs += $minutes * 1000 * 60; + $msecs += $hours * 1000 * 60 * 60; + + return $msecs; + } + return undef; +} + +sub inactivity_detection_loop() { + + our %cli_options; + my ($date, $time_stamp, $thread, $log_level, $content); + my ($msecs, $previous_msecs, $inactivity); + my $inactivity_threshold = $cli_options{'inactivity-threshold'}; + my $previous_date; + my $log_messages_out_of_order = 0; + + while (<>) { + ($date, $time_stamp, $thread, $log_level, $content) = split(/ /, $_, 5); + + next if (not defined($log_level)); + next if ($time_stamp eq "-"); + $msecs = time_stamp_to_msecs($time_stamp); + unless (defined $msecs) { + print "Failed to convert $time_stamp into milliseconds\n"; + print "$_"; + next; + } + unless (defined $previous_msecs) { + $previous_msecs = $msecs; + $previous_date = $date; + print "$_"; + next; + } + $inactivity = $msecs - $previous_msecs; + if ($inactivity < 0) { + # This can happen if there's a high load in which case + # a Privoxy thread may be moved off schedule between + # getting the timestamp for the log message and actually + # writing it. + $log_messages_out_of_order++; + } + if ($inactivity > $inactivity_threshold) { + if ($previous_date eq $date) { + print "Detected inactivity: $inactivity msecs\n"; + } else { + # While we could include the date in the timestamp + # we currently don't. + print "Detected date change. Timestamp difference ignored.\n"; + } + } + print "$_"; + $previous_msecs = $msecs; + $previous_date = $date; + } + if ($log_messages_out_of_order) { + print "At least $log_messages_out_of_order messages were written out of the chronological order.\n"; + print "This can result in false positives. Consider sorting the log first.\n"; + } +} + sub unbreak_lines_only_loop() { my $log_messages_reached = 0; while (<>) { @@ -2739,6 +2963,8 @@ sub VersionMessage { sub get_cli_options() { our %cli_options = ( + 'detect-inactivity' => CLI_OPTION_DETECT_INACTIVITY, + 'inactivity-threshold' => CLI_OPTION_INACTIVITY_THRESHOLD, 'html-output' => CLI_OPTION_DEFAULT_TO_HTML_OUTPUT, 'title' => CLI_OPTION_TITLE, 'keep-date' => CLI_OPTION_KEEP_DATE, @@ -2752,10 +2978,13 @@ sub get_cli_options() { 'url-statistics-threshold' => CLI_OPTION_URL_STATISTICS_THRESHOLD, 'unbreak-lines-only' => CLI_OPTION_UNBREAK_LINES_ONLY, 'host-statistics-threshold'=> CLI_OPTION_HOST_STATISTICS_THRESHOLD, + 'passed-request-statistics-threshold' => CLI_OPTION_PASSED_REQUEST_STATISTICS_THRESHOLD, 'show-complete-request-distribution' => CLI_OPTION_SHOW_COMPLETE_REQUEST_DISTRIBUTION, ); GetOptions ( + 'detect-inactivity' => \$cli_options{'detect-inactivity'}, + 'inactivity-threshold=i' => \$cli_options{'inactivity-threshold'}, 'html-output' => \$cli_options{'html-output'}, 'title' => \$cli_options{'title'}, 'keep-date' => \$cli_options{'keep-date'}, @@ -2769,6 +2998,7 @@ sub get_cli_options() { 'unbreak-lines-only' => \$cli_options{'unbreak-lines-only'}, 'url-statistics-threshold=i'=> \$cli_options{'url-statistics-threshold'}, 'host-statistics-threshold=i'=> \$cli_options{'host-statistics-threshold'}, + 'passed-request-statistics-threshold=i' => \$cli_options{'passed-request-statistics-threshold'}, 'show-complete-request-distribution' => \$cli_options{'show-complete-request-distribution'}, 'version' => sub { VersionMessage && exit(0) }, 'help' => \&help, @@ -2790,6 +3020,8 @@ sub help() { print << " EOF" Options and their default values if they have any: + [--detect-innactivity] + [--inactivity-threshold $cli_options{'inactivity-threshold'}] [--host-statistics-threshold $cli_options{'host-statistics-threshold'}] [--html-output] [--no-embedded-css] @@ -2801,6 +3033,7 @@ Options and their default values if they have any: [--statistics] [--unbreak-lines-only] [--url-statistics-threshold $cli_options{'url-statistics-threshold'}] + [--passed-request-statistics-threshold $cli_options{'passed-request-statistics-threshold'}] [--title $cli_options{'title'}] [--version] see "perldoc $0" for more information @@ -2818,18 +3051,18 @@ sub main() { set_background(DEFAULT_BACKGROUND); prepare_our_stuff(); - print_intro(); - # XXX: should explicitly reject incompatible argument combinations if (cli_option_is_set('unbreak-lines-only')) { unbreak_lines_only_loop(); } elsif (cli_option_is_set('statistics')) { stats_loop(); + } elsif (cli_option_is_set('detect-inactivity')) { + inactivity_detection_loop(); } else { + print_intro(); parse_loop(); + print_outro(); } - - print_outro(); } main(); @@ -2840,7 +3073,8 @@ B - A parser and syntax-highlighter for Privoxy log messages =head1 SYNOPSIS -B [B<--html-output>] +B [B<--detect-inactivity>] [B<--inactivity-threshold msecs>] +[B<--html-output>] [B<--no-msecs>] [B<--no-syntax-higlighting>] [B<--statistics>] [B<--shorten-thread-ids>] [B<--show-ineffective-filters>] [B<--url-statistics-threshold>] [B<--version>] @@ -2867,15 +3101,23 @@ will hide the "filter foo caused 0 hits" message. =head1 OPTIONS +[B<--detect-inactivity>] Instead of syntax highlighting, detect periods +of log inactivity of more than the amount of milliseconds specified with +the B<--inactivity-threshold> option. Mainly useful for debugging. + [B<--host-statistics-threshold>] Only show the request count for a host if it's above or equal to the given threshold. If the threshold is 0, host statistics are disabled. -[B<--html-output>] Use HTML and CSS for the syntax highlighting. If this option is +[B<--html-output>] Use HTML and CSS when syntax highlighting. If this option is omitted, ANSI escape sequences are used unless B<--no-syntax-highlighting> is active. This option is only intended to make embedding log excerpts in web pages easier. It does not escape any input! +[B<--inactivity-threshold msecs>] Specifies the number of milliseconds between +log messages to consider inactivity when running in [B<--detect-inactivity>] +mode. + [B<--keep-date>] Don't remove the date when printing highlighted log messages. Useful when parsing multiple log files at once. @@ -2886,6 +3128,10 @@ the filtered output is piped into less in which case the ANSI control codes don't work, or if the terminal itself doesn't support the control codes. +[B<--passed-request-statistics-threshold>] Only show the request count for +a passed requests if it's above or equal to the given threshold. If the +threshold is 0, passed request statistics are disabled. + [B<--shorten-thread-ids>] Shorten the thread ids to a three-digit decimal number. Note that the mapping from thread ids to shortened ids is created at run-time and thus varies with the input.