X-Git-Url: http://www.privoxy.org/gitweb/?p=privoxy.git;a=blobdiff_plain;f=tools%2Fprivoxy-log-parser.pl;h=335e2dffb450d768cf831a8c215f58825abe00e0;hp=df28db95a080ef30f2e59cdaf3f3a1ec158f32a1;hb=60513e33c518dd45644143581976e601d4d6aa60;hpb=c402a37c55bb679f8b5b4a1b6eb3624f43ef5260 diff --git a/tools/privoxy-log-parser.pl b/tools/privoxy-log-parser.pl index df28db95..335e2dff 100755 --- a/tools/privoxy-log-parser.pl +++ b/tools/privoxy-log-parser.pl @@ -23,7 +23,7 @@ # hash key as input. # - Add --compress and --decompress options. # -# Copyright (c) 2007-2020 Fabian Keil +# Copyright (c) 2007-2021 Fabian Keil # # Permission to use, copy, modify, and distribute this software for any # purpose with or without fee is hereby granted, provided that the above @@ -43,7 +43,7 @@ use warnings; use Getopt::Long; use constant { - PRIVOXY_LOG_PARSER_VERSION => '0.9.1', + PRIVOXY_LOG_PARSER_VERSION => '0.9.2', # Feel free to mess with these ... DEFAULT_BACKGROUND => 'black', # Choose registered colour (like 'black') DEFAULT_TEXT_COLOUR => 'white', # Choose registered colour (like 'black') @@ -52,6 +52,7 @@ use constant { CLI_OPTION_DEFAULT_TO_HTML_OUTPUT => 0, CLI_OPTION_TITLE => 'Privoxy-Log-Parser in da house', + CLI_OPTION_KEEP_DATE => 0, CLI_OPTION_NO_EMBEDDED_CSS => 0, CLI_OPTION_NO_MSECS => 0, CLI_OPTION_NO_SYNTAX_HIGHLIGHTING => 0, @@ -107,6 +108,7 @@ my %h_colours; my $header_highlight_regex = ''; my $html_output_mode; +my $keep_date_mode; my $no_msecs_mode; # XXX: should probably be removed my $shorten_thread_ids; my $line_end; @@ -128,6 +130,7 @@ sub prepare_our_stuff() { 'Re-Filter' => 'purple', Connect => 'brown', Request => 'light_cyan', + Tagging => 'purple', CGI => 'light_green', Redirect => 'cyan', Error => 'light_red', @@ -176,6 +179,7 @@ sub prepare_our_stuff() { 'pcrs-delimiter' => 'light_red', 'ignored' => 'light_red', 'action-bits-update' => 'light_red', + 'http-downgrade' => 'light_red', 'configuration-line' => 'red', 'content-type' => 'yellow', 'HOST' => HEADER_DEFAULT_COLOUR, @@ -1061,7 +1065,7 @@ sub handle_loglevel_re_filter($) { return ''; } - $c =~ s@(?<=\(size )(\d+)\)(?= with)@$h{'Number'}$1$h{'Standard'}@; + $c =~ s@(?<=\(size )(\d+)@$h{'Number'}$1$h{'Standard'}@; $c =~ s@(?<=\(new size )(\d+)@$h{'Number'}$1$h{'Standard'}@; $c =~ s@(?<=produced )(\d+)(?= hits)@$h{'Number'}$1$h{'Standard'}@; @@ -1210,6 +1214,12 @@ sub handle_loglevel_re_filter($) { return '' unless SHOW_FILTER_READIN_IN; + } elsif ($c =~ m/^Decompression didn't result/) { + + # Decompression didn't result in any content. + + # Nothing to highlight. + } else { found_unknown_content($content); @@ -1219,6 +1229,26 @@ sub handle_loglevel_re_filter($) { return $content; } +sub handle_loglevel_tagging($) { + + my $c = shift; + + if ($c =~ /^Tagger \'([^\']*)\' added tag \'([^\']*)\'/ or + $c =~ m/^Adding tag \'([^\']*)\' created by header tagger \'([^\']*)\'/) { + + # Adding tag 'GET request' created by header tagger 'method-man' (XXX: no longer used) + # Tagger 'revalidation' added tag 'REVALIDATION-REQUEST'. No action bit update necessary. + # Tagger 'revalidation' added tag 'REVALIDATION-REQUEST'. Action bits updated accordingly. + + # XXX: Save tag and tagger + + $c =~ s@(?<=^Tagger \')([^\']*)@$h{'tagger'}$1$h{'Standard'}@; + $c =~ s@(?<=added tag \')([^\']*)@$h{'tag'}$1$h{'Standard'}@; + $c =~ s@(?<=Action bits )(updated)@$h{'action-bits-update'}$1$h{'Standard'}@; + } + return $c; +} + sub handle_loglevel_redirect($) { my $c = shift; @@ -1276,6 +1306,17 @@ sub handle_loglevel_redirect($) { # Percent-encoding redirect URL: http://www.example.org/\x02 $c = highlight_matched_url($c, '(?<=redirect URL: ).*'); + } elsif ($c =~ m/^Rewrite detected:/) { + + # Rewrite detected: GET http://10.0.0.2:88/blah.txt HTTP/1.1 + # Rewrite detected: GET https://www.electrobsd.org/CommonJS/ajax/libs/jquery/3.4.1/jquery.min.js HTTP/1.1 + $c = highlight_matched_request_line($c, '(?<=^Rewrite detected: ).*'); + + } elsif ($c =~ m/^Rewritten request line results in downgrade to http/) { + + # Rewritten request line results in downgrade to http + $c =~ s@(downgrade)@$h{'http-downgrade'}$1$h{'Standard'}@; + } else { found_unknown_content($c); @@ -1381,6 +1422,11 @@ sub handle_loglevel_crunch($) { # [...]&filter... [too long, truncated] $content = highlight_matched_pattern($content, 'request_', '^.*(?=\.\.\. \[too long, truncated\]$)'); + } elsif ($content =~ m/Certificate error:/) { + + # Certificate error: ASN date error, current date after: https://expired.badssl.com/ + $content = highlight_matched_pattern($content, 'request_', 'https://.*'); + } else { # Blocked: http://ads.example.org/ @@ -1610,7 +1656,10 @@ sub handle_loglevel_connect($) { # Connection from 81.163.28.218 dropped due to ACL # Rejecting connection from 178.63.152.227. Maximum number of connections reached. - $c =~ s@(?<=onnection from )((?:\d+\.?){3}\d+)@$h{'Number'}$1$h{'Standard'}@; + # Connection from 192.168.2.1 on 127.0.1.1:8118 (socket 3) dropped due to ACL + $c = highlight_matched_host($c, '(?<=onnection from )[\d.:]+'); + $c = highlight_matched_host($c, '(?<=on )[\d.:]+'); + $c =~ s@(?<=socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; } elsif ($c =~ m/^(?:Reusing|Closing) server socket / or $c =~ m/^No additional client request/) { @@ -1624,6 +1673,7 @@ sub handle_loglevel_connect($) { # Reusing server socket 7 connected to www.privoxy.org. Total requests: 2. # Closing server socket 6 connected to d.asset.soup.io. Keep-alive: 0.\ # Tainted: 1. Socket alive: 1. Timeout: 60. Configuration file change detected: 0. + # Reusing server socket 35 connected to nl.wikipedia.org. Requests already sent: 5. $c =~ s@(?<= socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; $c = highlight_matched_host($c, '(?<=for )[^\s]+(?=\.)'); @@ -1631,6 +1681,7 @@ sub handle_loglevel_connect($) { for my $number_pattern ('requests', 'Keep-alive', 'Tainted', ' alive', 'Timeout', 'detected') { $c = highlight_matched_pattern($c, 'Number', '(?<='. $number_pattern . ': )\d+'); } + $c =~ s@(?<=already sent: )(\d+)@$h{'Number'}$1$h{'Standard'}@; } elsif ($c =~ m/^Connected to /) { @@ -1746,6 +1797,48 @@ sub handle_loglevel_connect($) { # Performing the TLS/SSL handshake with client. Hash of host: bab5296b25e256c7b06b92b17b56bcae $c = highlight_matched_host($c, '(?<=Hash of host: ).+'); + } elsif ($c =~ m/^Forwarding \d+ bytes of encrypted POST data/) { + + # Forwarding 1954 bytes of encrypted POST data + $c =~ s@(?<=Forwarding )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^Forwarded the last \d+ bytes/) { + + # Forwarded the last 1954 bytes + $c =~ s@(?<=the last )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^Waiting for the next client connection. Currently active threads:/) { + + # Waiting for the next client connection. Currently active threads: 30 + $c =~ s@(?<=threads: )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^Data arrived in time on client socket/) { + + # Data arrived in time on client socket 6. Requests so far: 3 + $c =~ s@(?<=client socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; + $c =~ s@(?<=Requests so far: )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^Dropping the client connection on socket/) { + + # Dropping the client connection on socket 71. The server connection has not been established yet. + $c =~ s@(?<=on socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^The client socket \d+ has become unusable while the server/) { + + # The client socket 16 has become unusable while the server socket 24 is still open. + $c =~ s@(?<=client socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; + $c =~ s@(?<=server socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^The last \d+ bytes of the request body have been read/) { + + # The last 12078 bytes of the request body have been read + $c =~ s@(?<=The last )(\d+)@$h{'Number'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^Flushed \d+ bytes of request body/) { + + # Flushed 3153 bytes of request body + $c =~ s@(?<=Flushed )(\d+)@$h{'Number'}$1$h{'Standard'}@; + } elsif ($c =~ m/^Looks like we / or $c =~ m/^Unsetting keep-alive flag/ or $c =~ m/^No connections to wait/ or @@ -1979,6 +2072,12 @@ sub handle_loglevel_error($) { # Didn't receive data in time: a.fsdn.com:443 $c =~ s@(?<=in time: )(.*)@$h{'destination'}$1$h{'Standard'}@; + + } elsif ($c =~ m/^Sending data on socket \d+ over TLS/) { + + # Sending data on socket 33 over TLS/SSL failed: no TLS/SSL errors detected + $c =~ s@(?<=on socket )(\d+)@$h{'Number'}$1$h{'Standard'}@; + } # XXX: There are probably more messages that deserve highlighting. @@ -1999,15 +2098,25 @@ sub gather_loglevel_clf_stats($) { our %cli_options; # +0200] "GET https://www.youtube.com/watch?v=JmcA9LIIXWw HTTP/1.1" 200 68004 - $content =~ m/^[+-]\d{4}\] "(\w+) (.+) (HTTP\/\d\.\d)" (\d+) (\d+)/; + # +0200] "VERSION-CONTROL http://p.p/ HTTP/1.1" 200 2787 + $content =~ m/^[+-]\d{4}\] "([^ ]+) (.+) (HTTP\/\d\.\d)" (\d+) (\d+)/; $method = $1; $resource = $2; $http_version = $3; $status_code = $4; $size = $5; + $stats{requests_clf}++; + unless (defined $method) { - print("Failed to parse: $content\n"); + # +0200] "Invalid request" 400 0 + return if ($content =~ m/^[+-]\d{4}\] "Invalid request"/); + # +0100] "GET https://securepubads.g.doubleclick.net/gampad/ads?gd[...]... [too long, truncated] + if ($content =~ m/\[too long, truncated\]$/) { + print("Skipped LOG_LEVEL_CLF message that got truncated by Privoxy. Statistics will be inprecise.\n"); + } else { + print("Failed to parse: $content\n"); + } return; } $stats{'method'}{$method}++; @@ -2017,12 +2126,11 @@ sub gather_loglevel_clf_stats($) { $stats{'http-version'}{$http_version}++; if ($cli_options{'host-statistics-threshold'} != 0) { - $resource =~ m@(?:http[s]://)([^/]+)/?@; + $resource =~ m@(?:https?://)?([^/]+)/?@; $stats{'hosts'}{$1}++; } $stats{'content-size-total'} += $size; $stats{'status-code'}{$status_code}++; - $stats{requests_clf}++; } sub gather_loglevel_request_stats($$) { @@ -2237,7 +2345,7 @@ sub print_stats() { ")\n"; print "Client connections: " . $stats{'closed-client-connections'} . "\n"; if ($stats{'content-size-total'}) { - print "Bytes transfered excluding headers: " . $stats{'content-size-total'} . "\n"; + print "Bytes of content transfered to the client: " . $stats{'content-size-total'} . "\n"; } my $lines_printed = 0; print "Client requests per connection distribution:\n"; @@ -2341,12 +2449,14 @@ sub print_clf_message() { sub print_non_clf_message($) { my $content = shift; + my $date_string = $keep_date_mode ? $req{$t}{'day'} . ' ' : ''; my $msec_string = $no_msecs_mode ? '' : '.' . $req{$t}{'msecs'}; my $line_start = $html_output_mode ? '' : $h{"Standard"}; return if DEBUG_SUPPRESS_LOG_MESSAGES; print $line_start + . $date_string . $time_colours[$time_colour_index % 2] . $req{$t}{'time-stamp'} . $msec_string @@ -2403,6 +2513,7 @@ sub parse_loop() { 'Fatal error' => \&handle_loglevel_ignore, 'Writing' => \&handle_loglevel_ignore, 'Received' => \&handle_loglevel_ignore, + 'Tagging' => \&handle_loglevel_tagging, 'Actions' => \&handle_loglevel_ignore, 'Unknown log level' => \&handle_loglevel_ignore, ); @@ -2500,6 +2611,7 @@ sub stats_loop() { 'Redirect:' => \&handle_loglevel_ignore, 'Unknown log level:' => \&handle_loglevel_ignore, 'Writing:' => \&handle_loglevel_ignore, + 'Tagging:' => \&handle_loglevel_ignore, ); while (<>) { @@ -2564,6 +2676,7 @@ sub get_cli_options() { our %cli_options = ( 'html-output' => CLI_OPTION_DEFAULT_TO_HTML_OUTPUT, 'title' => CLI_OPTION_TITLE, + 'keep-date' => CLI_OPTION_KEEP_DATE, 'no-syntax-highlighting' => CLI_OPTION_NO_SYNTAX_HIGHLIGHTING, 'no-embedded-css' => CLI_OPTION_NO_EMBEDDED_CSS, 'no-msecs' => CLI_OPTION_NO_MSECS, @@ -2580,6 +2693,7 @@ sub get_cli_options() { GetOptions ( 'html-output' => \$cli_options{'html-output'}, 'title' => \$cli_options{'title'}, + 'keep-date' => \$cli_options{'keep-date'}, 'no-syntax-highlighting' => \$cli_options{'no-syntax-highlighting'}, 'no-embedded-css' => \$cli_options{'no-embedded-css'}, 'no-msecs' => \$cli_options{'no-msecs'}, @@ -2597,6 +2711,7 @@ sub get_cli_options() { $html_output_mode = cli_option_is_set('html-output'); $no_msecs_mode = cli_option_is_set('no-msecs'); + $keep_date_mode = cli_option_is_set('keep-date'); $shorten_thread_ids = cli_option_is_set('shorten-thread-ids'); $line_end = get_line_end(); } @@ -2696,7 +2811,10 @@ omitted, ANSI escape sequences are used unless B<--no-syntax-highlighting> is ac This option is only intended to make embedding log excerpts in web pages easier. It does not escape any input! -[B<--no-msecs>] Don't expect milisecond resolution +[B<--keep-date>] Don't remove the date when printing highlighted log messages. +Useful when parsing multiple log files at once. + +[B<--no-msecs>] Don't expect millisecond resolution [B<--no-syntax-highlighting>] Disable syntax-highlighting. Useful when the filtered output is piped into less in which case the ANSI control