Properly detect section titles with two-digit minor numbers
[privoxy.git] / utils / ldp_print / fix_print_html.lib
1 #
2 # fix_print_html.lib
3 #
4 #   Dan Scott  / <dan.scott (at) acm.org>
5 #   Ferg       / <gferg (at) sgi.com>
6 #
7 #   Used to prepare single-file HTML variant for PDF/Postscript creation
8 #   thru htmldoc.
9 #
10 # log:
11 #     16Oct2000 - 0.1   - initial entry <gferg (at) sgi.com>
12 #     03Apr2001 - 0.2   - fix for <preface>
13 #     05Jul2001 - 0.3   - fix for <tt> and -f
14 #     12Oct2001 - 0.4   - fix for sections; loop thru both files (body/title)
15 #     27Nov2001 - 0.5   - fixed bug in determining where doc-index lies
16 #     18Jan2002 - 0.5.1 - entity fix (822*)
17 #     02Apr2002 - 0.6   - misc fixes (bibliography/appendix, etc).
18 #     04Apr2002 - 0.7   - fix for newer DSSSL
19 #
20
# fix_print_html($in, $out, $ttl)
#
#   Splits the single-file HTML document $in into two files:
#
#     $ttl - the title page: everything up to the CLASS="TOC" line
#     $out - the body: the lines preceding CLASS="TITLEPAGE" followed by
#            everything from the first chapter/preface/section on
#
#   Both parts are passed through fix_html() before being written.
#
#   Returns 1 on success, 0 (after printing a message) if any of the
#   files cannot be opened.  Note that $ttl is only written when a
#   CLASS="TOC" line is seen after the title page.
#
sub fix_print_html {

   my($in,$out,$ttl) = @_;

   # Three-argument open with lexical handles: the file names are taken
   # verbatim (no mode characters or whitespace are parsed out of them)
   # and the handles cannot collide with package-global barewords.
   #
   open(my $in_fh, '<', $in) || do {
        print "fix_print_html: cannot open $in: $!\n";
        return 0;
   };

   my $buf        = '';    # body document
   my $ttl_buf    = '';    # title page document
   my $is_article = 1;     # cleared when CLASS="BOOK" is seen

   # $indx tracks which part of the input we are in:
   #   -1  before the title page      (lines go to both buffers)
   #    0  inside the title page      (lines go to the title buffer)
   #    1  between TOC and first chapter/preface/section (lines dropped)
   #    2  inside the body            (lines go to the body buffer)
   #
   my $indx = -1;

   while( my $line = <$in_fh> ) {

         if( $indx < 0 ) {

             if( $line =~ /CLASS="BOOK"/i ) {
                 $is_article = 0;
             }

             # up to this point, both buffers get the line
             #
             if( $line =~ /CLASS="TITLEPAGE"/ ) {

                 # the title page marker itself only goes to the title
                 # buffer, followed by some vertical spacing
                 #
                 $ttl_buf .= $line . ">\n<P>\n<BR><BR><BR><BR>\n<\/P\n";
                 $indx = 0;

             } else {
                 $buf .= $line;
                 $ttl_buf .= $line;
             }

         } elsif( $indx == 0 ) {

             if( $line !~ /CLASS="TOC"/ ) {
                 $ttl_buf .= $line;
                 next;
             }

             # the table of contents starts here: close the title page
             # document and write it out (the TOC line itself is dropped)
             #
             $ttl_buf .= "></DIV>\n</BODY>\n</HTML>\n"; 
             $ttl_buf =~ s/<\/H1\n/<\/H1\n><P><BR><BR\n/ms;
             $ttl_buf =~ s/<HR><\/DIV\n><HR>/<HR><\/DIV\n>/ms;
             fix_html(\$ttl_buf, 1);

             open(my $ttl_fh, '>', $ttl) || do {
                  print "fix_print_html: cannot open $ttl: $!\n";
                  close($in_fh);
                  return 0;
             };
             print $ttl_fh $ttl_buf;
             close($ttl_fh);
             $ttl_buf = '';
             $indx = 1;

         } elsif( $indx == 1 ) {

             # ignore everything until we see the chapter or sect
             #
             next unless( $line =~ /CLASS="CHAP/i
                          ||
                          $line =~ /CLASS="PREF/i
                          ||
                          $line =~ /CLASS="SECT/i );

             $buf .= $line;
             $indx = 2;

         } else {

             $buf .= $line;
         }
   }
   close($in_fh);


   # fix body file
   #
   open(my $out_fh, '>', $out) || do {
        print "fix_print_html: cannot open $out: $!\n";
        return 0;
   };

   fix_html(\$buf, $is_article);

   print $out_fh $buf;
   close($out_fh);


   return 1;
}
116
117
# fix_html(\$buf, $is_article)
#
#   Rewrites, in place, the HTML document referenced by the first
#   argument so that it is better suited as htmldoc input.
#
#     $buf        - reference to the scalar holding the whole document
#     $is_article - 0 when the document is a book (headers are then
#                   renumbered so only chapter-level titles are H1),
#                   non-zero otherwise
#
#   The patterns rely on the generated markup breaking lines inside
#   tags (e.g. "<P\n>"), so most of them contain literal newlines.
#
#   Returns nothing; the result is left in $$buf.
#
sub fix_html {
   
   my($buf, $is_article) = @_;
   my($indx) = -1;


   # make corrections and write out the file
   #

   # drop the paragraph wrapper around the first paragraph of list items
   #
   $$buf =~ s/(\n><LI\n)><P\n(.*?)<\/P\n>/$1$2\n/gms;
   $$buf =~ s/(\n><LI\n><DIV\nCLASS="FORMALPARA"\n)><P\n(.*?)<\/P\n>/$1$2\n/gms;
   $$buf =~ s/(\n><LI\nSTYLE="[^\"]+"\n)><P\n(.*?)<\/P\n>/$1$2\n/gms;

   # in books, mark section-level H1 titles as "H0" so they can be told
   # apart from the remaining H1 titles in the renumbering below
   #
   if( $is_article == 0 ) {
       $$buf =~ 
         s/(\nCLASS="SECT[TION\d]+"\n>)<H1\n(.*?)<\/H1/$1<H0\n$2<\/H0/gims;
       $$buf =~ 
         s/(\nCLASS="SECT[TION\d]+"\n><HR>)<H1\n(.*?)<\/H1/$1<H0\n$2<\/H0/gims;
   }
   $$buf =~ s/<H1(\nCLASS="INDEXDIV"\n)(.*?)<\/H1/<H2$1$2<\/H2/gims;

   # cut the document off at the (last) doc-index title, if there is one
   #
   if( ($indx = rindex($$buf, "<H1\n><A\nNAME=\"DOC-INDEX\"")) > -1 ) {
       $$buf = substr($$buf, 0, $indx);
       $$buf .= "\n<\/BODY>\n<\/HTML>\n\n";
   } elsif( ($indx = rindex($$buf, "<H1\n><A\nNAME=\"doc-index\"")) > -1 ) {
       $$buf = substr($$buf, 0, $indx);
       $$buf .= "\n<\/BODY>\n<\/HTML>\n\n";
   }

   # replace or drop numeric character references
   #
   $$buf =~ s/\&\#13;//g;
   $$buf =~ s/\&\#60;/\&lt;/g;
   $$buf =~ s/\&\#62;/\&gt;/g;
   $$buf =~ s/\&\#8211;/\-/g;
   $$buf =~ s/\&\#8220;/\"/g;
   $$buf =~ s/\&\#8221;/\"/g;

   # drop attributes and table tags
   # NOTE(review): only single-digit WIDTH values are removed - confirm
   # that this is intended
   #
   $$buf =~ s/WIDTH=\"\d\"//g;
   $$buf =~ s/><[\/]*TBODY//g;
   $$buf =~ s/><[\/]*THEAD//g;
   $$buf =~ s/TYPE=\"1\"\n//gim;

   $$buf =~ s/<P\nCLASS="LITERALLAYOUT"(.*?)<\/P/<P CLASS="LITERALLAYOUT"><FONT FACE=\"courier\"$1<\/FONT><\/P/gms;

   my($cnt, $j) = (0, 0);

   if( $$buf !~ /<H1/ ) {
       
       # for newer docbook styles, set h2 to h1, etc.
       #
       for($cnt=2; $cnt < 7; $cnt++ ) {
           $j = $cnt - 1;
           $$buf =~ s/<H${cnt}/<H${j}/g;
           $$buf =~ s/<\/H${cnt}/<\/H${j}/g;
       }

   } elsif( $is_article == 0 ) {

       # decrement the headers by 1 and then re-set the
       # chapter level only to H1...
       #
       # Working from H5 down keeps an already shifted header from being
       # shifted again; H0 (the marked section titles) ends up as H2.
       #
       for($cnt=5; $cnt >= 0; $cnt--) {
           $j = $cnt + 1;
           if( $cnt == 0 ) {
               $j = 2;
           }
           $$buf =~ s/<H${cnt}/<H${j}/g;
           $$buf =~ s/<\/H${cnt}/<\/H${j}/g;
       }

       # After a line starting a chapter, bibliography, appendix,
       # glossary or preface, turn H2 back into H1 on the following
       # lines until the next line containing "<DIV".
       #
       my $promote = 0;
       my(@lines) = split(/\n/, $$buf);
       foreach my $line (@lines) {

            if( $promote ) {
                if( $line =~ /<DIV/ ) {
                    $promote = 0;
                    next;
                }
                $line =~ s/<H2/<H1/g;
                $line =~ s/<\/H2/<\/H1/g;
            }

            if( $line =~ /^CLASS=\"CHAP/i
                ||
                $line =~ /^NAME=\"BIBL/i
                ||
                $line =~ /^CLASS=\"APPENDIX/i
                ||
                $line =~ /^CLASS=\"GLOSSARY/i
                ||
                $line =~ /^CLASS=\"PREF/i ) {
                $promote = 1;
            }
       }

       $$buf = join("\n", @lines);

   }

   # strip the DIV wrappers
   #
   $$buf =~ s/><DIV\nCLASS="\w+"\n//gms;
   $$buf =~ s/><\/DIV\n//gms;

   # strip SPAN tags; these have to be applied to the document ($$buf),
   # binding them to the reference ($buf) never matches anything
   #
   $$buf =~ s/<SPAN\n[^>]*?>//gms;
   $$buf =~ s/<\/SPAN\n>//gms;

   $$buf =~ s/(><LI\n)><P\n(.*?)<\/P\n>(<\/LI\n)/$1$2$3/gms;

   return;
}
222
223
224 # Return true from package include
225 #
226 1;
227