replace image[] earlier than unmarked lists
[bse.git] / site / cgi-bin / modules / DevHelp / Formatter.pm
CommitLineData
4772671f
TC
1package DevHelp::Formatter;
2use strict;
3use DevHelp::HTML;
00dd8d82 4use Carp 'confess';
4772671f 5
503a05bf 6our $VERSION = "1.011";
cb7fd78d 7
84370ed2
TC
8use constant DEBUG => 0;
9
f93248c3
TC
10# markers to avoid inserting a <p> or </p>
11use constant NO_P => "\x02";
12use constant NO_CP => "\x03";
4f61dffe 13use constant TO_NL => "\x04";
f93248c3
TC
14
15# block tags replaced in some common location
16# other block tags with their own replacement should be moved into this too
17my @block_tags = qw(div address blockquote article section header footer aside nav figure figcaption);
18my $block_tags = join "|", @block_tags;
19
05bb7c45 20my @all_block_tags = ( @block_tags, qw(h1 h2 h3 h4 h5 h6 p li ol ul) );
f93248c3
TC
21
22my $all_block_tags = join "|", @all_block_tags;
23
4772671f
TC
24sub new {
25 my ($class) = @_;
26
27 return bless {}, $class;
28}
29
30sub embed {
31 '';
32}
33
34sub image {
35 my ($self, $imagename, $align) = @_;
36
37 return '';
38}
39
852e69d6
TC
40sub rewrite_url {
41 my ($self, $url, $type) = @_;
42
43 return $url;
44}
45
4772671f
TC
46sub replace {
47}
48
49sub _make_hr {
50 my ($width, $height) = @_;
f93248c3 51 my $tag = "\n\n" . NO_P . "<hr";
4772671f 52 $tag .= qq! width="$width"! if length $width;
00dd8d82 53 $tag .= qq! size="$height"! if length $height;
f93248c3 54 $tag .= " />" . NO_CP . "\n\n";
4772671f
TC
55 return $tag;
56}
57
58# produces a table, possibly with options for the <table> and <tr> tags
59sub _make_table {
60 my ($options, $text) = @_;
61 my $tag = "<table";
62 my $cellend = '';
63 my $cellstart = '';
64 if ($options =~ /=/) {
dc872a32 65 $tag .= " " . unescape_html($options);
4772671f
TC
66 }
67 elsif ($options =~ /\S/) {
68 $options =~ s/\s+$//;
69 my ($width, $bg, $pad, $fontsz, $fontface) = split /\|/, $options;
70 for ($width, $bg, $pad, $fontsz, $fontface) {
71 $_ = '' unless defined;
72 }
73 $tag .= qq! width="$width"! if length $width;
74 $tag .= qq! bgcolor="$bg"! if length $bg;
75 $tag .= qq! cellpadding="$pad"! if length $pad;
76 if (length $fontsz || length $fontface) {
77 $cellstart = qq!<font!;
78 $cellstart .= qq! size="$fontsz"! if length $fontsz;
79 $cellstart .= qq! face="$fontface"! if length $fontface;
80 $cellstart .= qq!>!;
81 $cellend = "</font>";
82 }
83 }
84 $tag .= ">";
85 my @rows = split '\n', $text;
86 my $maxwidth = 0;
87 for my $row (@rows) {
88 my ($opts, @cols) = split /\|/, $row;
89 $tag .= "<tr";
90e768db 90 if (defined $opts && $opts =~ /=/) {
4772671f
TC
91 $tag .= " ".unescape_html($opts);
92 }
93 $tag .= "><td>$cellstart".join("$cellend</td><td>$cellstart", @cols)
94 ."$cellend</td></tr>";
95 }
96 $tag .= "</table>";
97 return $tag;
98}
99
100# make a UL
101sub _format_bullets {
4810ebda 102 my ($self, $text, $extras) = @_;
4772671f
TC
103
104 $text =~ s/^\s+|\s+$//g;
84370ed2 105 my @points = split /(?:\r?\n)? *\*\*\s*/, $text;
4772671f
TC
106 shift @points if @points and $points[0] eq '';
107 return '' unless @points;
108 for my $point (@points) {
8641ef39 109 $point =~ s!\n *$!!
85802bd5 110 and $point = "<p>$point</p>";
4772671f 111 }
4810ebda
TC
112 return "\n\n" . NO_P . $self->_tag_with_attrs("ul", $extras) .
113 "<li>".join("</li><li>", @points)."</li></ul>" . NO_CP . "\n\n";
4772671f
TC
114}
115
116# make a OL
117sub _format_ol {
4810ebda 118 my ($self, $text, $type, $code, $extras) = @_;
84370ed2
TC
119
120 print STDERR "_format_ol(..., $type, $code)\n" if DEBUG;
121 print STDERR "text: ",unpack("H*", $text),"\n" if DEBUG;
4772671f 122 $text =~ s/^\s+|\s+$//g;
00dd8d82 123 $code ||= "##";
84370ed2 124 my @points = split /(?:\r?\n)? *$code\s*/, $text;
4772671f
TC
125 shift @points if @points and $points[0] eq '';
126 return '' unless @points;
127 for my $point (@points) {
8641ef39 128 $point =~ s!\n *$!!
85802bd5 129 and $point = "<p>$point</p>";
4772671f 130 }
4810ebda
TC
131 my $ol = $self->_tag_with_attrs("ol", $extras);
132 $ol =~ s!>$! type="$type">! if $type;
05bb7c45 133 return "\n\n" . NO_P . "$ol<li>".join("</li><li>", @points)."</li></ol>" . NO_CP . "\n\n";
4772671f
TC
134}
135
e3d242f7 136sub _format_lists {
4810ebda 137 my ($self, $text, $extras) = @_;
e3d242f7
TC
138
139 my $out = '';
140
141 while (length $text) {
8641ef39 142 if ($text =~ s(^((?: *\#\#[^\n]+(?:\n(?!\*\*|\#\#|\%\%)[^\n]+)*(?:\n *|$)\n?[^\S\n]*)+)\n?)()) {
4810ebda 143 $out .= $self->_format_ol($1, undef, undef, $extras);
e3d242f7 144 }
8641ef39 145 elsif ($text =~ s(^((?: *\*\*[^\n]+(?:\n(?!\*\*|\#\#|\%\%)[^\n]+)*(?:\n *|$)\n?[^\S\n]*)+)\n?)()) {
4810ebda 146 $out .= $self->_format_bullets($1, $extras);
e3d242f7 147 }
8641ef39 148 elsif ($text =~ s(^((?: *%%[^\n]+(?:\n(?!\*\*|\#\#|\%\%)[^\n]+)*(?:\n *|$)\n?[^\S\n]*)+)\n?)()) {
4810ebda 149 $out .= $self->_format_ol($1, 'a', '%%', $extras);
e3d242f7
TC
150 }
151 else {
152 $out .= $text;
153 $text = '';
154 }
155 }
156
157 return $out;
158}
159
4772671f
TC
160# raw html - this has some limitations
161# the input text has already been escaped, so we need to unescape it
162# too bad if you want [] in your html (but you can use entities)
163sub _make_html {
164 return unescape_html($_[0]);
165}
166
167sub _fix_spanned {
8f84f3f1
TC
168 my ($self, $start, $end, $text, $type) = @_;
169
170 if ($type) {
171 my $class = $self->tag_class($type);
172 if ($class) {
173 $start =~ s/>$/ class="$class">/;
174 }
175 }
4772671f
TC
176
177 $text =~ s!(\n(?:[ \r]*\n)+)!$end$1$start!g;
178
179 "$start$text$end";
180}
181
f93248c3
TC
182sub _blockify {
183 my ($self, $text) = @_;
184
185 my $orig = $text;
186
187 $text =~ s/^\s+//;
188 $text =~ s/\s+\z//;
189 $text =~ s#(\x03?\n\s*\n\x02?)#
190 my $m = $1;
191 my $r = ($m =~ /\x03/ ? "" : "</p>")
192 . ($m =~ /\x02/ ? "" : "<p>");
193 $r #eg;
194
195 $text =~ s!(\n([ \r]*\n)+)!$1 eq "\n" ? "<br />\n" : "</p>\n<p>"!eg;
196 $text =~ s#\A(?!\x02)#<p>#;
197 $text =~ s#(?<!\x03)\z#</p>#;
198
199print STDERR "blockify ", unpack("H*", $orig), " => ", unpack("H*", $text), "\n" if DEBUG;
200
201 return $text;
202}
203
5af99440 204sub link {
13f9cc67 205 my ($self, $url, $text, $type, $extras) = @_;
5af99440 206
f93248c3
TC
207 $extras ||= "";
208
13f9cc67 209 qq/<a href="/ . $self->rewrite_url($url, $text, $type) . qq("$extras>$text</a>)
86ce84b2
AO
210}
211
4f61dffe
TC
212sub _inline_html {
213 my ($self, $text) = @_;
214
215 $text =~ s/\n/ TO_NL /ge;
216
217 return unescape_html($text);
218}
219
4772671f
TC
220sub replace_char {
221 my ($self, $rpart) = @_;
c5d6a4ea 222 $$rpart =~ s#(acronym|abbr|dfn|cite)\[(?:\r?\n)?([^|\]\[]+)\|([^\]\[]+)\|([^\]\[]+?)(?:\r?\n)?\]#
8f84f3f1 223 $self->_fix_spanned(qq/<$1 class="$3" title="$2">/, "</$1>", $4)#egi
3fa9f5b4 224 and return 1;
c5d6a4ea 225 $$rpart =~ s#(acronym|abbr|dfn|cite)\[(?:\r?\n)?([^|\]\[]+)\|([^\]\[]+?)(?:\r?\n)?\]#
8f84f3f1 226 $self->_fix_spanned(qq/<$1 title="$2">/, "</$1>", $3)#egi
3fa9f5b4 227 and return 1;
c5d6a4ea 228 $$rpart =~ s#(acronym|abbr|dfn|cite)\[(?:\r?\n)?\|([^\]\[]+?)(?:\r?\n)?\]#
8f84f3f1 229 $self->_fix_spanned("<$1>", "</$1>", $2)#egi
4772671f 230 and return 1;
c5d6a4ea 231 $$rpart =~ s#(acronym|abbr|dfn|cite)\[(?:\r?\n)?([^\]\[]+?)(?:\r?\n)?\]#
8f84f3f1 232 $self->_fix_spanned("<$1>", "</$1>", $2)#egi
85802bd5 233 and return 1;
3c246d6a 234 $$rpart =~ s#bdo\[(?:\r?\n)?([^|\]\[]+)\|([^\]\[]+?)(?:\r?\n)?\]#
8f84f3f1 235 $self->_fix_spanned(qq/<bdo dir="$1">/, "</bdo>", $2)#egi
85802bd5 236 and return 1;
6780dd33 237 $$rpart =~ s#(strong|em|samp|code|var|sub|sup|kbd|q|b|i|tt|span|small|large|mark)\[(?:\r?\n)?([^|\]\[]+)\|([^\]\[]+?)(?:\r?\n)?\]#
8f84f3f1 238 $self->_fix_spanned(qq/<$1 class="$2">/, "</$1>", $3)#egi
3fa9f5b4 239 and return 1;
6780dd33 240 $$rpart =~ s#(strong|em|samp|code|var|sub|sup|kbd|q|b|i|tt|span|small|large|mark)\[(?:\r?\n)?\|([^\]\[]+?)(?:\r?\n)?\]#
8f84f3f1 241 $self->_fix_spanned("<$1>", "</$1>", $2)#egi
4772671f 242 and return 1;
6780dd33 243 $$rpart =~ s#(strong|em|samp|code|var|sub|sup|kbd|q|b|i|tt|span|small|large|mark)\[(?:\r?\n)?([^\]\[]+?)(?:\r?\n)?\]#
8f84f3f1 244 $self->_fix_spanned("<$1>", "</$1>", $2)#egi
4772671f 245 and return 1;
4bf84d3a
TC
246 $$rpart =~ s#poplink\[([^|\]\[]+)\|([^\]\[]*\n\s*\n[^\]\[]*)\]#
247 "\n\n" . NO_P . $self->link($1, $self->_blockify($2), "poplink", qq/ target="_blank"/) . NO_CP . "\n\n" #eig
248 and return 1;
85802bd5 249 $$rpart =~ s#poplink\[([^|\]\[]+)\|([^\]\[]+)\]#
13f9cc67 250 $self->link($1, $2, "poplink", qq/ target="_blank"/)#eig
85802bd5 251 and return 1;
8f84f3f1 252 $$rpart =~ s#poplink\[([^|\]\[]+)\]#
13f9cc67 253 $self->link($1, $2, "poplink", qq/ target="_blank"/)#eig
85802bd5 254 and return 1;
bf85ffc0
AO
255 $$rpart =~ s#link\[([^|\]\[]+)\|([^\]\[]*\n\s*\n[^\]\[]*)\]#
256 "\n\n" . NO_P . $self->link($1, $self->_blockify($2), "link") . NO_CP . "\n\n" #eig
f93248c3 257 and return 1;
85802bd5 258 $$rpart =~ s#link\[([^|\]\[]+)\|([^\]\[]+)\]#
13f9cc67 259 $self->link($1, $2, "link")#eig
85802bd5 260 and return 1;
8f84f3f1 261 $$rpart =~ s#link\[([^|\]\[]+)\]#
13f9cc67 262 $self->link($1, $1, "link")#ieg
4772671f
TC
263 and return 1;
264 $$rpart =~ s#font\[([^|\]\[]+)\|([^\]\[]+)\]#
8f84f3f1 265 $self->_fix_spanned(qq/<font size="$1">/, "</font>", $2)#egi
85802bd5 266 and return 1;
4772671f
TC
267 $$rpart =~ s#anchor\[([^|\]\[]*)\]#<a name="$1"></a>#ig
268 and return 1;
269 $$rpart =~ s#fontcolor\[([^|\]\[]+)\|([^\]\[]+)\|([^\]\[]+)\]#
8f84f3f1 270 $self->_fix_spanned(qq/<font size="$1" color="$2">/, "</font>", $3)#egi
85802bd5 271 and return 1;
4f61dffe
TC
272 $$rpart =~ s#(\n\r?|\A)html\[([^\]\[]*)\](\n\r?|\z)#
273 $1 . NO_P . $self->_inline_html($2) . NO_CP . $3 #eg
274 and return 1;
275 $$rpart =~ s#\s?\bhtml\[([^\]\[]*)\]# $self->_inline_html($1) #eg
276 and return 1;
99b7cef0
TC
277 $$rpart =~ s!(?<=\W)\[([^\]\[]+)\]!&#91;$1&#93;!g
278 and return 1;
4772671f
TC
279
280 return 0;
281}
282
ebe62791
TC
283sub _tag_with_attrs {
284 my ($self, $tag, $extra) = @_;
285
286 my $out = "<$tag";
287 my @classes;
288 while ($extra) {
05c025ef 289 if ($extra =~ s/^\#([\w-]+)(?:\s+|$)//) {
ebe62791
TC
290 $out .= qq! id="$1"!;
291 }
05c025ef 292 elsif ($extra =~ s/^([a-z][\w-]*)(?:\s+|$)//i) {
ebe62791
TC
293 push @classes, $1;
294 }
295 elsif ($extra =~ s/^((?:[a-z][\w-]*: .*?;\s*)+)//) {
296 $out .= qq! style="$1"!;
297 }
298 else {
299 print STDERR "** don't understand $extra from $tag **\n";
300 last;
301 }
302 }
303 if (@classes) {
304 $out .= qq! class="@classes"!;
305 }
306 $out .= '>';
307
308 return $out;
309}
310
f93248c3
TC
311sub _block {
312 my ($self, $tag, $text, $end) = @_;
313
314 if ($text =~ /\A\n|\n\s*\n/) {
315 $text = $self->_blockify($text);
316 }
317
318 return "\n\n" . NO_P . $tag . $text . $end . NO_CP . "\n\n";
319}
320
ebe62791
TC
321sub _blocktag {
322 my ($self, $tag, $attrs, $text) = @_;
323
f93248c3
TC
324 return $self->_block
325 ($self->_tag_with_attrs(lc $tag, $attrs), $text, "</\L$tag>")
326}
327
328sub _head_tag {
329 my ($self, $tag, $attrs, $text) = @_;
330
331 my $start = "\n" . NO_P . $self->_tag_with_attrs(lc $tag, $attrs);
332 my $end = "</\L$tag>" . NO_CP . "\n";
333 return "\n" . $self->_fix_spanned($start, $end, $text) . "\n";
ebe62791
TC
334}
335
4772671f
TC
336sub format {
337 my ($self, $body) = @_;
338
84370ed2
TC
339 print STDERR "format(...)\nbody: ",unpack("H*", $body),"\n" if DEBUG;
340
341 if ($body =~ /\n/) {
342 $body =~ tr/\r//d;
343 }
344
62533efa 345 $body = $self->escape($body);
4772671f 346 my $out = '';
49e46a89
AO
347 for my $part (split /((?:raw\[(?:[^\[\]]*(?:(?:\[[^\[\]]*\])[^\[\]]*)*)\])
348 |embed\[(?:[^,\[\]]*)(?:,(?:[^,\[\]]*)){0,2}\]
4772671f
TC
349 |pre\[(?:[^\[\]]*(?:(?:\[[^\[\]]*\])[^\[\]]*)*)\])/ix, $body) {
350 #print STDERR "Part is $part\n";
49e46a89 351 if ($part =~ /^raw\[([^\[\]]*(?:(?:\[[^\[\]]*\])[^\[\]]*)*)\]$/i) {
4772671f
TC
352 $out .= _make_html($1);
353 }
354 elsif ($part =~ /^embed\[([^,\[\]]*),([^,\[\]]*),([^,\[\]]*)\]$/i) {
355 $out .= $self->embed($1, $2, $3);
356 }
357 elsif ($part =~ /^embed\[([^,\[\]]*),([^,\[\]]*)\]$/i) {
358 $out .= $self->embed($1, $2);
359 }
360 elsif ($part =~ /^embed\[([^,\[\]]*)\]$/i) {
361 $out .= $self->embed($1)
362 }
363 elsif ($part =~ /^pre\[([^\[\]]*(?:(?:\[[^\[\]]*\])[^\[\]]*)*)\]$/i) {
364 my $work = $1;
365 1 while $self->replace_char(\$work);
d794b180 366 $out .= "<pre>$work</pre>";
4772671f
TC
367 }
368 else {
85802bd5 369 next unless $part =~ /\S/;
4772671f
TC
370 TRY: while (1) {
371 $self->replace(\$part)
372 and next TRY;
373 $self->replace_char(\$part)
374 and next TRY;
503a05bf
TC
375 $part =~ s#image\[([^\]\[]+)\]# $self->image($1) #ige
376 and next TRY;
377 $part =~ s#class\[([^\]\[\|]+)\|([^\]\[]+)\]#
378 $self->_fix_spanned(qq/<span class="$1">/, "</span>", $2)#eig
379 and next TRY;
380 $part =~ s#style\[([^\]\[\|]+)\|([^\]\[]+)\]#
381 $self->_fix_spanned(qq/<span style="$1">/, "</span>", $2)#eig
382 and next TRY;
4772671f
TC
383 $part =~ s#pre\[([^\]\[]+)\]#<pre>$1</pre>#ig
384 and next TRY;
85802bd5 385 $part =~ s#h([1-6])\[([^\[\]\|]+)\|([^\[\]]+)\](?:\r?\n)?#
f93248c3 386 $self->_head_tag("h$1", $2, $3)#ieg
85802bd5
TC
387 and next TRY;
388 $part =~ s#\n*h([1-6])\[\|([^\[\]]+)\]\n*#
f93248c3 389 $self->_head_tag("h$1", '', $2)#ieg
85802bd5
TC
390 and next TRY;
391 $part =~ s#\n*h([1-6])\[([^\[\]]+)\]\n*#
f93248c3 392 $self->_head_tag("h$1", '', $2)#ieg
85802bd5 393 and next TRY;
84c7da90 394 $part =~ s#align\[([^|\]\[]+)\|([^\]\[]+)\]#\n\n<div align="$1">$2</div>\n\n#ig
4772671f
TC
395 and next TRY;
396 $part =~ s#hr\[([^|\]\[]*)\|([^\]\[]*)\]#_make_hr($1, $2)#ieg
397 and next TRY;
398 $part =~ s#hr\[([^|\]\[]*)\]#_make_hr($1, '')#ieg
399 and next TRY;
400 $part =~ s#table\[([^\n\[\]]*)\n([^\[\]]+)\n\s*\]#_make_table($1, $2)#ieg
401 and next TRY;
402 $part =~ s#table\[([^\]\[]+)\|([^\]\[|]+)\]#_make_table($1, "|$2")#ieg
403 and next TRY;
4810ebda
TC
404 $part =~ s# ?\blist\[([^\]\[\|]*)\|\s*(\S[^\]\[]+)\]#
405 "\n\n" . $self->_format_lists($2, $1) #eg
406 and next TRY;
85802bd5 407 #print STDERR "step: ",unpack("H*", $part),"\n$part\n";
503a05bf
TC
408 $part =~ s#indent\[([^\]\[]+)\]#\n\n\x02<div class="indent">$1</div>\x03\n\n#ig
409 and next TRY;
410 $part =~ s#center\[([^\]\[]+)\]#<center>$1</center>#ig
411 and next TRY;
412 $part =~ s#hrcolor\[([^|\]\[]+)\|([^\]\[]+)\|([^\]\[]+)\]#<table width="$1" height="$2" border="0" bgcolor="$3" cellpadding="0" cellspacing="0"><tr><td><img src="/images/trans_pixel.gif" width="1" height="1" alt="" /></td></tr></table>#ig
413 and next TRY;
f21c3494 414 $part =~ s((?:^|\n+|\G)
84c7da90
TC
415 ( # capture
416 (?: # an item
417 \ * # maybe some spaces
418 (?:\*\*|\#\#|\%\%) # marker
9b60b691 419 [^\n]+(?:\n(?!\*\*|\#\#|\%\%)[^\[\]\n]+)* # some non-newline text
84c7da90
TC
420 (?:\n|$)\n? # with one or two line endings
421 [^\S\n]* # and any extra non-newline whitespace
422 )
423 + # one or more times
4810ebda 424 )(\n|$)?)("\n\n".$self->_format_lists($1)."\n\n")egx
00dd8d82 425 and next TRY;
f93248c3 426 $part =~ s#($block_tags)\[([^\[\]\|]+)\|([^\[\]]+?)\]# $self->_block($self->_tag_with_attrs($1, $2), $3, "</$1>")#eig
85802bd5 427 and next TRY;
b99d104e
TC
428 $part =~ s#comment\[[^\[\]]*\]##ig
429 and next TRY;
f93248c3 430 $part =~ s#($block_tags)\[\|([^\[\]]+?)\]# $self->_block("<$1>", $2, "</$1>") #ieg
85802bd5 431 and next TRY;
f93248c3 432 $part =~ s#($block_tags)\[([^\[\]]+?)\]# $self->_block("<$1>", $2, "</$1>") #ieg
85802bd5 433 and next TRY;
4772671f
TC
434 last;
435 }
85802bd5 436 $part =~ s/^\s+|\s+\z//g; # avoid spurious leading/trailing <p>
f93248c3
TC
437 $part = $self->_blockify($part);
438 $part =~ s#\n+#<br />\n#g;
439 $part =~ s#[\x02\x03]##g;
dfa13bbf
AO
440 1 while $part =~ s/<p>(<div(?: [^>]*)?>)/$1\n<p>/g;
441 1 while $part =~ s!</div></p>!</p>\n</div>!g;
a36ac9fd 442 1 while $part =~ s!\s+?</p>!</p>!g;
f93248c3
TC
443 1 while $part =~ s#</($all_block_tags)><#</$1>\n<#g;
444 1 while $part =~ s#(<(?:$all_block_tags)[^>]*>)(<(?:$all_block_tags)\b)#$1\n$2#g;
5974ecdf
TC
445 1 while $part =~ s#(</a>)(<a\s+[^>]*>)(<(?:$all_block_tags))#$1\n$2\n$3#g;
446 1 while $part =~ s#(<a\s+[^>]*>)(<(?:$all_block_tags))#$1\n$2#g;
729a4264 447 1 while $part =~ s#(</a>)(<(?:$all_block_tags)\b)#$1\n$2#g;
7d05b12a
TC
448 1 while $part =~ s#(>)(<hr\b[^>]*/>)#$1\n$2#g;
449 1 while $part =~ s#(<hr\b[^>]*/>)(<)#$1\n$2#g;
f93248c3 450 #$part =~ s!<p>(<hr[^>]*>)</p>!$1!g;
85802bd5
TC
451 $part =~ s!<p>(<(?:table|ol|ul|center|h[1-6])[^>]*>)!$1!g;
452 $part =~ s!(</(?:table|ol|ul|center|h[1-6])>)</p>!$1!g;
453 # attempts to convert class[name|paragraph] into <p class="name">...
454 # tried to use a negative lookahead but it wouldn't work
a13fe4d8
TC
455 $part =~ s#<(p\b[^>]*)><span\ class="([^"<>]+)">(.*?)</span></p>
456 #<$1 class="$2">$3</p>#xg;
457 $part =~ s#<(p\b[^>]*)><span\ style="([^"<>]+)">(.*?)</span></p>
458 #<$1 style="$2">$3</p>#xg;
8f84f3f1
TC
459 if (my $p_class = $self->tag_class('p')) {
460 $part =~ s!(<p(?: style="[^"<>]+")?)>!$1 class="$p_class">!g;
461 }
4772671f 462 #$part =~ s!\n!<br />!g;
f93248c3 463 1 while $part =~ s#(</(?:$all_block_tags)>)(</(?:$all_block_tags))#$1\n$2#g;
4f61dffe
TC
464 my $to_nl = TO_NL;
465 $part =~ s/$to_nl/\n/g;
4772671f
TC
466 $out .= $part;
467 }
468 }
469
470 return $out;
471}
472
473sub remove_format {
474 my ($self, $body) = @_;
475
00dd8d82
TC
476 defined $body
477 or confess "undef body supplied to remove_format";
478
4772671f
TC
479 if ($body =~ /^<html>/i) {
480 return _strip_html(substr($body, 6));
481 }
482
483 my $out = '';
484 for my $part (split /((?:html\[(?:[^\[\]]*(?:(?:\[[^\[\]]*\])[^\[\]]*)*)\])
e3d242f7
TC
485 |embed\[(?:[^,\[\]]*)(?:,(?:[^,\[\]]*)){0,2}\]
486 |pre\[(?:[^\[\]]*(?:(?:\[[^\[\]]*\])[^\[\]]*)*)\])/ix, $body) {
4772671f
TC
487 #print STDERR "Part is $part\n";
488 if ($part =~ /^html\[([^\[\]]*(?:(?:\[[^\[\]]*\])[^\[\]]*)*)\]$/i) {
489 $out .= _strip_html($1);
490 }
491 elsif ($part =~ /^embed\[([^,\[\]]*),([^,\[\]]*)\]$/i) {
492 $out .= ""; # what would you do here?
493 }
494 elsif ($part =~ /^embed\[([^,\[\]]*)\]$/i) {
495 $out .= "";
496 }
e3d242f7
TC
497 elsif ($part =~ /^pre\[([^\[\]]*(?:(?:\[[^\[\]]*\])[^\[\]]*)*)\]$/i) {
498 my $work = $1;
499 $out .= $self->remove_format($work);
500 }
4772671f
TC
501 else {
502 TRY: while (1) {
503 $self->remove(\$part)
504 and next TRY;
c5d6a4ea 505 $part =~ s#(?:acronym|abbr|dfn|cite)\[([^|\]\[]+)\|([^\]\[]+)\|([^\]\[]*)\]#$3#ig
85802bd5 506 and next TRY;
c5d6a4ea 507 $part =~ s#(?:acronym|abbr|dfn|cite|bdo)\[([^|\]\[]+)\|([^\]\[]*)\]#$2#ig
85802bd5 508 and next TRY;
c5d6a4ea 509 $part =~ s#(?:acronym|abbr|dfn|cite|bdo)\[\|([^|\]\[]*)\]#$1#ig
85802bd5 510 and next TRY;
c5d6a4ea 511 $part =~ s#(?:acronym|abbr|dfn|cite)\[([^|\]\[]*)\]#$1#ig
85802bd5 512 and next TRY;
6780dd33
AO
513 $part =~ s#(?:strong|em|samp|code|var|sub|sup|kbd|q|b|i|tt|span|small|large|mark)\[([^|\]\[]+)\|([^\]\[]*)\]#$2#ig
514 and next TRY;
515 $part =~ s#(?:strong|em|samp|code|var|sub|sup|kbd|q|b|i|tt|span|small|large|mark)\[\|([^\]\[]*)\]#$1#ig
516 and next TRY;
517 $part =~ s#(?:strong|em|samp|code|var|sub|sup|kbd|q|b|i|tt|span|small|large|mark)\[([^\]\[]*)\]#$1#ig
85802bd5 518 and next TRY;
6780dd33 519 $part =~ s#(?:div|address|blockquote|article|section|header|footer|aside|nav|figure|figcaption)\[([^\[\]\|]*)\|([^\[\]]*)\](?:\r?\n)?#$2#ig
85802bd5 520 and next TRY;
6780dd33 521 $part =~ s#(?:div|address|blockquote|article|section|header|footer|aside|nav|figure|figcaption)\[\|([^\[\]]*)\](?:\r?\n)?#$1#ig
85802bd5 522 and next TRY;
6780dd33 523 $part =~ s#(?:div|address|blockquote|article|section|header|footer|aside|nav|figure|figcaption)\[([^\[\]]*)\](?:\r?\n)?#$1#ig
85802bd5 524 and next TRY;
69750116 525 $part =~ s#comment\[([^\[\]]*)\](?:\r?\n)?##ig
b99d104e 526 and next TRY;
a2dd030f 527 $part =~ s#h([1-6])\[([^\[\]\|]*)\|([^\[\]]*)\](?:\r?\n)?#$3#ig
85802bd5 528 and next TRY;
69750116 529 $part =~ s#h([1-6])\[\|([^\[\]]*)\](?:\r?\n)?#$2#ig
85802bd5 530 and next TRY;
69750116 531 $part =~ s#h([1-6])\[([^\[\]]*)\](?:\r?\n)?#$2#ig
85802bd5 532 and next TRY;
a2dd030f 533 $part =~ s#poplink\[([^|\]\[]*)\|([^\]\[]*)\]#$2#ig
3fa9f5b4 534 and next TRY;
69750116 535 $part =~ s#poplink\[([^|\]\[]*)\]#$1#ig
3fa9f5b4 536 and next TRY;
a2dd030f 537 $part =~ s#link\[([^|\]\[]*)\|([^\]\[]*)\]#$2#ig
4772671f 538 and next TRY;
69750116 539 $part =~ s#link\[([^|\]\[]*)\]#$1#ig
3fa9f5b4 540 and next TRY;
69750116 541 $part =~ s#align\[([^|\]\[]+)\|([^\]\[]*)\]#$2#ig
4772671f 542 and next TRY;
69750116 543 $part =~ s#font\[([^|\]\[]+)\|([^\]\[]*)\]#$2#ig
4772671f
TC
544 and next TRY;
545 $part =~ s#hr\[([^|\]\[]*)\|([^\]\[]*)\]##ig
546 and next TRY;
547 $part =~ s#hr\[([^|\]\[]*)\]##ig
548 and next TRY;
549 $part =~ s#anchor\[([^|\]\[]*)\]##ig
550 and next TRY;
551 $part =~ s#table\[([^\n\[\]]*)\n([^\[\]]+)\n\s*\]#_cleanup_table($1, $2)#ieg
552 and next TRY;
553 $part =~ s#table\[([^\]\[]+)\|([^\]\[|]+)\]#_cleanup_table($1, "|$2")#ieg
554 and next TRY;
2b8496cb
TC
555 $part =~ s# ?\blist\[([^\]\[\|]*)\|\s*(\S[^\]\[]+)\]#$2#g
556 and next TRY;
4772671f
TC
557 $part =~ s#\*\*([^\n]+)#$1#g
558 and next TRY;
559 $part =~ s!##([^\n]+)!$1!g
560 and next TRY;
00dd8d82
TC
561 $part =~ s!%%([^\n]+)!$1!g
562 and next TRY;
4772671f
TC
563 $part =~ s#fontcolor\[([^|\]\[]+)\|([^\]\[]+)\|([^\]\[]+)\]#$3#ig
564 and next TRY;
565 $part =~ s#(?:indent|center)\[([^\]\[]*)\]#$1#ig
566 and next TRY;
567 $part =~ s#hrcolor\[([^|\]\[]+)\|([^\]\[]+)\|([^\]\[]+)\]##ig
568 and next TRY;
69750116 569 $part =~ s#image\[([^\]\[]+)\] *##ig
4772671f 570 and next TRY;
69750116 571 $part =~ s#class\[([^\]\[\|]+)\|([^\]\[]*)\]#$2#ig
85802bd5 572 and next TRY;
69750116 573 $part =~ s#style\[([^\]\[\|]+)\|([^\]\[]*)\]#$2#ig
85802bd5 574 and next TRY;
37dd20ad
TC
575 $part =~ s!(?<=\W)\[([^\]\[]+)\]!\x01$1\x02!g
576 and next TRY;
4772671f
TC
577
578 last TRY;
579 }
37dd20ad 580 $part =~ tr/\x01\x02/[]/; # put back the bare []
4772671f
TC
581 $out .= $part;
582 }
583 }
584
585 return $out;
586}
587
588sub remove {
589 0;
590}
591
592# removes any html tags from the supplied text
593sub _strip_html {
594 my ($text) = @_;
595
596 my $out = '';
597 require HTML::Parser;
598
599 # this may need to detect and skip <script></script> and stylesheets
600 my $ignore_text = 0; # non-zero in a <script></script> or <style></style>
601 my $start_h =
602 sub {
603 ++$ignore_text if $_[0] eq 'script' or $_[0] eq 'style';
604 if ($_[0] eq 'img' && $_[1]{alt} && !$ignore_text) {
605 $out .= $_[1]{alt};
606 }
607 };
608 my $end_h =
609 sub {
610 --$ignore_text if $_[0] eq 'script' or $_[0] eq 'style';
611 };
612 my $text_h =
613 sub {
614 $out .= $_[0] unless $ignore_text
615 };
616 my $p = HTML::Parser->new( text_h => [ $text_h, "dtext" ],
617 start_h => [ $start_h, "tagname, attr" ],
618 end_h => [ $end_h, "tagname" ]);
619 $p->parse($text);
620 $p->eof();
621
622 $text = $out;
623
624 return $text;
625}
626
627# this takes the same inputs as _make_table(), but eliminates any
628# markup instead
629sub _cleanup_table {
630 my ($opts, $data) = @_;
631 my @lines = split /\n/, $data;
632 for (@lines) {
633 s/^[^|]*\|//;
634 tr/|/ /s;
635 }
636 return join(' ', @lines);
637}
638
f2bf0d11
TC
639my %ms_entities =
640 (
641 34 => 'quot',
642 60 => 'lt',
643 62 => 'gt',
644 38 => 'amp',
645 128 => '#x20ac',
646 130 => '#x201a',
647 131 => '#x192',
648 132 => '#x201e',
649 133 => '#x2026',
650 134 => '#x2020',
651 135 => '#x2021',
652 136 => '#x2c6',
653 137 => '#x2030',
654 138 => '#x160',
655 139 => '#x2039',
656 140 => '#x152',
657 142 => '#x17D',
658 145 => 'lsquo',
659 146 => 'rsquo',
660 147 => 'ldquo',
661 148 => 'rdquo',
662 149 => '#x2022',
663 150 => 'ndash',
664 151 => 'mdash',
665 152 => '#x2dc',
666 153 => 'trade',
667 154 => '#x161',
668 155 => '#x203a',
669 156 => '#x153',
670 158 => '#x17e',
671 159 => '#x178',
672 );
673
62533efa
TC
674sub escape {
675 my ($self, $html) = @_;
676
677 if ($self->{conservative_escape}) {
678 return escape_html($html, '<>&"');
679 }
f2bf0d11
TC
680 elsif ($self->{msentify}) {
681 $html =~ s{([<>&\"\x80-\x9F])}
682 { $ms_entities{ord $1} ? "&$ms_entities{ord $1};"
683 : "** unknown code ".ord($1). " **"; }ge;
684
685 return $html;
686 }
62533efa
TC
687 else {
688 return escape_html($html);
689 }
690}
691
8f84f3f1
TC
692# for subclasses to override
693sub tag_class {
694 return;
695}
696
4772671f 6971;