Commit | Line | Data |
---|---|---|
12b72891 RGS |
1 | package UTF8::Matcher; |
2 | use strict; | |
3 | use warnings; | |
4 | use Text::Wrap qw(wrap); | |
5 | use Encode; | |
6 | use Data::Dumper; | |
7 | ||
8 | # Author: Yves Orton (demerphq) 2007. | |
9 | ||
10 | =pod | |
11 | ||
12 | Dynamically generates macros for detecting special charclasses | |
13 | in both latin-1, utf8, and codepoint forms. | |
14 | ||
15 | To regenerate regcharclass.h, run this script from perl-root. No arguments | |
16 | are necessary. | |
17 | ||
18 | Each charclass handler is constructed as follows: | |
19 | Each string the charclass must match is rendered as unicode (codepoints>255), | |
20 | and if possible as latin1 (codepoints>127), and if possible as "neutral" | |
21 | (all codepoints<128). | |
22 | ||
23 | The rendered strings are then inserted into digit-tries by type and length. | |
24 | With shorter strings being added to tries that are allowed to contain longer | |
25 | strings, but not vice versa. Thus the "longest" trie contains all strings | |
26 | for that charclass. | |
27 | ||
28 | The following types of trie are generated: | |
29 | ||
30 | n - Neutral only. All strings in this type have codepoints<128 | |
31 | l - Latin1 only. All strings in this type have a codepoint>127 in them | |
32 | u - UTF8 only. All strings in this type have a codepoint>255 in them | |
33 | L - Latin1. All strings in 'n' and 'l' | |
34 | U - UTF8. All string in 'n' and 'u' | |
35 | c - Codepoint. All strings in U but in codepoint and not utf8 form. | |
36 | ||
37 | The ternary() routine is responsible for converting the trie data into a | |
38 | ternary conditional that matches the required set of strings. The generated | |
39 | macro normally takes at least the argument 's' which is expected to be a | |
40 | pointer of type C<char *> or C<U8 *>. The condition generated will be | |
41 | optimised to match the string as efficiently as possible, with range lookups | |
42 | being used where possible, and in some situations relying on "true" to be 1. | |
43 | ||
44 | ternary() takes two optional arguments, $type which is one of the above | |
45 | characters and $ext which is used to add an extra extension to the macro name. | |
46 | ||
47 | If $type is omitted or false then the generated macro will take an additional | |
48 | argument, 'is_utf8'. | |
49 | ||
50 | If $ext has the string 'safe' in it then the generated macro will take an extra | |
51 | argument 'e' for the end of the string, and all lookups will be length checked | |
52 | to prevent lookups past e. If 'safe' is not used then the lookup is assumed to | |
53 | be guaranteed safe, and no 'e' argument is provided and no length checks are | |
54 | made during execution. | |
55 | ||
56 | The 'c' type is different as compared to the rest. Instead of producing | |
57 | a condition that does octet comparisons of a string array, the 'c' type | |
58 | produces a macro that takes a single codepoint as an argument (instead of a | |
59 | char* or U8*) and does the lookup based on only that char, thus it cannot be | |
60 | used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass. | |
61 | This is primarily used for populating charclass bitmaps for codepoints 0..255 | |
62 | but will also match codepoints in the unicode range if necessary. | |
63 | ||
64 | Using LNBREAK as an example the following macros will be produced: | |
65 | ||
66 | =over 4 | |
67 | ||
68 | =item is_LNBREAK(s,is_utf8) | |
69 | ||
70 | =item is_LNBREAK_safe(s,e,is_utf8) | |
71 | ||
72 | Do a lookup as apporpriate based on the is_utf8 flag. When possible | |
73 | comparisons involving octect<128 are done before checking the is_utf8 | |
74 | flag, hopefully saving time. | |
75 | ||
76 | =item is_LNBREAK_utf8(s) | |
77 | ||
78 | =item is_LNBREAK_utf8_safe(s,e) | |
79 | ||
80 | Do a lookup assuming the string is encoded in (normalized) UTF8. | |
81 | ||
82 | =item is_LNBREAK_latin1(s) | |
83 | ||
84 | =item is_LNBREAK_latin1_safe(s,e) | |
85 | ||
86 | Do a lookup assuming the string is encoded in latin-1 (aka plan octets). | |
87 | ||
88 | =item is_LNBREAK_cp(cp) | |
89 | ||
90 | Check to see if the string matches a given codepoint (hypotethically a | |
91 | U32). The condition is constructed as as to "break out" as early as | |
92 | possible if the codepoint is out of range of the condition. | |
93 | ||
94 | IOW: | |
95 | ||
96 | (cp==X || (cp>X && (cp==Y || (cp>Y && ...)))) | |
97 | ||
98 | Thus if the character is X+1 only two comparisons will be done. Making | |
99 | matching lookups slower, but non-matching faster. | |
100 | ||
101 | =back | |
102 | ||
103 | =cut | |
104 | ||
105 | # store a list of numbers into a hash based trie. | |
106 | sub _trie_store { | |
107 | my $root= shift; | |
108 | foreach my $b ( @_ ) { | |
109 | $root->{$b} ||= {}; | |
110 | $root= $root->{$b}; | |
111 | } | |
112 | $root->{''}++; | |
113 | } | |
114 | ||
115 | # Convert a string into its neutral, latin1, utf8 forms, where | |
116 | # the form is undefined unless the string can be completely represented | |
117 | # in that form. The string is then decomposed into the octects representing | |
118 | # it. A list is returned for each. Additional a list of codepoints making | |
119 | # up the string. | |
120 | # returns (\@n,\@u,\@l,\@cp) | |
121 | # | |
122 | sub _uni_latin1 { | |
123 | my $str= shift; | |
124 | my $u= eval { Encode::encode( "utf8", "$str", Encode::FB_CROAK ) }; | |
125 | my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) }; | |
126 | my $n= $l; | |
127 | undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/; | |
128 | return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )), | |
129 | [map { ord $_ } split //,$str]); | |
130 | } | |
131 | ||
132 | # store an array ref of char data into the appropriate | |
133 | # type bins, tracking sizes as we go. | |
134 | sub _store { | |
135 | my ( $self, $r, @k )= @_; | |
136 | for my $z ( @k ) { | |
137 | $self->{size}{$z}{ 0 + @$r }++; | |
138 | push @{ $self->{data}{$z} }, $r; | |
139 | } | |
140 | } | |
141 | ||
142 | # construct a new charclass constructor object. | |
143 | # $title ends up in the code a as a comment. | |
144 | # $opcode is the name of the operation the charclass implements. | |
145 | # the rest of the arguments are strings that the charclass | |
146 | # can match. | |
147 | sub new { | |
148 | my $class= shift; | |
149 | my $title= shift; | |
150 | my $opcode= shift; | |
151 | my $self= bless { op => $opcode, title => $title }, $class; | |
152 | my %seen; | |
153 | # convert the strings to the numeric equivelents and store | |
154 | # them for later insertion while tracking their sizes. | |
155 | foreach my $seq ( @_ ) { | |
156 | next if $seen{$seq}++; | |
157 | push @{$self->{seq}},$seq; | |
158 | my ( $n, $u, $l,$cp )= _uni_latin1( $seq ); | |
159 | if ( $n ) { | |
160 | _store( $self, $n, qw(n U L) ); | |
161 | } else { | |
162 | if ( $l ) { | |
163 | _store( $self, $l, qw(l L) ); | |
164 | } | |
165 | _store( $self, $u, qw(u U) ); | |
166 | } | |
167 | _store($self,$cp,'c'); | |
168 | } | |
169 | # | |
170 | # now construct the tries. For each type of data we insert | |
171 | # the data into all the tries of length $size and smaller. | |
172 | # | |
173 | ||
174 | my %allsize; | |
175 | foreach my $k ( keys %{ $self->{data} } ) { | |
176 | my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} }; | |
177 | $self->{size}{$k}=\@size; | |
178 | undef @allsize{@size}; | |
179 | foreach my $d ( @{ $self->{data}{$k} } ) { | |
180 | foreach my $sz ( @size ) { | |
181 | last if $sz < @$d; | |
182 | $self->{trie}{$k}{$sz} ||= {}; | |
183 | _trie_store( $self->{trie}{$k}{$sz}, @$d ); | |
184 | } | |
185 | } | |
186 | #delete $self->{data}{$k}; | |
187 | } | |
188 | my @size= sort { $b <=> $a } keys %allsize; | |
189 | $self->{size}{''}= \@size; | |
190 | return $self; | |
191 | } | |
192 | ||
193 | # | |
194 | # _cond([$v1,$v2,$v2...],$ofs) | |
195 | # | |
196 | # converts an array of codepoints into a conditional expression | |
197 | # consequtive codepoints are merged into a range test | |
198 | # returns a string containing the conditional expression in the form | |
199 | # '( li[x]==v || li[x]==y )' When possible we also use range lookups. | |
200 | ||
201 | sub _cond { | |
202 | my ( $c, $ofs,$fmt )= @_; | |
203 | $fmt||='((U8*)s)[%d]'; | |
204 | # cheapo rangification routine. | |
205 | # Convert the first element into a singleton represented | |
206 | # as [$x,$x] and then merge the rest in as we go. | |
207 | my @v= sort { $a <=> $b } @$c; | |
208 | my @r= ( [ ( shift @v ) x 2 ] ); | |
209 | for my $n ( @v ) { | |
210 | if ( $n == $r[-1][1] + 1 ) { | |
211 | $r[-1][1]++; | |
212 | } else { | |
213 | push @r, [ $n, $n ]; | |
214 | } | |
215 | } | |
216 | @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r; | |
217 | # sort the ranges by size and order. | |
218 | @r= sort { $a->[0] <=> $b->[0] } @r; | |
219 | my $alu= sprintf $fmt,$ofs; # C array look up | |
220 | ||
221 | if ($fmt=~/%d/) { | |
222 | # map the ranges into conditions | |
223 | @r= map { | |
224 | # singleton | |
225 | $_->[0] == $_->[1] ? "$alu==$_->[0]" : | |
226 | # range | |
227 | "($_->[0]<=$alu && $alu<=$_->[1])" | |
228 | } @r; | |
229 | # return the joined results. | |
230 | return '( ' . join( " || ", @r ) . ' )'; | |
231 | } else { | |
232 | return combine($alu,@r); | |
233 | } | |
234 | } | |
235 | ||
236 | # | |
237 | # Do the condition in such a way that we break out early if the value | |
238 | # we are looking at is in between two elements in the list. | |
239 | # Currently used only for codepoint macros (depth 1) | |
240 | # | |
241 | sub combine { | |
242 | my $alu=shift; | |
243 | local $_ = shift; | |
244 | my $txt= $_->[0] == $_->[1] | |
245 | ? "$alu==$_->[0]" | |
246 | : "($_->[0]<=$alu && $alu<=$_->[1])"; | |
247 | return $txt unless @_; | |
248 | return "( $txt || ( $alu > $_->[1] && \n".combine($alu,@_)." ) )"; | |
249 | } | |
250 | ||
251 | # recursively convert a trie to an optree represented by | |
252 | # [condition,yes,no] where yes and no can be a ref to another optree | |
253 | # or a scalar representing code. | |
254 | # called by make_optree | |
255 | ||
256 | sub _trie_to_optree { | |
257 | my ( $node, $ofs, $else, $fmt )= @_; | |
258 | return $else unless $node; | |
259 | $ofs ||= 0; | |
260 | if ( $node->{''} ) { | |
261 | $else= $ofs; | |
262 | } else { | |
263 | $else ||= 0; | |
264 | } | |
265 | my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] } | |
266 | map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] } | |
267 | grep length, keys %$node; | |
268 | ||
269 | return $ofs if !@k; | |
270 | ||
271 | my ( $root, $expr ); | |
272 | while ( @k ) { | |
273 | my @cond= ( $k[0][0] ); | |
274 | my $d= $k[0][1]; | |
275 | my $r= $k[0][2]; | |
276 | shift @k; | |
277 | while ( @k && $k[0][1] eq $d ) { | |
278 | push @cond, $k[0][0]; | |
279 | shift @k; | |
280 | } | |
281 | my $op= | |
282 | [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ]; | |
283 | if ( !$root ) { | |
284 | $root= $expr= $op; | |
285 | } else { | |
286 | push @$expr, $op; | |
287 | $expr= $op; | |
288 | } | |
289 | } | |
290 | push @$expr, $else; | |
291 | return $root; | |
292 | } | |
293 | ||
294 | # construct the optree for a type. | |
295 | # handles the special logic of type ''. | |
296 | sub make_optree { | |
297 | my ( $self, $type, $size, $fmt )= @_; | |
298 | my $else= 0; | |
299 | $size||=$self->{size}{$type}[0]; | |
300 | $size=1 if $type eq 'c'; | |
301 | if ( !$type ) { | |
302 | my ( $u, $l ); | |
303 | for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) { | |
304 | $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt ); | |
305 | } | |
306 | for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) { | |
307 | $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt ); | |
308 | } | |
309 | if ( $u ) { | |
310 | $else= [ '(is_utf8)', $u, $l || 0 ]; | |
311 | } elsif ( $l ) { | |
312 | $else= [ '(!is_utf8)', $l, 0 ]; | |
313 | } | |
314 | $type= 'n'; | |
315 | $size-- while !$self->{trie}{n}{$size}; | |
316 | } | |
317 | return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt ); | |
318 | } | |
319 | ||
320 | # construct the optree for a type with length checks to prevent buffer | |
321 | # overruns. Only one length check is performed per lookup trading code | |
322 | # size for speed. | |
323 | sub length_optree { | |
324 | my ( $self, $type,$fmt )= @_; | |
325 | $type ||= ''; | |
326 | return $self->{len_op}{$type} if $self->{len_op}{$type}; | |
327 | my @size = @{$self->{size}{$type}}; | |
328 | ||
329 | my ( $root, $expr ); | |
330 | foreach my $size ( @size ) { | |
331 | my $op= [ | |
332 | "( (e) - (s) > " . ( $size - 1 ) . " )", | |
333 | $self->make_optree( $type, $size ), | |
334 | ]; | |
335 | if ( !$root ) { | |
336 | $root= $expr= $op; | |
337 | } else { | |
338 | push @$expr, $op; | |
339 | $expr= $op; | |
340 | } | |
341 | } | |
342 | push @$expr, 0; | |
343 | return $self->{len_op}{$type}= $root ? $root : $expr->[0]; | |
344 | } | |
345 | ||
346 | # | |
347 | # recursively walk an optree and covert it to a huge nested ternary expression. | |
348 | # | |
349 | sub _optree_to_ternary { | |
350 | my ( $node )= @_; | |
351 | return $node | |
352 | if !ref $node; | |
353 | my $depth = 0; | |
354 | if ( $node->[0] =~ /\[(\d+)\]/ ) { | |
355 | $depth= $1 + 1; | |
356 | } | |
357 | return sprintf "\n%s( %s ? %s : %s )", " " x $depth, $node->[0], | |
358 | _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] ); | |
359 | } | |
360 | ||
361 | # add \\ to the end of strings in a reasonable neat way. | |
362 | sub _macro($) { | |
363 | my $str= shift; | |
364 | my @lines= split /[^\S\n]*\n/, $str; | |
365 | return join( "\\\n", map { sprintf "%-76s", $_ } @lines ) . "\n\n"; | |
366 | } | |
367 | ||
368 | # default type extensions. 'uln' dont have one because normally | |
369 | # they are used only as part of type '' which doesnt get an extension | |
370 | my %ext= ( | |
371 | U => '_utf8', | |
372 | L => '_latin1', | |
373 | c => '_cp', | |
374 | ||
375 | ); | |
376 | ||
377 | # produce the ternary, handling arguments and putting on the macro headers | |
378 | # and boiler plate | |
379 | sub ternary { | |
380 | my ( $self, $type, $ext )= @_; | |
381 | $type ||= ''; | |
382 | $ext = ($ext{$type} || '') . ($ext||""); | |
383 | my ($root,$fmt,$arg); | |
384 | if ($type eq 'c') { | |
385 | $arg= $fmt= 'cp'; | |
386 | } else { | |
387 | $arg= 's'; | |
388 | } | |
389 | if ( $type eq 'c' || $ext !~ /safe/) { | |
390 | $root= $self->make_optree( $type, 0, $fmt ); | |
391 | } else { | |
392 | $root= $self->length_optree( $type, $fmt ); | |
393 | } | |
394 | ||
395 | our $parens; | |
396 | $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x; | |
397 | my $expr= qr/ | |
398 | \( \s* | |
399 | ($parens) | |
400 | \s* \? \s* | |
401 | \( \s* | |
402 | ($parens) | |
403 | \s* \? \s* | |
404 | (\d+|$parens) | |
405 | \s* : \s* | |
406 | (\d+|$parens) | |
407 | \s* \) | |
408 | \s* : \s* | |
409 | \4 | |
410 | \s* \) | |
411 | /x; | |
412 | my $code= _optree_to_ternary( $root ); | |
413 | for ( $code ) { | |
414 | s/^\s*//; | |
415 | 1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g | |
416 | || s<$expr><(($1 && $2) ? $3 : $4)>g | |
417 | || s<\(\s*($parens)\s*\)><$1>g; | |
418 | } | |
419 | my @args=($arg); | |
420 | push @args,'e' if $ext=~/safe/; | |
421 | push @args,'is_utf8' if !$type; | |
422 | my $args=join ",",@args; | |
423 | return "/*** GENERATED CODE ***/\n" | |
424 | . _macro "#define is_$self->{op}$ext($args)\n$code"; | |
425 | } | |
426 | ||
427 | my $path=shift @ARGV; | |
428 | if (!$path) { | |
429 | $path= "regcharclass.h"; | |
430 | if (!-e $path) { $path="../$path" } | |
431 | if (!-e $path) { die "Can't find regcharclass.h to update!\n" }; | |
432 | } | |
433 | ||
434 | rename $path,"$path.bak"; | |
435 | open my $out_fh,">",$path | |
436 | or die "Can't write to '$path':$!"; | |
437 | binmode $out_fh; # want unix line endings even when run on win32. | |
438 | my ($zero)=$0=~/([^\\\/]+)$/; | |
439 | print $out_fh <<"HEADER"; | |
440 | /*********************** WARNING WARNING WARNING ************************ | |
441 | ||
442 | Do not modify this code directly: This file was autogenerated by | |
443 | ||
444 | Porting/$zero | |
445 | ||
446 | from data contained within the script. Change the script instead. | |
447 | ||
448 | Generated at: @{[ scalar gmtime ]} GMT | |
449 | ||
450 | ************************ WARNING WARNING WARNING ************************/ | |
451 | ||
452 | HEADER | |
453 | ||
454 | my ($op,$title,@strs,@txt); | |
455 | my $doit= sub { | |
456 | return unless $op; | |
457 | my $o= __PACKAGE__->new($title,$op,@strs); | |
458 | print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n"; | |
459 | print $out_fh join "\n",@txt,"*/",""; | |
460 | for ('', 'U', 'L') { | |
461 | print $out_fh $o->ternary( $_ ); | |
462 | print $out_fh $o->ternary( $_,'_safe' ); | |
463 | } | |
464 | print $out_fh $o->ternary( 'c' ); | |
465 | }; | |
466 | while (<DATA>) { | |
467 | next unless /\S/; | |
468 | chomp; | |
469 | if (/^([A-Z]+)/) { | |
470 | $doit->(); | |
471 | ($op,$title)=split /\s*:\s*/,$_,2; | |
472 | @txt=@strs=(); | |
473 | } else { | |
474 | push @txt, "\t$_"; | |
475 | s/#.*$//; | |
476 | if (/^0x/) { | |
477 | push @strs,map { chr $_ } eval $_; | |
478 | } elsif (/^[""'']/) { | |
479 | push @strs,eval $_; | |
480 | } | |
481 | } | |
482 | } | |
483 | $doit->(); | |
484 | print "$path has been updated\n"; | |
485 | ||
486 | __DATA__ | |
487 | LNBREAK: Line Break: \R | |
488 | "\x0D\x0A" # CRLF - Network (Windows) line ending | |
489 | 0x0A # LF | LINE FEED | |
490 | 0x0B # VT | VERTICAL TAB | |
491 | 0x0C # FF | FORM FEED | |
492 | 0x0D # CR | CARRIAGE RETURN | |
493 | 0x85 # NEL | NEXT LINE | |
494 | 0x2028 # LINE SEPARATOR | |
495 | 0x2029 # PARAGRAPH SEPARATOR | |
496 | ||
497 | HORIZWS: Horizontal Whitespace: \h \H | |
498 | 0x09 # HT | |
499 | 0x20 # SPACE | |
500 | 0xa0 # NBSP | |
501 | 0x1680 # OGHAM SPACE MARK | |
502 | 0x180e # MONGOLIAN VOWEL SEPARATOR | |
503 | 0x2000 # EN QUAD | |
504 | 0x2001 # EM QUAD | |
505 | 0x2002 # EN SPACE | |
506 | 0x2003 # EM SPACE | |
507 | 0x2004 # THREE-PER-EM SPACE | |
508 | 0x2005 # FOUR-PER-EM SPACE | |
509 | 0x2006 # SIX-PER-EM SPACE | |
510 | 0x2007 # FIGURE SPACE | |
511 | 0x2008 # PUNCTUATION SPACE | |
512 | 0x2009 # THIN SPACE | |
513 | 0x200A # HAIR SPACE | |
514 | 0x202f # NARROW NO-BREAK SPACE | |
515 | 0x205f # MEDIUM MATHEMATICAL SPACE | |
516 | 0x3000 # IDEOGRAPHIC SPACE | |
517 | ||
518 | VERTWS: Vertical Whitespace: \v \V | |
519 | 0x0A # LF | |
520 | 0x0B # VT | |
521 | 0x0C # FF | |
522 | 0x0D # CR | |
523 | 0x85 # NEL | |
524 | 0x2028 # LINE SEPARATOR | |
525 | 0x2029 # PARAGRAPH SEPARATOR | |
526 |