| 1 | package UTF8::Matcher; |
| 2 | use strict; |
| 3 | use warnings; |
| 4 | use Text::Wrap qw(wrap); |
| 5 | use Encode; |
| 6 | use Data::Dumper; |
| 7 | |
| 8 | # Author: Yves Orton (demerphq) 2007. |
| 9 | |
| 10 | =pod |
| 11 | |
| 12 | Dynamically generates macros for detecting special charclasses |
| 13 | in both latin-1, utf8, and codepoint forms. |
| 14 | |
| 15 | To regenerate regcharclass.h, run this script from perl-root. No arguments |
| 16 | are necessary. |
| 17 | |
| 18 | Each charclass handler is constructed as follows: |
| 19 | Each string the charclass must match is rendered as unicode (codepoints>255), |
| 20 | and if possible as latin1 (codepoints>127), and if possible as "neutral" |
| 21 | (all codepoints<128). |
| 22 | |
| 23 | The rendered strings are then inserted into digit-tries by type and length. |
| 24 | With shorter strings being added to tries that are allowed to contain longer |
| 25 | strings, but not vice versa. Thus the "longest" trie contains all strings |
| 26 | for that charclass. |
| 27 | |
| 28 | The following types of trie are generated: |
| 29 | |
| 30 | n - Neutral only. All strings in this type have codepoints<128 |
| 31 | l - Latin1 only. All strings in this type have a codepoint>127 in them |
| 32 | u - UTF8 only. All strings in this type have a codepoint>255 in them |
| 33 | L - Latin1. All strings in 'n' and 'l' |
| 34 | U - UTF8. All string in 'n' and 'u' |
| 35 | c - Codepoint. All strings in U but in codepoint and not utf8 form. |
| 36 | |
| 37 | The ternary() routine is responsible for converting the trie data into a |
| 38 | ternary conditional that matches the required set of strings. The generated |
| 39 | macro normally takes at least the argument 's' which is expected to be a |
| 40 | pointer of type C<char *> or C<U8 *>. The condition generated will be |
| 41 | optimised to match the string as efficiently as possible, with range lookups |
| 42 | being used where possible, and in some situations relying on "true" to be 1. |
| 43 | |
| 44 | ternary() takes two optional arguments, $type which is one of the above |
| 45 | characters and $ext which is used to add an extra extension to the macro name. |
| 46 | |
| 47 | If $type is omitted or false then the generated macro will take an additional |
| 48 | argument, 'is_utf8'. |
| 49 | |
| 50 | If $ext has the string 'safe' in it then the generated macro will take an extra |
| 51 | argument 'e' for the end of the string, and all lookups will be length checked |
| 52 | to prevent lookups past e. If 'safe' is not used then the lookup is assumed to |
| 53 | be guaranteed safe, and no 'e' argument is provided and no length checks are |
| 54 | made during execution. |
| 55 | |
| 56 | The 'c' type is different as compared to the rest. Instead of producing |
| 57 | a condition that does octet comparisons of a string array, the 'c' type |
| 58 | produces a macro that takes a single codepoint as an argument (instead of a |
| 59 | char* or U8*) and does the lookup based on only that char, thus it cannot be |
| 60 | used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass. |
| 61 | This is primarily used for populating charclass bitmaps for codepoints 0..255 |
| 62 | but will also match codepoints in the unicode range if necessary. |
| 63 | |
| 64 | Using LNBREAK as an example the following macros will be produced: |
| 65 | |
| 66 | =over 4 |
| 67 | |
| 68 | =item is_LNBREAK(s,is_utf8) |
| 69 | |
| 70 | =item is_LNBREAK_safe(s,e,is_utf8) |
| 71 | |
| 72 | Do a lookup as apporpriate based on the is_utf8 flag. When possible |
| 73 | comparisons involving octect<128 are done before checking the is_utf8 |
| 74 | flag, hopefully saving time. |
| 75 | |
| 76 | =item is_LNBREAK_utf8(s) |
| 77 | |
| 78 | =item is_LNBREAK_utf8_safe(s,e) |
| 79 | |
| 80 | Do a lookup assuming the string is encoded in (normalized) UTF8. |
| 81 | |
| 82 | =item is_LNBREAK_latin1(s) |
| 83 | |
| 84 | =item is_LNBREAK_latin1_safe(s,e) |
| 85 | |
| 86 | Do a lookup assuming the string is encoded in latin-1 (aka plan octets). |
| 87 | |
| 88 | =item is_LNBREAK_cp(cp) |
| 89 | |
| 90 | Check to see if the string matches a given codepoint (hypotethically a |
| 91 | U32). The condition is constructed as as to "break out" as early as |
| 92 | possible if the codepoint is out of range of the condition. |
| 93 | |
| 94 | IOW: |
| 95 | |
| 96 | (cp==X || (cp>X && (cp==Y || (cp>Y && ...)))) |
| 97 | |
| 98 | Thus if the character is X+1 only two comparisons will be done. Making |
| 99 | matching lookups slower, but non-matching faster. |
| 100 | |
| 101 | =back |
| 102 | |
| 103 | =cut |
| 104 | |
| 105 | # store a list of numbers into a hash based trie. |
| 106 | sub _trie_store { |
| 107 | my $root= shift; |
| 108 | foreach my $b ( @_ ) { |
| 109 | $root->{$b} ||= {}; |
| 110 | $root= $root->{$b}; |
| 111 | } |
| 112 | $root->{''}++; |
| 113 | } |
| 114 | |
| 115 | # Convert a string into its neutral, latin1, utf8 forms, where |
| 116 | # the form is undefined unless the string can be completely represented |
| 117 | # in that form. The string is then decomposed into the octects representing |
| 118 | # it. A list is returned for each. Additional a list of codepoints making |
| 119 | # up the string. |
| 120 | # returns (\@n,\@u,\@l,\@cp) |
| 121 | # |
| 122 | sub _uni_latin1 { |
| 123 | my $str= shift; |
| 124 | my $u= eval { Encode::encode( "utf8", "$str", Encode::FB_CROAK ) }; |
| 125 | my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) }; |
| 126 | my $n= $l; |
| 127 | undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/; |
| 128 | return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )), |
| 129 | [map { ord $_ } split //,$str]); |
| 130 | } |
| 131 | |
| 132 | # store an array ref of char data into the appropriate |
| 133 | # type bins, tracking sizes as we go. |
| 134 | sub _store { |
| 135 | my ( $self, $r, @k )= @_; |
| 136 | for my $z ( @k ) { |
| 137 | $self->{size}{$z}{ 0 + @$r }++; |
| 138 | push @{ $self->{data}{$z} }, $r; |
| 139 | } |
| 140 | } |
| 141 | |
| 142 | # construct a new charclass constructor object. |
| 143 | # $title ends up in the code a as a comment. |
| 144 | # $opcode is the name of the operation the charclass implements. |
| 145 | # the rest of the arguments are strings that the charclass |
| 146 | # can match. |
| 147 | sub new { |
| 148 | my $class= shift; |
| 149 | my $title= shift; |
| 150 | my $opcode= shift; |
| 151 | my $self= bless { op => $opcode, title => $title }, $class; |
| 152 | my %seen; |
| 153 | # convert the strings to the numeric equivelents and store |
| 154 | # them for later insertion while tracking their sizes. |
| 155 | foreach my $seq ( @_ ) { |
| 156 | next if $seen{$seq}++; |
| 157 | push @{$self->{seq}},$seq; |
| 158 | my ( $n, $u, $l,$cp )= _uni_latin1( $seq ); |
| 159 | if ( $n ) { |
| 160 | _store( $self, $n, qw(n U L) ); |
| 161 | } else { |
| 162 | if ( $l ) { |
| 163 | _store( $self, $l, qw(l L) ); |
| 164 | } |
| 165 | _store( $self, $u, qw(u U) ); |
| 166 | } |
| 167 | _store($self,$cp,'c'); |
| 168 | } |
| 169 | # |
| 170 | # now construct the tries. For each type of data we insert |
| 171 | # the data into all the tries of length $size and smaller. |
| 172 | # |
| 173 | |
| 174 | my %allsize; |
| 175 | foreach my $k ( keys %{ $self->{data} } ) { |
| 176 | my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} }; |
| 177 | $self->{size}{$k}=\@size; |
| 178 | undef @allsize{@size}; |
| 179 | foreach my $d ( @{ $self->{data}{$k} } ) { |
| 180 | foreach my $sz ( @size ) { |
| 181 | last if $sz < @$d; |
| 182 | $self->{trie}{$k}{$sz} ||= {}; |
| 183 | _trie_store( $self->{trie}{$k}{$sz}, @$d ); |
| 184 | } |
| 185 | } |
| 186 | #delete $self->{data}{$k}; |
| 187 | } |
| 188 | my @size= sort { $b <=> $a } keys %allsize; |
| 189 | $self->{size}{''}= \@size; |
| 190 | return $self; |
| 191 | } |
| 192 | |
| 193 | # |
| 194 | # _cond([$v1,$v2,$v2...],$ofs) |
| 195 | # |
| 196 | # converts an array of codepoints into a conditional expression |
| 197 | # consequtive codepoints are merged into a range test |
| 198 | # returns a string containing the conditional expression in the form |
| 199 | # '( li[x]==v || li[x]==y )' When possible we also use range lookups. |
| 200 | |
| 201 | sub _cond { |
| 202 | my ( $c, $ofs,$fmt )= @_; |
| 203 | $fmt||='((U8*)s)[%d]'; |
| 204 | # cheapo rangification routine. |
| 205 | # Convert the first element into a singleton represented |
| 206 | # as [$x,$x] and then merge the rest in as we go. |
| 207 | my @v= sort { $a <=> $b } @$c; |
| 208 | my @r= ( [ ( shift @v ) x 2 ] ); |
| 209 | for my $n ( @v ) { |
| 210 | if ( $n == $r[-1][1] + 1 ) { |
| 211 | $r[-1][1]++; |
| 212 | } else { |
| 213 | push @r, [ $n, $n ]; |
| 214 | } |
| 215 | } |
| 216 | @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r; |
| 217 | # sort the ranges by size and order. |
| 218 | @r= sort { $a->[0] <=> $b->[0] } @r; |
| 219 | my $alu= sprintf $fmt,$ofs; # C array look up |
| 220 | |
| 221 | if ($fmt=~/%d/) { |
| 222 | # map the ranges into conditions |
| 223 | @r= map { |
| 224 | # singleton |
| 225 | $_->[0] == $_->[1] ? "$alu==$_->[0]" : |
| 226 | # range |
| 227 | "($_->[0]<=$alu && $alu<=$_->[1])" |
| 228 | } @r; |
| 229 | # return the joined results. |
| 230 | return '( ' . join( " || ", @r ) . ' )'; |
| 231 | } else { |
| 232 | return combine($alu,@r); |
| 233 | } |
| 234 | } |
| 235 | |
| 236 | # |
| 237 | # Do the condition in such a way that we break out early if the value |
| 238 | # we are looking at is in between two elements in the list. |
| 239 | # Currently used only for codepoint macros (depth 1) |
| 240 | # |
| 241 | sub combine { |
| 242 | my $alu=shift; |
| 243 | local $_ = shift; |
| 244 | my $txt= $_->[0] == $_->[1] |
| 245 | ? "$alu==$_->[0]" |
| 246 | : "($_->[0]<=$alu && $alu<=$_->[1])"; |
| 247 | return $txt unless @_; |
| 248 | return "( $txt || ( $alu > $_->[1] && \n".combine($alu,@_)." ) )"; |
| 249 | } |
| 250 | |
| 251 | # recursively convert a trie to an optree represented by |
| 252 | # [condition,yes,no] where yes and no can be a ref to another optree |
| 253 | # or a scalar representing code. |
| 254 | # called by make_optree |
| 255 | |
| 256 | sub _trie_to_optree { |
| 257 | my ( $node, $ofs, $else, $fmt )= @_; |
| 258 | return $else unless $node; |
| 259 | $ofs ||= 0; |
| 260 | if ( $node->{''} ) { |
| 261 | $else= $ofs; |
| 262 | } else { |
| 263 | $else ||= 0; |
| 264 | } |
| 265 | my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] } |
| 266 | map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] } |
| 267 | grep length, keys %$node; |
| 268 | |
| 269 | return $ofs if !@k; |
| 270 | |
| 271 | my ( $root, $expr ); |
| 272 | while ( @k ) { |
| 273 | my @cond= ( $k[0][0] ); |
| 274 | my $d= $k[0][1]; |
| 275 | my $r= $k[0][2]; |
| 276 | shift @k; |
| 277 | while ( @k && $k[0][1] eq $d ) { |
| 278 | push @cond, $k[0][0]; |
| 279 | shift @k; |
| 280 | } |
| 281 | my $op= |
| 282 | [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ]; |
| 283 | if ( !$root ) { |
| 284 | $root= $expr= $op; |
| 285 | } else { |
| 286 | push @$expr, $op; |
| 287 | $expr= $op; |
| 288 | } |
| 289 | } |
| 290 | push @$expr, $else; |
| 291 | return $root; |
| 292 | } |
| 293 | |
| 294 | # construct the optree for a type. |
| 295 | # handles the special logic of type ''. |
| 296 | sub make_optree { |
| 297 | my ( $self, $type, $size, $fmt )= @_; |
| 298 | my $else= 0; |
| 299 | $size||=$self->{size}{$type}[0]; |
| 300 | $size=1 if $type eq 'c'; |
| 301 | if ( !$type ) { |
| 302 | my ( $u, $l ); |
| 303 | for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) { |
| 304 | $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt ); |
| 305 | } |
| 306 | for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) { |
| 307 | $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt ); |
| 308 | } |
| 309 | if ( $u ) { |
| 310 | $else= [ '(is_utf8)', $u, $l || 0 ]; |
| 311 | } elsif ( $l ) { |
| 312 | $else= [ '(!is_utf8)', $l, 0 ]; |
| 313 | } |
| 314 | $type= 'n'; |
| 315 | $size-- while !$self->{trie}{n}{$size}; |
| 316 | } |
| 317 | return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt ); |
| 318 | } |
| 319 | |
| 320 | # construct the optree for a type with length checks to prevent buffer |
| 321 | # overruns. Only one length check is performed per lookup trading code |
| 322 | # size for speed. |
| 323 | sub length_optree { |
| 324 | my ( $self, $type,$fmt )= @_; |
| 325 | $type ||= ''; |
| 326 | return $self->{len_op}{$type} if $self->{len_op}{$type}; |
| 327 | my @size = @{$self->{size}{$type}}; |
| 328 | |
| 329 | my ( $root, $expr ); |
| 330 | foreach my $size ( @size ) { |
| 331 | my $op= [ |
| 332 | "( (e) - (s) > " . ( $size - 1 ) . " )", |
| 333 | $self->make_optree( $type, $size ), |
| 334 | ]; |
| 335 | if ( !$root ) { |
| 336 | $root= $expr= $op; |
| 337 | } else { |
| 338 | push @$expr, $op; |
| 339 | $expr= $op; |
| 340 | } |
| 341 | } |
| 342 | push @$expr, 0; |
| 343 | return $self->{len_op}{$type}= $root ? $root : $expr->[0]; |
| 344 | } |
| 345 | |
| 346 | # |
| 347 | # recursively walk an optree and covert it to a huge nested ternary expression. |
| 348 | # |
| 349 | sub _optree_to_ternary { |
| 350 | my ( $node )= @_; |
| 351 | return $node |
| 352 | if !ref $node; |
| 353 | my $depth = 0; |
| 354 | if ( $node->[0] =~ /\[(\d+)\]/ ) { |
| 355 | $depth= $1 + 1; |
| 356 | } |
| 357 | return sprintf "\n%s( %s ? %s : %s )", " " x $depth, $node->[0], |
| 358 | _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] ); |
| 359 | } |
| 360 | |
| 361 | # add \\ to the end of strings in a reasonable neat way. |
| 362 | sub _macro($) { |
| 363 | my $str= shift; |
| 364 | my @lines= split /[^\S\n]*\n/, $str; |
| 365 | return join( "\\\n", map { sprintf "%-76s", $_ } @lines ) . "\n\n"; |
| 366 | } |
| 367 | |
| 368 | # default type extensions. 'uln' dont have one because normally |
| 369 | # they are used only as part of type '' which doesnt get an extension |
| 370 | my %ext= ( |
| 371 | U => '_utf8', |
| 372 | L => '_latin1', |
| 373 | c => '_cp', |
| 374 | |
| 375 | ); |
| 376 | |
| 377 | # produce the ternary, handling arguments and putting on the macro headers |
| 378 | # and boiler plate |
| 379 | sub ternary { |
| 380 | my ( $self, $type, $ext )= @_; |
| 381 | $type ||= ''; |
| 382 | $ext = ($ext{$type} || '') . ($ext||""); |
| 383 | my ($root,$fmt,$arg); |
| 384 | if ($type eq 'c') { |
| 385 | $arg= $fmt= 'cp'; |
| 386 | } else { |
| 387 | $arg= 's'; |
| 388 | } |
| 389 | if ( $type eq 'c' || $ext !~ /safe/) { |
| 390 | $root= $self->make_optree( $type, 0, $fmt ); |
| 391 | } else { |
| 392 | $root= $self->length_optree( $type, $fmt ); |
| 393 | } |
| 394 | |
| 395 | our $parens; |
| 396 | $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x; |
| 397 | my $expr= qr/ |
| 398 | \( \s* |
| 399 | ($parens) |
| 400 | \s* \? \s* |
| 401 | \( \s* |
| 402 | ($parens) |
| 403 | \s* \? \s* |
| 404 | (\d+|$parens) |
| 405 | \s* : \s* |
| 406 | (\d+|$parens) |
| 407 | \s* \) |
| 408 | \s* : \s* |
| 409 | \4 |
| 410 | \s* \) |
| 411 | /x; |
| 412 | my $code= _optree_to_ternary( $root ); |
| 413 | for ( $code ) { |
| 414 | s/^\s*//; |
| 415 | 1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g |
| 416 | || s<$expr><(($1 && $2) ? $3 : $4)>g |
| 417 | || s<\(\s*($parens)\s*\)><$1>g; |
| 418 | } |
| 419 | my @args=($arg); |
| 420 | push @args,'e' if $ext=~/safe/; |
| 421 | push @args,'is_utf8' if !$type; |
| 422 | my $args=join ",",@args; |
| 423 | return "/*** GENERATED CODE ***/\n" |
| 424 | . _macro "#define is_$self->{op}$ext($args)\n$code"; |
| 425 | } |
| 426 | |
| 427 | my $path=shift @ARGV; |
| 428 | if (!$path) { |
| 429 | $path= "regcharclass.h"; |
| 430 | if (!-e $path) { $path="../$path" } |
| 431 | if (!-e $path) { die "Can't find regcharclass.h to update!\n" }; |
| 432 | } |
| 433 | |
| 434 | rename $path,"$path.bak"; |
| 435 | open my $out_fh,">",$path |
| 436 | or die "Can't write to '$path':$!"; |
| 437 | binmode $out_fh; # want unix line endings even when run on win32. |
| 438 | my ($zero)=$0=~/([^\\\/]+)$/; |
| 439 | print $out_fh <<"HEADER"; |
| 440 | /*********************** WARNING WARNING WARNING ************************ |
| 441 | |
| 442 | Do not modify this code directly: This file was autogenerated by |
| 443 | |
| 444 | Porting/$zero |
| 445 | |
| 446 | from data contained within the script. Change the script instead. |
| 447 | |
| 448 | Generated at: @{[ scalar gmtime ]} GMT |
| 449 | |
| 450 | ************************ WARNING WARNING WARNING ************************/ |
| 451 | |
| 452 | HEADER |
| 453 | |
| 454 | my ($op,$title,@strs,@txt); |
| 455 | my $doit= sub { |
| 456 | return unless $op; |
| 457 | my $o= __PACKAGE__->new($title,$op,@strs); |
| 458 | print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n"; |
| 459 | print $out_fh join "\n",@txt,"*/",""; |
| 460 | for ('', 'U', 'L') { |
| 461 | print $out_fh $o->ternary( $_ ); |
| 462 | print $out_fh $o->ternary( $_,'_safe' ); |
| 463 | } |
| 464 | print $out_fh $o->ternary( 'c' ); |
| 465 | }; |
| 466 | while (<DATA>) { |
| 467 | next unless /\S/; |
| 468 | chomp; |
| 469 | if (/^([A-Z]+)/) { |
| 470 | $doit->(); |
| 471 | ($op,$title)=split /\s*:\s*/,$_,2; |
| 472 | @txt=@strs=(); |
| 473 | } else { |
| 474 | push @txt, "\t$_"; |
| 475 | s/#.*$//; |
| 476 | if (/^0x/) { |
| 477 | push @strs,map { chr $_ } eval $_; |
| 478 | } elsif (/^[""'']/) { |
| 479 | push @strs,eval $_; |
| 480 | } |
| 481 | } |
| 482 | } |
| 483 | $doit->(); |
| 484 | print "$path has been updated\n"; |
| 485 | |
| 486 | __DATA__ |
| 487 | LNBREAK: Line Break: \R |
| 488 | "\x0D\x0A" # CRLF - Network (Windows) line ending |
| 489 | 0x0A # LF | LINE FEED |
| 490 | 0x0B # VT | VERTICAL TAB |
| 491 | 0x0C # FF | FORM FEED |
| 492 | 0x0D # CR | CARRIAGE RETURN |
| 493 | 0x85 # NEL | NEXT LINE |
| 494 | 0x2028 # LINE SEPARATOR |
| 495 | 0x2029 # PARAGRAPH SEPARATOR |
| 496 | |
| 497 | HORIZWS: Horizontal Whitespace: \h \H |
| 498 | 0x09 # HT |
| 499 | 0x20 # SPACE |
| 500 | 0xa0 # NBSP |
| 501 | 0x1680 # OGHAM SPACE MARK |
| 502 | 0x180e # MONGOLIAN VOWEL SEPARATOR |
| 503 | 0x2000 # EN QUAD |
| 504 | 0x2001 # EM QUAD |
| 505 | 0x2002 # EN SPACE |
| 506 | 0x2003 # EM SPACE |
| 507 | 0x2004 # THREE-PER-EM SPACE |
| 508 | 0x2005 # FOUR-PER-EM SPACE |
| 509 | 0x2006 # SIX-PER-EM SPACE |
| 510 | 0x2007 # FIGURE SPACE |
| 511 | 0x2008 # PUNCTUATION SPACE |
| 512 | 0x2009 # THIN SPACE |
| 513 | 0x200A # HAIR SPACE |
| 514 | 0x202f # NARROW NO-BREAK SPACE |
| 515 | 0x205f # MEDIUM MATHEMATICAL SPACE |
| 516 | 0x3000 # IDEOGRAPHIC SPACE |
| 517 | |
| 518 | VERTWS: Vertical Whitespace: \v \V |
| 519 | 0x0A # LF |
| 520 | 0x0B # VT |
| 521 | 0x0C # FF |
| 522 | 0x0D # CR |
| 523 | 0x85 # NEL |
| 524 | 0x2028 # LINE SEPARATOR |
| 525 | 0x2029 # PARAGRAPH SEPARATOR |
| 526 | |