This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add Yves Orton's script to regenerate regcharclass.h
[perl5.git] / Porting / regcharclass.pl
... / ...
CommitLineData
1package UTF8::Matcher;
2use strict;
3use warnings;
4use Text::Wrap qw(wrap);
5use Encode;
6use Data::Dumper;
7
8# Author: Yves Orton (demerphq) 2007.
9
10=pod
11
12Dynamically generates macros for detecting special charclasses
13in both latin-1, utf8, and codepoint forms.
14
15To regenerate regcharclass.h, run this script from perl-root. No arguments
16are necessary.
17
18Each charclass handler is constructed as follows:
19Each string the charclass must match is rendered as unicode (codepoints>255),
20and if possible as latin1 (codepoints>127), and if possible as "neutral"
21(all codepoints<128).
22
23The rendered strings are then inserted into digit-tries by type and length.
24With shorter strings being added to tries that are allowed to contain longer
25strings, but not vice versa. Thus the "longest" trie contains all strings
26for that charclass.
27
28The following types of trie are generated:
29
30 n - Neutral only. All strings in this type have codepoints<128
31 l - Latin1 only. All strings in this type have a codepoint>127 in them
32 u - UTF8 only. All strings in this type have a codepoint>255 in them
33 L - Latin1. All strings in 'n' and 'l'
34 U - UTF8. All string in 'n' and 'u'
35 c - Codepoint. All strings in U but in codepoint and not utf8 form.
36
37The ternary() routine is responsible for converting the trie data into a
38ternary conditional that matches the required set of strings. The generated
39macro normally takes at least the argument 's' which is expected to be a
40pointer of type C<char *> or C<U8 *>. The condition generated will be
41optimised to match the string as efficiently as possible, with range lookups
42being used where possible, and in some situations relying on "true" to be 1.
43
44ternary() takes two optional arguments, $type which is one of the above
45characters and $ext which is used to add an extra extension to the macro name.
46
47If $type is omitted or false then the generated macro will take an additional
48argument, 'is_utf8'.
49
50If $ext has the string 'safe' in it then the generated macro will take an extra
51argument 'e' for the end of the string, and all lookups will be length checked
52to prevent lookups past e. If 'safe' is not used then the lookup is assumed to
53be guaranteed safe, and no 'e' argument is provided and no length checks are
54made during execution.
55
56The 'c' type is different as compared to the rest. Instead of producing
57a condition that does octet comparisons of a string array, the 'c' type
58produces a macro that takes a single codepoint as an argument (instead of a
59char* or U8*) and does the lookup based on only that char, thus it cannot be
60used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass.
61This is primarily used for populating charclass bitmaps for codepoints 0..255
62but will also match codepoints in the unicode range if necessary.
63
64Using LNBREAK as an example the following macros will be produced:
65
66=over 4
67
68=item is_LNBREAK(s,is_utf8)
69
70=item is_LNBREAK_safe(s,e,is_utf8)
71
72Do a lookup as apporpriate based on the is_utf8 flag. When possible
73comparisons involving octect<128 are done before checking the is_utf8
74flag, hopefully saving time.
75
76=item is_LNBREAK_utf8(s)
77
78=item is_LNBREAK_utf8_safe(s,e)
79
80Do a lookup assuming the string is encoded in (normalized) UTF8.
81
82=item is_LNBREAK_latin1(s)
83
84=item is_LNBREAK_latin1_safe(s,e)
85
86Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
87
88=item is_LNBREAK_cp(cp)
89
90Check to see if the string matches a given codepoint (hypotethically a
91U32). The condition is constructed as as to "break out" as early as
92possible if the codepoint is out of range of the condition.
93
94IOW:
95
96 (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
97
98Thus if the character is X+1 only two comparisons will be done. Making
99matching lookups slower, but non-matching faster.
100
101=back
102
103=cut
104
105# store a list of numbers into a hash based trie.
106sub _trie_store {
107 my $root= shift;
108 foreach my $b ( @_ ) {
109 $root->{$b} ||= {};
110 $root= $root->{$b};
111 }
112 $root->{''}++;
113}
114
115# Convert a string into its neutral, latin1, utf8 forms, where
116# the form is undefined unless the string can be completely represented
117# in that form. The string is then decomposed into the octects representing
118# it. A list is returned for each. Additional a list of codepoints making
119# up the string.
120# returns (\@n,\@u,\@l,\@cp)
121#
122sub _uni_latin1 {
123 my $str= shift;
124 my $u= eval { Encode::encode( "utf8", "$str", Encode::FB_CROAK ) };
125 my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) };
126 my $n= $l;
127 undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/;
128 return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )),
129 [map { ord $_ } split //,$str]);
130}
131
132# store an array ref of char data into the appropriate
133# type bins, tracking sizes as we go.
134sub _store {
135 my ( $self, $r, @k )= @_;
136 for my $z ( @k ) {
137 $self->{size}{$z}{ 0 + @$r }++;
138 push @{ $self->{data}{$z} }, $r;
139 }
140}
141
142# construct a new charclass constructor object.
143# $title ends up in the code a as a comment.
144# $opcode is the name of the operation the charclass implements.
145# the rest of the arguments are strings that the charclass
146# can match.
147sub new {
148 my $class= shift;
149 my $title= shift;
150 my $opcode= shift;
151 my $self= bless { op => $opcode, title => $title }, $class;
152 my %seen;
153 # convert the strings to the numeric equivelents and store
154 # them for later insertion while tracking their sizes.
155 foreach my $seq ( @_ ) {
156 next if $seen{$seq}++;
157 push @{$self->{seq}},$seq;
158 my ( $n, $u, $l,$cp )= _uni_latin1( $seq );
159 if ( $n ) {
160 _store( $self, $n, qw(n U L) );
161 } else {
162 if ( $l ) {
163 _store( $self, $l, qw(l L) );
164 }
165 _store( $self, $u, qw(u U) );
166 }
167 _store($self,$cp,'c');
168 }
169 #
170 # now construct the tries. For each type of data we insert
171 # the data into all the tries of length $size and smaller.
172 #
173
174 my %allsize;
175 foreach my $k ( keys %{ $self->{data} } ) {
176 my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} };
177 $self->{size}{$k}=\@size;
178 undef @allsize{@size};
179 foreach my $d ( @{ $self->{data}{$k} } ) {
180 foreach my $sz ( @size ) {
181 last if $sz < @$d;
182 $self->{trie}{$k}{$sz} ||= {};
183 _trie_store( $self->{trie}{$k}{$sz}, @$d );
184 }
185 }
186 #delete $self->{data}{$k};
187 }
188 my @size= sort { $b <=> $a } keys %allsize;
189 $self->{size}{''}= \@size;
190 return $self;
191}
192
193#
194# _cond([$v1,$v2,$v2...],$ofs)
195#
196# converts an array of codepoints into a conditional expression
197# consequtive codepoints are merged into a range test
198# returns a string containing the conditional expression in the form
199# '( li[x]==v || li[x]==y )' When possible we also use range lookups.
200
201sub _cond {
202 my ( $c, $ofs,$fmt )= @_;
203 $fmt||='((U8*)s)[%d]';
204 # cheapo rangification routine.
205 # Convert the first element into a singleton represented
206 # as [$x,$x] and then merge the rest in as we go.
207 my @v= sort { $a <=> $b } @$c;
208 my @r= ( [ ( shift @v ) x 2 ] );
209 for my $n ( @v ) {
210 if ( $n == $r[-1][1] + 1 ) {
211 $r[-1][1]++;
212 } else {
213 push @r, [ $n, $n ];
214 }
215 }
216 @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r;
217 # sort the ranges by size and order.
218 @r= sort { $a->[0] <=> $b->[0] } @r;
219 my $alu= sprintf $fmt,$ofs; # C array look up
220
221 if ($fmt=~/%d/) {
222 # map the ranges into conditions
223 @r= map {
224 # singleton
225 $_->[0] == $_->[1] ? "$alu==$_->[0]" :
226 # range
227 "($_->[0]<=$alu && $alu<=$_->[1])"
228 } @r;
229 # return the joined results.
230 return '( ' . join( " || ", @r ) . ' )';
231 } else {
232 return combine($alu,@r);
233 }
234}
235
236#
237# Do the condition in such a way that we break out early if the value
238# we are looking at is in between two elements in the list.
239# Currently used only for codepoint macros (depth 1)
240#
241sub combine {
242 my $alu=shift;
243 local $_ = shift;
244 my $txt= $_->[0] == $_->[1]
245 ? "$alu==$_->[0]"
246 : "($_->[0]<=$alu && $alu<=$_->[1])";
247 return $txt unless @_;
248 return "( $txt || ( $alu > $_->[1] && \n".combine($alu,@_)." ) )";
249}
250
251# recursively convert a trie to an optree represented by
252# [condition,yes,no] where yes and no can be a ref to another optree
253# or a scalar representing code.
254# called by make_optree
255
256sub _trie_to_optree {
257 my ( $node, $ofs, $else, $fmt )= @_;
258 return $else unless $node;
259 $ofs ||= 0;
260 if ( $node->{''} ) {
261 $else= $ofs;
262 } else {
263 $else ||= 0;
264 }
265 my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] }
266 map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] }
267 grep length, keys %$node;
268
269 return $ofs if !@k;
270
271 my ( $root, $expr );
272 while ( @k ) {
273 my @cond= ( $k[0][0] );
274 my $d= $k[0][1];
275 my $r= $k[0][2];
276 shift @k;
277 while ( @k && $k[0][1] eq $d ) {
278 push @cond, $k[0][0];
279 shift @k;
280 }
281 my $op=
282 [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ];
283 if ( !$root ) {
284 $root= $expr= $op;
285 } else {
286 push @$expr, $op;
287 $expr= $op;
288 }
289 }
290 push @$expr, $else;
291 return $root;
292}
293
294# construct the optree for a type.
295# handles the special logic of type ''.
296sub make_optree {
297 my ( $self, $type, $size, $fmt )= @_;
298 my $else= 0;
299 $size||=$self->{size}{$type}[0];
300 $size=1 if $type eq 'c';
301 if ( !$type ) {
302 my ( $u, $l );
303 for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) {
304 $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt );
305 }
306 for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) {
307 $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt );
308 }
309 if ( $u ) {
310 $else= [ '(is_utf8)', $u, $l || 0 ];
311 } elsif ( $l ) {
312 $else= [ '(!is_utf8)', $l, 0 ];
313 }
314 $type= 'n';
315 $size-- while !$self->{trie}{n}{$size};
316 }
317 return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt );
318}
319
320# construct the optree for a type with length checks to prevent buffer
321# overruns. Only one length check is performed per lookup trading code
322# size for speed.
323sub length_optree {
324 my ( $self, $type,$fmt )= @_;
325 $type ||= '';
326 return $self->{len_op}{$type} if $self->{len_op}{$type};
327 my @size = @{$self->{size}{$type}};
328
329 my ( $root, $expr );
330 foreach my $size ( @size ) {
331 my $op= [
332 "( (e) - (s) > " . ( $size - 1 ) . " )",
333 $self->make_optree( $type, $size ),
334 ];
335 if ( !$root ) {
336 $root= $expr= $op;
337 } else {
338 push @$expr, $op;
339 $expr= $op;
340 }
341 }
342 push @$expr, 0;
343 return $self->{len_op}{$type}= $root ? $root : $expr->[0];
344}
345
346#
347# recursively walk an optree and covert it to a huge nested ternary expression.
348#
349sub _optree_to_ternary {
350 my ( $node )= @_;
351 return $node
352 if !ref $node;
353 my $depth = 0;
354 if ( $node->[0] =~ /\[(\d+)\]/ ) {
355 $depth= $1 + 1;
356 }
357 return sprintf "\n%s( %s ? %s : %s )", " " x $depth, $node->[0],
358 _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] );
359}
360
361# add \\ to the end of strings in a reasonable neat way.
362sub _macro($) {
363 my $str= shift;
364 my @lines= split /[^\S\n]*\n/, $str;
365 return join( "\\\n", map { sprintf "%-76s", $_ } @lines ) . "\n\n";
366}
367
368# default type extensions. 'uln' dont have one because normally
369# they are used only as part of type '' which doesnt get an extension
370my %ext= (
371 U => '_utf8',
372 L => '_latin1',
373 c => '_cp',
374
375);
376
377# produce the ternary, handling arguments and putting on the macro headers
378# and boiler plate
379sub ternary {
380 my ( $self, $type, $ext )= @_;
381 $type ||= '';
382 $ext = ($ext{$type} || '') . ($ext||"");
383 my ($root,$fmt,$arg);
384 if ($type eq 'c') {
385 $arg= $fmt= 'cp';
386 } else {
387 $arg= 's';
388 }
389 if ( $type eq 'c' || $ext !~ /safe/) {
390 $root= $self->make_optree( $type, 0, $fmt );
391 } else {
392 $root= $self->length_optree( $type, $fmt );
393 }
394
395 our $parens;
396 $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x;
397 my $expr= qr/
398 \( \s*
399 ($parens)
400 \s* \? \s*
401 \( \s*
402 ($parens)
403 \s* \? \s*
404 (\d+|$parens)
405 \s* : \s*
406 (\d+|$parens)
407 \s* \)
408 \s* : \s*
409 \4
410 \s* \)
411 /x;
412 my $code= _optree_to_ternary( $root );
413 for ( $code ) {
414 s/^\s*//;
415 1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g
416 || s<$expr><(($1 && $2) ? $3 : $4)>g
417 || s<\(\s*($parens)\s*\)><$1>g;
418 }
419 my @args=($arg);
420 push @args,'e' if $ext=~/safe/;
421 push @args,'is_utf8' if !$type;
422 my $args=join ",",@args;
423 return "/*** GENERATED CODE ***/\n"
424 . _macro "#define is_$self->{op}$ext($args)\n$code";
425}
426
427my $path=shift @ARGV;
428if (!$path) {
429 $path= "regcharclass.h";
430 if (!-e $path) { $path="../$path" }
431 if (!-e $path) { die "Can't find regcharclass.h to update!\n" };
432}
433
434rename $path,"$path.bak";
435open my $out_fh,">",$path
436 or die "Can't write to '$path':$!";
437binmode $out_fh; # want unix line endings even when run on win32.
438my ($zero)=$0=~/([^\\\/]+)$/;
439print $out_fh <<"HEADER";
440/*********************** WARNING WARNING WARNING ************************
441
442Do not modify this code directly: This file was autogenerated by
443
444 Porting/$zero
445
446from data contained within the script. Change the script instead.
447
448Generated at: @{[ scalar gmtime ]} GMT
449
450************************ WARNING WARNING WARNING ************************/
451
452HEADER
453
454my ($op,$title,@strs,@txt);
455my $doit= sub {
456 return unless $op;
457 my $o= __PACKAGE__->new($title,$op,@strs);
458 print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n";
459 print $out_fh join "\n",@txt,"*/","";
460 for ('', 'U', 'L') {
461 print $out_fh $o->ternary( $_ );
462 print $out_fh $o->ternary( $_,'_safe' );
463 }
464 print $out_fh $o->ternary( 'c' );
465};
466while (<DATA>) {
467 next unless /\S/;
468 chomp;
469 if (/^([A-Z]+)/) {
470 $doit->();
471 ($op,$title)=split /\s*:\s*/,$_,2;
472 @txt=@strs=();
473 } else {
474 push @txt, "\t$_";
475 s/#.*$//;
476 if (/^0x/) {
477 push @strs,map { chr $_ } eval $_;
478 } elsif (/^[""'']/) {
479 push @strs,eval $_;
480 }
481 }
482}
483$doit->();
484print "$path has been updated\n";
485
486__DATA__
487LNBREAK: Line Break: \R
488"\x0D\x0A" # CRLF - Network (Windows) line ending
4890x0A # LF | LINE FEED
4900x0B # VT | VERTICAL TAB
4910x0C # FF | FORM FEED
4920x0D # CR | CARRIAGE RETURN
4930x85 # NEL | NEXT LINE
4940x2028 # LINE SEPARATOR
4950x2029 # PARAGRAPH SEPARATOR
496
497HORIZWS: Horizontal Whitespace: \h \H
4980x09 # HT
4990x20 # SPACE
5000xa0 # NBSP
5010x1680 # OGHAM SPACE MARK
5020x180e # MONGOLIAN VOWEL SEPARATOR
5030x2000 # EN QUAD
5040x2001 # EM QUAD
5050x2002 # EN SPACE
5060x2003 # EM SPACE
5070x2004 # THREE-PER-EM SPACE
5080x2005 # FOUR-PER-EM SPACE
5090x2006 # SIX-PER-EM SPACE
5100x2007 # FIGURE SPACE
5110x2008 # PUNCTUATION SPACE
5120x2009 # THIN SPACE
5130x200A # HAIR SPACE
5140x202f # NARROW NO-BREAK SPACE
5150x205f # MEDIUM MATHEMATICAL SPACE
5160x3000 # IDEOGRAPHIC SPACE
517
518VERTWS: Vertical Whitespace: \v \V
5190x0A # LF
5200x0B # VT
5210x0C # FF
5220x0D # CR
5230x85 # NEL
5240x2028 # LINE SEPARATOR
5250x2029 # PARAGRAPH SEPARATOR
526