This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/regcharclass.pl: Change to work on an empty class
[perl5.git] / regen / regcharclass.pl
CommitLineData
e64b1bd1 1package CharClass::Matcher;
12b72891 2use strict;
8770da0e 3use 5.008;
12b72891 4use warnings;
e64b1bd1 5use warnings FATAL => 'all';
12b72891 6use Text::Wrap qw(wrap);
12b72891 7use Data::Dumper;
e64b1bd1
YO
8$Data::Dumper::Useqq= 1;
9our $hex_fmt= "0x%02X";
12b72891 10
295bcca9
KW
11sub ASCII_PLATFORM { (ord('A') == 65) }
12
8770da0e
NC
13require 'regen/regen_lib.pl';
14
ab84f958 15=head1 NAME
0ccab2bc 16
e64b1bd1 17CharClass::Matcher -- Generate C macros that match character classes efficiently
12b72891 18
e64b1bd1
YO
19=head1 SYNOPSIS
20
ab84f958 21 perl Porting/regcharclass.pl
e64b1bd1
YO
22
23=head1 DESCRIPTION
12b72891
RGS
24
25Dynamically generates macros for detecting special charclasses
e64b1bd1
YO
26in latin-1, utf8, and codepoint forms. Macros can be set to return
27the length (in bytes) of the matched codepoint, or the codepoint itself.
12b72891
RGS
28
29To regenerate regcharclass.h, run this script from perl-root. No arguments
30are necessary.
31
e64b1bd1 32Using WHATEVER as an example the following macros will be produced:
12b72891
RGS
33
34=over 4
35
e64b1bd1 36=item is_WHATEVER(s,is_utf8)
12b72891 37
e64b1bd1 38=item is_WHATEVER_safe(s,e,is_utf8)
12b72891 39
e64b1bd1 40Do a lookup as appropriate based on the is_utf8 flag. When possible
12b72891
RGS
41comparisons involving octect<128 are done before checking the is_utf8
42flag, hopefully saving time.
43
e64b1bd1 44=item is_WHATEVER_utf8(s)
12b72891 45
e64b1bd1 46=item is_WHATEVER_utf8_safe(s,e)
12b72891
RGS
47
48Do a lookup assuming the string is encoded in (normalized) UTF8.
49
e64b1bd1 50=item is_WHATEVER_latin1(s)
12b72891 51
e64b1bd1 52=item is_WHATEVER_latin1_safe(s,e)
12b72891
RGS
53
54Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
55
e64b1bd1 56=item is_WHATEVER_cp(cp)
12b72891 57
47e01c32 58Check to see if the string matches a given codepoint (hypothetically a
12b72891
RGS
59U32). The condition is constructed as as to "break out" as early as
60possible if the codepoint is out of range of the condition.
61
62IOW:
63
64 (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
65
66Thus if the character is X+1 only two comparisons will be done. Making
67matching lookups slower, but non-matching faster.
68
69=back
70
e64b1bd1
YO
71Additionally it is possible to generate C<what_> variants that return
72the codepoint read instead of the number of octets read, this can be
73done by suffixing '-cp' to the type description.
74
75=head2 CODE FORMAT
76
77perltidy -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
78
79
80=head1 AUTHOR
81
82Author: Yves Orton (demerphq) 2007
83
84=head1 BUGS
85
86No tests directly here (although the regex engine will fail tests
87if this code is broken). Insufficient documentation and no Getopts
88handler for using the module as a script.
89
90=head1 LICENSE
91
92You may distribute under the terms of either the GNU General Public
93License or the Artistic License, as specified in the README file.
94
12b72891
RGS
95=cut
96
e64b1bd1
YO
97# Sub naming convention:
98# __func : private subroutine, can not be called as a method
99# _func : private method, not meant for external use
100# func : public method.
101
102# private subs
103#-------------------------------------------------------------------------------
104#
105# ($cp,$n,$l,$u)=__uni_latin($str);
106#
47e01c32 107# Return a list of arrays, each of which when interpreted correctly
e64b1bd1
YO
108# represent the string in some given encoding with specific conditions.
109#
110# $cp - list of codepoints that make up the string.
295bcca9
KW
111# $n - list of octets that make up the string if all codepoints are invariant
112# regardless of if the string is in UTF-8 or not.
e64b1bd1 113# $l - list of octets that make up the string in latin1 encoding if all
295bcca9
KW
114# codepoints < 256, and at least one codepoint is UTF-8 variant.
115# $u - list of octets that make up the string in utf8 if any codepoint is
116# UTF-8 variant
e64b1bd1
YO
117#
118# High CP | Defined
119#-----------+----------
295bcca9 120# 0 - 127 : $n (127/128 are the values for ASCII platforms)
e64b1bd1
YO
121# 128 - 255 : $l, $u
122# 256 - ... : $u
123#
124
125sub __uni_latin1 {
126 my $str= shift;
127 my $max= 0;
128 my @cp;
295bcca9 129 my $only_has_invariants = 1;
e64b1bd1
YO
130 for my $ch ( split //, $str ) {
131 my $cp= ord $ch;
132 push @cp, $cp;
133 $max= $cp if $max < $cp;
295bcca9
KW
134 if (! ASCII_PLATFORM && $only_has_invariants) {
135 if ($cp > 255) {
136 $only_has_invariants = 0;
137 }
138 else {
139 my $temp = chr($cp);
140 utf8::upgrade($temp);
141 my @utf8 = unpack "U0C*", $temp;
142 $only_has_invariants = (@utf8 == 1 && $utf8[0] == $cp);
143 }
144 }
e64b1bd1
YO
145 }
146 my ( $n, $l, $u );
295bcca9
KW
147 $only_has_invariants = $max < 128 if ASCII_PLATFORM;
148 if ($only_has_invariants) {
e64b1bd1
YO
149 $n= [@cp];
150 } else {
151 $l= [@cp] if $max && $max < 256;
152
ca51670f
KW
153 $u= $str;
154 utf8::upgrade($u);
155 $u= [ unpack "U0C*", $u ] if defined $u;
12b72891 156 }
e64b1bd1 157 return ( \@cp, $n, $l, $u );
12b72891
RGS
158}
159
12b72891 160#
e64b1bd1
YO
161# $clean= __clean($expr);
162#
163# Cleanup a ternary expression, removing unnecessary parens and apply some
164# simplifications using regexes.
165#
166
167sub __clean {
168 my ( $expr )= @_;
169 our $parens;
170 $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
171
172 #print "$parens\n$expr\n";
173 1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
174 1 while $expr =~ s/ \( \s* ($parens) \s* \? \s*
175 \( \s* ($parens) \s* \? \s* ($parens|[^:]+?) \s* : \s* ($parens|[^)]+?) \s* \)
176 \s* : \s* \4 \s* \)/( ( $1 && $2 ) ? $3 : 0 )/gx;
177 return $expr;
12b72891
RGS
178}
179
e64b1bd1
YO
180#
181# $text= __macro(@args);
182# Join args together by newlines, and then neatly add backslashes to the end
183# of every line as expected by the C pre-processor for #define's.
184#
185
186sub __macro {
187 my $str= join "\n", @_;
188 $str =~ s/\s*$//;
189 my @lines= map { s/\s+$//; s/\t/ /g; $_ } split /\n/, $str;
190 my $last= pop @lines;
191 $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
192 1 while $str =~ s/^(\t*) {8}/$1\t/gm;
193 return $str . "\n";
12b72891
RGS
194}
195
e64b1bd1
YO
196#
197# my $op=__incrdepth($op);
198#
199# take an 'op' hashref and add one to it and all its childrens depths.
200#
201
202sub __incrdepth {
203 my $op= shift;
204 return unless ref $op;
205 $op->{depth} += 1;
206 __incrdepth( $op->{yes} );
207 __incrdepth( $op->{no} );
208 return $op;
209}
210
211# join two branches of an opcode together with a condition, incrementing
212# the depth on the yes branch when we do so.
213# returns the new root opcode of the tree.
214sub __cond_join {
215 my ( $cond, $yes, $no )= @_;
216 return {
217 test => $cond,
218 yes => __incrdepth( $yes ),
219 no => $no,
220 depth => 0,
221 };
222}
223
224# Methods
225
226# constructor
227#
228# my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
229#
230# Create a new CharClass::Matcher object by parsing the text in
231# the txt array. Currently applies the following rules:
232#
233# Element starts with C<0x>, line is evaled the result treated as
234# a number which is passed to chr().
235#
236# Element starts with C<">, line is evaled and the result treated
237# as a string.
238#
239# Each string is then stored in the 'strs' subhash as a hash record
240# made up of the results of __uni_latin1, using the keynames
241# 'low','latin1','utf8', as well as the synthesized 'LATIN1' and
242# 'UTF8' which hold a merge of 'low' and their lowercase equivelents.
243#
244# Size data is tracked per type in the 'size' subhash.
245#
246# Return an object
247#
12b72891
RGS
248sub new {
249 my $class= shift;
e64b1bd1
YO
250 my %opt= @_;
251 for ( qw(op txt) ) {
252 die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
253 if !exists $opt{$_};
254 }
255
256 my $self= bless {
257 op => $opt{op},
258 title => $opt{title} || '',
259 }, $class;
260 foreach my $txt ( @{ $opt{txt} } ) {
261 my $str= $txt;
262 if ( $str =~ /^[""]/ ) {
263 $str= eval $str;
05b688d9
KW
264 } elsif ($str =~ / - /x ) { # A range: Replace this element on the
265 # list with its expansion
266 my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
267 die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'" if ! defined $lower || ! defined $upper;
268 foreach my $cp (hex $lower .. hex $upper) {
269 push @{$opt{txt}}, sprintf "0x%X", $cp;
270 }
271 next;
295bcca9
KW
272 } elsif ($str =~ s/ ^ N (?= 0x ) //x ) {
273 # Otherwise undocumented, a leading N means is already in the
274 # native character set; don't convert.
e64b1bd1 275 $str= chr eval $str;
295bcca9
KW
276 } elsif ( $str =~ /^0x/ ) {
277 $str= eval $str;
278
279 # Convert from Unicode/ASCII to native, if necessary
280 $str = utf8::unicode_to_native($str) if ! ASCII_PLATFORM
281 && $str <= 0xFF;
282 $str = chr $str;
05b688d9
KW
283 } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
284 my $property = $1;
285 use Unicode::UCD qw(prop_invlist);
286
287 my @invlist = prop_invlist($property, '_perl_core_internal_ok');
288 if (! @invlist) {
289
290 # An empty return could mean an unknown property, or merely
291 # that it is empty. Call in scalar context to differentiate
292 my $count = prop_invlist($property, '_perl_core_internal_ok');
293 die "$property not found" unless defined $count;
294 }
295
296 # Replace this element on the list with the property's expansion
297 for (my $i = 0; $i < @invlist; $i += 2) {
298 foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) {
295bcca9
KW
299
300 # prop_invlist() returns native values; add leading 'N'
301 # to indicate that.
302 push @{$opt{txt}}, sprintf "N0x%X", $cp;
05b688d9
KW
303 }
304 }
305 next;
e64b1bd1 306 } elsif ( /\S/ ) {
47e01c32 307 die "Unparsable line: $txt\n";
12b72891 308 } else {
e64b1bd1 309 next;
12b72891 310 }
e64b1bd1
YO
311 my ( $cp, $low, $latin1, $utf8 )= __uni_latin1( $str );
312 my $UTF8= $low || $utf8;
313 my $LATIN1= $low || $latin1;
dda856b2
YO
314 #die Dumper($txt,$cp,$low,$latin1,$utf8)
315 # if $txt=~/NEL/ or $utf8 and @$utf8>3;
e64b1bd1
YO
316
317 @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 cp UTF8 LATIN1 )}=
318 ( $str, $txt, $low, $utf8, $latin1, $cp, $UTF8, $LATIN1 );
319 my $rec= $self->{strs}{$str};
320 foreach my $key ( qw(low utf8 latin1 cp UTF8 LATIN1) ) {
321 $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
322 if $self->{strs}{$str}{$key};
12b72891 323 }
e64b1bd1
YO
324 $self->{has_multi} ||= @$cp > 1;
325 $self->{has_ascii} ||= $latin1 && @$latin1;
326 $self->{has_low} ||= $low && @$low;
327 $self->{has_high} ||= !$low && !$latin1;
12b72891 328 }
e64b1bd1
YO
329 $self->{val_fmt}= $hex_fmt;
330 $self->{count}= 0 + keys %{ $self->{strs} };
12b72891
RGS
331 return $self;
332}
333
e64b1bd1 334# my $trie = make_trie($type,$maxlen);
12b72891 335#
47e01c32 336# using the data stored in the object build a trie of a specific type,
e64b1bd1
YO
337# and with specific maximum depth. The trie is made up the elements of
338# the given types array for each string in the object (assuming it is
339# not too long.)
340#
47e01c32 341# returns the trie, or undef if there was no relevant data in the object.
e64b1bd1
YO
342#
343
344sub make_trie {
345 my ( $self, $type, $maxlen )= @_;
346
347 my $strs= $self->{strs};
348 my %trie;
349 foreach my $rec ( values %$strs ) {
350 die "panic: unknown type '$type'"
351 if !exists $rec->{$type};
352 my $dat= $rec->{$type};
353 next unless $dat;
354 next if $maxlen && @$dat > $maxlen;
355 my $node= \%trie;
356 foreach my $elem ( @$dat ) {
357 $node->{$elem} ||= {};
358 $node= $node->{$elem};
12b72891 359 }
e64b1bd1 360 $node->{''}= $rec->{str};
12b72891 361 }
e64b1bd1 362 return 0 + keys( %trie ) ? \%trie : undef;
12b72891
RGS
363}
364
e64b1bd1
YO
365# my $optree= _optree()
366#
367# recursively convert a trie to an optree where every node represents
368# an if else branch.
12b72891 369#
12b72891 370#
12b72891 371
e64b1bd1
YO
372sub _optree {
373 my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
374 return unless defined $trie;
375 if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
376 die "Can't do 'cp' optree from multi-codepoint strings";
12b72891 377 }
e64b1bd1
YO
378 $ret_type ||= 'len';
379 $else= 0 unless defined $else;
380 $depth= 0 unless defined $depth;
381
382 my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
895e25a5 383 if (exists $trie->{''} ) {
e64b1bd1
YO
384 if ( $ret_type eq 'cp' ) {
385 $else= $self->{strs}{ $trie->{''} }{cp}[0];
386 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
387 } elsif ( $ret_type eq 'len' ) {
388 $else= $depth;
389 } elsif ( $ret_type eq 'both') {
390 $else= $self->{strs}{ $trie->{''} }{cp}[0];
391 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
392 $else= "len=$depth, $else";
12b72891 393 }
e64b1bd1
YO
394 }
395 return $else if !@conds;
396 my $node= {};
397 my $root= $node;
398 my ( $yes_res, $as_code, @cond );
399 my $test= $test_type eq 'cp' ? "cp" : "((U8*)s)[$depth]";
400 my $Update= sub {
401 $node->{vals}= [@cond];
402 $node->{test}= $test;
403 $node->{yes}= $yes_res;
404 $node->{depth}= $depth;
405 $node->{no}= shift;
406 };
407 while ( @conds ) {
408 my $cond= shift @conds;
409 my $res=
410 $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else,
411 $depth + 1 );
412 my $res_code= Dumper( $res );
413 if ( !$yes_res || $res_code ne $as_code ) {
414 if ( $yes_res ) {
415 $Update->( {} );
416 $node= $node->{no};
417 }
418 ( $yes_res, $as_code )= ( $res, $res_code );
419 @cond= ( $cond );
12b72891 420 } else {
e64b1bd1 421 push @cond, $cond;
12b72891
RGS
422 }
423 }
e64b1bd1 424 $Update->( $else );
12b72891
RGS
425 return $root;
426}
427
e64b1bd1
YO
428# my $optree= optree(%opts);
429#
430# Convert a trie to an optree, wrapper for _optree
431
432sub optree {
433 my $self= shift;
434 my %opt= @_;
435 my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
436 $opt{ret_type} ||= 'len';
437 my $test_type= $opt{type} eq 'cp' ? 'cp' : 'depth';
438 return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
12b72891
RGS
439}
440
e64b1bd1
YO
441# my $optree= generic_optree(%opts);
442#
443# build a "generic" optree out of the three 'low', 'latin1', 'utf8'
444# sets of strings, including a branch for handling the string type check.
445#
446
447sub generic_optree {
448 my $self= shift;
449 my %opt= @_;
450
451 $opt{ret_type} ||= 'len';
452 my $test_type= 'depth';
453 my $else= $opt{else} || 0;
454
455 my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
456 my $utf8= $self->make_trie( 'utf8', $opt{max_depth} );
457
458 $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
459 for $latin1, $utf8;
460
461 if ( $utf8 ) {
462 $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
463 } elsif ( $latin1 ) {
464 $else= __cond_join( "!( is_utf8 )", $latin1, $else );
465 }
466 my $low= $self->make_trie( 'low', $opt{max_depth} );
467 if ( $low ) {
468 $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
12b72891 469 }
e64b1bd1
YO
470
471 return $else;
12b72891
RGS
472}
473
e64b1bd1 474# length_optree()
12b72891 475#
e64b1bd1 476# create a string length guarded optree.
12b72891 477#
e64b1bd1
YO
478
479sub length_optree {
480 my $self= shift;
481 my %opt= @_;
482 my $type= $opt{type};
483
484 die "Can't do a length_optree on type 'cp', makes no sense."
485 if $type eq 'cp';
486
487 my ( @size, $method );
488
489 if ( $type eq 'generic' ) {
490 $method= 'generic_optree';
491 my %sizes= (
492 %{ $self->{size}{low} || {} },
493 %{ $self->{size}{latin1} || {} },
494 %{ $self->{size}{utf8} || {} }
495 );
496 @size= sort { $a <=> $b } keys %sizes;
497 } else {
498 $method= 'optree';
499 @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
12b72891 500 }
e64b1bd1
YO
501
502 my $else= ( $opt{else} ||= 0 );
503 for my $size ( @size ) {
504 my $optree= $self->$method( %opt, type => $type, max_depth => $size );
505 my $cond= "((e)-(s) > " . ( $size - 1 ).")";
506 $else= __cond_join( $cond, $optree, $else );
507 }
508 return $else;
12b72891
RGS
509}
510
e64b1bd1
YO
511# _cond_as_str
512# turn a list of conditions into a text expression
513# - merges ranges of conditions, and joins the result with ||
514sub _cond_as_str {
515 my ( $self, $op, $combine )= @_;
516 my $cond= $op->{vals};
517 my $test= $op->{test};
518 return "( $test )" if !defined $cond;
519
520 # rangify the list
521 my @ranges;
522 my $Update= sub {
523 if ( @ranges ) {
524 if ( $ranges[-1][0] == $ranges[-1][1] ) {
525 $ranges[-1]= $ranges[-1][0];
526 } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
527 $ranges[-1]= $ranges[-1][0];
528 push @ranges, $ranges[-1] + 1;
529 }
530 }
531 };
532 for my $cond ( @$cond ) {
533 if ( !@ranges || $cond != $ranges[-1][1] + 1 ) {
534 $Update->();
535 push @ranges, [ $cond, $cond ];
536 } else {
537 $ranges[-1][1]++;
538 }
539 }
540 $Update->();
541 return $self->_combine( $test, @ranges )
542 if $combine;
543 @ranges= map {
544 ref $_
545 ? sprintf(
546 "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
547 @$_ )
548 : sprintf( "$self->{val_fmt} == $test", $_ );
549 } @ranges;
550 return "( " . join( " || ", @ranges ) . " )";
12b72891
RGS
551}
552
e64b1bd1
YO
553# _combine
554# recursively turn a list of conditions into a fast break-out condition
555# used by _cond_as_str() for 'cp' type macros.
556sub _combine {
557 my ( $self, $test, @cond )= @_;
558 return if !@cond;
559 my $item= shift @cond;
560 my ( $cstr, $gtv );
561 if ( ref $item ) {
562 $cstr=
563 sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
564 @$item );
565 $gtv= sprintf "$self->{val_fmt}", $item->[1];
12b72891 566 } else {
e64b1bd1
YO
567 $cstr= sprintf( "$self->{val_fmt} == $test", $item );
568 $gtv= sprintf "$self->{val_fmt}", $item;
12b72891 569 }
e64b1bd1
YO
570 if ( @cond ) {
571 return "( $cstr || ( $gtv < $test &&\n"
572 . $self->_combine( $test, @cond ) . " ) )";
12b72891 573 } else {
e64b1bd1 574 return $cstr;
12b72891 575 }
e64b1bd1 576}
12b72891 577
e64b1bd1
YO
578# _render()
579# recursively convert an optree to text with reasonably neat formatting
580sub _render {
581 my ( $self, $op, $combine, $brace )= @_;
2e39f0c2 582 return 0 if ! defined $op; # The set is empty
e64b1bd1
YO
583 if ( !ref $op ) {
584 return $op;
12b72891 585 }
e64b1bd1
YO
586 my $cond= $self->_cond_as_str( $op, $combine );
587 my $yes= $self->_render( $op->{yes}, $combine, 1 );
588 my $no= $self->_render( $op->{no}, $combine, 0 );
589 return "( $cond )" if $yes eq '1' and $no eq '0';
590 my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
591 return "$lb$cond ? $yes : $no$rb"
592 if !ref( $op->{yes} ) && !ref( $op->{no} );
593 my $ind1= " " x 4;
594 my $ind= "\n" . ( $ind1 x $op->{depth} );
595
596 if ( ref $op->{yes} ) {
597 $yes= $ind . $ind1 . $yes;
598 } else {
599 $yes= " " . $yes;
600 }
601
602 return "$lb$cond ?$yes$ind: $no$rb";
12b72891 603}
32e6a07c 604
e64b1bd1
YO
605# $expr=render($op,$combine)
606#
607# convert an optree to text with reasonably neat formatting. If $combine
608# is true then the condition is created using "fast breakouts" which
609# produce uglier expressions that are more efficient for common case,
610# longer lists such as that resulting from type 'cp' output.
611# Currently only used for type 'cp' macros.
612sub render {
613 my ( $self, $op, $combine )= @_;
614 my $str= "( " . $self->_render( $op, $combine ) . " )";
615 return __clean( $str );
12b72891 616}
e64b1bd1
YO
617
618# make_macro
619# make a macro of a given type.
620# calls into make_trie and (generic_|length_)optree as needed
621# Opts are:
622# type : 'cp','generic','low','latin1','utf8','LATIN1','UTF8'
623# ret_type : 'cp' or 'len'
624# safe : add length guards to macro
625#
626# type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
627# in which case it defaults to 'cp' as well.
628#
629# it is illegal to do a type 'cp' macro on a pattern with multi-codepoint
630# sequences in it, as the generated macro will accept only a single codepoint
631# as an argument.
632#
633# returns the macro.
634
635
636sub make_macro {
637 my $self= shift;
638 my %opts= @_;
639 my $type= $opts{type} || 'generic';
640 die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
641 if $type eq 'cp'
642 and $self->{has_multi};
643 my $ret_type= $opts{ret_type} || ( $opts{type} eq 'cp' ? 'cp' : 'len' );
644 my $method;
645 if ( $opts{safe} ) {
646 $method= 'length_optree';
647 } elsif ( $type eq 'generic' ) {
648 $method= 'generic_optree';
649 } else {
650 $method= 'optree';
651 }
652 my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
653 my $text= $self->render( $optree, $type eq 'cp' );
654 my @args= $type eq 'cp' ? 'cp' : 's';
655 push @args, "e" if $opts{safe};
656 push @args, "is_utf8" if $type eq 'generic';
657 push @args, "len" if $ret_type eq 'both';
658 my $pfx= $ret_type eq 'both' ? 'what_len_' :
659 $ret_type eq 'cp' ? 'what_' : 'is_';
660 my $ext= $type eq 'generic' ? '' : '_' . lc( $type );
661 $ext .= "_safe" if $opts{safe};
662 my $argstr= join ",", @args;
663 return "/*** GENERATED CODE ***/\n"
664 . __macro( "#define $pfx$self->{op}$ext($argstr)\n$text" );
32e6a07c 665}
e64b1bd1
YO
666
667# if we arent being used as a module (highly likely) then process
668# the __DATA__ below and produce macros in regcharclass.h
669# if an argument is provided to the script then it is assumed to
670# be the path of the file to output to, if the arg is '-' outputs
671# to STDOUT.
672if ( !caller ) {
e64b1bd1 673 $|++;
8770da0e 674 my $path= shift @ARGV || "regcharclass.h";
e64b1bd1
YO
675 my $out_fh;
676 if ( $path eq '-' ) {
677 $out_fh= \*STDOUT;
678 } else {
29c22b52 679 $out_fh = open_new( $path );
e64b1bd1 680 }
8770da0e
NC
681 print $out_fh read_only_top( lang => 'C', by => $0,
682 file => 'regcharclass.h', style => '*',
2eee27d7 683 copyright => [2007, 2011] );
d10c72f2 684 print $out_fh "\n#ifndef H_REGCHARCLASS /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n\n";
12b72891 685
e64b1bd1
YO
686 my ( $op, $title, @txt, @types, @mods );
687 my $doit= sub {
688 return unless $op;
689 print $out_fh "/*\n\t$op: $title\n\n";
690 print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
691 my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
692
693 #die Dumper(\@types,\@mods);
694
695 foreach my $type_spec ( @types ) {
696 my ( $type, $ret )= split /-/, $type_spec;
697 $ret ||= 'len';
698 foreach my $mod ( @mods ) {
699 next if $mod eq 'safe' and $type eq 'cp';
700 my $macro= $obj->make_macro(
701 type => $type,
702 ret_type => $ret,
703 safe => $mod eq 'safe'
704 );
705 print $out_fh $macro, "\n";
706 }
32e6a07c 707 }
e64b1bd1
YO
708 };
709
710 while ( <DATA> ) {
711 s/^\s*#//;
712 next unless /\S/;
713 chomp;
714 if ( /^([A-Z]+)/ ) {
715 $doit->();
716 ( $op, $title )= split /\s*:\s*/, $_, 2;
717 @txt= ();
718 } elsif ( s/^=>// ) {
719 my ( $type, $modifier )= split /:/, $_;
720 @types= split ' ', $type;
721 @mods= split ' ', $modifier;
722 } else {
723 push @txt, "$_";
12b72891
RGS
724 }
725 }
e64b1bd1 726 $doit->();
d10c72f2
KW
727
728 print $out_fh "\n#endif /* H_REGCHARCLASS */\n";
729
8770da0e
NC
730 if($path eq '-') {
731 print $out_fh "/* ex: set ro: */\n";
732 } else {
733 read_only_bottom_close_and_rename($out_fh)
734 }
12b72891 735}
e64b1bd1
YO
736
737#
738# Valid types: generic, LATIN1, UTF8, low, latin1, utf8
739# default return value is octects read.
740# append -cp to make it codepoint matched.
741# modifiers come after the colon, valid possibilities
742# being 'fast' and 'safe'.
743#
295bcca9 744# Accepts a single Unicode code point per line, prefaced by '0x'
05b688d9
KW
745# or a range of two code points separated by a minus (and optional space)
746# or a single \p{} per line.
e90ac8de 747#
295bcca9
KW
748# If run on a non-ASCII platform will automatically convert the Unicode input
749# to native
750#
e90ac8de
KW
751# This is no longer used, but retained in case it is needed some day. Put the
752# lines below under __DATA__
753# TRICKYFOLD: Problematic fold case letters. When adding to this list, also should add them to regcomp.c and fold_grind.t
754# => generic cp generic-cp generic-both :fast safe
755# 0x00DF # LATIN SMALL LETTER SHARP S
756# 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
757# 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
758# 0x1E9E # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
759# 0x1FD3 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
760# 0x1FE3 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
761
e64b1bd1 7621; # in the unlikely case we are being used as a module
12b72891
RGS
763
764__DATA__
765LNBREAK: Line Break: \R
e64b1bd1 766=> generic UTF8 LATIN1 :fast safe
12b72891 767"\x0D\x0A" # CRLF - Network (Windows) line ending
05b688d9 768\p{VertSpace}
12b72891
RGS
769
770HORIZWS: Horizontal Whitespace: \h \H
e64b1bd1 771=> generic UTF8 LATIN1 cp :fast safe
05b688d9 772\p{HorizSpace}
12b72891
RGS
773
774VERTWS: Vertical Whitespace: \v \V
e64b1bd1 775=> generic UTF8 LATIN1 cp :fast safe
05b688d9 776\p{VertSpace}