This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/regcharclass.pl: Handle ranges, \p{}
[perl5.git] / regen / regcharclass.pl
CommitLineData
e64b1bd1 1package CharClass::Matcher;
12b72891 2use strict;
8770da0e 3use 5.008;
12b72891 4use warnings;
e64b1bd1 5use warnings FATAL => 'all';
12b72891
RGS
6use Text::Wrap qw(wrap);
7use Encode;
8use Data::Dumper;
e64b1bd1
YO
9$Data::Dumper::Useqq= 1;
10our $hex_fmt= "0x%02X";
12b72891 11
8770da0e
NC
12require 'regen/regen_lib.pl';
13
ab84f958 14=head1 NAME
0ccab2bc 15
e64b1bd1 16CharClass::Matcher -- Generate C macros that match character classes efficiently
12b72891 17
e64b1bd1
YO
18=head1 SYNOPSIS
19
ab84f958 20 perl Porting/regcharclass.pl
e64b1bd1
YO
21
22=head1 DESCRIPTION
12b72891
RGS
23
24Dynamically generates macros for detecting special charclasses
e64b1bd1
YO
25in latin-1, utf8, and codepoint forms. Macros can be set to return
26the length (in bytes) of the matched codepoint, or the codepoint itself.
12b72891
RGS
27
28To regenerate regcharclass.h, run this script from perl-root. No arguments
29are necessary.
30
e64b1bd1 31Using WHATEVER as an example the following macros will be produced:
12b72891
RGS
32
33=over 4
34
e64b1bd1 35=item is_WHATEVER(s,is_utf8)
12b72891 36
e64b1bd1 37=item is_WHATEVER_safe(s,e,is_utf8)
12b72891 38
e64b1bd1 39Do a lookup as appropriate based on the is_utf8 flag. When possible
12b72891
RGS
40comparisons involving octect<128 are done before checking the is_utf8
41flag, hopefully saving time.
42
e64b1bd1 43=item is_WHATEVER_utf8(s)
12b72891 44
e64b1bd1 45=item is_WHATEVER_utf8_safe(s,e)
12b72891
RGS
46
47Do a lookup assuming the string is encoded in (normalized) UTF8.
48
e64b1bd1 49=item is_WHATEVER_latin1(s)
12b72891 50
e64b1bd1 51=item is_WHATEVER_latin1_safe(s,e)
12b72891
RGS
52
53Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
54
e64b1bd1 55=item is_WHATEVER_cp(cp)
12b72891 56
47e01c32 57Check to see if the string matches a given codepoint (hypothetically a
12b72891
RGS
58U32). The condition is constructed as as to "break out" as early as
59possible if the codepoint is out of range of the condition.
60
61IOW:
62
63 (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
64
65Thus if the character is X+1 only two comparisons will be done. Making
66matching lookups slower, but non-matching faster.
67
68=back
69
e64b1bd1
YO
70Additionally it is possible to generate C<what_> variants that return
71the codepoint read instead of the number of octets read, this can be
72done by suffixing '-cp' to the type description.
73
74=head2 CODE FORMAT
75
76perltidy -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
77
78
79=head1 AUTHOR
80
81Author: Yves Orton (demerphq) 2007
82
83=head1 BUGS
84
85No tests directly here (although the regex engine will fail tests
86if this code is broken). Insufficient documentation and no Getopts
87handler for using the module as a script.
88
89=head1 LICENSE
90
91You may distribute under the terms of either the GNU General Public
92License or the Artistic License, as specified in the README file.
93
12b72891
RGS
94=cut
95
e64b1bd1
YO
96# Sub naming convention:
97# __func : private subroutine, can not be called as a method
98# _func : private method, not meant for external use
99# func : public method.
100
101# private subs
102#-------------------------------------------------------------------------------
103#
104# ($cp,$n,$l,$u)=__uni_latin($str);
105#
47e01c32 106# Return a list of arrays, each of which when interpreted correctly
e64b1bd1
YO
107# represent the string in some given encoding with specific conditions.
108#
109# $cp - list of codepoints that make up the string.
110# $n - list of octets that make up the string if all codepoints < 128
111# $l - list of octets that make up the string in latin1 encoding if all
112# codepoints < 256, and at least one codepoint is >127.
113# $u - list of octets that make up the string in utf8 if any codepoint >127
114#
115# High CP | Defined
116#-----------+----------
117# 0 - 127 : $n
118# 128 - 255 : $l, $u
119# 256 - ... : $u
120#
121
122sub __uni_latin1 {
123 my $str= shift;
124 my $max= 0;
125 my @cp;
126 for my $ch ( split //, $str ) {
127 my $cp= ord $ch;
128 push @cp, $cp;
129 $max= $cp if $max < $cp;
130 }
131 my ( $n, $l, $u );
132 if ( $max < 128 ) {
133 $n= [@cp];
134 } else {
135 $l= [@cp] if $max && $max < 256;
136
137 my $copy= $str; # must copy string, FB_CROAK makes encode destructive
138 $u= eval { Encode::encode( "utf8", $copy, Encode::FB_CROAK ) };
dda856b2
YO
139 # $u is utf8 but with the utf8 flag OFF
140 # therefore "C*" gets us the values of the bytes involved.
141 $u= [ unpack "C*", $u ] if defined $u;
12b72891 142 }
e64b1bd1 143 return ( \@cp, $n, $l, $u );
12b72891
RGS
144}
145
12b72891 146#
e64b1bd1
YO
147# $clean= __clean($expr);
148#
149# Cleanup a ternary expression, removing unnecessary parens and apply some
150# simplifications using regexes.
151#
152
153sub __clean {
154 my ( $expr )= @_;
155 our $parens;
156 $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
157
158 #print "$parens\n$expr\n";
159 1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
160 1 while $expr =~ s/ \( \s* ($parens) \s* \? \s*
161 \( \s* ($parens) \s* \? \s* ($parens|[^:]+?) \s* : \s* ($parens|[^)]+?) \s* \)
162 \s* : \s* \4 \s* \)/( ( $1 && $2 ) ? $3 : 0 )/gx;
163 return $expr;
12b72891
RGS
164}
165
e64b1bd1
YO
166#
167# $text= __macro(@args);
168# Join args together by newlines, and then neatly add backslashes to the end
169# of every line as expected by the C pre-processor for #define's.
170#
171
172sub __macro {
173 my $str= join "\n", @_;
174 $str =~ s/\s*$//;
175 my @lines= map { s/\s+$//; s/\t/ /g; $_ } split /\n/, $str;
176 my $last= pop @lines;
177 $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
178 1 while $str =~ s/^(\t*) {8}/$1\t/gm;
179 return $str . "\n";
12b72891
RGS
180}
181
e64b1bd1
YO
182#
183# my $op=__incrdepth($op);
184#
185# take an 'op' hashref and add one to it and all its childrens depths.
186#
187
188sub __incrdepth {
189 my $op= shift;
190 return unless ref $op;
191 $op->{depth} += 1;
192 __incrdepth( $op->{yes} );
193 __incrdepth( $op->{no} );
194 return $op;
195}
196
197# join two branches of an opcode together with a condition, incrementing
198# the depth on the yes branch when we do so.
199# returns the new root opcode of the tree.
200sub __cond_join {
201 my ( $cond, $yes, $no )= @_;
202 return {
203 test => $cond,
204 yes => __incrdepth( $yes ),
205 no => $no,
206 depth => 0,
207 };
208}
209
210# Methods
211
212# constructor
213#
214# my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
215#
216# Create a new CharClass::Matcher object by parsing the text in
217# the txt array. Currently applies the following rules:
218#
219# Element starts with C<0x>, line is evaled the result treated as
220# a number which is passed to chr().
221#
222# Element starts with C<">, line is evaled and the result treated
223# as a string.
224#
225# Each string is then stored in the 'strs' subhash as a hash record
226# made up of the results of __uni_latin1, using the keynames
227# 'low','latin1','utf8', as well as the synthesized 'LATIN1' and
228# 'UTF8' which hold a merge of 'low' and their lowercase equivelents.
229#
230# Size data is tracked per type in the 'size' subhash.
231#
232# Return an object
233#
12b72891
RGS
234sub new {
235 my $class= shift;
e64b1bd1
YO
236 my %opt= @_;
237 for ( qw(op txt) ) {
238 die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
239 if !exists $opt{$_};
240 }
241
242 my $self= bless {
243 op => $opt{op},
244 title => $opt{title} || '',
245 }, $class;
246 foreach my $txt ( @{ $opt{txt} } ) {
247 my $str= $txt;
248 if ( $str =~ /^[""]/ ) {
249 $str= eval $str;
05b688d9
KW
250 } elsif ($str =~ / - /x ) { # A range: Replace this element on the
251 # list with its expansion
252 my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
253 die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'" if ! defined $lower || ! defined $upper;
254 foreach my $cp (hex $lower .. hex $upper) {
255 push @{$opt{txt}}, sprintf "0x%X", $cp;
256 }
257 next;
e64b1bd1
YO
258 } elsif ( $str =~ /^0x/ ) {
259 $str= chr eval $str;
05b688d9
KW
260 } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
261 my $property = $1;
262 use Unicode::UCD qw(prop_invlist);
263
264 my @invlist = prop_invlist($property, '_perl_core_internal_ok');
265 if (! @invlist) {
266
267 # An empty return could mean an unknown property, or merely
268 # that it is empty. Call in scalar context to differentiate
269 my $count = prop_invlist($property, '_perl_core_internal_ok');
270 die "$property not found" unless defined $count;
271 }
272
273 # Replace this element on the list with the property's expansion
274 for (my $i = 0; $i < @invlist; $i += 2) {
275 foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) {
276 push @{$opt{txt}}, sprintf "0x%X", $cp;
277 }
278 }
279 next;
e64b1bd1 280 } elsif ( /\S/ ) {
47e01c32 281 die "Unparsable line: $txt\n";
12b72891 282 } else {
e64b1bd1 283 next;
12b72891 284 }
e64b1bd1
YO
285 my ( $cp, $low, $latin1, $utf8 )= __uni_latin1( $str );
286 my $UTF8= $low || $utf8;
287 my $LATIN1= $low || $latin1;
dda856b2
YO
288 #die Dumper($txt,$cp,$low,$latin1,$utf8)
289 # if $txt=~/NEL/ or $utf8 and @$utf8>3;
e64b1bd1
YO
290
291 @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 cp UTF8 LATIN1 )}=
292 ( $str, $txt, $low, $utf8, $latin1, $cp, $UTF8, $LATIN1 );
293 my $rec= $self->{strs}{$str};
294 foreach my $key ( qw(low utf8 latin1 cp UTF8 LATIN1) ) {
295 $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
296 if $self->{strs}{$str}{$key};
12b72891 297 }
e64b1bd1
YO
298 $self->{has_multi} ||= @$cp > 1;
299 $self->{has_ascii} ||= $latin1 && @$latin1;
300 $self->{has_low} ||= $low && @$low;
301 $self->{has_high} ||= !$low && !$latin1;
12b72891 302 }
e64b1bd1
YO
303 $self->{val_fmt}= $hex_fmt;
304 $self->{count}= 0 + keys %{ $self->{strs} };
12b72891
RGS
305 return $self;
306}
307
e64b1bd1 308# my $trie = make_trie($type,$maxlen);
12b72891 309#
47e01c32 310# using the data stored in the object build a trie of a specific type,
e64b1bd1
YO
311# and with specific maximum depth. The trie is made up the elements of
312# the given types array for each string in the object (assuming it is
313# not too long.)
314#
47e01c32 315# returns the trie, or undef if there was no relevant data in the object.
e64b1bd1
YO
316#
317
318sub make_trie {
319 my ( $self, $type, $maxlen )= @_;
320
321 my $strs= $self->{strs};
322 my %trie;
323 foreach my $rec ( values %$strs ) {
324 die "panic: unknown type '$type'"
325 if !exists $rec->{$type};
326 my $dat= $rec->{$type};
327 next unless $dat;
328 next if $maxlen && @$dat > $maxlen;
329 my $node= \%trie;
330 foreach my $elem ( @$dat ) {
331 $node->{$elem} ||= {};
332 $node= $node->{$elem};
12b72891 333 }
e64b1bd1 334 $node->{''}= $rec->{str};
12b72891 335 }
e64b1bd1 336 return 0 + keys( %trie ) ? \%trie : undef;
12b72891
RGS
337}
338
e64b1bd1
YO
339# my $optree= _optree()
340#
341# recursively convert a trie to an optree where every node represents
342# an if else branch.
12b72891 343#
12b72891 344#
12b72891 345
e64b1bd1
YO
346sub _optree {
347 my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
348 return unless defined $trie;
349 if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
350 die "Can't do 'cp' optree from multi-codepoint strings";
12b72891 351 }
e64b1bd1
YO
352 $ret_type ||= 'len';
353 $else= 0 unless defined $else;
354 $depth= 0 unless defined $depth;
355
356 my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
357 if ( $trie->{''} ) {
358 if ( $ret_type eq 'cp' ) {
359 $else= $self->{strs}{ $trie->{''} }{cp}[0];
360 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
361 } elsif ( $ret_type eq 'len' ) {
362 $else= $depth;
363 } elsif ( $ret_type eq 'both') {
364 $else= $self->{strs}{ $trie->{''} }{cp}[0];
365 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
366 $else= "len=$depth, $else";
12b72891 367 }
e64b1bd1
YO
368 }
369 return $else if !@conds;
370 my $node= {};
371 my $root= $node;
372 my ( $yes_res, $as_code, @cond );
373 my $test= $test_type eq 'cp' ? "cp" : "((U8*)s)[$depth]";
374 my $Update= sub {
375 $node->{vals}= [@cond];
376 $node->{test}= $test;
377 $node->{yes}= $yes_res;
378 $node->{depth}= $depth;
379 $node->{no}= shift;
380 };
381 while ( @conds ) {
382 my $cond= shift @conds;
383 my $res=
384 $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else,
385 $depth + 1 );
386 my $res_code= Dumper( $res );
387 if ( !$yes_res || $res_code ne $as_code ) {
388 if ( $yes_res ) {
389 $Update->( {} );
390 $node= $node->{no};
391 }
392 ( $yes_res, $as_code )= ( $res, $res_code );
393 @cond= ( $cond );
12b72891 394 } else {
e64b1bd1 395 push @cond, $cond;
12b72891
RGS
396 }
397 }
e64b1bd1 398 $Update->( $else );
12b72891
RGS
399 return $root;
400}
401
e64b1bd1
YO
402# my $optree= optree(%opts);
403#
404# Convert a trie to an optree, wrapper for _optree
405
406sub optree {
407 my $self= shift;
408 my %opt= @_;
409 my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
410 $opt{ret_type} ||= 'len';
411 my $test_type= $opt{type} eq 'cp' ? 'cp' : 'depth';
412 return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
12b72891
RGS
413}
414
e64b1bd1
YO
415# my $optree= generic_optree(%opts);
416#
417# build a "generic" optree out of the three 'low', 'latin1', 'utf8'
418# sets of strings, including a branch for handling the string type check.
419#
420
421sub generic_optree {
422 my $self= shift;
423 my %opt= @_;
424
425 $opt{ret_type} ||= 'len';
426 my $test_type= 'depth';
427 my $else= $opt{else} || 0;
428
429 my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
430 my $utf8= $self->make_trie( 'utf8', $opt{max_depth} );
431
432 $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
433 for $latin1, $utf8;
434
435 if ( $utf8 ) {
436 $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
437 } elsif ( $latin1 ) {
438 $else= __cond_join( "!( is_utf8 )", $latin1, $else );
439 }
440 my $low= $self->make_trie( 'low', $opt{max_depth} );
441 if ( $low ) {
442 $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
12b72891 443 }
e64b1bd1
YO
444
445 return $else;
12b72891
RGS
446}
447
e64b1bd1 448# length_optree()
12b72891 449#
e64b1bd1 450# create a string length guarded optree.
12b72891 451#
e64b1bd1
YO
452
453sub length_optree {
454 my $self= shift;
455 my %opt= @_;
456 my $type= $opt{type};
457
458 die "Can't do a length_optree on type 'cp', makes no sense."
459 if $type eq 'cp';
460
461 my ( @size, $method );
462
463 if ( $type eq 'generic' ) {
464 $method= 'generic_optree';
465 my %sizes= (
466 %{ $self->{size}{low} || {} },
467 %{ $self->{size}{latin1} || {} },
468 %{ $self->{size}{utf8} || {} }
469 );
470 @size= sort { $a <=> $b } keys %sizes;
471 } else {
472 $method= 'optree';
473 @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
12b72891 474 }
e64b1bd1
YO
475
476 my $else= ( $opt{else} ||= 0 );
477 for my $size ( @size ) {
478 my $optree= $self->$method( %opt, type => $type, max_depth => $size );
479 my $cond= "((e)-(s) > " . ( $size - 1 ).")";
480 $else= __cond_join( $cond, $optree, $else );
481 }
482 return $else;
12b72891
RGS
483}
484
e64b1bd1
YO
485# _cond_as_str
486# turn a list of conditions into a text expression
487# - merges ranges of conditions, and joins the result with ||
488sub _cond_as_str {
489 my ( $self, $op, $combine )= @_;
490 my $cond= $op->{vals};
491 my $test= $op->{test};
492 return "( $test )" if !defined $cond;
493
494 # rangify the list
495 my @ranges;
496 my $Update= sub {
497 if ( @ranges ) {
498 if ( $ranges[-1][0] == $ranges[-1][1] ) {
499 $ranges[-1]= $ranges[-1][0];
500 } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
501 $ranges[-1]= $ranges[-1][0];
502 push @ranges, $ranges[-1] + 1;
503 }
504 }
505 };
506 for my $cond ( @$cond ) {
507 if ( !@ranges || $cond != $ranges[-1][1] + 1 ) {
508 $Update->();
509 push @ranges, [ $cond, $cond ];
510 } else {
511 $ranges[-1][1]++;
512 }
513 }
514 $Update->();
515 return $self->_combine( $test, @ranges )
516 if $combine;
517 @ranges= map {
518 ref $_
519 ? sprintf(
520 "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
521 @$_ )
522 : sprintf( "$self->{val_fmt} == $test", $_ );
523 } @ranges;
524 return "( " . join( " || ", @ranges ) . " )";
12b72891
RGS
525}
526
e64b1bd1
YO
527# _combine
528# recursively turn a list of conditions into a fast break-out condition
529# used by _cond_as_str() for 'cp' type macros.
530sub _combine {
531 my ( $self, $test, @cond )= @_;
532 return if !@cond;
533 my $item= shift @cond;
534 my ( $cstr, $gtv );
535 if ( ref $item ) {
536 $cstr=
537 sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
538 @$item );
539 $gtv= sprintf "$self->{val_fmt}", $item->[1];
12b72891 540 } else {
e64b1bd1
YO
541 $cstr= sprintf( "$self->{val_fmt} == $test", $item );
542 $gtv= sprintf "$self->{val_fmt}", $item;
12b72891 543 }
e64b1bd1
YO
544 if ( @cond ) {
545 return "( $cstr || ( $gtv < $test &&\n"
546 . $self->_combine( $test, @cond ) . " ) )";
12b72891 547 } else {
e64b1bd1 548 return $cstr;
12b72891 549 }
e64b1bd1 550}
12b72891 551
e64b1bd1
YO
552# _render()
553# recursively convert an optree to text with reasonably neat formatting
554sub _render {
555 my ( $self, $op, $combine, $brace )= @_;
556 if ( !ref $op ) {
557 return $op;
12b72891 558 }
e64b1bd1
YO
559 my $cond= $self->_cond_as_str( $op, $combine );
560 my $yes= $self->_render( $op->{yes}, $combine, 1 );
561 my $no= $self->_render( $op->{no}, $combine, 0 );
562 return "( $cond )" if $yes eq '1' and $no eq '0';
563 my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
564 return "$lb$cond ? $yes : $no$rb"
565 if !ref( $op->{yes} ) && !ref( $op->{no} );
566 my $ind1= " " x 4;
567 my $ind= "\n" . ( $ind1 x $op->{depth} );
568
569 if ( ref $op->{yes} ) {
570 $yes= $ind . $ind1 . $yes;
571 } else {
572 $yes= " " . $yes;
573 }
574
575 return "$lb$cond ?$yes$ind: $no$rb";
12b72891 576}
32e6a07c 577
e64b1bd1
YO
578# $expr=render($op,$combine)
579#
580# convert an optree to text with reasonably neat formatting. If $combine
581# is true then the condition is created using "fast breakouts" which
582# produce uglier expressions that are more efficient for common case,
583# longer lists such as that resulting from type 'cp' output.
584# Currently only used for type 'cp' macros.
585sub render {
586 my ( $self, $op, $combine )= @_;
587 my $str= "( " . $self->_render( $op, $combine ) . " )";
588 return __clean( $str );
12b72891 589}
e64b1bd1
YO
590
591# make_macro
592# make a macro of a given type.
593# calls into make_trie and (generic_|length_)optree as needed
594# Opts are:
595# type : 'cp','generic','low','latin1','utf8','LATIN1','UTF8'
596# ret_type : 'cp' or 'len'
597# safe : add length guards to macro
598#
599# type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
600# in which case it defaults to 'cp' as well.
601#
602# it is illegal to do a type 'cp' macro on a pattern with multi-codepoint
603# sequences in it, as the generated macro will accept only a single codepoint
604# as an argument.
605#
606# returns the macro.
607
608
609sub make_macro {
610 my $self= shift;
611 my %opts= @_;
612 my $type= $opts{type} || 'generic';
613 die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
614 if $type eq 'cp'
615 and $self->{has_multi};
616 my $ret_type= $opts{ret_type} || ( $opts{type} eq 'cp' ? 'cp' : 'len' );
617 my $method;
618 if ( $opts{safe} ) {
619 $method= 'length_optree';
620 } elsif ( $type eq 'generic' ) {
621 $method= 'generic_optree';
622 } else {
623 $method= 'optree';
624 }
625 my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
626 my $text= $self->render( $optree, $type eq 'cp' );
627 my @args= $type eq 'cp' ? 'cp' : 's';
628 push @args, "e" if $opts{safe};
629 push @args, "is_utf8" if $type eq 'generic';
630 push @args, "len" if $ret_type eq 'both';
631 my $pfx= $ret_type eq 'both' ? 'what_len_' :
632 $ret_type eq 'cp' ? 'what_' : 'is_';
633 my $ext= $type eq 'generic' ? '' : '_' . lc( $type );
634 $ext .= "_safe" if $opts{safe};
635 my $argstr= join ",", @args;
636 return "/*** GENERATED CODE ***/\n"
637 . __macro( "#define $pfx$self->{op}$ext($argstr)\n$text" );
32e6a07c 638}
e64b1bd1
YO
639
640# if we arent being used as a module (highly likely) then process
641# the __DATA__ below and produce macros in regcharclass.h
642# if an argument is provided to the script then it is assumed to
643# be the path of the file to output to, if the arg is '-' outputs
644# to STDOUT.
645if ( !caller ) {
e64b1bd1 646 $|++;
8770da0e 647 my $path= shift @ARGV || "regcharclass.h";
e64b1bd1
YO
648 my $out_fh;
649 if ( $path eq '-' ) {
650 $out_fh= \*STDOUT;
651 } else {
29c22b52 652 $out_fh = open_new( $path );
e64b1bd1 653 }
8770da0e
NC
654 print $out_fh read_only_top( lang => 'C', by => $0,
655 file => 'regcharclass.h', style => '*',
2eee27d7 656 copyright => [2007, 2011] );
d10c72f2 657 print $out_fh "\n#ifndef H_REGCHARCLASS /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n\n";
12b72891 658
e64b1bd1
YO
659 my ( $op, $title, @txt, @types, @mods );
660 my $doit= sub {
661 return unless $op;
662 print $out_fh "/*\n\t$op: $title\n\n";
663 print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
664 my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
665
666 #die Dumper(\@types,\@mods);
667
668 foreach my $type_spec ( @types ) {
669 my ( $type, $ret )= split /-/, $type_spec;
670 $ret ||= 'len';
671 foreach my $mod ( @mods ) {
672 next if $mod eq 'safe' and $type eq 'cp';
673 my $macro= $obj->make_macro(
674 type => $type,
675 ret_type => $ret,
676 safe => $mod eq 'safe'
677 );
678 print $out_fh $macro, "\n";
679 }
32e6a07c 680 }
e64b1bd1
YO
681 };
682
683 while ( <DATA> ) {
684 s/^\s*#//;
685 next unless /\S/;
686 chomp;
687 if ( /^([A-Z]+)/ ) {
688 $doit->();
689 ( $op, $title )= split /\s*:\s*/, $_, 2;
690 @txt= ();
691 } elsif ( s/^=>// ) {
692 my ( $type, $modifier )= split /:/, $_;
693 @types= split ' ', $type;
694 @mods= split ' ', $modifier;
695 } else {
696 push @txt, "$_";
12b72891
RGS
697 }
698 }
e64b1bd1 699 $doit->();
d10c72f2
KW
700
701 print $out_fh "\n#endif /* H_REGCHARCLASS */\n";
702
8770da0e
NC
703 if($path eq '-') {
704 print $out_fh "/* ex: set ro: */\n";
705 } else {
706 read_only_bottom_close_and_rename($out_fh)
707 }
12b72891 708}
e64b1bd1
YO
709
710#
711# Valid types: generic, LATIN1, UTF8, low, latin1, utf8
712# default return value is octects read.
713# append -cp to make it codepoint matched.
714# modifiers come after the colon, valid possibilities
715# being 'fast' and 'safe'.
716#
05b688d9
KW
717# Accepts a single code point per line, prefaced by '0x'
718# or a range of two code points separated by a minus (and optional space)
719# or a single \p{} per line.
e90ac8de
KW
720#
721# This is no longer used, but retained in case it is needed some day. Put the
722# lines below under __DATA__
723# TRICKYFOLD: Problematic fold case letters. When adding to this list, also should add them to regcomp.c and fold_grind.t
724# => generic cp generic-cp generic-both :fast safe
725# 0x00DF # LATIN SMALL LETTER SHARP S
726# 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
727# 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
728# 0x1E9E # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
729# 0x1FD3 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
730# 0x1FE3 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
731
e64b1bd1 7321; # in the unlikely case we are being used as a module
12b72891
RGS
733
734__DATA__
735LNBREAK: Line Break: \R
e64b1bd1 736=> generic UTF8 LATIN1 :fast safe
12b72891 737"\x0D\x0A" # CRLF - Network (Windows) line ending
05b688d9 738\p{VertSpace}
12b72891
RGS
739
740HORIZWS: Horizontal Whitespace: \h \H
e64b1bd1 741=> generic UTF8 LATIN1 cp :fast safe
05b688d9 742\p{HorizSpace}
12b72891
RGS
743
744VERTWS: Vertical Whitespace: \v \V
e64b1bd1 745=> generic UTF8 LATIN1 cp :fast safe
05b688d9 746\p{VertSpace}