This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: White-space only
[perl5.git] / regen / regcharclass.pl
CommitLineData
e64b1bd1 1package CharClass::Matcher;
12b72891 2use strict;
8770da0e 3use 5.008;
12b72891 4use warnings;
e64b1bd1 5use warnings FATAL => 'all';
12b72891
RGS
6use Text::Wrap qw(wrap);
7use Encode;
8use Data::Dumper;
e64b1bd1
YO
9$Data::Dumper::Useqq= 1;
10our $hex_fmt= "0x%02X";
12b72891 11
8770da0e
NC
12require 'regen/regen_lib.pl';
13
ab84f958 14=head1 NAME
0ccab2bc 15
e64b1bd1 16CharClass::Matcher -- Generate C macros that match character classes efficiently
12b72891 17
e64b1bd1
YO
18=head1 SYNOPSIS
19
ab84f958 20 perl Porting/regcharclass.pl
e64b1bd1
YO
21
22=head1 DESCRIPTION
12b72891
RGS
23
24Dynamically generates macros for detecting special charclasses
e64b1bd1
YO
25in latin-1, utf8, and codepoint forms. Macros can be set to return
26the length (in bytes) of the matched codepoint, or the codepoint itself.
12b72891
RGS
27
28To regenerate regcharclass.h, run this script from perl-root. No arguments
29are necessary.
30
e64b1bd1 31Using WHATEVER as an example the following macros will be produced:
12b72891
RGS
32
33=over 4
34
e64b1bd1 35=item is_WHATEVER(s,is_utf8)
12b72891 36
e64b1bd1 37=item is_WHATEVER_safe(s,e,is_utf8)
12b72891 38
e64b1bd1 39Do a lookup as appropriate based on the is_utf8 flag. When possible
12b72891
RGS
40comparisons involving octect<128 are done before checking the is_utf8
41flag, hopefully saving time.
42
e64b1bd1 43=item is_WHATEVER_utf8(s)
12b72891 44
e64b1bd1 45=item is_WHATEVER_utf8_safe(s,e)
12b72891
RGS
46
47Do a lookup assuming the string is encoded in (normalized) UTF8.
48
e64b1bd1 49=item is_WHATEVER_latin1(s)
12b72891 50
e64b1bd1 51=item is_WHATEVER_latin1_safe(s,e)
12b72891
RGS
52
53Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
54
e64b1bd1 55=item is_WHATEVER_cp(cp)
12b72891 56
47e01c32 57Check to see if the string matches a given codepoint (hypothetically a
12b72891
RGS
58U32). The condition is constructed as as to "break out" as early as
59possible if the codepoint is out of range of the condition.
60
61IOW:
62
63 (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
64
65Thus if the character is X+1 only two comparisons will be done. Making
66matching lookups slower, but non-matching faster.
67
68=back
69
e64b1bd1
YO
70Additionally it is possible to generate C<what_> variants that return
71the codepoint read instead of the number of octets read, this can be
72done by suffixing '-cp' to the type description.
73
74=head2 CODE FORMAT
75
76perltidy -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
77
78
79=head1 AUTHOR
80
81Author: Yves Orton (demerphq) 2007
82
83=head1 BUGS
84
85No tests directly here (although the regex engine will fail tests
86if this code is broken). Insufficient documentation and no Getopts
87handler for using the module as a script.
88
89=head1 LICENSE
90
91You may distribute under the terms of either the GNU General Public
92License or the Artistic License, as specified in the README file.
93
12b72891
RGS
94=cut
95
e64b1bd1
YO
96# Sub naming convention:
97# __func : private subroutine, can not be called as a method
98# _func : private method, not meant for external use
99# func : public method.
100
101# private subs
102#-------------------------------------------------------------------------------
103#
104# ($cp,$n,$l,$u)=__uni_latin($str);
105#
47e01c32 106# Return a list of arrays, each of which when interpreted correctly
e64b1bd1
YO
107# represent the string in some given encoding with specific conditions.
108#
109# $cp - list of codepoints that make up the string.
110# $n - list of octets that make up the string if all codepoints < 128
111# $l - list of octets that make up the string in latin1 encoding if all
112# codepoints < 256, and at least one codepoint is >127.
113# $u - list of octets that make up the string in utf8 if any codepoint >127
114#
115# High CP | Defined
116#-----------+----------
117# 0 - 127 : $n
118# 128 - 255 : $l, $u
119# 256 - ... : $u
120#
121
122sub __uni_latin1 {
123 my $str= shift;
124 my $max= 0;
125 my @cp;
126 for my $ch ( split //, $str ) {
127 my $cp= ord $ch;
128 push @cp, $cp;
129 $max= $cp if $max < $cp;
130 }
131 my ( $n, $l, $u );
132 if ( $max < 128 ) {
133 $n= [@cp];
134 } else {
135 $l= [@cp] if $max && $max < 256;
136
137 my $copy= $str; # must copy string, FB_CROAK makes encode destructive
138 $u= eval { Encode::encode( "utf8", $copy, Encode::FB_CROAK ) };
dda856b2
YO
139 # $u is utf8 but with the utf8 flag OFF
140 # therefore "C*" gets us the values of the bytes involved.
141 $u= [ unpack "C*", $u ] if defined $u;
12b72891 142 }
e64b1bd1 143 return ( \@cp, $n, $l, $u );
12b72891
RGS
144}
145
12b72891 146#
e64b1bd1
YO
147# $clean= __clean($expr);
148#
149# Cleanup a ternary expression, removing unnecessary parens and apply some
150# simplifications using regexes.
151#
152
153sub __clean {
154 my ( $expr )= @_;
155 our $parens;
156 $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
157
158 #print "$parens\n$expr\n";
159 1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
160 1 while $expr =~ s/ \( \s* ($parens) \s* \? \s*
161 \( \s* ($parens) \s* \? \s* ($parens|[^:]+?) \s* : \s* ($parens|[^)]+?) \s* \)
162 \s* : \s* \4 \s* \)/( ( $1 && $2 ) ? $3 : 0 )/gx;
163 return $expr;
12b72891
RGS
164}
165
e64b1bd1
YO
166#
167# $text= __macro(@args);
168# Join args together by newlines, and then neatly add backslashes to the end
169# of every line as expected by the C pre-processor for #define's.
170#
171
172sub __macro {
173 my $str= join "\n", @_;
174 $str =~ s/\s*$//;
175 my @lines= map { s/\s+$//; s/\t/ /g; $_ } split /\n/, $str;
176 my $last= pop @lines;
177 $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
178 1 while $str =~ s/^(\t*) {8}/$1\t/gm;
179 return $str . "\n";
12b72891
RGS
180}
181
e64b1bd1
YO
182#
183# my $op=__incrdepth($op);
184#
185# take an 'op' hashref and add one to it and all its childrens depths.
186#
187
188sub __incrdepth {
189 my $op= shift;
190 return unless ref $op;
191 $op->{depth} += 1;
192 __incrdepth( $op->{yes} );
193 __incrdepth( $op->{no} );
194 return $op;
195}
196
197# join two branches of an opcode together with a condition, incrementing
198# the depth on the yes branch when we do so.
199# returns the new root opcode of the tree.
200sub __cond_join {
201 my ( $cond, $yes, $no )= @_;
202 return {
203 test => $cond,
204 yes => __incrdepth( $yes ),
205 no => $no,
206 depth => 0,
207 };
208}
209
210# Methods
211
212# constructor
213#
214# my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
215#
216# Create a new CharClass::Matcher object by parsing the text in
217# the txt array. Currently applies the following rules:
218#
219# Element starts with C<0x>, line is evaled the result treated as
220# a number which is passed to chr().
221#
222# Element starts with C<">, line is evaled and the result treated
223# as a string.
224#
225# Each string is then stored in the 'strs' subhash as a hash record
226# made up of the results of __uni_latin1, using the keynames
227# 'low','latin1','utf8', as well as the synthesized 'LATIN1' and
228# 'UTF8' which hold a merge of 'low' and their lowercase equivelents.
229#
230# Size data is tracked per type in the 'size' subhash.
231#
232# Return an object
233#
12b72891
RGS
234sub new {
235 my $class= shift;
e64b1bd1
YO
236 my %opt= @_;
237 for ( qw(op txt) ) {
238 die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
239 if !exists $opt{$_};
240 }
241
242 my $self= bless {
243 op => $opt{op},
244 title => $opt{title} || '',
245 }, $class;
246 foreach my $txt ( @{ $opt{txt} } ) {
247 my $str= $txt;
248 if ( $str =~ /^[""]/ ) {
249 $str= eval $str;
250 } elsif ( $str =~ /^0x/ ) {
251 $str= chr eval $str;
252 } elsif ( /\S/ ) {
47e01c32 253 die "Unparsable line: $txt\n";
12b72891 254 } else {
e64b1bd1 255 next;
12b72891 256 }
e64b1bd1
YO
257 my ( $cp, $low, $latin1, $utf8 )= __uni_latin1( $str );
258 my $UTF8= $low || $utf8;
259 my $LATIN1= $low || $latin1;
dda856b2
YO
260 #die Dumper($txt,$cp,$low,$latin1,$utf8)
261 # if $txt=~/NEL/ or $utf8 and @$utf8>3;
e64b1bd1
YO
262
263 @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 cp UTF8 LATIN1 )}=
264 ( $str, $txt, $low, $utf8, $latin1, $cp, $UTF8, $LATIN1 );
265 my $rec= $self->{strs}{$str};
266 foreach my $key ( qw(low utf8 latin1 cp UTF8 LATIN1) ) {
267 $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
268 if $self->{strs}{$str}{$key};
12b72891 269 }
e64b1bd1
YO
270 $self->{has_multi} ||= @$cp > 1;
271 $self->{has_ascii} ||= $latin1 && @$latin1;
272 $self->{has_low} ||= $low && @$low;
273 $self->{has_high} ||= !$low && !$latin1;
12b72891 274 }
e64b1bd1
YO
275 $self->{val_fmt}= $hex_fmt;
276 $self->{count}= 0 + keys %{ $self->{strs} };
12b72891
RGS
277 return $self;
278}
279
e64b1bd1 280# my $trie = make_trie($type,$maxlen);
12b72891 281#
47e01c32 282# using the data stored in the object build a trie of a specific type,
e64b1bd1
YO
283# and with specific maximum depth. The trie is made up the elements of
284# the given types array for each string in the object (assuming it is
285# not too long.)
286#
47e01c32 287# returns the trie, or undef if there was no relevant data in the object.
e64b1bd1
YO
288#
289
290sub make_trie {
291 my ( $self, $type, $maxlen )= @_;
292
293 my $strs= $self->{strs};
294 my %trie;
295 foreach my $rec ( values %$strs ) {
296 die "panic: unknown type '$type'"
297 if !exists $rec->{$type};
298 my $dat= $rec->{$type};
299 next unless $dat;
300 next if $maxlen && @$dat > $maxlen;
301 my $node= \%trie;
302 foreach my $elem ( @$dat ) {
303 $node->{$elem} ||= {};
304 $node= $node->{$elem};
12b72891 305 }
e64b1bd1 306 $node->{''}= $rec->{str};
12b72891 307 }
e64b1bd1 308 return 0 + keys( %trie ) ? \%trie : undef;
12b72891
RGS
309}
310
e64b1bd1
YO
311# my $optree= _optree()
312#
313# recursively convert a trie to an optree where every node represents
314# an if else branch.
12b72891 315#
12b72891 316#
12b72891 317
e64b1bd1
YO
318sub _optree {
319 my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
320 return unless defined $trie;
321 if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
322 die "Can't do 'cp' optree from multi-codepoint strings";
12b72891 323 }
e64b1bd1
YO
324 $ret_type ||= 'len';
325 $else= 0 unless defined $else;
326 $depth= 0 unless defined $depth;
327
328 my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
329 if ( $trie->{''} ) {
330 if ( $ret_type eq 'cp' ) {
331 $else= $self->{strs}{ $trie->{''} }{cp}[0];
332 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
333 } elsif ( $ret_type eq 'len' ) {
334 $else= $depth;
335 } elsif ( $ret_type eq 'both') {
336 $else= $self->{strs}{ $trie->{''} }{cp}[0];
337 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
338 $else= "len=$depth, $else";
12b72891 339 }
e64b1bd1
YO
340 }
341 return $else if !@conds;
342 my $node= {};
343 my $root= $node;
344 my ( $yes_res, $as_code, @cond );
345 my $test= $test_type eq 'cp' ? "cp" : "((U8*)s)[$depth]";
346 my $Update= sub {
347 $node->{vals}= [@cond];
348 $node->{test}= $test;
349 $node->{yes}= $yes_res;
350 $node->{depth}= $depth;
351 $node->{no}= shift;
352 };
353 while ( @conds ) {
354 my $cond= shift @conds;
355 my $res=
356 $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else,
357 $depth + 1 );
358 my $res_code= Dumper( $res );
359 if ( !$yes_res || $res_code ne $as_code ) {
360 if ( $yes_res ) {
361 $Update->( {} );
362 $node= $node->{no};
363 }
364 ( $yes_res, $as_code )= ( $res, $res_code );
365 @cond= ( $cond );
12b72891 366 } else {
e64b1bd1 367 push @cond, $cond;
12b72891
RGS
368 }
369 }
e64b1bd1 370 $Update->( $else );
12b72891
RGS
371 return $root;
372}
373
e64b1bd1
YO
374# my $optree= optree(%opts);
375#
376# Convert a trie to an optree, wrapper for _optree
377
378sub optree {
379 my $self= shift;
380 my %opt= @_;
381 my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
382 $opt{ret_type} ||= 'len';
383 my $test_type= $opt{type} eq 'cp' ? 'cp' : 'depth';
384 return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
12b72891
RGS
385}
386
e64b1bd1
YO
387# my $optree= generic_optree(%opts);
388#
389# build a "generic" optree out of the three 'low', 'latin1', 'utf8'
390# sets of strings, including a branch for handling the string type check.
391#
392
393sub generic_optree {
394 my $self= shift;
395 my %opt= @_;
396
397 $opt{ret_type} ||= 'len';
398 my $test_type= 'depth';
399 my $else= $opt{else} || 0;
400
401 my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
402 my $utf8= $self->make_trie( 'utf8', $opt{max_depth} );
403
404 $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
405 for $latin1, $utf8;
406
407 if ( $utf8 ) {
408 $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
409 } elsif ( $latin1 ) {
410 $else= __cond_join( "!( is_utf8 )", $latin1, $else );
411 }
412 my $low= $self->make_trie( 'low', $opt{max_depth} );
413 if ( $low ) {
414 $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
12b72891 415 }
e64b1bd1
YO
416
417 return $else;
12b72891
RGS
418}
419
e64b1bd1 420# length_optree()
12b72891 421#
e64b1bd1 422# create a string length guarded optree.
12b72891 423#
e64b1bd1
YO
424
425sub length_optree {
426 my $self= shift;
427 my %opt= @_;
428 my $type= $opt{type};
429
430 die "Can't do a length_optree on type 'cp', makes no sense."
431 if $type eq 'cp';
432
433 my ( @size, $method );
434
435 if ( $type eq 'generic' ) {
436 $method= 'generic_optree';
437 my %sizes= (
438 %{ $self->{size}{low} || {} },
439 %{ $self->{size}{latin1} || {} },
440 %{ $self->{size}{utf8} || {} }
441 );
442 @size= sort { $a <=> $b } keys %sizes;
443 } else {
444 $method= 'optree';
445 @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
12b72891 446 }
e64b1bd1
YO
447
448 my $else= ( $opt{else} ||= 0 );
449 for my $size ( @size ) {
450 my $optree= $self->$method( %opt, type => $type, max_depth => $size );
451 my $cond= "((e)-(s) > " . ( $size - 1 ).")";
452 $else= __cond_join( $cond, $optree, $else );
453 }
454 return $else;
12b72891
RGS
455}
456
e64b1bd1
YO
457# _cond_as_str
458# turn a list of conditions into a text expression
459# - merges ranges of conditions, and joins the result with ||
460sub _cond_as_str {
461 my ( $self, $op, $combine )= @_;
462 my $cond= $op->{vals};
463 my $test= $op->{test};
464 return "( $test )" if !defined $cond;
465
466 # rangify the list
467 my @ranges;
468 my $Update= sub {
469 if ( @ranges ) {
470 if ( $ranges[-1][0] == $ranges[-1][1] ) {
471 $ranges[-1]= $ranges[-1][0];
472 } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
473 $ranges[-1]= $ranges[-1][0];
474 push @ranges, $ranges[-1] + 1;
475 }
476 }
477 };
478 for my $cond ( @$cond ) {
479 if ( !@ranges || $cond != $ranges[-1][1] + 1 ) {
480 $Update->();
481 push @ranges, [ $cond, $cond ];
482 } else {
483 $ranges[-1][1]++;
484 }
485 }
486 $Update->();
487 return $self->_combine( $test, @ranges )
488 if $combine;
489 @ranges= map {
490 ref $_
491 ? sprintf(
492 "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
493 @$_ )
494 : sprintf( "$self->{val_fmt} == $test", $_ );
495 } @ranges;
496 return "( " . join( " || ", @ranges ) . " )";
12b72891
RGS
497}
498
e64b1bd1
YO
499# _combine
500# recursively turn a list of conditions into a fast break-out condition
501# used by _cond_as_str() for 'cp' type macros.
502sub _combine {
503 my ( $self, $test, @cond )= @_;
504 return if !@cond;
505 my $item= shift @cond;
506 my ( $cstr, $gtv );
507 if ( ref $item ) {
508 $cstr=
509 sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
510 @$item );
511 $gtv= sprintf "$self->{val_fmt}", $item->[1];
12b72891 512 } else {
e64b1bd1
YO
513 $cstr= sprintf( "$self->{val_fmt} == $test", $item );
514 $gtv= sprintf "$self->{val_fmt}", $item;
12b72891 515 }
e64b1bd1
YO
516 if ( @cond ) {
517 return "( $cstr || ( $gtv < $test &&\n"
518 . $self->_combine( $test, @cond ) . " ) )";
12b72891 519 } else {
e64b1bd1 520 return $cstr;
12b72891 521 }
e64b1bd1 522}
12b72891 523
e64b1bd1
YO
524# _render()
525# recursively convert an optree to text with reasonably neat formatting
526sub _render {
527 my ( $self, $op, $combine, $brace )= @_;
528 if ( !ref $op ) {
529 return $op;
12b72891 530 }
e64b1bd1
YO
531 my $cond= $self->_cond_as_str( $op, $combine );
532 my $yes= $self->_render( $op->{yes}, $combine, 1 );
533 my $no= $self->_render( $op->{no}, $combine, 0 );
534 return "( $cond )" if $yes eq '1' and $no eq '0';
535 my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
536 return "$lb$cond ? $yes : $no$rb"
537 if !ref( $op->{yes} ) && !ref( $op->{no} );
538 my $ind1= " " x 4;
539 my $ind= "\n" . ( $ind1 x $op->{depth} );
540
541 if ( ref $op->{yes} ) {
542 $yes= $ind . $ind1 . $yes;
543 } else {
544 $yes= " " . $yes;
545 }
546
547 return "$lb$cond ?$yes$ind: $no$rb";
12b72891 548}
32e6a07c 549
e64b1bd1
YO
550# $expr=render($op,$combine)
551#
552# convert an optree to text with reasonably neat formatting. If $combine
553# is true then the condition is created using "fast breakouts" which
554# produce uglier expressions that are more efficient for common case,
555# longer lists such as that resulting from type 'cp' output.
556# Currently only used for type 'cp' macros.
557sub render {
558 my ( $self, $op, $combine )= @_;
559 my $str= "( " . $self->_render( $op, $combine ) . " )";
560 return __clean( $str );
12b72891 561}
e64b1bd1
YO
562
563# make_macro
564# make a macro of a given type.
565# calls into make_trie and (generic_|length_)optree as needed
566# Opts are:
567# type : 'cp','generic','low','latin1','utf8','LATIN1','UTF8'
568# ret_type : 'cp' or 'len'
569# safe : add length guards to macro
570#
571# type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
572# in which case it defaults to 'cp' as well.
573#
574# it is illegal to do a type 'cp' macro on a pattern with multi-codepoint
575# sequences in it, as the generated macro will accept only a single codepoint
576# as an argument.
577#
578# returns the macro.
579
580
581sub make_macro {
582 my $self= shift;
583 my %opts= @_;
584 my $type= $opts{type} || 'generic';
585 die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
586 if $type eq 'cp'
587 and $self->{has_multi};
588 my $ret_type= $opts{ret_type} || ( $opts{type} eq 'cp' ? 'cp' : 'len' );
589 my $method;
590 if ( $opts{safe} ) {
591 $method= 'length_optree';
592 } elsif ( $type eq 'generic' ) {
593 $method= 'generic_optree';
594 } else {
595 $method= 'optree';
596 }
597 my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
598 my $text= $self->render( $optree, $type eq 'cp' );
599 my @args= $type eq 'cp' ? 'cp' : 's';
600 push @args, "e" if $opts{safe};
601 push @args, "is_utf8" if $type eq 'generic';
602 push @args, "len" if $ret_type eq 'both';
603 my $pfx= $ret_type eq 'both' ? 'what_len_' :
604 $ret_type eq 'cp' ? 'what_' : 'is_';
605 my $ext= $type eq 'generic' ? '' : '_' . lc( $type );
606 $ext .= "_safe" if $opts{safe};
607 my $argstr= join ",", @args;
608 return "/*** GENERATED CODE ***/\n"
609 . __macro( "#define $pfx$self->{op}$ext($argstr)\n$text" );
32e6a07c 610}
e64b1bd1
YO
611
612# if we arent being used as a module (highly likely) then process
613# the __DATA__ below and produce macros in regcharclass.h
614# if an argument is provided to the script then it is assumed to
615# be the path of the file to output to, if the arg is '-' outputs
616# to STDOUT.
617if ( !caller ) {
e64b1bd1 618 $|++;
8770da0e 619 my $path= shift @ARGV || "regcharclass.h";
e64b1bd1
YO
620 my $out_fh;
621 if ( $path eq '-' ) {
622 $out_fh= \*STDOUT;
623 } else {
29c22b52 624 $out_fh = open_new( $path );
e64b1bd1 625 }
8770da0e
NC
626 print $out_fh read_only_top( lang => 'C', by => $0,
627 file => 'regcharclass.h', style => '*',
628 copyright => [2007] );
12b72891 629
e64b1bd1
YO
630 my ( $op, $title, @txt, @types, @mods );
631 my $doit= sub {
632 return unless $op;
633 print $out_fh "/*\n\t$op: $title\n\n";
634 print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
635 my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
636
637 #die Dumper(\@types,\@mods);
638
639 foreach my $type_spec ( @types ) {
640 my ( $type, $ret )= split /-/, $type_spec;
641 $ret ||= 'len';
642 foreach my $mod ( @mods ) {
643 next if $mod eq 'safe' and $type eq 'cp';
644 my $macro= $obj->make_macro(
645 type => $type,
646 ret_type => $ret,
647 safe => $mod eq 'safe'
648 );
649 print $out_fh $macro, "\n";
650 }
32e6a07c 651 }
e64b1bd1
YO
652 };
653
654 while ( <DATA> ) {
655 s/^\s*#//;
656 next unless /\S/;
657 chomp;
658 if ( /^([A-Z]+)/ ) {
659 $doit->();
660 ( $op, $title )= split /\s*:\s*/, $_, 2;
661 @txt= ();
662 } elsif ( s/^=>// ) {
663 my ( $type, $modifier )= split /:/, $_;
664 @types= split ' ', $type;
665 @mods= split ' ', $modifier;
666 } else {
667 push @txt, "$_";
12b72891
RGS
668 }
669 }
e64b1bd1 670 $doit->();
8770da0e
NC
671 if($path eq '-') {
672 print $out_fh "/* ex: set ro: */\n";
673 } else {
674 read_only_bottom_close_and_rename($out_fh)
675 }
12b72891 676}
e64b1bd1
YO
677
678#
679# Valid types: generic, LATIN1, UTF8, low, latin1, utf8
680# default return value is octects read.
681# append -cp to make it codepoint matched.
682# modifiers come after the colon, valid possibilities
683# being 'fast' and 'safe'.
684#
6851; # in the unlikely case we are being used as a module
12b72891
RGS
686
687__DATA__
688LNBREAK: Line Break: \R
e64b1bd1 689=> generic UTF8 LATIN1 :fast safe
12b72891
RGS
690"\x0D\x0A" # CRLF - Network (Windows) line ending
6910x0A # LF | LINE FEED
6920x0B # VT | VERTICAL TAB
6930x0C # FF | FORM FEED
6940x0D # CR | CARRIAGE RETURN
6950x85 # NEL | NEXT LINE
6960x2028 # LINE SEPARATOR
6970x2029 # PARAGRAPH SEPARATOR
698
699HORIZWS: Horizontal Whitespace: \h \H
e64b1bd1 700=> generic UTF8 LATIN1 cp :fast safe
12b72891
RGS
7010x09 # HT
7020x20 # SPACE
7030xa0 # NBSP
7040x1680 # OGHAM SPACE MARK
7050x180e # MONGOLIAN VOWEL SEPARATOR
7060x2000 # EN QUAD
7070x2001 # EM QUAD
7080x2002 # EN SPACE
7090x2003 # EM SPACE
7100x2004 # THREE-PER-EM SPACE
7110x2005 # FOUR-PER-EM SPACE
7120x2006 # SIX-PER-EM SPACE
7130x2007 # FIGURE SPACE
7140x2008 # PUNCTUATION SPACE
7150x2009 # THIN SPACE
7160x200A # HAIR SPACE
7170x202f # NARROW NO-BREAK SPACE
7180x205f # MEDIUM MATHEMATICAL SPACE
7190x3000 # IDEOGRAPHIC SPACE
720
721VERTWS: Vertical Whitespace: \v \V
e64b1bd1 722=> generic UTF8 LATIN1 cp :fast safe
12b72891
RGS
7230x0A # LF
7240x0B # VT
7250x0C # FF
7260x0D # CR
7270x85 # NEL
7280x2028 # LINE SEPARATOR
7290x2029 # PARAGRAPH SEPARATOR
730
e64b1bd1 731
dddecdc7 732TRICKYFOLD: Problematic fold case letters. When adding to this list, also should add them to regcomp.c and fold_grind.t
e64b1bd1 733=> generic cp generic-cp generic-both :fast safe
90826b5c 7340x00DF # LATIN SMALL LETTER SHARP S
32e6a07c
YO
7350x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
7360x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
90826b5c
KW
7370x1E9E # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
7380x1FD3 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
7390x1FE3 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0