perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	#!perl
	2	package CharClass::Matcher;
	3	use strict;
	4	use 5.008;
	5	use warnings;
	6	use warnings FATAL => 'all';
	7	use Text::Wrap qw(wrap);
	8	use Data::Dumper;
	9	$Data::Dumper::Useqq= 1;
	10	our $hex_fmt= "0x%02X";
	11
	12	sub ASCII_PLATFORM { (ord('A') == 65) }
	13
	14	require 'regen/regen_lib.pl';
	15
	16	=head1 NAME
	17
	18	CharClass::Matcher -- Generate C macros that match character classes efficiently
	19
	20	=head1 SYNOPSIS
	21
	22	perl Porting/regcharclass.pl
	23
	24	=head1 DESCRIPTION
	25
	26	Dynamically generates macros for detecting special charclasses
	27	in latin-1, utf8, and codepoint forms. Macros can be set to return
	28	the length (in bytes) of the matched codepoint, and/or the codepoint itself.
	29
	30	To regenerate F<regcharclass.h>, run this script from perl-root. No arguments
	31	are necessary.
	32
	33	Using WHATEVER as an example the following macros can be produced, depending
	34	on the input parameters (how to get each is described by internal comments at
	35	the C<__DATA__> line):
	36
	37	=over 4
	38
	39	=item C<is_WHATEVER(s,is_utf8)>
	40
	41	=item C<is_WHATEVER_safe(s,e,is_utf8)>
	42
	43	Do a lookup as appropriate based on the C<is_utf8> flag. When possible
	44	comparisons involving octect<128 are done before checking the C<is_utf8>
	45	flag, hopefully saving time.
	46
	47	The version without the C<_safe> suffix should be used only when the input is
	48	known to be well-formed.
	49
	50	=item C<is_WHATEVER_utf8(s)>
	51
	52	=item C<is_WHATEVER_utf8_safe(s,e)>
	53
	54	Do a lookup assuming the string is encoded in (normalized) UTF8.
	55
	56	The version without the C<_safe> suffix should be used only when the input is
	57	known to be well-formed.
	58
	59	=item C<is_WHATEVER_latin1(s)>
	60
	61	=item C<is_WHATEVER_latin1_safe(s,e)>
	62
	63	Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
	64
	65	The version without the C<_safe> suffix should be used only when it is known
	66	that C<s> contains at least one character.
	67
	68	=item C<is_WHATEVER_cp(cp)>
	69
	70	Check to see if the string matches a given codepoint (hypothetically a
	71	U32). The condition is constructed as as to "break out" as early as
	72	possible if the codepoint is out of range of the condition.
	73
	74	IOW:
	75
	76	(cp==X \|\| (cp>X && (cp==Y \|\| (cp>Y && ...))))
	77
	78	Thus if the character is X+1 only two comparisons will be done. Making
	79	matching lookups slower, but non-matching faster.
	80
	81	=item C<what_len_WHATEVER_FOO(arg1, ..., len)>
	82
	83	A variant form of each of the macro types described above can be generated, in
	84	which the code point is returned by the macro, and an extra parameter (in the
	85	final position) is added, which is a pointer for the macro to set the byte
	86	length of the returned code point.
	87
	88	These forms all have a C<what_len> prefix instead of the C<is_>, for example
	89	C<what_len_WHATEVER_safe(s,e,is_utf8,len)> and
	90	C<what_len_WHATEVER_utf8(s,len)>.
	91
	92	These forms should not be used I<except> on small sets of mostly widely
	93	separated code points; otherwise the code generated is inefficient. For these
	94	cases, it is best to use the C<is_> forms, and then find the code point with
	95	C<utf8_to_uvchr_buf>(). This program can fail with a "deep recursion"
	96	message on the worst of the inappropriate sets. Examine the generated macro
	97	to see if it is acceptable.
	98
	99	=item C<what_WHATEVER_FOO(arg1, ...)>
	100
	101	A variant form of each of the C<is_> macro types described above can be generated, in
	102	which the code point and not the length is returned by the macro. These have
	103	the same caveat as L</what_len_WHATEVER_FOO(arg1, ..., len)>, plus they should
	104	not be used where the set contains a NULL, as 0 is returned for two different
	105	cases: a) the set doesn't include the input code point; b) the set does
	106	include it, and it is a NULL.
	107
	108	=back
	109
	110	=head2 CODE FORMAT
	111
	112	perltidy -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
	113
	114
	115	=head1 AUTHOR
	116
	117	Author: Yves Orton (demerphq) 2007. Maintained by Perl5 Porters.
	118
	119	=head1 BUGS
	120
	121	No tests directly here (although the regex engine will fail tests
	122	if this code is broken). Insufficient documentation and no Getopts
	123	handler for using the module as a script.
	124
	125	=head1 LICENSE
	126
	127	You may distribute under the terms of either the GNU General Public
	128	License or the Artistic License, as specified in the README file.
	129
	130	=cut
	131
	132	# Sub naming convention:
	133	# __func : private subroutine, can not be called as a method
	134	# _func : private method, not meant for external use
	135	# func : public method.
	136
	137	# private subs
	138	#-------------------------------------------------------------------------------
	139	#
	140	# ($cp,$n,$l,$u)=__uni_latin($str);
	141	#
	142	# Return a list of arrays, each of which when interpreted correctly
	143	# represent the string in some given encoding with specific conditions.
	144	#
	145	# $cp - list of codepoints that make up the string.
	146	# $n - list of octets that make up the string if all codepoints are invariant
	147	# regardless of if the string is in UTF-8 or not.
	148	# $l - list of octets that make up the string in latin1 encoding if all
	149	# codepoints < 256, and at least one codepoint is UTF-8 variant.
	150	# $u - list of octets that make up the string in utf8 if any codepoint is
	151	# UTF-8 variant
	152	#
	153	# High CP \| Defined
	154	#-----------+----------
	155	# 0 - 127 : $n (127/128 are the values for ASCII platforms)
	156	# 128 - 255 : $l, $u
	157	# 256 - ... : $u
	158	#
	159
	160	sub __uni_latin1 {
	161	my $str= shift;
	162	my $max= 0;
	163	my @cp;
	164	my $only_has_invariants = 1;
	165	for my $ch ( split //, $str ) {
	166	my $cp= ord $ch;
	167	push @cp, $cp;
	168	$max= $cp if $max < $cp;
	169	if (! ASCII_PLATFORM && $only_has_invariants) {
	170	if ($cp > 255) {
	171	$only_has_invariants = 0;
	172	}
	173	else {
	174	my $temp = chr($cp);
	175	utf8::upgrade($temp);
	176	my @utf8 = unpack "U0C*", $temp;
	177	$only_has_invariants = (@utf8 == 1 && $utf8[0] == $cp);
	178	}
	179	}
	180	}
	181	my ( $n, $l, $u );
	182	$only_has_invariants = $max < 128 if ASCII_PLATFORM;
	183	if ($only_has_invariants) {
	184	$n= [@cp];
	185	} else {
	186	$l= [@cp] if $max && $max < 256;
	187
	188	$u= $str;
	189	utf8::upgrade($u);
	190	$u= [ unpack "U0C*", $u ] if defined $u;
	191	}
	192	return ( \@cp, $n, $l, $u );
	193	}
	194
	195	#
	196	# $clean= __clean($expr);
	197	#
	198	# Cleanup a ternary expression, removing unnecessary parens and apply some
	199	# simplifications using regexes.
	200	#
	201
	202	sub __clean {
	203	my ( $expr )= @_;
	204
	205	#return $expr;
	206
	207	our $parens;
	208	$parens= qr/ (?> $ (?> (?: (?> [^()]+ ) \| (??{ $parens }) )* ) $ ) /x;
	209
	210	## remove redundant parens
	211	1 while $expr =~ s/ $ \s* ( $parens ) \s* $ /$1/gx;
	212
	213
	214	# repeatedly simplify conditions like
	215	# ( (cond1) ? ( (cond2) ? X : Y ) : Y )
	216	# into
	217	# ( ( (cond1) && (cond2) ) ? X : Y )
	218	# Also similarly handles expressions like:
	219	# : (cond1) ? ( (cond2) ? X : Y ) : Y )
	220	# Note the inclusion of the close paren in ([:()]) and the open paren in ([()]) is
	221	# purely to ensure we have a balanced set of parens in the expression which makes
	222	# it easier to understand the pattern in an editor that understands paren's, we do
	223	# not expect either of these cases to actually fire. - Yves
	224	1 while $expr =~ s/
	225	([:()]) \s*
	226	($parens) \s*
	227	\? \s*
	228	\( \s* ($parens) \s*
	229	\? \s* ($parens\|[^()?:\s]+?) \s*
	230	: \s* ($parens\|[^()?:\s]+?) \s*
	231	\) \s*
	232	: \s* \5 \s*
	233	([()])
	234	/$1 ( $2 && $3 ) ? $4 : $5 $6/gx;
	235
	236	return $expr;
	237	}
	238
	239	#
	240	# $text= __macro(@args);
	241	# Join args together by newlines, and then neatly add backslashes to the end
	242	# of every line as expected by the C pre-processor for #define's.
	243	#
	244
	245	sub __macro {
	246	my $str= join "\n", @_;
	247	$str =~ s/\s*$//;
	248	my @lines= map { s/\s+$//; s/\t/ /g; $_ } split /\n/, $str;
	249	my $last= pop @lines;
	250	$str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
	251	1 while $str =~ s/^(\t*) {8}/$1\t/gm;
	252	return $str . "\n";
	253	}
	254
	255	#
	256	# my $op=__incrdepth($op);
	257	#
	258	# take an 'op' hashref and add one to it and all its childrens depths.
	259	#
	260
	261	sub __incrdepth {
	262	my $op= shift;
	263	return unless ref $op;
	264	$op->{depth} += 1;
	265	__incrdepth( $op->{yes} );
	266	__incrdepth( $op->{no} );
	267	return $op;
	268	}
	269
	270	# join two branches of an opcode together with a condition, incrementing
	271	# the depth on the yes branch when we do so.
	272	# returns the new root opcode of the tree.
	273	sub __cond_join {
	274	my ( $cond, $yes, $no )= @_;
	275	return {
	276	test => $cond,
	277	yes => __incrdepth( $yes ),
	278	no => $no,
	279	depth => 0,
	280	};
	281	}
	282
	283	# Methods
	284
	285	# constructor
	286	#
	287	# my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
	288	#
	289	# Create a new CharClass::Matcher object by parsing the text in
	290	# the txt array. Currently applies the following rules:
	291	#
	292	# Element starts with C<0x>, line is evaled the result treated as
	293	# a number which is passed to chr().
	294	#
	295	# Element starts with C<">, line is evaled and the result treated
	296	# as a string.
	297	#
	298	# Each string is then stored in the 'strs' subhash as a hash record
	299	# made up of the results of __uni_latin1, using the keynames
	300	# 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and
	301	# 'UTF8' which hold a merge of 'low' and their lowercase equivelents.
	302	#
	303	# Size data is tracked per type in the 'size' subhash.
	304	#
	305	# Return an object
	306	#
	307	sub new {
	308	my $class= shift;
	309	my %opt= @_;
	310	for ( qw(op txt) ) {
	311	die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
	312	if !exists $opt{$_};
	313	}
	314
	315	my $self= bless {
	316	op => $opt{op},
	317	title => $opt{title} \|\| '',
	318	}, $class;
	319	foreach my $txt ( @{ $opt{txt} } ) {
	320	my $str= $txt;
	321	if ( $str =~ /^[""]/ ) {
	322	$str= eval $str;
	323	} elsif ($str =~ / - /x ) { # A range: Replace this element on the
	324	# list with its expansion
	325	my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
	326	die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'" if ! defined $lower \|\| ! defined $upper;
	327	foreach my $cp (hex $lower .. hex $upper) {
	328	push @{$opt{txt}}, sprintf "0x%X", $cp;
	329	}
	330	next;
	331	} elsif ($str =~ s/ ^ N (?= 0x ) //x ) {
	332	# Otherwise undocumented, a leading N means is already in the
	333	# native character set; don't convert.
	334	$str= chr eval $str;
	335	} elsif ( $str =~ /^0x/ ) {
	336	$str= eval $str;
	337
	338	# Convert from Unicode/ASCII to native, if necessary
	339	$str = utf8::unicode_to_native($str) if ! ASCII_PLATFORM
	340	&& $str <= 0xFF;
	341	$str = chr $str;
	342	} elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
	343	my $property = $1;
	344	use Unicode::UCD qw(prop_invlist);
	345
	346	my @invlist = prop_invlist($property, '_perl_core_internal_ok');
	347	if (! @invlist) {
	348
	349	# An empty return could mean an unknown property, or merely
	350	# that it is empty. Call in scalar context to differentiate
	351	my $count = prop_invlist($property, '_perl_core_internal_ok');
	352	die "$property not found" unless defined $count;
	353	}
	354
	355	# Replace this element on the list with the property's expansion
	356	for (my $i = 0; $i < @invlist; $i += 2) {
	357	foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) {
	358
	359	# prop_invlist() returns native values; add leading 'N'
	360	# to indicate that.
	361	push @{$opt{txt}}, sprintf "N0x%X", $cp;
	362	}
	363	}
	364	next;
	365	} elsif ($str =~ / ^ do \s+ ( .* ) /x) {
	366	die "do '$1' failed: $!$@" if ! do $1 or $@;
	367	next;
	368	} elsif ($str =~ / ^ & \s* ( .* ) /x) { # user-furnished sub() call
	369	my @results = eval "$1";
	370	die "eval '$1' failed: $@" if $@;
	371	push @{$opt{txt}}, @results;
	372	next;
	373	} else {
	374	die "Unparsable line: $txt\n";
	375	}
	376	my ( $cp, $low, $latin1, $utf8 )= __uni_latin1( $str );
	377	my $UTF8= $low \|\| $utf8;
	378	my $LATIN1= $low \|\| $latin1;
	379	my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
	380	#die Dumper($txt,$cp,$low,$latin1,$utf8)
	381	# if $txt=~/NEL/ or $utf8 and @$utf8>3;
	382
	383	@{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp UTF8 LATIN1 )}=
	384	( $str, $txt, $low, $utf8, $latin1, $high, $cp, $UTF8, $LATIN1 );
	385	my $rec= $self->{strs}{$str};
	386	foreach my $key ( qw(low utf8 latin1 high cp UTF8 LATIN1) ) {
	387	$self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
	388	if $self->{strs}{$str}{$key};
	389	}
	390	$self->{has_multi} \|\|= @$cp > 1;
	391	$self->{has_ascii} \|\|= $latin1 && @$latin1;
	392	$self->{has_low} \|\|= $low && @$low;
	393	$self->{has_high} \|\|= !$low && !$latin1;
	394	}
	395	$self->{val_fmt}= $hex_fmt;
	396	$self->{count}= 0 + keys %{ $self->{strs} };
	397	return $self;
	398	}
	399
	400	# my $trie = make_trie($type,$maxlen);
	401	#
	402	# using the data stored in the object build a trie of a specific type,
	403	# and with specific maximum depth. The trie is made up the elements of
	404	# the given types array for each string in the object (assuming it is
	405	# not too long.)
	406	#
	407	# returns the trie, or undef if there was no relevant data in the object.
	408	#
	409
	410	sub make_trie {
	411	my ( $self, $type, $maxlen )= @_;
	412
	413	my $strs= $self->{strs};
	414	my %trie;
	415	foreach my $rec ( values %$strs ) {
	416	die "panic: unknown type '$type'"
	417	if !exists $rec->{$type};
	418	my $dat= $rec->{$type};
	419	next unless $dat;
	420	next if $maxlen && @$dat > $maxlen;
	421	my $node= \%trie;
	422	foreach my $elem ( @$dat ) {
	423	$node->{$elem} \|\|= {};
	424	$node= $node->{$elem};
	425	}
	426	$node->{''}= $rec->{str};
	427	}
	428	return 0 + keys( %trie ) ? \%trie : undef;
	429	}
	430
	431	sub pop_count ($) {
	432	my $word = shift;
	433
	434	# This returns a list of the positions of the bits in the input word that
	435	# are 1.
	436
	437	my @positions;
	438	my $position = 0;
	439	while ($word) {
	440	push @positions, $position if $word & 1;
	441	$position++;
	442	$word >>= 1;
	443	}
	444	return @positions;
	445	}
	446
	447	# my $optree= _optree()
	448	#
	449	# recursively convert a trie to an optree where every node represents
	450	# an if else branch.
	451	#
	452	#
	453
	454	sub _optree {
	455	my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
	456	return unless defined $trie;
	457	if ( $self->{has_multi} and $ret_type =~ /cp\|both/ ) {
	458	die "Can't do 'cp' optree from multi-codepoint strings";
	459	}
	460	$ret_type \|\|= 'len';
	461	$else= 0 unless defined $else;
	462	$depth= 0 unless defined $depth;
	463
	464	# if we have an emptry string as a key it means we are in an
	465	# accepting state and unless we can match further on should
	466	# return the value of the '' key.
	467	if (exists $trie->{''} ) {
	468	# we can now update the "else" value, anything failing to match
	469	# after this point should return the value from this.
	470	if ( $ret_type eq 'cp' ) {
	471	$else= $self->{strs}{ $trie->{''} }{cp}[0];
	472	$else= sprintf "$self->{val_fmt}", $else if $else > 9;
	473	} elsif ( $ret_type eq 'len' ) {
	474	$else= $depth;
	475	} elsif ( $ret_type eq 'both') {
	476	$else= $self->{strs}{ $trie->{''} }{cp}[0];
	477	$else= sprintf "$self->{val_fmt}", $else if $else > 9;
	478	$else= "len=$depth, $else";
	479	}
	480	}
	481	# extract the meaningful keys from the trie, filter out '' as
	482	# it means we are an accepting state (end of sequence).
	483	my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
	484
	485	# if we havent any keys there is no further we can match and we
	486	# can return the "else" value.
	487	return $else if !@conds;
	488
	489
	490	my $test= $test_type eq 'cp' ? "cp" : "((U8*)s)[$depth]";
	491	# first we loop over the possible keys/conditions and find out what they look like
	492	# we group conditions with the same optree together.
	493	my %dmp_res;
	494	my @res_order;
	495	local $Data::Dumper::Sortkeys=1;
	496	foreach my $cond ( @conds ) {
	497
	498	# get the optree for this child/condition
	499	my $res= $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else, $depth + 1 );
	500	# convert it to a string with Dumper
	501	my $res_code= Dumper( $res );
	502
	503	push @{$dmp_res{$res_code}{vals}}, $cond;
	504	if (!$dmp_res{$res_code}{optree}) {
	505	$dmp_res{$res_code}{optree}= $res;
	506	push @res_order, $res_code;
	507	}
	508	}
	509
	510	# now that we have deduped the optrees we construct a new optree containing the merged
	511	# results.
	512	my %root;
	513	my $node= \%root;
	514	foreach my $res_code_idx (0 .. $#res_order) {
	515	my $res_code= $res_order[$res_code_idx];
	516	$node->{vals}= $dmp_res{$res_code}{vals};
	517	$node->{test}= $test;
	518	$node->{yes}= $dmp_res{$res_code}{optree};
	519	$node->{depth}= $depth;
	520	if ($res_code_idx < $#res_order) {
	521	$node= $node->{no}= {};
	522	} else {
	523	$node->{no}= $else;
	524	}
	525	}
	526
	527	# return the optree.
	528	return \%root;
	529	}
	530
	531	# my $optree= optree(%opts);
	532	#
	533	# Convert a trie to an optree, wrapper for _optree
	534
	535	sub optree {
	536	my $self= shift;
	537	my %opt= @_;
	538	my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
	539	$opt{ret_type} \|\|= 'len';
	540	my $test_type= $opt{type} eq 'cp' ? 'cp' : 'depth';
	541	return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
	542	}
	543
	544	# my $optree= generic_optree(%opts);
	545	#
	546	# build a "generic" optree out of the three 'low', 'latin1', 'utf8'
	547	# sets of strings, including a branch for handling the string type check.
	548	#
	549
	550	sub generic_optree {
	551	my $self= shift;
	552	my %opt= @_;
	553
	554	$opt{ret_type} \|\|= 'len';
	555	my $test_type= 'depth';
	556	my $else= $opt{else} \|\| 0;
	557
	558	my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
	559	my $utf8= $self->make_trie( 'utf8', $opt{max_depth} );
	560
	561	$_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
	562	for $latin1, $utf8;
	563
	564	if ( $utf8 ) {
	565	$else= __cond_join( "( is_utf8 )", $utf8, $latin1 \|\| $else );
	566	} elsif ( $latin1 ) {
	567	$else= __cond_join( "!( is_utf8 )", $latin1, $else );
	568	}
	569	my $low= $self->make_trie( 'low', $opt{max_depth} );
	570	if ( $low ) {
	571	$else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
	572	}
	573
	574	return $else;
	575	}
	576
	577	# length_optree()
	578	#
	579	# create a string length guarded optree.
	580	#
	581
	582	sub length_optree {
	583	my $self= shift;
	584	my %opt= @_;
	585	my $type= $opt{type};
	586
	587	die "Can't do a length_optree on type 'cp', makes no sense."
	588	if $type eq 'cp';
	589
	590	my ( @size, $method );
	591
	592	if ( $type eq 'generic' ) {
	593	$method= 'generic_optree';
	594	my %sizes= (
	595	%{ $self->{size}{low} \|\| {} },
	596	%{ $self->{size}{latin1} \|\| {} },
	597	%{ $self->{size}{utf8} \|\| {} }
	598	);
	599	@size= sort { $a <=> $b } keys %sizes;
	600	} else {
	601	$method= 'optree';
	602	@size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
	603	}
	604
	605	my $else= ( $opt{else} \|\|= 0 );
	606	for my $size ( @size ) {
	607	my $optree= $self->$method( %opt, type => $type, max_depth => $size );
	608	my $cond= "((e)-(s) > " . ( $size - 1 ).")";
	609	$else= __cond_join( $cond, $optree, $else );
	610	}
	611	return $else;
	612	}
	613
	614	sub calculate_mask(@) {
	615	my @list = @_;
	616	my $list_count = @list;
	617
	618	# Look at the input list of byte values. This routine sees if the set
	619	# consisting of those bytes is exactly determinable by using a
	620	# mask/compare operation. If not, it returns an empty list; if so, it
	621	# returns a list consisting of (mask, compare). For example, consider a
	622	# set consisting of the numbers 0xF0, 0xF1, 0xF2, and 0xF3. If we want to
	623	# know if a number 'c' is in the set, we could write:
	624	# 0xF0 <= c && c <= 0xF4
	625	# But the following mask/compare also works, and has just one test:
	626	# c & 0xFC == 0xF0
	627	# The reason it works is that the set consists of exactly those numbers
	628	# whose first 4 bits are 1, and the next two are 0. (The value of the
	629	# other 2 bits is immaterial in determining if a number is in the set or
	630	# not.) The mask masks out those 2 irrelevant bits, and the comparison
	631	# makes sure that the result matches all bytes that which match those 6
	632	# material bits exactly. In other words, the set of numbers contains
	633	# exactly those whose bottom two bit positions are either 0 or 1. The
	634	# same principle applies to bit positions that are not necessarily
	635	# adjacent. And it can be applied to bytes that differ in 1 through all 8
	636	# bit positions. In order to be a candidate for this optimization, the
	637	# number of numbers in the test must be a power of 2. Based on this
	638	# count, we know the number of bit positions that must differ.
	639	my $bit_diff_count = 0;
	640	my $compare = $list[0];
	641	if ($list_count == 2) {
	642	$bit_diff_count = 1;
	643	}
	644	elsif ($list_count == 4) {
	645	$bit_diff_count = 2;
	646	}
	647	elsif ($list_count == 8) {
	648	$bit_diff_count = 3;
	649	}
	650	elsif ($list_count == 16) {
	651	$bit_diff_count = 4;
	652	}
	653	elsif ($list_count == 32) {
	654	$bit_diff_count = 5;
	655	}
	656	elsif ($list_count == 64) {
	657	$bit_diff_count = 6;
	658	}
	659	elsif ($list_count == 128) {
	660	$bit_diff_count = 7;
	661	}
	662	elsif ($list_count == 256) {
	663	return (0, 0);
	664	}
	665
	666	# If the count wasn't a power of 2, we can't apply this optimization
	667	return if ! $bit_diff_count;
	668
	669	my %bit_map;
	670
	671	# For each byte in the list, find the bit positions in it whose value
	672	# differs from the first byte in the set.
	673	for (my $i = 1; $i < @list; $i++) {
	674	my @positions = pop_count($list[0] ^ $list[$i]);
	675
	676	# If the number of differing bits is greater than those permitted by
	677	# the set size, this optimization doesn't apply.
	678	return if @positions > $bit_diff_count;
	679
	680	# Save the bit positions that differ.
	681	foreach my $bit (@positions) {
	682	$bit_map{$bit} = 1;
	683	}
	684
	685	# If the total so far is greater than those permitted by the set size,
	686	# this optimization doesn't apply.
	687	return if keys %bit_map > $bit_diff_count;
	688
	689
	690	# The value to compare against is the AND of all the members of the
	691	# set. The bit positions that are the same in all will be correct in
	692	# the AND, and the bit positions that differ will be 0.
	693	$compare &= $list[$i];
	694	}
	695
	696	# To get to here, we have gone through all bytes in the set,
	697	# and determined that they all differ from each other in at most
	698	# the number of bits allowed for the set's quantity. And since we have
	699	# tested all 2**N possibilities, we know that the set includes no fewer
	700	# elements than we need,, so the optimization applies.
	701	die "panic: internal logic error" if keys %bit_map != $bit_diff_count;
	702
	703	# The mask is the bit positions where things differ, complemented.
	704	my $mask = 0;
	705	foreach my $position (keys %bit_map) {
	706	$mask \|= 1 << $position;
	707	}
	708	$mask = ~$mask & 0xFF;
	709
	710	return ($mask, $compare);
	711	}
	712
	713	# _cond_as_str
	714	# turn a list of conditions into a text expression
	715	# - merges ranges of conditions, and joins the result with \|\|
	716	sub _cond_as_str {
	717	my ( $self, $op, $combine, $opts_ref )= @_;
	718	my $cond= $op->{vals};
	719	my $test= $op->{test};
	720	my $is_cp_ret = $opts_ref->{ret_type} eq "cp";
	721	return "( $test )" if !defined $cond;
	722
	723	# rangify the list.
	724	my @ranges;
	725	my $Update= sub {
	726	# We skip this if there are optimizations that
	727	# we can apply (below) to the individual ranges
	728	if ( ($is_cp_ret \|\| $combine) && @ranges && ref $ranges[-1]) {
	729	if ( $ranges[-1][0] == $ranges[-1][1] ) {
	730	$ranges[-1]= $ranges[-1][0];
	731	} elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
	732	$ranges[-1]= $ranges[-1][0];
	733	push @ranges, $ranges[-1] + 1;
	734	}
	735	}
	736	};
	737	for my $condition ( @$cond ) {
	738	if ( !@ranges \|\| $condition != $ranges[-1][1] + 1 ) {
	739	$Update->();
	740	push @ranges, [ $condition, $condition ];
	741	} else {
	742	$ranges[-1][1]++;
	743	}
	744	}
	745	$Update->();
	746
	747	return $self->_combine( $test, @ranges )
	748	if $combine;
	749
	750	if ($is_cp_ret) {
	751	@ranges= map {
	752	ref $_
	753	? sprintf(
	754	"( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
	755	@$_ )
	756	: sprintf( "$self->{val_fmt} == $test", $_ );
	757	} @ranges;
	758	}
	759	else {
	760	# If the input set has certain characteristics, we can optimize tests
	761	# for it. This doesn't apply if returning the code point, as we want
	762	# each element of the set individually. The code above is for this
	763	# simpler case.
	764
	765	return 1 if @$cond == 256; # If all bytes match, is trivially true
	766
	767	if (@ranges > 1) {
	768	# See if the entire set shares optimizable characterstics, and if
	769	# so, return the optimization. We delay checking for this on sets
	770	# with just a single range, as there may be better optimizations
	771	# available in that case.
	772	my ($mask, $base) = calculate_mask(@$cond);
	773	if (defined $mask && defined $base) {
	774	return sprintf "( ( $test & $self->{val_fmt} ) == $self->{val_fmt} )", $mask, $base;
	775	}
	776	}
	777
	778	# Here, there was no entire-class optimization. Look at each range.
	779	for (my $i = 0; $i < @ranges; $i++) {
	780	if (! ref $ranges[$i]) { # Trivial case: no range
	781	$ranges[$i] = sprintf "$self->{val_fmt} == $test", $ranges[$i];
	782	}
	783	elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
	784	$ranges[$i] = # Trivial case: single element range
	785	sprintf "$self->{val_fmt} == $test", $ranges[$i]->[0];
	786	}
	787	else {
	788	my $output = "";
	789
	790	# Well-formed UTF-8 continuation bytes on ascii platforms must
	791	# be in the range 0x80 .. 0xBF. If we know that the input is
	792	# well-formed (indicated by not trying to be 'safe'), we can
	793	# omit tests that verify that the input is within either of
	794	# these bounds. (No legal UTF-8 character can begin with
	795	# anything in this range, so we don't have to worry about this
	796	# being a continuation byte or not.)
	797	if (ASCII_PLATFORM
	798	&& ! $opts_ref->{safe}
	799	&& $opts_ref->{type} =~ / ^ (?: utf8 \| high ) $ /xi)
	800	{
	801	my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80);
	802	my $upper_limit_is_BF = ($ranges[$i]->[1] == 0xBF);
	803
	804	# If the range is the entire legal range, it matches any
	805	# legal byte, so we can omit both tests. (This should
	806	# happen only if the number of ranges is 1.)
	807	if ($lower_limit_is_80 && $upper_limit_is_BF) {
	808	return 1;
	809	}
	810	elsif ($lower_limit_is_80) { # Just use the upper limit test
	811	$output = sprintf("( $test <= $self->{val_fmt} )",
	812	$ranges[$i]->[1]);
	813	}
	814	elsif ($upper_limit_is_BF) { # Just use the lower limit test
	815	$output = sprintf("( $test >= $self->{val_fmt} )",
	816	$ranges[$i]->[0]);
	817	}
	818	}
	819
	820	# If we didn't change to omit a test above, see if the number
	821	# of elements is a power of 2 (only a single bit in the
	822	# representation of its count will be set) and if so, it may
	823	# be that a mask/compare optimization is possible.
	824	if ($output eq ""
	825	&& pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1)
	826	{
	827	my @list;
	828	push @list, $_ for ($ranges[$i]->[0] .. $ranges[$i]->[1]);
	829	my ($mask, $base) = calculate_mask(@list);
	830	if (defined $mask && defined $base) {
	831	$output = sprintf "( $test & $self->{val_fmt} ) == $self->{val_fmt}", $mask, $base;
	832	}
	833	}
	834
	835	if ($output ne "") { # Prefer any optimization
	836	$ranges[$i] = $output;
	837	}
	838	elsif ($ranges[$i]->[0] + 1 == $ranges[$i]->[1]) {
	839	# No optimization happened. We need a test that the code
	840	# point is within both bounds. But, if the bounds are
	841	# adjacent code points, it is cleaner to say
	842	# 'first == test \|\| second == test'
	843	# than it is to say
	844	# 'first <= test && test <= second'
	845	$ranges[$i] = "( "
	846	. join( " \|\| ", ( map
	847	{ sprintf "$self->{val_fmt} == $test", $_ }
	848	@{$ranges[$i]} ) )
	849	. " )";
	850	}
	851	else { # Full bounds checking
	852	$ranges[$i] = sprintf("( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", $ranges[$i]->[0], $ranges[$i]->[1]);
	853	}
	854	}
	855	}
	856	}
	857
	858	return "( " . join( " \|\| ", @ranges ) . " )";
	859
	860	}
	861
	862	# _combine
	863	# recursively turn a list of conditions into a fast break-out condition
	864	# used by _cond_as_str() for 'cp' type macros.
	865	sub _combine {
	866	my ( $self, $test, @cond )= @_;
	867	return if !@cond;
	868	my $item= shift @cond;
	869	my ( $cstr, $gtv );
	870	if ( ref $item ) {
	871	$cstr=
	872	sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
	873	@$item );
	874	$gtv= sprintf "$self->{val_fmt}", $item->[1];
	875	} else {
	876	$cstr= sprintf( "$self->{val_fmt} == $test", $item );
	877	$gtv= sprintf "$self->{val_fmt}", $item;
	878	}
	879	if ( @cond ) {
	880	return "( $cstr \|\| ( $gtv < $test &&\n"
	881	. $self->_combine( $test, @cond ) . " ) )";
	882	} else {
	883	return $cstr;
	884	}
	885	}
	886
	887	# _render()
	888	# recursively convert an optree to text with reasonably neat formatting
	889	sub _render {
	890	my ( $self, $op, $combine, $brace, $opts_ref )= @_;
	891	return 0 if ! defined $op; # The set is empty
	892	if ( !ref $op ) {
	893	return $op;
	894	}
	895	my $cond= $self->_cond_as_str( $op, $combine, $opts_ref );
	896	#no warnings 'recursion'; # This would allow really really inefficient
	897	# code to be generated. See pod
	898	my $yes= $self->_render( $op->{yes}, $combine, 1, $opts_ref );
	899	return $yes if $cond eq '1';
	900
	901	my $no= $self->_render( $op->{no}, $combine, 0, $opts_ref );
	902	return "( $cond )" if $yes eq '1' and $no eq '0';
	903	my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
	904	return "$lb$cond ? $yes : $no$rb"
	905	if !ref( $op->{yes} ) && !ref( $op->{no} );
	906	my $ind1= " " x 4;
	907	my $ind= "\n" . ( $ind1 x $op->{depth} );
	908
	909	if ( ref $op->{yes} ) {
	910	$yes= $ind . $ind1 . $yes;
	911	} else {
	912	$yes= " " . $yes;
	913	}
	914
	915	return "$lb$cond ?$yes$ind: $no$rb";
	916	}
	917
	918	# $expr=render($op,$combine)
	919	#
	920	# convert an optree to text with reasonably neat formatting. If $combine
	921	# is true then the condition is created using "fast breakouts" which
	922	# produce uglier expressions that are more efficient for common case,
	923	# longer lists such as that resulting from type 'cp' output.
	924	# Currently only used for type 'cp' macros.
	925	sub render {
	926	my ( $self, $op, $combine, $opts_ref )= @_;
	927	my $str= "( " . $self->_render( $op, $combine, 0, $opts_ref ) . " )";
	928	return __clean( $str );
	929	}
	930
	931	# make_macro
	932	# make a macro of a given type.
	933	# calls into make_trie and (generic_\|length_)optree as needed
	934	# Opts are:
	935	# type : 'cp','generic','high','low','latin1','utf8','LATIN1','UTF8'
	936	# ret_type : 'cp' or 'len'
	937	# safe : add length guards to macro
	938	#
	939	# type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
	940	# in which case it defaults to 'cp' as well.
	941	#
	942	# it is illegal to do a type 'cp' macro on a pattern with multi-codepoint
	943	# sequences in it, as the generated macro will accept only a single codepoint
	944	# as an argument.
	945	#
	946	# returns the macro.
	947
	948
	949	sub make_macro {
	950	my $self= shift;
	951	my %opts= @_;
	952	my $type= $opts{type} \|\| 'generic';
	953	die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
	954	if $type eq 'cp'
	955	and $self->{has_multi};
	956	my $ret_type= $opts{ret_type} \|\| ( $opts{type} eq 'cp' ? 'cp' : 'len' );
	957	my $method;
	958	if ( $opts{safe} ) {
	959	$method= 'length_optree';
	960	} elsif ( $type eq 'generic' ) {
	961	$method= 'generic_optree';
	962	} else {
	963	$method= 'optree';
	964	}
	965	my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
	966	my $text= $self->render( $optree, $type eq 'cp', \%opts );
	967	my @args= $type eq 'cp' ? 'cp' : 's';
	968	push @args, "e" if $opts{safe};
	969	push @args, "is_utf8" if $type eq 'generic';
	970	push @args, "len" if $ret_type eq 'both';
	971	my $pfx= $ret_type eq 'both' ? 'what_len_' :
	972	$ret_type eq 'cp' ? 'what_' : 'is_';
	973	my $ext= $type eq 'generic' ? '' : '_' . lc( $type );
	974	$ext .= "_safe" if $opts{safe};
	975	my $argstr= join ",", @args;
	976	return "/* GENERATED CODE */\n"
	977	. __macro( "#define $pfx$self->{op}$ext($argstr)\n$text" );
	978	}
	979
	980	# if we arent being used as a module (highly likely) then process
	981	# the __DATA__ below and produce macros in regcharclass.h
	982	# if an argument is provided to the script then it is assumed to
	983	# be the path of the file to output to, if the arg is '-' outputs
	984	# to STDOUT.
	985	if ( !caller ) {
	986	$\|++;
	987	my $path= shift @ARGV \|\| "regcharclass.h";
	988	my $out_fh;
	989	if ( $path eq '-' ) {
	990	$out_fh= \*STDOUT;
	991	} else {
	992	$out_fh = open_new( $path );
	993	}
	994	print $out_fh read_only_top( lang => 'C', by => $0,
	995	file => 'regcharclass.h', style => '*',
	996	copyright => [2007, 2011] );
	997	print $out_fh "\n#ifndef H_REGCHARCLASS /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n\n";
	998
	999	my ( $op, $title, @txt, @types, %mods );
	1000	my $doit= sub {
	1001	return unless $op;
	1002
	1003	# Skip if to compile on a different platform.
	1004	return if delete $mods{only_ascii_platform} && ! ASCII_PLATFORM;
	1005	return if delete $mods{only_ebcdic_platform} && ord 'A' != 193;
	1006
	1007	print $out_fh "/*\n\t$op: $title\n\n";
	1008	print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
	1009	my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
	1010
	1011	#die Dumper(\@types,\%mods);
	1012
	1013	my @mods;
	1014	push @mods, 'safe' if delete $mods{safe};
	1015	unshift @mods, 'fast' if delete $mods{fast} \|\| ! @mods; # Default to 'fast'
	1016	# do this one
	1017	# first, as
	1018	# traditional
	1019	if (%mods) {
	1020	die "Unknown modifiers: ", join ", ", map { "'$_'" } keys %mods;
	1021	}
	1022
	1023	foreach my $type_spec ( @types ) {
	1024	my ( $type, $ret )= split /-/, $type_spec;
	1025	$ret \|\|= 'len';
	1026	foreach my $mod ( @mods ) {
	1027	next if $mod eq 'safe' and $type eq 'cp';
	1028	delete $mods{$mod};
	1029	my $macro= $obj->make_macro(
	1030	type => $type,
	1031	ret_type => $ret,
	1032	safe => $mod eq 'safe'
	1033	);
	1034	print $out_fh $macro, "\n";
	1035	}
	1036	}
	1037	};
	1038
	1039	while ( <DATA> ) {
	1040	s/^ \s* (?: \# .* ) ? $ //x; # squeeze out comment and blanks
	1041	next unless /\S/;
	1042	chomp;
	1043	if ( /^[A-Z]/ ) {
	1044	$doit->(); # This starts a new definition; do the previous one
	1045	( $op, $title )= split /\s:\s/, $_, 2;
	1046	@txt= ();
	1047	} elsif ( s/^=>// ) {
	1048	my ( $type, $modifier )= split /:/, $_;
	1049	@types= split ' ', $type;
	1050	undef %mods;
	1051	map { $mods{$_} = 1 } split ' ', $modifier;
	1052	} else {
	1053	push @txt, "$_";
	1054	}
	1055	}
	1056	$doit->();
	1057
	1058	print $out_fh "\n#endif /* H_REGCHARCLASS */\n";
	1059
	1060	if($path eq '-') {
	1061	print $out_fh "/* ex: set ro: */\n";
	1062	} else {
	1063	read_only_bottom_close_and_rename($out_fh)
	1064	}
	1065	}
	1066
	1067	# The form of the input is a series of definitions to make macros for.
	1068	# The first line gives the base name of the macro, followed by a colon, and
	1069	# then text to be used in comments associated with the macro that are its
	1070	# title or description. In all cases the first (perhaps only) parameter to
	1071	# the macro is a pointer to the first byte of the code point it is to test to
	1072	# see if it is in the class determined by the macro. In the case of non-UTF8,
	1073	# the code point consists only of a single byte.
	1074	#
	1075	# The second line must begin with a '=>' and be followed by the types of
	1076	# macro(s) to be generated; these are specified below. A colon follows the
	1077	# types, followed by the modifiers, also specified below. At least one
	1078	# modifier is required.
	1079	#
	1080	# The subsequent lines give what code points go into the class defined by the
	1081	# macro. Multiple characters may be specified via a string like "\x0D\x0A",
	1082	# enclosed in quotes. Otherwise the lines consist of one of:
	1083	# 1) a single Unicode code point, prefaced by 0x
	1084	# 2) a single range of Unicode code points separated by a minus (and
	1085	# optional space)
	1086	# 3) a single Unicode property specified in the standard Perl form
	1087	# "\p{...}"
	1088	# 4) a line like 'do path'. This will do a 'do' on the file given by
	1089	# 'path'. It is assumed that this does nothing but load subroutines
	1090	# (See item 5 below). The reason 'require path' is not used instead is
	1091	# because 'do' doesn't assume that path is in @INC.
	1092	# 5) a subroutine call
	1093	# &pkg::foo(arg1, ...)
	1094	# where pkg::foo was loaded by a 'do' line (item 4). The subroutine
	1095	# returns an array of entries of forms like items 1-3 above. This
	1096	# allows more complex inputs than achievable from the other input types.
	1097	#
	1098	# A blank line or one whose first non-blank character is '#' is a comment.
	1099	# The definition of the macro is terminated by a line unlike those described.
	1100	#
	1101	# Valid types:
	1102	# low generate a macro whose name is 'is_BASE_low' and defines a
	1103	# class that includes only ASCII-range chars. (BASE is the
	1104	# input macro base name.)
	1105	# latin1 generate a macro whose name is 'is_BASE_latin1' and defines a
	1106	# class that includes only upper-Latin1-range chars. It is not
	1107	# designed to take a UTF-8 input parameter.
	1108	# high generate a macro whose name is 'is_BASE_high' and defines a
	1109	# class that includes all relevant code points that are above
	1110	# the Latin1 range. This is for very specialized uses only.
	1111	# It is designed to take only an input UTF-8 parameter.
	1112	# utf8 generate a macro whose name is 'is_BASE_utf8' and defines a
	1113	# class that includes all relevant characters that aren't ASCII.
	1114	# It is designed to take only an input UTF-8 parameter.
	1115	# LATIN1 generate a macro whose name is 'is_BASE_latin1' and defines a
	1116	# class that includes both ASCII and upper-Latin1-range chars.
	1117	# It is not designed to take a UTF-8 input parameter.
	1118	# UTF8 generate a macro whose name is 'is_BASE_utf8' and defines a
	1119	# class that can include any code point, adding the 'low' ones
	1120	# to what 'utf8' works on. It is designed to take only an input
	1121	# UTF-8 parameter.
	1122	# generic generate a macro whose name is 'is_BASE". It has a 2nd,
	1123	# boolean, parameter which indicates if the first one points to
	1124	# a UTF-8 string or not. Thus it works in all circumstances.
	1125	# cp generate a macro whose name is 'is_BASE_cp' and defines a
	1126	# class that returns true if the UV parameter is a member of the
	1127	# class; false if not.
	1128	# A macro of the given type is generated for each type listed in the input.
	1129	# The default return value is the number of octets read to generate the match.
	1130	# Append "-cp" to the type to have it instead return the matched codepoint.
	1131	# The macro name is changed to 'what_BASE...'. See pod for
	1132	# caveats
	1133	# Appending '-both" instead adds an extra parameter to the end of the argument
	1134	# list, which is a pointer as to where to store the number of
	1135	# bytes matched, while also returning the code point. The macro
	1136	# name is changed to 'what_len_BASE...'. See pod for caveats
	1137	#
	1138	# Valid modifiers:
	1139	# safe The input string is not necessarily valid UTF-8. In
	1140	# particular an extra parameter (always the 2nd) to the macro is
	1141	# required, which points to one beyond the end of the string.
	1142	# The macro will make sure not to read off the end of the
	1143	# string. In the case of non-UTF8, it makes sure that the
	1144	# string has at least one byte in it. The macro name has
	1145	# '_safe' appended to it.
	1146	# fast The input string is valid UTF-8. No bounds checking is done,
	1147	# and the macro can make assumptions that lead to faster
	1148	# execution.
	1149	# only_ascii_platform Skip this definition if this program is being run on
	1150	# a non-ASCII platform.
	1151	# only_ebcdic_platform Skip this definition if this program is being run on
	1152	# a non-EBCDIC platform.
	1153	# No modifier need be specified; fast is assumed for this case. If both
	1154	# 'fast', and 'safe' are specified, two macros will be created for each
	1155	# 'type'.
	1156	#
	1157	# If run on a non-ASCII platform will automatically convert the Unicode input
	1158	# to native. The documentation above is slightly wrong in this case. 'low'
	1159	# actually refers to code points whose UTF-8 representation is the same as the
	1160	# non-UTF-8 version (invariants); and 'latin1' refers to all the rest of the
	1161	# code points less than 256.
	1162
	1163	1; # in the unlikely case we are being used as a module
	1164
	1165	__DATA__
	1166	# This is no longer used, but retained in case it is needed some day.
	1167	# TRICKYFOLD: Problematic fold case letters. When adding to this list, also should add them to regcomp.c and fold_grind.t
	1168	# => generic cp generic-cp generic-both :fast safe
	1169	# 0x00DF # LATIN SMALL LETTER SHARP S
	1170	# 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
	1171	# 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
	1172	# 0x1E9E # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
	1173	# 0x1FD3 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
	1174	# 0x1FE3 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
	1175
	1176	LNBREAK: Line Break: \R
	1177	=> generic UTF8 LATIN1 :fast safe
	1178	"\x0D\x0A" # CRLF - Network (Windows) line ending
	1179	\p{VertSpace}
	1180
	1181	HORIZWS: Horizontal Whitespace: \h \H
	1182	=> generic UTF8 LATIN1 cp :fast safe
	1183	\p{HorizSpace}
	1184
	1185	VERTWS: Vertical Whitespace: \v \V
	1186	=> generic UTF8 LATIN1 cp :fast safe
	1187	\p{VertSpace}
	1188
	1189	REPLACEMENT: Unicode REPLACEMENT CHARACTER
	1190	=> UTF8 :safe
	1191	0xFFFD
	1192
	1193	NONCHAR: Non character code points
	1194	=> UTF8 :fast
	1195	\p{Nchar}
	1196
	1197	SURROGATE: Surrogate characters
	1198	=> UTF8 :fast
	1199	\p{Gc=Cs}
	1200
	1201	GCB_L: Grapheme_Cluster_Break=L
	1202	=> UTF8 :fast
	1203	\p{_X_GCB_L}
	1204
	1205	GCB_LV_LVT_V: Grapheme_Cluster_Break=(LV or LVT or V)
	1206	=> UTF8 :fast
	1207	\p{_X_LV_LVT_V}
	1208
	1209	GCB_Prepend: Grapheme_Cluster_Break=Prepend
	1210	=> UTF8 :fast
	1211	\p{_X_GCB_Prepend}
	1212
	1213	GCB_RI: Grapheme_Cluster_Break=RI
	1214	=> UTF8 :fast
	1215	\p{_X_RI}
	1216
	1217	GCB_SPECIAL_BEGIN: Grapheme_Cluster_Break=special_begins
	1218	=> UTF8 :fast
	1219	\p{_X_Special_Begin}
	1220
	1221	GCB_T: Grapheme_Cluster_Break=T
	1222	=> UTF8 :fast
	1223	\p{_X_GCB_T}
	1224
	1225	GCB_V: Grapheme_Cluster_Break=V
	1226	=> UTF8 :fast
	1227	\p{_X_GCB_V}
	1228
	1229	# This program was run with this enabled, and the results copied to utf8.h;
	1230	# then this was commented out because it takes so long to figure out these 2
	1231	# million code points. The results would not change unless utf8.h decides it
	1232	# wants a maximum other than 4 bytes, or this program creates better
	1233	# optimizations
	1234	#UTF8_CHAR: Matches utf8 from 1 to 4 bytes
	1235	#=> UTF8 :safe only_ascii_platform
	1236	#0x0 - 0x1FFFFF
	1237
	1238	# This hasn't been commented out, because we haven't an EBCDIC platform to run
	1239	# it on, and the 3 types of EBCDIC allegedly supported by Perl would have
	1240	# different results
	1241	UTF8_CHAR: Matches utf8 from 1 to 5 bytes
	1242	=> UTF8 :safe only_ebcdic_platform
	1243	0x0 - 0x3FFFFF:
	1244
	1245	QUOTEMETA: Meta-characters that \Q should quote
	1246	=> high :fast
	1247	\p{_Perl_Quotemeta}
	1248
	1249	MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
	1250	=> UTF8 :safe
	1251	do regen/regcharclass_multi_char_folds.pl
	1252
	1253	# 1 => All folds
	1254	&regcharclass_multi_char_folds::multi_char_folds(1)
	1255
	1256	MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
	1257	=> LATIN1 :safe
	1258
	1259	&regcharclass_multi_char_folds::multi_char_folds(0)
	1260	# 0 => Latin1-only