perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	package UTF8::Matcher;
	2	use strict;
	3	use warnings;
	4	use Text::Wrap qw(wrap);
	5	use Encode;
	6	use Data::Dumper;
	7
	8	# Author: Yves Orton (demerphq) 2007.
	9
	10	=pod
	11
	12	Dynamically generates macros for detecting special charclasses
	13	in both latin-1, utf8, and codepoint forms.
	14
	15	To regenerate regcharclass.h, run this script from perl-root. No arguments
	16	are necessary.
	17
	18	Each charclass handler is constructed as follows:
	19	Each string the charclass must match is rendered as unicode (codepoints>255),
	20	and if possible as latin1 (codepoints>127), and if possible as "neutral"
	21	(all codepoints<128).
	22
	23	The rendered strings are then inserted into digit-tries by type and length.
	24	With shorter strings being added to tries that are allowed to contain longer
	25	strings, but not vice versa. Thus the "longest" trie contains all strings
	26	for that charclass.
	27
	28	The following types of trie are generated:
	29
	30	n - Neutral only. All strings in this type have codepoints<128
	31	l - Latin1 only. All strings in this type have a codepoint>127 in them
	32	u - UTF8 only. All strings in this type have a codepoint>255 in them
	33	L - Latin1. All strings in 'n' and 'l'
	34	U - UTF8. All string in 'n' and 'u'
	35	c - Codepoint. All strings in U but in codepoint and not utf8 form.
	36
	37	The ternary() routine is responsible for converting the trie data into a
	38	ternary conditional that matches the required set of strings. The generated
	39	macro normally takes at least the argument 's' which is expected to be a
	40	pointer of type C<char > or C<U8 >. The condition generated will be
	41	optimised to match the string as efficiently as possible, with range lookups
	42	being used where possible, and in some situations relying on "true" to be 1.
	43
	44	ternary() takes two optional arguments, $type which is one of the above
	45	characters and $ext which is used to add an extra extension to the macro name.
	46
	47	If $type is omitted or false then the generated macro will take an additional
	48	argument, 'is_utf8'.
	49
	50	If $ext has the string 'safe' in it then the generated macro will take an extra
	51	argument 'e' for the end of the string, and all lookups will be length checked
	52	to prevent lookups past e. If 'safe' is not used then the lookup is assumed to
	53	be guaranteed safe, and no 'e' argument is provided and no length checks are
	54	made during execution.
	55
	56	The 'c' type is different as compared to the rest. Instead of producing
	57	a condition that does octet comparisons of a string array, the 'c' type
	58	produces a macro that takes a single codepoint as an argument (instead of a
	59	char* or U8*) and does the lookup based on only that char, thus it cannot be
	60	used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass.
	61	This is primarily used for populating charclass bitmaps for codepoints 0..255
	62	but will also match codepoints in the unicode range if necessary.
	63
	64	Using LNBREAK as an example the following macros will be produced:
	65
	66	=over 4
	67
	68	=item is_LNBREAK(s,is_utf8)
	69
	70	=item is_LNBREAK_safe(s,e,is_utf8)
	71
	72	Do a lookup as apporpriate based on the is_utf8 flag. When possible
	73	comparisons involving octect<128 are done before checking the is_utf8
	74	flag, hopefully saving time.
	75
	76	=item is_LNBREAK_utf8(s)
	77
	78	=item is_LNBREAK_utf8_safe(s,e)
	79
	80	Do a lookup assuming the string is encoded in (normalized) UTF8.
	81
	82	=item is_LNBREAK_latin1(s)
	83
	84	=item is_LNBREAK_latin1_safe(s,e)
	85
	86	Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
	87
	88	=item is_LNBREAK_cp(cp)
	89
	90	Check to see if the string matches a given codepoint (hypotethically a
	91	U32). The condition is constructed as as to "break out" as early as
	92	possible if the codepoint is out of range of the condition.
	93
	94	IOW:
	95
	96	(cp==X \|\| (cp>X && (cp==Y \|\| (cp>Y && ...))))
	97
	98	Thus if the character is X+1 only two comparisons will be done. Making
	99	matching lookups slower, but non-matching faster.
	100
	101	=back
	102
	103	=cut
	104
	105	# store a list of numbers into a hash based trie.
	106	sub _trie_store {
	107	my $root= shift;
	108	foreach my $b ( @_ ) {
	109	$root->{$b} \|\|= {};
	110	$root= $root->{$b};
	111	}
	112	$root->{''}++;
	113	}
	114
	115	# Convert a string into its neutral, latin1, utf8 forms, where
	116	# the form is undefined unless the string can be completely represented
	117	# in that form. The string is then decomposed into the octects representing
	118	# it. A list is returned for each. Additional a list of codepoints making
	119	# up the string.
	120	# returns (\@n,\@u,\@l,\@cp)
	121	#
	122	sub _uni_latin1 {
	123	my $str= shift;
	124	my $u= eval { Encode::encode( "utf8", "$str", Encode::FB_CROAK ) };
	125	my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) };
	126	my $n= $l;
	127	undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/;
	128	return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )),
	129	[map { ord $_ } split //,$str]);
	130	}
	131
	132	# store an array ref of char data into the appropriate
	133	# type bins, tracking sizes as we go.
	134	sub _store {
	135	my ( $self, $r, @k )= @_;
	136	for my $z ( @k ) {
	137	$self->{size}{$z}{ 0 + @$r }++;
	138	push @{ $self->{data}{$z} }, $r;
	139	}
	140	}
	141
	142	# construct a new charclass constructor object.
	143	# $title ends up in the code a as a comment.
	144	# $opcode is the name of the operation the charclass implements.
	145	# the rest of the arguments are strings that the charclass
	146	# can match.
	147	sub new {
	148	my $class= shift;
	149	my $title= shift;
	150	my $opcode= shift;
	151	my $self= bless { op => $opcode, title => $title }, $class;
	152	my %seen;
	153	# convert the strings to the numeric equivelents and store
	154	# them for later insertion while tracking their sizes.
	155	foreach my $seq ( @_ ) {
	156	next if $seen{$seq}++;
	157	push @{$self->{seq}},$seq;
	158	my ( $n, $u, $l,$cp )= _uni_latin1( $seq );
	159	if ( $n ) {
	160	_store( $self, $n, qw(n U L) );
	161	} else {
	162	if ( $l ) {
	163	_store( $self, $l, qw(l L) );
	164	}
	165	_store( $self, $u, qw(u U) );
	166	}
	167	_store($self,$cp,'c');
	168	}
	169	#
	170	# now construct the tries. For each type of data we insert
	171	# the data into all the tries of length $size and smaller.
	172	#
	173
	174	my %allsize;
	175	foreach my $k ( keys %{ $self->{data} } ) {
	176	my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} };
	177	$self->{size}{$k}=\@size;
	178	undef @allsize{@size};
	179	foreach my $d ( @{ $self->{data}{$k} } ) {
	180	foreach my $sz ( @size ) {
	181	last if $sz < @$d;
	182	$self->{trie}{$k}{$sz} \|\|= {};
	183	_trie_store( $self->{trie}{$k}{$sz}, @$d );
	184	}
	185	}
	186	#delete $self->{data}{$k};
	187	}
	188	my @size= sort { $b <=> $a } keys %allsize;
	189	$self->{size}{''}= \@size;
	190	return $self;
	191	}
	192
	193	#
	194	# _cond([$v1,$v2,$v2...],$ofs)
	195	#
	196	# converts an array of codepoints into a conditional expression
	197	# consequtive codepoints are merged into a range test
	198	# returns a string containing the conditional expression in the form
	199	# '( li[x]==v \|\| li[x]==y )' When possible we also use range lookups.
	200
	201	sub _cond {
	202	my ( $c, $ofs,$fmt )= @_;
	203	$fmt\|\|='((U8*)s)[%d]';
	204	# cheapo rangification routine.
	205	# Convert the first element into a singleton represented
	206	# as [$x,$x] and then merge the rest in as we go.
	207	my @v= sort { $a <=> $b } @$c;
	208	my @r= ( [ ( shift @v ) x 2 ] );
	209	for my $n ( @v ) {
	210	if ( $n == $r[-1][1] + 1 ) {
	211	$r[-1][1]++;
	212	} else {
	213	push @r, [ $n, $n ];
	214	}
	215	}
	216	@r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r;
	217	# sort the ranges by size and order.
	218	@r= sort { $a->[0] <=> $b->[0] } @r;
	219	my $alu= sprintf $fmt,$ofs; # C array look up
	220
	221	if ($fmt=~/%d/) {
	222	# map the ranges into conditions
	223	@r= map {
	224	# singleton
	225	$_->[0] == $_->[1] ? "$alu==$_->[0]" :
	226	# range
	227	"($_->[0]<=$alu && $alu<=$_->[1])"
	228	} @r;
	229	# return the joined results.
	230	return '( ' . join( " \|\| ", @r ) . ' )';
	231	} else {
	232	return combine($alu,@r);
	233	}
	234	}
	235
	236	#
	237	# Do the condition in such a way that we break out early if the value
	238	# we are looking at is in between two elements in the list.
	239	# Currently used only for codepoint macros (depth 1)
	240	#
	241	sub combine {
	242	my $alu=shift;
	243	local $_ = shift;
	244	my $txt= $_->[0] == $_->[1]
	245	? "$alu==$_->[0]"
	246	: "($_->[0]<=$alu && $alu<=$_->[1])";
	247	return $txt unless @_;
	248	return "( $txt \|\| ( $alu > $_->[1] && \n".combine($alu,@_)." ) )";
	249	}
	250
	251	# recursively convert a trie to an optree represented by
	252	# [condition,yes,no] where yes and no can be a ref to another optree
	253	# or a scalar representing code.
	254	# called by make_optree
	255
	256	sub _trie_to_optree {
	257	my ( $node, $ofs, $else, $fmt )= @_;
	258	return $else unless $node;
	259	$ofs \|\|= 0;
	260	if ( $node->{''} ) {
	261	$else= $ofs;
	262	} else {
	263	$else \|\|= 0;
	264	}
	265	my @k= sort { $b->[1] cmp $a->[1] \|\| $a->[0] <=> $b->[0] }
	266	map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] }
	267	grep length, keys %$node;
	268
	269	return $ofs if !@k;
	270
	271	my ( $root, $expr );
	272	while ( @k ) {
	273	my @cond= ( $k[0][0] );
	274	my $d= $k[0][1];
	275	my $r= $k[0][2];
	276	shift @k;
	277	while ( @k && $k[0][1] eq $d ) {
	278	push @cond, $k[0][0];
	279	shift @k;
	280	}
	281	my $op=
	282	[ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ];
	283	if ( !$root ) {
	284	$root= $expr= $op;
	285	} else {
	286	push @$expr, $op;
	287	$expr= $op;
	288	}
	289	}
	290	push @$expr, $else;
	291	return $root;
	292	}
	293
	294	# construct the optree for a type.
	295	# handles the special logic of type ''.
	296	sub make_optree {
	297	my ( $self, $type, $size, $fmt )= @_;
	298	my $else= 0;
	299	$size\|\|=$self->{size}{$type}[0];
	300	$size=1 if $type eq 'c';
	301	if ( !$type ) {
	302	my ( $u, $l );
	303	for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) {
	304	$u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt );
	305	}
	306	for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) {
	307	$l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt );
	308	}
	309	if ( $u ) {
	310	$else= [ '(is_utf8)', $u, $l \|\| 0 ];
	311	} elsif ( $l ) {
	312	$else= [ '(!is_utf8)', $l, 0 ];
	313	}
	314	$type= 'n';
	315	$size-- while !$self->{trie}{n}{$size};
	316	}
	317	return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt );
	318	}
	319
	320	# construct the optree for a type with length checks to prevent buffer
	321	# overruns. Only one length check is performed per lookup trading code
	322	# size for speed.
	323	sub length_optree {
	324	my ( $self, $type,$fmt )= @_;
	325	$type \|\|= '';
	326	return $self->{len_op}{$type} if $self->{len_op}{$type};
	327	my @size = @{$self->{size}{$type}};
	328
	329	my ( $root, $expr );
	330	foreach my $size ( @size ) {
	331	my $op= [
	332	"( (e) - (s) > " . ( $size - 1 ) . " )",
	333	$self->make_optree( $type, $size ),
	334	];
	335	if ( !$root ) {
	336	$root= $expr= $op;
	337	} else {
	338	push @$expr, $op;
	339	$expr= $op;
	340	}
	341	}
	342	push @$expr, 0;
	343	return $self->{len_op}{$type}= $root ? $root : $expr->[0];
	344	}
	345
	346	#
	347	# recursively walk an optree and covert it to a huge nested ternary expression.
	348	#
	349	sub _optree_to_ternary {
	350	my ( $node )= @_;
	351	return $node
	352	if !ref $node;
	353	my $depth = 0;
	354	if ( $node->[0] =~ /\[(\d+)\]/ ) {
	355	$depth= $1 + 1;
	356	}
	357	return sprintf "\n%s( %s ? %s : %s )", " " x $depth, $node->[0],
	358	_optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] );
	359	}
	360
	361	# add \\ to the end of strings in a reasonable neat way.
	362	sub _macro($) {
	363	my $str= shift;
	364	my @lines= split /[^\S\n]*\n/, $str;
	365	return join( "\\\n", map { sprintf "%-76s", $_ } @lines ) . "\n\n";
	366	}
	367
	368	# default type extensions. 'uln' dont have one because normally
	369	# they are used only as part of type '' which doesnt get an extension
	370	my %ext= (
	371	U => '_utf8',
	372	L => '_latin1',
	373	c => '_cp',
	374
	375	);
	376
	377	# produce the ternary, handling arguments and putting on the macro headers
	378	# and boiler plate
	379	sub ternary {
	380	my ( $self, $type, $ext )= @_;
	381	$type \|\|= '';
	382	$ext = ($ext{$type} \|\| '') . ($ext\|\|"");
	383	my ($root,$fmt,$arg);
	384	if ($type eq 'c') {
	385	$arg= $fmt= 'cp';
	386	} else {
	387	$arg= 's';
	388	}
	389	if ( $type eq 'c' \|\| $ext !~ /safe/) {
	390	$root= $self->make_optree( $type, 0, $fmt );
	391	} else {
	392	$root= $self->length_optree( $type, $fmt );
	393	}
	394
	395	our $parens;
	396	$parens= qr/ $ (?: (?> [^()]+? ) \| (??{$parens}) )+? $ /x;
	397	my $expr= qr/
	398	\( \s*
	399	($parens)
	400	\s* \? \s*
	401	\( \s*
	402	($parens)
	403	\s* \? \s*
	404	(\d+\|$parens)
	405	\s* : \s*
	406	(\d+\|$parens)
	407	\s* \)
	408	\s* : \s*
	409	\4
	410	\s* \)
	411	/x;
	412	my $code= _optree_to_ternary( $root );
	413	for ( $code ) {
	414	s/^\s*//;
	415	1 while s/$\s($parens)\s\?\s1\s:\s0\s$/$1/g
	416	\|\| s<$expr><(($1 && $2) ? $3 : $4)>g
	417	\|\| s<$\s($parens)\s$><$1>g;
	418	}
	419	my @args=($arg);
	420	push @args,'e' if $ext=~/safe/;
	421	push @args,'is_utf8' if !$type;
	422	my $args=join ",",@args;
	423	return "/* GENERATED CODE */\n"
	424	. _macro "#define is_$self->{op}$ext($args)\n$code";
	425	}
	426
	427	my $path=shift @ARGV;
	428	if (!$path) {
	429	$path= "regcharclass.h";
	430	if (!-e $path) { $path="../$path" }
	431	if (!-e $path) { die "Can't find regcharclass.h to update!\n" };
	432	}
	433
	434	rename $path,"$path.bak";
	435	open my $out_fh,">",$path
	436	or die "Can't write to '$path':$!";
	437	binmode $out_fh; # want unix line endings even when run on win32.
	438	my ($zero)=$0=~/([^\\\/]+)$/;
	439	print $out_fh <<"HEADER";
	440	/********************* WARNING WARNING WARNING **********************
	441
	442	Do not modify this code directly: This file was autogenerated by
	443
	444	Porting/$zero
	445
	446	from data contained within the script. Change the script instead.
	447
	448	Generated at: @{[ scalar gmtime ]} GMT
	449
	450	********************** WARNING WARNING WARNING **********************/
	451
	452	HEADER
	453
	454	my ($op,$title,@strs,@txt);
	455	my $doit= sub {
	456	return unless $op;
	457	my $o= __PACKAGE__->new($title,$op,@strs);
	458	print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n";
	459	print $out_fh join "\n",@txt,"*/","";
	460	for ('', 'U', 'L') {
	461	print $out_fh $o->ternary( $_ );
	462	print $out_fh $o->ternary( $_,'_safe' );
	463	}
	464	print $out_fh $o->ternary( 'c' );
	465	};
	466	while (<DATA>) {
	467	next unless /\S/;
	468	chomp;
	469	if (/^([A-Z]+)/) {
	470	$doit->();
	471	($op,$title)=split /\s:\s/,$_,2;
	472	@txt=@strs=();
	473	} else {
	474	push @txt, "\t$_";
	475	s/#.*$//;
	476	if (/^0x/) {
	477	push @strs,map { chr $_ } eval $_;
	478	} elsif (/^[""'']/) {
	479	push @strs,eval $_;
	480	}
	481	}
	482	}
	483	$doit->();
	484	print "$path has been updated\n";
	485
	486	__DATA__
	487	LNBREAK: Line Break: \R
	488	"\x0D\x0A" # CRLF - Network (Windows) line ending
	489	0x0A # LF \| LINE FEED
	490	0x0B # VT \| VERTICAL TAB
	491	0x0C # FF \| FORM FEED
	492	0x0D # CR \| CARRIAGE RETURN
	493	0x85 # NEL \| NEXT LINE
	494	0x2028 # LINE SEPARATOR
	495	0x2029 # PARAGRAPH SEPARATOR
	496
	497	HORIZWS: Horizontal Whitespace: \h \H
	498	0x09 # HT
	499	0x20 # SPACE
	500	0xa0 # NBSP
	501	0x1680 # OGHAM SPACE MARK
	502	0x180e # MONGOLIAN VOWEL SEPARATOR
	503	0x2000 # EN QUAD
	504	0x2001 # EM QUAD
	505	0x2002 # EN SPACE
	506	0x2003 # EM SPACE
	507	0x2004 # THREE-PER-EM SPACE
	508	0x2005 # FOUR-PER-EM SPACE
	509	0x2006 # SIX-PER-EM SPACE
	510	0x2007 # FIGURE SPACE
	511	0x2008 # PUNCTUATION SPACE
	512	0x2009 # THIN SPACE
	513	0x200A # HAIR SPACE
	514	0x202f # NARROW NO-BREAK SPACE
	515	0x205f # MEDIUM MATHEMATICAL SPACE
	516	0x3000 # IDEOGRAPHIC SPACE
	517
	518	VERTWS: Vertical Whitespace: \v \V
	519	0x0A # LF
	520	0x0B # VT
	521	0x0C # FF
	522	0x0D # CR
	523	0x85 # NEL
	524	0x2028 # LINE SEPARATOR
	525	0x2029 # PARAGRAPH SEPARATOR
	526