Commit | Line | Data |
---|---|---|
12b72891 RGS |
1 | package UTF8::Matcher; |
2 | use strict; | |
3 | use warnings; | |
4 | use Text::Wrap qw(wrap); | |
5 | use Encode; | |
6 | use Data::Dumper; | |
7 | ||
0ccab2bc RGS |
8 | our $hex_fmt="0x%02X"; |
9 | ||
12b72891 RGS |
10 | # Author: Yves Orton (demerphq) 2007. |
11 | ||
12 | =pod | |
13 | ||
14 | Dynamically generates macros for detecting special charclasses | |
15 | in both latin-1, utf8, and codepoint forms. | |
16 | ||
17 | To regenerate regcharclass.h, run this script from perl-root. No arguments | |
18 | are necessary. | |
19 | ||
20 | Each charclass handler is constructed as follows: | |
21 | Each string the charclass must match is rendered as unicode (codepoints>255), | |
22 | and if possible as latin1 (codepoints>127), and if possible as "neutral" | |
23 | (all codepoints<128). | |
24 | ||
25 | The rendered strings are then inserted into digit-tries by type and length. | |
26 | With shorter strings being added to tries that are allowed to contain longer | |
27 | strings, but not vice versa. Thus the "longest" trie contains all strings | |
28 | for that charclass. | |
29 | ||
30 | The following types of trie are generated: | |
31 | ||
32 | n - Neutral only. All strings in this type have codepoints<128 | |
33 | l - Latin1 only. All strings in this type have a codepoint>127 in them | |
34 | u - UTF8 only. All strings in this type have a codepoint>255 in them | |
35 | L - Latin1. All strings in 'n' and 'l' | |
36 | U - UTF8. All string in 'n' and 'u' | |
37 | c - Codepoint. All strings in U but in codepoint and not utf8 form. | |
38 | ||
39 | The ternary() routine is responsible for converting the trie data into a | |
40 | ternary conditional that matches the required set of strings. The generated | |
41 | macro normally takes at least the argument 's' which is expected to be a | |
42 | pointer of type C<char *> or C<U8 *>. The condition generated will be | |
43 | optimised to match the string as efficiently as possible, with range lookups | |
44 | being used where possible, and in some situations relying on "true" to be 1. | |
45 | ||
46 | ternary() takes two optional arguments, $type which is one of the above | |
47 | characters and $ext which is used to add an extra extension to the macro name. | |
48 | ||
49 | If $type is omitted or false then the generated macro will take an additional | |
50 | argument, 'is_utf8'. | |
51 | ||
52 | If $ext has the string 'safe' in it then the generated macro will take an extra | |
53 | argument 'e' for the end of the string, and all lookups will be length checked | |
54 | to prevent lookups past e. If 'safe' is not used then the lookup is assumed to | |
55 | be guaranteed safe, and no 'e' argument is provided and no length checks are | |
56 | made during execution. | |
57 | ||
58 | The 'c' type is different as compared to the rest. Instead of producing | |
59 | a condition that does octet comparisons of a string array, the 'c' type | |
60 | produces a macro that takes a single codepoint as an argument (instead of a | |
61 | char* or U8*) and does the lookup based on only that char, thus it cannot be | |
62 | used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass. | |
63 | This is primarily used for populating charclass bitmaps for codepoints 0..255 | |
64 | but will also match codepoints in the unicode range if necessary. | |
65 | ||
66 | Using LNBREAK as an example the following macros will be produced: | |
67 | ||
68 | =over 4 | |
69 | ||
70 | =item is_LNBREAK(s,is_utf8) | |
71 | ||
72 | =item is_LNBREAK_safe(s,e,is_utf8) | |
73 | ||
74 | Do a lookup as apporpriate based on the is_utf8 flag. When possible | |
75 | comparisons involving octect<128 are done before checking the is_utf8 | |
76 | flag, hopefully saving time. | |
77 | ||
78 | =item is_LNBREAK_utf8(s) | |
79 | ||
80 | =item is_LNBREAK_utf8_safe(s,e) | |
81 | ||
82 | Do a lookup assuming the string is encoded in (normalized) UTF8. | |
83 | ||
84 | =item is_LNBREAK_latin1(s) | |
85 | ||
86 | =item is_LNBREAK_latin1_safe(s,e) | |
87 | ||
88 | Do a lookup assuming the string is encoded in latin-1 (aka plan octets). | |
89 | ||
90 | =item is_LNBREAK_cp(cp) | |
91 | ||
92 | Check to see if the string matches a given codepoint (hypotethically a | |
93 | U32). The condition is constructed as as to "break out" as early as | |
94 | possible if the codepoint is out of range of the condition. | |
95 | ||
96 | IOW: | |
97 | ||
98 | (cp==X || (cp>X && (cp==Y || (cp>Y && ...)))) | |
99 | ||
100 | Thus if the character is X+1 only two comparisons will be done. Making | |
101 | matching lookups slower, but non-matching faster. | |
102 | ||
103 | =back | |
104 | ||
105 | =cut | |
106 | ||
107 | # store a list of numbers into a hash based trie. | |
108 | sub _trie_store { | |
109 | my $root= shift; | |
110 | foreach my $b ( @_ ) { | |
111 | $root->{$b} ||= {}; | |
112 | $root= $root->{$b}; | |
113 | } | |
114 | $root->{''}++; | |
115 | } | |
116 | ||
117 | # Convert a string into its neutral, latin1, utf8 forms, where | |
118 | # the form is undefined unless the string can be completely represented | |
119 | # in that form. The string is then decomposed into the octects representing | |
120 | # it. A list is returned for each. Additional a list of codepoints making | |
121 | # up the string. | |
122 | # returns (\@n,\@u,\@l,\@cp) | |
123 | # | |
124 | sub _uni_latin1 { | |
125 | my $str= shift; | |
126 | my $u= eval { Encode::encode( "utf8", "$str", Encode::FB_CROAK ) }; | |
127 | my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) }; | |
128 | my $n= $l; | |
129 | undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/; | |
130 | return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )), | |
131 | [map { ord $_ } split //,$str]); | |
132 | } | |
133 | ||
134 | # store an array ref of char data into the appropriate | |
135 | # type bins, tracking sizes as we go. | |
136 | sub _store { | |
137 | my ( $self, $r, @k )= @_; | |
138 | for my $z ( @k ) { | |
139 | $self->{size}{$z}{ 0 + @$r }++; | |
140 | push @{ $self->{data}{$z} }, $r; | |
141 | } | |
142 | } | |
143 | ||
144 | # construct a new charclass constructor object. | |
145 | # $title ends up in the code a as a comment. | |
146 | # $opcode is the name of the operation the charclass implements. | |
147 | # the rest of the arguments are strings that the charclass | |
148 | # can match. | |
149 | sub new { | |
150 | my $class= shift; | |
151 | my $title= shift; | |
152 | my $opcode= shift; | |
153 | my $self= bless { op => $opcode, title => $title }, $class; | |
154 | my %seen; | |
155 | # convert the strings to the numeric equivelents and store | |
156 | # them for later insertion while tracking their sizes. | |
157 | foreach my $seq ( @_ ) { | |
158 | next if $seen{$seq}++; | |
159 | push @{$self->{seq}},$seq; | |
160 | my ( $n, $u, $l,$cp )= _uni_latin1( $seq ); | |
161 | if ( $n ) { | |
162 | _store( $self, $n, qw(n U L) ); | |
163 | } else { | |
164 | if ( $l ) { | |
165 | _store( $self, $l, qw(l L) ); | |
166 | } | |
167 | _store( $self, $u, qw(u U) ); | |
168 | } | |
169 | _store($self,$cp,'c'); | |
170 | } | |
171 | # | |
172 | # now construct the tries. For each type of data we insert | |
173 | # the data into all the tries of length $size and smaller. | |
174 | # | |
175 | ||
176 | my %allsize; | |
177 | foreach my $k ( keys %{ $self->{data} } ) { | |
178 | my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} }; | |
179 | $self->{size}{$k}=\@size; | |
180 | undef @allsize{@size}; | |
181 | foreach my $d ( @{ $self->{data}{$k} } ) { | |
182 | foreach my $sz ( @size ) { | |
183 | last if $sz < @$d; | |
184 | $self->{trie}{$k}{$sz} ||= {}; | |
185 | _trie_store( $self->{trie}{$k}{$sz}, @$d ); | |
186 | } | |
187 | } | |
188 | #delete $self->{data}{$k}; | |
189 | } | |
190 | my @size= sort { $b <=> $a } keys %allsize; | |
191 | $self->{size}{''}= \@size; | |
192 | return $self; | |
193 | } | |
194 | ||
195 | # | |
196 | # _cond([$v1,$v2,$v2...],$ofs) | |
197 | # | |
198 | # converts an array of codepoints into a conditional expression | |
199 | # consequtive codepoints are merged into a range test | |
200 | # returns a string containing the conditional expression in the form | |
201 | # '( li[x]==v || li[x]==y )' When possible we also use range lookups. | |
202 | ||
203 | sub _cond { | |
204 | my ( $c, $ofs,$fmt )= @_; | |
205 | $fmt||='((U8*)s)[%d]'; | |
206 | # cheapo rangification routine. | |
207 | # Convert the first element into a singleton represented | |
208 | # as [$x,$x] and then merge the rest in as we go. | |
209 | my @v= sort { $a <=> $b } @$c; | |
210 | my @r= ( [ ( shift @v ) x 2 ] ); | |
211 | for my $n ( @v ) { | |
212 | if ( $n == $r[-1][1] + 1 ) { | |
213 | $r[-1][1]++; | |
214 | } else { | |
215 | push @r, [ $n, $n ]; | |
216 | } | |
217 | } | |
218 | @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r; | |
219 | # sort the ranges by size and order. | |
220 | @r= sort { $a->[0] <=> $b->[0] } @r; | |
221 | my $alu= sprintf $fmt,$ofs; # C array look up | |
222 | ||
223 | if ($fmt=~/%d/) { | |
224 | # map the ranges into conditions | |
225 | @r= map { | |
226 | # singleton | |
0ccab2bc | 227 | $_->[0] == $_->[1] ? sprintf("$alu == $hex_fmt",$_->[0]) : |
12b72891 | 228 | # range |
0ccab2bc | 229 | sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_) |
12b72891 RGS |
230 | } @r; |
231 | # return the joined results. | |
232 | return '( ' . join( " || ", @r ) . ' )'; | |
233 | } else { | |
234 | return combine($alu,@r); | |
235 | } | |
236 | } | |
237 | ||
238 | # | |
239 | # Do the condition in such a way that we break out early if the value | |
240 | # we are looking at is in between two elements in the list. | |
241 | # Currently used only for codepoint macros (depth 1) | |
242 | # | |
243 | sub combine { | |
244 | my $alu=shift; | |
245 | local $_ = shift; | |
246 | my $txt= $_->[0] == $_->[1] | |
0ccab2bc RGS |
247 | ? sprintf("$alu == $hex_fmt",$_->[0]) |
248 | : sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_); | |
12b72891 | 249 | return $txt unless @_; |
32e6a07c YO |
250 | return sprintf "( %s ||( %s > 0x%02X &&\n%s ) )", |
251 | $txt,$alu,$_->[1],combine($alu,@_); | |
12b72891 RGS |
252 | } |
253 | ||
254 | # recursively convert a trie to an optree represented by | |
255 | # [condition,yes,no] where yes and no can be a ref to another optree | |
256 | # or a scalar representing code. | |
257 | # called by make_optree | |
258 | ||
259 | sub _trie_to_optree { | |
260 | my ( $node, $ofs, $else, $fmt )= @_; | |
261 | return $else unless $node; | |
262 | $ofs ||= 0; | |
263 | if ( $node->{''} ) { | |
264 | $else= $ofs; | |
265 | } else { | |
266 | $else ||= 0; | |
267 | } | |
268 | my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] } | |
269 | map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] } | |
270 | grep length, keys %$node; | |
271 | ||
272 | return $ofs if !@k; | |
273 | ||
274 | my ( $root, $expr ); | |
275 | while ( @k ) { | |
276 | my @cond= ( $k[0][0] ); | |
277 | my $d= $k[0][1]; | |
278 | my $r= $k[0][2]; | |
279 | shift @k; | |
280 | while ( @k && $k[0][1] eq $d ) { | |
281 | push @cond, $k[0][0]; | |
282 | shift @k; | |
283 | } | |
284 | my $op= | |
285 | [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ]; | |
286 | if ( !$root ) { | |
287 | $root= $expr= $op; | |
288 | } else { | |
289 | push @$expr, $op; | |
290 | $expr= $op; | |
291 | } | |
292 | } | |
293 | push @$expr, $else; | |
294 | return $root; | |
295 | } | |
296 | ||
297 | # construct the optree for a type. | |
298 | # handles the special logic of type ''. | |
299 | sub make_optree { | |
300 | my ( $self, $type, $size, $fmt )= @_; | |
301 | my $else= 0; | |
302 | $size||=$self->{size}{$type}[0]; | |
303 | $size=1 if $type eq 'c'; | |
304 | if ( !$type ) { | |
305 | my ( $u, $l ); | |
32e6a07c YO |
306 | if ($self->{trie}{u}) { |
307 | for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) { | |
308 | $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt ); | |
309 | } | |
12b72891 | 310 | } |
32e6a07c YO |
311 | if ($self->{trie}{l}) { |
312 | for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) { | |
313 | $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt ); | |
314 | } | |
12b72891 RGS |
315 | } |
316 | if ( $u ) { | |
317 | $else= [ '(is_utf8)', $u, $l || 0 ]; | |
318 | } elsif ( $l ) { | |
319 | $else= [ '(!is_utf8)', $l, 0 ]; | |
320 | } | |
321 | $type= 'n'; | |
12b72891 | 322 | } |
32e6a07c YO |
323 | if (!$self->{trie}{$type}) { |
324 | return $else; | |
325 | } else { | |
326 | $size-- while $size>0 && !$self->{trie}{$type}{$size}; | |
327 | return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt ); | |
328 | } | |
12b72891 RGS |
329 | } |
330 | ||
331 | # construct the optree for a type with length checks to prevent buffer | |
332 | # overruns. Only one length check is performed per lookup trading code | |
333 | # size for speed. | |
334 | sub length_optree { | |
335 | my ( $self, $type,$fmt )= @_; | |
336 | $type ||= ''; | |
337 | return $self->{len_op}{$type} if $self->{len_op}{$type}; | |
338 | my @size = @{$self->{size}{$type}}; | |
339 | ||
340 | my ( $root, $expr ); | |
341 | foreach my $size ( @size ) { | |
342 | my $op= [ | |
343 | "( (e) - (s) > " . ( $size - 1 ) . " )", | |
344 | $self->make_optree( $type, $size ), | |
345 | ]; | |
346 | if ( !$root ) { | |
347 | $root= $expr= $op; | |
348 | } else { | |
349 | push @$expr, $op; | |
350 | $expr= $op; | |
351 | } | |
352 | } | |
353 | push @$expr, 0; | |
354 | return $self->{len_op}{$type}= $root ? $root : $expr->[0]; | |
355 | } | |
356 | ||
357 | # | |
358 | # recursively walk an optree and covert it to a huge nested ternary expression. | |
359 | # | |
360 | sub _optree_to_ternary { | |
361 | my ( $node )= @_; | |
362 | return $node | |
363 | if !ref $node; | |
364 | my $depth = 0; | |
365 | if ( $node->[0] =~ /\[(\d+)\]/ ) { | |
366 | $depth= $1 + 1; | |
367 | } | |
368 | return sprintf "\n%s( %s ? %s : %s )", " " x $depth, $node->[0], | |
369 | _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] ); | |
370 | } | |
371 | ||
372 | # add \\ to the end of strings in a reasonable neat way. | |
373 | sub _macro($) { | |
374 | my $str= shift; | |
375 | my @lines= split /[^\S\n]*\n/, $str; | |
0ccab2bc RGS |
376 | my $macro = join( "\\\n", map { sprintf "%-76s", $_ } @lines ); |
377 | $macro =~ s/ *$//; | |
378 | return $macro . "\n\n"; | |
12b72891 RGS |
379 | } |
380 | ||
381 | # default type extensions. 'uln' dont have one because normally | |
382 | # they are used only as part of type '' which doesnt get an extension | |
383 | my %ext= ( | |
384 | U => '_utf8', | |
385 | L => '_latin1', | |
386 | c => '_cp', | |
387 | ||
388 | ); | |
389 | ||
390 | # produce the ternary, handling arguments and putting on the macro headers | |
391 | # and boiler plate | |
392 | sub ternary { | |
393 | my ( $self, $type, $ext )= @_; | |
394 | $type ||= ''; | |
395 | $ext = ($ext{$type} || '') . ($ext||""); | |
396 | my ($root,$fmt,$arg); | |
397 | if ($type eq 'c') { | |
398 | $arg= $fmt= 'cp'; | |
399 | } else { | |
400 | $arg= 's'; | |
401 | } | |
402 | if ( $type eq 'c' || $ext !~ /safe/) { | |
403 | $root= $self->make_optree( $type, 0, $fmt ); | |
404 | } else { | |
405 | $root= $self->length_optree( $type, $fmt ); | |
406 | } | |
407 | ||
408 | our $parens; | |
409 | $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x; | |
410 | my $expr= qr/ | |
411 | \( \s* | |
412 | ($parens) | |
413 | \s* \? \s* | |
414 | \( \s* | |
415 | ($parens) | |
416 | \s* \? \s* | |
417 | (\d+|$parens) | |
418 | \s* : \s* | |
419 | (\d+|$parens) | |
420 | \s* \) | |
421 | \s* : \s* | |
422 | \4 | |
423 | \s* \) | |
424 | /x; | |
425 | my $code= _optree_to_ternary( $root ); | |
426 | for ( $code ) { | |
427 | s/^\s*//; | |
428 | 1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g | |
429 | || s<$expr><(($1 && $2) ? $3 : $4)>g | |
430 | || s<\(\s*($parens)\s*\)><$1>g; | |
431 | } | |
432 | my @args=($arg); | |
433 | push @args,'e' if $ext=~/safe/; | |
434 | push @args,'is_utf8' if !$type; | |
435 | my $args=join ",",@args; | |
436 | return "/*** GENERATED CODE ***/\n" | |
437 | . _macro "#define is_$self->{op}$ext($args)\n$code"; | |
438 | } | |
32e6a07c | 439 | $|++; |
12b72891 | 440 | my $path=shift @ARGV; |
32e6a07c | 441 | |
12b72891 RGS |
442 | if (!$path) { |
443 | $path= "regcharclass.h"; | |
444 | if (!-e $path) { $path="../$path" } | |
445 | if (!-e $path) { die "Can't find regcharclass.h to update!\n" }; | |
446 | } | |
32e6a07c YO |
447 | my $out_fh; |
448 | if ($path eq '-') { | |
449 | $out_fh= \*STDOUT; | |
450 | } else { | |
451 | rename $path,"$path.bak"; | |
452 | open $out_fh,">",$path | |
453 | or die "Can't write to '$path':$!"; | |
454 | binmode $out_fh; # want unix line endings even when run on win32. | |
455 | } | |
58fbde93 | 456 | my ($zero) = $0=~/([^\\\/]+)$/; |
12b72891 | 457 | print $out_fh <<"HEADER"; |
58fbde93 RGS |
458 | /* -*- buffer-read-only: t -*- |
459 | * | |
460 | * regcharclass.h | |
461 | * | |
462 | * Copyright (C) 2007, by Larry Wall and others | |
463 | * | |
464 | * You may distribute under the terms of either the GNU General Public | |
465 | * License or the Artistic License, as specified in the README file. | |
466 | * | |
467 | * !!!!!!! DO NOT EDIT THIS FILE !!!!!!! | |
468 | * This file is built by Porting/$zero. | |
469 | * (Generated at: @{[ scalar gmtime ]} GMT) | |
470 | * Any changes made here will be lost! | |
471 | */ | |
12b72891 RGS |
472 | |
473 | HEADER | |
474 | ||
32e6a07c | 475 | my ($op,$title,@strs,@txt,$type); |
12b72891 RGS |
476 | my $doit= sub { |
477 | return unless $op; | |
478 | my $o= __PACKAGE__->new($title,$op,@strs); | |
479 | print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n"; | |
480 | print $out_fh join "\n",@txt,"*/",""; | |
32e6a07c YO |
481 | $type||="U L c _safe"; |
482 | my @ext=(""); | |
483 | my @types=("",map{ if (length $_>1) { push @ext,$_; () } else { $_ } } | |
484 | split /\s+/,$type); | |
485 | for my $type (@types) { | |
486 | for my $ext (@ext) { | |
487 | next if $type eq 'c' and $ext eq '_safe'; | |
488 | print $out_fh $o->ternary( $type,$ext ); | |
489 | } | |
12b72891 | 490 | } |
12b72891 RGS |
491 | }; |
492 | while (<DATA>) { | |
493 | next unless /\S/; | |
494 | chomp; | |
495 | if (/^([A-Z]+)/) { | |
496 | $doit->(); | |
497 | ($op,$title)=split /\s*:\s*/,$_,2; | |
498 | @txt=@strs=(); | |
32e6a07c YO |
499 | $type=""; |
500 | } elsif (/^=(.*)/) { | |
501 | $type.=$1; | |
12b72891 RGS |
502 | } else { |
503 | push @txt, "\t$_"; | |
504 | s/#.*$//; | |
505 | if (/^0x/) { | |
506 | push @strs,map { chr $_ } eval $_; | |
507 | } elsif (/^[""'']/) { | |
508 | push @strs,eval $_; | |
509 | } | |
510 | } | |
511 | } | |
512 | $doit->(); | |
58fbde93 | 513 | print $out_fh "/* ex: set ro: */\n"; |
12b72891 RGS |
514 | |
515 | __DATA__ | |
516 | LNBREAK: Line Break: \R | |
517 | "\x0D\x0A" # CRLF - Network (Windows) line ending | |
518 | 0x0A # LF | LINE FEED | |
519 | 0x0B # VT | VERTICAL TAB | |
520 | 0x0C # FF | FORM FEED | |
521 | 0x0D # CR | CARRIAGE RETURN | |
522 | 0x85 # NEL | NEXT LINE | |
523 | 0x2028 # LINE SEPARATOR | |
524 | 0x2029 # PARAGRAPH SEPARATOR | |
525 | ||
526 | HORIZWS: Horizontal Whitespace: \h \H | |
527 | 0x09 # HT | |
528 | 0x20 # SPACE | |
529 | 0xa0 # NBSP | |
530 | 0x1680 # OGHAM SPACE MARK | |
531 | 0x180e # MONGOLIAN VOWEL SEPARATOR | |
532 | 0x2000 # EN QUAD | |
533 | 0x2001 # EM QUAD | |
534 | 0x2002 # EN SPACE | |
535 | 0x2003 # EM SPACE | |
536 | 0x2004 # THREE-PER-EM SPACE | |
537 | 0x2005 # FOUR-PER-EM SPACE | |
538 | 0x2006 # SIX-PER-EM SPACE | |
539 | 0x2007 # FIGURE SPACE | |
540 | 0x2008 # PUNCTUATION SPACE | |
541 | 0x2009 # THIN SPACE | |
542 | 0x200A # HAIR SPACE | |
543 | 0x202f # NARROW NO-BREAK SPACE | |
544 | 0x205f # MEDIUM MATHEMATICAL SPACE | |
545 | 0x3000 # IDEOGRAPHIC SPACE | |
546 | ||
547 | VERTWS: Vertical Whitespace: \v \V | |
548 | 0x0A # LF | |
549 | 0x0B # VT | |
550 | 0x0C # FF | |
551 | 0x0D # CR | |
552 | 0x85 # NEL | |
553 | 0x2028 # LINE SEPARATOR | |
554 | 0x2029 # PARAGRAPH SEPARATOR | |
555 | ||
32e6a07c YO |
556 | TRICKYFOLD: Problematic fold case letters. |
557 | 0x00DF # LATIN SMALL LETTER SHARP S | |
558 | 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS | |
559 | 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS |