From 9b7c43baf09d4c57d5cd6c9a052ce398d1626a6a Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sat, 30 Oct 2010 16:48:55 -0600 Subject: [PATCH] [:posix:] now works under /u This patch is part of fixing the Unicode bug. The /u regex modifier now applies to posix character classes. This resolves [perl #18281]. The Todo tests in reg_posicc.t have all been made not todo. --- pod/perldelta.pod | 20 ++++++++++++++++++++ pod/perlunicode.pod | 5 +++-- regcomp.c | 22 +++++++++++----------- t/re/reg_posixcc.t | 29 ++++++++++------------------- 4 files changed, 44 insertions(+), 32 deletions(-) diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 162ce85..203260c 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -65,6 +65,18 @@ See L for details. Statement labels can now occur before any type of statement or declaration, such as C. +=head2 C now applies to more regex matching + +Another chunk of the L is fixed in this +release. Now, regular expressions compiled within the scope of the +"unicode_strings" feature (or under the "u" regex modifier (specifiable +currently only with infix notation C<(?u:...)> or via C) +will match the same whether or not the target string is encoded in utf8, +with regard to C<[[:posix:]]> character classes + +Work is underway to add the case sensitive matching to the control of +this feature, but was not complete in time for this dot release. + =head1 Security XXX Any security-related notices go here. In particular, any security @@ -626,6 +638,14 @@ L<[perl #77498]|http://rt.perl.org/rt3/Public/Bug/Display.html?id=77498>. C was ignoring locales when called with constant arguments L<[perl #78632]|http://rt.perl.org/rt3/Public/Bug/Display.html?id=78632>. +=item * + +A non-ASCII character in the Latin-1 range could match both a Posix +class, such as C<[[:alnum:]]>, and its inverse C<[[:^alnum:]]>. This is +now fixed for regular expressions compiled under the C<"u"> modifier. +See L now applies to more regex matching>. +L<[perl #18281]|http://rt.perl.org/rt3/Public/Bug/Display.html?id=18281>. + =back =head1 Known Problems diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 8ff5bb0..dfd6d42 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -1510,8 +1510,9 @@ support seamlessly. The result wasn't seamless: these characters were orphaned. Work is being done to correct this, but only some of it is complete. -What has been finished is the matching of C<\b>, C<\s>, C<\w> and their -complements in regular expressions, and the important part of the case +What has been finished is the matching of C<\b>, C<\s>, C<\w> and the Posix +character classes and their complements in regular expressions, and the +important part of the case changing component. Due to concerns, and some evidence, that older code might have come to rely on the existing behavior, the new behavior must be explicitly enabled by the feature C in the L pragma, even though diff --git a/regcomp.c b/regcomp.c index 0d469c1..0489cc9 100644 --- a/regcomp.c +++ b/regcomp.c @@ -8471,16 +8471,16 @@ parseit: * --jhi */ switch ((I32)namedclass) { - case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum")); - case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha")); - case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank")); - case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl")); - case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph")); - case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower")); - case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print")); - case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space")); - case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct")); - case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper")); + case _C_C_T_UNI_8_BIT(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum"); + case _C_C_T_UNI_8_BIT(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha"); + case _C_C_T_UNI_8_BIT(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank"); + case _C_C_T_UNI_8_BIT(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl"); + case _C_C_T_UNI_8_BIT(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph"); + case _C_C_T_UNI_8_BIT(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower"); + case _C_C_T_UNI_8_BIT(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint"); + case _C_C_T_UNI_8_BIT(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace"); + case _C_C_T_UNI_8_BIT(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct"); + case _C_C_T_UNI_8_BIT(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper"); #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS /* \s, \w match all unicode if utf8. */ case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl"); @@ -8490,7 +8490,7 @@ parseit: case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace"); case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord"); #endif - case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit"); + case _C_C_T_UNI_8_BIT(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit"); case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); case ANYOF_ASCII: diff --git a/t/re/reg_posixcc.t b/t/re/reg_posixcc.t index cd3890c..aa7f445 100644 --- a/t/re/reg_posixcc.t +++ b/t/re/reg_posixcc.t @@ -41,9 +41,6 @@ my @pats=( "[:^space:]", "[:blank:]", "[:^blank:]" ); -if (1 or $ENV{PERL_TEST_LEGACY_POSIX_CC}) { - $::TODO = "Only works under PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS = 0"; -} sub rangify { my $ary= shift; @@ -72,6 +69,9 @@ sub rangify { return $ret; } +# The bug is only fixed for /u +use feature 'unicode_strings'; + my $description = ""; while (@pats) { my ($yes,$no)= splice @pats,0,2; @@ -81,6 +81,7 @@ while (@pats) { my %complements; foreach my $b (0..255) { my %got; + my $display_b = sprintf("\\x%02X", $b); for my $type ('unicode','not-unicode') { my $str=chr($b).chr($b); if ($type eq 'unicode') { @@ -88,10 +89,8 @@ while (@pats) { chop $str; } if ($str=~/[$yes][$no]/){ - TODO: { - unlike($str,qr/[$yes][$no]/, - "chr($b)=~/[$yes][$no]/ should not match under $type"); - } + unlike($str,qr/[$yes][$no]/, + "chr($display_b) X 2 =~/[$yes][$no]/ should not match under $type"); push @{$err_by_type{$type}},$b; } $got{"[$yes]"}{$type} = $str=~/[$yes]/ ? 1 : 0; @@ -101,20 +100,16 @@ while (@pats) { } foreach my $which ("[$yes]","[$no]","[^$yes]","[^$no]") { if ($got{$which}{'unicode'} != $got{$which}{'not-unicode'}){ - TODO: { - is($got{$which}{'unicode'},$got{$which}{'not-unicode'}, - "chr($b)=~/$which/ should have the same results regardless of internal string encoding"); - } + is($got{$which}{'unicode'},$got{$which}{'not-unicode'}, + "chr($display_b) X 2=~ /$which/ should have the same results regardless of internal string encoding"); push @{$singles{$which}},$b; } } foreach my $which ($yes,$no) { foreach my $strtype ('unicode','not-unicode') { if ($got{"[$which]"}{$strtype} == $got{"[^$which]"}{$strtype}) { - TODO: { - isnt($got{"[$which]"}{$strtype},$got{"[^$which]"}{$strtype}, - "chr($b)=~/[$which]/ should not have the same result as chr($b)=~/[^$which]/"); - } + isnt($got{"[$which]"}{$strtype},$got{"[^$which]"}{$strtype}, + "chr($display_b) X 2 =~ /[$which]/ should not have the same result as chr($display_b)=~/[^$which]/"); push @{$complements{$which}{$strtype}},$b; } } @@ -153,8 +148,4 @@ while (@pats) { } } } -TODO: { - is( $description, "", "POSIX and perl charclasses should not depend on string type"); -} - __DATA__ -- 1.8.3.1