This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add /a regex modifier
authorKarl Williamson <public@khwilliamson.com>
Mon, 17 Jan 2011 15:58:53 +0000 (08:58 -0700)
committerKarl Williamson <public@khwilliamson.com>
Mon, 17 Jan 2011 16:20:20 +0000 (09:20 -0700)
This restricts certain constructs, like \w, to matching in the ASCII range
only.

ext/re/re.pm
op_reg_common.h
pod/perldelta.pod
pod/perlre.pod
pod/perlrebackslash.pod
pod/perlrecharclass.pod
regcomp.c
regexec.c
regexp.h
t/re/charset.t
t/re/re_tests

index 7a48995..3ce5beb 100644 (file)
@@ -28,6 +28,7 @@ my %reflags = (
     d => 0,
     l => 1,
     u => 2,
+    a => 3,
 );
 
 sub setcolor {
@@ -144,7 +145,7 @@ sub bits {
            my $reflags = $^H{reflags} || 0;
            my $seen_charset;
            for(split//, $s) {
-               if (/[dul]/) {
+               if (/[adul]/) {
                    if ($on) {
                        if ($seen_charset && $seen_charset ne $_) {
                            require Carp;
@@ -291,7 +292,7 @@ default, simply put
 
 at the top of your code.
 
-The character set /dul flags cancel each other out. So, in this example,
+The character set /adul flags cancel each other out. So, in this example,
 
     use re "/u";
     "ss" =~ /\xdf/;
index c8a3518..7d78005 100644 (file)
@@ -40,7 +40,8 @@
 typedef enum {
     REGEX_DEPENDS_CHARSET = 0,
     REGEX_LOCALE_CHARSET,
-    REGEX_UNICODE_CHARSET
+    REGEX_UNICODE_CHARSET,
+    REGEX_ASCII_RESTRICTED_CHARSET
 } regex_charset;
 
 #define _RXf_PMf_CHARSET_SHIFT ((RXf_PMf_STD_PMMOD_SHIFT)+5)
index b1ea785..13b79a1 100644 (file)
@@ -28,6 +28,26 @@ here, but most should go in the L</Performance Enhancements> section.
 
 [ List each enhancement as a =head2 entry ]
 
+=head2 New regular expression modifier C</a>
+
+The C</a> regular expression modifier restricts C<\s> to match precisely
+the five characters C<[ \f\n\r\t]>, C<\d> to match precisely the 10
+characters C<[0-9]>, C<\w> to match precisely the 63 characters
+C<[A-Za-z0-9_]>, and the Posix (C<[[:posix:]]>) character classes to
+match only the appropriate ASCII characters.  The complements, of
+course, match everything but; and C<\b> and C<\B> are correspondingly
+affected.  Otherwise, C</a> behaves like the C</u> modifier, in that
+case-insensitive matching uses Unicode semantics; for example, "k" will
+match the Unicode C<\N{KELVIN SIGN}> under C</i> matching, and code
+points in the Latin1 range, above ASCII will have Unicode semantics when
+it comes to case-insensitive matching.  Like its cousins (C</u>, C</l>,
+and C</d>), and in spite of the terminology, C</a> in 5.14 will not
+actually be able to be used as a suffix at the end of a regular
+expression (this restriction is planned to be lifted in 5.16).  It must
+occur either as an infix modifier, such as C<(?a:...)> or (C<(?a)...>,
+or it can be turned on within the lexical scope of C<use re '/a'>.
+Turning on C</a> turns off the other "character set" modifiers.
+
 =head2 Any unsigned value can be encoded as a character
 
 With this release, Perl is adopting a model that any unsigned value can
index b74618f..39840fc 100644 (file)
@@ -596,9 +596,9 @@ whitespace formatting, a simple C<#> will suffice.  Note that Perl closes
 the comment as soon as it sees a C<)>, so there is no way to put a literal
 C<)> in the comment.
 
-=item C<(?dlupimsx-imsx)>
+=item C<(?adlupimsx-imsx)>
 
-=item C<(?^lupimsx)>
+=item C<(?^alupimsx)>
 X<(?)> X<(?^)>
 
 One or more embedded pattern-match modifiers, to be turned on (or
@@ -636,8 +636,8 @@ after the C<"?"> is a shorthand equivalent to C<d-imsx>.  Flags (except
 C<"d">) may follow the caret to override it.
 But a minus sign is not legal with it.
 
-Also, starting in Perl 5.14, are modifiers C<"d">, C<"l">, and C<"u">,
-which for 5.14 may not be used as suffix modifiers.
+Also, starting in Perl 5.14, are modifiers C<"a">, C<"d">, C<"l">, and
+C<"u">, which for 5.14 may not be used as suffix modifiers.
 
 C<"l"> means to use a locale (see L<perllocale>) when pattern matching.
 The locale used will be the one in effect at the time of execution of
@@ -659,7 +659,7 @@ Latin-1 (ISO-8859-1) meanings (which are the same as Unicode's), whereas
 in strict ASCII their meanings are undefined.  Thus the platform
 effectively becomes a Unicode platform.  The ASCII characters remain as
 ASCII characters (since ASCII is a subset of Latin-1 and Unicode).  For
-example, when this option is not on, on a non-utf8 string, C<"\w">
+example, when this option is XXX not on, on a non-utf8 string, C<"\w">
 matches precisely C<[A-Za-z0-9_]>.  When the option is on, it matches
 not just those, but all the Latin-1 word characters (such as an "n" with
 a tilde).  On EBCDIC platforms, which already are equivalent to Latin-1,
@@ -670,6 +670,16 @@ small letters C<MU>; otherwise not; and the C<LATIN CAPITAL LETTER SHARP
 S> will match any of C<SS>, C<Ss>, C<sS>, and C<ss>, otherwise not.
 (This last case is buggy, however.)
 
+C<"a"> is the same as C<"u">, but C<\d>, C<\s>, C<\w>, and the Posix
+character classes are restricted to matching in the ASCII range only.
+That is, with this modifier, C<\d> always means precisely the digits
+C<"0"> to C<"9">; C<\s> means the five characters C<[ \f\n\r\t]>;
+C<\w> means the 53 characters C<[A-Za-z0-9_]>; and likewise, all the
+Posix classes such as C<[[:print:]]> match only the appropriate
+ASCII-range characters.  As you would expect, this modifier causes, for
+example, C<\D> to mean the same thing as C<[^0-9]>.  C<"a"> behaves the
+same as C<"u"> with regards to case-insensitive matches.  XXX
+
 C<"d"> means to use the traditional Perl pattern matching behavior.
 This is dualistic (hence the name C<"d">, which also could stand for
 "depends").  When this is in effect, Perl matches utf8-encoded strings
@@ -695,9 +705,9 @@ anywhere in a pattern has a global effect.
 =item C<(?:pattern)>
 X<(?:)>
 
-=item C<(?dluimsx-imsx:pattern)>
+=item C<(?adluimsx-imsx:pattern)>
 
-=item C<(?^luimsx:pattern)>
+=item C<(?^aluimsx:pattern)>
 X<(?^:)>
 
 This is for clustering, not capturing; it groups subexpressions like
@@ -713,7 +723,7 @@ but doesn't spit out extra fields.  It's also cheaper not to capture
 characters if you don't need to.
 
 Any letters between C<?> and C<:> act as flags modifiers as with
-C<(?dluimsx-imsx)>.  For example,
+C<(?adluimsx-imsx)>.  For example,
 
     /(?s-i:more.*than).*million/i
 
index 1b0689b..6422a2d 100644 (file)
@@ -366,6 +366,10 @@ digit, while the character class C<\s> matches any whitespace character.
 New in perl 5.10.0 are the classes C<\h> and C<\v> which match horizontal
 and vertical whitespace characters.
 
+The exact set of characters matched by C<\d>, C<\s>, and C<\w> varies
+depending on various pragma and regular expression modifiers.  See
+L<perlre>.
+
 The uppercase variants (C<\W>, C<\D>, C<\S>, C<\H>, and C<\V>) are
 character classes that match, respectively, any character that isn't a
 word character, digit, whitespace, horizontal whitespace, or vertical
index 3329d60..1baeb16 100644 (file)
@@ -84,7 +84,8 @@ characters the locale considers decimal digits.  Without a locale, C<\d>
 matches just the digits '0' to '9'.
 
 Unicode digits may cause some confusion, and some security issues.  In UTF-8
-strings, C<\d> matches the same characters matched by
+strings, unless the C<"a"> modifier is specified, C<\d> matches the same
+characters matched by
 C<\p{General_Category=Decimal_Number}>, or synonymously,
 C<\p{General_Category=Digit}>.  Starting with Unicode version 4.1, this is the
 same set of characters matched by C<\p{Numeric_Type=Decimal}>.  
index 8260884..0c34695 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -371,6 +371,9 @@ static const scan_data_t zero_scan_data =
 #define UTF cBOOL(RExC_utf8)
 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
+#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
+#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
+#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 
 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 
@@ -6301,6 +6304,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                        cs = REGEX_UNICODE_CHARSET;
                         has_charset_modifier = 1;
                         break;
+                    case ASCII_RESTRICT_PAT_MOD:
+                        if (has_charset_modifier || flagsp == &negflags) {
+                            goto fail_modifiers;
+                        }
+                       cs = REGEX_ASCII_RESTRICTED_CHARSET;
+                        has_charset_modifier = 1;
+                        break;
                     case DEPENDS_PAT_MOD:
                         if (has_use_defaults
                             || has_charset_modifier
@@ -7370,6 +7380,9 @@ tryagain:
                case REGEX_UNICODE_CHARSET:
                    op = ALNUMU;
                    break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+                   op = ALNUMA;
+                   break;
                case REGEX_DEPENDS_CHARSET:
                    op = ALNUM;
                    break;
@@ -7387,6 +7400,9 @@ tryagain:
                case REGEX_UNICODE_CHARSET:
                    op = NALNUMU;
                    break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+                   op = NALNUMA;
+                   break;
                case REGEX_DEPENDS_CHARSET:
                    op = NALNUM;
                    break;
@@ -7406,6 +7422,9 @@ tryagain:
                case REGEX_UNICODE_CHARSET:
                    op = BOUNDU;
                    break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+                   op = BOUNDA;
+                   break;
                case REGEX_DEPENDS_CHARSET:
                    op = BOUND;
                    break;
@@ -7426,6 +7445,9 @@ tryagain:
                case REGEX_UNICODE_CHARSET:
                    op = NBOUNDU;
                    break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+                   op = NBOUNDA;
+                   break;
                case REGEX_DEPENDS_CHARSET:
                    op = NBOUND;
                    break;
@@ -7444,6 +7466,9 @@ tryagain:
                case REGEX_UNICODE_CHARSET:
                    op = SPACEU;
                    break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+                   op = SPACEA;
+                   break;
                case REGEX_DEPENDS_CHARSET:
                    op = SPACE;
                    break;
@@ -7461,6 +7486,9 @@ tryagain:
                case REGEX_UNICODE_CHARSET:
                    op = NSPACEU;
                    break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+                   op = NSPACEA;
+                   break;
                case REGEX_DEPENDS_CHARSET:
                    op = NSPACE;
                    break;
@@ -7475,6 +7503,9 @@ tryagain:
                case REGEX_LOCALE_CHARSET:
                    op = DIGITL;
                    break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+                   op = DIGITA;
+                   break;
                case REGEX_DEPENDS_CHARSET: /* No difference between these */
                case REGEX_UNICODE_CHARSET:
                    op = DIGIT;
@@ -7490,6 +7521,9 @@ tryagain:
                case REGEX_LOCALE_CHARSET:
                    op = NDIGITL;
                    break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+                   op = NDIGITA;
+                   break;
                case REGEX_DEPENDS_CHARSET: /* No difference between these */
                case REGEX_UNICODE_CHARSET:
                    op = NDIGIT;
@@ -7596,7 +7630,7 @@ tryagain:
                 ret = reganode(pRExC_state,
                                ((! FOLD)
                                  ? NREF
-                                 : (UNI_SEMANTICS)
+                                 : (AT_LEAST_UNI_SEMANTICS)
                                    ? NREFFU
                                    : (LOC)
                                      ? NREFFL
@@ -7664,7 +7698,7 @@ tryagain:
                    ret = reganode(pRExC_state,
                                   ((! FOLD)
                                     ? REF
-                                    : (UNI_SEMANTICS)
+                                    : (AT_LEAST_UNI_SEMANTICS)
                                       ? REFFU
                                       : (LOC)
                                         ? REFFL
@@ -7718,7 +7752,7 @@ tryagain:
                           (U8) ((! FOLD) ? EXACT
                                          : (LOC)
                                             ? EXACTFL
-                                            : (UNI_SEMANTICS)
+                                            : (AT_LEAST_UNI_SEMANTICS)
                                               ? EXACTFU
                                               : EXACTF)
                    );
@@ -8272,12 +8306,20 @@ case ANYOF_N##NAME:                                                            \
             if (! TEST_7) stored +=                                            \
                    S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);    \
         }                                                                      \
-       /* For a non-ut8 target string with DEPENDS semantics, all above ASCII \
-        * Latin1 code points match the complement of any of the classes.  But \
-        * in utf8, they have their Unicode semantics, so can't just set them  \
-        * in the bitmap, or else regexec.c will think they matched when they  \
-        * shouldn't. */                                                       \
-       ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8;              \
+       if (ASCII_RESTRICTED) {                                                \
+           for (value = 128; value < 256; value++) {                          \
+             stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
+           }                                                                  \
+           ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8;                  \
+       }                                                                      \
+       else {                                                                 \
+           /* For a non-ut8 target string with DEPENDS semantics, all above   \
+            * ASCII Latin1 code points match the complement of any of the     \
+            * classes.  But in utf8, they have their Unicode semantics, so    \
+            * can't just set them in the bitmap, or else regexec.c will think \
+            * they matched when they shouldn't. */                            \
+           ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8;          \
+       }                                                                      \
     }                                                                          \
     yesno = '!';                                                               \
     what = WORD;                                                               \
@@ -8317,7 +8359,7 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8
     U8 stored = 0;
     U8 fold;
 
-    fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
+    fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
                            : PL_fold[value];
 
     /* It assumes the bit for 'value' has already been set */
@@ -8744,6 +8786,7 @@ parseit:
                            stored +=
                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
                    }
+                   ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
                    yesno = '!';
                    what = "ASCII";
                    break;              
@@ -8773,6 +8816,9 @@ parseit:
                    }
                    yesno = '!';
                    what = POSIX_CC_UNI_NAME("Digit");
+                   if (ASCII_RESTRICTED ) {
+                       ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
+                   }
                    break;              
                case ANYOF_MAX:
                    /* this is to handle \p and \P */
@@ -8781,7 +8827,7 @@ parseit:
                    vFAIL("Invalid [::] class");
                    break;
                }
-               if (what) {
+               if (what && ! (ASCII_RESTRICTED)) {
                    /* Strings such as "+utf8::isWord\n" */
                    Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
                    ANYOF_FLAGS(ret) |= ANYOF_UTF8;
@@ -9067,7 +9113,7 @@ parseit:
                op = EXACT;
            }
        }   /* else 2 chars in the bit map: the folds of each other */
-       else if (UNI_SEMANTICS || !isASCII(value)) {
+       else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
 
            /* To join adjacent nodes, they must be the exact EXACTish type.
             * Try to use the most likely type, by using EXACTFU if the regex
@@ -9567,6 +9613,9 @@ S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
                 case REGEX_LOCALE_CHARSET:
                     PerlIO_printf(Perl_debug_log, "LOCALE");
                     break;
+                case REGEX_ASCII_RESTRICTED_CHARSET:
+                    PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
+                    break;
                 default:
                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
                     break;
index c360cb2..cbdbeaf 100644 (file)
--- a/regexec.c
+++ b/regexec.c
  * This is because that code fails when the test succeeds, so we want to have
  * the test fail so that the code succeeds.  The swash is stored in a
  * predictable PL_ place */
-#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)                     \
+#define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
+                          CLASS, STR)                                        \
     case NAME:                                                                \
        _CCC_TRY_CODE( !, FUNC,                                               \
                          cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
  * irrelevant here */
 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
                NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
+               NAMEA, NNAMEA, FUNCA,                                         \
                CLASS, STR)                                                   \
     case NAMEL:                                                               \
        PL_reg_flags |= RF_tainted;                                           \
     case NNAMEL:                                                              \
        PL_reg_flags |= RF_tainted;                                           \
        _CCC_TRY_CODE(  , LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
+    case NAMEA:                                                               \
+       if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
+           sayNO;                                                            \
+       }                                                                     \
+       /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
+       nextchr = UCHARAT(++locinput);                                        \
+       break;                                                                \
+    case NNAMEA:                                                              \
+       if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
+           sayNO;                                                            \
+       }                                                                     \
+       if (utf8_target) {                                                    \
+           locinput += PL_utf8skip[nextchr];                                 \
+           nextchr = UCHARAT(locinput);                                      \
+       }                                                                     \
+       else {                                                                \
+           nextchr = UCHARAT(++locinput);                                    \
+       }                                                                     \
+       break;                                                                \
     /* Generate the non-locale cases */                                       \
     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 
 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
                  NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
                  NAMEU, NNAMEU, FUNCU,                                        \
+                 NAMEA, NNAMEA, FUNCA,                                        \
                  CLASS, STR)                                                  \
-    CCC_TRY(NAME, NNAME, FUNC, NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, CLASS, STR) \
+    CCC_TRY(NAME, NNAME, FUNC,                                                 \
+           NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
+           NAMEA, NNAMEA, FUNCA,                                              \
+           CLASS, STR)                                                        \
     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 
 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
@@ -1332,15 +1357,56 @@ if ((!reginfo || regtry(reginfo, &s))) \
 #define DUMP_EXEC_POS(li,s,doutf8) \
     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
 
+
+#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
+       tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
+       tmp = TEST_NON_UTF8(tmp);                                              \
+       REXEC_FBC_UTF8_SCAN(                                                   \
+           if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
+               tmp = !tmp;                                                    \
+               IF_SUCCESS;                                                    \
+           }                                                                  \
+           else {                                                             \
+               IF_FAIL;                                                       \
+           }                                                                  \
+       );                                                                     \
+
+#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
+       if (s == PL_bostr) {                                                   \
+           tmp = '\n';                                                        \
+       }                                                                      \
+       else {                                                                 \
+           U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
+           tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
+       }                                                                      \
+       tmp = TeSt1_UtF8;                                                      \
+       LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
+       REXEC_FBC_UTF8_SCAN(                                                   \
+           if (tmp == ! (TeSt2_UtF8)) { \
+               tmp = !tmp;                                                    \
+               IF_SUCCESS;                                                    \
+           }                                                                  \
+           else {                                                             \
+               IF_FAIL;                                                       \
+           }                                                                  \
+       );                                                                     \
+
 /* The only difference between the BOUND and NBOUND cases is that
  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
  * NBOUND.  This is accomplished by passing it in either the if or else clause,
  * with the other one being empty */
 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
-    FBC_BOUND_COMMON(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, )
+    FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, ), TEST_NON_UTF8, REXEC_FBC_TRYIT, )
+
+#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
+    FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, ), TEST_NON_UTF8, REXEC_FBC_TRYIT, )
 
 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
-    FBC_BOUND_COMMON(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8, , REXEC_FBC_TRYIT)
+    FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, , REXEC_FBC_TRYIT), TEST_NON_UTF8, , REXEC_FBC_TRYIT)
+
+#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
+    FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, , REXEC_FBC_TRYIT), TEST_NON_UTF8, , REXEC_FBC_TRYIT)
+
 
 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
  * be passed in completely with the variable name being tested, which isn't
@@ -1351,27 +1417,9 @@ if ((!reginfo || regtry(reginfo, &s))) \
  * one, and compare it with the wordness of this one.  If they differ, we have
  * a boundary.  At the beginning of the string, pretend that the previous
  * character was a new-line */
-#define FBC_BOUND_COMMON(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8,                \
-                                                         IF_SUCCESS, IF_FAIL) \
+#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
     if (utf8_target) {                                                         \
-       if (s == PL_bostr) {                                                   \
-           tmp = '\n';                                                        \
-       }                                                                      \
-       else {                                                                 \
-           U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
-           tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
-       }                                                                      \
-       tmp = TEST1_UTF8;                                                      \
-       LOAD_UTF8_CHARCLASS_ALNUM();                                           \
-       REXEC_FBC_UTF8_SCAN(                                                   \
-           if (tmp == ! (TEST2_UTF8)) {                                       \
-               tmp = !tmp;                                                    \
-               IF_SUCCESS;                                                    \
-           }                                                                  \
-           else {                                                             \
-               IF_FAIL;                                                       \
-           }                                                                  \
-       );                                                                     \
+               UTF8_CODE \
     }                                                                          \
     else {  /* Not utf8 */                                                     \
        tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
@@ -1596,11 +1644,21 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                      isALNUM_uni(tmp),
                      cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
            break;
+       case BOUNDA:
+           FBC_BOUND_NOLOAD(isWORDCHAR_A,
+                            isWORDCHAR_A(tmp),
+                            isWORDCHAR_A((U8*)s));
+           break;
        case NBOUND:
            FBC_NBOUND(isWORDCHAR,
                       isALNUM_uni(tmp),
                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
            break;
+       case NBOUNDA:
+           FBC_NBOUND_NOLOAD(isWORDCHAR_A,
+                             isWORDCHAR_A(tmp),
+                             isWORDCHAR_A((U8*)s));
+           break;
        case BOUNDU:
            FBC_BOUND(isWORDCHAR_L1,
                      isALNUM_uni(tmp),
@@ -1628,6 +1686,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
                 isWORDCHAR((U8) *s)
            );
+       case ALNUMA:
+           REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
        case NALNUMU:
            REXEC_FBC_CSCAN_PRELOAD(
                LOAD_UTF8_CHARCLASS_PERL_WORD(),
@@ -1640,6 +1700,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
                 ! isALNUM(*s)
            );
+       case NALNUMA:
+           REXEC_FBC_UTF8_CLASS_SCAN( !isWORDCHAR_A(*s));
        case NALNUML:
            REXEC_FBC_CSCAN_TAINT(
                !isALNUM_LC_utf8((U8*)s),
@@ -1657,6 +1719,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
                 isSPACE((U8) *s)
            );
+       case SPACEA:
+           REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
        case SPACEL:
            REXEC_FBC_CSCAN_TAINT(
                isSPACE_LC_utf8((U8*)s),
@@ -1674,6 +1738,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
                 ! isSPACE((U8) *s)
            );
+       case NSPACEA:
+           REXEC_FBC_UTF8_CLASS_SCAN( !isSPACE_A(*s));
        case NSPACEL:
            REXEC_FBC_CSCAN_TAINT(
                !isSPACE_LC_utf8((U8*)s),
@@ -1685,6 +1751,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
                isDIGIT(*s)
            );
+       case DIGITA:
+           REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
        case DIGITL:
            REXEC_FBC_CSCAN_TAINT(
                isDIGIT_LC_utf8((U8*)s),
@@ -1696,6 +1764,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
                !isDIGIT(*s)
            );
+       case NDIGITA:
+           REXEC_FBC_UTF8_CLASS_SCAN( !isDIGIT_A(*s));
        case NDIGITL:
            REXEC_FBC_CSCAN_TAINT(
                !isDIGIT_LC_utf8((U8*)s),
@@ -3652,10 +3722,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
            /* FALL THROUGH */
        case BOUND:
        case BOUNDU:
+       case BOUNDA:
        case NBOUND:
        case NBOUNDU:
+       case NBOUNDA:
            /* was last char in word? */
-           if (utf8_target) {
+           if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) {
                if (locinput == PL_bostr)
                    ln = '\n';
                else {
@@ -3674,6 +3746,18 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                }
            }
            else {
+
+               /* Here the string isn't utf8, or is utf8 and only ascii
+                * characters are to match \w.  In the latter case looking at
+                * the byte just prior to the current one may be just the final
+                * byte of a multi-byte character.  This is ok.  There are two
+                * cases:
+                * 1) it is a single byte character, and then the test is doing
+                *      just what it's supposed to.
+                * 2) it is a multi-byte character, in which case the final
+                *      byte is never mistakable for ASCII, and so the test
+                *      will say it is not a word character, which is the
+                *      correct answer. */
                ln = (locinput != PL_bostr) ?
                    UCHARAT(locinput - 1) : '\n';
                switch (FLAGS(scan)) {
@@ -3689,6 +3773,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                        ln = isALNUM(ln);
                        n = isALNUM(nextchr);
                        break;
+                   case REGEX_ASCII_RESTRICTED_CHARSET:
+                       ln = isWORDCHAR_A(ln);
+                       n = isWORDCHAR_A(nextchr);
+                       break;
                    default:
                        Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
                        break;
@@ -3727,15 +3815,18 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
                  ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
                  ALNUMU, NALNUMU, isWORDCHAR_L1,
+                 ALNUMA, NALNUMA, isWORDCHAR_A,
                  perl_word, "a");
 
         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
                  SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
                  SPACEU, NSPACEU, isSPACE_L1,
+                 SPACEA, NSPACEA, isSPACE_A,
                  perl_space, " ");
 
         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
                DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
+               DIGITA, NDIGITA, isDIGIT_A,
                posix_digit, "0");
 
        case CLUMP: /* Match \X: logical Unicode character.  This is defined as
@@ -5989,6 +6080,11 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            scan++;
        }
        break;
+    case ALNUMA:
+       while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
+           scan++;
+       }
+       break;
     case ALNUML:
        PL_reg_flags |= RF_tainted;
        if (utf8_target) {
@@ -6029,6 +6125,18 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            scan++;
        }
        break;
+    case NALNUMA:
+       if (utf8_target) {
+           while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
+               scan += UTF8SKIP(scan);
+           }
+       }
+       else {
+           while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
+               scan++;
+           }
+       }
+       break;
     case NALNUML:
        PL_reg_flags |= RF_tainted;
        if (utf8_target) {
@@ -6073,6 +6181,11 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            scan++;
        }
        break;
+    case SPACEA:
+       while (scan < loceol && isSPACE_A((U8) *scan)) {
+           scan++;
+       }
+       break;
     case SPACEL:
        PL_reg_flags |= RF_tainted;
        if (utf8_target) {
@@ -6117,6 +6230,18 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            scan++;
        }
        break;
+    case NSPACEA:
+       if (utf8_target) {
+           while (scan < loceol && ! isSPACE_A((U8) *scan)) {
+               scan += UTF8SKIP(scan);
+           }
+       }
+       else {
+           while (scan < loceol && ! isSPACE_A((U8) *scan)) {
+               scan++;
+           }
+       }
+       break;
     case NSPACEL:
        PL_reg_flags |= RF_tainted;
        if (utf8_target) {
@@ -6145,6 +6270,11 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                scan++;
        }
        break;
+    case DIGITA:
+       while (scan < loceol && isDIGIT_A((U8) *scan)) {
+           scan++;
+       }
+       break;
     case DIGITL:
        PL_reg_flags |= RF_tainted;
        if (utf8_target) {
@@ -6172,6 +6302,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            while (scan < loceol && !isDIGIT(*scan))
                scan++;
        }
+       break;
+    case NDIGITA:
+       if (utf8_target) {
+           while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
+               scan += UTF8SKIP(scan);
+           }
+       }
+       else {
+           while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
+               scan++;
+           }
+       }
+       break;
     case NDIGITL:
        PL_reg_flags |= RF_tainted;
        if (utf8_target) {
index d677ae8..9780297 100644 (file)
--- a/regexp.h
+++ b/regexp.h
@@ -258,6 +258,7 @@ and check for NULL.
 #define LOCALE_PAT_MOD       'l'
 #define UNICODE_PAT_MOD      'u'
 #define DEPENDS_PAT_MOD      'd'
+#define ASCII_RESTRICT_PAT_MOD 'a'
 
 #define ONCE_PAT_MODS        "o"
 #define KEEPCOPY_PAT_MODS    "p"
@@ -267,6 +268,7 @@ and check for NULL.
 #define LOCALE_PAT_MODS      "l"
 #define UNICODE_PAT_MODS     "u"
 #define DEPENDS_PAT_MODS     "d"
+#define ASCII_RESTRICT_PAT_MODS "a"
 
 /* This string is expected by regcomp.c to be ordered so that the first
  * character is the flag in bit RXf_PMf_STD_PMMOD_SHIFT of extflags; the next
@@ -314,6 +316,7 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp)
         case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
         case REGEX_LOCALE_CHARSET:  return LOCALE_PAT_MODS;
         case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
+       case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
     }
 
     return "?";            /* Unknown */
index 41f717b..f34cec8 100644 (file)
@@ -35,7 +35,7 @@ $testcases{'[:space:]'} = $testcases{'\s'};
 $testcases{'[:word:]'} = $testcases{'\w'};
 
 # For each possible character set...
-foreach my $charset ("d", "u") {
+foreach my $charset ("a", "d", "u") {
 
     # And in utf8 or not
     foreach my $upgrade ("", 'utf8::upgrade($a); ') {
index 041296a..3c3b10f 100644 (file)
@@ -628,15 +628,15 @@ $(?<=^(a))        a       y       $1      a
 ([[:upper:]]+) ABcd01Xy__--  ${nulnul}${ffff}  y       $1      AB
 ([[:xdigit:]]+)        ABcd01Xy__--  ${nulnul}${ffff}  y       $1      ABcd01
 ([[:^alpha:]]+)        ABcd01Xy__--  ${nulnul}${ffff}  y       $1      01
-([[:^alnum:]]+)        ABcd01Xy__--  ${nulnul}${ffff}  yT      $1      __--  ${nulnul}${ffff}
+((?a)[[:^alnum:]]+)    ABcd01Xy__--  ${nulnul}${ffff}  y       $1      __--  ${nulnul}${ffff}
 ([[:^ascii:]]+)        ABcd01Xy__--  ${nulnul}${ffff}  y       $1      ${ffff}
 ([[:^cntrl:]]+)        ABcd01Xy__--  ${nulnul}${ffff}  y       $1      ABcd01Xy__--  
 ([[:^digit:]]+)        ABcd01Xy__--  ${nulnul}${ffff}  y       $1      ABcd
 ([[:^lower:]]+)        ABcd01Xy__--  ${nulnul}${ffff}  y       $1      AB
-([[:^print:]]+)        ABcd01Xy__--  ${nulnul}${ffff}  yT      $1      ${nulnul}${ffff}
+((?a)[[:^print:]]+)    ABcd01Xy__--  ${nulnul}${ffff}  y       $1      ${nulnul}${ffff}
 ([[:^punct:]]+)        ABcd01Xy__--  ${nulnul}${ffff}  y       $1      ABcd01Xy
 ([[:^space:]]+)        ABcd01Xy__--  ${nulnul}${ffff}  y       $1      ABcd01Xy__--
-([[:^word:]]+) ABcd01Xy__--  ${nulnul}${ffff}  yT      $1      --  ${nulnul}${ffff}
+((?a)[[:^word:]]+)     ABcd01Xy__--  ${nulnul}${ffff}  y       $1      --  ${nulnul}${ffff}
 ([[:^upper:]]+)        ABcd01Xy__--  ${nulnul}${ffff}  y       $1      cd01
 ([[:^xdigit:]]+)       ABcd01Xy__--  ${nulnul}${ffff}  y       $1      Xy__--  ${nulnul}${ffff}
 [[:foo:]]      -       c       -       POSIX class [:foo:] unknown
@@ -906,6 +906,7 @@ tt+$        xxxtt   y       -       -
 ^([^,]{0,3},){3,}d     aaa,b,c,d       y       $1      c,
 ^([^,]{0,3},){0,3}d    aaa,b,c,d       y       $1      c,
 (?i)           y       -       -
+(?a:((?u)\w)\W)        \xC0\xC0        y       $&      \xC0\xC0
 '(?!\A)x'm     a\nxb\n y       -       -
 ^(a(b)?)+$     aba     y       -$1-$2- -a--
 ^(aa(bb)?)+$   aabbaa  y       -$1-$2- -aa--