d => 0,
l => 1,
u => 2,
+ a => 3,
);
sub setcolor {
my $reflags = $^H{reflags} || 0;
my $seen_charset;
for(split//, $s) {
- if (/[dul]/) {
+ if (/[adul]/) {
if ($on) {
if ($seen_charset && $seen_charset ne $_) {
require Carp;
at the top of your code.
-The character set /dul flags cancel each other out. So, in this example,
+The character set /adul flags cancel each other out. So, in this example,
use re "/u";
"ss" =~ /\xdf/;
typedef enum {
REGEX_DEPENDS_CHARSET = 0,
REGEX_LOCALE_CHARSET,
- REGEX_UNICODE_CHARSET
+ REGEX_UNICODE_CHARSET,
+ REGEX_ASCII_RESTRICTED_CHARSET
} regex_charset;
#define _RXf_PMf_CHARSET_SHIFT ((RXf_PMf_STD_PMMOD_SHIFT)+5)
[ List each enhancement as a =head2 entry ]
+=head2 New regular expression modifier C</a>
+
+The C</a> regular expression modifier restricts C<\s> to match precisely
+the five characters C<[ \f\n\r\t]>, C<\d> to match precisely the 10
+characters C<[0-9]>, C<\w> to match precisely the 63 characters
+C<[A-Za-z0-9_]>, and the Posix (C<[[:posix:]]>) character classes to
+match only the appropriate ASCII characters. The complements, of
+course, match everything but; and C<\b> and C<\B> are correspondingly
+affected. Otherwise, C</a> behaves like the C</u> modifier, in that
+case-insensitive matching uses Unicode semantics; for example, "k" will
+match the Unicode C<\N{KELVIN SIGN}> under C</i> matching, and code
+points in the Latin1 range, above ASCII will have Unicode semantics when
+it comes to case-insensitive matching. Like its cousins (C</u>, C</l>,
+and C</d>), and in spite of the terminology, C</a> in 5.14 will not
+actually be able to be used as a suffix at the end of a regular
+expression (this restriction is planned to be lifted in 5.16). It must
+occur either as an infix modifier, such as C<(?a:...)> or (C<(?a)...>,
+or it can be turned on within the lexical scope of C<use re '/a'>.
+Turning on C</a> turns off the other "character set" modifiers.
+
=head2 Any unsigned value can be encoded as a character
With this release, Perl is adopting a model that any unsigned value can
the comment as soon as it sees a C<)>, so there is no way to put a literal
C<)> in the comment.
-=item C<(?dlupimsx-imsx)>
+=item C<(?adlupimsx-imsx)>
-=item C<(?^lupimsx)>
+=item C<(?^alupimsx)>
X<(?)> X<(?^)>
One or more embedded pattern-match modifiers, to be turned on (or
C<"d">) may follow the caret to override it.
But a minus sign is not legal with it.
-Also, starting in Perl 5.14, are modifiers C<"d">, C<"l">, and C<"u">,
-which for 5.14 may not be used as suffix modifiers.
+Also, starting in Perl 5.14, are modifiers C<"a">, C<"d">, C<"l">, and
+C<"u">, which for 5.14 may not be used as suffix modifiers.
C<"l"> means to use a locale (see L<perllocale>) when pattern matching.
The locale used will be the one in effect at the time of execution of
in strict ASCII their meanings are undefined. Thus the platform
effectively becomes a Unicode platform. The ASCII characters remain as
ASCII characters (since ASCII is a subset of Latin-1 and Unicode). For
-example, when this option is not on, on a non-utf8 string, C<"\w">
+example, when this option is XXX not on, on a non-utf8 string, C<"\w">
matches precisely C<[A-Za-z0-9_]>. When the option is on, it matches
not just those, but all the Latin-1 word characters (such as an "n" with
a tilde). On EBCDIC platforms, which already are equivalent to Latin-1,
S> will match any of C<SS>, C<Ss>, C<sS>, and C<ss>, otherwise not.
(This last case is buggy, however.)
+C<"a"> is the same as C<"u">, but C<\d>, C<\s>, C<\w>, and the Posix
+character classes are restricted to matching in the ASCII range only.
+That is, with this modifier, C<\d> always means precisely the digits
+C<"0"> to C<"9">; C<\s> means the five characters C<[ \f\n\r\t]>;
+C<\w> means the 53 characters C<[A-Za-z0-9_]>; and likewise, all the
+Posix classes such as C<[[:print:]]> match only the appropriate
+ASCII-range characters. As you would expect, this modifier causes, for
+example, C<\D> to mean the same thing as C<[^0-9]>. C<"a"> behaves the
+same as C<"u"> with regards to case-insensitive matches. XXX
+
C<"d"> means to use the traditional Perl pattern matching behavior.
This is dualistic (hence the name C<"d">, which also could stand for
"depends"). When this is in effect, Perl matches utf8-encoded strings
=item C<(?:pattern)>
X<(?:)>
-=item C<(?dluimsx-imsx:pattern)>
+=item C<(?adluimsx-imsx:pattern)>
-=item C<(?^luimsx:pattern)>
+=item C<(?^aluimsx:pattern)>
X<(?^:)>
This is for clustering, not capturing; it groups subexpressions like
characters if you don't need to.
Any letters between C<?> and C<:> act as flags modifiers as with
-C<(?dluimsx-imsx)>. For example,
+C<(?adluimsx-imsx)>. For example,
/(?s-i:more.*than).*million/i
New in perl 5.10.0 are the classes C<\h> and C<\v> which match horizontal
and vertical whitespace characters.
+The exact set of characters matched by C<\d>, C<\s>, and C<\w> varies
+depending on various pragma and regular expression modifiers. See
+L<perlre>.
+
The uppercase variants (C<\W>, C<\D>, C<\S>, C<\H>, and C<\V>) are
character classes that match, respectively, any character that isn't a
word character, digit, whitespace, horizontal whitespace, or vertical
matches just the digits '0' to '9'.
Unicode digits may cause some confusion, and some security issues. In UTF-8
-strings, C<\d> matches the same characters matched by
+strings, unless the C<"a"> modifier is specified, C<\d> matches the same
+characters matched by
C<\p{General_Category=Decimal_Number}>, or synonymously,
C<\p{General_Category=Digit}>. Starting with Unicode version 4.1, this is the
same set of characters matched by C<\p{Numeric_Type=Decimal}>.
#define UTF cBOOL(RExC_utf8)
#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
+#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
+#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
+#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
cs = REGEX_UNICODE_CHARSET;
has_charset_modifier = 1;
break;
+ case ASCII_RESTRICT_PAT_MOD:
+ if (has_charset_modifier || flagsp == &negflags) {
+ goto fail_modifiers;
+ }
+ cs = REGEX_ASCII_RESTRICTED_CHARSET;
+ has_charset_modifier = 1;
+ break;
case DEPENDS_PAT_MOD:
if (has_use_defaults
|| has_charset_modifier
case REGEX_UNICODE_CHARSET:
op = ALNUMU;
break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ op = ALNUMA;
+ break;
case REGEX_DEPENDS_CHARSET:
op = ALNUM;
break;
case REGEX_UNICODE_CHARSET:
op = NALNUMU;
break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ op = NALNUMA;
+ break;
case REGEX_DEPENDS_CHARSET:
op = NALNUM;
break;
case REGEX_UNICODE_CHARSET:
op = BOUNDU;
break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ op = BOUNDA;
+ break;
case REGEX_DEPENDS_CHARSET:
op = BOUND;
break;
case REGEX_UNICODE_CHARSET:
op = NBOUNDU;
break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ op = NBOUNDA;
+ break;
case REGEX_DEPENDS_CHARSET:
op = NBOUND;
break;
case REGEX_UNICODE_CHARSET:
op = SPACEU;
break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ op = SPACEA;
+ break;
case REGEX_DEPENDS_CHARSET:
op = SPACE;
break;
case REGEX_UNICODE_CHARSET:
op = NSPACEU;
break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ op = NSPACEA;
+ break;
case REGEX_DEPENDS_CHARSET:
op = NSPACE;
break;
case REGEX_LOCALE_CHARSET:
op = DIGITL;
break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ op = DIGITA;
+ break;
case REGEX_DEPENDS_CHARSET: /* No difference between these */
case REGEX_UNICODE_CHARSET:
op = DIGIT;
case REGEX_LOCALE_CHARSET:
op = NDIGITL;
break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ op = NDIGITA;
+ break;
case REGEX_DEPENDS_CHARSET: /* No difference between these */
case REGEX_UNICODE_CHARSET:
op = NDIGIT;
ret = reganode(pRExC_state,
((! FOLD)
? NREF
- : (UNI_SEMANTICS)
+ : (AT_LEAST_UNI_SEMANTICS)
? NREFFU
: (LOC)
? NREFFL
ret = reganode(pRExC_state,
((! FOLD)
? REF
- : (UNI_SEMANTICS)
+ : (AT_LEAST_UNI_SEMANTICS)
? REFFU
: (LOC)
? REFFL
(U8) ((! FOLD) ? EXACT
: (LOC)
? EXACTFL
- : (UNI_SEMANTICS)
+ : (AT_LEAST_UNI_SEMANTICS)
? EXACTFU
: EXACTF)
);
if (! TEST_7) stored += \
S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
} \
- /* For a non-ut8 target string with DEPENDS semantics, all above ASCII \
- * Latin1 code points match the complement of any of the classes. But \
- * in utf8, they have their Unicode semantics, so can't just set them \
- * in the bitmap, or else regexec.c will think they matched when they \
- * shouldn't. */ \
- ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8; \
+ if (ASCII_RESTRICTED) { \
+ for (value = 128; value < 256; value++) { \
+ stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
+ } \
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8; \
+ } \
+ else { \
+ /* For a non-ut8 target string with DEPENDS semantics, all above \
+ * ASCII Latin1 code points match the complement of any of the \
+ * classes. But in utf8, they have their Unicode semantics, so \
+ * can't just set them in the bitmap, or else regexec.c will think \
+ * they matched when they shouldn't. */ \
+ ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8; \
+ } \
} \
yesno = '!'; \
what = WORD; \
U8 stored = 0;
U8 fold;
- fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
+ fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
: PL_fold[value];
/* It assumes the bit for 'value' has already been set */
stored +=
S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
}
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
yesno = '!';
what = "ASCII";
break;
}
yesno = '!';
what = POSIX_CC_UNI_NAME("Digit");
+ if (ASCII_RESTRICTED ) {
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
+ }
break;
case ANYOF_MAX:
/* this is to handle \p and \P */
vFAIL("Invalid [::] class");
break;
}
- if (what) {
+ if (what && ! (ASCII_RESTRICTED)) {
/* Strings such as "+utf8::isWord\n" */
Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
ANYOF_FLAGS(ret) |= ANYOF_UTF8;
op = EXACT;
}
} /* else 2 chars in the bit map: the folds of each other */
- else if (UNI_SEMANTICS || !isASCII(value)) {
+ else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
/* To join adjacent nodes, they must be the exact EXACTish type.
* Try to use the most likely type, by using EXACTFU if the regex
case REGEX_LOCALE_CHARSET:
PerlIO_printf(Perl_debug_log, "LOCALE");
break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
+ break;
default:
PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
break;
* This is because that code fails when the test succeeds, so we want to have
* the test fail so that the code succeeds. The swash is stored in a
* predictable PL_ place */
-#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR) \
+#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
+ CLASS, STR) \
case NAME: \
_CCC_TRY_CODE( !, FUNC, \
cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
* irrelevant here */
#define CCC_TRY(NAME, NNAME, FUNC, \
NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
+ NAMEA, NNAMEA, FUNCA, \
CLASS, STR) \
case NAMEL: \
PL_reg_flags |= RF_tainted; \
case NNAMEL: \
PL_reg_flags |= RF_tainted; \
_CCC_TRY_CODE( , LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
+ case NAMEA: \
+ if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
+ sayNO; \
+ } \
+ /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
+ nextchr = UCHARAT(++locinput); \
+ break; \
+ case NNAMEA: \
+ if (locinput >= PL_regeol || FUNCA(nextchr)) { \
+ sayNO; \
+ } \
+ if (utf8_target) { \
+ locinput += PL_utf8skip[nextchr]; \
+ nextchr = UCHARAT(locinput); \
+ } \
+ else { \
+ nextchr = UCHARAT(++locinput); \
+ } \
+ break; \
/* Generate the non-locale cases */ \
_CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
#define CCC_TRY_U(NAME, NNAME, FUNC, \
NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
NAMEU, NNAMEU, FUNCU, \
+ NAMEA, NNAMEA, FUNCA, \
CLASS, STR) \
- CCC_TRY(NAME, NNAME, FUNC, NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, CLASS, STR) \
+ CCC_TRY(NAME, NNAME, FUNC, \
+ NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
+ NAMEA, NNAMEA, FUNCA, \
+ CLASS, STR) \
_CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
#define DUMP_EXEC_POS(li,s,doutf8) \
dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
+
+#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
+ tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
+ tmp = TEST_NON_UTF8(tmp); \
+ REXEC_FBC_UTF8_SCAN( \
+ if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
+ tmp = !tmp; \
+ IF_SUCCESS; \
+ } \
+ else { \
+ IF_FAIL; \
+ } \
+ ); \
+
+#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
+ if (s == PL_bostr) { \
+ tmp = '\n'; \
+ } \
+ else { \
+ U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
+ tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
+ } \
+ tmp = TeSt1_UtF8; \
+ LOAD_UTF8_CHARCLASS_ALNUM(); \
+ REXEC_FBC_UTF8_SCAN( \
+ if (tmp == ! (TeSt2_UtF8)) { \
+ tmp = !tmp; \
+ IF_SUCCESS; \
+ } \
+ else { \
+ IF_FAIL; \
+ } \
+ ); \
+
/* The only difference between the BOUND and NBOUND cases is that
* REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
* NBOUND. This is accomplished by passing it in either the if or else clause,
* with the other one being empty */
#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
- FBC_BOUND_COMMON(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, )
+ FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, ), TEST_NON_UTF8, REXEC_FBC_TRYIT, )
+
+#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
+ FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, ), TEST_NON_UTF8, REXEC_FBC_TRYIT, )
#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
- FBC_BOUND_COMMON(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8, , REXEC_FBC_TRYIT)
+ FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, , REXEC_FBC_TRYIT), TEST_NON_UTF8, , REXEC_FBC_TRYIT)
+
+#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
+ FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, , REXEC_FBC_TRYIT), TEST_NON_UTF8, , REXEC_FBC_TRYIT)
+
/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
* be passed in completely with the variable name being tested, which isn't
* one, and compare it with the wordness of this one. If they differ, we have
* a boundary. At the beginning of the string, pretend that the previous
* character was a new-line */
-#define FBC_BOUND_COMMON(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8, \
- IF_SUCCESS, IF_FAIL) \
+#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
if (utf8_target) { \
- if (s == PL_bostr) { \
- tmp = '\n'; \
- } \
- else { \
- U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
- tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
- } \
- tmp = TEST1_UTF8; \
- LOAD_UTF8_CHARCLASS_ALNUM(); \
- REXEC_FBC_UTF8_SCAN( \
- if (tmp == ! (TEST2_UTF8)) { \
- tmp = !tmp; \
- IF_SUCCESS; \
- } \
- else { \
- IF_FAIL; \
- } \
- ); \
+ UTF8_CODE \
} \
else { /* Not utf8 */ \
tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
isALNUM_uni(tmp),
cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
break;
+ case BOUNDA:
+ FBC_BOUND_NOLOAD(isWORDCHAR_A,
+ isWORDCHAR_A(tmp),
+ isWORDCHAR_A((U8*)s));
+ break;
case NBOUND:
FBC_NBOUND(isWORDCHAR,
isALNUM_uni(tmp),
cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
break;
+ case NBOUNDA:
+ FBC_NBOUND_NOLOAD(isWORDCHAR_A,
+ isWORDCHAR_A(tmp),
+ isWORDCHAR_A((U8*)s));
+ break;
case BOUNDU:
FBC_BOUND(isWORDCHAR_L1,
isALNUM_uni(tmp),
swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
isWORDCHAR((U8) *s)
);
+ case ALNUMA:
+ REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
case NALNUMU:
REXEC_FBC_CSCAN_PRELOAD(
LOAD_UTF8_CHARCLASS_PERL_WORD(),
!swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
! isALNUM(*s)
);
+ case NALNUMA:
+ REXEC_FBC_UTF8_CLASS_SCAN( !isWORDCHAR_A(*s));
case NALNUML:
REXEC_FBC_CSCAN_TAINT(
!isALNUM_LC_utf8((U8*)s),
*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
isSPACE((U8) *s)
);
+ case SPACEA:
+ REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
case SPACEL:
REXEC_FBC_CSCAN_TAINT(
isSPACE_LC_utf8((U8*)s),
!(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
! isSPACE((U8) *s)
);
+ case NSPACEA:
+ REXEC_FBC_UTF8_CLASS_SCAN( !isSPACE_A(*s));
case NSPACEL:
REXEC_FBC_CSCAN_TAINT(
!isSPACE_LC_utf8((U8*)s),
swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
isDIGIT(*s)
);
+ case DIGITA:
+ REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
case DIGITL:
REXEC_FBC_CSCAN_TAINT(
isDIGIT_LC_utf8((U8*)s),
!swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
!isDIGIT(*s)
);
+ case NDIGITA:
+ REXEC_FBC_UTF8_CLASS_SCAN( !isDIGIT_A(*s));
case NDIGITL:
REXEC_FBC_CSCAN_TAINT(
!isDIGIT_LC_utf8((U8*)s),
/* FALL THROUGH */
case BOUND:
case BOUNDU:
+ case BOUNDA:
case NBOUND:
case NBOUNDU:
+ case NBOUNDA:
/* was last char in word? */
- if (utf8_target) {
+ if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) {
if (locinput == PL_bostr)
ln = '\n';
else {
}
}
else {
+
+ /* Here the string isn't utf8, or is utf8 and only ascii
+ * characters are to match \w. In the latter case looking at
+ * the byte just prior to the current one may be just the final
+ * byte of a multi-byte character. This is ok. There are two
+ * cases:
+ * 1) it is a single byte character, and then the test is doing
+ * just what it's supposed to.
+ * 2) it is a multi-byte character, in which case the final
+ * byte is never mistakable for ASCII, and so the test
+ * will say it is not a word character, which is the
+ * correct answer. */
ln = (locinput != PL_bostr) ?
UCHARAT(locinput - 1) : '\n';
switch (FLAGS(scan)) {
ln = isALNUM(ln);
n = isALNUM(nextchr);
break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ ln = isWORDCHAR_A(ln);
+ n = isWORDCHAR_A(nextchr);
+ break;
default:
Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
break;
CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
ALNUMU, NALNUMU, isWORDCHAR_L1,
+ ALNUMA, NALNUMA, isWORDCHAR_A,
perl_word, "a");
CCC_TRY_U(SPACE, NSPACE, isSPACE,
SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
SPACEU, NSPACEU, isSPACE_L1,
+ SPACEA, NSPACEA, isSPACE_A,
perl_space, " ");
CCC_TRY(DIGIT, NDIGIT, isDIGIT,
DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
+ DIGITA, NDIGITA, isDIGIT_A,
posix_digit, "0");
case CLUMP: /* Match \X: logical Unicode character. This is defined as
scan++;
}
break;
+ case ALNUMA:
+ while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
+ scan++;
+ }
+ break;
case ALNUML:
PL_reg_flags |= RF_tainted;
if (utf8_target) {
scan++;
}
break;
+ case NALNUMA:
+ if (utf8_target) {
+ while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
+ scan += UTF8SKIP(scan);
+ }
+ }
+ else {
+ while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
+ scan++;
+ }
+ }
+ break;
case NALNUML:
PL_reg_flags |= RF_tainted;
if (utf8_target) {
scan++;
}
break;
+ case SPACEA:
+ while (scan < loceol && isSPACE_A((U8) *scan)) {
+ scan++;
+ }
+ break;
case SPACEL:
PL_reg_flags |= RF_tainted;
if (utf8_target) {
scan++;
}
break;
+ case NSPACEA:
+ if (utf8_target) {
+ while (scan < loceol && ! isSPACE_A((U8) *scan)) {
+ scan += UTF8SKIP(scan);
+ }
+ }
+ else {
+ while (scan < loceol && ! isSPACE_A((U8) *scan)) {
+ scan++;
+ }
+ }
+ break;
case NSPACEL:
PL_reg_flags |= RF_tainted;
if (utf8_target) {
scan++;
}
break;
+ case DIGITA:
+ while (scan < loceol && isDIGIT_A((U8) *scan)) {
+ scan++;
+ }
+ break;
case DIGITL:
PL_reg_flags |= RF_tainted;
if (utf8_target) {
while (scan < loceol && !isDIGIT(*scan))
scan++;
}
+ break;
+ case NDIGITA:
+ if (utf8_target) {
+ while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
+ scan += UTF8SKIP(scan);
+ }
+ }
+ else {
+ while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
+ scan++;
+ }
+ }
+ break;
case NDIGITL:
PL_reg_flags |= RF_tainted;
if (utf8_target) {
#define LOCALE_PAT_MOD 'l'
#define UNICODE_PAT_MOD 'u'
#define DEPENDS_PAT_MOD 'd'
+#define ASCII_RESTRICT_PAT_MOD 'a'
#define ONCE_PAT_MODS "o"
#define KEEPCOPY_PAT_MODS "p"
#define LOCALE_PAT_MODS "l"
#define UNICODE_PAT_MODS "u"
#define DEPENDS_PAT_MODS "d"
+#define ASCII_RESTRICT_PAT_MODS "a"
/* This string is expected by regcomp.c to be ordered so that the first
* character is the flag in bit RXf_PMf_STD_PMMOD_SHIFT of extflags; the next
case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
+ case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
}
return "?"; /* Unknown */
$testcases{'[:word:]'} = $testcases{'\w'};
# For each possible character set...
-foreach my $charset ("d", "u") {
+foreach my $charset ("a", "d", "u") {
# And in utf8 or not
foreach my $upgrade ("", 'utf8::upgrade($a); ') {
([[:upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB
([[:xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01
([[:^alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 01
-([[:^alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} yT $1 __-- ${nulnul}${ffff}
+((?a)[[:^alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 __-- ${nulnul}${ffff}
([[:^ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${ffff}
([[:^cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__--
([[:^digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd
([[:^lower:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB
-([[:^print:]]+) ABcd01Xy__-- ${nulnul}${ffff} yT $1 ${nulnul}${ffff}
+((?a)[[:^print:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${nulnul}${ffff}
([[:^punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy
([[:^space:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__--
-([[:^word:]]+) ABcd01Xy__-- ${nulnul}${ffff} yT $1 -- ${nulnul}${ffff}
+((?a)[[:^word:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 -- ${nulnul}${ffff}
([[:^upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 cd01
([[:^xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 Xy__-- ${nulnul}${ffff}
[[:foo:]] - c - POSIX class [:foo:] unknown
^([^,]{0,3},){3,}d aaa,b,c,d y $1 c,
^([^,]{0,3},){0,3}d aaa,b,c,d y $1 c,
(?i) y - -
+(?a:((?u)\w)\W) \xC0\xC0 y $& \xC0\xC0
'(?!\A)x'm a\nxb\n y - -
^(a(b)?)+$ aba y -$1-$2- -a--
^(aa(bb)?)+$ aabbaa y -$1-$2- -aa--