PERL_ARGS_ASSERT_CL_IS_ANYTHING;
- for (value = 0; value <= ANYOF_MAX; value += 2)
+ for (value = 0; value < ANYOF_MAX; value += 2)
if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
return 1;
if (!(cl->flags & ANYOF_UNICODE_ALL))
PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
+ PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
if (UTF)
do {
RExC_parse += UTF8SKIP(RExC_parse);
- } while (isALNUM_utf8((U8*)RExC_parse));
+ } while (isWORDCHAR_utf8((U8*)RExC_parse));
else
do {
RExC_parse++;
- } while (isALNUM(*RExC_parse));
+ } while (isWORDCHAR(*RExC_parse));
} else {
RExC_parse++; /* so the <- from the vFAIL is after the offending character */
vFAIL("Group name must start with a non-digit word character");
const char *s = RExC_parse;
const char c = *s++;
- while (isALNUM(*s))
+ while (isWORDCHAR(*s))
s++;
if (*s && c == *s && s[1] == ']') {
SAVEFREESV(RExC_rx_sv);
}
default:
/* Allow \_ to not give an error */
- if (!SIZE_ONLY && isALNUM(value) && value != '_') {
+ if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
SAVEFREESV(RExC_rx_sv);
SAVEFREESV(listsv);
ckWARN2reg(RExC_parse,
if (! SIZE_ONLY) {
U8 classnum = namedclass_to_classnum(namedclass);
+ if (namedclass >= ANYOF_MAX) { /* If a special class */
+ if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
+
+ /* Here, should be \h, \H, \v, or \V. Neither /d nor
+ * /l make a difference in what these match. There
+ * would be problems if these characters had folds
+ * other than themselves, as cp_list is subject to
+ * folding. */
+ if (classnum != _CC_VERTSPACE) {
+ assert( namedclass == ANYOF_HORIZWS
+ || namedclass == ANYOF_NHORIZWS);
+
+ /* It turns out that \h is just a synonym for
+ * XPosixBlank */
+ classnum = _CC_BLANK;
+ }
- /* The ascii range inversion list */
- SV* ascii_source = PL_Posix_ptrs[classnum];
-
- /* The full Latin1 range inversion list */
- SV* l1_source = PL_L1Posix_ptrs[classnum];
-
- /* The name of the property to use to match the full eXtended
- * Unicode range swash fo this character class */
- const char *Xname = swash_property_names[classnum];
+ _invlist_union_maybe_complement_2nd(
+ cp_list,
+ PL_XPosix_ptrs[classnum],
+ namedclass % 2, /* Complement if odd
+ (NHORIZWS, NVERTWS) */
+ &cp_list);
+ }
+ }
+ else if (classnum == _CC_ASCII) {
+#ifdef HAS_ISASCII
+ if (LOC) {
+ ANYOF_CLASS_SET(ret, namedclass);
+ }
+ else
+#endif /* Not isascii(); just use the hard-coded definition for it */
+ _invlist_union_maybe_complement_2nd(
+ posixes,
+ PL_ASCII,
+ namedclass % 2, /* Complement if odd (NASCII) */
+ &posixes);
+ }
+ else { /* Garden variety class */
+
+ /* The ascii range inversion list */
+ SV* ascii_source = PL_Posix_ptrs[classnum];
+
+ /* The full Latin1 range inversion list */
+ SV* l1_source = PL_L1Posix_ptrs[classnum];
+
+ /* This code is structured into two major clauses. The
+ * first is for classes whose complete definitions may not
+ * already be known. If not, the Latin1 definition
+ * (guaranteed to already be known) is used plus code is
+ * generated to load the rest at run-time (only if needed).
+ * If the complete definition is known, it drops down to
+ * the second clause, where the complete definition is
+ * known */
+
+ if (classnum < _FIRST_NON_SWASH_CC) {
+
+ /* Here, the class has a swash, which may or may not
+ * already be loaded */
+
+ /* The name of the property to use to match the full
+ * eXtended Unicode range swash for this character
+ * class */
+ const char *Xname = swash_property_names[classnum];
+
+ if ( ! PL_utf8_swash_ptrs[classnum]) {
+ if (namedclass % 2 == 0) { /* A non-complemented
+ class */
+ /* If not /a matching, there are code points we
+ * don't know at compile time. Arrange for the
+ * unknown matches to be loaded at run-time, if
+ * needed */
+ if (! AT_LEAST_ASCII_RESTRICTED) {
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
+ Xname);
+ }
+ if (LOC) { /* Under locale, set run-time
+ lookup */
+ ANYOF_CLASS_SET(ret, namedclass);
+ }
+ else {
+ /* Add the current class's code points to
+ * the running total */
+ _invlist_union(posixes,
+ (AT_LEAST_ASCII_RESTRICTED)
+ ? ascii_source
+ : l1_source,
+ &posixes);
+ }
+ }
+ else { /* A complemented class */
+ if (AT_LEAST_ASCII_RESTRICTED) {
+ /* Under /a should match everything above
+ * ASCII, plus the complement of the set's
+ * ASCII matches */
+ _invlist_union_complement_2nd(posixes,
+ ascii_source,
+ &posixes);
+ }
+ else {
+ /* Arrange for the unknown matches to be
+ * loaded at run-time, if needed */
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
+ Xname);
+ runtime_posix_matches_above_Unicode = TRUE;
+ if (LOC) {
+ ANYOF_CLASS_SET(ret, namedclass);
+ }
+ else {
+
+ /* We want to match everything in
+ * Latin1, except those things that
+ * l1_source matches */
+ SV* scratch_list = NULL;
+ _invlist_subtract(PL_Latin1, l1_source,
+ &scratch_list);
+
+ /* Add the list from this class to the
+ * running total */
+ if (! posixes) {
+ posixes = scratch_list;
+ }
+ else {
+ _invlist_union(posixes,
+ scratch_list,
+ &posixes);
+ SvREFCNT_dec_NN(scratch_list);
+ }
+ if (DEPENDS_SEMANTICS) {
+ ANYOF_FLAGS(ret)
+ |= ANYOF_NON_UTF8_LATIN1_ALL;
+ }
+ }
+ }
+ }
+ goto namedclass_done;
+ }
- switch ((I32)namedclass) {
+ /* Here, there is a swash loaded for the class. If no
+ * inversion list for it yet, get it */
+ if (! PL_XPosix_ptrs[classnum]) {
+ PL_XPosix_ptrs[classnum]
+ = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
+ }
+ }
- case ANYOF_DIGIT:
- l1_source = ascii_source;
- /* FALL THROUGH */
+ /* Here there is an inversion list already loaded for the
+ * entire class */
- case ANYOF_ALPHANUMERIC: /* C's alnum, in contrast to \w */
- case ANYOF_ALPHA:
- case ANYOF_CASED:
- case ANYOF_GRAPH:
- case ANYOF_LOWER:
- case ANYOF_PRINT:
- case ANYOF_PUNCT:
- case ANYOF_UPPER:
- case ANYOF_WORDCHAR:
- if ( ! PL_utf8_swash_ptrs[classnum]) {
-
- /* If not /a matching, there are code points we don't
- * know at compile time. Arrange for the unknown
- * matches to be loaded at run-time, if needed */
- if (! AT_LEAST_ASCII_RESTRICTED) {
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n", Xname);
- }
- if (LOC) { /* Under locale, set run-time lookup */
- ANYOF_CLASS_SET(ret, namedclass);
- }
- else {
- /* Add the current class's code points to the
- * running total */
+ if (namedclass % 2 == 0) { /* A non-complemented class,
+ like ANYOF_PUNCT */
+ if (! LOC) {
+ /* For non-locale, just add it to any existing list
+ * */
_invlist_union(posixes,
(AT_LEAST_ASCII_RESTRICTED)
- ? ascii_source
- : l1_source,
+ ? ascii_source
+ : PL_XPosix_ptrs[classnum],
&posixes);
}
- break;
- }
- if (! PL_XPosix_ptrs[classnum]) {
- PL_XPosix_ptrs[classnum]
- = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
- }
- /* FALL THROUGH */
-
- case ANYOF_BLANK:
- case ANYOF_CNTRL:
- case ANYOF_PSXSPC:
- case ANYOF_SPACE:
- case ANYOF_XDIGIT:
- if (! LOC) {
- /* For non-locale, just add it to any existing list */
- _invlist_union(posixes,
- (AT_LEAST_ASCII_RESTRICTED)
- ? ascii_source
- : PL_XPosix_ptrs[classnum],
- &posixes);
- }
- else { /* Locale */
- SV* scratch_list = NULL;
+ else { /* Locale */
+ SV* scratch_list = NULL;
- /* For above Latin1 code points, we use the full
- * Unicode range */
- _invlist_intersection(PL_AboveLatin1,
- PL_XPosix_ptrs[classnum],
- &scratch_list);
- /* And set the output to it, adding instead if there
- * already is an output. Checking if 'posixes' is NULL
- * first saves an extra clone. Its reference count
- * will be decremented at the next union, etc, or if
- * this is the only instance, at the end of the routine
- * */
- if (! posixes) {
- posixes = scratch_list;
- }
- else {
- _invlist_union(posixes, scratch_list, &posixes);
- SvREFCNT_dec_NN(scratch_list);
- }
+ /* For above Latin1 code points, we use the full
+ * Unicode range */
+ _invlist_intersection(PL_AboveLatin1,
+ PL_XPosix_ptrs[classnum],
+ &scratch_list);
+ /* And set the output to it, adding instead if
+ * there already is an output. Checking if
+ * 'posixes' is NULL first saves an extra clone.
+ * Its reference count will be decremented at the
+ * next union, etc, or if this is the only
+ * instance, at the end of the routine */
+ if (! posixes) {
+ posixes = scratch_list;
+ }
+ else {
+ _invlist_union(posixes, scratch_list, &posixes);
+ SvREFCNT_dec_NN(scratch_list);
+ }
#ifndef HAS_ISBLANK
- if (namedclass != ANYOF_BLANK) {
-#endif
- /* Set this class in the node for runtime
- * matching */
- ANYOF_CLASS_SET(ret, namedclass);
-#ifndef HAS_ISBLANK
- }
- else {
- /* No isblank(), use the hard-coded ASCII-range
- * blanks, adding them to the running total. */
-
- _invlist_union(posixes, ascii_source, &posixes);
- }
+ if (namedclass != ANYOF_BLANK) {
#endif
- }
- break;
-
- case ANYOF_NDIGIT:
- l1_source = ascii_source;
- /* FALL THROUGH */
-
- case ANYOF_NALPHANUMERIC:
- case ANYOF_NALPHA:
- case ANYOF_NGRAPH:
- case ANYOF_NLOWER:
- case ANYOF_NPRINT:
- case ANYOF_NPUNCT:
- case ANYOF_NUPPER:
- case ANYOF_NWORDCHAR:
- if ( ! PL_utf8_swash_ptrs[classnum]) {
- if (AT_LEAST_ASCII_RESTRICTED) {
- /* Under /a should match everything above ASCII,
- * and the complement of the set's ASCII matches */
- _invlist_union_complement_2nd(posixes, ascii_source,
- &posixes);
- }
- else {
- /* Arrange for the unknown matches to be loaded at
- * run-time, if needed */
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n", Xname);
- runtime_posix_matches_above_Unicode = TRUE;
- if (LOC) {
+ /* Set this class in the node for runtime
+ * matching */
ANYOF_CLASS_SET(ret, namedclass);
+#ifndef HAS_ISBLANK
}
else {
+ /* No isblank(), use the hard-coded ASCII-range
+ * blanks, adding them to the running total. */
- /* We want to match everything in Latin1,
- * except those things that l1_source matches
- * */
- SV* scratch_list = NULL;
- _invlist_subtract(PL_Latin1, l1_source,
- &scratch_list);
-
- /* Add the list from this class to the running
- * total */
- if (! posixes) {
- posixes = scratch_list;
- }
- else {
- _invlist_union(posixes, scratch_list,
- &posixes);
- SvREFCNT_dec_NN(scratch_list);
- }
- if (DEPENDS_SEMANTICS) {
- ANYOF_FLAGS(ret)
- |= ANYOF_NON_UTF8_LATIN1_ALL;
- }
+ _invlist_union(posixes, ascii_source, &posixes);
}
+#endif
}
- break;
}
- if (! PL_XPosix_ptrs[classnum]) {
- PL_XPosix_ptrs[classnum]
- = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
- }
- /* FALL THROUGH */
-
- case ANYOF_NBLANK:
- case ANYOF_NCNTRL:
- case ANYOF_NPSXSPC:
- case ANYOF_NSPACE:
- case ANYOF_NXDIGIT:
- if (! LOC) {
- _invlist_union_complement_2nd(
+ else { /* A complemented class, like ANYOF_NPUNCT */
+ if (! LOC) {
+ _invlist_union_complement_2nd(
posixes,
(AT_LEAST_ASCII_RESTRICTED)
? ascii_source
: PL_XPosix_ptrs[classnum],
&posixes);
- /* Under /d, everything in the upper half of the Latin1
- * range matches this complement */
- if (DEPENDS_SEMANTICS) {
- ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
- }
- }
- else { /* Locale */
- SV* scratch_list = NULL;
- _invlist_subtract(PL_AboveLatin1,
- PL_XPosix_ptrs[classnum],
- &scratch_list);
- if (! posixes) {
- posixes = scratch_list;
- }
- else {
- _invlist_union(posixes, scratch_list, &posixes);
- SvREFCNT_dec_NN(scratch_list);
+ /* Under /d, everything in the upper half of the
+ * Latin1 range matches this complement */
+ if (DEPENDS_SEMANTICS) {
+ ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
+ }
}
+ else { /* Locale */
+ SV* scratch_list = NULL;
+ _invlist_subtract(PL_AboveLatin1,
+ PL_XPosix_ptrs[classnum],
+ &scratch_list);
+ if (! posixes) {
+ posixes = scratch_list;
+ }
+ else {
+ _invlist_union(posixes, scratch_list, &posixes);
+ SvREFCNT_dec_NN(scratch_list);
+ }
#ifndef HAS_ISBLANK
- if (namedclass != ANYOF_NBLANK) {
+ if (namedclass != ANYOF_NBLANK) {
#endif
- ANYOF_CLASS_SET(ret, namedclass);
+ ANYOF_CLASS_SET(ret, namedclass);
#ifndef HAS_ISBLANK
- }
- else {
- /* Get the list of all code points in Latin1 that
- * are not ASCII blanks, and add them to the
- * running total */
- _invlist_subtract(PL_Latin1, ascii_source,
- &scratch_list);
- _invlist_union(posixes, scratch_list, &posixes);
- SvREFCNT_dec_NN(scratch_list);
- }
-#endif
- }
- break;
-
- case ANYOF_ASCII:
-#ifdef HAS_ISASCII
- if (LOC) {
- ANYOF_CLASS_SET(ret, namedclass);
- }
- else
-#endif /* Not isascii(); just use the hard-coded definition for it */
- _invlist_union(posixes, PL_ASCII, &posixes);
- break;
- case ANYOF_NASCII:
-#ifdef HAS_ISASCII
- if (LOC) {
- ANYOF_CLASS_SET(ret, namedclass);
- }
- else {
+ }
+ else {
+ /* Get the list of all code points in Latin1
+ * that are not ASCII blanks, and add them to
+ * the running total */
+ _invlist_subtract(PL_Latin1, ascii_source,
+ &scratch_list);
+ _invlist_union(posixes, scratch_list, &posixes);
+ SvREFCNT_dec_NN(scratch_list);
+ }
#endif
- _invlist_union_complement_2nd(posixes,
- PL_ASCII, &posixes);
- if (DEPENDS_SEMANTICS) {
- ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
}
-#ifdef HAS_ISASCII
}
-#endif
- break;
-
- case ANYOF_HORIZWS:
- /* For these, we use the cp_list, as neither /d nor /l make
- * a difference in what these match. There would be
- * problems if these characters had folds other than
- * themselves, as cp_list is subject to folding.
- *
- * It turns out that \h is just a synonym for XPosixBlank */
- classnum = _CC_BLANK;
- /* FALL THROUGH */
-
- case ANYOF_VERTWS:
- _invlist_union(cp_list, PL_XPosix_ptrs[classnum], &cp_list);
- break;
-
- case ANYOF_NHORIZWS:
- classnum = _CC_BLANK;
- /* FALL THROUGH */
-
- case ANYOF_NVERTWS:
- _invlist_union_complement_2nd(cp_list,
- PL_XPosix_ptrs[classnum],
- &cp_list);
- break;
-
- case ANYOF_UNIPROP: /* this is to handle \p and \P */
- break;
-
- default:
- vFAIL("Invalid [::] class");
- break;
- }
-
+ }
+ namedclass_done:
continue; /* Go get next character */
}
} /* end of namedclass \blah */