#endif
-#define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
- case NAMEL: \
- PL_reg_flags |= RF_tainted; \
- /* FALL THROUGH */ \
- case NAME: \
- if (!nextchr) \
- sayNO; \
- if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
- if (!CAT2(PL_utf8_,CLASS)) { \
- bool ok; \
- ENTER; \
- save_re_context(); \
- ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
- assert(ok); \
- LEAVE; \
- } \
- if (!(OP(scan) == NAME \
- ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \
- : LCFUNC_utf8((U8*)locinput))) \
- { \
- sayNO; \
- } \
- locinput += PL_utf8skip[nextchr]; \
- nextchr = UCHARAT(locinput); \
- break; \
- } \
- /* Drops through to the macro that calls this one */
-
-#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
- _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
- if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
- sayNO; \
- nextchr = UCHARAT(++locinput); \
- break
-
-/* Almost identical to the above, but has a case for a node that matches chars
- * between 128 and 255 using Unicode (latin1) semantics. */
-#define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \
- _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
- if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
- sayNO; \
- nextchr = UCHARAT(++locinput); \
- break
-
-#define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
- case NAMEL: \
- PL_reg_flags |= RF_tainted; \
- /* FALL THROUGH */ \
- case NAME : \
- if (!nextchr && locinput >= PL_regeol) \
- sayNO; \
- if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
- if (!CAT2(PL_utf8_,CLASS)) { \
- bool ok; \
- ENTER; \
- save_re_context(); \
- ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
- assert(ok); \
- LEAVE; \
- } \
- if ((OP(scan) == NAME \
- ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \
- : LCFUNC_utf8((U8*)locinput))) \
- { \
- sayNO; \
- } \
- locinput += PL_utf8skip[nextchr]; \
- nextchr = UCHARAT(locinput); \
- break; \
- }
-
-#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
- _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
- if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
- sayNO; \
- nextchr = UCHARAT(++locinput); \
- break
-
-
-#define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \
- _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU) \
- if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
- sayNO; \
- nextchr = UCHARAT(++locinput); \
- break
-
-
+/* The actual code for CCC_TRY.  It uses several variables from the routine
+ * it is expanded in (such as locinput, nextchr and utf8_target), and is
+ * designed to be the bulk of a case statement: every path through it ends in
+ * either sayNO or break.
+ * POS_OR_NEG is either empty or ! ; it is placed in front of the FUNC and
+ *      UTF8_TEST results, and the expansion fails (sayNO) when the resulting
+ *      condition is true.
+ * FUNC is the macro or function called with nextchr that indicates if nextchr
+ *      matches the class; it is used on non-utf8 targets, and on utf8
+ *      targets when UTF8_IS_CONTINUED(nextchr) is false.
+ * UTF8_TEST is the whole test expression to use for utf8 targets
+ * CLASS and STR are handed to LOAD_UTF8_CHARCLASS before UTF8_TEST is
+ *      evaluated (presumably to load the swash for the class if it is not
+ *      already present; confirm against that macro's definition).
+ * The logic is: Fail if we're at the end-of-string (locinput >= PL_regeol);
+ * otherwise if the target is utf8 and the character is a variant, load the
+ * swash if necessary and test using the utf8 test; fail if that test fails,
+ * otherwise advance by PL_utf8skip[nextchr] to the next character.  If the
+ * target is not utf8, or the character is an invariant under utf8, use the
+ * non-utf8 test; fail if it fails, otherwise advance one byte to the next
+ * character */
+
+#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
+ if (locinput >= PL_regeol) { \
+ sayNO; \
+ } \
+ if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
+ LOAD_UTF8_CHARCLASS(CLASS, STR); \
+ if (POS_OR_NEG (UTF8_TEST)) { \
+ sayNO; \
+ } \
+ locinput += PL_utf8skip[nextchr]; \
+ nextchr = UCHARAT(locinput); \
+ break; \
+ } \
+ if (POS_OR_NEG (FUNC(nextchr))) { \
+ sayNO; \
+ } \
+ nextchr = UCHARAT(++locinput); \
+ break;
+
+/* Generate the two non-locale case statements for a character class (NAME)
+ * and its complement (NNAME).  _CCC_TRY_CODE fails when its (possibly
+ * complemented) test is true, so the NAME case passes ! as POS_OR_NEG to fail
+ * when the character is not in the class, and the NNAME case passes an empty
+ * POS_OR_NEG to fail when the character is in the class.  FUNC is the test
+ * used for the non-utf8 path; for the utf8 path the test is a swash_fetch on
+ * the swash whose name is PL_utf8_ joined with CLASS (built with CAT2).
+ * NOTE(review): the final line of this macro ends in a backslash, so the
+ * definition only terminates because the line that follows it is empty */
+#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR) \
+ case NAME: \
+ _CCC_TRY_CODE( !, FUNC, \
+ cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
+ (U8*)locinput, TRUE)), \
+ CLASS, STR) \
+ case NNAME: \
+ _CCC_TRY_CODE( , FUNC, \
+ cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
+ (U8*)locinput, TRUE)), \
+ CLASS, STR) \
+
+/* Generate the case statements for both locale and non-locale character
+ * classes in regmatch for classes that don't have special unicode semantics.
+ * Four cases are produced: NAMEL and NNAMEL (locale; each first sets
+ * RF_tainted in PL_reg_flags), followed by NAME and NNAME (non-locale, via
+ * _CCC_TRY_NONLOCALE).
+ * Locales don't use an immediate swash, but an intermediary special locale
+ * function (LCFUNC_utf8) that is called on the pointer to the current place
+ * in the input string.  That function will resolve to needing the same swash.
+ * One might think that because we don't know what the locale will match, we
+ * shouldn't check with the swash loading function that it loaded properly;
+ * i.e., that we should use LOAD_UTF8_CHARCLASS_NO_CHECK for those.  But what
+ * is passed to the regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so
+ * locale is irrelevant here */
+#define CCC_TRY(NAME, NNAME, FUNC, \
+ NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
+ CLASS, STR) \
+ case NAMEL: \
+ PL_reg_flags |= RF_tainted; \
+ _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
+ case NNAMEL: \
+ PL_reg_flags |= RF_tainted; \
+ _CCC_TRY_CODE( , LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
+ /* Generate the non-locale cases */ \
+ _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
+
+/* This is like CCC_TRY, but has an extra set of parameters (NAMEU, NNAMEU,
+ * FUNCU) for generating case statements to handle separate Unicode semantics
+ * nodes.  It expands to everything CCC_TRY generates, followed by a second
+ * pair of non-locale cases for NAMEU and NNAMEU that use FUNCU as the
+ * non-utf8 test and the same CLASS and STR */
+#define CCC_TRY_U(NAME, NNAME, FUNC, \
+ NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
+ NAMEU, NNAMEU, FUNCU, \
+ CLASS, STR) \
+ CCC_TRY(NAME, NNAME, FUNC, NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, CLASS, STR) \
+ _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
tmp = cBOOL((OP(c) == BOUNDL)
? isALNUM_LC(tmp)
: (isWORDCHAR_L1(tmp)
- && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
+ && (isASCII(tmp) || (FLAGS(c) == REGEX_UNICODE_CHARSET))));
REXEC_FBC_SCAN(
if (tmp ==
!((OP(c) == BOUNDL)
? isALNUM_LC(*s)
: (isWORDCHAR_L1((U8) *s)
- && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
+ && (isASCII((U8) *s) || (FLAGS(c) == REGEX_UNICODE_CHARSET)))))
{
tmp = !tmp;
REXEC_FBC_TRYIT;
tmp = cBOOL((OP(c) == NBOUNDL)
? isALNUM_LC(tmp)
: (isWORDCHAR_L1(tmp)
- && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
+ && (isASCII(tmp) || (FLAGS(c) == REGEX_UNICODE_CHARSET))));
REXEC_FBC_SCAN(
if (tmp == ! cBOOL(
(OP(c) == NBOUNDL)
? isALNUM_LC(*s)
: (isWORDCHAR_L1((U8) *s)
- && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
+ && (isASCII((U8) *s) || (FLAGS(c) == REGEX_UNICODE_CHARSET)))))
{
tmp = !tmp;
}
if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
goto got_it;
break;
- case ALNUM:
- REXEC_FBC_CSCAN_PRELOAD(
- LOAD_UTF8_CHARCLASS_PERL_WORD(),
- swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
- (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
- );
case ALNUML:
REXEC_FBC_CSCAN_TAINT(
isALNUM_LC_utf8((U8*)s),
isALNUM_LC(*s)
);
+ case ALNUMU:
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_PERL_WORD(),
+ swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
+ isWORDCHAR_L1((U8) *s)
+ );
+ case ALNUM:
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_PERL_WORD(),
+ swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
+ isWORDCHAR((U8) *s)
+ );
+ case NALNUMU:
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_PERL_WORD(),
+ swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
+ ! isWORDCHAR_L1((U8) *s)
+ );
case NALNUM:
REXEC_FBC_CSCAN_PRELOAD(
LOAD_UTF8_CHARCLASS_PERL_WORD(),
!swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
- ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
+ ! isALNUM(*s)
);
case NALNUML:
REXEC_FBC_CSCAN_TAINT(
!isALNUM_LC_utf8((U8*)s),
!isALNUM_LC(*s)
);
+ case SPACEU:
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_PERL_SPACE(),
+ *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
+ isSPACE_L1((U8) *s)
+ );
case SPACE:
REXEC_FBC_CSCAN_PRELOAD(
LOAD_UTF8_CHARCLASS_PERL_SPACE(),
*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
- isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
+ isSPACE((U8) *s)
);
case SPACEL:
REXEC_FBC_CSCAN_TAINT(
isSPACE_LC_utf8((U8*)s),
isSPACE_LC(*s)
);
+ case NSPACEU:
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_PERL_SPACE(),
+ !( *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
+ ! isSPACE_L1((U8) *s)
+ );
case NSPACE:
REXEC_FBC_CSCAN_PRELOAD(
LOAD_UTF8_CHARCLASS_PERL_SPACE(),
!(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
- !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
+ ! isSPACE((U8) *s)
);
case NSPACEL:
REXEC_FBC_CSCAN_TAINT(
else {
ln = (locinput != PL_bostr) ?
UCHARAT(locinput - 1) : '\n';
- if (FLAGS(scan) & USE_UNI) {
+ if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
/* Here, can't be BOUNDL or NBOUNDL because they never set
- * the flags to USE_UNI */
+ * the flags to REGEX_UNICODE_CHARSET */
ln = isWORDCHAR_L1(ln);
n = isWORDCHAR_L1(nextchr);
}
}
break;
/* Special char classes - The defines start on line 129 or so */
- CCC_TRY_AFF_U( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
- CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
+ CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
+ ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
+ ALNUMU, NALNUMU, isWORDCHAR_L1,
+ perl_word, "a");
- CCC_TRY_AFF_U( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
- CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
+ CCC_TRY_U(SPACE, NSPACE, isSPACE,
+ SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
+ SPACEU, NSPACEU, isSPACE_L1,
+ perl_space, " ");
- CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
- CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
+ CCC_TRY(DIGIT, NDIGIT, isDIGIT,
+ DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
+ posix_digit, "0");
case CLUMP: /* Match \X: logical Unicode character. This is defined as
a Unicode extended Grapheme Cluster */
scan++;
}
break;
- case ALNUM:
+ case ALNUMU:
if (utf8_target) {
+ utf8_wordchar:
loceol = PL_regeol;
LOAD_UTF8_CHARCLASS_ALNUM();
while (hardcount < max && scan < loceol &&
scan += UTF8SKIP(scan);
hardcount++;
}
- } else if (FLAGS(p) & USE_UNI) {
+ } else {
while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
scan++;
}
- } else {
- while (scan < loceol && isALNUM((U8) *scan)) {
- scan++;
- }
+ }
+ break;
+ case ALNUM:
+ if (utf8_target)
+ goto utf8_wordchar;
+ while (scan < loceol && isALNUM((U8) *scan)) {
+ scan++;
}
break;
case ALNUML:
scan++;
}
break;
- case NALNUM:
+ case NALNUMU:
if (utf8_target) {
+
+ utf8_Nwordchar:
+
loceol = PL_regeol;
LOAD_UTF8_CHARCLASS_ALNUM();
while (hardcount < max && scan < loceol &&
- !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+ ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
{
scan += UTF8SKIP(scan);
hardcount++;
}
- } else if (FLAGS(p) & USE_UNI) {
+ } else {
while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
scan++;
}
- } else {
- while (scan < loceol && ! isALNUM((U8) *scan)) {
- scan++;
- }
+ }
+ break;
+ case NALNUM:
+ if (utf8_target)
+ goto utf8_Nwordchar;
+ while (scan < loceol && ! isALNUM((U8) *scan)) {
+ scan++;
}
break;
case NALNUML:
scan++;
}
break;
- case SPACE:
+ case SPACEU:
if (utf8_target) {
+
+ utf8_space:
+
loceol = PL_regeol;
LOAD_UTF8_CHARCLASS_SPACE();
while (hardcount < max && scan < loceol &&
scan += UTF8SKIP(scan);
hardcount++;
}
- } else if (FLAGS(p) & USE_UNI) {
+ break;
+ }
+ else {
while (scan < loceol && isSPACE_L1((U8) *scan)) {
scan++;
}
- } else {
- while (scan < loceol && isSPACE((U8) *scan))
- scan++;
+ break;
+ }
+ case SPACE:
+ if (utf8_target)
+ goto utf8_space;
+
+ while (scan < loceol && isSPACE((U8) *scan)) {
+ scan++;
}
break;
case SPACEL:
scan++;
}
break;
- case NSPACE:
+ case NSPACEU:
if (utf8_target) {
+
+ utf8_Nspace:
+
loceol = PL_regeol;
LOAD_UTF8_CHARCLASS_SPACE();
while (hardcount < max && scan < loceol &&
- !(*scan == ' ' ||
- swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+ ! (*scan == ' ' ||
+ swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
{
scan += UTF8SKIP(scan);
hardcount++;
}
- } else if (FLAGS(p) & USE_UNI) {
+ break;
+ }
+ else {
while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
scan++;
}
- } else {
- while (scan < loceol && ! isSPACE((U8) *scan)) {
- scan++;
- }
+ }
+ break;
+ case NSPACE:
+ if (utf8_target)
+ goto utf8_Nspace;
+
+ while (scan < loceol && ! isSPACE((U8) *scan)) {
+ scan++;
}
break;
case NSPACEL: