#define STATIC static
#endif
-#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
+/* Valid for non-utf8 strings only: avoids the reginclass call if there are no
+ * complications: i.e., if everything matchable is straight forward in the
+ * bitmap */
+#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
+ : ANYOF_BITMAP_TEST(p,*(c)))
/*
* Forwards.
#endif
-#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
- case NAMEL: \
- PL_reg_flags |= RF_tainted; \
- /* FALL THROUGH */ \
- case NAME: \
- if (!nextchr) \
- sayNO; \
- if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
- if (!CAT2(PL_utf8_,CLASS)) { \
- bool ok; \
- ENTER; \
- save_re_context(); \
- ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
- assert(ok); \
- LEAVE; \
- } \
- if (!(OP(scan) == NAME \
+#define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
+ case NAMEL: \
+ PL_reg_flags |= RF_tainted; \
+ /* FALL THROUGH */ \
+ case NAME: \
+ if (!nextchr) \
+ sayNO; \
+ if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
+ if (!CAT2(PL_utf8_,CLASS)) { \
+ bool ok; \
+ ENTER; \
+ save_re_context(); \
+ ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
+ assert(ok); \
+ LEAVE; \
+ } \
+ if (!(OP(scan) == NAME \
? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \
- : LCFUNC_utf8((U8*)locinput))) \
- { \
- sayNO; \
- } \
- locinput += PL_utf8skip[nextchr]; \
- nextchr = UCHARAT(locinput); \
- break; \
- } \
- if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
- sayNO; \
- nextchr = UCHARAT(++locinput); \
+ : LCFUNC_utf8((U8*)locinput))) \
+ { \
+ sayNO; \
+ } \
+ locinput += PL_utf8skip[nextchr]; \
+ nextchr = UCHARAT(locinput); \
+ break; \
+ } \
+ /* Drops through to the macro that calls this one */
+
+#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
+ _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
+ if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
+ sayNO; \
+ nextchr = UCHARAT(++locinput); \
break
-#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
- case NAMEL: \
- PL_reg_flags |= RF_tainted; \
- /* FALL THROUGH */ \
- case NAME : \
- if (!nextchr && locinput >= PL_regeol) \
- sayNO; \
- if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
- if (!CAT2(PL_utf8_,CLASS)) { \
- bool ok; \
- ENTER; \
- save_re_context(); \
- ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
- assert(ok); \
- LEAVE; \
- } \
- if ((OP(scan) == NAME \
+/* Almost identical to the above, but has a case for a node that matches chars
+ * between 128 and 255 using Unicode (latin1) semantics. */
+#define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \
+ _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
+ if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
+ sayNO; \
+ nextchr = UCHARAT(++locinput); \
+ break
+
+#define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
+ case NAMEL: \
+ PL_reg_flags |= RF_tainted; \
+ /* FALL THROUGH */ \
+ case NAME : \
+ if (!nextchr && locinput >= PL_regeol) \
+ sayNO; \
+ if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
+ if (!CAT2(PL_utf8_,CLASS)) { \
+ bool ok; \
+ ENTER; \
+ save_re_context(); \
+ ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
+ assert(ok); \
+ LEAVE; \
+ } \
+ if ((OP(scan) == NAME \
? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \
- : LCFUNC_utf8((U8*)locinput))) \
- { \
- sayNO; \
- } \
- locinput += PL_utf8skip[nextchr]; \
- nextchr = UCHARAT(locinput); \
- break; \
- } \
- if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
- sayNO; \
- nextchr = UCHARAT(++locinput); \
+ : LCFUNC_utf8((U8*)locinput))) \
+ { \
+ sayNO; \
+ } \
+ locinput += PL_utf8skip[nextchr]; \
+ nextchr = UCHARAT(locinput); \
+ break; \
+ }
+
+#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
+ _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
+ if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
+ sayNO; \
+ nextchr = UCHARAT(++locinput); \
break
+#define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \
+ _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU) \
+ if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
+ sayNO; \
+ nextchr = UCHARAT(++locinput); \
+ break
/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
/* for use after a quantifier and before an EXACT-like node -- japhy */
-/* it would be nice to rework regcomp.sym to generate this stuff. sigh */
+/* it would be nice to rework regcomp.sym to generate this stuff. sigh
+ *
+ * NOTE that *nothing* that affects backtracking should be in here, specifically
+ * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
+ * node that is in between two EXACT like nodes when ascertaining what the required
+ * "follow" character is. This should probably be moved to regex compile time
+ * although it may be done at run time beause of the REF possibility - more
+ * investigation required. -- demerphq
+*/
#define JUMPABLE(rn) ( \
OP(rn) == OPEN || \
(OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
OP(rn) == PLUS || OP(rn) == MINMOD || \
OP(rn) == KEEPS || \
- /*(PL_regkind[OP(rn)] == VERB && OP(rn) != PRUNE && OP(rn) != COMMIT && OP(rn) != MARKPOINT && OP(rn) != SKIP && OP(rn) != CUTGROUP) || */\
(PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
)
#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
switch (OP(c)) {
case ANYOF:
if (utf8_target) {
- REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
+ REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) ||
!UTF8_IS_INVARIANT((U8)s[0]) ?
reginclass(prog, c, (U8*)s, 0, utf8_target) :
REGINCLASS(prog, c, (U8*)s));
}
);
}
- else {
+ else { /* Not utf8 */
tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
- tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+ tmp = cBOOL((OP(c) == BOUNDL)
+ ? isALNUM_LC(tmp)
+ : (isWORDCHAR_L1(tmp)
+ && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
REXEC_FBC_SCAN(
if (tmp ==
- !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
+ !((OP(c) == BOUNDL)
+ ? isALNUM_LC(*s)
+ : (isWORDCHAR_L1((U8) *s)
+ && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
+ {
tmp = !tmp;
REXEC_FBC_TRYIT;
}
}
else {
tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
- tmp = ((OP(c) == NBOUND ?
- isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+ tmp = cBOOL((OP(c) == NBOUNDL)
+ ? isALNUM_LC(tmp)
+ : (isWORDCHAR_L1(tmp)
+ && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
REXEC_FBC_SCAN(
- if (tmp ==
- !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
+ if (tmp == ! cBOOL(
+ (OP(c) == NBOUNDL)
+ ? isALNUM_LC(*s)
+ : (isWORDCHAR_L1((U8) *s)
+ && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
+ {
tmp = !tmp;
+ }
else REXEC_FBC_TRYIT;
);
}
REXEC_FBC_CSCAN_PRELOAD(
LOAD_UTF8_CHARCLASS_PERL_WORD(),
swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
- isALNUM(*s)
+ (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
);
case ALNUML:
REXEC_FBC_CSCAN_TAINT(
REXEC_FBC_CSCAN_PRELOAD(
LOAD_UTF8_CHARCLASS_PERL_WORD(),
!swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
- !isALNUM(*s)
+ ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
);
case NALNUML:
REXEC_FBC_CSCAN_TAINT(
REXEC_FBC_CSCAN_PRELOAD(
LOAD_UTF8_CHARCLASS_PERL_SPACE(),
*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
- isSPACE(*s)
+ isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
);
case SPACEL:
REXEC_FBC_CSCAN_TAINT(
REXEC_FBC_CSCAN_PRELOAD(
LOAD_UTF8_CHARCLASS_PERL_SPACE(),
!(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
- !isSPACE(*s)
+ !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
);
case NSPACEL:
REXEC_FBC_CSCAN_TAINT(
PerlIO_printf( Perl_debug_log,
" Scanning for legal start char...\n");
}
- );
- while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
- uc++;
- }
+ );
+ if (utf8_target) {
+ while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+ uc += UTF8SKIP(uc);
+ }
+ } else {
+ while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+ uc++;
+ }
+ }
s= (char *)uc;
}
if (uc >(U8*)last_start) break;
"%*s %smatched empty string...%s\n",
REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
);
- break;
+ if (!trie->jump)
+ break;
} else {
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
else {
ln = (locinput != PL_bostr) ?
UCHARAT(locinput - 1) : '\n';
- if (OP(scan) == BOUND || OP(scan) == NBOUND) {
+ if (FLAGS(scan) & USE_UNI) {
+
+ /* Here, can't be BOUNDL or NBOUNDL because they never set
+ * the flags to USE_UNI */
+ ln = isWORDCHAR_L1(ln);
+ n = isWORDCHAR_L1(nextchr);
+ }
+ else if (OP(scan) == BOUND || OP(scan) == NBOUND) {
ln = isALNUM(ln);
n = isALNUM(nextchr);
}
case ANYOF:
if (utf8_target) {
STRLEN inclasslen = PL_regeol - locinput;
+ if (locinput >= PL_regeol)
+ sayNO;
if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
goto anyof_fail;
- if (locinput >= PL_regeol)
- sayNO;
- locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
+ locinput += inclasslen;
nextchr = UCHARAT(locinput);
break;
}
else {
if (nextchr < 0)
nextchr = UCHARAT(locinput);
- if (!REGINCLASS(rex, scan, (U8*)locinput))
- goto anyof_fail;
if (!nextchr && locinput >= PL_regeol)
sayNO;
+ if (!REGINCLASS(rex, scan, (U8*)locinput))
+ goto anyof_fail;
nextchr = UCHARAT(++locinput);
break;
}
sayNO;
break;
/* Special char classes - The defines start on line 129 or so */
- CCC_TRY_AFF( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
- CCC_TRY_NEG(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
+ CCC_TRY_AFF_U( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
+ CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
- CCC_TRY_AFF( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
- CCC_TRY_NEG(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
+ CCC_TRY_AFF_U( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
+ CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
COP * const ocurcop = PL_curcop;
PAD *old_comppad;
char *saved_regeol = PL_regeol;
-
+ struct re_save_state saved_state;
+
+ /* To not corrupt the existing regex state while executing the
+ * eval we would normally put it on the save stack, like with
+ * save_re_context. However, re-evals have a weird scoping so we
+ * can't just add ENTER/LEAVE here. With that, things like
+ *
+ * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
+ *
+ * would break, as they expect the localisation to be unwound
+ * only when the re-engine backtracks through the bit that
+ * localised it.
+ *
+ * What we do instead is just saving the state in a local c
+ * variable.
+ */
+ Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
+
n = ARG(scan);
PL_op = (OP_4tree*)rexi->data->data[n];
DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
PUTBACK;
}
+ Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
+
PL_op = oop;
PAD_RESTORE_LOCAL(old_comppad);
PL_curcop = ocurcop;
case CANY:
scan = loceol;
break;
- case EXACT: /* length of string is 1 */
- c = (U8)*STRING(p);
- while (scan < loceol && UCHARAT(scan) == c)
- scan++;
- break;
- case EXACTF: /* length of string is 1 */
+ case EXACT:
+ /* To get here, EXACTish nodes must have *byte* length == 1. That
+ * means they match only characters in the string that can be expressed
+ * as a single byte. For non-utf8 strings, that means a simple match.
+ * For utf8 strings, the character matched must be an invariant, or
+ * downgradable to a single byte. The pattern's utf8ness is
+ * irrelevant, as since it's a single byte, it either isn't utf8, or if
+ * it is, it's an invariant */
+
c = (U8)*STRING(p);
- while (scan < loceol &&
- (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
- scan++;
+ assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
+
+ if (! utf8_target || UNI_IS_INVARIANT(c)) {
+ while (scan < loceol && UCHARAT(scan) == c) {
+ scan++;
+ }
+ }
+ else {
+
+ /* Here, the string is utf8, and the pattern char is different
+ * in utf8 than not, so can't compare them directly. Outside the
+ * loop, find find the two utf8 bytes that represent c, and then
+ * look for those in sequence in the utf8 string */
+ U8 high = UTF8_TWO_BYTE_HI(c);
+ U8 low = UTF8_TWO_BYTE_LO(c);
+ loceol = PL_regeol;
+
+ while (hardcount < max
+ && scan + 1 < loceol
+ && UCHARAT(scan) == high
+ && UCHARAT(scan + 1) == low)
+ {
+ scan += 2;
+ hardcount++;
+ }
+ }
break;
- case EXACTFL: /* length of string is 1 */
+ case EXACTFL:
PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case EXACTF:
+
+ /* The comments for the EXACT case apply as well to these fold ones */
+
c = (U8)*STRING(p);
- while (scan < loceol &&
- (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
- scan++;
+ assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
+
+ if (utf8_target) { /* Use full Unicode fold matching */
+
+ /* For the EXACTFL case, It doesn't really make sense to compare
+ * locale and utf8, but it is best we can do. The documents warn
+ * against mixing them */
+
+ char *tmpeol = loceol;
+ while (hardcount < max
+ && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
+ STRING(p), NULL, 1, UTF_PATTERN))
+ {
+ scan = tmpeol;
+ tmpeol = loceol;
+ hardcount++;
+ }
+
+ /* XXX Note that the above handles properly the German sharp s in
+ * the pattern matching ss in the string. But it doesn't handle
+ * properly cases where the string contains say 'LIGATURE ff' and
+ * the pattern is 'f+'. This would require, say, a new function or
+ * revised interface to foldEQ_utf8(), in which the maximum number
+ * of characters to match could be passed and it would return how
+ * many actually did. This is just one of many cases where
+ * multi-char folds don't work properly, and so the fix is being
+ * deferred */
+ }
+ else {
+
+ /* Here, the string isn't utf8; and either the pattern isn't utf8
+ * or c is an invariant, so its utf8ness doesn't affect c. Can
+ * just do simple comparisons for exact or fold matching. */
+ switch (OP(p)) {
+ case EXACTF:
+ while (scan < loceol &&
+ (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
+ {
+ scan++;
+ }
+ break;
+ case EXACTFL:
+ while (scan < loceol &&
+ (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
+ {
+ scan++;
+ }
+ break;
+ default:
+ Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
+ }
+ }
break;
case ANYOF:
if (utf8_target) {
loceol = PL_regeol;
LOAD_UTF8_CHARCLASS_ALNUM();
while (hardcount < max && scan < loceol &&
- swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) {
+ swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+ {
scan += UTF8SKIP(scan);
hardcount++;
}
+ } else if (FLAGS(p) & USE_UNI) {
+ while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
+ scan++;
+ }
} else {
- while (scan < loceol && isALNUM(*scan))
- scan++;
+ while (scan < loceol && isALNUM((U8) *scan)) {
+ scan++;
+ }
}
break;
case ALNUML:
loceol = PL_regeol;
LOAD_UTF8_CHARCLASS_ALNUM();
while (hardcount < max && scan < loceol &&
- !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) {
+ !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+ {
scan += UTF8SKIP(scan);
hardcount++;
}
+ } else if (FLAGS(p) & USE_UNI) {
+ while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
+ scan++;
+ }
} else {
- while (scan < loceol && !isALNUM(*scan))
- scan++;
+ while (scan < loceol && ! isALNUM((U8) *scan)) {
+ scan++;
+ }
}
break;
case NALNUML:
LOAD_UTF8_CHARCLASS_SPACE();
while (hardcount < max && scan < loceol &&
(*scan == ' ' ||
- swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) {
+ swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+ {
scan += UTF8SKIP(scan);
hardcount++;
}
+ } else if (FLAGS(p) & USE_UNI) {
+ while (scan < loceol && isSPACE_L1((U8) *scan)) {
+ scan++;
+ }
} else {
- while (scan < loceol && isSPACE(*scan))
- scan++;
+ while (scan < loceol && isSPACE((U8) *scan))
+ scan++;
}
break;
case SPACEL:
LOAD_UTF8_CHARCLASS_SPACE();
while (hardcount < max && scan < loceol &&
!(*scan == ' ' ||
- swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) {
+ swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+ {
scan += UTF8SKIP(scan);
hardcount++;
}
+ } else if (FLAGS(p) & USE_UNI) {
+ while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
+ scan++;
+ }
} else {
- while (scan < loceol && !isSPACE(*scan))
- scan++;
+ while (scan < loceol && ! isSPACE((U8) *scan)) {
+ scan++;
+ }
}
break;
case NSPACEL:
/*
- reginclass - determine if a character falls into a character class
- The n is the ANYOF regnode, the p is the target string, lenp
- is pointer to the maximum length of how far to go in the p
- (if the lenp is zero, UTF8SKIP(p) is used),
- utf8_target tells whether the target string is in UTF-8.
+ n is the ANYOF regnode
+ p is the target string
+ lenp is pointer to the maximum number of bytes of how far to go in p
+ (This is assumed wthout checking to always be at least the current
+ character's size)
+ utf8_target tells whether p is in UTF-8.
+
+ Returns true if matched; false otherwise. If lenp is not NULL, on return
+ from a successful match, the value it points to will be updated to how many
+ bytes in p were matched. If there was no match, the value is undefined,
+ possibly changed from the input.
*/
STATIC bool
-S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool utf8_target)
+S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
{
dVAR;
const char flags = ANYOF_FLAGS(n);
bool match = FALSE;
UV c = *p;
- STRLEN len = 0;
- STRLEN plen;
+ STRLEN c_len = 0;
+ STRLEN maxlen;
PERL_ARGS_ASSERT_REGINCLASS;
+ /* If c is not already the code point, get it */
if (utf8_target && !UTF8_IS_INVARIANT(c)) {
- c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
+ c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
(UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
| UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
/* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
* UTF8_ALLOW_FFFF */
- if (len == (STRLEN)-1)
+ if (c_len == (STRLEN)-1)
Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
}
+ else {
+ c_len = 1;
+ }
- plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
- if (utf8_target || (flags & ANYOF_UNICODE)) {
- if (lenp)
- *lenp = 0;
- if (utf8_target && !ANYOF_RUNTIME(n)) {
- if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
- match = TRUE;
- }
- if (!match && utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256)
- match = TRUE;
- if (!match) {
- AV *av;
- SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
-
- if (sw) {
- U8 * utf8_p;
- if (utf8_target) {
- utf8_p = (U8 *) p;
- } else {
- STRLEN len = 1;
- utf8_p = bytes_to_utf8(p, &len);
- }
- if (swash_fetch(sw, utf8_p, 1))
- match = TRUE;
- else if (flags & ANYOF_FOLD) {
- if (!match && lenp && av) {
- I32 i;
- for (i = 0; i <= av_len(av); i++) {
- SV* const sv = *av_fetch(av, i, FALSE);
- STRLEN len;
- const char * const s = SvPV_const(sv, len);
- if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
- *lenp = len;
- match = TRUE;
- break;
- }
- }
- }
- if (!match) {
- U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
-
- STRLEN tmplen;
- to_utf8_fold(utf8_p, tmpbuf, &tmplen);
- if (swash_fetch(sw, tmpbuf, 1))
- match = TRUE;
- }
- }
+ /* Use passed in max length, or one character if none passed in or less
+ * than one character. And assume will match just one character. This is
+ * overwritten later if matched more. */
+ if (lenp) {
+ maxlen = (*lenp > c_len) ? *lenp : c_len;
+ *lenp = c_len;
- /* If we allocated a string above, free it */
- if (! utf8_target) Safefree(utf8_p);
- }
- }
- if (match && lenp && *lenp == 0)
- *lenp = UNISKIP(NATIVE_TO_UNI(c));
}
- if (!match && c < 256) {
+ else {
+ maxlen = c_len;
+ }
+
+ /* If this character is potentially in the bitmap, check it */
+ if (c < 256) {
if (ANYOF_BITMAP_TEST(n, c))
match = TRUE;
else if (flags & ANYOF_FOLD) {
match = TRUE;
}
- if (!match && (flags & ANYOF_CLASS)) {
+ if (!match && (flags & ANYOF_CLASS) && ANYOF_CLASS_TEST_ANY_SET(n)) {
PL_reg_flags |= RF_tainted;
if (
(ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
}
}
+ /* If the bitmap didn't (or couldn't) match, and something outside the
+ * bitmap could match, try that */
+ if (!match) {
+ if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
+ match = TRUE;
+ }
+ else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
+ || (utf8_target && flags & ANYOF_UTF8))
+ {
+ AV *av;
+ SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
+
+ if (sw) {
+ U8 * utf8_p;
+ if (utf8_target) {
+ utf8_p = (U8 *) p;
+ } else {
+ STRLEN len = 1;
+ utf8_p = bytes_to_utf8(p, &len);
+ }
+ if (swash_fetch(sw, utf8_p, 1))
+ match = TRUE;
+ else if (flags & ANYOF_FOLD) {
+ if (!match && lenp && av) {
+ I32 i;
+ for (i = 0; i <= av_len(av); i++) {
+ SV* const sv = *av_fetch(av, i, FALSE);
+ STRLEN len;
+ const char * const s = SvPV_const(sv, len);
+ if (len <= maxlen && memEQ(s, (char*)utf8_p, len)) {
+ *lenp = len;
+ match = TRUE;
+ break;
+ }
+ }
+ }
+ if (!match) {
+ U8 folded[UTF8_MAXBYTES_CASE+1];
+
+ /* See if the folded version matches */
+ STRLEN foldlen;
+ to_utf8_fold(utf8_p, folded, &foldlen);
+ if (swash_fetch(sw, folded, 1)) { /* 1 => is utf8 */
+ match = TRUE;
+ }
+ else {
+ SV** listp;
+
+ /* Consider "k" =~ /[K]/i. The line above would
+ * have just folded the 'k' to itself, and that
+ * isn't going to match 'K'. So we look through
+ * the closure of everything that folds to 'k'.
+ * That will find the 'K'. Initialize the list, if
+ * necessary */
+ if (! PL_utf8_foldclosures) {
+
+ /* If the folds haven't been read in, call a
+ * fold function to force that */
+ if (! PL_utf8_tofold) {
+ U8 dummy[UTF8_MAXBYTES+1];
+ STRLEN dummy_len;
+ to_utf8_fold((U8*) "A", dummy, &dummy_len);
+ }
+ PL_utf8_foldclosures =
+ _swash_inversion_hash(PL_utf8_tofold);
+ }
+
+ /* The data structure is a hash with the keys every
+ * character that is folded to, like 'k', and the
+ * values each an array of everything that folds to
+ * its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
+ if ((listp = hv_fetch(PL_utf8_foldclosures,
+ (char *) folded, foldlen, FALSE)))
+ {
+ AV* list = (AV*) *listp;
+ IV i;
+ for (i = 0; i <= av_len(list); i++) {
+ SV** try_p = av_fetch(list, i, FALSE);
+ if (try_p == NULL) {
+ Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+ }
+ /* Don't have to worry about embeded nulls
+ * since NULL isn't folded or foldable */
+ if (swash_fetch(sw, (U8*) SvPVX(*try_p),1)) {
+ match = TRUE;
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /* If we allocated a string above, free it */
+ if (! utf8_target) Safefree(utf8_p);
+ }
+ }
+ }
+
return (flags & ANYOF_INVERT) ? !match : match;
}