);
case SPACEL:
REXEC_FBC_CSCAN_TAINT(
- *s == ' ' || isSPACE_LC_utf8((U8*)s),
+ isSPACE_LC_utf8((U8*)s),
isSPACE_LC(*s)
);
case NSPACE:
);
case NSPACEL:
REXEC_FBC_CSCAN_TAINT(
- !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
+ !isSPACE_LC_utf8((U8*)s),
!isSPACE_LC(*s)
);
case DIGIT:
ST.nextword,
tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
PL_colors[0], PL_colors[1],
- (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
+ (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
)
: "not compiled under -Dr",
PL_colors[5] );
break;
case NREFFL:
- {
+ { /* The capture buffer cases. The ones beginning with N for the
+ named buffers just convert to the equivalent numbered and
+ pretend they were called as the corresponding numbered buffer
+ op. */
+ /* don't initialize these, it makes C++ unhappy */
char *s;
char type;
+ re_fold_t folder;
+ const U8 *fold_array;
+
PL_reg_flags |= RF_tainted;
- /* FALL THROUGH */
- case NREF:
+ folder = foldEQ_locale;
+ fold_array = PL_fold_locale;
+ type = REFFL;
+ goto do_nref;
+
+ case NREFFU:
+ folder = foldEQ_latin1;
+ fold_array = PL_fold_latin1;
+ type = REFFU;
+ goto do_nref;
+
case NREFF:
- type = OP(scan);
+ folder = foldEQ;
+ fold_array = PL_fold;
+ type = REFF;
+ goto do_nref;
+
+ case NREF:
+ type = REF;
+ folder = NULL;
+ fold_array = NULL;
+ do_nref:
+
+ /* For the named back references, find the corresponding buffer
+ * number */
n = reg_check_named_buff_matched(rex,scan);
- if ( n ) {
- type = REF + ( type - NREF );
- goto do_ref;
- } else {
+ if ( ! n ) {
sayNO;
- }
- /* unreached */
+ }
+ goto do_nref_ref_common;
+
case REFFL:
PL_reg_flags |= RF_tainted;
- /* FALL THROUGH */
+ folder = foldEQ_locale;
+ fold_array = PL_fold_locale;
+ goto do_ref;
+
+ case REFFU:
+ folder = foldEQ_latin1;
+ fold_array = PL_fold_latin1;
+ goto do_ref;
+
+ case REFF:
+ folder = foldEQ;
+ fold_array = PL_fold;
+ goto do_ref;
+
case REF:
- case REFF:
- n = ARG(scan); /* which paren pair */
+ folder = NULL;
+ fold_array = NULL;
+
+ do_ref:
type = OP(scan);
- do_ref:
+ n = ARG(scan); /* which paren pair */
+
+ do_nref_ref_common:
ln = PL_regoffs[n].start;
PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
if (*PL_reglastparen < n || ln == -1)
break;
s = PL_bostr + ln;
- if (utf8_target && type != REF) { /* REF can do byte comparison */
- char *l = locinput;
- const char *e = PL_bostr + PL_regoffs[n].end;
- /*
- * Note that we can't do the "other character" lookup trick as
- * in the 8-bit case (no pun intended) because in Unicode we
- * have to map both upper and title case to lower case.
- */
- if (type == REFF) {
- while (s < e) {
- STRLEN ulen1, ulen2;
- U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
- U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
-
- if (l >= PL_regeol)
- sayNO;
- toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
- toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
- if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
- sayNO;
- s += ulen1;
- l += ulen2;
- }
+ if (type != REF /* REF can do byte comparison */
+ && (utf8_target
+ || (type == REFFU
+ && (*s == (char) LATIN_SMALL_LETTER_SHARP_S
+ || *locinput == (char) LATIN_SMALL_LETTER_SHARP_S))))
+ { /* XXX handle REFFL better */
+ char * limit = PL_regeol;
+
+ /* This call case insensitively compares the entire buffer
+ * at s, with the current input starting at locinput, but
+ * not going off the end given by PL_regeol, and returns in
+ * limit upon success, how much of the current input was
+ * matched */
+ if (! foldEQ_utf8(s, NULL, PL_regoffs[n].end - ln, utf8_target,
+ locinput, &limit, 0, utf8_target))
+ {
+ sayNO;
}
- locinput = l;
+ locinput = limit;
nextchr = UCHARAT(locinput);
break;
}
- /* Inline the first character, for speed. */
+ /* Not utf8: Inline the first character, for speed. */
if (UCHARAT(s) != nextchr &&
(type == REF ||
- (UCHARAT(s) != (type == REFF
- ? PL_fold : PL_fold_locale)[nextchr])))
+ UCHARAT(s) != fold_array[nextchr]))
sayNO;
ln = PL_regoffs[n].end - ln;
if (locinput + ln > PL_regeol)
sayNO;
if (ln > 1 && (type == REF
? memNE(s, locinput, ln)
- : (type == REFF
- ? ! foldEQ(s, locinput, ln)
- : ! foldEQ_locale(s, locinput, ln))))
+ : ! folder(s, locinput, ln)))
sayNO;
locinput += ln;
nextchr = UCHARAT(locinput);
n = ARG(scan);
if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
locinput += ln;
- } else if ( 0xDF == n && !utf8_target && !UTF_PATTERN ) {
+ } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
sayNO;
} else {
U8 folded[UTF8_MAXBYTES_CASE+1];
char *tmpeol = loceol;
while (hardcount < max
&& foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
- STRING(p), NULL, 1, UTF_PATTERN))
+ STRING(p), NULL, 1, cBOOL(UTF_PATTERN)))
{
scan = tmpeol;
tmpeol = loceol;
if (utf8_target) {
loceol = PL_regeol;
while (hardcount < max && scan < loceol &&
- (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
+ isSPACE_LC_utf8((U8*)scan)) {
scan += UTF8SKIP(scan);
hardcount++;
}
if (utf8_target) {
loceol = PL_regeol;
while (hardcount < max && scan < loceol &&
- !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
+ !isSPACE_LC_utf8((U8*)scan)) {
scan += UTF8SKIP(scan);
hardcount++;
}
scan++;
}
break;
+ case DIGITL:
+ PL_reg_flags |= RF_tainted;
+ if (utf8_target) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol &&
+ isDIGIT_LC_utf8((U8*)scan)) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && isDIGIT_LC(*scan))
+ scan++;
+ }
+ break;
case NDIGIT:
if (utf8_target) {
loceol = PL_regeol;
while (scan < loceol && !isDIGIT(*scan))
scan++;
}
+ case NDIGITL:
+ PL_reg_flags |= RF_tainted;
+ if (utf8_target) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol &&
+ !isDIGIT_LC_utf8((U8*)scan)) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && !isDIGIT_LC(*scan))
+ scan++;
+ }
+ break;
case LNBREAK:
if (utf8_target) {
loceol = PL_regeol;
if (c < 256) {
if (ANYOF_BITMAP_TEST(n, c))
match = TRUE;
- else if (flags & ANYOF_FOLD) {
- U8 f;
- if (flags & ANYOF_LOCALE) {
- PL_reg_flags |= RF_tainted;
- f = PL_fold_locale[c];
- }
- else
- f = PL_fold[c];
- if (f != c && ANYOF_BITMAP_TEST(n, f))
- match = TRUE;
- }
-
- if (!match && (flags & ANYOF_CLASS) && ANYOF_CLASS_TEST_ANY_SET(n)) {
+ else if (flags & ANYOF_LOCALE) {
PL_reg_flags |= RF_tainted;
- if (
- (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
+
+ if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
+ && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
+ {
+ match = TRUE;
+ }
+ else if (ANYOF_CLASS_TEST_ANY_SET(n)
+ && ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
(ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
(ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
(ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
(ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
(ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
) /* How's that for a conditional? */
- {
+ ) {
match = TRUE;
}
}
if (!match) {
if (utf8_target && (flags & ANYOF_UNICODE_ALL)) {
if (c >= 256
- || ((flags & ANYOF_FOLD) /* Latin1 1 that has a non-Latin1 fold
- should match */
+ || ((flags & ANYOF_LOC_NONBITMAP_FOLD) /* Latin1 1 that has a
+ non-Latin1 fold
+ should match */
&& _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c)))
{
match = TRUE;
}
if (swash_fetch(sw, utf8_p, 1))
match = TRUE;
- else if (flags & ANYOF_FOLD) {
+ else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
if (!match && lenp && av) {
I32 i;
for (i = 0; i <= av_len(av); i++) {
}
}
}
- if (!match) {
+ if (!match) { /* See if the folded version matches */
U8 folded[UTF8_MAXBYTES_CASE+1];
-
- /* See if the folded version matches */
+ SV** listp;
STRLEN foldlen;
+
to_utf8_fold(utf8_p, folded, &foldlen);
- if (swash_fetch(sw, folded, 1)) { /* 1 => is utf8 */
- match = TRUE;
- }
- else {
- /* The fold in a few cases of an above Latin1 char
- * is in the Latin1 range, and hence may be in the
- * bitmap */
- if (UTF8_IS_INVARIANT(*folded)
- && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(*folded)))
- {
- match = TRUE;
- }
- else if (UTF8_IS_DOWNGRADEABLE_START(*folded)
- && ANYOF_BITMAP_TEST(n,
- UNI_TO_NATIVE(
- TWO_BYTE_UTF8_TO_UNI(folded[0],
- folded[1]))))
- { /* Since the fold comes from internally
- * generated data, we can safely assume it is
- * valid utf8 in the test above */
- match = TRUE;
+ /* Consider "k" =~ /[K]/i. The line above would have
+ * just folded the 'k' to itself, and that isn't going
+ * to match 'K'. So we look through the closure of
+ * everything that folds to 'k'. That will find the
+ * 'K'. Initialize the list, if necessary */
+ if (! PL_utf8_foldclosures) {
+
+ /* If the folds haven't been read in, call a fold
+ * function to force that */
+ if (! PL_utf8_tofold) {
+ U8 dummy[UTF8_MAXBYTES+1];
+ STRLEN dummy_len;
+ to_utf8_fold((U8*) "A", dummy, &dummy_len);
}
- if (! match) {
- SV** listp;
-
- /* Consider "k" =~ /[K]/i. The line above
- * would have just folded the 'k' to itself,
- * and that isn't going to match 'K'. So we
- * look through the closure of everything that
- * folds to 'k'. That will find the 'K'.
- * Initialize the list, if necessary */
- if (! PL_utf8_foldclosures) {
-
- /* If the folds haven't been read in, call a
- * fold function to force that */
- if (! PL_utf8_tofold) {
- U8 dummy[UTF8_MAXBYTES+1];
- STRLEN dummy_len;
- to_utf8_fold((U8*) "A",
- dummy, &dummy_len);
- }
- PL_utf8_foldclosures =
- _swash_inversion_hash(PL_utf8_tofold);
- }
+ PL_utf8_foldclosures =
+ _swash_inversion_hash(PL_utf8_tofold);
+ }
- /* The data structure is a hash with the keys
- * every character that is folded to, like 'k',
- * and the values each an array of everything
- * that folds to its key. e.g. [ 'k', 'K',
- * KELVIN_SIGN ] */
- if ((listp = hv_fetch(PL_utf8_foldclosures,
- (char *) folded, foldlen, FALSE)))
+ /* The data structure is a hash with the keys every
+ * character that is folded to, like 'k', and the
+ * values each an array of everything that folds to its
+ * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
+ if ((listp = hv_fetch(PL_utf8_foldclosures,
+ (char *) folded, foldlen, FALSE)))
+ {
+ AV* list = (AV*) *listp;
+ IV i;
+ for (i = 0; i <= av_len(list); i++) {
+ SV** try_p = av_fetch(list, i, FALSE);
+ char* try_c;
+ if (try_p == NULL) {
+ Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+ }
+ /* Don't have to worry about embeded nulls
+ * since NULL isn't folded or foldable */
+ try_c = SvPVX(*try_p);
+
+ /* The fold in a few cases of an above Latin1
+ * char is in the Latin1 range, and hence may
+ * be in the bitmap */
+ if (UTF8_IS_INVARIANT(*try_c)
+ && ANYOF_BITMAP_TEST(n,
+ UNI_TO_NATIVE(*try_c)))
{
- AV* list = (AV*) *listp;
- IV i;
- for (i = 0; i <= av_len(list); i++) {
- SV** try_p = av_fetch(list, i, FALSE);
- char* try_c;
- if (try_p == NULL) {
- Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
- }
- /* Don't have to worry about embeded
- * nulls since NULL isn't folded or
- * foldable */
- try_c = SvPVX(*try_p);
- if (UTF8_IS_INVARIANT(*try_c)
- && ANYOF_BITMAP_TEST(n,
- UNI_TO_NATIVE(*try_c)))
- {
- match = TRUE;
- break;
- }
- else if
- (UTF8_IS_DOWNGRADEABLE_START(*try_c)
- && ANYOF_BITMAP_TEST(n,
- UNI_TO_NATIVE(
+ match = TRUE;
+ break;
+ }
+ else if
+ (UTF8_IS_DOWNGRADEABLE_START(*try_c)
+ && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(
TWO_BYTE_UTF8_TO_UNI(try_c[0],
- try_c[1]))))
- {
- match = TRUE;
- break;
- } else if (swash_fetch(sw,
- (U8*) try_c, 1))
- {
- match = TRUE;
- break;
- }
- }
+ try_c[1]))))
+ {
+ /* Since the fold comes from internally
+ * generated data, we can safely assume it
+ * is valid utf8 in the test above */
+ match = TRUE;
+ break;
+ } else if (swash_fetch(sw, (U8*) try_c, 1)) {
+ match = TRUE;
+ break;
}
}
- }
+ }
}
}