/* Predicates on the character-set modifier of the pattern being compiled.
 * Each one passes RExC_flags to get_regex_charset() and compares the result
 * against one of the REGEX_*_CHARSET values.  The plain forms test for exactly
 * one character set; the AT_LEAST_ forms use >= and therefore also accept
 * every character set whose value is numerically larger.
 * NOTE(review): the AT_LEAST_ forms depend on the numeric order of the
 * REGEX_*_CHARSET values, which are declared outside this view -- confirm
 * that order before adding or renumbering a character set. */
#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
+#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
+#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
/* True when the RXf_PMf_FOLD bit is set in RExC_flags; cBOOL() is applied to
 * the masked value (presumably to normalize it to a boolean -- its definition
 * is not visible here). */
#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
#endif
switch (flags) {
+ case EXACTFA:
case EXACTFU: folder = PL_fold_latin1; break;
case EXACTF: folder = PL_fold; break;
case EXACTFL: folder = PL_fold_locale; break;
#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
if (UTF
- && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
+ && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
&& ( STR_LEN(scan) >= 6 ) )
{
/*
ret = reganode(pRExC_state,
((! FOLD)
? NREF
- : (UNI_SEMANTICS)
- ? NREFFU
- : (LOC)
- ? NREFFL
- : NREFF),
+ : (MORE_ASCII_RESTRICTED)
+ ? NREFFA
+ : (AT_LEAST_UNI_SEMANTICS)
+ ? NREFFU
+ : (LOC)
+ ? NREFFL
+ : NREFF),
num);
*flagp |= HASWIDTH;
if (has_charset_modifier || flagsp == &negflags) {
goto fail_modifiers;
}
- cs = REGEX_ASCII_RESTRICTED_CHARSET;
+ if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
+ /* Doubled modifier implies more restricted */
+ cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
+ RExC_parse++;
+ }
+ else {
+ cs = REGEX_ASCII_RESTRICTED_CHARSET;
+ }
has_charset_modifier = 1;
break;
case DEPENDS_PAT_MOD:
STRLEN len = 0; /* Its current byte length */
char *endchar; /* Points to '.' or '}' ending cur char in the input
stream */
-
- ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
- : (LOC)
- ? EXACTFL
- : UNI_SEMANTICS
- ? EXACTFU
- : EXACTF));
+ ret = reg_node(pRExC_state,
+ (U8) ((! FOLD) ? EXACT
+ : (LOC)
+ ? EXACTFL
+ : (MORE_ASCII_RESTRICTED)
+ ? EXACTFA
+ : (AT_LEAST_UNI_SEMANTICS)
+ ? EXACTFU
+ : EXACTF));
s= STRING(ret);
/* Exact nodes can hold only a U8 length's worth of text = 255. Loop through
| PERL_SCAN_DISALLOW_PREFIX
| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
UV cp; /* Ord of current character */
+ bool use_this_char_fold = FOLD;
/* Code points are separated by dots. If none, there is only one
* code point, and is terminated by the brace */
vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
- if (! FOLD) { /* Not folding, just append to the string */
+ if (FOLD
+ && (cp > 255 || ! MORE_ASCII_RESTRICTED)
+ && is_TRICKYFOLD_cp(cp))
+ {
+ }
+
+ /* Under /aa, we can't mix ASCII with non-ASCII in a fold. If we are
+ * folding, and the source isn't ASCII, look through all the
+ * characters it folds to. If any one of them is ASCII, forbid
+ * this fold. (cp is a Unicode code point, so the 127 below is
+ * correct even for EBCDIC) */
+ if (use_this_char_fold && cp > 127 && MORE_ASCII_RESTRICTED) {
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
+ U8* s = tmpbuf;
+ U8* e;
+ STRLEN foldlen;
+
+ (void) toFOLD_uni(cp, tmpbuf, &foldlen);
+ e = s + foldlen;
+
+ while (s < e) {
+ if (isASCII(*s)) {
+ use_this_char_fold = FALSE;
+ break;
+ }
+ s += UTF8SKIP(s);
+ }
+ }
+
+ if (! use_this_char_fold) { /* Not folding, just append to the
+ string */
STRLEN unilen;
/* Quit before adding this character if would exceed limit */
op = ALNUMU;
break;
case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
op = ALNUMA;
break;
case REGEX_DEPENDS_CHARSET:
op = NALNUMU;
break;
case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
op = NALNUMA;
break;
case REGEX_DEPENDS_CHARSET:
op = BOUNDU;
break;
case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
op = BOUNDA;
break;
case REGEX_DEPENDS_CHARSET:
op = NBOUNDU;
break;
case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
op = NBOUNDA;
break;
case REGEX_DEPENDS_CHARSET:
op = SPACEU;
break;
case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
op = SPACEA;
break;
case REGEX_DEPENDS_CHARSET:
op = NSPACEU;
break;
case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
op = NSPACEA;
break;
case REGEX_DEPENDS_CHARSET:
op = DIGITL;
break;
case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
op = DIGITA;
break;
case REGEX_DEPENDS_CHARSET: /* No difference between these */
op = NDIGITL;
break;
case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
op = NDIGITA;
break;
case REGEX_DEPENDS_CHARSET: /* No difference between these */
ret = reganode(pRExC_state,
((! FOLD)
? NREF
- : (AT_LEAST_UNI_SEMANTICS)
- ? NREFFU
- : (LOC)
- ? NREFFL
- : NREFF),
+ : (MORE_ASCII_RESTRICTED)
+ ? NREFFA
+ : (AT_LEAST_UNI_SEMANTICS)
+ ? NREFFU
+ : (LOC)
+ ? NREFFL
+ : NREFF),
num);
*flagp |= HASWIDTH;
ret = reganode(pRExC_state,
((! FOLD)
? REF
- : (AT_LEAST_UNI_SEMANTICS)
- ? REFFU
- : (LOC)
- ? REFFL
- : REFF),
+ : (MORE_ASCII_RESTRICTED)
+ ? REFFA
+ : (AT_LEAST_UNI_SEMANTICS)
+ ? REFFU
+ : (LOC)
+ ? REFFL
+ : REFF),
num);
*flagp |= HASWIDTH;
char *s;
STRLEN foldlen;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
+ regnode * orig_emit;
parse_start = RExC_parse - 1;
defchar:
ender = 0;
+ orig_emit = RExC_emit; /* Save the original output node position in
+ case we need to output a different node
+ type */
ret = reg_node(pRExC_state,
(U8) ((! FOLD) ? EXACT
: (LOC)
? EXACTFL
- : (AT_LEAST_UNI_SEMANTICS)
- ? EXACTFU
- : EXACTF)
+ : (MORE_ASCII_RESTRICTED)
+ ? EXACTFA
+ : (AT_LEAST_UNI_SEMANTICS)
+ ? EXACTFU
+ : EXACTF)
);
s = STRING(ret);
for (len = 0, p = RExC_parse - 1;
p += numlen;
}
else
- ender = *p++;
+ ender = (U8) *p++;
break;
+ } /* End of switch on the literal */
+
+ /* Certain characters are problematic because their folded
+ * length is so different from their original length that it
+ * can't be handled by the optimizer. They are therefore not
+ * placed in an EXACTish node, and are instead handled specially
+ * here. (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
+ * putting it in a special node keeps regexec from having to
+ * deal with a non-utf8 multi-char fold.) */
+ if (FOLD
+ && (ender > 255 || ! MORE_ASCII_RESTRICTED)
+ && is_TRICKYFOLD_cp(ender))
+ {
+ /* If we are in the middle of outputting characters into an
+ * EXACTish node, go output what we have so far, and
+ * position the parse so that this will be called again
+ * immediately */
+ if (len) {
+ p = RExC_parse + len - 1;
+ goto loopdone;
+ }
+ else {
+
+ /* Here we are ready to output our tricky fold
+ * character. What's done is to pretend it's in a
+ * [bracketed] class, and let the code that deals with
+ * those handle it, as that code has all the
+ * intelligence necessary. First save the current
+ * parse state, get rid of the already allocated EXACT
+ * node that the ANYOFV node will replace, and point
+ * the parse to a buffer which we fill with the
+ * character we want the regclass code to think is
+ * being parsed */
+ char* const oldregxend = RExC_end;
+ char tmpbuf[2];
+ RExC_emit = orig_emit;
+ RExC_parse = tmpbuf;
+ if (UTF) {
+ tmpbuf[0] = UTF8_TWO_BYTE_HI(ender);
+ tmpbuf[1] = UTF8_TWO_BYTE_LO(ender);
+ RExC_end = RExC_parse + 2;
+ }
+ else {
+ tmpbuf[0] = ender;
+ RExC_end = RExC_parse + 1;
+ }
+
+ ret = regclass(pRExC_state,depth+1);
+
+ /* Here, have parsed the buffer. Reset the parse to
+ * the actual input, and return */
+ RExC_end = oldregxend;
+ RExC_parse = p - 1;
+
+ Set_Node_Offset(ret, RExC_parse);
+ Set_Node_Cur_Length(ret);
+ nextchar(pRExC_state);
+ *flagp |= HASWIDTH|SIMPLE;
+ return ret;
+ }
}
+
if ( RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
if (UTF && FOLD) {
/* Prime the casefolded buffer. */
- ender = toFOLD_uni(ender, tmpbuf, &foldlen);
+ if (isASCII(ender)) {
+ ender = toLOWER(ender);
+ *tmpbuf = ender;
+ foldlen = 1;
+ }
+ else if (! MORE_ASCII_RESTRICTED) {
+ ender = toFOLD_uni(ender, tmpbuf, &foldlen);
+ }
+ else {
+ /* When ASCII must not be mixed with non-ASCII, reject folds
+ * that mix them, and use only the non-folded code point
+ * instead. So do the fold to a temporary, and inspect each
+ * character in it. */
+ U8 trialbuf[UTF8_MAXBYTES_CASE+1];
+ U8* s = trialbuf;
+ UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
+ U8* e = s + foldlen;
+ bool fold_ok = TRUE;
+
+ while (s < e) {
+ if (isASCII(*s)) {
+ fold_ok = FALSE;
+ break;
+ }
+ s += UTF8SKIP(s);
+ }
+ if (fold_ok) {
+ Copy(trialbuf, tmpbuf, foldlen, U8);
+ ender = tmpender;
+ }
+ else {
+ uvuni_to_utf8(tmpbuf, ender);
+ foldlen = UNISKIP(ender);
+ }
+ }
}
if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
if (len)
else
REGC((char)ender, s++);
}
- loopdone:
+ loopdone: /* Jumped to when encounters something that shouldn't be in
+ the node */
RExC_parse = p - 1;
Set_Node_Cur_Length(ret); /* MJD */
nextchar(pRExC_state);
if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \
} \
- if (ASCII_RESTRICTED) { \
+ if (AT_LEAST_ASCII_RESTRICTED) { \
for (value = 128; value < 256; value++) { \
stored += set_regclass_bit( \
pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \
PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
- : PL_fold[value];
+ : PL_fold[value];
/* It assumes the bit for 'value' has already been set */
if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
ANYOF_BITMAP_SET(node, fold);
stored++;
}
- if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
+ if ((_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED))
|| (! UNI_SEMANTICS
&& ! isASCII(value)
&& PL_fold_latin1[value] != value))
}
yesno = '!';
what = POSIX_CC_UNI_NAME("Digit");
- if (ASCII_RESTRICTED ) {
+ if (AT_LEAST_ASCII_RESTRICTED ) {
ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
}
break;
vFAIL("Invalid [::] class");
break;
}
- if (what && ! (ASCII_RESTRICTED)) {
+ if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
/* Strings such as "+utf8::isWord\n" */
Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
ANYOF_FLAGS(ret) |= ANYOF_UTF8;
}
}
+ /* A non-Latin1 code point implies Unicode semantics. This must be set in
+ * pass 1 so that it is in effect for the whole of pass 2 */
if (value > 255) {
RExC_uni_semantics = 1;
}
* target string is utf8, or under unicode rules */
if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
while (loc < e) {
+ if (MORE_ASCII_RESTRICTED && (isASCII(*loc) != isASCII(j))) {
+ goto end_multi_fold;
+ }
/* XXX Discard this fold if any are latin1
* and LOC */
if (UTF8_IS_INVARIANT(*loc)
/* This node is variable length */
OP(ret) = ANYOFV;
+ end_multi_fold: ;
}
}
else { /* Single character fold */
Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
}
c = SvUV(*c_p);
+ if (MORE_ASCII_RESTRICTED && (isASCII(c) != isASCII(j))) {
+ continue;
+ }
if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
stored += set_regclass_bit(pRExC_state, ret, (U8) c, &nonbitmap);
switch (OP(scan)) {
case EXACT:
case EXACTF:
+ case EXACTFA:
case EXACTFU:
case EXACTFL:
if( exact == PSEUDO )
case REGEX_ASCII_RESTRICTED_CHARSET:
PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
break;
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+ PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
+ break;
default:
PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
break;