/ (DUPE|DUPE) X? (?{ ... }) Y /x
-Thus EVAL blocks follwing a trie may be called a different number of times with
+Thus EVAL blocks following a trie may be called a different number of times with
and without the optimisation. With the optimisations dupes will be silently
ignored. This inconsistant behaviour of EVAL type nodes is well established as
the following demonstrates:
regnode *convert = NULL;
U32 *prev_states; /* temp array mapping each state to previous one */
/* we just use folder as a flag in utf8 */
- const U8 * const folder = ( flags == EXACTF
- ? PL_fold
- : ( flags == EXACTFL
- ? PL_fold_locale
- : NULL
- )
- );
+ const U8 * folder = NULL;
#ifdef DEBUGGING
const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
PERL_UNUSED_ARG(depth);
#endif
+ switch (flags) {
+ case EXACTFU: folder = PL_fold_latin1; break;
+ case EXACTF: folder = PL_fold; break;
+ case EXACTFL: folder = PL_fold_locale; break;
+ }
+
trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
trie->refcount = 1;
trie->startstate = 1;
TRIE_STORE_REVCHAR;
}
if ( set_bit ) {
- /* store the codepoint in the bitmap, and if its ascii
- also store its folded equivelent. */
+ /* store the codepoint in the bitmap, and its folded
+ * equivalent. */
TRIE_BITMAP_SET(trie,uvc);
/* store the folded codepoint */
if ( !UTF ) {
/* store first byte of utf8 representation of
- codepoints in the 127 < uvc < 256 range */
- if (127 < uvc && uvc < 192) {
- TRIE_BITMAP_SET(trie,194);
- } else if (191 < uvc ) {
- TRIE_BITMAP_SET(trie,195);
- /* && uvc < 256 -- we know uvc is < 256 already */
+ variant codepoints */
+ if (! UNI_IS_INVARIANT(uvc)) {
+ TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
}
}
set_bit = 0; /* We've done our bit :-) */
}
#endif
}
-
- if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
+#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
+#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
+#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
+#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
+
+ if (UTF
+ && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
+ && ( STR_LEN(scan) >= 6 ) )
+ {
/*
Two problematic code points in Unicode casefolding of EXACT nodes:
/* Check whether it is compatible with what we know already! */
int compat = 1;
+
+ /* If compatibile, we or it in below. It is compatible if is
+ * in the bitmp and either 1) its bit or its fold is set, or 2)
+ * it's for a locale. Even if there isn't unicode semantics
+ * here, at runtime there may be because of matching against a
+ * utf8 string, so accept a possible false positive for
+ * latin1-range folds */
if (uc >= 0x100 ||
(!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
&& !ANYOF_BITMAP_TEST(data->start_class, uc)
&& (!(data->start_class->flags & ANYOF_FOLD)
- || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
+ || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
)
compat = 0;
ANYOF_CLASS_ZERO(data->start_class);
if (flags & SCF_DO_STCLASS_AND) {
/* Check whether it is compatible with what we know already! */
int compat = 1;
-
if (uc >= 0x100 ||
- (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
- && !ANYOF_BITMAP_TEST(data->start_class, uc)
- && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
+ (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+ && !ANYOF_BITMAP_TEST(data->start_class, uc)
+ && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
+ {
compat = 0;
+ }
ANYOF_CLASS_ZERO(data->start_class);
ANYOF_BITMAP_ZERO(data->start_class);
if (compat) {
ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
data->start_class->flags |= ANYOF_FOLD;
- if (OP(scan) == EXACTFL)
+ if (OP(scan) == EXACTFL) {
data->start_class->flags |= ANYOF_LOCALE;
+ }
+ else {
+
+ /* Also set the other member of the fold pair. In case
+ * that unicode semantics is called for at runtime, use
+ * the full latin1 fold. (Can't do this for locale,
+ * because not known until runtime */
+ ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
+ }
}
}
else if (flags & SCF_DO_STCLASS_OR) {
if (data->start_class->flags & ANYOF_FOLD) {
/* false positive possible if the class is case-folded.
Assume that the locale settings are the same... */
- if (uc < 0x100)
+ if (uc < 0x100) {
ANYOF_BITMAP_SET(data->start_class, uc);
+ if (OP(scan) != EXACTFL) {
+
+ /* And set the other member of the fold pair, but
+ * can't do that in locale because not known until
+ * run-time */
+ ANYOF_BITMAP_SET(data->start_class,
+ PL_fold_latin1[uc]);
+ }
+ }
data->start_class->flags &= ~ANYOF_EOS;
}
cl_and(data->start_class, and_withp);
f |= SCF_DO_STCLASS_AND;
f &= ~SCF_DO_STCLASS_OR;
}
- /* These are the cases when once a subexpression
- fails at a particular position, it cannot succeed
- even after backtracking at the enclosing scope.
-
- XXXX what if minimal match and we are at the
- initial run of {n,m}? */
- if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
+ /* Exclude from super-linear cache processing any {n,m}
+ regops for which the combination of input pos and regex
+ pos is not enough information to determine if a match
+ will be possible.
+
+ For example, in the regex /foo(bar\s*){4,8}baz/ with the
+ regex pos at the \s*, the prospects for a match depend not
+ only on the input position but also on how many (bar\s*)
+ repeats into the {4,8} we are. */
+ if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
f &= ~SCF_WHILEM_VISITED_POS;
/* This will finish on WHILEM, setting scan, or on NULL: */
}
}
else if (OP(scan) == FOLDCHAR) {
- int d = ARG(scan)==0xDF ? 1 : 2;
+ int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
flags &= ~SCF_DO_STCLASS;
min += 1;
delta += d;
#endif
REGEXP *
-Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
+Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
{
dVAR;
REGEXP *rx;
regnode *scan;
I32 flags;
I32 minlen = 0;
+ U32 pm_flags;
/* these are all flags - maybe they should be turned
* into a single int with different bit masks */
I32 sawlookahead = 0;
I32 sawplus = 0;
I32 sawopen = 0;
+ bool used_setjump = FALSE;
U8 jump_ret = 0;
dJMPENV;
RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
-
+ /****************** LONG JUMP TARGET HERE***********************/
/* Longjmp back to here if have to switch in midstream to utf8 */
if (! RExC_orig_utf8) {
JMPENV_PUSH(jump_ret);
+ used_setjump = TRUE;
}
if (jump_ret == 0) { /* First time through */
- exp = SvPV(pattern, plen);
- xend = exp + plen;
+ exp = SvPV(pattern, plen);
+ xend = exp + plen;
+ /* ignore the utf8ness if the pattern is 0 length */
+ if (plen == 0) {
+ RExC_utf8 = RExC_orig_utf8 = 0;
+ }
DEBUG_COMPILE_r({
SV *dsv= sv_newmortal();
restudied = 0;
#endif
+ /* Set to use unicode semantics if the pattern is in utf8 and has the
+ * 'dual' charset specified, as it means unicode when utf8 */
+ pm_flags = orig_pm_flags;
+ if (RExC_utf8 && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
+ pm_flags |= RXf_PMf_UNICODE;
+ }
+
RExC_precomp = exp;
RExC_flags = pm_flags;
RExC_sawback = 0;
return(NULL);
}
- /* Here, finished first pass. Get rid of our setjmp, which we added for
- * efficiency only if the passed-in string wasn't in utf8, as shown by
- * RExC_orig_utf8. But if the first pass was redone, that variable will be
- * 1 here even though the original string wasn't utf8, but in this case
- * there will have been a long jump */
- if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
+ /* Here, finished first pass. Get rid of any added setjmp */
+ if (used_setjump) {
JMPENV_POP;
}
DEBUG_PARSE_r({
if (PL_regkind[OP(first)] == EXACT) {
if (OP(first) == EXACT)
NOOP; /* Empty, get anchored substr later. */
- else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
+ else
ri->regstclass = first;
}
#ifdef TRIE_STCLASS
SvREFCNT_inc_simple_void(sv_dat);
}
RExC_sawback = 1;
- ret = reganode(pRExC_state,
- (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
- num);
+ ret = reganode(pRExC_state,
+ ((! FOLD)
+ ? NREF
+ : (UNI_SEMANTICS)
+ ? NREFFU
+ : (LOC)
+ ? NREFFL
+ : NREFF),
+ num);
*flagp |= HASWIDTH;
Set_Node_Offset(ret, parse_start+1);
that follow */
has_use_defaults = TRUE;
STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
- RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+ if (RExC_utf8) { /* But the default for a utf8 pattern is
+ unicode semantics */
+ RExC_flags |= RXf_PMf_UNICODE;
+ }
goto parse_flags;
default:
--RExC_parse;
{
goto fail_modifiers;
}
- negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+
+ /* The dual charset means unicode semantics if the
+ * pattern (or target, not known until runtime) are
+ * utf8 */
+ if (RExC_utf8) {
+ posflags |= RXf_PMf_UNICODE;
+ negflags |= RXf_PMf_LOCALE;
+ }
+ else {
+ negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+ }
has_charset_modifier = 1;
break;
case ONCE_PAT_MOD: /* 'o' */
char *endchar; /* Points to '.' or '}' ending cur char in the input
stream */
- ret = reg_node(pRExC_state,
- (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
+ ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
+ : (LOC)
+ ? EXACTFL
+ : UNI_SEMANTICS
+ ? EXACTFU
+ : EXACTF));
s= STRING(ret);
/* Exact nodes can hold only a U8 length's of text = 255. Loop through
RExC_parse++;
vFAIL("Quantifier follows nothing");
break;
- case 0xDF:
- case 0xC3:
- case 0xCE:
+ case LATIN_SMALL_LETTER_SHARP_S:
+ case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+ case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
+#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
+#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
+ case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
+#endif
do_foldchar:
if (!LOC && FOLD) {
U32 len,cp;
literal text handling code.
*/
switch ((U8)*++RExC_parse) {
- case 0xDF:
- case 0xC3:
- case 0xCE:
+ case LATIN_SMALL_LETTER_SHARP_S:
+ case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+ case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
goto do_foldchar;
/* Special Escapes */
case 'A':
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'd':
- ret = reg_node(pRExC_state, DIGIT);
+ if (LOC) {
+ ret = reg_node(pRExC_state, (U8)(DIGITL));
+ } else {
+ ret = reg_node(pRExC_state, (U8)(DIGIT));
+ }
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'D':
- ret = reg_node(pRExC_state, NDIGIT);
+ if (LOC) {
+ ret = reg_node(pRExC_state, (U8)(NDIGITL));
+ } else {
+ ret = reg_node(pRExC_state, (U8)(NDIGIT));
+ }
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'R':
RExC_sawback = 1;
ret = reganode(pRExC_state,
- (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
- num);
+ ((! FOLD)
+ ? NREF
+ : (UNI_SEMANTICS)
+ ? NREFFU
+ : (LOC)
+ ? NREFFL
+ : NREFF),
+ num);
*flagp |= HASWIDTH;
/* override incorrect value set in reganode MJD */
}
RExC_sawback = 1;
ret = reganode(pRExC_state,
- (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
- num);
+ ((! FOLD)
+ ? REF
+ : (UNI_SEMANTICS)
+ ? REFFU
+ : (LOC)
+ ? REFFL
+ : REFF),
+ num);
*flagp |= HASWIDTH;
/* override incorrect value set in reganode MJD */
defchar:
ender = 0;
ret = reg_node(pRExC_state,
- (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
+ (U8) ((! FOLD) ? EXACT
+ : (LOC)
+ ? EXACTFL
+ : (UNI_SEMANTICS)
+ ? EXACTFU
+ : EXACTF)
+ );
s = STRING(ret);
for (len = 0, p = RExC_parse - 1;
len < 127 && p < RExC_end;
if (RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
switch ((U8)*p) {
- case 0xDF:
- case 0xC3:
- case 0xCE:
+ case LATIN_SMALL_LETTER_SHARP_S:
+ case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+ case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
goto normal_default;
case '^':
switch ((U8)*++p) {
/* These are all the special escapes. */
- case 0xDF:
- case 0xC3:
- case 0xCE:
+ case LATIN_SMALL_LETTER_SHARP_S:
+ case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+ case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
goto normal_default;
case 'A': /* Start assertion */
} \
} \
else { \
- for (value = 0; value < 256; value++) { \
+ for (value = 0; value < 128; value++) { \
if (TEST_7) stored += \
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, UNI_TO_NATIVE(value)); \
} \
} \
yesno = '+'; \
} \
} \
else { \
- for (value = 0; value < 256; value++) { \
+ for (value = 0; value < 128; value++) { \
if (! TEST_7) stored += \
S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
} \
+ for (value = 128; value < 256; value++) { \
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+ } \
} \
yesno = '!'; \
what = WORD; \
if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
}
- else if (_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C(value)
+ else if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
|| (! UNI_SEMANTICS
&& ! isASCII(value)
&& PL_fold_latin1[value] != value))
if (LOC)
ANYOF_CLASS_SET(ret, ANYOF_ASCII);
else {
-#ifndef EBCDIC
for (value = 0; value < 128; value++)
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
-#else /* EBCDIC */
- for (value = 0; value < 256; value++) {
- if (isASCII(value))
- stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
- }
-#endif /* EBCDIC */
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
}
yesno = '+';
what = "ASCII";
if (LOC)
ANYOF_CLASS_SET(ret, ANYOF_NASCII);
else {
-#ifndef EBCDIC
for (value = 128; value < 256; value++)
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
-#else /* EBCDIC */
- for (value = 0; value < 256; value++) {
- if (!isASCII(value))
- stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
- }
-#endif /* EBCDIC */
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
}
yesno = '!';
what = "ASCII";
else
#endif
for (i = prevvalue; i <= ceilvalue; i++) {
- if (!ANYOF_BITMAP_TEST(ret,i)) {
- stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
- }
+ stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
}
}
if (value > 255 || UTF) {
switch (OP(scan)) {
case EXACT:
case EXACTF:
+ case EXACTFU:
case EXACTFL:
if( exact == PSEUDO )
exact= OP(scan);
else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
if ( RXp_PAREN_NAMES(prog) ) {
- if ( k != REF || OP(o) < NREF) {
+ if ( k != REF || (OP(o) < NREF)) {
AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
SV **name= av_fetch(list, ARG(o), 0 );
if (name)
sv_catpvs(sv, "{unicode_all}");
else if (flags & ANYOF_UTF8)
sv_catpvs(sv, "{unicode}");
- else if (flags & ANYOF_NONBITMAP)
+ if (flags & ANYOF_NONBITMAP_NON_UTF8)
sv_catpvs(sv, "{outside bitmap}");
{
ones (binary 1111 1111, hexadecimal FF). It is similar, but not
identical, to the ASCII delete (DEL) or rubout control character.
) So the old condition can be simplified to !isPRINT(c) */
- if (!isPRINT(c))
- Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
+ if (!isPRINT(c)) {
+ if (c < 256) {
+ Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
+ }
+ else {
+ Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
+ }
+ }
else {
const char string = c;
if (c == '-' || c == ']' || c == '\\' || c == '^')