I32 orig_utf8; /* whether the pattern was originally in utf8 */
/* XXX use this for future optimisation of case
* where pattern must be upgraded to utf8. */
+ I32 uni_semantics; /* If a d charset modifier should use unicode
+ rules, even if the pattern is not in
+ utf8 */
HV *paren_names; /* Paren names */
regnode **recurse; /* Recurse regops */
I32 recurse_count; /* Number of recurse regops */
+ I32 in_lookbehind;
#if ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
#define RExC_seen_evals (pRExC_state->seen_evals)
#define RExC_utf8 (pRExC_state->utf8)
+#define RExC_uni_semantics (pRExC_state->uni_semantics)
#define RExC_orig_utf8 (pRExC_state->orig_utf8)
#define RExC_open_parens (pRExC_state->open_parens)
#define RExC_close_parens (pRExC_state->close_parens)
#define RExC_paren_names (pRExC_state->paren_names)
#define RExC_recurse (pRExC_state->recurse)
#define RExC_recurse_count (pRExC_state->recurse_count)
+#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define HASWIDTH 0x01 /* Known to match non-null strings. */
/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
- * character, and if utf8, must be invariant. */
+ * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
#define SIMPLE 0x02
#define SPSTART 0x04 /* Starts with * or +. */
#define TRYAGAIN 0x08 /* Weeded out a declaration. */
minimum length for the F is 1. This is important as the minimum length
is used to determine offsets in front of and behind the string being
looked for. Since strings can be composites this is the length of the
- pattern at the time it was commited with a scan_commit. Note that
+ pattern at the time it was committed with a scan_commit. Note that
the length is calculated by study_chunk, so that the minimum lengths
are not known until the full pattern has been compiled, thus the
pointer to the value.
SV **longest; /* Either &l_fixed, or &l_float. */
SV *longest_fixed; /* longest fixed string found in pattern */
I32 offset_fixed; /* offset where it starts */
- I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
+ I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
I32 lookbehind_fixed; /* is the position of the string modfied by LB */
SV *longest_float; /* longest floating string found in pattern */
I32 offset_float_min; /* earliest point in string it can appear */
I32 offset_float_max; /* latest point in string it can appear */
- I32 *minlen_float; /* pointer to the minlen relevent to the string */
+ I32 *minlen_float; /* pointer to the minlen relevant to the string */
I32 lookbehind_float; /* is the position of the string modified by LB */
I32 flags;
I32 whilem_c;
#define SCF_SEEN_ACCEPT 0x8000
#define UTF cBOOL(RExC_utf8)
-#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
-#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
+#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
+#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
+#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
+#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
+#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
+#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
+#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
+
#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
#define OOB_UNICODE 12345678
ANYOF_CLASS_ZERO(cl);
ANYOF_BITMAP_SETALL(cl);
- cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
+ cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
if (LOC)
cl->flags |= ANYOF_LOCALE;
- cl->flags |= ANYOF_FOLD;
}
/* Can match anything (initialization) */
PERL_ARGS_ASSERT_CL_AND;
assert(and_with->type == ANYOF);
- if (!(and_with->flags & ANYOF_CLASS)
- && !(cl->flags & ANYOF_CLASS)
+
+ if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
+ && !(ANYOF_CLASS_TEST_ANY_SET(cl))
&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && !(and_with->flags & ANYOF_FOLD)
- && !(cl->flags & ANYOF_FOLD)) {
+ && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
+ && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
int i;
if (and_with->flags & ANYOF_INVERT)
if (!(and_with->flags & ANYOF_EOS))
cl->flags &= ~ANYOF_EOS;
- if (!(and_with->flags & ANYOF_FOLD))
- cl->flags &= ~ANYOF_FOLD;
+ if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
+ cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
+ if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
+ cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
- if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
- !(and_with->flags & ANYOF_INVERT)) {
- cl->flags &= ~ANYOF_UNICODE_ALL;
+ if (cl->flags & ANYOF_UNICODE_ALL
+ && and_with->flags & ANYOF_NONBITMAP
+ && !(and_with->flags & ANYOF_INVERT))
+ {
+ if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
+ cl->flags &= ~ANYOF_UNICODE_ALL;
+ }
cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
only the one(s)
actually set */
* (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
*/
if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && !(or_with->flags & ANYOF_FOLD)
- && !(cl->flags & ANYOF_FOLD) ) {
+ && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
+ && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
int i;
for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
} else {
/* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && (!(or_with->flags & ANYOF_FOLD)
- || (cl->flags & ANYOF_FOLD)) ) {
+ && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
+ || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
int i;
/* OR char bitmap and class bitmap separately */
for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
cl->bitmap[i] |= or_with->bitmap[i];
- if (or_with->flags & ANYOF_CLASS) {
+ if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
cl->classflags[i] |= or_with->classflags[i];
cl->flags |= ANYOF_CLASS;
}
if (or_with->flags & ANYOF_EOS)
cl->flags |= ANYOF_EOS;
+ if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
+ cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
- if (or_with->flags & ANYOF_FOLD)
- cl->flags |= ANYOF_FOLD;
+ if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
+ cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
/* If both nodes match something outside the bitmap, but what they match
* outside is not the same pointer, and hence not easily compared, give up
tables that are used to generate the final compressed
representation which is what dump_trie expects.
- Part of the reason for their existance is to provide a form
+ Part of the reason for their existence is to provide a form
of documentation as to how the different representations function.
*/
/ (DUPE|DUPE) X? (?{ ... }) Y /x
-Thus EVAL blocks follwing a trie may be called a different number of times with
+Thus EVAL blocks following a trie may be called a different number of times with
and without the optimisation. With the optimisations dupes will be silently
-ignored. This inconsistant behaviour of EVAL type nodes is well established as
+ignored. This inconsistent behaviour of EVAL type nodes is well established as
the following demonstrates:
'words'=~/(word|word|word)(?{ print $1 })[xyz]/
Example of what happens on a structural level:
-The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
+The regexp /(ac|ad|ab)+/ will produce the following debug output:
1: CURLYM[1] {1,32767}(18)
5: BRANCH(8)
regnode *convert = NULL;
U32 *prev_states; /* temp array mapping each state to previous one */
/* we just use folder as a flag in utf8 */
- const U8 * const folder = ( flags == EXACTF
- ? PL_fold
- : ( flags == EXACTFL
- ? PL_fold_locale
- : NULL
- )
- );
+ const U8 * folder = NULL;
#ifdef DEBUGGING
const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
PERL_UNUSED_ARG(depth);
#endif
+ switch (flags) {
+ case EXACTFA:
+ case EXACTFU: folder = PL_fold_latin1; break;
+ case EXACTF: folder = PL_fold; break;
+ case EXACTFL: folder = PL_fold_locale; break;
+ }
+
trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
trie->refcount = 1;
trie->startstate = 1;
middle and the least common are on the outside. IMO this would be better
than a most to least common mapping as theres a decent chance the most
common letter will share a node with the least common, meaning the node
- will not be compressable. With a middle is most common approach the worst
+ will not be compressible. With a middle is most common approach the worst
case is when we have the least common nodes twice.
*/
TRIE_STORE_REVCHAR;
}
if ( set_bit ) {
- /* store the codepoint in the bitmap, and if its ascii
- also store its folded equivelent. */
+ /* store the codepoint in the bitmap, and its folded
+ * equivalent. */
TRIE_BITMAP_SET(trie,uvc);
/* store the folded codepoint */
if ( !UTF ) {
/* store first byte of utf8 representation of
- codepoints in the 127 < uvc < 256 range */
- if (127 < uvc && uvc < 192) {
- TRIE_BITMAP_SET(trie,194);
- } else if (191 < uvc ) {
- TRIE_BITMAP_SET(trie,195);
- /* && uvc < 256 -- we know uvc is < 256 already */
+ variant codepoints */
+ if (! UNI_IS_INVARIANT(uvc)) {
+ TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
}
}
set_bit = 0; /* We've done our bit :-) */
}
#endif
}
-
- if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
+#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
+#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
+#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
+#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
+
+ if (UTF
+ && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
+ && ( STR_LEN(scan) >= 6 ) )
+ {
/*
Two problematic code points in Unicode casefolding of EXACT nodes:
return stopnow;
}
-/* REx optimizer. Converts nodes into quickier variants "in place".
+/* REx optimizer. Converts nodes into quicker variants "in place".
Finds fixed substrings. */
/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
SAVEFREEPV(and_withp)
/* this is a chain of data about sub patterns we are processing that
- need to be handled seperately/specially in study_chunk. Its so
+ need to be handled separately/specially in study_chunk. Its so
we can simulate recursion without losing state. */
struct scan_frame;
typedef struct scan_frame {
which would be constructed from a pattern like /A|LIST|OF|WORDS/
- If we can find such a subseqence we need to turn the first
+ If we can find such a subsequence we need to turn the first
element into a trie and then add the subsequent branch exact
strings to the trie.
If x(1..n)==tail then we can do a simple trie, if not we make
a "jump" trie, such that when we match the appropriate word
- we "jump" to the appopriate tail node. Essentailly we turn
+ we "jump" to the appropriate tail node. Essentially we turn
a nested if into a case structure of sorts.
*/
and noper_next is the same as scan (our current
position in the regex) then the EXACT branch is
a possible optimization target. Once we have
- two or more consequetive such branches we can
+ two or more consecutive such branches we can
create a trie of the EXACT's contents and stich
it in place. If the sequence represents all of
the branches we eliminate the whole thing and
/* Check whether it is compatible with what we know already! */
int compat = 1;
+
+ /* If compatible, we or it in below. It is compatible if it is
+ * in the bitmap and either 1) its bit or its fold is set, or 2)
+ * it's for a locale. Even if there isn't unicode semantics
+ * here, at runtime there may be because of matching against a
+ * utf8 string, so accept a possible false positive for
+ * latin1-range folds */
if (uc >= 0x100 ||
(!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
&& !ANYOF_BITMAP_TEST(data->start_class, uc)
- && (!(data->start_class->flags & ANYOF_FOLD)
- || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
+ && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
+ || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
)
+ {
compat = 0;
+ }
ANYOF_CLASS_ZERO(data->start_class);
ANYOF_BITMAP_ZERO(data->start_class);
if (compat)
ANYOF_BITMAP_SET(data->start_class, uc);
+ else if (uc >= 0x100) {
+ int i;
+
+ /* Some Unicode code points fold to the Latin1 range; as
+ * XXX temporary code, instead of figuring out if this is
+ * one, just assume it is and set all the start class bits
+ * that could be some such above 255 code point's fold
+ * which will generate false positives. As the code
+ * elsewhere that does compute the fold settles down, it
+ * can be extracted out and re-used here */
+ for (i = 0; i < 256; i++){
+ if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
+ ANYOF_BITMAP_SET(data->start_class, i);
+ }
+ }
+ }
data->start_class->flags &= ~ANYOF_EOS;
if (uc < 0x100)
data->start_class->flags &= ~ANYOF_UNICODE_ALL;
if (flags & SCF_DO_STCLASS_AND) {
/* Check whether it is compatible with what we know already! */
int compat = 1;
-
if (uc >= 0x100 ||
- (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
- && !ANYOF_BITMAP_TEST(data->start_class, uc)
- && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
+ (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+ && !ANYOF_BITMAP_TEST(data->start_class, uc)
+ && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
+ {
compat = 0;
+ }
ANYOF_CLASS_ZERO(data->start_class);
ANYOF_BITMAP_ZERO(data->start_class);
if (compat) {
ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
- data->start_class->flags |= ANYOF_FOLD;
- if (OP(scan) == EXACTFL)
+ data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
+ if (OP(scan) == EXACTFL) {
data->start_class->flags |= ANYOF_LOCALE;
+ }
+ else {
+
+ /* Also set the other member of the fold pair. In case
+ * that unicode semantics is called for at runtime, use
+ * the full latin1 fold. (Can't do this for locale,
+ * because not known until runtime) */
+ ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
+ }
+ }
+ else if (uc >= 0x100) {
+ int i;
+ for (i = 0; i < 256; i++){
+ if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
+ ANYOF_BITMAP_SET(data->start_class, i);
+ }
+ }
}
}
else if (flags & SCF_DO_STCLASS_OR) {
- if (data->start_class->flags & ANYOF_FOLD) {
+ if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
/* false positive possible if the class is case-folded.
Assume that the locale settings are the same... */
- if (uc < 0x100)
+ if (uc < 0x100) {
ANYOF_BITMAP_SET(data->start_class, uc);
+ if (OP(scan) != EXACTFL) {
+
+ /* And set the other member of the fold pair, but
+ * can't do that in locale because not known until
+ * run-time */
+ ANYOF_BITMAP_SET(data->start_class,
+ PL_fold_latin1[uc]);
+ }
+ }
data->start_class->flags &= ~ANYOF_EOS;
}
cl_and(data->start_class, and_withp);
f |= SCF_DO_STCLASS_AND;
f &= ~SCF_DO_STCLASS_OR;
}
- /* These are the cases when once a subexpression
- fails at a particular position, it cannot succeed
- even after backtracking at the enclosing scope.
-
- XXXX what if minimal match and we are at the
- initial run of {n,m}? */
- if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
+ /* Exclude from super-linear cache processing any {n,m}
+ regops for which the combination of input pos and regex
+ pos is not enough information to determine if a match
+ will be possible.
+
+ For example, in the regex /foo(bar\s*){4,8}baz/ with the
+ regex pos at the \s*, the prospects for a match depend not
+ only on the input position but also on how many (bar\s*)
+ repeats into the {4,8} we are. */
+ if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
f &= ~SCF_WHILEM_VISITED_POS;
/* This will finish on WHILEM, setting scan, or on NULL: */
#ifdef DEBUGGING
OP(nxt1 + 1) = OPTIMIZED; /* was count. */
OP(nxt + 1) = OPTIMIZED; /* was count. */
- NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
- NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
+ NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
+ NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
#endif
#if 0
while ( nxt1 && (OP(nxt1) != WHILEM)) {
NEXT_OFF(oscan) += NEXT_OFF(next);
}
continue;
- default: /* REF and CLUMP only? */
+ default: /* REF, ANYOFV, and CLUMP only? */
if (flags & SCF_DO_SUBSTR) {
SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
data->longest = &(data->longest_float);
}
}
else if (OP(scan) == FOLDCHAR) {
- int d = ARG(scan)==0xDF ? 1 : 2;
+ int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
flags &= ~SCF_DO_STCLASS;
min += 1;
delta += d;
goto do_default;
if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
- || ((data->start_class->flags & ANYOF_CLASS)
- && ANYOF_CLASS_TEST_ANY_SET(data->start_class)));
+ || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
cl_anything(pRExC_state, data->start_class);
}
if (flags & SCF_DO_STCLASS_AND || !value)
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
- if (FLAGS(scan) & USE_UNI) {
+ if (OP(scan) == ALNUMU) {
for (value = 0; value < 256; value++) {
if (!isWORDCHAR_L1(value)) {
ANYOF_BITMAP_CLEAR(data->start_class, value);
else {
if (data->start_class->flags & ANYOF_LOCALE)
ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
- else if (FLAGS(scan) & USE_UNI) {
+ else if (OP(scan) == ALNUMU) {
for (value = 0; value < 256; value++) {
if (isWORDCHAR_L1(value)) {
ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
- case ALNUML:
- if (flags & SCF_DO_STCLASS_AND) {
- if (data->start_class->flags & ANYOF_LOCALE)
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
- }
- else {
- ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
- data->start_class->flags |= ANYOF_LOCALE;
- }
- break;
case NALNUM:
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
- if (FLAGS(scan) & USE_UNI) {
+ if (OP(scan) == NALNUMU) {
for (value = 0; value < 256; value++) {
if (isWORDCHAR_L1(value)) {
ANYOF_BITMAP_CLEAR(data->start_class, value);
if (data->start_class->flags & ANYOF_LOCALE)
ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
else {
- for (value = 0; value < 256; value++)
- if (!isALNUM(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ if (OP(scan) == NALNUMU) {
+ for (value = 0; value < 256; value++) {
+ if (! isWORDCHAR_L1(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
+ } else {
+ for (value = 0; value < 256; value++) {
+ if (! isALNUM(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
+ }
}
}
break;
- case NALNUML:
- if (flags & SCF_DO_STCLASS_AND) {
- if (data->start_class->flags & ANYOF_LOCALE)
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
- }
- else {
- data->start_class->flags |= ANYOF_LOCALE;
- ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
- }
- break;
case SPACE:
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
- if (FLAGS(scan) & USE_UNI) {
+ if (OP(scan) == SPACEU) {
for (value = 0; value < 256; value++) {
if (!isSPACE_L1(value)) {
ANYOF_BITMAP_CLEAR(data->start_class, value);
if (data->start_class->flags & ANYOF_LOCALE) {
ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
}
- else if (FLAGS(scan) & USE_UNI) {
+ else if (OP(scan) == SPACEU) {
for (value = 0; value < 256; value++) {
if (isSPACE_L1(value)) {
ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
- case SPACEL:
- if (flags & SCF_DO_STCLASS_AND) {
- if (data->start_class->flags & ANYOF_LOCALE)
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
- }
- else {
- data->start_class->flags |= ANYOF_LOCALE;
- ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
- }
- break;
case NSPACE:
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
- if (FLAGS(scan) & USE_UNI) {
+ if (OP(scan) == NSPACEU) {
for (value = 0; value < 256; value++) {
if (isSPACE_L1(value)) {
ANYOF_BITMAP_CLEAR(data->start_class, value);
else {
if (data->start_class->flags & ANYOF_LOCALE)
ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
- else if (FLAGS(scan) & USE_UNI) {
+ else if (OP(scan) == NSPACEU) {
for (value = 0; value < 256; value++) {
if (!isSPACE_L1(value)) {
ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
- case NSPACEL:
- if (flags & SCF_DO_STCLASS_AND) {
- if (data->start_class->flags & ANYOF_LOCALE) {
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
- for (value = 0; value < 256; value++)
- if (!isSPACE(value))
- ANYOF_BITMAP_CLEAR(data->start_class, value);
- }
- }
- else {
- data->start_class->flags |= ANYOF_LOCALE;
- ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
- }
- break;
case DIGIT:
if (flags & SCF_DO_STCLASS_AND) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
#endif
REGEXP *
-Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
+Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
{
dVAR;
REGEXP *rx;
regnode *scan;
I32 flags;
I32 minlen = 0;
+ U32 pm_flags;
/* these are all flags - maybe they should be turned
* into a single int with different bit masks */
I32 sawlookahead = 0;
I32 sawplus = 0;
I32 sawopen = 0;
+ bool used_setjump = FALSE;
U8 jump_ret = 0;
dJMPENV;
DEBUG_r(if (!PL_colorset) reginitcolors());
RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
+ RExC_uni_semantics = 0;
-
+ /****************** LONG JUMP TARGET HERE***********************/
/* Longjmp back to here if have to switch in midstream to utf8 */
if (! RExC_orig_utf8) {
JMPENV_PUSH(jump_ret);
+ used_setjump = TRUE;
}
if (jump_ret == 0) { /* First time through */
- exp = SvPV(pattern, plen);
- xend = exp + plen;
+ exp = SvPV(pattern, plen);
+ xend = exp + plen;
+ /* ignore the utf8ness if the pattern is 0 length */
+ if (plen == 0) {
+ RExC_utf8 = RExC_orig_utf8 = 0;
+ }
DEBUG_COMPILE_r({
SV *dsv= sv_newmortal();
restudied = 0;
#endif
+ /* Set to use unicode semantics if the pattern is in utf8 and has the
+ * 'depends' charset specified, as it means unicode when utf8 */
+ pm_flags = orig_pm_flags;
+
+ if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
+ set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
+ }
+
RExC_precomp = exp;
RExC_flags = pm_flags;
RExC_sawback = 0;
RExC_seen = 0;
+ RExC_in_lookbehind = 0;
RExC_seen_zerolen = *exp == '^' ? -1 : 0;
RExC_seen_evals = 0;
RExC_extralen = 0;
return(NULL);
}
- /* Here, finished first pass. Get rid of our setjmp, which we added for
- * efficiency only if the passed-in string wasn't in utf8, as shown by
- * RExC_orig_utf8. But if the first pass was redone, that variable will be
- * 1 here even though the original string wasn't utf8, but in this case
- * there will have been a long jump */
- if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
+ /* Here, finished first pass. Get rid of any added setjmp */
+ if (used_setjump) {
JMPENV_POP;
}
+
DEBUG_PARSE_r({
PerlIO_printf(Perl_debug_log,
"Required size %"IVdf" nodes\n"
RExC_lastnum=0;
RExC_lastparse=NULL;
});
+
+ /* The first pass could have found things that force Unicode semantics */
+ if ((RExC_utf8 || RExC_uni_semantics)
+ && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
+ {
+ set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
+ }
+
/* Small enough for pointer-storage convention?
If extralen==0, this means that we will not need long jumps. */
if (RExC_size >= 0x10000L && RExC_extralen)
r->extflags = pm_flags;
{
bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
- bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
+ bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
/* The caret is output if there are any defaults: if not all the STD
* flags are set, or if no character set specifier is needed */
* covered by the caret */
const STRLEN wraplen = plen + has_p + has_runon
+ has_default /* If needs a caret */
- + has_charset /* If needs a character set specifier */
+
+ /* If needs a character set specifier */
+ + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
+ (sizeof(STD_PAT_MODS) - 1)
+ (sizeof("(?:)") - 1);
*p++= DEFAULT_PAT_MOD;
}
if (has_charset) {
- if (r->extflags & RXf_PMf_LOCALE) {
- *p++ = LOCALE_PAT_MOD;
- } else {
- *p++ = UNICODE_PAT_MOD;
- }
+ STRLEN len;
+ const char* const name = get_regex_charset_name(r->extflags, &len);
+ Copy(name, p, len, char);
+ p += len;
}
if (has_p)
*p++ = KEEPCOPY_PAT_MOD; /*'p'*/
if (PL_regkind[OP(first)] == EXACT) {
if (OP(first) == EXACT)
NOOP; /* Empty, get anchored substr later. */
- else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
+ else
ri->regstclass = first;
}
#ifdef TRIE_STCLASS
if (ri->regstclass
&& (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
ri->regstclass = NULL;
+
+ /* If the synthetic start class were to ever be used when EOS is set,
+ * that bit would have to be cleared, as it is shared with another */
if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
&& stclass_flag
&& !(data.start_class->flags & ANYOF_EOS)
r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
= r->float_substr = r->float_utf8 = NULL;
+
+ /* If the synthetic start class were to ever be used when EOS is set,
+ * that bit would have to be cleared, as it is shared with another */
if (!(data.start_class->flags & ANYOF_EOS)
&& !cl_is_anything(data.start_class))
{
else {
regnode *first = ri->program + 1;
U8 fop = OP(first);
- U8 nop = OP(NEXTOPER(first));
-
- if (PL_regkind[fop] == NOTHING && nop == END)
+
+ if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
r->extflags |= RXf_NULL;
- else if (PL_regkind[fop] == BOL && nop == END)
+ else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
r->extflags |= RXf_START_ONLY;
- else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
+ else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
+ && OP(regnext(first)) == END)
r->extflags |= RXf_WHITE;
}
#endif
DEBUG_PARSE_MSG((funcname)); \
PerlIO_printf(Perl_debug_log,fmt "\n",args); \
})
+
+/* This section of code defines the inversion list object and its methods. The
+ * interfaces are highly subject to change, so as much as possible is static to
+ * this file. An inversion list is here implemented as a malloc'd C array with
+ * some added info. More will be coming when functionality is added later.
+ *
+ * Some of the methods should always be private to the implementation, and some
+ * should eventually be made public */
+
+#define INVLIST_INITIAL_LEN 10
+#define INVLIST_ARRAY_KEY "array"
+#define INVLIST_MAX_KEY "max"
+#define INVLIST_LEN_KEY "len"
+
+PERL_STATIC_INLINE UV*
+S_invlist_array(pTHX_ HV* const invlist)
+{
+    /* Hands back the address of the UV array that backs this inversion
+     * list.  The address is kept, as an unsigned integer, in the hash entry
+     * named by INVLIST_ARRAY_KEY; a list lacking that entry is a fatal
+     * error.  Ask again after any change of length, because malloc or
+     * realloc may have moved the storage */
+
+    SV** const array_entry = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
+
+    PERL_ARGS_ASSERT_INVLIST_ARRAY;
+
+    if (! array_entry) {
+        Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
+                   INVLIST_ARRAY_KEY);
+    }
+
+    /* Turn the stored integer back into a pointer */
+    return INT2PTR(UV *, SvUV(*array_entry));
+}
+
+PERL_STATIC_INLINE void
+S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
+{
+    /* Records 'array' as the storage belonging to the inversion list.  The
+     * pointer is converted to an unsigned integer and saved in a fresh SV
+     * under INVLIST_ARRAY_KEY; failure to store it is a fatal error */
+
+    SV* address_sv;
+
+    PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
+
+    address_sv = newSVuv(PTR2UV(array));
+
+    if (! hv_stores(invlist, INVLIST_ARRAY_KEY, address_sv)) {
+        Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
+                   INVLIST_ARRAY_KEY);
+    }
+}
+
+PERL_STATIC_INLINE UV
+S_invlist_len(pTHX_ HV* const invlist)
+{
+    /* Reports how many elements of the inversion list's array are currently
+     * in use.  The count lives in the hash entry named by INVLIST_LEN_KEY; a
+     * list lacking that entry is a fatal error */
+
+    SV** const len_entry = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
+
+    PERL_ARGS_ASSERT_INVLIST_LEN;
+
+    if (! len_entry) {
+        Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
+                   INVLIST_LEN_KEY);
+    }
+
+    return SvUV(*len_entry);
+}
+
+PERL_STATIC_INLINE UV
+S_invlist_max(pTHX_ HV* const invlist)
+{
+    /* Reports the capacity of the inversion list's array, i.e. how many
+     * elements it can hold before a realloc() is required.  The capacity
+     * lives in the hash entry named by INVLIST_MAX_KEY; a list lacking that
+     * entry is a fatal error */
+
+    SV** const max_entry = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
+
+    PERL_ARGS_ASSERT_INVLIST_MAX;
+
+    if (! max_entry) {
+        Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
+                   INVLIST_MAX_KEY);
+    }
+
+    return SvUV(*max_entry);
+}
+
+PERL_STATIC_INLINE void
+S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
+{
+    /* Records 'len' as the number of array elements in use.  A non-zero
+     * 'len' may not exceed the list's capacity; violating that, or failing
+     * to store the value, is a fatal error */
+
+    PERL_ARGS_ASSERT_INVLIST_SET_LEN;
+
+    /* Zero is accepted without consulting the capacity at all, so the
+     * length can be set on a list whose capacity entry does not exist yet */
+    if (len != 0) {
+        const UV capacity = invlist_max(invlist);
+
+        if (len > capacity) {
+            Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, capacity);
+        }
+    }
+
+    if (! hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len))) {
+        Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
+                   INVLIST_LEN_KEY);
+    }
+}
+
+PERL_STATIC_INLINE void
+S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
+{
+
+    /* Sets the maximum number of elements storable in the inversion list
+     * without having to realloc().
+     *
+     * 'max' may not be smaller than the number of elements currently in use,
+     * so the list's length entry must already exist when this is called
+     * (invlist_len() croaks otherwise).  Both an undersized 'max' and a
+     * failure to store the new value are fatal errors */
+
+    PERL_ARGS_ASSERT_INVLIST_SET_MAX;
+
+    if (max < invlist_len(invlist)) {
+
+        /* Report the rejected value and the length it conflicts with.  The
+         * current capacity is deliberately not looked up here: it is not
+         * what was asked for, and on a list that has no capacity entry yet
+         * the lookup would itself croak with an unrelated message */
+        Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, max, INVLIST_LEN_KEY, invlist_len(invlist));
+    }
+
+    if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
+        Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
+                   INVLIST_MAX_KEY);
+    }
+}
+
+#ifndef PERL_IN_XSUB_RE
+HV*
+Perl__new_invlist(pTHX_ IV initial_size)
+{
+
+    /* Builds and returns an empty inversion list whose array has room for
+     * 'initial_size' elements.  A negative 'initial_size' selects the
+     * built-in default, INVLIST_INITIAL_LEN */
+
+    HV* const new_list = newHV();
+    UV* storage;
+    const IV capacity = (initial_size < 0)
+                        ? INVLIST_INITIAL_LEN
+                        : initial_size;
+
+    /* Get the array and attach it to the list */
+    Newx(storage, capacity, UV);
+    invlist_set_array(new_list, storage);
+
+    /* The length goes in first, because setting the capacity checks itself
+     * against the length */
+    invlist_set_len(new_list, 0);
+    invlist_set_max(new_list, capacity);
+
+    return new_list;
+}
+#endif
+
+PERL_STATIC_INLINE void
+S_invlist_destroy(pTHX_ HV* const invlist)
+{
+    /* Releases the array belonging to the inversion list, if it has one.
+     * Only the array is freed here; nothing else is done to the hash */
+
+    SV** const array_entry = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
+    UV* storage;
+
+    PERL_ARGS_ASSERT_INVLIST_DESTROY;
+
+    if (! array_entry) {    /* No array was ever stored: nothing to free */
+        return;
+    }
+
+    /* Go through a named variable, as PERL_POISON needs an lvalue */
+    storage = INT2PTR(UV *, SvUV(*array_entry));
+    Safefree(storage);
+}
+
+STATIC void
+S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
+{
+    /* Resizes the inversion list's array so that it holds exactly 'new_max'
+     * elements; the new size may be larger or smaller than the old one.
+     * Nothing happens when the capacity is already 'new_max' */
+
+    const UV current_max = invlist_max(invlist);
+    UV* before;
+    UV* after;
+
+    PERL_ARGS_ASSERT_INVLIST_EXTEND;
+
+    if (current_max != new_max) {
+        before = invlist_array(invlist);
+        after = before;
+        Renew(after, new_max, UV);
+
+        /* The stored address only needs replacing when the resize
+         * relocated the array */
+        if (after != before) {
+            invlist_set_array(invlist, after);
+        }
+
+        invlist_set_max(invlist, new_max);
+    }
+}
+
+PERL_STATIC_INLINE void
+S_invlist_trim(pTHX_ HV* const invlist)
+{
+    /* Resizes the inversion list's array to exactly the number of elements
+     * now in use, by passing the current length to invlist_extend() as the
+     * new capacity */
+
+    UV in_use;
+
+    PERL_ARGS_ASSERT_INVLIST_TRIM;
+
+    in_use = invlist_len(invlist);
+    invlist_extend(invlist, in_use);
+}
+
+/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
+ * etc */
+
+#define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
+
+#ifndef PERL_IN_XSUB_RE
+void
+Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
+{
+ /* Subject to change or removal. Append the range from 'start' to 'end' at
+ * the end of the inversion list. The range must be above any existing
+ * ones.
+ *
+ * 'invlist' is modified in place; 'start' and 'end' are the first and last
+ * values of the range, both inclusive. Croaks if the list's final element
+ * is above 'start', or if the list is non-empty and has an odd number of
+ * elements (meaning its final range has no end). When 'end' is UV_MAX, no
+ * ending element is stored, so the appended range likewise has no end. */
+
+ UV* array = invlist_array(invlist);
+ UV max = invlist_max(invlist);
+ UV len = invlist_len(invlist);
+
+ PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
+
+ if (len > 0) {
+
+ /* Here, the existing list is non-empty. The current max entry in the
+ * list is generally the first value not in the set, except when the
+ * set extends to the end of permissible values, in which case it is
+ * the first entry in that final set, and so this call is an attempt to
+ * append out-of-order */
+
+ UV final_element = len - 1;
+ if (array[final_element] > start
+ || ELEMENT_IN_INVLIST_SET(final_element))
+ {
+ Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
+ }
+
+ /* Here, it is a legal append. If the new range begins with the first
+ * value not in the set, it is extending the set, so the new first
+ * value not in the set is one greater than the newly extended range.
+ * */
+ if (array[final_element] == start) {
+ if (end != UV_MAX) {
+ array[final_element] = end + 1;
+ }
+ else {
+ /* But if the end is the maximum representable on the machine,
+ * just let the range that this would extend have no end */
+ invlist_set_len(invlist, len - 1);
+ }
+ return;
+ }
+ }
+
+ /* Here the new range doesn't extend any existing set. Add it */
+
+ len += 2; /* Includes an element each for the start and end of range */
+
+ /* If overflows the existing space, extend, which may cause the array to be
+ * moved, so the local pointer to it is re-fetched */
+ if (max < len) {
+ invlist_extend(invlist, len);
+ array = invlist_array(invlist);
+ }
+
+ invlist_set_len(invlist, len);
+
+ /* The next item on the list starts the range, the one after that is
+ * one past the new range. */
+ array[len - 2] = start;
+ if (end != UV_MAX) {
+ array[len - 1] = end + 1;
+ }
+ else {
+ /* But if the end is the maximum representable on the machine, just let
+ * the range have no end */
+ invlist_set_len(invlist, len - 1);
+ }
+}
+#endif
+
+PERL_STATIC_INLINE HV*
+S_invlist_union(pTHX_ HV* const a, HV* const b)
+{
+ /* Return a new inversion list which is the union of two inversion lists.
+ * The arrays of the inputs 'a' and 'b' are only read, never written.
+ * The basis for this comes from "Unicode Demystified" Chapter 13 by
+ * Richard Gillam, published by Addison-Wesley, and explained at some
+ * length there. The preface says to incorporate its examples into your
+ * code at your own risk.
+ *
+ * The algorithm is like a merge sort.
+ *
+ * XXX A potential performance improvement is to keep track as we go along
+ * if only one of the inputs contributes to the result, meaning the other
+ * is a subset of that one. In that case, we can skip the final copy and
+ * return the larger of the input lists */
+
+ UV* array_a = invlist_array(a); /* a's array */
+ UV* array_b = invlist_array(b);
+ UV len_a = invlist_len(a); /* length of a's array */
+ UV len_b = invlist_len(b);
+
+ HV* u; /* the resulting union */
+ UV* array_u;
+ UV len_u;
+
+ UV i_a = 0; /* current index into a's array */
+ UV i_b = 0;
+ UV i_u = 0;
+
+ /* running count, as explained in the algorithm source book; items are
+ * stopped accumulating and are output when the count changes to/from 0.
+ * The count is incremented when we start a range that's in the set, and
+ * decremented when we start a range that's not in the set. So its range
+ * is 0 to 2. Only when the count is zero is something not in the set.
+ */
+ UV count = 0;
+
+ PERL_ARGS_ASSERT_INVLIST_UNION;
+
+ /* Size the union for the worst case: that the sets are completely
+ * disjoint */
+ u = _new_invlist(len_a + len_b);
+ array_u = invlist_array(u);
+
+ /* Go through each list item by item, stopping when exhausted one of
+ * them */
+ while (i_a < len_a && i_b < len_b) {
+ UV cp; /* The element to potentially add to the union's array */
+ bool cp_in_set; /* is it in the input list's set or not */
+
+ /* We need to take one or the other of the two inputs for the union.
+ * Since we are merging two sorted lists, we take the smaller of the
+ * next items. In case of a tie, we take the one that is in its set
+ * first. If we took one not in the set first, it would decrement the
+ * count, possibly to 0 which would cause it to be output as ending the
+ * range, and the next time through we would take the same number, and
+ * output it again as beginning the next range. By doing it the
+ * opposite way, there is no possibility that the count will be
+ * momentarily decremented to 0, and thus the two adjoining ranges will
+ * be seamlessly merged. (In a tie and both are in the set or both not
+ * in the set, it doesn't matter which we take first.) */
+ if (array_a[i_a] < array_b[i_b]
+ || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
+ {
+ cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
+ cp= array_a[i_a++];
+ }
+ else {
+ cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
+ cp= array_b[i_b++];
+ }
+
+ /* Here, have chosen which of the two inputs to look at. Only output
+ * if the running count changes to/from 0, which marks the
+ * beginning/end of a range that's in the set */
+ if (cp_in_set) {
+ if (count == 0) {
+ array_u[i_u++] = cp;
+ }
+ count++;
+ }
+ else {
+ count--;
+ if (count == 0) {
+ array_u[i_u++] = cp;
+ }
+ }
+ }
+
+ /* Here, we are finished going through at least one of the lists, which
+ * means there is something remaining in at most one. We check if the list
+ * that hasn't been exhausted is positioned such that we are in the middle
+ * of a range in its set or not. (We are in the set if the next item in
+ * the array marks the beginning of something not in the set) If in the
+ * set, we decrement 'count'; if 0, there is potentially more to output.
+ * There are four cases:
+ * 1) Both weren't in their sets, count is 0, and remains 0. What's left
+ * in the union is entirely from the non-exhausted set.
+ * 2) Both were in their sets, count is 2. Nothing further should
+ * be output, as everything that remains will be in the exhausted
+ * list's set, hence in the union; decrementing to 1 but not 0 ensures
+ * that
+ * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
+ * Nothing further should be output because the union includes
+ * everything from the exhausted set. Not decrementing ensures that.
+ * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
+ * decrementing to 0 ensures that we look at the remainder of the
+ * non-exhausted set */
+ if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
+ || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
+ {
+ count--;
+ }
+
+ /* The final length is what we've output so far, plus what else is about to
+ * be output. (If 'count' is non-zero, then the input list we exhausted
+ * has everything remaining up to the machine's limit in its set, and hence
+ * in the union, so there will be no further output.) */
+ len_u = i_u;
+ if (count == 0) {
+ /* At most one of the subexpressions will be non-zero */
+ len_u += (len_a - i_a) + (len_b - i_b);
+ }
+
+ /* Set result to final length, which can change the pointer to array_u, so
+ * re-find it */
+ if (len_u != invlist_len(u)) {
+ invlist_set_len(u, len_u);
+ invlist_trim(u);
+ array_u = invlist_array(u);
+ }
+
+ /* When 'count' is 0, the list that was exhausted (if one was shorter than
+ * the other) ended with everything above it not in its set. That means
+ * that the remaining part of the union is precisely the same as the
+ * non-exhausted list, so can just copy it unchanged. (If both lists were
+ * exhausted at the same time, then the operations below will be both 0.)
+ */
+ if (count == 0) {
+ IV copy_count; /* At most one will have a non-zero copy count */
+ if ((copy_count = len_a - i_a) > 0) {
+ Copy(array_a + i_a, array_u + i_u, copy_count, UV);
+ }
+ else if ((copy_count = len_b - i_b) > 0) {
+ Copy(array_b + i_b, array_u + i_u, copy_count, UV);
+ }
+ }
+
+ return u;
+}
+
+PERL_STATIC_INLINE HV*
+S_invlist_intersection(pTHX_ HV* const a, HV* const b)
+{
+    /* Return a new inversion list which is the intersection of the two
+     * inversion lists 'a' and 'b'.  The arrays of the inputs are only read,
+     * never written.  The basis for this comes from "Unicode Demystified"
+     * Chapter 13 by Richard Gillam, published by Addison-Wesley, and
+     * explained at some length there.  The preface says to incorporate its
+     * examples into your code at your own risk.
+     *
+     * The algorithm is like a merge sort, and is essentially the same as the
+     * one used for computing a union, except that output happens when the
+     * running count changes to/from 2 instead of to/from 0, that ties are
+     * broken the opposite way, and that what happens to the remainder of the
+     * longer list is decided by the state of the list that ran out first. */
+
+    UV* array_a = invlist_array(a);   /* a's array */
+    UV* array_b = invlist_array(b);
+    UV len_a = invlist_len(a);        /* length of a's array */
+    UV len_b = invlist_len(b);
+
+    HV* r;                            /* the resulting intersection */
+    UV* array_r;
+    UV len_r;
+
+    UV i_a = 0;                       /* current index into a's array */
+    UV i_b = 0;
+    UV i_r = 0;
+
+    /* running count, as explained in the algorithm source book; items are
+     * stopped accumulating and are output when the count changes to/from 2.
+     * The count is incremented when we start a range that's in the set, and
+     * decremented when we start a range that's not in the set.  So its range
+     * within the merge loop is 0 to 2.  Only when the count is 2 is
+     * something in the intersection. */
+    UV count = 0;
+
+    PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
+
+    /* Size the intersection for the worst case: that the intersection ends up
+     * fragmenting everything to be completely disjoint */
+    r = _new_invlist(len_a + len_b);
+    array_r = invlist_array(r);
+
+    /* Go through each list item by item, stopping when exhausted one of
+     * them */
+    while (i_a < len_a && i_b < len_b) {
+        UV cp;          /* The element to potentially add to the
+                           intersection's array */
+        bool cp_in_set; /* Is it in the input list's set or not */
+
+        /* We need to take one or the other of the two inputs for the
+         * intersection.  Since we are merging two sorted lists, we take the
+         * smaller of the next items.  In case of a tie, we take the one that
+         * is not in its set first (a difference from the union algorithm).
+         * If we took one in the set first, it would increment the count,
+         * possibly to 2 which would cause it to be output as starting a range
+         * in the intersection, and the next time through we would take that
+         * same number, and output it again as ending the set.  By doing it
+         * the opposite of this, there is no possibility that the count will
+         * be momentarily incremented to 2.  (In a tie and both are in the set
+         * or both not in the set, it doesn't matter which we take first.) */
+        if (array_a[i_a] < array_b[i_b]
+            || (array_a[i_a] == array_b[i_b]
+                && ! ELEMENT_IN_INVLIST_SET(i_a)))
+        {
+            cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
+            cp = array_a[i_a++];
+        }
+        else {
+            cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
+            cp = array_b[i_b++];
+        }
+
+        /* Here, have chosen which of the two inputs to look at.  Only output
+         * if the running count changes to/from 2, which marks the
+         * beginning/end of a range that's in the intersection */
+        if (cp_in_set) {
+            count++;
+            if (count == 2) {
+                array_r[i_r++] = cp;
+            }
+        }
+        else {
+            if (count == 2) {
+                array_r[i_r++] = cp;
+            }
+            count--;
+        }
+    }
+
+    /* Here, we are finished going through at least one of the lists, which
+     * means there is something remaining in at most one.  Whether any of that
+     * remainder belongs to the intersection depends solely on the list that
+     * was exhausted: if it ended in the middle of a range in its set (it has
+     * an odd number of elements, so the index just past its end is odd), that
+     * range runs up to the machine's limit, and so everything remaining in
+     * the other list is in the intersection; otherwise nothing further is.
+     * In the former case, the count is incremented so that it is at least 2.
+     * There are four cases:
+     *  1) Both weren't in their sets, count is 0, and remains 0.  There's
+     *     nothing left in the intersection.
+     *  2) Both were in their sets, count is 2 and is incremented to 3.  The
+     *     start of the current range has already been output; the remainder
+     *     of the non-exhausted list finishes it and supplies any others.
+     *  3) The exhausted was in its set, non-exhausted isn't, count is 1 and
+     *     is incremented to 2.  The intersection is everything that remains
+     *     in the non-exhausted list.
+     *  4) The exhausted wasn't in its set, non-exhausted is, count is 1 and
+     *     remains 1.  The intersection has nothing more. */
+    if ((i_a == len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
+        || (i_b == len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
+    {
+        count++;
+    }
+
+    /* The final length is what we've output so far plus what else is in the
+     * intersection.  At most one of the subexpressions below will be
+     * non-zero */
+    len_r = i_r;
+    if (count >= 2) {
+        len_r += (len_a - i_a) + (len_b - i_b);
+    }
+
+    /* Set result to final length, which can change the pointer to array_r, so
+     * re-find it */
+    if (len_r != invlist_len(r)) {
+        invlist_set_len(r, len_r);
+        invlist_trim(r);
+        array_r = invlist_array(r);
+    }
+
+    /* Finish outputting any remaining.  At most one of the lists will have a
+     * non-zero copy count */
+    if (count >= 2) {
+        IV copy_count;
+        if ((copy_count = len_a - i_a) > 0) {
+            Copy(array_a + i_a, array_r + i_r, copy_count, UV);
+        }
+        else if ((copy_count = len_b - i_b) > 0) {
+            Copy(array_b + i_b, array_r + i_r, copy_count, UV);
+        }
+    }
+
+    return r;
+}
+
+STATIC HV*
+S_add_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
+{
+    /* Make the inversion list's set include every value from 'start' through
+     * 'end', inclusive, and return a pointer to the list holding the result.
+     * That is either 'invlist' itself, updated in place, or a newly created
+     * list, in which case invlist_destroy() has been called on the one
+     * passed in. */
+
+    const UV cur_len = invlist_len(invlist);
+    HV* single_range;
+    HV* merged;
+    bool appendable;
+
+    PERL_ARGS_ASSERT_ADD_RANGE_TO_INVLIST;
+
+    /* A range that begins at or beyond the list's final element (or any
+     * range at all when the list is empty) can simply be tacked on to the
+     * end */
+    appendable = (cur_len == 0);
+    if (! appendable) {
+        appendable = (start >= invlist_array(invlist)
+                                        [invlist_len(invlist) - 1]);
+    }
+
+    if (appendable) {
+        _append_range_to_invlist(invlist, start, end);
+        return invlist;
+    }
+
+    /* Otherwise, build a temporary list containing just this range, and take
+     * the union of it with the existing list */
+    single_range = _new_invlist(2);
+    _append_range_to_invlist(single_range, start, end);
+
+    merged = invlist_union(invlist, single_range);
+
+    /* Neither the temporary nor (if it differs from the result) the original
+     * is needed any longer */
+    invlist_destroy(single_range);
+    if (merged != invlist) {
+        invlist_destroy(invlist);
+    }
+
+    return merged;
+}
+
+/* End of inversion list object */
+
/*
- reg - regular expression, i.e. main body or parenthesized thing
*
SvREFCNT_inc_simple_void(sv_dat);
}
RExC_sawback = 1;
- ret = reganode(pRExC_state,
- (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
- num);
+ ret = reganode(pRExC_state,
+ ((! FOLD)
+ ? NREF
+ : (MORE_ASCII_RESTRICTED)
+ ? NREFFA
+ : (AT_LEAST_UNI_SEMANTICS)
+ ? NREFFU
+ : (LOC)
+ ? NREFFL
+ : NREFF),
+ num);
*flagp |= HASWIDTH;
Set_Node_Offset(ret, parse_start+1);
if (SIZE_ONLY) {
HE *he_str;
SV *sv_dat = NULL;
- if (!svname) /* shouldnt happen */
+ if (!svname) /* shouldn't happen */
Perl_croak(aTHX_
"panic: reg_scan_name returned NULL");
if (!RExC_paren_names) {
goto capturing_parens;
}
RExC_seen |= REG_SEEN_LOOKBEHIND;
+ RExC_in_lookbehind++;
RExC_parse++;
case '=': /* (?=...) */
RExC_seen_zerolen++;
that follow */
has_use_defaults = TRUE;
STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
- RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+ set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
+ ? REGEX_UNICODE_CHARSET
+ : REGEX_DEPENDS_CHARSET);
goto parse_flags;
default:
--RExC_parse;
U32 posflags = 0, negflags = 0;
U32 *flagsp = &posflags;
bool has_charset_modifier = 0;
+ regex_charset cs = REGEX_DEPENDS_CHARSET;
while (*RExC_parse) {
/* && strchr("iogcmsx", *RExC_parse) */
if (has_charset_modifier || flagsp == &negflags) {
goto fail_modifiers;
}
- posflags |= RXf_PMf_LOCALE;
- negflags |= RXf_PMf_UNICODE;
+ cs = REGEX_LOCALE_CHARSET;
has_charset_modifier = 1;
break;
case UNICODE_PAT_MOD:
if (has_charset_modifier || flagsp == &negflags) {
goto fail_modifiers;
}
- posflags |= RXf_PMf_UNICODE;
- negflags |= RXf_PMf_LOCALE;
+ cs = REGEX_UNICODE_CHARSET;
+ has_charset_modifier = 1;
+ break;
+ case ASCII_RESTRICT_PAT_MOD:
+ if (has_charset_modifier || flagsp == &negflags) {
+ goto fail_modifiers;
+ }
+ if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
+ /* Doubled modifier implies more restricted */
+ cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
+ RExC_parse++;
+ }
+ else {
+ cs = REGEX_ASCII_RESTRICTED_CHARSET;
+ }
has_charset_modifier = 1;
break;
- case DUAL_PAT_MOD:
+ case DEPENDS_PAT_MOD:
if (has_use_defaults
|| has_charset_modifier
|| flagsp == &negflags)
{
goto fail_modifiers;
}
- negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+
+ /* The dual charset means unicode semantics if the
+ * pattern (or target, not known until runtime) are
+ * utf8, or something in the pattern indicates unicode
+ * semantics */
+ cs = (RExC_utf8 || RExC_uni_semantics)
+ ? REGEX_UNICODE_CHARSET
+ : REGEX_DEPENDS_CHARSET;
has_charset_modifier = 1;
break;
case ONCE_PAT_MOD: /* 'o' */
case ')':
RExC_flags |= posflags;
RExC_flags &= ~negflags;
+ set_regex_charset(&RExC_flags, cs);
if (paren != ':') {
oregflags |= posflags;
oregflags &= ~negflags;
+ set_regex_charset(&oregflags, cs);
}
nextchar(pRExC_state);
if (paren != ':') {
FAIL("Junk on end of regexp"); /* "Can't happen". */
/* NOTREACHED */
}
+
+ if (RExC_in_lookbehind) {
+ RExC_in_lookbehind--;
+ }
if (after_freeze)
RExC_npar = after_freeze;
return(ret);
STRLEN len = 0; /* Its current byte length */
char *endchar; /* Points to '.' or '}' ending cur char in the input
stream */
-
ret = reg_node(pRExC_state,
- (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
+ (U8) ((! FOLD) ? EXACT
+ : (LOC)
+ ? EXACTFL
+ : (MORE_ASCII_RESTRICTED)
+ ? EXACTFA
+ : (AT_LEAST_UNI_SEMANTICS)
+ ? EXACTFU
+ : EXACTF));
s= STRING(ret);
/* Exact nodes can hold only a U8 length's of text = 255. Loop through
| PERL_SCAN_DISALLOW_PREFIX
| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
UV cp; /* Ord of current character */
+ bool use_this_char_fold = FOLD;
/* Code points are separated by dots. If none, there is only one
* code point, and is terminated by the brace */
vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
- if (! FOLD) { /* Not folding, just append to the string */
+ if (FOLD
+ && (cp > 255 || ! MORE_ASCII_RESTRICTED)
+ && is_TRICKYFOLD_cp(cp))
+ {
+ }
+
+ /* Under /aa, we can't mix ASCII with non- in a fold. If we are
+ * folding, and the source isn't ASCII, look through all the
+ * characters it folds to. If any one of them is ASCII, forbid
+ * this fold. (cp is uni, so the 127 below is correct even for
+ * EBCDIC) */
+ if (use_this_char_fold && cp > 127 && MORE_ASCII_RESTRICTED) {
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
+ U8* s = tmpbuf;
+ U8* e;
+ STRLEN foldlen;
+
+ (void) toFOLD_uni(cp, tmpbuf, &foldlen);
+ e = s + foldlen;
+
+ while (s < e) {
+ if (isASCII(*s)) {
+ use_this_char_fold = FALSE;
+ break;
+ }
+ s += UTF8SKIP(s);
+ }
+ }
+
+ if (! use_this_char_fold) { /* Not folding, just append to the
+ string */
STRLEN unilen;
/* Quit before adding this character if would exceed limit */
Note: we have to be careful with escapes, as they can be both literal
and special, and in the case of \10 and friends can either, depending
- on context. Specifically there are two seperate switches for handling
+ on context. Specifically there are two separate switches for handling
escape sequences, with the one for handling literal escapes requiring
a dummy entry for all of the special escapes that are actually handled
by the other.
register regnode *ret = NULL;
I32 flags;
char *parse_start = RExC_parse;
+ U8 op;
GET_RE_DEBUG_FLAGS_DECL;
DEBUG_PARSE("atom");
*flagp = WORST; /* Tentatively. */
RExC_parse++;
vFAIL("Quantifier follows nothing");
break;
- case 0xDF:
- case 0xC3:
- case 0xCE:
+ case LATIN_SMALL_LETTER_SHARP_S:
+ case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+ case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
+#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
+#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
+ case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
+#endif
do_foldchar:
if (!LOC && FOLD) {
U32 len,cp;
literal text handling code.
*/
switch ((U8)*++RExC_parse) {
- case 0xDF:
- case 0xC3:
- case 0xCE:
+ case LATIN_SMALL_LETTER_SHARP_S:
+ case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+ case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
goto do_foldchar;
/* Special Escapes */
case 'A':
*flagp |= HASWIDTH;
goto finish_meta_pat;
case 'w':
- if (LOC) {
- ret = reg_node(pRExC_state, (U8)(ALNUML));
- } else {
- ret = reg_node(pRExC_state, (U8)(ALNUM));
- FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ switch (get_regex_charset(RExC_flags)) {
+ case REGEX_LOCALE_CHARSET:
+ op = ALNUML;
+ break;
+ case REGEX_UNICODE_CHARSET:
+ op = ALNUMU;
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+ op = ALNUMA;
+ break;
+ case REGEX_DEPENDS_CHARSET:
+ op = ALNUM;
+ break;
+ default:
+ goto bad_charset;
}
+ ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'W':
- if (LOC) {
- ret = reg_node(pRExC_state, (U8)(NALNUML));
- } else {
- ret = reg_node(pRExC_state, (U8)(NALNUM));
- FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ switch (get_regex_charset(RExC_flags)) {
+ case REGEX_LOCALE_CHARSET:
+ op = NALNUML;
+ break;
+ case REGEX_UNICODE_CHARSET:
+ op = NALNUMU;
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+ op = NALNUMA;
+ break;
+ case REGEX_DEPENDS_CHARSET:
+ op = NALNUM;
+ break;
+ default:
+ goto bad_charset;
}
+ ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'b':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- if (LOC) {
- ret = reg_node(pRExC_state, (U8)(BOUNDL));
- } else {
- ret = reg_node(pRExC_state, (U8)(BOUND));
- FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ switch (get_regex_charset(RExC_flags)) {
+ case REGEX_LOCALE_CHARSET:
+ op = BOUNDL;
+ break;
+ case REGEX_UNICODE_CHARSET:
+ op = BOUNDU;
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+ op = BOUNDA;
+ break;
+ case REGEX_DEPENDS_CHARSET:
+ op = BOUND;
+ break;
+ default:
+ goto bad_charset;
}
+ ret = reg_node(pRExC_state, op);
+ FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
+ if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
+ ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
+ }
goto finish_meta_pat;
case 'B':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- if (LOC) {
- ret = reg_node(pRExC_state, (U8)(NBOUNDL));
- } else {
- ret = reg_node(pRExC_state, (U8)(NBOUND));
- FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ switch (get_regex_charset(RExC_flags)) {
+ case REGEX_LOCALE_CHARSET:
+ op = NBOUNDL;
+ break;
+ case REGEX_UNICODE_CHARSET:
+ op = NBOUNDU;
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+ op = NBOUNDA;
+ break;
+ case REGEX_DEPENDS_CHARSET:
+ op = NBOUND;
+ break;
+ default:
+ goto bad_charset;
}
+ ret = reg_node(pRExC_state, op);
+ FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
+ if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
+ ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
+ }
goto finish_meta_pat;
case 's':
- if (LOC) {
- ret = reg_node(pRExC_state, (U8)(SPACEL));
- } else {
- ret = reg_node(pRExC_state, (U8)(SPACE));
- FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ switch (get_regex_charset(RExC_flags)) {
+ case REGEX_LOCALE_CHARSET:
+ op = SPACEL;
+ break;
+ case REGEX_UNICODE_CHARSET:
+ op = SPACEU;
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+ op = SPACEA;
+ break;
+ case REGEX_DEPENDS_CHARSET:
+ op = SPACE;
+ break;
+ default:
+ goto bad_charset;
}
+ ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'S':
- if (LOC) {
- ret = reg_node(pRExC_state, (U8)(NSPACEL));
- } else {
- ret = reg_node(pRExC_state, (U8)(NSPACE));
- FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ switch (get_regex_charset(RExC_flags)) {
+ case REGEX_LOCALE_CHARSET:
+ op = NSPACEL;
+ break;
+ case REGEX_UNICODE_CHARSET:
+ op = NSPACEU;
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+ op = NSPACEA;
+ break;
+ case REGEX_DEPENDS_CHARSET:
+ op = NSPACE;
+ break;
+ default:
+ goto bad_charset;
}
+ ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'd':
- ret = reg_node(pRExC_state, DIGIT);
+ switch (get_regex_charset(RExC_flags)) {
+ case REGEX_LOCALE_CHARSET:
+ op = DIGITL;
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+ op = DIGITA;
+ break;
+ case REGEX_DEPENDS_CHARSET: /* No difference between these */
+ case REGEX_UNICODE_CHARSET:
+ op = DIGIT;
+ break;
+ default:
+ goto bad_charset;
+ }
+ ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'D':
- ret = reg_node(pRExC_state, NDIGIT);
+ switch (get_regex_charset(RExC_flags)) {
+ case REGEX_LOCALE_CHARSET:
+ op = NDIGITL;
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+ op = NDIGITA;
+ break;
+ case REGEX_DEPENDS_CHARSET: /* No difference between these */
+ case REGEX_UNICODE_CHARSET:
+ op = NDIGIT;
+ break;
+ default:
+ goto bad_charset;
+ }
+ ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'R':
RExC_sawback = 1;
ret = reganode(pRExC_state,
- (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
- num);
+ ((! FOLD)
+ ? NREF
+ : (MORE_ASCII_RESTRICTED)
+ ? NREFFA
+ : (AT_LEAST_UNI_SEMANTICS)
+ ? NREFFU
+ : (LOC)
+ ? NREFFL
+ : NREFF),
+ num);
*flagp |= HASWIDTH;
/* override incorrect value set in reganode MJD */
}
RExC_sawback = 1;
ret = reganode(pRExC_state,
- (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
- num);
+ ((! FOLD)
+ ? REF
+ : (MORE_ASCII_RESTRICTED)
+ ? REFFA
+ : (AT_LEAST_UNI_SEMANTICS)
+ ? REFFU
+ : (LOC)
+ ? REFFL
+ : REFF),
+ num);
*flagp |= HASWIDTH;
/* override incorrect value set in reganode MJD */
char *s;
STRLEN foldlen;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
+ regnode * orig_emit;
parse_start = RExC_parse - 1;
defchar:
ender = 0;
+ orig_emit = RExC_emit; /* Save the original output node position in
+ case we need to output a different node
+ type */
ret = reg_node(pRExC_state,
- (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
+ (U8) ((! FOLD) ? EXACT
+ : (LOC)
+ ? EXACTFL
+ : (MORE_ASCII_RESTRICTED)
+ ? EXACTFA
+ : (AT_LEAST_UNI_SEMANTICS)
+ ? EXACTFU
+ : EXACTF)
+ );
s = STRING(ret);
for (len = 0, p = RExC_parse - 1;
len < 127 && p < RExC_end;
if (RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
switch ((U8)*p) {
- case 0xDF:
- case 0xC3:
- case 0xCE:
+ case LATIN_SMALL_LETTER_SHARP_S:
+ case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+ case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
goto normal_default;
case '^':
switch ((U8)*++p) {
/* These are all the special escapes. */
- case 0xDF:
- case 0xC3:
- case 0xCE:
+ case LATIN_SMALL_LETTER_SHARP_S:
+ case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+ case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
goto normal_default;
case 'A': /* Start assertion */
break;
case 'c':
p++;
- ender = grok_bslash_c(*p++, SIZE_ONLY);
+ ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
break;
case '0': case '1': case '2': case '3':case '4':
case '5': case '6': case '7': case '8':case '9':
FAIL("Trailing \\");
/* FALL THROUGH */
default:
- if (!SIZE_ONLY&& isALPHA(*p))
- ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
+ if (!SIZE_ONLY&& isALPHA(*p)) {
+ /* Include any { following the alpha to emphasize
+ * that it could be part of an escape at some point
+ * in the future */
+ int len = (*(p + 1) == '{') ? 2 : 1;
+ ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
+ }
goto normal_default;
}
break;
p += numlen;
}
else
- ender = *p++;
+ ender = (U8) *p++;
break;
+ } /* End of switch on the literal */
+
+ /* Certain characters are problematic because their folded
+ * length is so different from their original length that it
+ * isn't handleable by the optimizer. They are therefore not
+ * placed in an EXACTish node; and are here handled specially.
+ * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
+ * putting it in a special node keeps regexec from having to
+ * deal with a non-utf8 multi-char fold.) */
+ if (FOLD
+ && (ender > 255 || ! MORE_ASCII_RESTRICTED)
+ && is_TRICKYFOLD_cp(ender))
+ {
+ /* If is in middle of outputting characters into an
+ * EXACTish node, go output what we have so far, and
+ * position the parse so that this will be called again
+ * immediately */
+ if (len) {
+ p = RExC_parse + len - 1;
+ goto loopdone;
+ }
+ else {
+
+ /* Here we are ready to output our tricky fold
+ * character. What's done is to pretend it's in a
+ * [bracketed] class, and let the code that deals with
+ * those handle it, as that code has all the
+ * intelligence necessary. First save the current
+ * parse state, get rid of the already allocated EXACT
+ * node that the ANYOFV node will replace, and point
+ * the parse to a buffer which we fill with the
+ * character we want the regclass code to think is
+ * being parsed */
+ char* const oldregxend = RExC_end;
+ char tmpbuf[2];
+ RExC_emit = orig_emit;
+ RExC_parse = tmpbuf;
+ if (UTF) {
+ tmpbuf[0] = UTF8_TWO_BYTE_HI(ender);
+ tmpbuf[1] = UTF8_TWO_BYTE_LO(ender);
+ RExC_end = RExC_parse + 2;
+ }
+ else {
+ tmpbuf[0] = ender;
+ RExC_end = RExC_parse + 1;
+ }
+
+ ret = regclass(pRExC_state,depth+1);
+
+ /* Here, have parsed the buffer. Reset the parse to
+ * the actual input, and return */
+ RExC_end = oldregxend;
+ RExC_parse = p - 1;
+
+ Set_Node_Offset(ret, RExC_parse);
+ Set_Node_Cur_Length(ret);
+ nextchar(pRExC_state);
+ *flagp |= HASWIDTH|SIMPLE;
+ return ret;
+ }
}
+
if ( RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
if (UTF && FOLD) {
/* Prime the casefolded buffer. */
- ender = toFOLD_uni(ender, tmpbuf, &foldlen);
+ if (isASCII(ender)) {
+ ender = toLOWER(ender);
+ *tmpbuf = ender;
+ foldlen = 1;
+ }
+ else if (! MORE_ASCII_RESTRICTED) {
+ ender = toFOLD_uni(ender, tmpbuf, &foldlen);
+ }
+ else {
+ /* When not to mix ASCII with non-, reject folds that
+ * mix them, using only the non-folded code point. So
+ * do the fold to a temporary, and inspect each
+ * character in it. */
+ U8 trialbuf[UTF8_MAXBYTES_CASE+1];
+ U8* s = trialbuf;
+ UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
+ U8* e = s + foldlen;
+ bool fold_ok = TRUE;
+
+ while (s < e) {
+ if (isASCII(*s)) {
+ fold_ok = FALSE;
+ break;
+ }
+ s += UTF8SKIP(s);
+ }
+ if (fold_ok) {
+ Copy(trialbuf, tmpbuf, foldlen, U8);
+ ender = tmpender;
+ }
+ else {
+ uvuni_to_utf8(tmpbuf, ender);
+ foldlen = UNISKIP(ender);
+ }
+ }
}
if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
if (len)
else
REGC((char)ender, s++);
}
- loopdone:
+ loopdone: /* Jumped to when encounters something that shouldn't be in
+ the node */
RExC_parse = p - 1;
Set_Node_Cur_Length(ret); /* MJD */
nextchar(pRExC_state);
}
return(ret);
+
+/* Jumped to when an unrecognized character set is encountered */
+bad_charset:
+ Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
+ return(NULL);
}
STATIC char *
}
}
-/* No locale test */
-#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
-ANYOF_##NAME: \
- for (value = 0; value < 256; value++) \
- if (TEST) \
- stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
- yesno = '+'; \
- what = WORD; \
- break; \
-case ANYOF_N##NAME: \
- for (value = 0; value < 256; value++) \
- if (!TEST) \
- stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
- yesno = '!'; \
- what = WORD; \
+/* No locale test, and always Unicode semantics. Expands to the bodies of two
+ * switch cases: the one labelled ANYOF_<NAME> calls set_regclass_bit() on
+ * every 'value' from 0 through 255 for which TEST is true; the one labelled
+ * ANYOF_N<NAME> does so for every such 'value' for which TEST is false. Each
+ * also sets 'yesno' ('+' or '!' respectively) and sets 'what' to WORD. The
+ * expansion starts with a bare label and ends without a semicolon, so it is
+ * presumably meant to be written as "case _C_C_T_NOLOC_(...);" */
+#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
+ANYOF_##NAME: \
+ for (value = 0; value < 256; value++) \
+ if (TEST) \
+ stored += set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap); \
+ yesno = '+'; \
+ what = WORD; \
+ break; \
+case ANYOF_N##NAME: \
+ for (value = 0; value < 256; value++) \
+ if (!TEST) \
+ stored += set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap); \
+ yesno = '!'; \
+ what = WORD; \
+ break
/* Like the above, but there are differences if we are in uni-8-bit or not, so
* there are two tests passed in, to use depending on that. There aren't any
* cases where the label is different from the name, so no need for that
* parameter */
-#define _C_C_T_(NAME,TEST_8,TEST_7,WORD) \
-ANYOF_##NAME: \
- if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
- else if (UNI_SEMANTICS) { \
- for (value = 0; value < 256; value++) { \
- if (TEST_8) stored += \
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
- } \
- } \
- else { \
- for (value = 0; value < 256; value++) { \
- if (TEST_7) stored += \
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
- } \
- } \
- yesno = '+'; \
- what = WORD; \
- break; \
-case ANYOF_N##NAME: \
- if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
- else if (UNI_SEMANTICS) { \
- for (value = 0; value < 256; value++) { \
- if (! TEST_8) stored += \
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
- } \
- } \
- else { \
- for (value = 0; value < 256; value++) { \
- if (! TEST_7) stored += \
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
- } \
- } \
- yesno = '!'; \
- what = WORD; \
+#define _C_C_T_(NAME, TEST_8, TEST_7, WORD) \
+ANYOF_##NAME: \
+ if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
+ else if (UNI_SEMANTICS) { \
+ for (value = 0; value < 256; value++) { \
+ if (TEST_8(value)) stored += \
+ set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap); \
+ } \
+ } \
+ else { \
+ for (value = 0; value < 128; value++) { \
+ if (TEST_7(UNI_TO_NATIVE(value))) stored += \
+ set_regclass_bit(pRExC_state, ret, \
+ (U8) UNI_TO_NATIVE(value), &nonbitmap); \
+ } \
+ } \
+ yesno = '+'; \
+ what = WORD; \
+ break; \
+case ANYOF_N##NAME: \
+ if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
+ else if (UNI_SEMANTICS) { \
+ for (value = 0; value < 256; value++) { \
+ if (! TEST_8(value)) stored += \
+ set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap); \
+ } \
+ } \
+ else { \
+ for (value = 0; value < 128; value++) { \
+ if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
+ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \
+ } \
+ if (AT_LEAST_ASCII_RESTRICTED) { \
+ for (value = 128; value < 256; value++) { \
+ stored += set_regclass_bit( \
+ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \
+ } \
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8; \
+ } \
+ else { \
+ /* For a non-utf8 target string with DEPENDS semantics, all above \
+ * ASCII Latin1 code points match the complement of any of the \
+ * classes. But in utf8, they have their Unicode semantics, so \
+ * can't just set them in the bitmap, or else regexec.c will think \
+ * they matched when they shouldn't. */ \
+ ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8; \
+ } \
+ } \
+ yesno = '!'; \
+ what = WORD; \
+ break
/*
#endif
STATIC U8
-S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
+S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
{
/* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
U8 stored = 0;
U8 fold;
- fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
- : PL_fold[value];
+ PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
+
+ fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
+ : PL_fold[value];
/* It assumes the bit for 'value' has already been set */
if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
ANYOF_BITMAP_SET(node, fold);
stored++;
}
-
- /* The fold of the German sharp s is two ASCII characters, so isn't in the
- * bitmap and doesn't have to be in utf8, but we only process it if unicode
- * semantics are called for */
- if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
- ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
- }
- else if (_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C(value)
- || (! UNI_SEMANTICS
- && ! isASCII(value)
- && PL_fold_latin1[value] != value))
+ if ((_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED))
+ || (! UNI_SEMANTICS
+ && ! isASCII(value)
+ && PL_fold_latin1[value] != value))
{ /* A character that has a fold outside of Latin1 matches outside the
bitmap, but only when the target string is utf8. Similarly when we
don't have unicode semantics for the above ASCII Latin-1 characters,
and they have a fold, they should match if the target is utf8, and
not otherwise */
+ if (! *nonbitmap_ptr) {
+ *nonbitmap_ptr = _new_invlist(2);
+ }
+ *nonbitmap_ptr = add_range_to_invlist(*nonbitmap_ptr, value, value);
ANYOF_FLAGS(node) |= ANYOF_UTF8;
}
PERL_STATIC_INLINE U8
-S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U32 value)
+S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
{
/* This inline function sets a bit in the bitmap if not already set, and if
* appropriate, its fold, returning the number of bits that actually
U8 stored;
+ PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
+
if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
return 0;
}
stored = 1;
if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
- stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
+ stored += set_regclass_bit_fold(pRExC_state, node, value, nonbitmap_ptr);
}
return stored;
/*
parse a class specification and produce either an ANYOF node that
- matches the pattern or if the pattern matches a single char only and
- that char is < 256 and we are case insensitive then we produce an
- EXACT node instead.
-*/
+ matches the pattern or perhaps will be optimized into an EXACTish node
+ instead. */
STATIC regnode *
S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
bool need_class = 0;
SV *listsv = NULL;
UV n;
- bool optimize_invert = TRUE;
+ HV* nonbitmap = NULL;
AV* unicode_alternate = NULL;
#ifdef EBCDIC
UV literal_endpoint = 0;
#endif
- UV stored = 0; /* 0, 1, or more than 1 chars stored in the class */
+ UV stored = 0; /* how many chars stored in the bitmap */
regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
case we need to change the emitted regop to an EXACT. */
/* Assume we are going to generate an ANYOF node. */
ret = reganode(pRExC_state, ANYOF, 0);
- if (!SIZE_ONLY)
+
+ if (!SIZE_ONLY) {
ANYOF_FLAGS(ret) = 0;
+ }
if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
RExC_naughty++;
if (SIZE_ONLY) {
RExC_size += ANYOF_SKIP;
+#ifdef ANYOF_ADD_LOC_SKIP
+ if (LOC) {
+ RExC_size += ANYOF_ADD_LOC_SKIP;
+ }
+#endif
listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
}
else {
RExC_emit += ANYOF_SKIP;
- if (FOLD)
- ANYOF_FLAGS(ret) |= ANYOF_FOLD;
- if (LOC)
+ if (LOC) {
ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
+#ifdef ANYOF_ADD_LOC_SKIP
+ RExC_emit += ANYOF_ADD_LOC_SKIP;
+#endif
+ }
ANYOF_BITMAP_ZERO(ret);
listsv = newSVpvs("# comment\n");
}
e = RExC_parse;
n = 1;
}
- if (!SIZE_ONLY) {
+ if (SIZE_ONLY) {
+ if (LOC) {
+ ckWARN2reg(RExC_parse,
+ "\\%c uses Unicode rules, not locale rules",
+ (int) value);
+ }
+ }
+ else {
if (UCHARAT(RExC_parse) == '^') {
RExC_parse++;
n--;
n--;
}
}
- Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
- (value=='p' ? '+' : '!'), (int)n, RExC_parse);
+
+ /* Add the property name to the list. If /i matching, give
+ * a different name which consists of the normal name
+ * sandwiched between two underscores and '_i'. The design
+ * is discussed in the commit message for this. */
+ Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
+ (value=='p' ? '+' : '!'),
+ (FOLD) ? "__" : "",
+ (int)n,
+ RExC_parse,
+ (FOLD) ? "_i" : ""
+ );
}
RExC_parse = e + 1;
* something that isn't utf8 */
ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
namedclass = ANYOF_MAX; /* no official name, but it's named */
+
+ /* \p means they want Unicode semantics */
+ RExC_uni_semantics = 1;
}
break;
case 'n': value = '\n'; break;
goto recode_encoding;
break;
case 'c':
- value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
+ value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7':
if (LOC && namedclass < ANYOF_MAX && ! need_class) {
need_class = 1;
if (SIZE_ONLY) {
+#ifdef ANYOF_CLASS_ADD_SKIP
RExC_size += ANYOF_CLASS_ADD_SKIP;
+#endif
}
else {
+#ifdef ANYOF_CLASS_ADD_SKIP
RExC_emit += ANYOF_CLASS_ADD_SKIP;
+#endif
ANYOF_CLASS_ZERO(ret);
}
ANYOF_FLAGS(ret) |= ANYOF_CLASS;
}
- /* a bad range like a-\d, a-[:digit:] ? */
+ /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
+ * literal */
if (range) {
if (!SIZE_ONLY) {
const int w =
if (prevvalue < 256) {
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, prevvalue);
+ set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &nonbitmap);
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
+ set_regclass_bit(pRExC_state, ret, '-', &nonbitmap);
}
else {
ANYOF_FLAGS(ret) |= ANYOF_UTF8;
Perl_sv_catpvf(aTHX_ listsv,
- "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
+ "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
}
}
const char *what = NULL;
char yesno = 0;
- if (namedclass > OOB_NAMEDCLASS)
- optimize_invert = FALSE;
/* Possible truncation here but in some 64-bit environments
* the compiler gets heartburn about switch on 64-bit values.
* A similar issue a little earlier when switching on value.
* --jhi */
switch ((I32)namedclass) {
- case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
- case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
- case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
- case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
- case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
- case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
- case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
- case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
- case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
- case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
+ case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
+ case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
+ case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
+ case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
+ case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
+ case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
+ case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
+ case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
+ case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
+ case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
/* \s, \w match all unicode if utf8. */
- case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
- case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
+ case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
+ case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
#else
/* \s, \w match ascii and locale only */
- case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
- case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
+ case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "PerlSpace");
+ case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "PerlWord");
#endif
- case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
+ case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
case ANYOF_ASCII:
if (LOC)
ANYOF_CLASS_SET(ret, ANYOF_ASCII);
else {
-#ifndef EBCDIC
for (value = 0; value < 128; value++)
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
-#else /* EBCDIC */
- for (value = 0; value < 256; value++) {
- if (isASCII(value))
- stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
- }
-#endif /* EBCDIC */
+ set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
}
yesno = '+';
- what = "ASCII";
+ what = NULL; /* Doesn't match outside ascii, so
+ don't want to add +utf8:: */
break;
case ANYOF_NASCII:
if (LOC)
ANYOF_CLASS_SET(ret, ANYOF_NASCII);
else {
-#ifndef EBCDIC
for (value = 128; value < 256; value++)
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
-#else /* EBCDIC */
- for (value = 0; value < 256; value++) {
- if (!isASCII(value))
- stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
- }
-#endif /* EBCDIC */
+ set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
}
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
yesno = '!';
what = "ASCII";
break;
/* consecutive digits assumed */
for (value = '0'; value <= '9'; value++)
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+ set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);
}
yesno = '+';
what = POSIX_CC_UNI_NAME("Digit");
/* consecutive digits assumed */
for (value = 0; value < '0'; value++)
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+ set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);
for (value = '9' + 1; value < 256; value++)
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+ set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);
}
yesno = '!';
what = POSIX_CC_UNI_NAME("Digit");
+ if (AT_LEAST_ASCII_RESTRICTED ) {
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
+ }
break;
case ANYOF_MAX:
/* this is to handle \p and \P */
vFAIL("Invalid [::] class");
break;
}
- if (what) {
+ if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
/* Strings such as "+utf8::isWord\n" */
Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
- }
- stored+=2; /* can't optimize this class */
-
- /* All but ASCII can match Unicode characters, but all the ones
- * that aren't in utf8 are in the bitmap */
- if (namedclass != ANYOF_ASCII) {
ANYOF_FLAGS(ret) |= ANYOF_UTF8;
}
+
continue;
}
} /* end of namedclass \blah */
}
if (!SIZE_ONLY)
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
+ set_regclass_bit(pRExC_state, ret, '-', &nonbitmap);
} else
range = 1; /* yeah, it's a range! */
continue; /* but do it the next time */
}
}
+ /* non-Latin1 code point implies unicode semantics. Must be set in
+ * pass1 so is there for the whole of pass 2 */
+ if (value > 255) {
+ RExC_uni_semantics = 1;
+ }
+
/* now is the next time */
- /*stored += (value - prevvalue + 1);*/
if (!SIZE_ONLY) {
if (prevvalue < 256) {
const IV ceilvalue = value < 256 ? value : 255;
for (i = prevvalue; i <= ceilvalue; i++)
if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
+ set_regclass_bit(pRExC_state, ret, (U8) i, &nonbitmap);
}
} else {
for (i = prevvalue; i <= ceilvalue; i++)
if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
+ set_regclass_bit(pRExC_state, ret, (U8) i, &nonbitmap);
}
}
}
else
#endif
for (i = prevvalue; i <= ceilvalue; i++) {
- if (!ANYOF_BITMAP_TEST(ret,i)) {
- stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
- }
+ stored += set_regclass_bit(pRExC_state, ret, (U8) i, &nonbitmap);
}
}
- if (value > 255 || UTF) {
- const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
- const UV natvalue = NATIVE_TO_UNI(value);
- stored+=2; /* can't optimize this class */
+ if (value > 255) {
+ const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
+ const UV natvalue = NATIVE_TO_UNI(value);
+ if (! nonbitmap) {
+ nonbitmap = _new_invlist(2);
+ }
+ nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
+ ANYOF_FLAGS(ret) |= ANYOF_UTF8;
+ }
+#if 0
/* If the code point requires utf8 to represent, and we are not
* folding, it can't match unless the target is in utf8. Only
ANYOF_FLAGS(ret) |= (FOLD || value < 256)
? ANYOF_NONBITMAP
: ANYOF_UTF8;
- if (prevnatvalue < natvalue) { /* what about > ? */
+ if (prevnatvalue < natvalue) { /* '>' case is fatal error above */
+
+ /* The \t sets the whole range */
Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
prevnatvalue, natvalue);
+
+ /* Currently, we don't look at every value in the range.
+ * Therefore we have to assume the worst case: that if
+ * folding, it will match more than one character. But in
+ * lookbehind patterns, can only be single character
+ * length, so disallow those folds */
+ if (FOLD && ! RExC_in_lookbehind) {
+ OP(ret) = ANYOFV;
+ }
}
else if (prevnatvalue == natvalue) {
Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
#endif
Perl_sv_catpvf(aTHX_ listsv,
"%04"UVxf"\n", f);
- else {
+ else if (! RExC_in_lookbehind) {
/* Any multicharacter foldings
+ * (disallowed in lookbehind patterns)
* require the following transform:
* [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
* where E folds into "pq" and F folds
sv = newSVpvn_utf8((char*)foldbuf, foldlen,
TRUE);
av_push(unicode_alternate, sv);
+ OP(ret) = ANYOFV;
}
}
}
}
}
+#endif
#ifdef EBCDIC
literal_endpoint = 0;
#endif
return ret;
/****** !SIZE_ONLY AFTER HERE *********/
- if( stored == 1 && (value < 128 || (value < 256 && !UTF))
- && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
- ) {
- /* optimize single char class to an EXACT node but *only* when its not
- * a UTF/high char. Note that the information needed to decide to do
- * this optimization is not currently available until the 2nd pass, and
- * that the actually used EXACT node takes less space than the
- * calculated ANYOF node, and hence the amount of space calculated in
- * the first pass is larger than actually used. Currently we don't
- * keep track of enough information to do this for nodes which contain
- * matches outside the bitmap */
+ /* Finish up the non-bitmap entries */
+ if (nonbitmap) {
+ UV* nonbitmap_array;
+ UV i;
+
+ /* If folding, we add to the list all characters that could fold to or
+ * from the ones already on the list */
+ if (FOLD) {
+ HV* fold_intersection;
+ UV* fold_list;
+
+ /* This is a list of all the characters that participate in folds
+ * (except marks, etc. in multi-char folds) */
+ if (! PL_utf8_foldable) {
+ SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
+ PL_utf8_foldable = _swash_to_invlist(swash);
+ }
+
+ /* This is a hash that for a particular fold gives all characters
+ * that are involved in it */
+ if (! PL_utf8_foldclosures) {
+
+ /* If we were unable to find any folds, then we likely won't be
+ * able to find the closures. So just create an empty list.
+ * Folding will effectively be restricted to the non-Unicode
+ * rules hard-coded into Perl. (This case happens legitimately
+ * during compilation of Perl itself before the Unicode tables
+ * are generated) */
+ if (invlist_len(PL_utf8_foldable) == 0) {
+ PL_utf8_foldclosures = _new_invlist(0);
+ } else {
+ /* If the folds haven't been read in, call a fold function
+ * to force that */
+ if (! PL_utf8_tofold) {
+ U8 dummy[UTF8_MAXBYTES+1];
+ STRLEN dummy_len;
+ to_utf8_fold((U8*) "A", dummy, &dummy_len);
+ }
+ PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
+ }
+ }
+
+ /* Only the characters in this class that participate in folds need
+ * be checked. Get the intersection of this class and all the
+ * possible characters that are foldable. This can quickly narrow
+ * down a large class */
+ fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
+
+ /* Now look at the foldable characters in this class individually */
+ fold_list = invlist_array(fold_intersection);
+ for (i = 0; i < invlist_len(fold_intersection); i++) {
+ UV j;
+
+ /* The next entry is the beginning of the range that is in the
+ * class */
+ UV start = fold_list[i++];
+
+
+ /* The next entry is the beginning of the next range, which
+ * isn't in the class, so the end of the current range is one
+ * less than that */
+ UV end = fold_list[i] - 1;
+
+ /* Look at every character in the range */
+ for (j = start; j <= end; j++) {
+
+ /* Get its fold */
+ U8 foldbuf[UTF8_MAXBYTES_CASE+1];
+ STRLEN foldlen;
+ const UV f = to_uni_fold(j, foldbuf, &foldlen);
+
+ if (foldlen > (STRLEN)UNISKIP(f)) {
+
+ /* Any multicharacter foldings (disallowed in
+ * lookbehind patterns) require the following
+ * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
+ * E folds into "pq" and F folds into "rst", all other
+ * characters fold to single characters. We save away
+ * these multicharacter foldings, to be later saved as
+ * part of the additional "s" data. */
+ if (! RExC_in_lookbehind) {
+ SV *sv;
+ U8* loc = foldbuf;
+ U8* e = foldbuf + foldlen;
+
+ /* If any of the folded characters of this are in
+ * the Latin1 range, tell the regex engine that
+ * this can match a non-utf8 target string. The
+ * multi-byte fold whose source is in the
+ * Latin1 range (U+00DF) applies only when the
+ * target string is utf8, or under unicode rules */
+ if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
+ while (loc < e) {
+ if (MORE_ASCII_RESTRICTED && (isASCII(*loc) != isASCII(j))) {
+ goto end_multi_fold;
+ }
+ /* XXX Discard this fold if any are latin1
+ * and LOC */
+ if (UTF8_IS_INVARIANT(*loc)
+ || UTF8_IS_DOWNGRADEABLE_START(*loc))
+ {
+ ANYOF_FLAGS(ret)
+ |= ANYOF_NONBITMAP_NON_UTF8;
+ break;
+ }
+ loc += UTF8SKIP(loc);
+ }
+ }
+ ANYOF_FLAGS(ret) |= ANYOF_UTF8;
+
+ if (!unicode_alternate) {
+ unicode_alternate = newAV();
+ }
+ sv = newSVpvn_utf8((char*)foldbuf, foldlen, TRUE);
+ av_push(unicode_alternate, sv);
+
+ /* This node is variable length */
+ OP(ret) = ANYOFV;
+ end_multi_fold: ;
+ }
+ }
+ else { /* Single character fold */
+ SV** listp;
+
+ /* Consider "k" =~ /[K]/i. The line above would have
+ * just folded the 'k' to itself, and that isn't going
+ * to match 'K'. So we look through the closure of
+ * everything that folds to 'k'. That will find the
+ * 'K'. Initialize the list, if necessary */
+
+ /* The data structure is a hash with the keys every
+ * character that is folded to, like 'k', and the
+ * values each an array of everything that folds to its
+ * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
+ if ((listp = hv_fetch(PL_utf8_foldclosures,
+ (char *) foldbuf, foldlen, FALSE)))
+ {
+ AV* list = (AV*) *listp;
+ IV k;
+ for (k = 0; k <= av_len(list); k++) {
+ SV** c_p = av_fetch(list, k, FALSE);
+ UV c;
+ if (c_p == NULL) {
+ Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+ }
+ c = SvUV(*c_p);
+ if (MORE_ASCII_RESTRICTED && (isASCII(c) != isASCII(j))) {
+ continue;
+ }
+
+ if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
+ stored += set_regclass_bit(pRExC_state, ret, (U8) c, &nonbitmap);
+ }
+ /* It may be that the code point is already
+ * in this range or already in the bitmap,
+ * in which case we need do nothing.
+ * XXX Think about LOC */
+ else if ((c < start || c > end)
+ && (c > 255
+ || ! ANYOF_BITMAP_TEST(ret, c)))
+ {
+ nonbitmap = add_range_to_invlist(nonbitmap, c, c);
+ }
+ }
+ }
+ }
+ }
+ }
+ invlist_destroy(fold_intersection);
+ } /* End of processing all the folds */
+
+ /* Here have the full list of items to match that aren't in the
+ * bitmap. Convert to the structure that the rest of the code is
+ * expecting. XXX That rest of the code should convert to this
+ * structure */
+ nonbitmap_array = invlist_array(nonbitmap);
+ for (i = 0; i < invlist_len(nonbitmap); i++) {
+
+ /* The next entry is the beginning of the range that is in the
+ * class */
+ UV start = nonbitmap_array[i++];
+
+ /* The next entry is the beginning of the next range, which isn't
+ * in the class, so the end of the current range is one less than
+ * that */
+ UV end = nonbitmap_array[i] - 1;
+
+ if (start == end) {
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
+ }
+ else {
+ /* The \t sets the whole range */
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
+ /* XXX EBCDIC */
+ start, end);
+ }
+ }
+ invlist_destroy(nonbitmap);
+ }
+
+ /* Here, we have calculated what code points should be in the character
+ * class. Now we can see about various optimizations. Fold calculation
+ * needs to take place before inversion. Otherwise /[^k]/i would invert to
+ * include K, which under /i would match k. */
+
+ /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
+ * set the FOLD flag yet, so this does optimize those. It doesn't
+ * optimize locale. Doing so perhaps could be done as long as there is
+ * nothing like \w in it; some thought also would have to be given to the
+ * interaction with above 0x100 chars */
+ if (! LOC && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
+ for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
+ ANYOF_BITMAP(ret)[value] ^= 0xFF;
+ stored = 256 - stored;
+
+ /* The inversion means that everything above 255 is matched; and at the
+ * same time we clear the invert flag */
+ ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
+ }
+
+ /* Folding in the bitmap is taken care of above, but not for locale (for
+ * which we have to wait to see what folding is in effect at runtime), and
+ * for things not in the bitmap. Set run-time fold flag for these */
+ if (FOLD && (LOC || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP))) {
+ ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
+ }
+
+ /* A single character class can be "optimized" into an EXACTish node.
+ * Note that since we don't currently count how many characters there are
+ * outside the bitmap, we are XXX missing optimization possibilities for
+ * them. This optimization can't happen unless this is a truly single
+ * character class, which means that it can't be an inversion into a
+ * many-character class, and there must be no possibility of there being
+ * things outside the bitmap. 'stored' (only) for locales doesn't include
+ * \w, etc, so have to make a special test that they aren't present
+ *
+ * Similarly, a 2-character class of the very special form like [bB] can be
+ * optimized into an EXACTFish node, but only for non-locales, and for
+ * characters which only have the two folds; so things like 'fF' and 'Ii'
+ * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
+ * FI'. */
+ if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
+ && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
+ || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
+ || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
+ && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
+ /* If the latest code point has a fold whose
+ * bit is set, it must be the only other one */
+ && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
+ && ANYOF_BITMAP_TEST(ret, prevvalue)))))
+ {
+ /* Note that the information needed to decide to do this optimization
+ * is not currently available until the 2nd pass, and that the actually
+ * used EXACTish node takes less space than the calculated ANYOF node,
+ * and hence the amount of space calculated in the first pass is larger
+ * than actually used, so this optimization doesn't gain us any space.
+ * But an EXACT node is faster than an ANYOF node, and can be combined
+ * with any adjacent EXACT nodes later by the optimizer for further
+ * gains. The speed of executing an EXACTF is similar to an ANYOF
+ * node, so the optimization advantage comes from the ability to join
+ * it to adjacent EXACT nodes */
+
const char * cur_parse= RExC_parse;
+ U8 op;
RExC_emit = (regnode *)orig_emit;
RExC_parse = (char *)orig_parse;
- ret = reg_node(pRExC_state,
- (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
- RExC_parse = (char *)cur_parse;
- *STRING(ret)= (char)value;
- STR_LEN(ret)= 1;
- RExC_emit += STR_SZ(1);
- SvREFCNT_dec(listsv);
- return ret;
- }
- /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
- if ( /* If the only flag is folding (plus possibly inversion). */
- ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
- ) {
- for (value = 0; value < 256; ++value) {
- if (ANYOF_BITMAP_TEST(ret, value)) {
- UV fold = PL_fold[value];
- if (fold != value)
- ANYOF_BITMAP_SET(ret, fold);
+ if (stored == 1) {
+
+ /* A locale node with one point can be folded; all the other cases
+ * with folding will have two points, since we calculate them above
+ */
+ if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
+ op = EXACTFL;
+ }
+ else {
+ op = EXACT;
}
+ } /* else 2 chars in the bit map: the folds of each other */
+ else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
+
+ /* To join adjacent nodes, they must be the exact EXACTish type.
+ * Try to use the most likely type, by using EXACTFU if the regex
+ * calls for them, or is required because the character is
+ * non-ASCII */
+ op = EXACTFU;
+ }
+ else { /* Otherwise, more likely to be EXACTF type */
+ op = EXACTF;
}
- ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
- }
- /* optimize inverted simple patterns (e.g. [^a-z]) */
- if (optimize_invert &&
- /* If the only flag is inversion. */
- (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
- for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
- ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
- ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
+ ret = reg_node(pRExC_state, op);
+ RExC_parse = (char *)cur_parse;
+ if (UTF && ! NATIVE_IS_INVARIANT(value)) {
+ *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
+ *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
+ STR_LEN(ret)= 2;
+ RExC_emit += STR_SZ(2);
+ }
+ else {
+ *STRING(ret)= (char)value;
+ STR_LEN(ret)= 1;
+ RExC_emit += STR_SZ(1);
+ }
+ SvREFCNT_dec(listsv);
+ return ret;
}
+
{
AV * const av = newAV();
SV *rv;
- Look for optimizable sequences at the same time.
- currently only looks for EXACT chains.
-This is expermental code. The idea is to use this routine to perform
+This is experimental code. The idea is to use this routine to perform
in place optimizations on branches and groups as they are constructed,
with the long term intention of removing optimization from study_chunk so
that it is purely analytical.
switch (OP(scan)) {
case EXACT:
case EXACTF:
+ case EXACTFA:
+ case EXACTFU:
case EXACTFL:
if( exact == PSEUDO )
exact= OP(scan);
{
int bit;
int set=0;
+ regex_charset cs;
for (bit=0; bit<32; bit++) {
if (flags & (1<<bit)) {
+ if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
+ continue;
+ }
if (!set++ && lead)
PerlIO_printf(Perl_debug_log, "%s",lead);
PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
}
}
+ if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
+ if (!set++ && lead) {
+ PerlIO_printf(Perl_debug_log, "%s",lead);
+ }
+ switch (cs) {
+ case REGEX_UNICODE_CHARSET:
+ PerlIO_printf(Perl_debug_log, "UNICODE");
+ break;
+ case REGEX_LOCALE_CHARSET:
+ PerlIO_printf(Perl_debug_log, "LOCALE");
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
+ break;
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+ PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
+ break;
+ default:
+ PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
+ break;
+ }
+ }
if (lead) {
if (set)
PerlIO_printf(Perl_debug_log, "\n");
* --jhi */
pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
PERL_PV_ESCAPE_UNI_DETECT |
+ PERL_PV_ESCAPE_NONASCII |
PERL_PV_PRETTY_ELLIPSES |
PERL_PV_PRETTY_LTGT |
PERL_PV_PRETTY_NOCLEAR
else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
if ( RXp_PAREN_NAMES(prog) ) {
- if ( k != REF || OP(o) < NREF) {
+ if ( k != REF || (OP(o) < NREF)) {
AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
SV **name= av_fetch(list, ARG(o), 0 );
if (name)
if (flags & ANYOF_LOCALE)
sv_catpvs(sv, "{loc}");
- if (flags & ANYOF_FOLD)
+ if (flags & ANYOF_LOC_NONBITMAP_FOLD)
sv_catpvs(sv, "{i}");
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
if (flags & ANYOF_INVERT)
}
EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
- /* output any special charclass tests (used mostly under use locale) */
- if (o->flags & ANYOF_CLASS && ANYOF_CLASS_TEST_ANY_SET(o))
+ /* output any special charclass tests (used entirely under use locale) */
+ if (ANYOF_CLASS_TEST_ANY_SET(o))
for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
if (ANYOF_CLASS_TEST(o,i)) {
sv_catpv(sv, anyofs[i]);
EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
+ if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
+ sv_catpvs(sv, "{non-utf8-latin1-all}");
+ }
+
/* output information about the unicode matching */
if (flags & ANYOF_UNICODE_ALL)
sv_catpvs(sv, "{unicode_all}");
else if (flags & ANYOF_UTF8)
sv_catpvs(sv, "{unicode}");
- else if (flags & ANYOF_NONBITMAP)
+ if (flags & ANYOF_NONBITMAP_NON_UTF8)
sv_catpvs(sv, "{outside bitmap}");
{
The solution is to make a lightweight copy of the regexp structure
when a qr// is returned from the code executed by (??{$qr}) this
- lightweight copy doesnt actually own any of its data except for
+ lightweight copy doesn't actually own any of its data except for
the starp/end and the actual regexp structure itself.
*/
ones (binary 1111 1111, hexadecimal FF). It is similar, but not
identical, to the ASCII delete (DEL) or rubout control character.
) So the old condition can be simplified to !isPRINT(c) */
- if (!isPRINT(c))
- Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
+ if (!isPRINT(c)) {
+ if (c < 256) {
+ Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
+ }
+ else {
+ Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
+ }
+ }
else {
const char string = c;
if (c == '-' || c == ']' || c == '\\' || c == '^')
else if ( op == PLUS || op == STAR) {
DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
}
- else if (op == ANYOF) {
+ else if (PL_regkind[(U8)op] == ANYOF) {
/* arglen 1 + class block */
node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
? ANYOF_CLASS_SKIP : ANYOF_SKIP);