* Flags to be passed up and down.
*/
#define WORST 0 /* Worst case. */
-#define HASWIDTH 0x01 /* Known to match non-null strings. */
+#define HASWIDTH 0x01 /* Known to not match null strings, could match
+ non-null ones. */
/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
* character. (There needs to be a case: in the switch statement in regexec.c
#endif
switch (flags) {
- case EXACT: case EXACTL: break;
+ case EXACT: case EXACT_ONLY8: case EXACTL: break;
case EXACTFAA:
case EXACTFU_SS:
case EXACTFU:
trie->wordcount = word_count;
RExC_rxi->data->data[ data_slot ] = (void*)trie;
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
- if (flags == EXACT || flags == EXACTL)
+ if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
trie->wordcount+1, sizeof(reg_trie_wordinfo));
noper= noper_next;
}
- if ( noper < tail &&
- (
- OP(noper) == flags ||
- (
- flags == EXACTFU &&
- OP(noper) == EXACTFU_SS
- )
- )
- ) {
+ if ( noper < tail
+ && ( OP(noper) == flags
+ || (flags == EXACT && OP(noper) == EXACT_ONLY8)
+ || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
+ || OP(noper) == EXACTFU_SS))) )
+ {
uc= (U8*)STRING(noper);
e= uc + STR_LEN(noper);
} else {
noper= noper_next;
}
- if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
+ if ( noper < tail
+ && ( OP(noper) == flags
+ || (flags == EXACT && OP(noper) == EXACT_ONLY8)
+ || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
+ || OP(noper) == EXACTFU_SS))) )
+ {
const U8 *uc= (U8*)STRING(noper);
const U8 *e= uc + STR_LEN(noper);
noper= noper_next;
}
- if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
+ if ( noper < tail
+ && ( OP(noper) == flags
+ || (flags == EXACT && OP(noper) == EXACT_ONLY8)
+ || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
+ || OP(noper) == EXACTFU_SS))) )
+ {
const U8 *uc= (U8*)STRING(noper);
const U8 *e= uc + STR_LEN(noper);
* this final joining, sequences could have been split over boundaries, and
* hence missed). The sequences only happen in folding, hence for any
* non-EXACT EXACTish node */
- if (OP(scan) != EXACT && OP(scan) != EXACTL) {
+ if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
U8* s0 = (U8*) STRING(scan);
U8* s = s0;
U8* s_end = s0 + STR_LEN(scan);
----------------+-----------
NOTHING | NOTHING
EXACT | EXACT
+ EXACT_ONLY8 | EXACT
EXACTFU | EXACTFU
+ EXACTFU_ONLY8 | EXACTFU
EXACTFU_SS | EXACTFU
- EXACTFAA | EXACTFAA
+ EXACTFAA | EXACTFAA
EXACTL | EXACTL
EXACTFLU8 | EXACTFLU8
*/
#define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
? NOTHING \
- : ( EXACT == (X) ) \
+ : ( EXACT == (X) || EXACT_ONLY8 == (X) ) \
? EXACT \
- : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
+ : ( EXACTFU == (X) \
+ || EXACTFU_ONLY8 == (X) \
+ || EXACTFU_SS == (X) ) \
? EXACTFU \
- : ( EXACTFAA == (X) ) \
- ? EXACTFAA \
+ : ( EXACTFAA == (X) ) \
+ ? EXACTFAA \
: ( EXACTL == (X) ) \
? EXACTL \
- : ( EXACTFLU8 == (X) ) \
- ? EXACTFLU8 \
+ : ( EXACTFLU8 == (X) ) \
+ ? EXACTFLU8 \
: 0 )
/* don't use tail as the end marker for this traverse */
continue;
}
}
- else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
+ else if ( OP(scan) == EXACT
+ || OP(scan) == EXACT_ONLY8
+ || OP(scan) == EXACTL)
+ {
SSize_t l = STR_LEN(scan);
UV uc;
assert(l);
case PLUS:
if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
next = NEXTOPER(scan);
- if (OP(next) == EXACT
+ if ( OP(next) == EXACT
+ || OP(next) == EXACT_ONLY8
|| OP(next) == EXACTL
|| (flags & SCF_DO_STCLASS))
{
&& n < pRExC_state->code_blocks->count
&& s == pRExC_state->code_blocks->cb[n].start)
{
- /* blank out literal code block */
- assert(pat[s] == '(');
- while (s <= pRExC_state->code_blocks->cb[n].end) {
- *p++ = '_';
+ /* blank out literal code blocks so that they aren't
+ * recompiled: eg change from/to:
+ * /(?{xyz})/
+ * /(?=====)/
+ * and
+ * /(??{xyz})/
+ * /(?======)/
+ * and
+ * /(?(?{xyz}))/
+ * /(?(?=====))/
+ */
+ assert(pat[s] == '(');
+ assert(pat[s+1] == '?');
+ *p++ = '(';
+ *p++ = '?';
+ s += 2;
+ while (s < pRExC_state->code_blocks->cb[n].end) {
+ *p++ = '=';
s++;
}
- s--;
+ *p++ = ')';
n++;
continue;
}
goto redo_parse;
}
- /* In a stable state, as here, this must be true */
- assert(RExC_size = RExC_emit + 1);
-
/* Here, we have successfully parsed and generated the pattern's program
* for the regex engine. We are ready to finish things up and look for
* optimizations. */
DEBUG_PEEP("first:", first, 0, 0);
/* Ignore EXACT as we deal with it later. */
if (PL_regkind[OP(first)] == EXACT) {
- if (OP(first) == EXACT || OP(first) == EXACTL)
+ if ( OP(first) == EXACT
+ || OP(first) == EXACT_ONLY8
+ || OP(first) == EXACTL)
+ {
NOOP; /* Empty, get anchored substr later. */
+ }
else
RExC_rxi->regstclass = first;
}
&& nop == END)
RExC_rx->extflags |= RXf_WHITE;
else if ( RExC_rx->extflags & RXf_SPLIT
- && (fop == EXACT || fop == EXACTL)
+ && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
&& STR_LEN(first) == 1
&& *(STRING(first)) == ' '
&& nop == END )
* contain only above-Latin1 characters (hence must be in UTF8),
* which don't participate in folds with Latin1-range characters,
* as the latter's folds aren't known until runtime. */
- bool maybe_exactfu = TRUE;
+ bool maybe_exactfu = FOLD;
+
+ /* Does this node contain something that can't match unless the
+ * target string is (also) in UTF-8? */
+ bool requires_utf8_target = FALSE;
+
+ bool has_micro_sign = FALSE;
/* Allocate an EXACT node. The node_type may change below to
* another EXACTish node, but since the size of the node doesn't
}
p = RExC_parse;
RExC_parse = parse_start;
- if (ender > 0xff) {
- REQUIRE_UTF8(flagp);
- }
/* The \N{} means the pattern, if previously /d,
* becomes /u. That means it can't be an EXACTF node,
if (! maybe_exactfu) {
len = 0;
s = s0;
- maybe_exactfu = TRUE; /* Prob. unnecessary */
+ maybe_exactfu = FOLD; /* Prob. unnecessary */
goto reparse;
}
}
}
UPDATE_WARNINGS_LOC(p - 1);
ender = result;
- if (ender > 0xff) {
- REQUIRE_UTF8(flagp);
- }
break;
}
case 'x':
}
#endif
}
- else {
- REQUIRE_UTF8(flagp);
- }
break;
}
case 'c':
I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
STRLEN numlen = 3;
ender = grok_oct(p, &numlen, &flags, NULL);
- if (ender > 0xff) {
- REQUIRE_UTF8(flagp);
- }
p += numlen;
if ( isDIGIT(*p) /* like \08, \178 */
&& ckWARN(WARN_REGEXP)
/* Here, have looked at the literal character, and <ender>
* contains its ordinal; <p> points to the character after it.
- * We need to check if the next non-ignored thing is a
+ * */
+
+ if (ender > 255) {
+ REQUIRE_UTF8(flagp);
+ }
+
+ /* We need to check if the next non-ignored thing is a
* quantifier. Move <p> to after anything that should be
* ignored, which, as a side effect, positions <p> for the next
* loop iteration */
U8 * new_s = uvchr_to_utf8((U8*)s, ender);
added_len = (char *) new_s - s;
s = (char *) new_s;
+
+ if (ender > 255) {
+ requires_utf8_target = TRUE;
+ }
}
}
else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
/* Here, continuing a node with non-folded characters. Add
* this one */
-
- if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
- *(s++) = (char) ender;
- }
- else {
- s = (char *) uvchr_to_utf8((U8 *) s, ender);
- added_len = UVCHR_SKIP(ender);
- }
+ goto not_fold_common;
}
else { /* Here, does participate in some fold */
? FOLD_FLAGS_NOMIX_ASCII
: 0));
s += added_len;
+
+ if (ender > 255) {
+ requires_utf8_target = TRUE;
+ if (UNLIKELY(ender == GREEK_SMALL_LETTER_MU)) {
+ has_micro_sign = TRUE;
+ }
+ }
}
}
else {
}
#endif
+ else if (UNLIKELY(ender == MICRO_SIGN)) {
+ has_micro_sign = TRUE;
+ }
+
/* Even when folding, we store just the input
* character, as we have an array that finds its fold
* quickly */
OP(REGNODE_p(ret)) = node_type;
/* If the node type is EXACT here, check to see if it
- * should be EXACTL. */
+ * should be EXACTL, or EXACT_ONLY8. */
if (node_type == EXACT) {
if (LOC) {
OP(REGNODE_p(ret)) = EXACTL;
}
+ else if (requires_utf8_target) {
+ OP(REGNODE_p(ret)) = EXACT_ONLY8;
+ }
}
if (FOLD) {
else if (node_type == EXACTF) {
RExC_seen_d_op = TRUE;
}
+
+ /* The micro sign is the only character below 256 that
+ * folds to a character above 255 */
+ if ( OP(REGNODE_p(ret)) == EXACTFU
+ && requires_utf8_target
+ && LIKELY(! has_micro_sign))
+ {
+ OP(REGNODE_p(ret)) = EXACTFU_ONLY8;
+ }
+
}
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
if (UCHARAT(RExC_parse) != ')')
vFAIL("Expecting close paren for wrapper for nested extended charclass");
- RExC_parse++;
RExC_flags = save_flags;
goto handle_operand;
}
bool warn_super = ALWAYS_WARN_SUPER;
- const regnode_offset orig_emit = RExC_emit; /* Save the original RExC_emit in
- case we need to change the emitted regop to an EXACT. */
const char * orig_parse = RExC_parse;
bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
if (optimizable) {
int posix_class = -1; /* Illegal value */
- const char * cur_parse= RExC_parse;
U8 ANYOFM_mask = 0xFF;
U32 anode_arg = 0;
UV start, end;
}
if (op != END) {
- RExC_parse = (char *)orig_parse;
- RExC_emit = orig_emit;
-
if (regarglen[op]) {
ret = reganode(pRExC_state, op, anode_arg);
} else {
ret = reg_node(pRExC_state, op);
}
-
- RExC_parse = (char *)cur_parse;
+ Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
+ RExC_parse - orig_parse);;
if (PL_regkind[op] == EXACT) {
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
if ( exact ) {
switch (OP(REGNODE_p(scan))) {
case EXACT:
+ case EXACT_ONLY8:
case EXACTL:
case EXACTF:
case EXACTFAA_NO_TRIE:
case EXACTFAA:
case EXACTFU:
+ case EXACTFU_ONLY8:
case EXACTFLU8:
case EXACTFU_SS:
case EXACTFL: