scan_frame *frame_head;
scan_frame *frame_last;
U32 frame_count;
+ U32 strict;
#ifdef ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
#define RExC_frame_head (pRExC_state->frame_head)
#define RExC_frame_last (pRExC_state->frame_last)
#define RExC_frame_count (pRExC_state->frame_count)
+#define RExC_strict (pRExC_state->strict)
/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
* a flag to disable back-off on the fixed/floating substrings - if it's
a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
} STMT_END
-
-/* Allow for side effects in s */
-#define REGC(c,s) STMT_START { \
- if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
-} STMT_END
-
/* Macros for recording node offsets. 20001227 mjd@plover.com
* Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
* element 2*n-1 of the array. Element #2n holds the byte length node #n.
RExC_uni_semantics = 0;
RExC_contains_locale = 0;
RExC_contains_i = 0;
+ RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
pRExC_state->runtime_code_qr = NULL;
RExC_frame_head= NULL;
RExC_frame_last= NULL;
RExC_recurse_count = 0;
pRExC_state->code_index = 0;
-#if 0 /* REGC() is (currently) a NOP at the first pass.
- * Clever compilers notice this and complain. --jhi */
- REGC((U8)REG_MAGIC, (char*)RExC_emit);
-#endif
DEBUG_PARSE_r(
PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
RExC_lastnum=0;
RExC_emit_bound = ri->program + RExC_size + 1;
pRExC_state->code_index = 0;
- REGC((U8)REG_MAGIC, (char*) RExC_emit++);
+ *((char*) RExC_emit++) = (char) REG_MAGIC;
if (reg(pRExC_state, 0, &flags,1) == NULL) {
ReREFCNT_dec(rx);
Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
FALSE, /* means parse the whole char class */
TRUE, /* allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
+ RExC_strict,
NULL);
if (*RExC_parse != ']') {
RExC_parse = oregcomp_parse;
FALSE, /* don't silence non-portable warnings.
It would be a bug if these returned
non-portables */
+ RExC_strict,
NULL);
/* regclass() can only return RESTART_UTF8 if multi-char folds
are allowed. */
&result,
&error_msg,
PASS2, /* out warnings */
- FALSE, /* not strict */
+ RExC_strict,
TRUE, /* Output warnings
for non-
portables */
&result,
&error_msg,
PASS2, /* out warnings */
- FALSE, /* not strict */
- TRUE, /* Output warnings
+ RExC_strict,
+ TRUE, /* Silence warnings
for non-
portables */
UTF);
* from \1 - \9 is a backreference, any multi-digit
* escape which does not start with 0 and which when
* evaluated as decimal could refer to an already
- * parsed capture buffer is a backslash. Anything else
- * is octal.
+ * parsed capture buffer is a back reference. Anything
+ * else is octal.
*
* Note this implies that \118 could be interpreted as
* 118 OR as "\11" . "8" depending on whether there
* unfolded, and we have to calculate how many EXACTish
* nodes it will take; and we may run out of room in a node
* in the middle of a potential multi-char fold, and have
- * to back off accordingly. (Hence we can't use REGC for
- * the simple case just below.) */
+ * to back off accordingly. */
UV folded;
if (isASCII_uni(ender)) {
posix class */
FALSE, /* don't allow multi-char folds */
TRUE, /* silence non-portable warnings. */
- ¤t))
+ TRUE, /* strict */
+ ¤t
+ ))
FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
(UV) *flagp);
TRUE, /* means parse just the next thing */
FALSE, /* don't allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
- ¤t))
+ TRUE, /* strict */
+ ¤t
+ ))
FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
(UV) *flagp);
/* regclass() will return with parsing just the \ sequence,
only if not a posix class */
FALSE, /* don't allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
- ¤t))
+ TRUE, /* strict */
+ ¤t
+ ))
FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
(UV) *flagp);
/* function call leaves parse pointing to the ']', except if we
TRUE, /* silence non-portable warnings. The above may very
well have generated non-portable code points, but
they're valid on this machine */
- NULL);
+ FALSE, /* similarly, no need for strict */
+ NULL
+ );
if (!node)
FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
PTR2UV(flagp));
const bool silence_non_portable, /* Don't output warnings
about too large
characters */
- SV** ret_invlist) /* Return an inversion list, not a node */
+ const bool strict,
+ SV** ret_invlist /* Return an inversion list, not a node */
+ )
{
/* parse a bracketed class specification. Most of these will produce an
* ANYOF node; but something like [a] will produce an EXACT node; [aA], an
separate for a while from the non-complemented
versions because of complications with /d
matching */
+ SV* simple_posixes = NULL; /* But under some conditions, the classes can be
+ treated more simply than the general case,
+ leading to less compilation and execution
+ work */
UV element_count = 0; /* Number of distinct elements in the class.
Optimizations may be possible if this is tiny */
AV * multi_char_matches = NULL; /* Code points that fold to more than one
char * stop_ptr = RExC_end; /* where to stop parsing */
const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
space? */
- const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
/* Unicode properties are stored in a swash; this holds the current one
* being parsed. If this swash is the only above-latin1 component of the
&cp_list);
}
}
- else { /* Garden variety class. If is NASCII, NDIGIT, ...
+ else if (UNI_SEMANTICS
+ || classnum == _CC_ASCII
+ || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
+ || classnum == _CC_XDIGIT)))
+ {
+ /* We usually have to worry about /d and /a affecting what
+ * POSIX classes match, with special code needed for /d
+ * because we won't know until runtime what all matches.
+ * But there is no extra work needed under /u, and
+ * [:ascii:] is unaffected by /a and /d; and :digit: and
+ * :xdigit: don't have runtime differences under /d. So we
+ * can special case these, and avoid some extra work below,
+ * and at runtime. */
+ _invlist_union_maybe_complement_2nd(
+ simple_posixes,
+ PL_XPosix_ptrs[classnum],
+ namedclass % 2 != 0,
+ &simple_posixes);
+ }
+ else { /* Garden variety class. If is NUPPER, NALPHA, ...
complement and use nposixes */
SV** posixes_ptr = namedclass % 2 == 0
? &posixes
: &nposixes;
- SV** source_ptr = &PL_XPosix_ptrs[classnum];
_invlist_union_maybe_complement_2nd(
*posixes_ptr,
- *source_ptr,
+ PL_XPosix_ptrs[classnum],
namedclass % 2 != 0,
posixes_ptr);
}
op = POSIXA;
}
}
- else if (prevvalue == 'A') {
- if (value == 'Z'
+ else if (AT_LEAST_ASCII_RESTRICTED || ! FOLD) {
+ /* We can optimize A-Z or a-z, but not if they could match
+ * something like the KELVIN SIGN under /i (/a means they
+ * can't) */
+ if (prevvalue == 'A') {
+ if (value == 'Z'
#ifdef EBCDIC
- && literal_endpoint == 2
+ && literal_endpoint == 2
#endif
- ) {
- arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
- op = POSIXA;
+ ) {
+ arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
+ op = POSIXA;
+ }
}
- }
- else if (prevvalue == 'a') {
- if (value == 'z'
+ else if (prevvalue == 'a') {
+ if (value == 'z'
#ifdef EBCDIC
- && literal_endpoint == 2
+ && literal_endpoint == 2
#endif
- ) {
- arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
- op = POSIXA;
+ ) {
+ arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
+ op = POSIXA;
+ }
}
}
}
SvREFCNT_dec(posixes);
SvREFCNT_dec(nposixes);
+ SvREFCNT_dec(simple_posixes);
SvREFCNT_dec(cp_list);
SvREFCNT_dec(cp_foldable_list);
return ret;
* classes. The lists are kept separate up to now because we don't want to
* fold the classes (folding of those is automatically handled by the swash
* fetching code) */
+ if (simple_posixes) {
+ _invlist_union(cp_list, simple_posixes, &cp_list);
+ SvREFCNT_dec_NN(simple_posixes);
+ }
if (posixes || nposixes) {
if (posixes && AT_LEAST_ASCII_RESTRICTED) {
/* Under /a and /aa, nothing above ASCII matches these */
}
/*
-- reguni - emit (if appropriate) a Unicode character
-*/
-PERL_STATIC_INLINE STRLEN
-S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
-{
- PERL_ARGS_ASSERT_REGUNI;
-
- return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
-}
-
-/*
- reginsert - insert an operator in front of already-emitted operand
*
* Means relocating the operand.
sv_catpvs(sv, "{non-utf8-latin1-all}");
}
- /* output information about the unicode matching */
if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
sv_catpvs(sv, "{above_bitmap_all}");
- else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
+
+ if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
SV *lv; /* Set if there is something outside the bit map. */
- bool byte_output = FALSE; /* If something in the bitmap has
- been output */
+ bool byte_output = FALSE; /* If something has been output */
SV *only_utf8_locale;
/* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
DEBUG_COMPILE_r(
{
- const char * const s = SvPV_nolen_const(prog->check_substr
- ? prog->check_substr : prog->check_utf8);
+ const char * const s = SvPV_nolen_const(RX_UTF8(r)
+ ? prog->check_utf8 : prog->check_substr);
if (!PL_colorset) reginitcolors();
PerlIO_printf(Perl_debug_log,
"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
PL_colors[4],
- prog->check_substr ? "" : "utf8 ",
+ RX_UTF8(r) ? "utf8 " : "",
PL_colors[5],PL_colors[0],
s,
PL_colors[1],
(strlen(s) > 60 ? "..." : ""));
} );
- return prog->check_substr ? prog->check_substr : prog->check_utf8;
+ /* use UTF8 check substring if regexp pattern itself is in UTF8 */
+ return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
}
/*