char *parse; /* Input-scan pointer. */
char *copy_start; /* start of copy of input within
constructed parse string */
+ char *save_copy_start; /* Provides one level of saving
+ and restoring 'copy_start' */
char *copy_start_in_input; /* Position in input string
corresponding to copy_start */
SSize_t whilem_seen; /* number of WHILEM in this expr */
#define RExC_precomp (pRExC_state->precomp)
#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
+#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
#define RExC_precomp_end (pRExC_state->precomp_end)
#define RExC_rx_sv (pRExC_state->rx_sv)
#define RExC_rx (pRExC_state->rx)
} STMT_END
/* Setting this to NULL is a signal to not output warnings */
-#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE RExC_copy_start_in_constructed = NULL
-#define RESTORE_WARNINGS RExC_copy_start_in_constructed = RExC_precomp
+#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
+ STMT_START { \
+ RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
+ RExC_copy_start_in_constructed = NULL; \
+ } STMT_END
+#define RESTORE_WARNINGS \
+ RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
/* Since a warning can be generated multiple times as the input is reparsed, we
* output it the first time we come to that point in the parse, but suppress it
unsigned int i;
const U32 n = ARG(node);
bool new_node_has_latin1 = FALSE;
+ const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr))
+ ? 0
+ : ANYOF_FLAGS(node);
PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
}
/* Get the code points valid only under UTF-8 locales */
- if ( (ANYOF_FLAGS(node) & ANYOFL_FOLD)
+ if ( (flags & ANYOFL_FOLD)
&& av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
{
only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
* actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
* have to do this here before we add the unconditionally matched code
* points */
- if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
+ if (flags & ANYOF_INVERT) {
_invlist_intersection_complement_2nd(invlist,
PL_UpperLatin1,
&invlist);
}
/* Add in the points from the bit map */
- if (OP(node) != ANYOFH) {
+ if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) {
for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
if (ANYOF_BITMAP_TEST(node, i)) {
unsigned int start = i++;
* as well. But don't add them if inverting, as when that gets done below,
* it would exclude all these characters, including the ones it shouldn't
* that were added just above */
- if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
- && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
+ if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
+ && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
{
_invlist_union(invlist, PL_UpperLatin1, &invlist);
}
/* Similarly for these */
- if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
+ if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
_invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
}
- if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
+ if (flags & ANYOF_INVERT) {
_invlist_invert(invlist);
}
- else if (ANYOF_FLAGS(node) & ANYOFL_FOLD) {
+ else if (flags & ANYOFL_FOLD) {
if (new_node_has_latin1) {
/* Under /li, any 0-255 could fold to any other 0-255, depending on
if (only_utf8_locale_invlist) {
_invlist_union_maybe_complement_2nd(invlist,
only_utf8_locale_invlist,
- ANYOF_FLAGS(node) & ANYOF_INVERT,
+ flags & ANYOF_INVERT,
&invlist);
}
* another SSC or a regular ANYOF class. Can create false positives. */
SV* anded_cp_list;
- U8 and_with_flags = (OP(and_with) == ANYOFH) ? 0 : ANYOF_FLAGS(and_with);
+ U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr)
+ ? 0
+ : ANYOF_FLAGS(and_with);
U8 anded_flags;
PERL_ARGS_ASSERT_SSC_AND;
SV* ored_cp_list;
U8 ored_flags;
- U8 or_with_flags = (OP(or_with) == ANYOFH) ? 0 : ANYOF_FLAGS(or_with);
+ U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr)
+ ? 0
+ : ANYOF_FLAGS(or_with);
PERL_ARGS_ASSERT_SSC_OR;
if (UTF) { \
SV *zlopp = newSV(UTF8_MAXBYTES); \
unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
- unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
+ unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
+ *kapow = '\0'; \
SvCUR_set(zlopp, kapow - flrbbbbb); \
SvPOK_on(zlopp); \
SvUTF8_on(zlopp); \
case ANYOFL:
case ANYOFPOSIXL:
case ANYOFH:
+ case ANYOFHb:
+ case ANYOFHr:
case ANYOF:
if (flags & SCF_DO_STCLASS_AND)
ssc_and(pRExC_state, data->start_class,
const char* name;
name = get_regex_charset_name(RExC_rx->extflags, &len);
- if strEQ(name, DEPENDS_PAT_MODS) { /* /d under UTF-8 => /u */
+ if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
assert(RExC_utf8);
name = UNICODE_PAT_MODS;
len = sizeof(UNICODE_PAT_MODS) - 1;
RExC_sawback = 1;
ret = reganode(pRExC_state,
((! FOLD)
- ? NREF
+ ? REFN
: (ASCII_FOLD_RESTRICTED)
- ? NREFFA
+ ? REFFAN
: (AT_LEAST_UNI_SEMANTICS)
- ? NREFFU
+ ? REFFUN
: (LOC)
- ? NREFFL
- : NREFF),
+ ? REFFLN
+ : REFFN),
num);
*flagp |= HASWIDTH;
if (!SvIOK(max_open)) {
sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
}
- if (depth > 4 * SvIV(max_open)) { /* We increase depth by 4 for each open
- paren */
+ if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
+ open paren */
vFAIL("Too many nested open parens");
}
goto gen_recurse_regop;
/* NOTREACHED */
case '+':
- if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
+ if (! inRANGE(RExC_parse[0], '1', '9')) {
RExC_parse++;
vFAIL("Illegal pattern");
}
goto parse_recursion;
/* NOTREACHED*/
case '-': /* (?-1) */
- if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
+ if (! inRANGE(RExC_parse[0], '1', '9')) {
RExC_parse--; /* rewind to let it be handled later */
goto parse_flags;
}
RExC_rxi->data->data[num]=(void*)sv_dat;
SvREFCNT_inc_simple_void_NN(sv_dat);
}
- ret = reganode(pRExC_state, NGROUPP, num);
+ ret = reganode(pRExC_state, GROUPPN, num);
goto insert_if_check_paren;
}
else if (memBEGINs(RExC_parse,
parno = 1;
RExC_parse++;
}
- else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
+ else if (inRANGE(RExC_parse[0], '1', '9')) {
UV uv;
endptr = RExC_end;
if (grok_atoUV(RExC_parse, &uv, &endptr)
ret = reganode(pRExC_state, INSUBP, parno);
goto insert_if_check_paren;
}
- else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
+ else if (inRANGE(RExC_parse[0], '1', '9')) {
/* (?(1)...) */
char c;
UV uv;
RExC_parse--; /* for vFAIL to print correctly */
vFAIL("Sequence (? incomplete");
break;
+
+ case ')':
+ if (RExC_strict) { /* [perl #132851] */
+ ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
+ }
+ /* FALLTHROUGH */
default: /* e.g., (?i) */
RExC_parse = (char *) seqstart + 1;
parse_flags:
value = (U8 *) SvPV(value_sv, value_len);
/* See if the result is one code point vs 0 or multiple */
- if (value_len > 0 && value_len <= ((SvUTF8(value_sv))
- ? UTF8SKIP(value)
- : 1))
+ if (value_len > 0 && value_len <= (UV) ((SvUTF8(value_sv))
+ ? UTF8SKIP(value)
+ : 1))
{
/* Here, exactly one code point. If that isn't what is wanted,
* fail */
&& num >= RExC_npar
/* cannot be an octal escape if it starts with 8 */
&& *RExC_parse != '8'
- /* cannot be an octal escape it it starts with 9 */
+ /* cannot be an octal escape if it starts with 9 */
&& *RExC_parse != '9'
) {
/* Probably not meant to be a backref, instead likely
has_micro_sign = TRUE;
}
- *(s++) = (char) (DEPENDS_SEMANTICS)
- ? toFOLD(ender)
-
- /* Under /u, the fold of any
- * character in the 0-255 range
- * happens to be its lowercase
- * equivalent, except for LATIN SMALL
- * LETTER SHARP S, which was handled
- * above, and the MICRO SIGN, whose
- * fold requires UTF-8 to represent.
- * */
- : toLOWER_L1(ender);
+ *(s++) = (DEPENDS_SEMANTICS)
+ ? (char) toFOLD(ender)
+
+ /* Under /u, the fold of any character in
+ * the 0-255 range happens to be its
+ * lowercase equivalent, except for LATIN
+ * SMALL LETTER SHARP S, which was handled
+ * above, and the MICRO SIGN, whose fold
+ * requires UTF-8 to represent. */
+ : (char) toLOWER_L1(ender);
}
} /* End of adding current character to the node */
loopdone: /* Jumped to when encounters something that shouldn't be
in the node */
- /* Free up any over-allocated space */
- change_engine_size(pRExC_state, - (initial_size - STR_SZ(len)));
+ /* Free up any over-allocated space; cast is to silence bogus
+ * warning in MS VC */
+ change_engine_size(pRExC_state,
+ - (Ptrdiff_t) (initial_size - STR_SZ(len)));
/* I (khw) don't know if you can get here with zero length, but the
* old code handled this situation by creating a zero-length EXACT
RExC_emit += STR_SZ(len);
/* If the node isn't a single character, it can't be SIMPLE */
- if (len > ((UTF) ? UVCHR_SKIP(ender) : 1)) {
+ if (len > (Size_t) ((UTF) ? UVCHR_SKIP(ender) : 1)) {
maybe_SIMPLE = 0;
}
assert(PL_regkind[OP(node)] == ANYOF);
/* There is no bitmap for this node type */
- if (OP(node) == ANYOFH) {
+ if (inRANGE(OP(node), ANYOFH, ANYOFHr)) {
return;
}
FALSE, /* Require return to be an ANYOF */
¤t))
{
- FAIL2("panic: regclass returned failure to handle_sets, "
- "flags=%#" UVxf, (UV) *flagp);
+ goto regclass_failed;
}
/* regclass() will return with parsing just the \ sequence,
FALSE, /* Require return to be an ANYOF */
¤t))
{
- FAIL2("panic: regclass returned failure to handle_sets, "
- "flags=%#" UVxf, (UV) *flagp);
+ goto regclass_failed;
}
if (! current) {
}
if (!node)
- FAIL2("panic: regclass returned failure to handle_sets, flags=%#" UVxf,
- PTR2UV(flagp));
+ goto regclass_failed;
/* Fix up the node type if we are in locale. (We have pretended we are
* under /u for the purposes of regclass(), as this construct will only
nextchar(pRExC_state);
Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
return node;
+
+ regclass_failed:
+ FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
+ (UV) *flagp);
}
#ifdef ENABLE_REGEX_SETS_DEBUGGING
S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
const bool stop_at_1, /* Just parse the next thing, don't
look for a full character class */
- bool allow_multi_folds,
+ bool allow_mutiple_chars,
const bool silence_non_portable, /* Don't output warnings
about too large
characters */
#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
|| (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
&& UNICODE_DOT_DOT_VERSION == 0)
- allow_multi_folds = FALSE;
+ allow_mutiple_chars = FALSE;
#endif
/* We include the /i status at the beginning of this so that we can
if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
RExC_parse++;
invert = TRUE;
- allow_multi_folds = FALSE;
+ allow_mutiple_chars = FALSE;
MARK_NAUGHTY(1);
SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
}
else { /* cp_count > 1 */
assert(cp_count > 1);
if (! RExC_in_multi_char_class) {
- if (invert || range || *RExC_parse == '-') {
+ if ( ! allow_mutiple_chars
+ || invert
+ || range
+ || *RExC_parse == '-')
+ {
if (strict) {
RExC_parse--;
- vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
+ vFAIL("\\N{} here is restricted to one character");
}
ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
break; /* <value> contains the first code
) {
SV* scratch_list = NULL;
- /* What the Posix classes (like \w, [:space:]) match in locale
- * isn't knowable under locale until actual match time. A
+ /* What the Posix classes (like \w, [:space:]) match isn't
+ * generally knowable under locale until actual match time. A
* special node is used for these which has extra space for a
* bitmap, with a bit reserved for each named class that is to
- * be matched against. This isn't needed for \p{} and
+ * be matched against. (This isn't needed for \p{} and
* pseudo-classes, as they are not affected by locale, and
- * hence are dealt with separately */
- POSIXL_SET(posixl, namedclass);
- has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
- anyof_flags |= ANYOF_MATCHES_POSIXL;
-
- /* The above-Latin1 characters are not subject to locale rules.
- * Just add them to the unconditionally-matched list */
-
- /* Get the list of the above-Latin1 code points this matches */
- _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
- PL_XPosix_ptrs[classnum],
-
- /* Odd numbers are complements, like
- * NDIGIT, NASCII, ... */
- namedclass % 2 != 0,
- &scratch_list);
- /* Checking if 'cp_list' is NULL first saves an extra clone.
- * Its reference count will be decremented at the next union,
- * etc, or if this is the only instance, at the end of the
- * routine */
- if (! cp_list) {
- cp_list = scratch_list;
- }
- else {
- _invlist_union(cp_list, scratch_list, &cp_list);
- SvREFCNT_dec_NN(scratch_list);
+ * hence are dealt with separately.) However, if a named class
+ * and its complement are both present, then it matches
+ * everything, and there is no runtime dependency. Odd numbers
+ * are the complements of the next lower number, so xor works.
+ * (Note that something like [\w\D] should match everything,
+ * because \d should be a proper subset of \w. But rather than
+ * trust that the locale is well behaved, we leave this to
+ * runtime to sort out) */
+ if (POSIXL_TEST(posixl, namedclass ^ 1)) {
+ cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
+ POSIXL_ZERO(posixl);
+ has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
+ anyof_flags &= ~ANYOF_MATCHES_POSIXL;
+ continue; /* We could ignore the rest of the class, but
+ best to parse it for any errors */
+ }
+ else { /* Here, isn't the complement of any already parsed
+ class */
+ POSIXL_SET(posixl, namedclass);
+ has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
+ anyof_flags |= ANYOF_MATCHES_POSIXL;
+
+ /* The above-Latin1 characters are not subject to locale
+ * rules. Just add them to the unconditionally-matched
+ * list */
+
+ /* Get the list of the above-Latin1 code points this
+ * matches */
+ _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
+ PL_XPosix_ptrs[classnum],
+
+ /* Odd numbers are complements,
+ * like NDIGIT, NASCII, ... */
+ namedclass % 2 != 0,
+ &scratch_list);
+ /* Checking if 'cp_list' is NULL first saves an extra
+ * clone. Its reference count will be decremented at the
+ * next union, etc, or if this is the only instance, at the
+ * end of the routine */
+ if (! cp_list) {
+ cp_list = scratch_list;
+ }
+ else {
+ _invlist_union(cp_list, scratch_list, &cp_list);
+ SvREFCNT_dec_NN(scratch_list);
+ }
+ continue; /* Go get next character */
}
- continue; /* Go get next character */
}
else {
* "ss" =~ /^[^\xDF]+$/i => N
*
* See [perl #89750] */
- if (FOLD && allow_multi_folds && value == prevvalue) {
+ if (FOLD && allow_mutiple_chars && value == prevvalue) {
if ( value == LATIN_SMALL_LETTER_SHARP_S
|| (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
value)))
literal
);
}
- else if isMNEMONIC_CNTRL(value) {
+ else if (isMNEMONIC_CNTRL(value)) {
vWARN4(RExC_parse,
"\"%.*s\" is more clearly written simply as \"%s\"",
(int) (RExC_parse - rangebegin),
RExC_emit += 1 + STR_SZ(len);
STR_LEN(REGNODE_p(ret)) = len;
if (len == 1) {
- *STRING(REGNODE_p(ret)) = value;
+ *STRING(REGNODE_p(ret)) = (U8) value;
}
else {
uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
SvREFCNT_dec(intersection);
}
- /* If didn't find an optimization and there is no need for a
- * bitmap, optimize to indicate that */
+ /* If didn't find an optimization and there is no need for a bitmap,
+ * optimize to indicate that */
if ( start[0] >= NUM_ANYOF_CODE_POINTS
&& ! LOC
&& ! upper_latin1_only_utf8_matches
&& anyof_flags == 0)
{
+ U8 low_utf8[UTF8_MAXBYTES+1];
UV highest_cp = invlist_highest(cp_list);
- /* If the lowest and highest code point in the class have the same
- * UTF-8 first byte, then all do, and we can store that byte for
- * regexec.c to use so that it can more quickly scan the target
- * string for potential matches for this class. We co-opt the the
- * flags field for this. Zero means, they don't have the same
- * first byte. We do accept here very large code points (for
- * future use), but don't bother with this optimization for them,
- * as it would cause other complications */
- if (highest_cp > IV_MAX) {
- anyof_flags = 0;
- }
- else {
- U8 low_utf8[UTF8_MAXBYTES+1];
+ op = ANYOFH;
+
+ /* Currently the maximum allowed code point by the system is
+ * IV_MAX. Higher ones are reserved for future internal use. This
+ * particular regnode can be used for higher ones, but we can't
+ * calculate the code point of those. IV_MAX suffices though, as
+ * it will be a large first byte */
+ (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
+
+ /* We store the lowest possible first byte of the UTF-8
+ * representation, using the flags field. This allows for quick
+ * ruling out of some inputs without having to convert from UTF-8
+ * to code point. For EBCDIC, this has to be I8. */
+ anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
+
+ /* If the first UTF-8 start byte for the highest code point in the
+ * range is suitably small, we may be able to get an upper bound as
+ * well */
+ if (highest_cp <= IV_MAX) {
U8 high_utf8[UTF8_MAXBYTES+1];
- (void) uvchr_to_utf8(low_utf8, start[0]);
- (void) uvchr_to_utf8(high_utf8, invlist_highest(cp_list));
+ (void) uvchr_to_utf8(high_utf8, highest_cp);
+
+ /* If the lowest and highest are the same, we can get an exact
+ * first byte instead of a just minimum. We signal this with a
+ * different regnode */
+ if (low_utf8[0] == high_utf8[0]) {
+
+ /* No need to convert to I8 for EBCDIC as this is an exact
+ * match */
+ anyof_flags = low_utf8[0];
+ op = ANYOFHb;
+ }
+ else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
+ {
- anyof_flags = (low_utf8[0] == high_utf8[0])
- ? low_utf8[0]
- : 0;
+ /* Here, the high byte is not the same as the low, but is
+ * small enough that its reasonable to have a loose upper
+ * bound, which is packed in with the strict lower bound.
+ * See comments at the definition of MAX_ANYOF_HRx_BYTE.
+ * On EBCDIC platforms, I8 is used. On ASCII platforms I8
+ * is the same thing as UTF-8 */
+
+ U8 bits = 0;
+ U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
+ U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
+ - anyof_flags;
+
+ if (range_diff <= max_range_diff / 8) {
+ bits = 3;
+ }
+ else if (range_diff <= max_range_diff / 4) {
+ bits = 2;
+ }
+ else if (range_diff <= max_range_diff / 2) {
+ bits = 1;
+ }
+ anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
+ op = ANYOFHr;
+ }
}
- op = ANYOFH;
+ goto done_finding_op;
}
} /* End of seeing if can optimize it into a different node */
is_anyof: /* It's going to be an ANYOF node. */
- if (op != ANYOFH) {
- op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
- ? ANYOFD
- : ((posixl)
- ? ANYOFPOSIXL
- : ((LOC)
- ? ANYOFL
- : ANYOF));
- }
+ op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
+ ? ANYOFD
+ : ((posixl)
+ ? ANYOFPOSIXL
+ : ((LOC)
+ ? ANYOFL
+ : ANYOF));
+
+ done_finding_op:
ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
FILL_NODE(ret, op); /* We set the argument later */
}
if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
- assert(val - scan <= U32_MAX);
+ assert((UV) (val - scan) <= U32_MAX);
ARG_SET(REGNODE_p(scan), val - scan);
}
else {
);
});
if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
- assert(val - scan <= U32_MAX);
+ assert((UV) (val - scan) <= U32_MAX);
ARG_SET(REGNODE_p(scan), val - scan);
}
else {
name_list= RExC_paren_name_list;
}
if (name_list) {
- if ( k != REF || (OP(o) < NREF)) {
+ if ( k != REF || (OP(o) < REFN)) {
SV **name= av_fetch(name_list, parno, 0 );
if (name)
Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
/* 2: embedded, otherwise 1 */
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
else if (k == ANYOF) {
- const U8 flags = (OP(o) == ANYOFH) ? 0 : ANYOF_FLAGS(o);
+ const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHr)
+ ? 0
+ : ANYOF_FLAGS(o);
bool do_sep = FALSE; /* Do we need to separate various components of
the output? */
/* Set if there is still an unresolved user-defined property */
/* Ready to start outputting. First, the initial left bracket */
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
- if (OP(o) != ANYOFH) {
+ if (! inRANGE(OP(o), ANYOFH, ANYOFHr)) {
/* Then all the things that could fit in the bitmap */
do_sep = put_charclass_bitmap_innards(sv,
ANYOF_BITMAP(o),
/* And finally the matching, closing ']' */
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
- if (OP(o) == ANYOFH && FLAGS(o) != 0) {
- Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x)", FLAGS(o));
+ if (inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+ U8 lowest = (OP(o) != ANYOFHr)
+ ? FLAGS(o)
+ : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
+ U8 highest = (OP(o) == ANYOFHb)
+ ? lowest
+ : OP(o) == ANYOFH
+ ? 0xFF
+ : HIGHEST_ANYOF_HRx_BYTE(FLAGS(o));
+ Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
+ if (lowest != highest) {
+ Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
+ }
+ Perl_sv_catpvf(aTHX_ sv, ")");
}
-
SvREFCNT_dec(unresolved);
}
else if (k == ANYOFM) {
if (!dsv)
dsv = (REGEXP*) newSV_type(SVt_REGEXP);
else {
+ assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
+
+ /* our only valid caller, sv_setsv_flags(), should have done
+ * a SV_CHECK_THINKFIRST_COW_DROP() by now */
+ assert(!SvOOK(dsv));
+ assert(!SvIsCOW(dsv));
+ assert(!SvROK(dsv));
+
+ if (SvPVX_const(dsv)) {
+ if (SvLEN(dsv))
+ Safefree(SvPVX(dsv));
+ SvPVX(dsv) = NULL;
+ }
+ SvLEN_set(dsv, 0);
+ SvCUR_set(dsv, 0);
SvOK_off((SV *)dsv);
+
if (islv) {
/* For PVLVs, the head (sv_any) points to an XPVLV, while
* the LV's xpvlenu_rx will point to a regexp body, which
2: something we no longer hold a reference on
so we need to copy it locally. */
RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
+ /* set malloced length to a non-zero value so it will be freed
+ * (otherwise in combination with SVf_FAKE it looks like an alien
+ * buffer). It doesn't have to be the actual malloced size, since it
+ * should never be grown */
+ SvLEN_set(dstr, SvCUR(sstr)+1);
ret->mother_re = NULL;
}
#endif /* PERL_IN_XSUB_RE */
Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
UTF8fARG(is_contents_utf8, s - s0, s0));
sv_catpvs(msg, "\"");
- goto return_msg;
+ goto return_failure;
}
/* Accumulate this digit into the value */
Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
UTF8fARG(is_contents_utf8, s - s0, s0));
sv_catpvs(msg, "\"");
- goto return_msg;
+ goto return_failure;
}
max = (max << 4) + READ_XDIGIT(s);
Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
UTF8fARG(is_contents_utf8, s - s0, s0));
sv_catpvs(msg, "\"");
- goto return_msg;
+ goto return_failure;
}
#if 0 /* See explanation at definition above of get_extended_utf8_msg() */
: level + 1
);
if (this_definition == NULL) {
- goto return_msg; /* 'msg' should have had the reason appended to
- it by the above call */
+ goto return_failure; /* 'msg' should have had the reason
+ appended to it by the above call */
}
if (! is_invlist(this_definition)) { /* Unknown at this time */
}
/* Otherwise, add some explanatory text, but we will return success */
+ goto return_msg;
+
+ return_failure:
+ running_definition = NULL;
return_msg:
RESTORE_CONTEXT;
}
+STATIC SV *
+S_get_fq_name(pTHX_
+ const char * const name, /* The first non-blank in the \p{}, \P{} */
+ const Size_t name_len, /* Its length in bytes, not including any trailing space */
+ const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
+ const bool has_colon_colon
+ )
+{
+ /* Returns a mortal SV containing the fully qualified version of the input
+ * name */
+
+ SV * fq_name;
+
+ fq_name = newSVpvs_flags("", SVs_TEMP);
+
+ /* Use the current package if it wasn't included in our input */
+ if (! has_colon_colon) {
+ const HV * pkg = (IN_PERL_COMPILETIME)
+ ? PL_curstash
+ : CopSTASH(PL_curcop);
+ const char* pkgname = HvNAME(pkg);
+
+ Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
+ UTF8fARG(is_utf8, strlen(pkgname), pkgname));
+ sv_catpvs(fq_name, "::");
+ }
+
+ Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
+ UTF8fARG(is_utf8, name_len, name));
+ return fq_name;
+}
+
SV *
Perl_parse_uniprop_string(pTHX_
int slash_pos = -1; /* Where the '/' is found, or negative if none */
int table_index = 0; /* The entry number for this property in the table
of all Unicode property names */
- bool starts_with_In_or_Is = FALSE; /* ? Does the name start with 'In' or
- 'Is' */
+ bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
Size_t lookup_offset = 0; /* Used to ignore the first few characters of
the normalized name in certain situations */
Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
it is the definition. Otherwise it is a
string containing the fully qualified sub
name of 'name' */
+ SV * fq_name = NULL; /* For user-defined properties, the fully
+ qualified name */
bool invert_return = FALSE; /* ? Do we need to complement the result before
returning it */
pos_in_brackets = strchr("([<)]>)]>", open);
close = (pos_in_brackets) ? pos_in_brackets[3] : open;
- if ( name[name_len-1] != close
+ if ( i >= name_len
+ || name[name_len-1] != close
|| (escaped && name[name_len-2] != '\\'))
{
sv_catpvs(msg, "Unicode property wildcard not terminated");
&& name[non_pkg_begin+0] == 'I'
&& (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
{
- starts_with_In_or_Is = TRUE;
+ /* Names that start with In have different characterstics than those
+ * that start with Is */
+ if (name[non_pkg_begin+1] == 's') {
+ starts_with_Is = TRUE;
+ }
}
else {
could_be_user_defined = FALSE;
if (could_be_user_defined) {
CV* user_sub;
+ /* If the user defined property returns the empty string, it could
+ * easily be because the pattern is being compiled before the data it
+ * actually needs to compile is available. This could be argued to be
+ * a bug in the perl code, but this is a change of behavior for Perl,
+ * so we handle it. This means that intentionally returning nothing
+ * will not be resolved until runtime */
+ bool empty_return = FALSE;
+
/* Here, the name could be for a user defined property, which are
* implemented as subs. */
user_sub = get_cvn_flags(name, name_len, 0);
if (user_sub) {
+ const char insecure[] = "Insecure user-defined property";
/* Here, there is a sub by the correct name. Normally we call it
* to get the property definition */
dSP;
SV * user_sub_sv = MUTABLE_SV(user_sub);
SV * error; /* Any error returned by calling 'user_sub' */
- SV * fq_name; /* Fully qualified property name */
+ SV * key; /* The key into the hash of user defined sub names
+ */
SV * placeholder;
- char to_fold_string[] = "0:"; /* The 0 gets overwritten with the
- actual value */
SV ** saved_user_prop_ptr; /* Hash entry for this property */
/* How many times to retry when another thread is in the middle of
/* If we get here, we know this property is user-defined */
*user_defined_ptr = TRUE;
- /* We refuse to call a tainted subroutine; returning an error
- * instead */
+ /* We refuse to call a potentially tainted subroutine; returning an
+ * error instead */
if (TAINT_get) {
if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
- sv_catpvs(msg, "Insecure user-defined property");
+ sv_catpvn(msg, insecure, sizeof(insecure) - 1);
goto append_name_to_msg;
}
* should the need arise, passing the /i status as a parameter.
*
* We start by constructing the hash key name, consisting of the
- * fully qualified subroutine name */
- fq_name = sv_2mortal(newSV(10)); /* 10 is just a guess */
- (void) cv_name(user_sub, fq_name, 0);
-
- /* But precede the sub name in the key with the /i status, so that
- * there is a key for /i and a different key for non-/i */
- to_fold_string[0] = to_fold + '0';
- sv_insert(fq_name, 0, 0, to_fold_string, 2);
+ * fully qualified subroutine name, preceded by the /i status, so
+ * that there is a key for /i and a different key for non-/i */
+ key = newSVpvn(((to_fold) ? "1" : "0"), 1);
+ fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
+ non_pkg_begin != 0);
+ sv_catsv(key, fq_name);
+ sv_2mortal(key);
/* We only call the sub once throughout the life of the program
* (with the /i, non-/i exception noted above). That means the
/* If we have an entry for this key, the subroutine has already
* been called once with this /i status. */
saved_user_prop_ptr = hv_fetch(PL_user_def_props,
- SvPVX(fq_name), SvCUR(fq_name), 0);
+ SvPVX(key), SvCUR(key), 0);
if (saved_user_prop_ptr) {
/* If the saved result is an inversion list, it is the valid
* for this property in the hash. So we have the go ahead to
* expand the definition ourselves. */
+ PUSHSTACKi(PERLSI_MAGIC);
ENTER;
/* Create a temporary placeholder in the hash to detect recursion
* */
SWITCH_TO_GLOBAL_CONTEXT;
placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
- (void) hv_store_ent(PL_user_def_props, fq_name, placeholder, 0);
+ (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
RESTORE_CONTEXT;
/* Now that we have a placeholder, we can let other threads
USER_PROP_MUTEX_UNLOCK;
/* Make sure the placeholder always gets destroyed */
- SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(fq_name));
+ SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
PUSHMARK(SP);
SAVETMPS;
XPUSHs(boolSV(to_fold));
PUTBACK;
+ /* The following block was taken from swash_init(). Presumably
+ * they apply to here as well, though we no longer use a swash --
+ * khw */
+ SAVEHINTS();
+ save_re_context();
+ /* We might get here via a subroutine signature which uses a utf8
+ * parameter name, at which point PL_subname will have been set
+ * but not yet used. */
+ save_item(PL_subname);
+
(void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
SPAGAIN;
error = ERRSV;
- if (SvTRUE(error)) {
+ if (TAINT_get || SvTRUE(error)) {
if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
- sv_catpvs(msg, "Error \"");
- sv_catsv(msg, error);
- sv_catpvs(msg, "\"");
+ if (SvTRUE(error)) {
+ sv_catpvs(msg, "Error \"");
+ sv_catsv(msg, error);
+ sv_catpvs(msg, "\"");
+ }
+ if (TAINT_get) {
+ if (SvTRUE(error)) sv_catpvs(msg, "; ");
+ sv_catpvn(msg, insecure, sizeof(insecure) - 1);
+ }
+
if (name_len > 0) {
sv_catpvs(msg, " in expansion of ");
Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
prop_definition = NULL;
}
else { /* G_SCALAR guarantees a single return value */
+ SV * contents = POPs;
/* The contents is supposed to be the expansion of the property
- * definition. Call a function to check for valid syntax and
- * handle it */
- prop_definition = handle_user_defined_property(name, name_len,
+ * definition. If the definition is deferrable, and we got an
+ * empty string back, set a flag to later defer it (after clean
+ * up below). */
+ if ( deferrable
+ && (! SvPOK(contents) || SvCUR(contents) == 0))
+ {
+ empty_return = TRUE;
+ }
+ else { /* Otherwise, call a function to check for valid syntax,
+ and handle it */
+
+ prop_definition = handle_user_defined_property(
+ name, name_len,
is_utf8, to_fold, runtime,
deferrable,
- POPs, user_defined_ptr,
+ contents, user_defined_ptr,
msg,
level);
+ }
}
- /* Here, we have the results of the expansion. Replace the
- * placeholder with them. We need exclusive access to the hash,
- * and we can't let anyone else in, between when we delete the
- * placeholder and add the permanent entry */
+ /* Here, we have the results of the expansion. Delete the
+ * placeholder, and if the definition is now known, replace it with
+ * that definition. We need exclusive access to the hash, and we
+ * can't let anyone else in, between when we delete the placeholder
+ * and add the permanent entry */
USER_PROP_MUTEX_LOCK;
- S_delete_recursion_entry(aTHX_ SvPVX(fq_name));
-
- if (! prop_definition || is_invlist(prop_definition)) {
+ S_delete_recursion_entry(aTHX_ SvPVX(key));
+ if ( ! empty_return
+ && (! prop_definition || is_invlist(prop_definition)))
+ {
/* If we got success we use the inversion list defining the
* property; otherwise use the error message */
SWITCH_TO_GLOBAL_CONTEXT;
(void) hv_store_ent(PL_user_def_props,
- fq_name,
+ key,
((prop_definition)
? newSVsv(prop_definition)
: newSVsv(msg)),
FREETMPS;
LEAVE;
+ POPSTACK;
+
+ if (empty_return) {
+ goto definition_deferred;
+ }
if (prop_definition) {
/* If it didn't find the property ... */
if (table_index == 0) {
- /* Try again stripping off any initial 'In' or 'Is' */
- if (starts_with_In_or_Is) {
+ /* Try again stripping off any initial 'Is'. This is because we
+ * promise that an initial Is is optional. The same isn't true of
+ * names that start with 'In'. Those can match only blocks, and the
+ * lookup table already has those accounted for. */
+ if (starts_with_Is) {
lookup_name += 2;
lookup_len -= 2;
equals_pos -= 2;
* NV. */
NV value;
+ SSize_t value_len = lookup_len - equals_pos;
/* Get the value */
- if (my_atof3(lookup_name + equals_pos, &value,
- lookup_len - equals_pos)
+ if ( value_len <= 0
+ || my_atof3(lookup_name + equals_pos, &value,
+ value_len)
!= lookup_name + lookup_len)
{
goto failed;
definition_deferred:
/* Here it could yet to be defined, so defer evaluation of this
- * until its needed at runtime. */
- prop_definition = newSVpvs_flags("", SVs_TEMP);
-
- /* To avoid any ambiguity, the package is always specified.
- * Use the current one if it wasn't included in our input */
- if (non_pkg_begin == 0) {
- const HV * pkg = (IN_PERL_COMPILETIME)
- ? PL_curstash
- : CopSTASH(PL_curcop);
- const char* pkgname = HvNAME(pkg);
-
- Perl_sv_catpvf(aTHX_ prop_definition, "%" UTF8f,
- UTF8fARG(is_utf8, strlen(pkgname), pkgname));
- sv_catpvs(prop_definition, "::");
+ * until its needed at runtime. We need the fully qualified property name
+ * to avoid ambiguity, and a trailing newline */
+ if (! fq_name) {
+ fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
+ non_pkg_begin != 0 /* If has "::" */
+ );
}
-
- Perl_sv_catpvf(aTHX_ prop_definition, "%" UTF8f,
- UTF8fARG(is_utf8, name_len, name));
- sv_catpvs(prop_definition, "\n");
+ sv_catpvs(fq_name, "\n");
*user_defined_ptr = TRUE;
- return prop_definition;
+ return fq_name;
}
#endif