char *parse; /* Input-scan pointer. */
char *copy_start; /* start of copy of input within
constructed parse string */
+ char *save_copy_start; /* Provides one level of saving
+ and restoring 'copy_start' */
char *copy_start_in_input; /* Position in input string
corresponding to copy_start */
SSize_t whilem_seen; /* number of WHILEM in this expr */
through */
U32 study_chunk_recursed_bytes; /* bytes in bitmap */
I32 in_lookbehind;
+ I32 in_lookahead;
I32 contains_locale;
I32 override_recoding;
-#ifdef EBCDIC
- I32 recode_x_to_native;
-#endif
+ I32 recode_x_to_native;
I32 in_multi_char_class;
struct reg_code_blocks *code_blocks;/* positions of literal (?{})
within pattern */
#define RExC_precomp (pRExC_state->precomp)
#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
+#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
#define RExC_precomp_end (pRExC_state->precomp_end)
#define RExC_rx_sv (pRExC_state->rx_sv)
#define RExC_rx (pRExC_state->rx)
#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
under /d from /u ? */
-
#ifdef RE_TRACK_PATTERN_OFFSETS
# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
others */
#define RExC_study_chunk_recursed_bytes \
(pRExC_state->study_chunk_recursed_bytes)
#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
+#define RExC_in_lookahead (pRExC_state->in_lookahead)
#define RExC_contains_locale (pRExC_state->contains_locale)
+#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
+
#ifdef EBCDIC
-# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
+# define SET_recode_x_to_native(x) \
+ STMT_START { RExC_recode_x_to_native = (x); } STMT_END
+#else
+# define SET_recode_x_to_native(x) NOOP
#endif
+
#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
#define RExC_frame_head (pRExC_state->frame_head)
#define RExC_frame_last (pRExC_state->frame_last)
#define REQUIRE_BRANCHJ(flagp, restart_retval) \
STMT_START { \
RExC_use_BRANCHJ = 1; \
- if (LIKELY(! IN_PARENS_PASS)) { \
- /* No need to restart the parse immediately if we're \
- * going to reparse anyway to count parens */ \
- *flagp |= RESTART_PARSE; \
- return restart_retval; \
- } \
+ *flagp |= RESTART_PARSE; \
+ return restart_retval; \
} STMT_END
/* Until we have completed the parse, we leave RExC_total_parens at 0 or
Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
+#define FAIL3(msg,arg1,arg2) _FAIL( \
+ Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
+ arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
+
/*
* Simple_vFAIL -- like FAIL, but marks the current location in the scan
*/
} STMT_END
/* Setting this to NULL is a signal to not output warnings */
-#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE RExC_copy_start_in_constructed = NULL
-#define RESTORE_WARNINGS RExC_copy_start_in_constructed = RExC_precomp
+#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
+ STMT_START { \
+ RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
+ RExC_copy_start_in_constructed = NULL; \
+ } STMT_END
+#define RESTORE_WARNINGS \
+ RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
/* Since a warning can be generated multiple times as the input is reparsed, we
* output it the first time we come to that point in the parse, but suppress it
#define UPDATE_WARNINGS_LOC(loc) \
STMT_START { \
if (TO_OUTPUT_WARNINGS(loc)) { \
- RExC_latest_warn_offset = (xI(loc)) - RExC_precomp; \
+ RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \
+ - RExC_precomp; \
} \
} STMT_END
unsigned int i;
const U32 n = ARG(node);
bool new_node_has_latin1 = FALSE;
- const U8 flags = OP(node) == ANYOFH ? 0 : ANYOF_FLAGS(node);
+ const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr))
+ ? 0
+ : ANYOF_FLAGS(node);
PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
}
/* Add in the points from the bit map */
- if (OP(node) != ANYOFH) {
+ if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) {
for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
if (ANYOF_BITMAP_TEST(node, i)) {
unsigned int start = i++;
* another SSC or a regular ANYOF class. Can create false positives. */
SV* anded_cp_list;
- U8 and_with_flags = (OP(and_with) == ANYOFH) ? 0 : ANYOF_FLAGS(and_with);
+ U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr)
+ ? 0
+ : ANYOF_FLAGS(and_with);
U8 anded_flags;
PERL_ARGS_ASSERT_SSC_AND;
SV* ored_cp_list;
U8 ored_flags;
- U8 or_with_flags = (OP(or_with) == ANYOFH) ? 0 : ANYOF_FLAGS(or_with);
+ U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr)
+ ? 0
+ : ANYOF_FLAGS(or_with);
PERL_ARGS_ASSERT_SSC_OR;
if (UTF) { \
SV *zlopp = newSV(UTF8_MAXBYTES); \
unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
- unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
+ unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
+ *kapow = '\0'; \
SvCUR_set(zlopp, kapow - flrbbbbb); \
SvPOK_on(zlopp); \
SvUTF8_on(zlopp); \
#endif
switch (flags) {
- case EXACT: case EXACT_ONLY8: case EXACTL: break;
+ case EXACT: case EXACT_REQ8: case EXACTL: break;
case EXACTFAA:
case EXACTFUP:
case EXACTFU:
trie->wordcount = word_count;
RExC_rxi->data->data[ data_slot ] = (void*)trie;
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
- if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
+ if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
trie->wordcount+1, sizeof(reg_trie_wordinfo));
if ( noper < tail
&& ( OP(noper) == flags
- || (flags == EXACT && OP(noper) == EXACT_ONLY8)
- || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
+ || (flags == EXACT && OP(noper) == EXACT_REQ8)
+ || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
|| OP(noper) == EXACTFUP))))
{
uc= (U8*)STRING(noper);
if ( noper < tail
&& ( OP(noper) == flags
- || (flags == EXACT && OP(noper) == EXACT_ONLY8)
- || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
+ || (flags == EXACT && OP(noper) == EXACT_REQ8)
+ || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
|| OP(noper) == EXACTFUP))))
{
const U8 *uc= (U8*)STRING(noper);
if ( noper < tail
&& ( OP(noper) == flags
- || (flags == EXACT && OP(noper) == EXACT_ONLY8)
- || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
+ || (flags == EXACT && OP(noper) == EXACT_REQ8)
+ || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
|| OP(noper) == EXACTFUP))))
{
const U8 *uc= (U8*)STRING(noper);
if ( state==1 ) {
OP( convert ) = nodetype;
str=STRING(convert);
- STR_LEN(convert)=0;
+ setSTR_LEN(convert, 0);
}
- STR_LEN(convert) += len;
+ setSTR_LEN(convert, STR_LEN(convert) + len);
while (len--)
*str++ = *ch++;
} else {
* using /iaa matching will be doing so almost entirely with ASCII
* strings, so this should rarely be encountered in practice */
-#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
- if (PL_regkind[OP(scan)] == EXACT) \
+#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
+ if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT \
+ && OP(scan) != LEXACT_REQ8) \
join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
STATIC U32
/* Joining something that requires UTF-8 with something that
* doesn't, means the result requires UTF-8. */
- if (OP(scan) == EXACT && (OP(n) == EXACT_ONLY8)) {
- OP(scan) = EXACT_ONLY8;
+ if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
+ OP(scan) = EXACT_REQ8;
}
- else if (OP(scan) == EXACT_ONLY8 && (OP(n) == EXACT)) {
+ else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
; /* join is compatible, no need to change OP */
}
- else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_ONLY8)) {
- OP(scan) = EXACTFU_ONLY8;
+ else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
+ OP(scan) = EXACTFU_REQ8;
}
- else if ((OP(scan) == EXACTFU_ONLY8) && (OP(n) == EXACTFU)) {
+ else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
; /* join is compatible, no need to change OP */
}
else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
* node. And if the only adjacent node is EXACTF, they get
* absorbed into that, under the theory that a longer node is
* better than two shorter ones, even if one is EXACTFU. Note
- * that EXACTFU_ONLY8 is generated only for UTF-8 patterns,
+ * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
* and the EXACTFU_S_EDGE ones only for non-UTF-8. */
if (STRING(n)[STR_LEN(n)-1] == 's') {
merged++;
NEXT_OFF(scan) += NEXT_OFF(n);
- STR_LEN(scan) += STR_LEN(n);
+ setSTR_LEN(scan, STR_LEN(scan) + STR_LEN(n));
next = n + NODE_SZ_STR(n);
/* Now we can overwrite *n : */
Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
* this final joining, sequences could have been split over boundaries, and
* hence missed). The sequences only happen in folding, hence for any
* non-EXACT EXACTish node */
- if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
+ if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
U8* s0 = (U8*) STRING(scan);
U8* s = s0;
U8* s_end = s0 + STR_LEN(scan);
*/
JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
- /* Follow the next-chain of the current node and optimize
- away all the NOTHINGs from it. */
- if (OP(scan) != CURLYX) {
- const int max = (reg_off_by_arg[OP(scan)]
- ? I32_MAX
- /* I32 may be smaller than U16 on CRAYs! */
- : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
- int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
- int noff;
- regnode *n = scan;
-
- /* Skip NOTHING and LONGJMP. */
- while ((n = regnext(n))
- && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
- || ((OP(n) == LONGJMP) && (noff = ARG(n))))
- && off + noff < max)
- off += noff;
- if (reg_off_by_arg[OP(scan)])
- ARG(scan) = off;
- else
- NEXT_OFF(scan) = off;
- }
+ /* Follow the next-chain of the current node and optimize
+ away all the NOTHINGs from it. */
+ if (OP(scan) != CURLYX) {
+ const int max = (reg_off_by_arg[OP(scan)]
+ ? I32_MAX
+ /* I32 may be smaller than U16 on CRAYs! */
+ : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
+ int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
+ int noff;
+ regnode *n = scan;
+
+ /* Skip NOTHING and LONGJMP. */
+ while ( (n = regnext(n))
+ && ( (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
+ || ((OP(n) == LONGJMP) && (noff = ARG(n))))
+ && off + noff < max)
+ off += noff;
+ if (reg_off_by_arg[OP(scan)])
+ ARG(scan) = off;
+ else
+ NEXT_OFF(scan) = off;
+ }
- /* The principal pseudo-switch. Cannot be a switch, since we
- look into several different things. */
+ /* The principal pseudo-switch. Cannot be a switch, since we look into
+ * several different things. */
if ( OP(scan) == DEFINEP ) {
SSize_t minlen = 0;
SSize_t deltanext = 0;
----------------+-----------
NOTHING | NOTHING
EXACT | EXACT
- EXACT_ONLY8 | EXACT
+ EXACT_REQ8 | EXACT
EXACTFU | EXACTFU
- EXACTFU_ONLY8 | EXACTFU
+ EXACTFU_REQ8 | EXACTFU
EXACTFUP | EXACTFU
EXACTFAA | EXACTFAA
EXACTL | EXACTL
*/
#define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
? NOTHING \
- : ( EXACT == (X) || EXACT_ONLY8 == (X) ) \
+ : ( EXACT == (X) || EXACT_REQ8 == (X) ) \
? EXACT \
: ( EXACTFU == (X) \
- || EXACTFU_ONLY8 == (X) \
+ || EXACTFU_REQ8 == (X) \
|| EXACTFUP == (X) ) \
? EXACTFU \
: ( EXACTFAA == (X) ) \
}
}
else if ( OP(scan) == EXACT
- || OP(scan) == EXACT_ONLY8
+ || OP(scan) == LEXACT
+ || OP(scan) == EXACT_REQ8
+ || OP(scan) == LEXACT_REQ8
|| OP(scan) == EXACTL)
{
SSize_t l = STR_LEN(scan);
}
if (flags & SCF_DO_STCLASS) {
- SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
+ SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
assert(EXACTF_invlist);
if (flags & SCF_DO_STCLASS_AND) {
if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
next = NEXTOPER(scan);
if ( OP(next) == EXACT
- || OP(next) == EXACT_ONLY8
+ || OP(next) == LEXACT
+ || OP(next) == EXACT_REQ8
+ || OP(next) == LEXACT_REQ8
|| OP(next) == EXACTL
|| (flags & SCF_DO_STCLASS))
{
case ANYOFL:
case ANYOFPOSIXL:
case ANYOFH:
+ case ANYOFHb:
+ case ANYOFHr:
case ANYOF:
if (flags & SCF_DO_STCLASS_AND)
ssc_and(pRExC_state, data->start_class,
}
#endif
}
-
else if (OP(scan) == OPEN) {
if (stopparen != (I32)ARG(scan))
pars++;
const char* name;
name = get_regex_charset_name(RExC_rx->extflags, &len);
- if strEQ(name, DEPENDS_PAT_MODS) { /* /d under UTF-8 => /u */
+ if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
assert(RExC_utf8);
name = UNICODE_PAT_MODS;
len = sizeof(UNICODE_PAT_MODS) - 1;
&& memEQ(RX_PRECOMP(old_re), exp, plen)
&& !runtime_code /* with runtime code, always recompile */ )
{
+ DEBUG_COMPILE_r({
+ SV *dsv= sv_newmortal();
+ RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
+ Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n",
+ PL_colors[4], PL_colors[5], s);
+ });
return old_re;
}
RExC_seen = 0;
RExC_maxlen = 0;
RExC_in_lookbehind = 0;
+ RExC_in_lookahead = 0;
RExC_seen_zerolen = *exp == '^' ? -1 : 0;
-#ifdef EBCDIC
RExC_recode_x_to_native = 0;
-#endif
RExC_in_multi_char_class = 0;
RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
SetProgLen(RExC_rxi,RExC_size);
#endif
+ DEBUG_DUMP_PRE_OPTIMIZE_r({
+ SV * const sv = sv_newmortal();
+ RXi_GET_DECL(RExC_rx, ri);
+ DEBUG_RExC_seen();
+ Perl_re_printf( aTHX_ "Program before optimization:\n");
+
+ (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
+ sv, 0, 0);
+ });
+
DEBUG_OPTIMISE_r(
Perl_re_printf( aTHX_ "Starting post parse optimization\n");
);
/* Ignore EXACT as we deal with it later. */
if (PL_regkind[OP(first)] == EXACT) {
if ( OP(first) == EXACT
- || OP(first) == EXACT_ONLY8
+ || OP(first) == LEXACT
+ || OP(first) == EXACT_REQ8
+ || OP(first) == LEXACT_REQ8
|| OP(first) == EXACTL)
{
NOOP; /* Empty, get anchored substr later. */
&& nop == END)
RExC_rx->extflags |= RXf_WHITE;
else if ( RExC_rx->extflags & RXf_SPLIT
- && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
+ && ( fop == EXACT || fop == LEXACT
+ || fop == EXACT_REQ8 || fop == LEXACT_REQ8
+ || fop == EXACTL)
&& STR_LEN(first) == 1
&& *(STRING(first)) == ' '
&& nop == END )
return zero_addr + *offset;
}
-PERL_STATIC_INLINE void
-S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
-{
- /* Sets the current number of elements stored in the inversion list.
- * Updates SvCUR correspondingly */
- PERL_UNUSED_CONTEXT;
- PERL_ARGS_ASSERT_INVLIST_SET_LEN;
-
- assert(is_invlist(invlist));
-
- SvCUR_set(invlist,
- (len == 0)
- ? 0
- : TO_INTERNAL_SIZE(len + offset));
- assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
-}
-
STATIC void
S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
{
invlist_iterfinish(invlist);
*get_invlist_previous_index_addr(invlist) = 0;
+ SvPOK_on(invlist); /* This allows B to extract the PV */
}
SV*
invlist_iterfinish(invlist);
SvREADONLY_on(invlist);
+ SvPOK_on(invlist);
return invlist;
}
STATIC void
-S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
-{
- /* Grow the maximum size of an inversion list */
-
- PERL_ARGS_ASSERT_INVLIST_EXTEND;
-
- assert(is_invlist(invlist));
-
- /* Add one to account for the zero element at the beginning which may not
- * be counted by the calling parameters */
- SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
-}
-
-STATIC void
S__append_range_to_invlist(pTHX_ SV* const invlist,
const UV start, const UV end)
{
#endif
-PERL_STATIC_INLINE SV*
-S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
- return _add_range_to_invlist(invlist, cp, cp);
-}
-
#ifndef PERL_IN_XSUB_RE
void
Perl__invlist_invert(pTHX_ SV* const invlist)
#endif
-PERL_STATIC_INLINE STRLEN*
-S_get_invlist_iter_addr(SV* invlist)
-{
- /* Return the address of the UV that contains the current iteration
- * position */
-
- PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
-
- assert(is_invlist(invlist));
-
- return &(((XINVLIST*) SvANY(invlist))->iterator);
-}
-
-PERL_STATIC_INLINE void
-S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
-{
- PERL_ARGS_ASSERT_INVLIST_ITERINIT;
-
- *get_invlist_iter_addr(invlist) = 0;
-}
-
-PERL_STATIC_INLINE void
-S_invlist_iterfinish(SV* invlist)
-{
- /* Terminate iterator for invlist. This is to catch development errors.
- * Any iteration that is interrupted before completed should call this
- * function. Functions that add code points anywhere else but to the end
- * of an inversion list assert that they are not in the middle of an
- * iteration. If they were, the addition would make the iteration
- * problematical: if the iteration hadn't reached the place where things
- * were being added, it would be ok */
-
- PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
-
- *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
-}
-
-STATIC bool
-S_invlist_iternext(SV* invlist, UV* start, UV* end)
-{
- /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
- * This call sets in <*start> and <*end>, the next range in <invlist>.
- * Returns <TRUE> if successful and the next call will return the next
- * range; <FALSE> if was already at the end of the list. If the latter,
- * <*start> and <*end> are unchanged, and the next call to this function
- * will start over at the beginning of the list */
-
- STRLEN* pos = get_invlist_iter_addr(invlist);
- UV len = _invlist_len(invlist);
- UV *array;
-
- PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
-
- if (*pos >= len) {
- *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
- return FALSE;
- }
-
- array = invlist_array(invlist);
-
- *start = array[(*pos)++];
-
- if (*pos >= len) {
- *end = UV_MAX;
- }
- else {
- *end = array[(*pos)++] - 1;
- }
-
- return TRUE;
-}
-
-PERL_STATIC_INLINE UV
-S_invlist_highest(SV* const invlist)
-{
- /* Returns the highest code point that matches an inversion list. This API
- * has an ambiguity, as it returns 0 under either the highest is actually
- * 0, or if the list is empty. If this distinction matters to you, check
- * for emptiness before calling this function */
-
- UV len = _invlist_len(invlist);
- UV *array;
-
- PERL_ARGS_ASSERT_INVLIST_HIGHEST;
-
- if (len == 0) {
- return 0;
- }
-
- array = invlist_array(invlist);
-
- /* The last element in the array in the inversion list always starts a
- * range that goes to infinity. That range may be for code points that are
- * matched in the inversion list, or it may be for ones that aren't
- * matched. In the latter case, the highest code point in the set is one
- * less than the beginning of this range; otherwise it is the final element
- * of this range: infinity */
- return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
- ? UV_MAX
- : array[len - 1] - 1;
-}
-
STATIC SV *
S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
{
* call SvREFCNT_dec() when done with it.
*/
STATIC SV*
-S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
+S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
{
dVAR;
const U8 * s = (U8*)STRING(node);
/* Start out big enough for 2 separate code points */
SV* invlist = _new_invlist(4);
- PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
+ PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
if (! UTF) {
uc = *s;
RExC_sawback = 1;
ret = reganode(pRExC_state,
((! FOLD)
- ? NREF
+ ? REFN
: (ASCII_FOLD_RESTRICTED)
- ? NREFFA
+ ? REFFAN
: (AT_LEAST_UNI_SEMANTICS)
- ? NREFFU
+ ? REFFUN
: (LOC)
- ? NREFFL
- : NREFF),
+ ? REFFLN
+ : REFFN),
num);
*flagp |= HASWIDTH;
PERL_ARGS_ASSERT_REG;
DEBUG_PARSE("reg ");
-
max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
assert(max_open);
if (!SvIOK(max_open)) {
*flagp = 0; /* Tentatively. */
+ if (RExC_in_lookbehind) {
+ RExC_in_lookbehind++;
+ }
+ if (RExC_in_lookahead) {
+ RExC_in_lookahead++;
+ }
+
/* Having this true makes it feasible to have a lot fewer tests for the
* parse pointer being in scope. For example, we can write
* while(isFOO(*RExC_parse)) RExC_parse++;
goto parse_rest;
}
- /* By doing this here, we avoid extra warnings for nested
- * script runs */
- ckWARNexperimental(RExC_parse,
- WARN_EXPERIMENTAL__SCRIPT_RUN,
- "The script_run feature is experimental");
-
if (paren == 's') {
/* Here, we're starting a new regular script run */
ret = reg_node(pRExC_state, SROPEN);
return 0;
}
- REGTAIL(pRExC_state, ret, atomic);
+ if (! REGTAIL(pRExC_state, ret, atomic)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
- REGTAIL(pRExC_state, atomic,
- reg_node(pRExC_state, SRCLOSE));
+ if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
+ SRCLOSE)))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
RExC_in_script_run = 0;
return ret;
/*FALLTHROUGH*/
alpha_assertions:
- ckWARNexperimental(RExC_parse,
- WARN_EXPERIMENTAL__ALPHA_ASSERTIONS,
- "The alpha_assertions feature is experimental");
RExC_seen_zerolen++;
if (RExC_parse >= RExC_end) {
vFAIL("Sequence (?... not terminated");
}
-
- /* FALLTHROUGH */
+ RExC_seen_zerolen++;
+ break;
case '=': /* (?=...) */
RExC_seen_zerolen++;
+ RExC_in_lookahead++;
break;
case '!': /* (?!...) */
RExC_seen_zerolen++;
RExC_flags & RXf_PMf_COMPILETIME
);
FLAGS(REGNODE_p(ret)) = 2;
- REGTAIL(pRExC_state, ret, eval);
+ if (! REGTAIL(pRExC_state, ret, eval)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
/* deal with the length of this later - MJD */
return ret;
}
tail = reg(pRExC_state, 1, &flag, depth+1);
RETURN_FAIL_ON_RESTART(flag, flagp);
- REGTAIL(pRExC_state, ret, tail);
+ if (! REGTAIL(pRExC_state, ret, tail)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
goto insert_if;
}
else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
RExC_rxi->data->data[num]=(void*)sv_dat;
SvREFCNT_inc_simple_void_NN(sv_dat);
}
- ret = reganode(pRExC_state, NGROUPP, num);
+ ret = reganode(pRExC_state, GROUPPN, num);
goto insert_if_check_paren;
}
else if (memBEGINs(RExC_parse,
}
nextchar(pRExC_state);
insert_if:
- REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
+ if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
+ IFTHEN, 0)))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
br = regbranch(pRExC_state, &flags, 1, depth+1);
if (br == 0) {
RETURN_FAIL_ON_RESTART(flags,flagp);
FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
(UV) flags);
} else
- REGTAIL(pRExC_state, br, reganode(pRExC_state,
- LONGJMP, 0));
+ if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
+ LONGJMP, 0)))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
c = UCHARAT(RExC_parse);
nextchar(pRExC_state);
if (flags&HASWIDTH)
FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
(UV) flags);
}
- REGTAIL(pRExC_state, ret, lastbr);
+ if (! REGTAIL(pRExC_state, ret, lastbr)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
if (flags&HASWIDTH)
*flagp |= HASWIDTH;
c = UCHARAT(RExC_parse);
vFAIL("Switch (?(condition)... contains too many branches");
}
ender = reg_node(pRExC_state, TAIL);
- REGTAIL(pRExC_state, br, ender);
+ if (! REGTAIL(pRExC_state, br, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
if (lastbr) {
- REGTAIL(pRExC_state, lastbr, ender);
- REGTAIL(pRExC_state, REGNODE_OFFSET(
- NEXTOPER(
- NEXTOPER(REGNODE_p(lastbr)))),
- ender);
+ if (! REGTAIL(pRExC_state, lastbr, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
+ if (! REGTAIL(pRExC_state,
+ REGNODE_OFFSET(
+ NEXTOPER(
+ NEXTOPER(REGNODE_p(lastbr)))),
+ ender))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
}
else
- REGTAIL(pRExC_state, ret, ender);
+ if (! REGTAIL(pRExC_state, ret, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
#if 0 /* Removing this doesn't cause failures in the test suite -- khw */
RExC_size++; /* XXX WHY do we need this?!!
For large programs it seems to be required
goto parse_rest;
} /* end switch */
}
- else {
- if (*RExC_parse == '{') {
- ckWARNregdep(RExC_parse + 1,
- "Unescaped left brace in regex is "
- "deprecated here (and will be fatal "
- "in Perl 5.32), passed through");
- }
- /* Not bothering to indent here, as the above 'else' is temporary
- * */
- if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
+ else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
capturing_parens:
parno = RExC_npar;
RExC_npar++;
/* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
paren = ':';
ret = 0;
- }
}
}
else /* ! paren */
*flagp |= flags&SIMPLE;
}
if (is_open) { /* Starts with OPEN. */
- REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
+ if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
}
else if (paren != '?') /* Not Conditional */
ret = br;
lastbr = br;
while (*RExC_parse == '|') {
if (RExC_use_BRANCHJ) {
+ bool shut_gcc_up;
+
ender = reganode(pRExC_state, LONGJMP, 0);
/* Append to the previous. */
- REGTAIL(pRExC_state,
- REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
- ender);
+ shut_gcc_up = REGTAIL(pRExC_state,
+ REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
+ ender);
+ PERL_UNUSED_VAR(shut_gcc_up);
}
nextchar(pRExC_state);
if (freeze_paren) {
is_nothing= 0;
}
else if (op == BRANCHJ) {
- REGTAIL_STUDY(pRExC_state,
- REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
- ender);
+ bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
+ REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
+ ender);
+ PERL_UNUSED_VAR(shut_gcc_up);
/* for now we always disable this optimisation * /
if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
|| regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
if (RExC_in_lookbehind) {
RExC_in_lookbehind--;
}
+ if (RExC_in_lookahead) {
+ RExC_in_lookahead--;
+ }
if (after_freeze > RExC_npar)
RExC_npar = after_freeze;
return(ret);
const regnode_offset w = reg_node(pRExC_state, WHILEM);
FLAGS(REGNODE_p(w)) = 0;
- REGTAIL(pRExC_state, ret, w);
+ if (! REGTAIL(pRExC_state, ret, w)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
if (RExC_use_BRANCHJ) {
reginsert(pRExC_state, LONGJMP, ret, depth+1);
reginsert(pRExC_state, NOTHING, ret, depth+1);
if (RExC_use_BRANCHJ)
NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
LONGJMP. */
- REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
+ if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
+ NOTHING)))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
RExC_whilem_seen++;
MARK_NAUGHTY_EXP(1, 4); /* compound interest */
}
if (*RExC_parse == '?') {
nextchar(pRExC_state);
reginsert(pRExC_state, MINMOD, ret, depth+1);
- REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
+ if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
}
else if (*RExC_parse == '+') {
regnode_offset ender;
nextchar(pRExC_state);
ender = reg_node(pRExC_state, SUCCEED);
- REGTAIL(pRExC_state, ret, ender);
+ if (! REGTAIL(pRExC_state, ret, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
reginsert(pRExC_state, SUSPEND, ret, depth+1);
ender = reg_node(pRExC_state, TAIL);
- REGTAIL(pRExC_state, ret, ender);
+ if (! REGTAIL(pRExC_state, ret, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
}
if (ISMULT2(RExC_parse)) {
sv_catsv(substitute_parse, value_sv);
sv_catpv(substitute_parse, ")");
-#ifdef EBCDIC
/* The value should already be native, so no need to convert on EBCDIC
* platforms.*/
assert(! RExC_recode_x_to_native);
-#endif
}
else { /* \N{U+...} */
sv_catpvs(substitute_parse, ")");
-#ifdef EBCDIC
/* The values are Unicode, and therefore have to be converted to native
* on a non-Unicode (meaning non-ASCII) platform. */
- RExC_recode_x_to_native = 1;
-#endif
-
+ SET_recode_x_to_native(1);
}
/* Here, we have the string the name evaluates to, ready to be parsed,
RExC_start = save_start;
RExC_parse = endbrace;
RExC_end = orig_end;
-#ifdef EBCDIC
- RExC_recode_x_to_native = 0;
-#endif
+ SET_recode_x_to_native(0);
SvREFCNT_dec_NN(substitute_parse);
char *parse_start;
U8 op;
int invert = 0;
- U8 arg;
GET_RE_DEBUG_FLAGS_DECL;
*flagp |= SIMPLE;
goto finish_meta_pat;
case 'K':
- RExC_seen_zerolen++;
- ret = reg_node(pRExC_state, KEEPS);
- *flagp |= SIMPLE;
- /* XXX:dmq : disabling in-place substitution seems to
- * be necessary here to avoid cases of memory corruption, as
- * with: C<$_="x" x 80; s/x\K/y/> -- rgs
- */
- RExC_seen |= REG_LOOKBEHIND_SEEN;
- goto finish_meta_pat;
+ if (!RExC_in_lookbehind && !RExC_in_lookahead) {
+ RExC_seen_zerolen++;
+ ret = reg_node(pRExC_state, KEEPS);
+ *flagp |= SIMPLE;
+ /* XXX:dmq : disabling in-place substitution seems to
+ * be necessary here to avoid cases of memory corruption, as
+ * with: C<$_="x" x 80; s/x\K/y/> -- rgs
+ */
+ RExC_seen |= REG_LOOKBEHIND_SEEN;
+ goto finish_meta_pat;
+ }
+ else {
+ ++RExC_parse; /* advance past the 'K' */
+ vFAIL("\\K not permitted in lookahead/lookbehind");
+ }
case 'Z':
ret = reg_node(pRExC_state, SEOL);
*flagp |= SIMPLE;
*flagp |= HASWIDTH;
goto finish_meta_pat;
- case 'W':
- invert = 1;
- /* FALLTHROUGH */
- case 'w':
- arg = ANYOF_WORDCHAR;
- goto join_posix;
-
case 'B':
invert = 1;
/* FALLTHROUGH */
goto finish_meta_pat;
}
- case 'D':
- invert = 1;
- /* FALLTHROUGH */
- case 'd':
- arg = ANYOF_DIGIT;
- if (! DEPENDS_SEMANTICS) {
- goto join_posix;
- }
-
- /* \d doesn't have any matches in the upper Latin1 range, hence /d
- * is equivalent to /u. Changing to /u saves some branches at
- * runtime */
- op = POSIXU;
- goto join_posix_op_known;
-
case 'R':
ret = reg_node(pRExC_state, LNBREAK);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
- case 'H':
- invert = 1;
- /* FALLTHROUGH */
+ case 'd':
+ case 'D':
case 'h':
- arg = ANYOF_BLANK;
- op = POSIXU;
- goto join_posix_op_known;
-
- case 'V':
- invert = 1;
- /* FALLTHROUGH */
- case 'v':
- arg = ANYOF_VERTWS;
- op = POSIXU;
- goto join_posix_op_known;
-
- case 'S':
- invert = 1;
- /* FALLTHROUGH */
- case 's':
- arg = ANYOF_SPACE;
-
- join_posix:
-
- op = POSIXD + get_regex_charset(RExC_flags);
- if (op > POSIXA) { /* /aa is same as /a */
- op = POSIXA;
- }
- else if (op == POSIXL) {
- RExC_contains_locale = 1;
- }
- else if (op == POSIXD) {
- RExC_seen_d_op = TRUE;
- }
-
- join_posix_op_known:
-
- if (invert) {
- op += NPOSIXD - POSIXD;
- }
-
- ret = reg_node(pRExC_state, op);
- FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
-
- *flagp |= HASWIDTH|SIMPLE;
- /* FALLTHROUGH */
-
- finish_meta_pat:
- if ( UCHARAT(RExC_parse + 1) == '{'
- && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
- {
- RExC_parse += 2;
- vFAIL("Unescaped left brace in regex is illegal here");
- }
- nextchar(pRExC_state);
- Set_Node_Length(REGNODE_p(ret), 2); /* MJD */
- break;
+ case 'H':
case 'p':
case 'P':
+ case 's':
+ case 'S':
+ case 'v':
+ case 'V':
+ case 'w':
+ case 'W':
+ /* These all have the same meaning inside [brackets], and it knows
+ * how to do the best optimizations for them. So, pretend we found
+ * these within brackets, and let it do the work */
RExC_parse--;
ret = regclass(pRExC_state, flagp, depth+1,
FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
(UV) *flagp);
- RExC_parse--;
+ RExC_parse--; /* regclass() leaves this one too far ahead */
+ finish_meta_pat:
+ /* The escapes above that don't take a parameter can't be
+ * followed by a '{'. But 'pX', 'p{foo}' and
+ * correspondingly 'P' can be */
+ if ( RExC_parse - parse_start == 1
+ && UCHARAT(RExC_parse + 1) == '{'
+ && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
+ {
+ RExC_parse += 2;
+ vFAIL("Unescaped left brace in regex is illegal here");
+ }
Set_Node_Offset(REGNODE_p(ret), parse_start);
- Set_Node_Cur_Length(REGNODE_p(ret), parse_start - 2);
+ Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
nextchar(pRExC_state);
break;
case 'N':
&& num >= RExC_npar
/* cannot be an octal escape if it starts with 8 */
&& *RExC_parse != '8'
- /* cannot be an octal escape it it starts with 9 */
+ /* cannot be an octal escape if it starts with 9 */
&& *RExC_parse != '9'
) {
/* Probably not meant to be a backref, instead likely
UV ender = 0;
char *p;
char *s;
-
-/* This allows us to fill a node with just enough spare so that if the final
- * character folds, its expansion is guaranteed to fit */
-#define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
-
char *s0;
- U8 upper_parse = MAX_NODE_STRING_SIZE;
+ U32 max_string_len = 255;
+
+ /* We may have to reparse the node, artificially stopping filling
+ * it early, based on info gleaned in the first parse. This
+ * variable gives where we stop. Make it above the normal stopping
+ * place first time through; otherwise it would stop too early */
+ U32 upper_fill = max_string_len + 1;
/* We start out as an EXACT node, even if under /i, until we find a
* character which is in a fold. The algorithm now segregates into
U8 node_type = EXACT;
/* Assume the node will be fully used; the excess is given back at
- * the end. We can't make any other length assumptions, as a byte
- * input sequence could shrink down. */
- Ptrdiff_t initial_size = STR_SZ(256);
+ * the end. Under /i, leave enough extra room so that we won't
+ * overflow the buffer when we fold a character which would end up
+ * overflowing the node. We can't make any other length
+ * assumptions, as a byte input sequence could shrink down. */
+ Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
+ + ((! FOLD)
+ ? 0
+ : 1 * ((UTF)
+ ? UTF8_MAXBYTES_CASE
+ /* Max non-UTF-8 expansion is 2 */ : 2)));
bool next_is_quantifier;
char * oldp = NULL;
+ char * old_oldp = NULL;
/* We can convert EXACTF nodes to EXACTFU if they contain only
* characters that match identically regardless of the target
/* So is the MICRO SIGN */
bool has_micro_sign = FALSE;
+ /* Set when we fill up the current node and there is still more
+ * text to process */
+ bool overflowed;
+
/* Allocate an EXACT node. The node_type may change below to
* another EXACTish node, but since the size of the node doesn't
* change, it works */
- ret = regnode_guts(pRExC_state, node_type, initial_size, "exact");
+ ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
+ "exact");
FILL_NODE(ret, node_type);
RExC_emit++;
reparse:
+ p = RExC_parse;
+ len = 0;
+ s = s0;
+ node_type = EXACT;
+ oldp = NULL;
+ maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
+ maybe_SIMPLE = SIMPLE;
+ requires_utf8_target = FALSE;
+ has_ss = FALSE;
+ has_micro_sign = FALSE;
+
+ continue_parse:
+
/* This breaks under rare circumstances. If folding, we do not
* want to split a node at a character that is a non-final in a
* multi-char fold, as an input string could just happen to want to
|| UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
|| UTF8_IS_START(UCHARAT(RExC_parse)));
+ overflowed = FALSE;
+
/* Here, we have a literal character. Find the maximal string of
* them in the input that we can fit into a single EXACTish node.
* We quit at the first non-literal or when the node gets full, or
* under /i the categorization of folding/non-folding character
* changes */
- for (p = RExC_parse; len < upper_parse && p < RExC_end; ) {
+ while (p < RExC_end && len < upper_fill) {
/* In most cases each iteration adds one byte to the output.
* The exceptions override this */
Size_t added_len = 1;
+ old_oldp = oldp;
oldp = p;
/* White space has already been ignored */
UPDATE_WARNINGS_LOC(p - 1);
ender = result;
- if (ender < 0x100) {
#ifdef EBCDIC
+ if (ender < 0x100) {
if (RExC_recode_x_to_native) {
ender = LATIN1_TO_NATIVE(ender);
}
-#endif
}
+#endif
break;
}
case 'c':
/* Ready to add 'ender' to the node */
if (! FOLD) { /* The simple case, just append the literal */
+ not_fold_common:
- not_fold_common:
- if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
- *(s++) = (char) ender;
- }
- else {
- U8 * new_s = uvchr_to_utf8((U8*)s, ender);
- added_len = (char *) new_s - s;
- s = (char *) new_s;
+ /* Don't output if it would overflow */
+ if (UNLIKELY(len > max_string_len - ((UTF)
+ ? UVCHR_SKIP(ender)
+ : 1)))
+ {
+ overflowed = TRUE;
+ break;
+ }
- if (ender > 255) {
- requires_utf8_target = TRUE;
- }
+ if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
+ *(s++) = (char) ender;
+ }
+ else {
+ U8 * new_s = uvchr_to_utf8((U8*)s, ender);
+ added_len = (char *) new_s - s;
+ s = (char *) new_s;
+
+ if (ender > 255) {
+ requires_utf8_target = TRUE;
}
+ }
}
else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
goto loopdone;
}
- if (UTF) { /* Use the folded value */
+ if (UTF) { /* Alway use the folded value for UTF-8
+ patterns */
if (UVCHR_IS_INVARIANT(ender)) {
+ if (UNLIKELY(len + 1 > max_string_len)) {
+ overflowed = TRUE;
+ break;
+ }
+
*(s)++ = (U8) toFOLD(ender);
}
else {
- ender = _to_uni_fold_flags(
+ UV folded = _to_uni_fold_flags(
ender,
- (U8 *) s,
+ (U8 *) s, /* We have allocated extra space
+ in 's' so can't run off the
+ end */
&added_len,
FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
? FOLD_FLAGS_NOMIX_ASCII
: 0));
+ if (UNLIKELY(len + added_len > max_string_len)) {
+ overflowed = TRUE;
+ break;
+ }
+
s += added_len;
- if ( ender > 255
- && LIKELY(ender != GREEK_SMALL_LETTER_MU))
+ if ( folded > 255
+ && LIKELY(folded != GREEK_SMALL_LETTER_MU))
{
/* U+B5 folds to the MU, so its possible for a
* non-UTF-8 target to match it */
}
}
}
- else {
-
- /* Here is non-UTF8. First, see if the character's
- * fold differs between /d and /u. */
- if (PL_fold[ender] != PL_fold_latin1[ender]) {
- maybe_exactfu = FALSE;
+ else { /* Here is non-UTF8. */
+
+ /* The fold will be one or (rarely) two characters.
+ * Check that there's room for at least a single one
+ * before setting any flags, etc. Because otherwise an
+ * overflowing character could cause a flag to be set
+ * even though it doesn't end up in this node. (For
+ * the two character fold, we check again, before
+ * setting any flags) */
+ if (UNLIKELY(len + 1 > max_string_len)) {
+ overflowed = TRUE;
+ break;
}
#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
|| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
|| UNICODE_DOT_DOT_VERSION > 0)
- /* On non-ancient Unicode versions, this includes the
- * multi-char fold SHARP S to 'ss' */
-
- if ( UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)
- || ( isALPHA_FOLD_EQ(ender, 's')
- && len > 0
- && isALPHA_FOLD_EQ(*(s-1), 's')))
- {
- /* Here, we have one of the following:
- * a) a SHARP S. This folds to 'ss' only under
- * /u rules. If we are in that situation,
- * fold the SHARP S to 'ss'. See the comments
- * for join_exact() as to why we fold this
- * non-UTF at compile time, and no others.
- * b) 'ss'. When under /u, there's nothing
- * special needed to be done here. The
- * previous iteration handled the first 's',
- * and this iteration will handle the second.
- * If, on the otherhand it's not /u, we have
- * to exclude the possibility of moving to /u,
- * so that we won't generate an unwanted
- * match, unless, at runtime, the target
- * string is in UTF-8.
- * */
+ /* On non-ancient Unicodes, check for the only possible
+ * multi-char fold */
+ if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
+ /* This potential multi-char fold means the node
+ * can't be simple (because it could match more
+ * than a single char). And in some cases it will
+ * match 'ss', so set that flag */
+ maybe_SIMPLE = 0;
has_ss = TRUE;
- maybe_exactfu = FALSE; /* Can't generate an
- EXACTFU node (unless we
- already are in one) */
- if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
- maybe_SIMPLE = 0;
- if (node_type == EXACTFU) {
- *(s++) = 's';
-
- /* Let the code below add in the extra 's' */
- ender = 's';
- added_len = 2;
+
+ /* It can't change to be an EXACTFU (unless already
+ * is one). We fold it iff under /u rules. */
+ if (node_type != EXACTFU) {
+ maybe_exactfu = FALSE;
+ }
+ else {
+ if (UNLIKELY(len + 2 > max_string_len)) {
+ overflowed = TRUE;
+ break;
}
+
+ *(s++) = 's';
+ *(s++) = 's';
+ added_len = 2;
+
+ goto done_with_this_char;
}
}
+ else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
+ && LIKELY(len > 0)
+ && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
+ {
+ /* Also, the sequence 'ss' is special when not
+ * under /u. If the target string is UTF-8, it
+ * should match SHARP S; otherwise it won't. So,
+ * here we have to exclude the possibility of this
+ * node moving to /u.*/
+ has_ss = TRUE;
+ maybe_exactfu = FALSE;
+ }
#endif
+ /* Here, the fold will be a single character */
- else if (UNLIKELY(ender == MICRO_SIGN)) {
+ if (UNLIKELY(ender == MICRO_SIGN)) {
has_micro_sign = TRUE;
}
+ else if (PL_fold[ender] != PL_fold_latin1[ender]) {
+
+ /* If the character's fold differs between /d and
+ * /u, this can't change to be an EXACTFU node */
+ maybe_exactfu = FALSE;
+ }
*(s++) = (DEPENDS_SEMANTICS)
? (char) toFOLD(ender)
}
} /* End of adding current character to the node */
+ done_with_this_char:
+
len += added_len;
if (next_is_quantifier) {
} /* End of loop through literal characters */
- /* Here we have either exhausted the input or ran out of room in
- * the node. (If we encountered a character that can't be in the
- * node, transfer is made directly to <loopdone>, and so we
- * wouldn't have fallen off the end of the loop.) In the latter
- * case, we artificially have to split the node into two, because
- * we just don't have enough space to hold everything. This
- * creates a problem if the final character participates in a
- * multi-character fold in the non-final position, as a match that
- * should have occurred won't, due to the way nodes are matched,
- * and our artificial boundary. So back off until we find a non-
- * problematic character -- one that isn't at the beginning or
- * middle of such a fold. (Either it doesn't participate in any
- * folds, or appears only in the final position of all the folds it
- * does participate in.) A better solution with far fewer false
- * positives, and that would fill the nodes more completely, would
- * be to actually have available all the multi-character folds to
- * test against, and to back-off only far enough to be sure that
- * this node isn't ending with a partial one. <upper_parse> is set
- * further below (if we need to reparse the node) to include just
- * up through that final non-problematic character that this code
- * identifies, so when it is set to less than the full node, we can
- * skip the rest of this */
- if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
- PERL_UINT_FAST8_T backup_count = 0;
-
- const STRLEN full_len = len;
-
- assert(len >= MAX_NODE_STRING_SIZE);
-
- /* Here, <s> points to just beyond where we have output the
- * final character of the node. Look backwards through the
- * string until find a non- problematic character */
-
- if (! UTF) {
-
- /* This has no multi-char folds to non-UTF characters */
- if (ASCII_FOLD_RESTRICTED) {
- goto loopdone;
- }
+ /* Here we have either exhausted the input or run out of room in
+ * the node. If the former, we are done. (If we encountered a
+ * character that can't be in the node, transfer is made directly
+ * to <loopdone>, and so we wouldn't have fallen off the end of the
+ * loop.) */
+ if (LIKELY(! overflowed)) {
+ goto loopdone;
+ }
+
+ /* Here we have run out of room. We can grow plain EXACT and
+ * LEXACT nodes. If the pattern is gigantic enough, though,
+ * eventually we'll have to artificially chunk the pattern into
+ * multiple nodes. */
+ if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
+ Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
+ Size_t overhead_expansion = 0;
+ char temp[256];
+ Size_t max_nodes_for_string;
+ Size_t achievable;
+ SSize_t delta;
+
+ /* Here we couldn't fit the final character in the current
+ * node, so it will have to be reparsed, no matter what else we
+ * do */
+ p = oldp;
+
+ /* If would have overflowed a regular EXACT node, switch
+ * instead to an LEXACT. The code below is structured so that
+ * the actual growing code is common to changing from an EXACT
+ * or just increasing the LEXACT size. This means that we have
+ * to save the string in the EXACT case before growing, and
+ * then copy it afterwards to its new location */
+ if (node_type == EXACT) {
+ overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
+ RExC_emit += overhead_expansion;
+ Copy(s0, temp, len, char);
+ }
+
+ /* Ready to grow. If it was a plain EXACT, the string was
+ * saved, and the first few bytes of it overwritten by adding
+ * an argument field. We assume, as we do elsewhere in this
+ * file, that one byte of remaining input will translate into
+ * one byte of output, and if that's too small, we grow again,
+ * if too large the excess memory is freed at the end */
+
+ max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
+ achievable = MIN(max_nodes_for_string,
+ current_string_nodes + STR_SZ(RExC_end - p));
+ delta = achievable - current_string_nodes;
+
+ /* If there is just no more room, go finish up this chunk of
+ * the pattern. */
+ if (delta <= 0) {
+ goto loopdone;
+ }
- while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) {
- backup_count++;
- }
- len = s - s0 + 1;
- }
+ change_engine_size(pRExC_state, delta + overhead_expansion);
+ current_string_nodes += delta;
+ max_string_len
+ = sizeof(struct regnode) * current_string_nodes;
+ upper_fill = max_string_len + 1;
+
+ /* If the length was small, we know this was originally an
+ * EXACT node now converted to LEXACT, and the string has to be
+ * restored. Otherwise the string was untouched. 260 is just
+ * a number safely above 255 so don't have to worry about
+ * getting it precise */
+ if (len < 260) {
+ node_type = LEXACT;
+ FILL_NODE(ret, node_type);
+ s0 = STRING(REGNODE_p(ret));
+ Copy(temp, s0, len, char);
+ s = s0 + len;
+ }
+
+ goto continue_parse;
+ }
+ else if (! LOC) { /* XXX shouldn't /l assume could be a UTF-8
+ locale, and prepare for that? */
+
+ /* Here is /i. Running out of room creates a problem if we are
+ * folding, and the split happens in the middle of a
+ * multi-character fold, as a match that should have occurred,
+ * won't, due to the way nodes are matched, and our artificial
+ * boundary. So back off until we aren't splitting such a
+ * fold. If there is no such place to back off to, we end up
+ * taking the entire node as-is. This can happen if the node
+ * consists entirely of 'f' or entirely of 's' characters (or
+ * things that fold to them) as 'ff' and 'ss' are
+ * multi-character folds.
+ *
+ * At this point:
+ * old_oldp points to the beginning in the input of the
+ * penultimate character in the node.
+ * oldp points to the beginning in the input of the
+ * final character in the node.
+ * p points to the beginning in the input of the
+ * next character in the input, the one that won't
+ * fit in the node.
+ *
+ * We aren't in the middle of a multi-char fold unless the
+ * final character in the node can appear in a non-final
+ * position in such a fold. Very few characters actually
+ * participate in multi-character folds, and fewer still can be
+ * in the non-final position. But it's complicated to know
+ * here if that final character is folded or not, so skip this
+ * check */
+
+ /* Make sure enough space for final char of node,
+ * first char of following node, and the fold of the
+ * following char (so we don't have to worry about
+ * that fold running off the end */
+ U8 foldbuf[UTF8_MAXBYTES_CASE * 5 + 1];
+ STRLEN fold_len;
+ UV folded;
+ char * const sav_oldp = oldp;
+
+ assert(FOLD);
+
+ /* The Unicode standard says that multi character folds consist
+ * of either two or three characters. So we create a buffer
+ * containing a window of three. The first is the final
+ * character in the node (folded), and then the two that begin
+ * the following node. But if the first character of the
+ * following node can't be in a non-final fold position, there
+ * is no need to look at its successor character. The macros
+ * used below to check for multi character folds require folded
+ * inputs, so we have to fold these. (The fold of p was likely
+ * calculated in the loop above, but it hasn't beeen saved, and
+ * khw thinks it would be too entangled to change to do so) */
+
+ if (UTF || LIKELY(UCHARAT(p) != MICRO_SIGN)) {
+ folded = _to_uni_fold_flags(ender,
+ foldbuf,
+ &fold_len,
+ FOLD_FLAGS_FULL);
+ }
else {
+ foldbuf[0] = folded = MICRO_SIGN;
+ fold_len = 1;
+ }
+
+ /* Here, foldbuf contains the fold of the first character in
+ * the next node. We may also need the next one (if there is
+ * one) to get our third, but if the first character folded to
+ * more than one, those extra one(s) will serve as the third.
+ * Also, we don't need a third unless the previous one can
+ * appear in a non-final position in a fold */
+ if ( ((RExC_end - p) > ((UTF) ? UVCHR_SKIP(ender) : 1))
+ && (fold_len == 1 || ( UTF
+ && UVCHR_SKIP(folded) == fold_len))
+ && UNLIKELY(_invlist_contains_cp(PL_NonFinalFold, folded)))
+ {
+ if (UTF) {
+ STRLEN next_fold_len;
- /* Point to the first byte of the final character */
- s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+ toFOLD_utf8_safe((U8*) p + UTF8SKIP(p),
+ (U8*) RExC_end, foldbuf + fold_len,
+ &next_fold_len);
+ fold_len += next_fold_len;
+ }
+ else {
+ if (UNLIKELY(p[1] == LATIN_SMALL_LETTER_SHARP_S)) {
+ foldbuf[fold_len] = 's';
+ }
+ else {
+ foldbuf[fold_len] = toLOWER_L1(p[1]);
+ }
+ fold_len++;
+ }
+ }
- while (s >= s0) { /* Search backwards until find
- a non-problematic char */
- if (UTF8_IS_INVARIANT(*s)) {
+ /* Here foldbuf contains the the fold of p, and if appropriate
+ * that of the character following p in the input. */
- /* There are no ascii characters that participate
- * in multi-char folds under /aa. In EBCDIC, the
- * non-ascii invariants are all control characters,
- * so don't ever participate in any folds. */
- if (ASCII_FOLD_RESTRICTED
- || ! IS_NON_FINAL_FOLD(*s))
- {
- break;
- }
+ /* Search backwards until find a place that doesn't split a
+ * multi-char fold */
+ while (1) {
+ STRLEN s_len;
+ char s_fold_buf[UTF8_MAXBYTES_CASE];
+ char * s_fold = s_fold_buf;
+
+ if (s <= s0) {
+
+ /* There's no safe place in the node to split. Quit so
+ * will take the whole node */
+ oldp = sav_oldp;
+ break;
+ }
+
+ /* Backup 1 character. The first time through this moves s
+ * to point to the final character in the node */
+ if (UTF) {
+ s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+ }
+ else {
+ s--;
+ }
+
+ /* 's' may or may not be folded; so make sure it is, and
+ * use just the final character in its fold (should there
+ * be more than one */
+ if (UTF) {
+ toFOLD_utf8_safe((U8*) s,
+ (U8*) s + UTF8SKIP(s),
+ (U8 *) s_fold_buf, &s_len);
+ while (s_fold + UTF8SKIP(s_fold) < s_fold_buf + s_len)
+ {
+ s_fold += UTF8SKIP(s_fold);
}
- else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
- if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
- *s, *(s+1))))
- {
+ s_len = UTF8SKIP(s_fold);
+ }
+ else {
+ if (UNLIKELY(UCHARAT(s) == LATIN_SMALL_LETTER_SHARP_S))
+ {
+ s_fold_buf[0] = 's';
+ }
+ else { /* This works for all other non-UTF-8 folds
+ */
+ s_fold_buf[0] = toLOWER_L1(UCHARAT(s));
+ }
+ s_len = 1;
+ }
+
+ /* Unshift this character to the beginning of the buffer,
+ * No longer needed trailing characters are overwritten.
+ * */
+ Move(foldbuf, foldbuf + s_len, sizeof(foldbuf) - s_len, U8);
+ Copy(s_fold, foldbuf, s_len, U8);
+
+ /* If this isn't a multi-character fold, we have found a
+ * splittable place. If this is the final character in the
+ * node, that means the node is valid as-is, and can quit.
+ * Otherwise, we note how much we can fill the node before
+ * coming to a non-splittable position, and go parse it
+ * again, stopping there. This is done because we know
+ * where in the output to stop, but we don't have a map to
+ * where that is in the input. One could be created, but
+ * it seems like overkill for such a rare event as we are
+ * dealing with here */
+ if (UTF) {
+ if (! is_MULTI_CHAR_FOLD_utf8_safe(foldbuf,
+ foldbuf + UTF8_MAXBYTES_CASE))
+ {
+ upper_fill = s + UTF8SKIP(s) - s0;
+ if (LIKELY(oldp)) {
break;
}
+ goto reparse;
}
- else if (! _invlist_contains_cp(
- PL_NonFinalFold,
- valid_utf8_to_uvchr((U8 *) s, NULL)))
- {
+ }
+ else if (! is_MULTI_CHAR_FOLD_latin1_safe(foldbuf,
+ foldbuf + UTF8_MAXBYTES_CASE))
+ {
+ upper_fill = s + 1 - s0;
+ if (LIKELY(oldp)) {
break;
}
-
- /* Here, the current character is problematic in that
- * it does occur in the non-final position of some
- * fold, so try the character before it, but have to
- * special case the very first byte in the string, so
- * we don't read outside the string */
- s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
- backup_count++;
- } /* End of loop backwards through the string */
-
- /* If there were only problematic characters in the string,
- * <s> will point to before s0, in which case the length
- * should be 0, otherwise include the length of the
- * non-problematic character just found */
- len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
- }
-
- /* Here, have found the final character, if any, that is
- * non-problematic as far as ending the node without splitting
- * it across a potential multi-char fold. <len> contains the
- * number of bytes in the node up-to and including that
- * character, or is 0 if there is no such character, meaning
- * the whole node contains only problematic characters. In
- * this case, give up and just take the node as-is. We can't
- * do any better */
- if (len == 0) {
- len = full_len;
-
- } else {
-
- /* Here, the node does contain some characters that aren't
- * problematic. If we didn't have to backup any, then the
- * final character in the node is non-problematic, and we
- * can take the node as-is */
- if (backup_count == 0) {
- goto loopdone;
+ goto reparse;
}
- else if (backup_count == 1) {
- /* If the final character is problematic, but the
- * penultimate is not, back-off that last character to
- * later start a new node with it */
- p = oldp;
- goto loopdone;
- }
+ oldp = old_oldp;
+ old_oldp = NULL;
+
+ } /* End of loop backing up through the node */
+ /* Here the node consists entirely of non-final multi-char
+ * folds. (Likely it is all 'f's or all 's's.) There's no
+ * decent place to split it, so give up and just take the
+ * whole thing */
- /* Here, the final non-problematic character is earlier
- * in the input than the penultimate character. What we do
- * is reparse from the beginning, going up only as far as
- * this final ok one, thus guaranteeing that the node ends
- * in an acceptable character. The reason we reparse is
- * that we know how far in the character is, but we don't
- * know how to correlate its position with the input parse.
- * An alternate implementation would be to build that
- * correlation as we go along during the original parse,
- * but that would entail extra work for every node, whereas
- * this code gets executed only when the string is too
- * large for the node, and the final two characters are
- * problematic, an infrequent occurrence. Yet another
- * possible strategy would be to save the tail of the
- * string, and the next time regatom is called, initialize
- * with that. The problem with this is that unless you
- * back off one more character, you won't be guaranteed
- * regatom will get called again, unless regbranch,
- * regpiece ... are also changed. If you do back off that
- * extra character, so that there is input guaranteed to
- * force calling regatom, you can't handle the case where
- * just the first character in the node is acceptable. I
- * (khw) decided to try this method which doesn't have that
- * pitfall; if performance issues are found, we can do a
- * combination of the current approach plus that one */
- upper_parse = len;
- len = 0;
- s = s0;
- goto reparse;
- }
} /* End of verifying node ends with an appropriate char */
+ p = oldp;
+
loopdone: /* Jumped to when encounters something that shouldn't be
in the node */
/* Free up any over-allocated space; cast is to silence bogus
* warning in MS VC */
change_engine_size(pRExC_state,
- - (Ptrdiff_t) (initial_size - STR_SZ(len)));
+ - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
/* I (khw) don't know if you can get here with zero length, but the
* old code handled this situation by creating a zero-length EXACT
else {
/* If the node type is EXACT here, check to see if it
- * should be EXACTL, or EXACT_ONLY8. */
+ * should be EXACTL, or EXACT_REQ8. */
if (node_type == EXACT) {
if (LOC) {
node_type = EXACTL;
}
else if (requires_utf8_target) {
- node_type = EXACT_ONLY8;
+ node_type = EXACT_REQ8;
}
- } else if (FOLD) {
+ }
+ else if (node_type == LEXACT) {
+ if (requires_utf8_target) {
+ node_type = LEXACT_REQ8;
+ }
+ }
+ else if (FOLD) {
if ( UNLIKELY(has_micro_sign || has_ss)
&& (node_type == EXACTFU || ( node_type == EXACTF
&& maybe_exactfu)))
if (maybe_exactfu) {
node_type = EXACTFLU8;
}
+ else if (UNLIKELY(
+ _invlist_contains_cp(PL_HasMultiCharFold, ender)))
+ {
+ /* A character that folds to more than one will
+ * match multiple characters, so can't be SIMPLE.
+ * We don't have to worry about this with EXACTFLU8
+ * nodes just above, as they have already been
+ * folded (since the fold doesn't vary at run
+ * time). Here, if the final character in the node
+ * folds to multiple, it can't be simple. (This
+ * only has an effect if the node has only a single
+ * character, hence the final one, as elsewhere we
+ * turn off simple for nodes whose length > 1 */
+ maybe_SIMPLE = 0;
+ }
}
else if (node_type == EXACTF) { /* Means is /di */
+ /* This intermediate variable is needed solely because
+ * the asserts in the macro where used exceed Win32's
+ * literal string capacity */
+ char first_char = * STRING(REGNODE_p(ret));
+
/* If 'maybe_exactfu' is clear, then we need to stay
* /di. If it is set, it means there are no code
* points that match differently depending on UTF8ness
if (! maybe_exactfu) {
RExC_seen_d_op = TRUE;
}
- else if ( isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's')
+ else if ( isALPHA_FOLD_EQ(first_char, 's')
|| isALPHA_FOLD_EQ(ender, 's'))
{
/* But, if the node begins or ends in an 's' we
}
if (requires_utf8_target && node_type == EXACTFU) {
- node_type = EXACTFU_ONLY8;
+ node_type = EXACTFU_REQ8;
}
}
OP(REGNODE_p(ret)) = node_type;
- STR_LEN(REGNODE_p(ret)) = len;
+ setSTR_LEN(REGNODE_p(ret), len);
RExC_emit += STR_SZ(len);
/* If the node isn't a single character, it can't be SIMPLE */
- if (len > (Size_t) ((UTF) ? UVCHR_SKIP(ender) : 1)) {
+ if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
maybe_SIMPLE = 0;
}
assert(PL_regkind[OP(node)] == ANYOF);
/* There is no bitmap for this node type */
- if (OP(node) == ANYOFH) {
+ if (inRANGE(OP(node), ANYOFH, ANYOFHr)) {
return;
}
/* Recurse, with the meat of the embedded expression */
RExC_parse++;
- (void) handle_regex_sets(pRExC_state, ¤t, flagp,
- depth+1, oregcomp_parse);
+ if (! handle_regex_sets(pRExC_state, ¤t, flagp,
+ depth+1, oregcomp_parse))
+ {
+ RETURN_FAIL_ON_RESTART(*flagp, flagp);
+ }
/* Here, 'current' contains the embedded expression's
* inversion list, and RExC_parse points to the trailing
FALSE, /* Require return to be an ANYOF */
¤t))
{
+ RETURN_FAIL_ON_RESTART(*flagp, flagp);
goto regclass_failed;
}
FALSE, /* Require return to be an ANYOF */
¤t))
{
+ RETURN_FAIL_ON_RESTART(*flagp, flagp);
goto regclass_failed;
}
RExC_flags |= RXf_PMf_FOLD;
}
- if (!node)
+ if (!node) {
+ RETURN_FAIL_ON_RESTART(*flagp, flagp);
goto regclass_failed;
+ }
/* Fix up the node type if we are in locale. (We have pretended we are
* under /u for the purposes of regclass(), as this construct will only
{
if (strict) {
RExC_parse--;
- vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
+ vFAIL("\\N{} here is restricted to one character");
}
ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
break; /* <value> contains the first code
literal
);
}
- else if isMNEMONIC_CNTRL(value) {
+ else if (isMNEMONIC_CNTRL(value)) {
vWARN4(RExC_parse,
"\"%.*s\" is more clearly written simply as \"%s\"",
(int) (RExC_parse - rangebegin),
/* Likewise for 'posixes' */
_invlist_union(posixes, cp_list, &cp_list);
+ SvREFCNT_dec(posixes);
/* Likewise for anything else in the range that matched only
* under UTF-8 */
goto not_anyof;
}
}
+
/* For well-behaved locales, some classes are subsets of others,
* so complementing the subset and including the non-complemented
* superset should match everything, like [\D[:alnum:]], and
/* Next see if can optimize classes that contain just a few code points
* into an EXACTish node. The reason to do this is to let the
- * optimizer join this node with adjacent EXACTish ones.
+ * optimizer join this node with adjacent EXACTish ones, and ANYOF
+ * nodes require conversion to code point from UTF-8.
*
* An EXACTFish node can be generated even if not under /i, and vice
* versa. But care must be taken. An EXACTFish node has to be such
op = (FOLD) ? EXACTFL : EXACTL;
}
else if (! FOLD) { /* Not /l and not /i */
- op = (start[0] < 256) ? EXACT : EXACT_ONLY8;
+ op = (start[0] < 256) ? EXACT : EXACT_REQ8;
}
else if (start[0] < 256) { /* /i, not /l, and the code point is
small */
applies to it */
op = _invlist_contains_cp(PL_InMultiCharFold,
start[0])
- ? EXACTFU_ONLY8
- : EXACT_ONLY8;
+ ? EXACTFU_REQ8
+ : EXACT_REQ8;
}
value = start[0];
? EXACTFLU8
: (ASCII_FOLD_RESTRICTED)
? EXACTFAA
- : EXACTFU_ONLY8;
+ : EXACTFU_REQ8;
value = folded;
}
} /* Below, the lowest code point < 256 */
ret = regnode_guts(pRExC_state, op, len, "exact");
FILL_NODE(ret, op);
RExC_emit += 1 + STR_SZ(len);
- STR_LEN(REGNODE_p(ret)) = len;
+ setSTR_LEN(REGNODE_p(ret), len);
if (len == 1) {
*STRING(REGNODE_p(ret)) = (U8) value;
}
if (invlist_highest(cp_list) <= max_permissible) {
UV this_start, this_end;
- UV lowest_cp = UV_MAX; /* inited to suppress compiler warn */
+ UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */
U8 bits_differing = 0;
Size_t full_cp_count = 0;
bool first_time = TRUE;
full_cp_count += this_end - this_start + 1;
}
- invlist_iterfinish(cp_list);
/* At the end of the loop, we count how many bits differ from
* the bits in lowest code point, call the count 'd'. If the
ret = reganode(pRExC_state, op, lowest_cp);
FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
}
+
+ done_anyofm:
+ invlist_iterfinish(cp_list);
}
- done_anyofm:
if (inverted) {
_invlist_invert(cp_list);
if (op != END) {
goto not_anyof;
}
+
+ /* XXX We could create an ANYOFR_LOW node here if we saved above if
+ * all were invariants, it wasn't inverted, and there is a single
+ * range. This would be faster than some of the posix nodes we
+ * create below like /\d/a, but would be twice the size. Without
+ * having actually measured the gain, khw doesn't think the
+ * tradeoff is really worth it */
}
if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
SvREFCNT_dec(intersection);
}
- /* If didn't find an optimization and there is no need for a
- * bitmap, optimize to indicate that */
+ /* If didn't find an optimization and there is no need for a bitmap,
+ * optimize to indicate that */
if ( start[0] >= NUM_ANYOF_CODE_POINTS
&& ! LOC
&& ! upper_latin1_only_utf8_matches
&& anyof_flags == 0)
{
+ U8 low_utf8[UTF8_MAXBYTES+1];
UV highest_cp = invlist_highest(cp_list);
- /* If the lowest and highest code point in the class have the same
- * UTF-8 first byte, then all do, and we can store that byte for
- * regexec.c to use so that it can more quickly scan the target
- * string for potential matches for this class. We co-opt the the
- * flags field for this. Zero means, they don't have the same
- * first byte. We do accept here very large code points (for
- * future use), but don't bother with this optimization for them,
- * as it would cause other complications */
- if (highest_cp > IV_MAX) {
- anyof_flags = 0;
- }
- else {
- U8 low_utf8[UTF8_MAXBYTES+1];
+ op = ANYOFH;
+
+ /* Currently the maximum allowed code point by the system is
+ * IV_MAX. Higher ones are reserved for future internal use. This
+ * particular regnode can be used for higher ones, but we can't
+ * calculate the code point of those. IV_MAX suffices though, as
+ * it will be a large first byte */
+ (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
+
+ /* We store the lowest possible first byte of the UTF-8
+ * representation, using the flags field. This allows for quick
+ * ruling out of some inputs without having to convert from UTF-8
+ * to code point. For EBCDIC, we use I8, as not doing that
+ * transformation would not rule out nearly so many things */
+ anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
+
+ /* If the first UTF-8 start byte for the highest code point in the
+ * range is suitably small, we may be able to get an upper bound as
+ * well */
+ if (highest_cp <= IV_MAX) {
U8 high_utf8[UTF8_MAXBYTES+1];
- (void) uvchr_to_utf8(low_utf8, start[0]);
- (void) uvchr_to_utf8(high_utf8, invlist_highest(cp_list));
+ (void) uvchr_to_utf8(high_utf8, highest_cp);
+
+ /* If the lowest and highest are the same, we can get an exact
+ * first byte instead of a just minimum. We signal this with a
+ * different regnode */
+ if (low_utf8[0] == high_utf8[0]) {
+
+ /* No need to convert to I8 for EBCDIC as this is an exact
+ * match */
+ anyof_flags = low_utf8[0];
+ op = ANYOFHb;
+ }
+ else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
+ {
+
+ /* Here, the high byte is not the same as the low, but is
+ * small enough that its reasonable to have a loose upper
+ * bound, which is packed in with the strict lower bound.
+ * See comments at the definition of MAX_ANYOF_HRx_BYTE.
+ * On EBCDIC platforms, I8 is used. On ASCII platforms I8
+ * is the same thing as UTF-8 */
- anyof_flags = (low_utf8[0] == high_utf8[0])
- ? low_utf8[0]
- : 0;
+ U8 bits = 0;
+ U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
+ U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
+ - anyof_flags;
+
+ if (range_diff <= max_range_diff / 8) {
+ bits = 3;
+ }
+ else if (range_diff <= max_range_diff / 4) {
+ bits = 2;
+ }
+ else if (range_diff <= max_range_diff / 2) {
+ bits = 1;
+ }
+ anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
+ op = ANYOFHr;
+ }
}
- op = ANYOFH;
+ goto done_finding_op;
}
} /* End of seeing if can optimize it into a different node */
is_anyof: /* It's going to be an ANYOF node. */
- if (op != ANYOFH) {
- op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
- ? ANYOFD
- : ((posixl)
- ? ANYOFPOSIXL
- : ((LOC)
- ? ANYOFL
- : ANYOF));
- }
+ op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
+ ? ANYOFD
+ : ((posixl)
+ ? ANYOFPOSIXL
+ : ((LOC)
+ ? ANYOFL
+ : ANYOF));
+
+ done_finding_op:
ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
FILL_NODE(ret, op); /* We set the argument later */
STATIC void
S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
{
- /* 'size' is the delta to add or subtract from the current memory allocated
- * to the regex engine being constructed */
+ /* 'size' is the delta number of smallest regnode equivalents to add or
+ * subtract from the current memory allocated to the regex engine being
+ * constructed. */
PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
STATIC regnode_offset
S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
{
- /* Allocate a regnode for 'op', with 'extra_size' extra space. It aligns
- * and increments RExC_size and RExC_emit
+ /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
+ * equivalents space. It aligns and increments RExC_size and RExC_emit
*
* It returns the regnode's offset into the regex engine program */
}
else {
if (val - scan > U16_MAX) {
- /* Since not all callers check the return value, populate this with
- * something that won't loop and will likely lead to a crash if
+ /* Populate this with something that won't loop and will likely
+ * lead to a crash if the caller ignores the failure return, and
* execution continues */
NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
return FALSE;
#endif
if ( exact ) {
switch (OP(REGNODE_p(scan))) {
+ case LEXACT:
case EXACT:
- case EXACT_ONLY8:
+ case LEXACT_REQ8:
+ case EXACT_REQ8:
case EXACTL:
case EXACTF:
case EXACTFU_S_EDGE:
case EXACTFAA_NO_TRIE:
case EXACTFAA:
case EXACTFU:
- case EXACTFU_ONLY8:
+ case EXACTFU_REQ8:
case EXACTFLU8:
case EXACTFUP:
case EXACTFL:
}
else {
if (val - scan > U16_MAX) {
+ /* Populate this with something that won't loop and will likely
+ * lead to a crash if the caller ignores the failure return, and
+ * execution continues */
NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
return FALSE;
}
SvPVCLEAR(sv);
- if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
- /* It would be nice to FAIL() here, but this may be called from
- regexec.c, and it would be hard to supply pRExC_state. */
- Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
- (int)OP(o), (int)REGNODE_MAX);
+ if (OP(o) > REGNODE_MAX) { /* regnode.type is unsigned */
+ if (pRExC_state) { /* This gives more info, if we have it */
+ FAIL3("panic: corrupted regexp opcode %d > %d",
+ (int)OP(o), (int)REGNODE_MAX);
+ }
+ else {
+ Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
+ (int)OP(o), (int)REGNODE_MAX);
+ }
+ }
sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
k = PL_regkind[OP(o)];
name_list= RExC_paren_name_list;
}
if (name_list) {
- if ( k != REF || (OP(o) < NREF)) {
+ if ( k != REF || (OP(o) < REFN)) {
SV **name= av_fetch(name_list, parno, 0 );
if (name)
Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
/* 2: embedded, otherwise 1 */
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
else if (k == ANYOF) {
- const U8 flags = (OP(o) == ANYOFH) ? 0 : ANYOF_FLAGS(o);
+ const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHr)
+ ? 0
+ : ANYOF_FLAGS(o);
bool do_sep = FALSE; /* Do we need to separate various components of
the output? */
/* Set if there is still an unresolved user-defined property */
/* Ready to start outputting. First, the initial left bracket */
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
- if (OP(o) != ANYOFH) {
+ if (! inRANGE(OP(o), ANYOFH, ANYOFHr)) {
/* Then all the things that could fit in the bitmap */
do_sep = put_charclass_bitmap_innards(sv,
ANYOF_BITMAP(o),
/* And finally the matching, closing ']' */
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
- if (OP(o) == ANYOFH && FLAGS(o) != 0) {
- Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x)", FLAGS(o));
+ if (inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+ U8 lowest = (OP(o) != ANYOFHr)
+ ? FLAGS(o)
+ : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
+ U8 highest = (OP(o) == ANYOFHb)
+ ? lowest
+ : OP(o) == ANYOFH
+ ? 0xFF
+ : HIGHEST_ANYOF_HRx_BYTE(FLAGS(o));
+ Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
+ if (lowest != highest) {
+ Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
+ }
+ Perl_sv_catpvf(aTHX_ sv, ")");
}
-
SvREFCNT_dec(unresolved);
}
else if (k == ANYOFM) {
/* As a final resort, output the range or subrange as hex. */
- this_end = (end < NUM_ANYOF_CODE_POINTS)
- ? end
- : NUM_ANYOF_CODE_POINTS - 1;
+ if (start >= NUM_ANYOF_CODE_POINTS) {
+ this_end = end;
+ }
+ else { /* Have to split range at the bitmap boundary */
+ this_end = (end < NUM_ANYOF_CODE_POINTS)
+ ? end
+ : NUM_ANYOF_CODE_POINTS - 1;
+ }
#if NUM_ANYOF_CODE_POINTS > 256
format = (this_end < 256)
? "\\x%02" UVXf "-\\x%02" UVXf
RESTORE_CONTEXT;
}
+STATIC SV *
+S_get_fq_name(pTHX_
+ const char * const name, /* The first non-blank in the \p{}, \P{} */
+ const Size_t name_len, /* Its length in bytes, not including any trailing space */
+ const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
+ const bool has_colon_colon
+ )
+{
+ /* Returns a mortal SV containing the fully qualified version of the input
+ * name */
+
+ SV * fq_name;
+
+ fq_name = newSVpvs_flags("", SVs_TEMP);
+
+ /* Use the current package if it wasn't included in our input */
+ if (! has_colon_colon) {
+ const HV * pkg = (IN_PERL_COMPILETIME)
+ ? PL_curstash
+ : CopSTASH(PL_curcop);
+ const char* pkgname = HvNAME(pkg);
+
+ Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
+ UTF8fARG(is_utf8, strlen(pkgname), pkgname));
+ sv_catpvs(fq_name, "::");
+ }
+
+ Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
+ UTF8fARG(is_utf8, name_len, name));
+ return fq_name;
+}
+
SV *
Perl_parse_uniprop_string(pTHX_
int slash_pos = -1; /* Where the '/' is found, or negative if none */
int table_index = 0; /* The entry number for this property in the table
of all Unicode property names */
- bool starts_with_In_or_Is = FALSE; /* ? Does the name start with 'In' or
- 'Is' */
+ bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
Size_t lookup_offset = 0; /* Used to ignore the first few characters of
the normalized name in certain situations */
Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
it is the definition. Otherwise it is a
string containing the fully qualified sub
name of 'name' */
+ SV * fq_name = NULL; /* For user-defined properties, the fully
+ qualified name */
bool invert_return = FALSE; /* ? Do we need to complement the result before
returning it */
pos_in_brackets = strchr("([<)]>)]>", open);
close = (pos_in_brackets) ? pos_in_brackets[3] : open;
- if ( name[name_len-1] != close
+ if ( i >= name_len
+ || name[name_len-1] != close
|| (escaped && name[name_len-2] != '\\'))
{
sv_catpvs(msg, "Unicode property wildcard not terminated");
Titlecase Mapping (both full and simple)
Uppercase Mapping (both full and simple)
* Move the part that looks at the property values into a perl
- * script, like utf8_heavy.pl is done. This makes things somewhat
+ * script, like utf8_heavy.pl was done. This makes things somewhat
* easier, but most importantly, it avoids always adding all these
* strings to the memory usage when the feature is little-used.
*
/* Certain properties whose values are numeric need special handling.
* They may optionally be prefixed by 'is'. Ignore that prefix for the
* purposes of checking if this is one of those properties */
- if (memBEGINPs(lookup_name, name_len, "is")) {
+ if (memBEGINPs(lookup_name, j, "is")) {
lookup_offset = 2;
}
}
/* Store the first real character in the denominator */
- lookup_name[j++] = name[i];
+ if (i < name_len) {
+ lookup_name[j++] = name[i];
+ }
}
}
/* If the original input began with 'In' or 'Is', it could be a subroutine
* call to a user-defined property instead of a Unicode property name. */
- if ( non_pkg_begin + name_len > 2
+ if ( name_len - non_pkg_begin > 2
&& name[non_pkg_begin+0] == 'I'
&& (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
{
- starts_with_In_or_Is = TRUE;
+ /* Names that start with In have different characterstics than those
+ * that start with Is */
+ if (name[non_pkg_begin+1] == 's') {
+ starts_with_Is = TRUE;
+ }
}
else {
could_be_user_defined = FALSE;
if (could_be_user_defined) {
CV* user_sub;
+ /* If the user defined property returns the empty string, it could
+ * easily be because the pattern is being compiled before the data it
+ * actually needs to compile is available. This could be argued to be
+ * a bug in the perl code, but this is a change of behavior for Perl,
+ * so we handle it. This means that intentionally returning nothing
+ * will not be resolved until runtime */
+ bool empty_return = FALSE;
+
/* Here, the name could be for a user defined property, which are
* implemented as subs. */
user_sub = get_cvn_flags(name, name_len, 0);
if (user_sub) {
+ const char insecure[] = "Insecure user-defined property";
/* Here, there is a sub by the correct name. Normally we call it
* to get the property definition */
dSP;
SV * user_sub_sv = MUTABLE_SV(user_sub);
SV * error; /* Any error returned by calling 'user_sub' */
- SV * fq_name; /* Fully qualified property name */
+ SV * key; /* The key into the hash of user defined sub names
+ */
SV * placeholder;
- char to_fold_string[] = "0:"; /* The 0 gets overwritten with the
- actual value */
SV ** saved_user_prop_ptr; /* Hash entry for this property */
/* How many times to retry when another thread is in the middle of
/* If we get here, we know this property is user-defined */
*user_defined_ptr = TRUE;
- /* We refuse to call a tainted subroutine; returning an error
- * instead */
+ /* We refuse to call a potentially tainted subroutine; returning an
+ * error instead */
if (TAINT_get) {
if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
- sv_catpvs(msg, "Insecure user-defined property");
+ sv_catpvn(msg, insecure, sizeof(insecure) - 1);
goto append_name_to_msg;
}
* should the need arise, passing the /i status as a parameter.
*
* We start by constructing the hash key name, consisting of the
- * fully qualified subroutine name */
- fq_name = sv_2mortal(newSV(10)); /* 10 is just a guess */
- (void) cv_name(user_sub, fq_name, 0);
-
- /* But precede the sub name in the key with the /i status, so that
- * there is a key for /i and a different key for non-/i */
- to_fold_string[0] = to_fold + '0';
- sv_insert(fq_name, 0, 0, to_fold_string, 2);
+ * fully qualified subroutine name, preceded by the /i status, so
+ * that there is a key for /i and a different key for non-/i */
+ key = newSVpvn(((to_fold) ? "1" : "0"), 1);
+ fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
+ non_pkg_begin != 0);
+ sv_catsv(key, fq_name);
+ sv_2mortal(key);
/* We only call the sub once throughout the life of the program
* (with the /i, non-/i exception noted above). That means the
/* If we have an entry for this key, the subroutine has already
* been called once with this /i status. */
saved_user_prop_ptr = hv_fetch(PL_user_def_props,
- SvPVX(fq_name), SvCUR(fq_name), 0);
+ SvPVX(key), SvCUR(key), 0);
if (saved_user_prop_ptr) {
/* If the saved result is an inversion list, it is the valid
* */
SWITCH_TO_GLOBAL_CONTEXT;
placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
- (void) hv_store_ent(PL_user_def_props, fq_name, placeholder, 0);
+ (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
RESTORE_CONTEXT;
/* Now that we have a placeholder, we can let other threads
USER_PROP_MUTEX_UNLOCK;
/* Make sure the placeholder always gets destroyed */
- SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(fq_name));
+ SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
PUSHMARK(SP);
SAVETMPS;
XPUSHs(boolSV(to_fold));
PUTBACK;
+ /* The following block was taken from swash_init(). Presumably
+ * they apply to here as well, though we no longer use a swash --
+ * khw */
+ SAVEHINTS();
+ save_re_context();
+ /* We might get here via a subroutine signature which uses a utf8
+ * parameter name, at which point PL_subname will have been set
+ * but not yet used. */
+ save_item(PL_subname);
+
(void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
SPAGAIN;
error = ERRSV;
- if (SvTRUE(error)) {
+ if (TAINT_get || SvTRUE(error)) {
if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
- sv_catpvs(msg, "Error \"");
- sv_catsv(msg, error);
- sv_catpvs(msg, "\"");
+ if (SvTRUE(error)) {
+ sv_catpvs(msg, "Error \"");
+ sv_catsv(msg, error);
+ sv_catpvs(msg, "\"");
+ }
+ if (TAINT_get) {
+ if (SvTRUE(error)) sv_catpvs(msg, "; ");
+ sv_catpvn(msg, insecure, sizeof(insecure) - 1);
+ }
+
if (name_len > 0) {
sv_catpvs(msg, " in expansion of ");
Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
prop_definition = NULL;
}
else { /* G_SCALAR guarantees a single return value */
+ SV * contents = POPs;
/* The contents is supposed to be the expansion of the property
- * definition. Call a function to check for valid syntax and
- * handle it */
- prop_definition = handle_user_defined_property(name, name_len,
+ * definition. If the definition is deferrable, and we got an
+ * empty string back, set a flag to later defer it (after clean
+ * up below). */
+ if ( deferrable
+ && (! SvPOK(contents) || SvCUR(contents) == 0))
+ {
+ empty_return = TRUE;
+ }
+ else { /* Otherwise, call a function to check for valid syntax,
+ and handle it */
+
+ prop_definition = handle_user_defined_property(
+ name, name_len,
is_utf8, to_fold, runtime,
deferrable,
- POPs, user_defined_ptr,
+ contents, user_defined_ptr,
msg,
level);
+ }
}
/* Here, we have the results of the expansion. Delete the
* and add the permanent entry */
USER_PROP_MUTEX_LOCK;
- S_delete_recursion_entry(aTHX_ SvPVX(fq_name));
-
- if (! prop_definition || is_invlist(prop_definition)) {
+ S_delete_recursion_entry(aTHX_ SvPVX(key));
+ if ( ! empty_return
+ && (! prop_definition || is_invlist(prop_definition)))
+ {
/* If we got success we use the inversion list defining the
* property; otherwise use the error message */
SWITCH_TO_GLOBAL_CONTEXT;
(void) hv_store_ent(PL_user_def_props,
- fq_name,
+ key,
((prop_definition)
? newSVsv(prop_definition)
: newSVsv(msg)),
LEAVE;
POPSTACK;
+ if (empty_return) {
+ goto definition_deferred;
+ }
+
if (prop_definition) {
/* If the definition is for something not known at this time,
/* If it didn't find the property ... */
if (table_index == 0) {
- /* Try again stripping off any initial 'In' or 'Is' */
- if (starts_with_In_or_Is) {
+ /* Try again stripping off any initial 'Is'. This is because we
+ * promise that an initial Is is optional. The same isn't true of
+ * names that start with 'In'. Those can match only blocks, and the
+ * lookup table already has those accounted for. */
+ if (starts_with_Is) {
lookup_name += 2;
lookup_len -= 2;
equals_pos -= 2;
* NV. */
NV value;
+ SSize_t value_len = lookup_len - equals_pos;
/* Get the value */
- if (my_atof3(lookup_name + equals_pos, &value,
- lookup_len - equals_pos)
+ if ( value_len <= 0
+ || my_atof3(lookup_name + equals_pos, &value,
+ value_len)
!= lookup_name + lookup_len)
{
goto failed;
definition_deferred:
/* Here it could yet to be defined, so defer evaluation of this
- * until its needed at runtime. */
- prop_definition = newSVpvs_flags("", SVs_TEMP);
-
- /* To avoid any ambiguity, the package is always specified.
- * Use the current one if it wasn't included in our input */
- if (non_pkg_begin == 0) {
- const HV * pkg = (IN_PERL_COMPILETIME)
- ? PL_curstash
- : CopSTASH(PL_curcop);
- const char* pkgname = HvNAME(pkg);
-
- Perl_sv_catpvf(aTHX_ prop_definition, "%" UTF8f,
- UTF8fARG(is_utf8, strlen(pkgname), pkgname));
- sv_catpvs(prop_definition, "::");
+ * until its needed at runtime. We need the fully qualified property name
+ * to avoid ambiguity, and a trailing newline */
+ if (! fq_name) {
+ fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
+ non_pkg_begin != 0 /* If has "::" */
+ );
}
-
- Perl_sv_catpvf(aTHX_ prop_definition, "%" UTF8f,
- UTF8fARG(is_utf8, name_len, name));
- sv_catpvs(prop_definition, "\n");
+ sv_catpvs(fq_name, "\n");
*user_defined_ptr = TRUE;
- return prop_definition;
+ return fq_name;
}
#endif