# include "regcomp.h"
#endif
-#include "dquote_inline.h"
#include "invlist_inline.h"
#include "unicode_constants.h"
/* Certain characters are output as a sequence with the first being a
* backslash. */
-#define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
+#define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c)
struct RExC_state_t {
#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
#define PBITVAL(paren) (1 << ((paren) & 7))
-#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
-#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
-#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
+#define PAREN_OFFSET(depth) \
+ (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
+#define PAREN_TEST(depth, paren) \
+ (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
+#define PAREN_SET(depth, paren) \
+ (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
+#define PAREN_UNSET(depth, paren) \
+ (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
#define REQUIRE_UTF8(flagp) STMT_START { \
if (!UTF) { \
/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
* a flag that indicates we need to override /d with /u as a result of
* something in the pattern. It should only be used in regards to calling
- * set_regex_charset() or get_regex_charse() */
+ * set_regex_charset() or get_regex_charset() */
#define REQUIRE_UNI_RULES(flagp, restart_retval) \
STMT_START { \
if (DEPENDS_SEMANTICS) { \
#define _invlist_intersection_complement_2nd(a, b, output) \
_invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
+/* We add a marker if we are deferring expansion of a property that is both
+ * 1) potentiallly user-defined; and
+ * 2) could also be an official Unicode property.
+ *
+ * Without this marker, any deferred expansion can only be for a user-defined
+ * one. This marker shouldn't conflict with any that could be in a legal name,
+ * and is appended to its name to indicate this. There is a string and
+ * character form */
+#define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~"
+#define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~'
+
/* About scan_data_t.
During optimisation we recurse through the regexp program performing
#define UPDATE_WARNINGS_LOC(loc) \
STMT_START { \
if (TO_OUTPUT_WARNINGS(loc)) { \
- RExC_latest_warn_offset = (xI(loc)) - RExC_precomp; \
+ RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \
+ - RExC_precomp; \
} \
} STMT_END
} STMT_END
/* m is not necessarily a "literal string", in this macro */
-#define reg_warn_non_literal_string(loc, m) \
- _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
- Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
+#define warn_non_literal_string(loc, packed_warn, m) \
+ _WARN_HELPER(loc, packed_warn, \
+ Perl_warner(aTHX_ packed_warn, \
"%s" REPORT_LOCATION, \
m, REPORT_LOCATION_ARGS(loc)))
+#define reg_warn_non_literal_string(loc, m) \
+ warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
+
+#define ckWARN2_non_literal_string(loc, packwarn, m, a1) \
+ STMT_START { \
+ char * format; \
+ Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
+ Newx(format, format_size, char); \
+ my_strlcpy(format, m, format_size); \
+ my_strlcat(format, REPORT_LOCATION, format_size); \
+ SAVEFREEPV(format); \
+ _WARN_HELPER(loc, packwarn, \
+ Perl_ck_warner(aTHX_ packwarn, \
+ format, \
+ a1, REPORT_LOCATION_ARGS(loc))); \
+ } STMT_END
#define ckWARNreg(loc,m) \
_WARN_HELPER(loc, packWARN(WARN_REGEXP), \
/* END of edit_distance() stuff
* ========================================================= */
-/* is c a control character for which we have a mnemonic? */
-#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
-
-STATIC const char *
-S_cntrl_to_mnemonic(const U8 c)
-{
- /* Returns the mnemonic string that represents character 'c', if one
- * exists; NULL otherwise. The only ones that exist for the purposes of
- * this routine are a few control characters */
-
- switch (c) {
- case '\a': return "\\a";
- case '\b': return "\\b";
- case ESC_NATIVE: return "\\e";
- case '\f': return "\\f";
- case '\n': return "\\n";
- case '\r': return "\\r";
- case '\t': return "\\t";
- }
-
- return NULL;
-}
-
/* Mark that we cannot extend a found fixed substring at this point.
Update the longest found anchored substring or the longest found
floating substrings if needed. */
unsigned int i;
const U32 n = ARG(node);
bool new_node_has_latin1 = FALSE;
- const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr))
+ const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
? 0
: ANYOF_FLAGS(node);
}
/* Add in the points from the bit map */
- if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) {
+ if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
if (ANYOF_BITMAP_TEST(node, i)) {
unsigned int start = i++;
* another SSC or a regular ANYOF class. Can create false positives. */
SV* anded_cp_list;
- U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr)
+ U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
? 0
: ANYOF_FLAGS(and_with);
U8 anded_flags;
SV* ored_cp_list;
U8 ored_flags;
- U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr)
+ U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
? 0
: ANYOF_FLAGS(or_with);
populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
+ SvREFCNT_dec(invlist);
/* Make sure is clone-safe */
ssc->invlist = NULL;
#endif
switch (flags) {
- case EXACT: case EXACT_ONLY8: case EXACTL: break;
+ case EXACT: case EXACT_REQ8: case EXACTL: break;
case EXACTFAA:
case EXACTFUP:
case EXACTFU:
trie->wordcount = word_count;
RExC_rxi->data->data[ data_slot ] = (void*)trie;
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
- if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
+ if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
trie->wordcount+1, sizeof(reg_trie_wordinfo));
if (OP(noper) == NOTHING) {
/* skip past a NOTHING at the start of an alternation
* eg, /(?:)a|(?:b)/ should be the same as /a|b/
+ *
+ * If the next node is not something we are supposed to process
+ * we will just ignore it due to the condition guarding the
+ * next block.
*/
+
regnode *noper_next= regnext(noper);
if (noper_next < tail)
noper= noper_next;
if ( noper < tail
&& ( OP(noper) == flags
- || (flags == EXACT && OP(noper) == EXACT_ONLY8)
- || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
+ || (flags == EXACT && OP(noper) == EXACT_REQ8)
+ || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
|| OP(noper) == EXACTFUP))))
{
uc= (U8*)STRING(noper);
regnode *noper_next= regnext(noper);
if (noper_next < tail)
noper= noper_next;
+ /* we will undo this assignment if noper does not
+ * point at a trieable type in the else clause of
+ * the following statement. */
}
if ( noper < tail
&& ( OP(noper) == flags
- || (flags == EXACT && OP(noper) == EXACT_ONLY8)
- || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
+ || (flags == EXACT && OP(noper) == EXACT_REQ8)
+ || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
|| OP(noper) == EXACTFUP))))
{
const U8 *uc= (U8*)STRING(noper);
Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
}
}
- }
+ } else {
+ /* If we end up here it is because we skipped past a NOTHING, but did not end up
+ * on a trieable type. So we need to reset noper back to point at the first regop
+ * in the branch before we call TRIE_HANDLE_WORD()
+ */
+ noper= NEXTOPER(cur);
+ }
TRIE_HANDLE_WORD(state);
} /* end second pass */
regnode *noper_next= regnext(noper);
if (noper_next < tail)
noper= noper_next;
+ /* we will undo this assignment if noper does not
+ * point at a trieable type in the else clause of
+ * the following statement. */
}
if ( noper < tail
&& ( OP(noper) == flags
- || (flags == EXACT && OP(noper) == EXACT_ONLY8)
- || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
+ || (flags == EXACT && OP(noper) == EXACT_REQ8)
+ || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
|| OP(noper) == EXACTFUP))))
{
const U8 *uc= (U8*)STRING(noper);
/* charid is now 0 if we dont know the char read, or
* nonzero if we do */
}
+ } else {
+ /* If we end up here it is because we skipped past a NOTHING, but did not end up
+ * on a trieable type. So we need to reset noper back to point at the first regop
+ * in the branch before we call TRIE_HANDLE_WORD().
+ */
+ noper= NEXTOPER(cur);
}
accept_state = TRIE_NODENUM( state );
TRIE_HANDLE_WORD(accept_state);
if ( state==1 ) {
OP( convert ) = nodetype;
str=STRING(convert);
- STR_LEN(convert)=0;
+ setSTR_LEN(convert, 0);
}
- STR_LEN(convert) += len;
+ assert( ( STR_LEN(convert) + len ) < 256 );
+ setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
while (len--)
*str++ = *ch++;
} else {
trie->prefixlen = (state-1);
if (str) {
regnode *n = convert+NODE_SZ_STR(convert);
- NEXT_OFF(convert) = NODE_SZ_STR(convert);
+ assert( NODE_SZ_STR(convert) <= U16_MAX );
+ NEXT_OFF(convert) = (U16)(NODE_SZ_STR(convert));
trie->startstate = state;
trie->minlen -= (state - 1);
trie->maxlen -= (state - 1);
* using /iaa matching will be doing so almost entirely with ASCII
* strings, so this should rarely be encountered in practice */
-#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
- if (PL_regkind[OP(scan)] == EXACT) \
- join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
-
STATIC U32
S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
UV *min_subtract, bool *unfolded_multi_char,
/* Joining something that requires UTF-8 with something that
* doesn't, means the result requires UTF-8. */
- if (OP(scan) == EXACT && (OP(n) == EXACT_ONLY8)) {
- OP(scan) = EXACT_ONLY8;
+ if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
+ OP(scan) = EXACT_REQ8;
}
- else if (OP(scan) == EXACT_ONLY8 && (OP(n) == EXACT)) {
+ else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
; /* join is compatible, no need to change OP */
}
- else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_ONLY8)) {
- OP(scan) = EXACTFU_ONLY8;
+ else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
+ OP(scan) = EXACTFU_REQ8;
}
- else if ((OP(scan) == EXACTFU_ONLY8) && (OP(n) == EXACTFU)) {
+ else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
; /* join is compatible, no need to change OP */
}
else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
* node. And if the only adjacent node is EXACTF, they get
* absorbed into that, under the theory that a longer node is
* better than two shorter ones, even if one is EXACTFU. Note
- * that EXACTFU_ONLY8 is generated only for UTF-8 patterns,
+ * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
* and the EXACTFU_S_EDGE ones only for non-UTF-8. */
if (STRING(n)[STR_LEN(n)-1] == 's') {
merged++;
NEXT_OFF(scan) += NEXT_OFF(n);
- STR_LEN(scan) += STR_LEN(n);
+ assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
+ setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
next = n + NODE_SZ_STR(n);
/* Now we can overwrite *n : */
Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
* this final joining, sequences could have been split over boundaries, and
* hence missed). The sequences only happen in folding, hence for any
* non-EXACT EXACTish node */
- if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
+ if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
U8* s0 = (U8*) STRING(scan);
U8* s = s0;
U8* s_end = s0 + STR_LEN(scan);
}
#endif
}
-
- if ( STR_LEN(scan) == 1
- && isALPHA_A(* STRING(scan))
- && ( OP(scan) == EXACTFAA
- || ( OP(scan) == EXACTFU
- && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
- {
- U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
-
- /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
- * with the mask set to the complement of the bit that differs
- * between upper and lower case, and the lowest code point of the
- * pair (which the '&' forces) */
- OP(scan) = ANYOFM;
- ARG_SET(scan, *STRING(scan) & mask);
- FLAGS(scan) = mask;
- }
}
#ifdef DEBUGGING
/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
{
dVAR;
+ SSize_t final_minlen;
/* There must be at least this number of characters to match */
SSize_t min = 0;
I32 pars = 0, code;
U32 j;
for ( j = 0 ; j < recursed_depth ; j++ ) {
for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
- if (
- PAREN_TEST(RExC_study_chunk_recursed +
- ( j * RExC_study_chunk_recursed_bytes), i )
- && (
- !j ||
- !PAREN_TEST(RExC_study_chunk_recursed +
- (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
- )
- ) {
+ if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
Perl_re_printf( aTHX_ " %d",(int)i);
break;
}
* parsing code, as each (?:..) is handled by a different invocation of
* reg() -- Yves
*/
- JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
-
- /* Follow the next-chain of the current node and optimize
- away all the NOTHINGs from it. */
- if (OP(scan) != CURLYX) {
- const int max = (reg_off_by_arg[OP(scan)]
- ? I32_MAX
- /* I32 may be smaller than U16 on CRAYs! */
- : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
- int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
- int noff;
- regnode *n = scan;
-
- /* Skip NOTHING and LONGJMP. */
- while ((n = regnext(n))
- && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
- || ((OP(n) == LONGJMP) && (noff = ARG(n))))
- && off + noff < max)
- off += noff;
- if (reg_off_by_arg[OP(scan)])
- ARG(scan) = off;
- else
- NEXT_OFF(scan) = off;
- }
+ if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT
+ && OP(scan) != LEXACT_REQ8)
+ join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
+ 0, NULL, depth + 1);
+
+ /* Follow the next-chain of the current node and optimize
+ away all the NOTHINGs from it. */
+ if (OP(scan) != CURLYX) {
+ const int max = (reg_off_by_arg[OP(scan)]
+ ? I32_MAX
+ /* I32 may be smaller than U16 on CRAYs! */
+ : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
+ int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
+ int noff;
+ regnode *n = scan;
+
+ /* Skip NOTHING and LONGJMP. */
+ while ( (n = regnext(n))
+ && ( (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
+ || ((OP(n) == LONGJMP) && (noff = ARG(n))))
+ && off + noff < max)
+ off += noff;
+ if (reg_off_by_arg[OP(scan)])
+ ARG(scan) = off;
+ else
+ NEXT_OFF(scan) = off;
+ }
- /* The principal pseudo-switch. Cannot be a switch, since we
- look into several different things. */
+ /* The principal pseudo-switch. Cannot be a switch, since we look into
+ * several different things. */
if ( OP(scan) == DEFINEP ) {
SSize_t minlen = 0;
SSize_t deltanext = 0;
if ( SvIV(re_trie_maxbuff)>=0 ) {
regnode *cur;
regnode *first = (regnode *)NULL;
- regnode *last = (regnode *)NULL;
+ regnode *prev = (regnode *)NULL;
regnode *tail = scan;
U8 trietype = 0;
U32 count=0;
----------------+-----------
NOTHING | NOTHING
EXACT | EXACT
- EXACT_ONLY8 | EXACT
+ EXACT_REQ8 | EXACT
EXACTFU | EXACTFU
- EXACTFU_ONLY8 | EXACTFU
+ EXACTFU_REQ8 | EXACTFU
EXACTFUP | EXACTFU
EXACTFAA | EXACTFAA
EXACTL | EXACTL
*/
#define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
? NOTHING \
- : ( EXACT == (X) || EXACT_ONLY8 == (X) ) \
+ : ( EXACT == (X) || EXACT_REQ8 == (X) ) \
? EXACT \
: ( EXACTFU == (X) \
- || EXACTFU_ONLY8 == (X) \
+ || EXACTFU_REQ8 == (X) \
|| EXACTFUP == (X) ) \
? EXACTFU \
: ( EXACTFAA == (X) ) \
REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
}
Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
- REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
+ REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
);
});
} else {
if ( trietype == NOTHING )
trietype = noper_trietype;
- last = cur;
+ prev = cur;
}
if (first)
count++;
* noper may either be a triable node which can
* not be tried together with the current trie,
* or a non triable node */
- if ( last ) {
+ if ( prev ) {
/* If last is set and trietype is not
* NOTHING then we have found at least two
* triable branch sequences in a row of a
make_trie( pRExC_state,
startbranch, first, cur, tail,
count, trietype, depth+1 );
- last = NULL; /* note: we clear/update
+ prev = NULL; /* note: we clear/update
first, trietype etc below,
so we dont do it here */
}
Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
- REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
+ REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
PL_reg_name[trietype]
);
});
- if ( last && trietype ) {
+ if ( prev && trietype ) {
if ( trietype != NOTHING ) {
/* the last branch of the sequence was part of
* a trie, so we have to construct it here
OP(opt)= OPTIMIZED;
}
}
- } /* end if ( last) */
+ } /* end if ( prev) */
} /* TRIE_MAXBUF is non zero */
-
} /* do trie */
}
if (
!recursed_depth
- ||
- !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
+ || !PAREN_TEST(recursed_depth - 1, paren)
) {
/* it is quite possible that there are more efficient ways
* to do this. We maintain a bitmap per level of recursion
if (!recursed_depth) {
Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
} else {
- Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
- RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
+ Copy(PAREN_OFFSET(recursed_depth - 1),
+ PAREN_OFFSET(recursed_depth),
RExC_study_chunk_recursed_bytes, U8);
}
/* we havent recursed into this paren yet, so recurse into it */
DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
- PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
+ PAREN_SET(recursed_depth, paren);
my_recursed_depth= recursed_depth + 1;
} else {
DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
}
}
else if ( OP(scan) == EXACT
- || OP(scan) == EXACT_ONLY8
+ || OP(scan) == LEXACT
+ || OP(scan) == EXACT_REQ8
+ || OP(scan) == LEXACT_REQ8
|| OP(scan) == EXACTL)
{
- SSize_t l = STR_LEN(scan);
+ SSize_t bytelen = STR_LEN(scan), charlen;
UV uc;
- assert(l);
+ assert(bytelen);
if (UTF) {
const U8 * const s = (U8*)STRING(scan);
- uc = utf8_to_uvchr_buf(s, s + l, NULL);
- l = utf8_length(s, s + l);
+ uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
+ charlen = utf8_length(s, s + bytelen);
} else {
uc = *((U8*)STRING(scan));
+ charlen = bytelen;
}
- min += l;
+ min += charlen;
if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
/* The code below prefers earlier match for fixed
offset, later match for variable offset. */
data->last_start_max = is_inf
? SSize_t_MAX : data->pos_min + data->pos_delta;
}
- sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
+ sv_catpvn(data->last_found, STRING(scan), bytelen);
if (UTF)
SvUTF8_on(data->last_found);
{
MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
mg_find(sv, PERL_MAGIC_utf8) : NULL;
if (mg && mg->mg_len >= 0)
- mg->mg_len += utf8_length((U8*)STRING(scan),
- (U8*)STRING(scan)+STR_LEN(scan));
+ mg->mg_len += charlen;
}
- data->last_end = data->pos_min + l;
- data->pos_min += l; /* As in the first entry. */
+ data->last_end = data->pos_min + charlen;
+ data->pos_min += charlen; /* As in the first entry. */
data->flags &= ~SF_BEFORE_EOL;
}
}
else if (PL_regkind[OP(scan)] == EXACT) {
/* But OP != EXACT!, so is EXACTFish */
- SSize_t l = STR_LEN(scan);
+ SSize_t bytelen = STR_LEN(scan), charlen;
const U8 * s = (U8*)STRING(scan);
+ /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
+ * with the mask set to the complement of the bit that differs
+ * between upper and lower case, and the lowest code point of the
+ * pair (which the '&' forces) */
+ if ( bytelen == 1
+ && isALPHA_A(*s)
+ && ( OP(scan) == EXACTFAA
+ || ( OP(scan) == EXACTFU
+ && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s))))
+ {
+ U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
+
+ OP(scan) = ANYOFM;
+ ARG_SET(scan, *s & mask);
+ FLAGS(scan) = mask;
+ /* we're not EXACTFish any more, so restudy */
+ continue;
+ }
+
/* Search for fixed substrings supports EXACT only. */
if (flags & SCF_DO_SUBSTR) {
assert(data);
scan_commit(pRExC_state, data, minlenp, is_inf);
}
- if (UTF) {
- l = utf8_length(s, s + l);
- }
+ charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
if (unfolded_multi_char) {
RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
}
- min += l - min_subtract;
+ min += charlen - min_subtract;
assert (min >= 0);
delta += min_subtract;
if (flags & SCF_DO_SUBSTR) {
- data->pos_min += l - min_subtract;
+ data->pos_min += charlen - min_subtract;
if (data->pos_min < 0) {
data->pos_min = 0;
}
}
if (flags & SCF_DO_STCLASS) {
- SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
+ SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
assert(EXACTF_invlist);
if (flags & SCF_DO_STCLASS_AND) {
if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
next = NEXTOPER(scan);
if ( OP(next) == EXACT
- || OP(next) == EXACT_ONLY8
+ || OP(next) == LEXACT
+ || OP(next) == EXACT_REQ8
+ || OP(next) == LEXACT_REQ8
|| OP(next) == EXACTL
|| (flags & SCF_DO_STCLASS))
{
continue;
default:
-#ifdef DEBUGGING
Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
OP(scan));
-#endif
case REF:
case CLUMP:
if (flags & SCF_DO_SUBSTR) {
case ANYOFH:
case ANYOFHb:
case ANYOFHr:
+ case ANYOFHs:
case ANYOF:
if (flags & SCF_DO_STCLASS_AND)
ssc_and(pRExC_state, data->start_class,
break;
}
+ case ANYOFR:
+ case ANYOFRb:
+ {
+ SV* cp_list = NULL;
+
+ cp_list = _add_range_to_invlist(cp_list,
+ ANYOFRbase(scan),
+ ANYOFRbase(scan) + ANYOFRdelta(scan));
+
+ if (flags & SCF_DO_STCLASS_OR) {
+ ssc_union(data->start_class, cp_list, invert);
+ }
+ else if (flags & SCF_DO_STCLASS_AND) {
+ ssc_intersection(data->start_class, cp_list, invert);
+ }
+
+ SvREFCNT_dec_NN(cp_list);
+ break;
+ }
+
case NPOSIXL:
invert = 1;
/* FALLTHROUGH */
}
#endif
}
-
else if (OP(scan) == OPEN) {
if (stopparen != (I32)ARG(scan))
pars++;
DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
- {
- SSize_t final_minlen= min < stopmin ? min : stopmin;
-
- if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
- if (final_minlen > SSize_t_MAX - delta)
- RExC_maxlen = SSize_t_MAX;
- else if (RExC_maxlen < final_minlen + delta)
- RExC_maxlen = final_minlen + delta;
- }
- return final_minlen;
+ final_minlen = min < stopmin
+ ? min : stopmin;
+ if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
+ if (final_minlen > SSize_t_MAX - delta)
+ RExC_maxlen = SSize_t_MAX;
+ else if (RExC_maxlen < final_minlen + delta)
+ RExC_maxlen = final_minlen + delta;
}
- NOT_REACHED; /* NOTREACHED */
+ return final_minlen;
}
STATIC U32
DEBUG_r(if (!PL_colorset) reginitcolors());
- /* Initialize these here instead of as-needed, as is quick and avoids
- * having to test them each time otherwise */
- if (! PL_InBitmap) {
-#ifdef DEBUGGING
- char * dump_len_string;
-#endif
-
- /* This is calculated here, because the Perl program that generates the
- * static global ones doesn't currently have access to
- * NUM_ANYOF_CODE_POINTS */
- PL_InBitmap = _new_invlist(2);
- PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
- NUM_ANYOF_CODE_POINTS - 1);
-#ifdef DEBUGGING
- dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
- if ( ! dump_len_string
- || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
- {
- PL_dump_re_max_len = 60; /* A reasonable default */
- }
-#endif
- }
pRExC_state->warn_text = NULL;
pRExC_state->unlexed_names = NULL;
&& memEQ(RX_PRECOMP(old_re), exp, plen)
&& !runtime_code /* with runtime code, always recompile */ )
{
+ DEBUG_COMPILE_r({
+ SV *dsv= sv_newmortal();
+ RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
+ Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n",
+ PL_colors[4], PL_colors[5], s);
+ });
return old_re;
}
/* Ignore EXACT as we deal with it later. */
if (PL_regkind[OP(first)] == EXACT) {
if ( OP(first) == EXACT
- || OP(first) == EXACT_ONLY8
+ || OP(first) == LEXACT
+ || OP(first) == EXACT_REQ8
+ || OP(first) == LEXACT_REQ8
|| OP(first) == EXACTL)
{
NOOP; /* Empty, get anchored substr later. */
&& nop == END)
RExC_rx->extflags |= RXf_WHITE;
else if ( RExC_rx->extflags & RXf_SPLIT
- && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
+ && ( fop == EXACT || fop == LEXACT
+ || fop == EXACT_REQ8 || fop == LEXACT_REQ8
+ || fop == EXACTL)
&& STR_LEN(first) == 1
&& *(STRING(first)) == ' '
&& nop == END )
i = rx->sublen + rx->suboffset - rx->offs[0].end;
}
else
- if ( 0 <= n && n <= (I32)rx->nparens &&
- (s1 = rx->offs[n].start) != -1 &&
+ if (inRANGE(n, 0, (I32)rx->nparens) &&
+ (s1 = rx->offs[n].start) != -1 &&
(t1 = rx->offs[n].end) != -1)
{
/* $&, ${^MATCH}, $1 ... */
i = t1 - s1;
if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
- i = el;
+ i = el;
}
return i;
}
return zero_addr + *offset;
}
-PERL_STATIC_INLINE void
-S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
-{
- /* Sets the current number of elements stored in the inversion list.
- * Updates SvCUR correspondingly */
- PERL_UNUSED_CONTEXT;
- PERL_ARGS_ASSERT_INVLIST_SET_LEN;
-
- assert(is_invlist(invlist));
-
- SvCUR_set(invlist,
- (len == 0)
- ? 0
- : TO_INTERNAL_SIZE(len + offset));
- assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
-}
-
STATIC void
S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
{
invlist_iterfinish(invlist);
*get_invlist_previous_index_addr(invlist) = 0;
+ SvPOK_on(invlist); /* This allows B to extract the PV */
}
SV*
invlist_iterfinish(invlist);
SvREADONLY_on(invlist);
+ SvPOK_on(invlist);
return invlist;
}
STATIC void
-S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
-{
- /* Grow the maximum size of an inversion list */
-
- PERL_ARGS_ASSERT_INVLIST_EXTEND;
-
- assert(is_invlist(invlist));
-
- /* Add one to account for the zero element at the beginning which may not
- * be counted by the calling parameters */
- SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
-}
-
-STATIC void
S__append_range_to_invlist(pTHX_ SV* const invlist,
const UV start, const UV end)
{
#endif
-PERL_STATIC_INLINE SV*
-S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
- return _add_range_to_invlist(invlist, cp, cp);
-}
-
#ifndef PERL_IN_XSUB_RE
void
Perl__invlist_invert(pTHX_ SV* const invlist)
#endif
-PERL_STATIC_INLINE STRLEN*
-S_get_invlist_iter_addr(SV* invlist)
-{
- /* Return the address of the UV that contains the current iteration
- * position */
-
- PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
-
- assert(is_invlist(invlist));
-
- return &(((XINVLIST*) SvANY(invlist))->iterator);
-}
-
-PERL_STATIC_INLINE void
-S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
-{
- PERL_ARGS_ASSERT_INVLIST_ITERINIT;
-
- *get_invlist_iter_addr(invlist) = 0;
-}
-
-PERL_STATIC_INLINE void
-S_invlist_iterfinish(SV* invlist)
-{
- /* Terminate iterator for invlist. This is to catch development errors.
- * Any iteration that is interrupted before completed should call this
- * function. Functions that add code points anywhere else but to the end
- * of an inversion list assert that they are not in the middle of an
- * iteration. If they were, the addition would make the iteration
- * problematical: if the iteration hadn't reached the place where things
- * were being added, it would be ok */
-
- PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
-
- *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
-}
-
-STATIC bool
-S_invlist_iternext(SV* invlist, UV* start, UV* end)
-{
- /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
- * This call sets in <*start> and <*end>, the next range in <invlist>.
- * Returns <TRUE> if successful and the next call will return the next
- * range; <FALSE> if was already at the end of the list. If the latter,
- * <*start> and <*end> are unchanged, and the next call to this function
- * will start over at the beginning of the list */
-
- STRLEN* pos = get_invlist_iter_addr(invlist);
- UV len = _invlist_len(invlist);
- UV *array;
-
- PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
-
- if (*pos >= len) {
- *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
- return FALSE;
- }
-
- array = invlist_array(invlist);
-
- *start = array[(*pos)++];
-
- if (*pos >= len) {
- *end = UV_MAX;
- }
- else {
- *end = array[(*pos)++] - 1;
- }
-
- return TRUE;
-}
-
PERL_STATIC_INLINE UV
-S_invlist_highest(SV* const invlist)
+S_invlist_lowest(SV* const invlist)
{
- /* Returns the highest code point that matches an inversion list. This API
- * has an ambiguity, as it returns 0 under either the highest is actually
+ /* Returns the lowest code point that matches an inversion list. This API
+ * has an ambiguity, as it returns 0 under either the lowest is actually
* 0, or if the list is empty. If this distinction matters to you, check
* for emptiness before calling this function */
UV len = _invlist_len(invlist);
UV *array;
- PERL_ARGS_ASSERT_INVLIST_HIGHEST;
+ PERL_ARGS_ASSERT_INVLIST_LOWEST;
if (len == 0) {
- return 0;
+ return 0;
}
array = invlist_array(invlist);
- /* The last element in the array in the inversion list always starts a
- * range that goes to infinity. That range may be for code points that are
- * matched in the inversion list, or it may be for ones that aren't
- * matched. In the latter case, the highest code point in the set is one
- * less than the beginning of this range; otherwise it is the final element
- * of this range: infinity */
- return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
- ? UV_MAX
- : array[len - 1] - 1;
+ return array[0];
}
STATIC SV *
* call SvREFCNT_dec() when done with it.
*/
STATIC SV*
-S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
+S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
{
dVAR;
const U8 * s = (U8*)STRING(node);
/* Start out big enough for 2 separate code points */
SV* invlist = _new_invlist(4);
- PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
+ PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
if (! UTF) {
uc = *s;
}
else { /* Single char fold */
unsigned int k;
- unsigned int first_fold;
- const unsigned int * remaining_folds;
+ U32 first_fold;
+ const U32 * remaining_folds;
Size_t folds_count;
/* It matches itself */
}
while (RExC_parse < RExC_end) {
- /* && strchr("iogcmsx", *RExC_parse) */
+ /* && memCHRs("iogcmsx", *RExC_parse) */
/* (?g), (?gc) and (?o) are useless here
and must be globally applied -- japhy */
switch (*RExC_parse) {
PERL_ARGS_ASSERT_REG;
DEBUG_PARSE("reg ");
-
max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
assert(max_open);
if (!SvIOK(max_open)) {
goto parse_rest;
}
- /* By doing this here, we avoid extra warnings for nested
- * script runs */
- ckWARNexperimental(RExC_parse,
- WARN_EXPERIMENTAL__SCRIPT_RUN,
- "The script_run feature is experimental");
-
if (paren == 's') {
/* Here, we're starting a new regular script run */
ret = reg_node(pRExC_state, SROPEN);
return 0;
}
- REGTAIL(pRExC_state, ret, atomic);
+ if (! REGTAIL(pRExC_state, ret, atomic)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
- REGTAIL(pRExC_state, atomic,
- reg_node(pRExC_state, SRCLOSE));
+ if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
+ SRCLOSE)))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
RExC_in_script_run = 0;
return ret;
/*FALLTHROUGH*/
alpha_assertions:
- ckWARNexperimental(RExC_parse,
- WARN_EXPERIMENTAL__ALPHA_ASSERTIONS,
- "The alpha_assertions feature is experimental");
RExC_seen_zerolen++;
RExC_parse-seqstart, seqstart);
NOT_REACHED; /*NOTREACHED*/
case '<': /* (?<...) */
+ /* If you want to support (?<*...), first reconcile with GH #17363 */
if (*RExC_parse == '!')
paren = ',';
else if (*RExC_parse != '=')
RExC_flags & RXf_PMf_COMPILETIME
);
FLAGS(REGNODE_p(ret)) = 2;
- REGTAIL(pRExC_state, ret, eval);
+ if (! REGTAIL(pRExC_state, ret, eval)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
/* deal with the length of this later - MJD */
return ret;
}
tail = reg(pRExC_state, 1, &flag, depth+1);
RETURN_FAIL_ON_RESTART(flag, flagp);
- REGTAIL(pRExC_state, ret, tail);
+ if (! REGTAIL(pRExC_state, ret, tail)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
goto insert_if;
}
else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
}
nextchar(pRExC_state);
insert_if:
- REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
+ if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
+ IFTHEN, 0)))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
br = regbranch(pRExC_state, &flags, 1, depth+1);
if (br == 0) {
RETURN_FAIL_ON_RESTART(flags,flagp);
FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
(UV) flags);
} else
- REGTAIL(pRExC_state, br, reganode(pRExC_state,
- LONGJMP, 0));
+ if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
+ LONGJMP, 0)))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
c = UCHARAT(RExC_parse);
nextchar(pRExC_state);
if (flags&HASWIDTH)
FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
(UV) flags);
}
- REGTAIL(pRExC_state, ret, lastbr);
+ if (! REGTAIL(pRExC_state, ret, lastbr)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
if (flags&HASWIDTH)
*flagp |= HASWIDTH;
c = UCHARAT(RExC_parse);
vFAIL("Switch (?(condition)... contains too many branches");
}
ender = reg_node(pRExC_state, TAIL);
- REGTAIL(pRExC_state, br, ender);
+ if (! REGTAIL(pRExC_state, br, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
if (lastbr) {
- REGTAIL(pRExC_state, lastbr, ender);
- REGTAIL(pRExC_state, REGNODE_OFFSET(
- NEXTOPER(
- NEXTOPER(REGNODE_p(lastbr)))),
- ender);
+ if (! REGTAIL(pRExC_state, lastbr, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
+ if (! REGTAIL(pRExC_state,
+ REGNODE_OFFSET(
+ NEXTOPER(
+ NEXTOPER(REGNODE_p(lastbr)))),
+ ender))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
}
else
- REGTAIL(pRExC_state, ret, ender);
+ if (! REGTAIL(pRExC_state, ret, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
#if 0 /* Removing this doesn't cause failures in the test suite -- khw */
RExC_size++; /* XXX WHY do we need this?!!
For large programs it seems to be required
ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
}
/* FALLTHROUGH */
+ case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
+ /* FALLTHROUGH */
default: /* e.g., (?i) */
RExC_parse = (char *) seqstart + 1;
parse_flags:
goto parse_rest;
} /* end switch */
}
- else {
- if (*RExC_parse == '{') {
- ckWARNregdep(RExC_parse + 1,
- "Unescaped left brace in regex is "
- "deprecated here (and will be fatal "
- "in Perl 5.32), passed through");
- }
- /* Not bothering to indent here, as the above 'else' is temporary
- * */
- if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
+ else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
capturing_parens:
parno = RExC_npar;
RExC_npar++;
/* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
paren = ':';
ret = 0;
- }
}
}
else /* ! paren */
*flagp |= flags&SIMPLE;
}
if (is_open) { /* Starts with OPEN. */
- REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
+ if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
}
else if (paren != '?') /* Not Conditional */
ret = br;
lastbr = br;
while (*RExC_parse == '|') {
if (RExC_use_BRANCHJ) {
+ bool shut_gcc_up;
+
ender = reganode(pRExC_state, LONGJMP, 0);
/* Append to the previous. */
- REGTAIL(pRExC_state,
- REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
- ender);
+ shut_gcc_up = REGTAIL(pRExC_state,
+ REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
+ ender);
+ PERL_UNUSED_VAR(shut_gcc_up);
}
nextchar(pRExC_state);
if (freeze_paren) {
}
break;
}
- DEBUG_PARSE_r(
+ DEBUG_PARSE_r({
DEBUG_PARSE_MSG("lsbr");
regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
(IV)ender,
(IV)(ender - lastbr)
);
- );
+ });
if (! REGTAIL(pRExC_state, lastbr, ender)) {
REQUIRE_BRANCHJ(flagp, 0);
}
is_nothing= 0;
}
else if (op == BRANCHJ) {
- REGTAIL_STUDY(pRExC_state,
- REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
- ender);
+ bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
+ REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
+ ender);
+ PERL_UNUSED_VAR(shut_gcc_up);
/* for now we always disable this optimisation * /
if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
|| regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
br= PL_regkind[OP(ret_as_regnode)] != BRANCH
? regnext(ret_as_regnode)
: ret_as_regnode;
- DEBUG_PARSE_r(
+ DEBUG_PARSE_r({
DEBUG_PARSE_MSG("NADA");
regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
NULL, pRExC_state);
(IV)ender,
(IV)(ender - ret)
);
- );
+ });
OP(br)= NOTHING;
if (OP(REGNODE_p(ender)) == TAIL) {
NEXT_OFF(br)= 0;
const regnode_offset w = reg_node(pRExC_state, WHILEM);
FLAGS(REGNODE_p(w)) = 0;
- REGTAIL(pRExC_state, ret, w);
+ if (! REGTAIL(pRExC_state, ret, w)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
if (RExC_use_BRANCHJ) {
reginsert(pRExC_state, LONGJMP, ret, depth+1);
reginsert(pRExC_state, NOTHING, ret, depth+1);
if (RExC_use_BRANCHJ)
NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
LONGJMP. */
- REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
+ if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
+ NOTHING)))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
RExC_whilem_seen++;
MARK_NAUGHTY_EXP(1, 4); /* compound interest */
}
}
nest_check:
if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
- ckWARN2reg(RExC_parse,
- "%" UTF8f " matches null string many times",
- UTF8fARG(UTF, (RExC_parse >= origparse
- ? RExC_parse - origparse
- : 0),
- origparse));
+ if (origparse[0] == '\\' && origparse[1] == 'K') {
+ vFAIL2utf8f(
+ "%" UTF8f " is forbidden - matches null string many times",
+ UTF8fARG(UTF, (RExC_parse >= origparse
+ ? RExC_parse - origparse
+ : 0),
+ origparse));
+ /* NOT-REACHED */
+ } else {
+ ckWARN2reg(RExC_parse,
+ "%" UTF8f " matches null string many times",
+ UTF8fARG(UTF, (RExC_parse >= origparse
+ ? RExC_parse - origparse
+ : 0),
+ origparse));
+ }
}
if (*RExC_parse == '?') {
nextchar(pRExC_state);
reginsert(pRExC_state, MINMOD, ret, depth+1);
- REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
+ if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
}
else if (*RExC_parse == '+') {
regnode_offset ender;
nextchar(pRExC_state);
ender = reg_node(pRExC_state, SUCCEED);
- REGTAIL(pRExC_state, ret, ender);
+ if (! REGTAIL(pRExC_state, ret, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
reginsert(pRExC_state, SUSPEND, ret, depth+1);
ender = reg_node(pRExC_state, TAIL);
- REGTAIL(pRExC_state, ret, ender);
+ if (! REGTAIL(pRExC_state, ret, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
}
if (ISMULT2(RExC_parse)) {
value = (U8 *) SvPV(value_sv, value_len);
/* See if the result is one code point vs 0 or multiple */
- if (value_len > 0 && value_len <= (UV) ((SvUTF8(value_sv))
- ? UTF8SKIP(value)
- : 1))
+ if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
+ ? UTF8SKIP(value)
+ : 1)))
{
/* Here, exactly one code point. If that isn't what is wanted,
* fail */
* thing. */
do { /* Loop until the ending brace */
- UV cp = 0;
- char * start_digit; /* The first of the current code point */
- if (! isXDIGIT(*RExC_parse)) {
+ I32 flags = PERL_SCAN_SILENT_OVERFLOW
+ | PERL_SCAN_SILENT_ILLDIGIT
+ | PERL_SCAN_NOTIFY_ILLDIGIT
+ | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX;
+ STRLEN len = endbrace - RExC_parse;
+ NV overflow_value;
+ char * start_digit = RExC_parse;
+ UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
+
+ if (len == 0) {
RExC_parse++;
+ bad_NU:
vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
- start_digit = RExC_parse;
- count++;
-
- /* Loop through the hex digits of the current code point */
- do {
- /* Adding this digit will shift the result 4 bits. If that
- * result would be above the legal max, it's overflow */
- if (cp > MAX_LEGAL_CP >> 4) {
-
- /* Find the end of the code point */
- do {
- RExC_parse ++;
- } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_');
-
- /* Be sure to synchronize this message with the similar one
- * in utf8.c */
- vFAIL4("Use of code point 0x%.*s is not allowed; the"
- " permissible max is 0x%" UVxf,
- (int) (RExC_parse - start_digit), start_digit,
- MAX_LEGAL_CP);
- }
-
- /* Accumulate this (valid) digit into the running total */
- cp = (cp << 4) + READ_XDIGIT(RExC_parse);
+ RExC_parse += len;
- /* READ_XDIGIT advanced the input pointer. Ignore a single
- * underscore separator */
- if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) {
- RExC_parse++;
- }
- } while (isXDIGIT(*RExC_parse));
+ if (cp > MAX_LEGAL_CP) {
+ vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
+ }
- /* Here, have accumulated the next code point */
- if (RExC_parse >= endbrace) { /* If done ... */
- if (count != 1) {
+ if (RExC_parse >= endbrace) { /* Got to the closing '}' */
+ if (count) {
goto do_concat;
}
return TRUE;
}
- /* Here, the only legal thing would be a multiple character
- * sequence (of the form "\N{U+c1.c2. ... }". So the next
- * character must be a dot (and the one after that can't be the
- * endbrace, or we'd have something like \N{U+100.} ) */
+ /* Here, the parse stopped bfore the ending brace. This is legal
+ * only if that character is a dot separating code points, like a
+ * multiple character sequence (of the form "\N{U+c1.c2. ... }".
+ * So the next character must be a dot (and the one after that
+ * can't be the endbrace, or we'd have something like \N{U+100.} )
+ * */
if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
- ? UTF8SKIP(RExC_parse)
- : 1;
- if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */
- RExC_parse = endbrace;
- }
- vFAIL("Invalid hexadecimal number in \\N{U+...}");
+ ? UTF8SKIP(RExC_parse)
+ : 1;
+ RExC_parse = MIN(endbrace, RExC_parse);/* Guard against
+ malformed utf8 */
+ goto bad_NU;
}
/* Here, looks like its really a multiple character sequence. Fail
* but go through the motions of code point counting and error
* checking, if the caller doesn't want a node returned. */
- if (node_p && count == 1) {
+ if (node_p && ! substitute_parse) {
substitute_parse = newSVpvs("?:");
}
char *parse_start;
U8 op;
int invert = 0;
- U8 arg;
GET_RE_DEBUG_FLAGS_DECL;
*flagp |= HASWIDTH;
goto finish_meta_pat;
- case 'W':
- invert = 1;
- /* FALLTHROUGH */
- case 'w':
- arg = ANYOF_WORDCHAR;
- goto join_posix;
-
case 'B':
invert = 1;
/* FALLTHROUGH */
goto finish_meta_pat;
}
- case 'D':
- invert = 1;
- /* FALLTHROUGH */
- case 'd':
- arg = ANYOF_DIGIT;
- if (! DEPENDS_SEMANTICS) {
- goto join_posix;
- }
-
- /* \d doesn't have any matches in the upper Latin1 range, hence /d
- * is equivalent to /u. Changing to /u saves some branches at
- * runtime */
- op = POSIXU;
- goto join_posix_op_known;
-
case 'R':
ret = reg_node(pRExC_state, LNBREAK);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
- case 'H':
- invert = 1;
- /* FALLTHROUGH */
+ case 'd':
+ case 'D':
case 'h':
- arg = ANYOF_BLANK;
- op = POSIXU;
- goto join_posix_op_known;
-
- case 'V':
- invert = 1;
- /* FALLTHROUGH */
+ case 'H':
+ case 'p':
+ case 'P':
+ case 's':
+ case 'S':
case 'v':
- arg = ANYOF_VERTWS;
- op = POSIXU;
- goto join_posix_op_known;
+ case 'V':
+ case 'w':
+ case 'W':
+ /* These all have the same meaning inside [brackets], and it knows
+ * how to do the best optimizations for them. So, pretend we found
+ * these within brackets, and let it do the work */
+ RExC_parse--;
- case 'S':
- invert = 1;
- /* FALLTHROUGH */
- case 's':
- arg = ANYOF_SPACE;
+ ret = regclass(pRExC_state, flagp, depth+1,
+ TRUE, /* means just parse this element */
+ FALSE, /* don't allow multi-char folds */
+ FALSE, /* don't silence non-portable warnings. It
+ would be a bug if these returned
+ non-portables */
+ (bool) RExC_strict,
+ TRUE, /* Allow an optimized regnode result */
+ NULL);
+ RETURN_FAIL_ON_RESTART_FLAGP(flagp);
+ /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
+ * multi-char folds are allowed. */
+ if (!ret)
+ FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
+ (UV) *flagp);
- join_posix:
-
- op = POSIXD + get_regex_charset(RExC_flags);
- if (op > POSIXA) { /* /aa is same as /a */
- op = POSIXA;
- }
- else if (op == POSIXL) {
- RExC_contains_locale = 1;
- }
- else if (op == POSIXD) {
- RExC_seen_d_op = TRUE;
- }
-
- join_posix_op_known:
-
- if (invert) {
- op += NPOSIXD - POSIXD;
- }
-
- ret = reg_node(pRExC_state, op);
- FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
-
- *flagp |= HASWIDTH|SIMPLE;
- /* FALLTHROUGH */
+ RExC_parse--; /* regclass() leaves this one too far ahead */
finish_meta_pat:
- if ( UCHARAT(RExC_parse + 1) == '{'
+ /* The escapes above that don't take a parameter can't be
+ * followed by a '{'. But 'pX', 'p{foo}' and
+ * correspondingly 'P' can be */
+ if ( RExC_parse - parse_start == 1
+ && UCHARAT(RExC_parse + 1) == '{'
&& UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
{
RExC_parse += 2;
vFAIL("Unescaped left brace in regex is illegal here");
}
- nextchar(pRExC_state);
- Set_Node_Length(REGNODE_p(ret), 2); /* MJD */
- break;
- case 'p':
- case 'P':
- RExC_parse--;
-
- ret = regclass(pRExC_state, flagp, depth+1,
- TRUE, /* means just parse this element */
- FALSE, /* don't allow multi-char folds */
- FALSE, /* don't silence non-portable warnings. It
- would be a bug if these returned
- non-portables */
- (bool) RExC_strict,
- TRUE, /* Allow an optimized regnode result */
- NULL);
- RETURN_FAIL_ON_RESTART_FLAGP(flagp);
- /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
- * multi-char folds are allowed. */
- if (!ret)
- FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
- (UV) *flagp);
-
- RExC_parse--;
-
Set_Node_Offset(REGNODE_p(ret), parse_start);
- Set_Node_Cur_Length(REGNODE_p(ret), parse_start - 2);
+ Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
nextchar(pRExC_state);
break;
case 'N':
STRLEN len = 0;
UV ender = 0;
char *p;
- char *s;
-
-/* This allows us to fill a node with just enough spare so that if the final
- * character folds, its expansion is guaranteed to fit */
-#define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
-
+ char *s, *old_s = NULL, *old_old_s = NULL;
char *s0;
- U8 upper_parse = MAX_NODE_STRING_SIZE;
+ U32 max_string_len = 255;
+
+ /* We may have to reparse the node, artificially stopping filling
+ * it early, based on info gleaned in the first parse. This
+ * variable gives where we stop. Make it above the normal stopping
+ * place first time through; otherwise it would stop too early */
+ U32 upper_fill = max_string_len + 1;
/* We start out as an EXACT node, even if under /i, until we find a
* character which is in a fold. The algorithm now segregates into
U8 node_type = EXACT;
/* Assume the node will be fully used; the excess is given back at
- * the end. We can't make any other length assumptions, as a byte
- * input sequence could shrink down. */
- Ptrdiff_t initial_size = STR_SZ(256);
+ * the end. Under /i, we may need to temporarily add the fold of
+ * an extra character or two at the end to check for splitting
+ * multi-char folds, so allocate extra space for that. We can't
+ * make any other length assumptions, as a byte input sequence
+ * could shrink down. */
+ Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
+ + ((! FOLD)
+ ? 0
+ : 2 * ((UTF)
+ ? UTF8_MAXBYTES_CASE
+ /* Max non-UTF-8 expansion is 2 */ : 2)));
bool next_is_quantifier;
char * oldp = NULL;
/* So is the MICRO SIGN */
bool has_micro_sign = FALSE;
+ /* Set when we fill up the current node and there is still more
+ * text to process */
+ bool overflowed;
+
/* Allocate an EXACT node. The node_type may change below to
* another EXACTish node, but since the size of the node doesn't
* change, it works */
- ret = regnode_guts(pRExC_state, node_type, initial_size, "exact");
+ ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
+ "exact");
FILL_NODE(ret, node_type);
RExC_emit++;
reparse:
+ p = RExC_parse;
+ len = 0;
+ s = s0;
+ node_type = EXACT;
+ oldp = NULL;
+ maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
+ maybe_SIMPLE = SIMPLE;
+ requires_utf8_target = FALSE;
+ has_ss = FALSE;
+ has_micro_sign = FALSE;
+
+ continue_parse:
+
/* This breaks under rare circumstances. If folding, we do not
* want to split a node at a character that is a non-final in a
* multi-char fold, as an input string could just happen to want to
|| UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
|| UTF8_IS_START(UCHARAT(RExC_parse)));
+ overflowed = FALSE;
+
/* Here, we have a literal character. Find the maximal string of
* them in the input that we can fit into a single EXACTish node.
* We quit at the first non-literal or when the node gets full, or
* under /i the categorization of folding/non-folding character
* changes */
- for (p = RExC_parse; len < upper_parse && p < RExC_end; ) {
+ while (p < RExC_end && len < upper_fill) {
/* In most cases each iteration adds one byte to the output.
* The exceptions override this */
Size_t added_len = 1;
oldp = p;
+ old_old_s = old_s;
+ old_s = s;
/* White space has already been ignored */
assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
|| ! is_PATWS_safe((p), RExC_end, UTF));
switch ((U8)*p) {
+ const char* message;
+ U32 packed_warn;
+ U8 grok_c_char;
+
case '^':
case '$':
case '.':
p++;
break;
case 'o':
- {
- UV result;
- const char* error_msg;
-
- bool valid = grok_bslash_o(&p,
- RExC_end,
- &result,
- &error_msg,
- TO_OUTPUT_WARNINGS(p),
- (bool) RExC_strict,
- TRUE, /* Output warnings
- for non-
- portables */
- UTF);
- if (! valid) {
- RExC_parse = p; /* going to die anyway; point
- to exact spot of failure */
- vFAIL(error_msg);
- }
- UPDATE_WARNINGS_LOC(p - 1);
- ender = result;
- break;
- }
+ if (! grok_bslash_o(&p,
+ RExC_end,
+ &ender,
+ &message,
+ &packed_warn,
+ (bool) RExC_strict,
+ FALSE, /* No illegal cp's */
+ UTF))
+ {
+ RExC_parse = p; /* going to die anyway; point to
+ exact spot of failure */
+ vFAIL(message);
+ }
+
+ if (message && TO_OUTPUT_WARNINGS(p)) {
+ warn_non_literal_string(p, packed_warn, message);
+ }
+ break;
case 'x':
- {
- UV result = UV_MAX; /* initialize to erroneous
- value */
- const char* error_msg;
-
- bool valid = grok_bslash_x(&p,
- RExC_end,
- &result,
- &error_msg,
- TO_OUTPUT_WARNINGS(p),
- (bool) RExC_strict,
- TRUE, /* Silence warnings
- for non-
- portables */
- UTF);
- if (! valid) {
- RExC_parse = p; /* going to die anyway; point
- to exact spot of failure */
- vFAIL(error_msg);
- }
- UPDATE_WARNINGS_LOC(p - 1);
- ender = result;
+ if (! grok_bslash_x(&p,
+ RExC_end,
+ &ender,
+ &message,
+ &packed_warn,
+ (bool) RExC_strict,
+ FALSE, /* No illegal cp's */
+ UTF))
+ {
+ RExC_parse = p; /* going to die anyway; point
+ to exact spot of failure */
+ vFAIL(message);
+ }
+
+ if (message && TO_OUTPUT_WARNINGS(p)) {
+ warn_non_literal_string(p, packed_warn, message);
+ }
#ifdef EBCDIC
- if (ender < 0x100) {
- if (RExC_recode_x_to_native) {
- ender = LATIN1_TO_NATIVE(ender);
- }
- }
+ if (ender < 0x100) {
+ if (RExC_recode_x_to_native) {
+ ender = LATIN1_TO_NATIVE(ender);
+ }
+ }
#endif
- break;
- }
+ break;
case 'c':
- p++;
- ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p));
- UPDATE_WARNINGS_LOC(p);
p++;
+ if (! grok_bslash_c(*p, &grok_c_char,
+ &message, &packed_warn))
+ {
+ /* going to die anyway; point to exact spot of
+ * failure */
+ RExC_parse = p + ((UTF)
+ ? UTF8_SAFE_SKIP(p, RExC_end)
+ : 1);
+ vFAIL(message);
+ }
+
+ ender = grok_c_char;
+ p++;
+ if (message && TO_OUTPUT_WARNINGS(p)) {
+ warn_non_literal_string(p, packed_warn, message);
+ }
+
break;
case '8': case '9': /* must be a backreference */
--p;
/* FALLTHROUGH */
case '0':
{
- I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
+ I32 flags = PERL_SCAN_SILENT_ILLDIGIT
+ | PERL_SCAN_NOTIFY_ILLDIGIT;
STRLEN numlen = 3;
ender = grok_oct(p, &numlen, &flags, NULL);
p += numlen;
- if ( isDIGIT(*p) /* like \08, \178 */
- && ckWARN(WARN_REGEXP)
- && numlen < 3)
+ if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
+ && isDIGIT(*p) /* like \08, \178 */
+ && ckWARN(WARN_REGEXP))
{
reg_warn_non_literal_string(
- p + 1,
- form_short_octal_warning(p, numlen));
+ p + 1,
+ form_alien_digit_msg(8, numlen, p,
+ RExC_end, UTF, FALSE));
}
}
break;
if (ender > 255) {
REQUIRE_UTF8(flagp);
+ if ( UNICODE_IS_PERL_EXTENDED(ender)
+ && TO_OUTPUT_WARNINGS(p))
+ {
+ ckWARN2_non_literal_string(p,
+ packWARN(WARN_PORTABLE),
+ PL_extended_cp_format,
+ ender);
+ }
}
/* We need to check if the next non-ignored thing is a
/* Ready to add 'ender' to the node */
if (! FOLD) { /* The simple case, just append the literal */
+ not_fold_common:
- not_fold_common:
- if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
- *(s++) = (char) ender;
- }
- else {
- U8 * new_s = uvchr_to_utf8((U8*)s, ender);
- added_len = (char *) new_s - s;
- s = (char *) new_s;
+ /* Don't output if it would overflow */
+ if (UNLIKELY(len > max_string_len - ((UTF)
+ ? UVCHR_SKIP(ender)
+ : 1)))
+ {
+ overflowed = TRUE;
+ break;
+ }
- if (ender > 255) {
- requires_utf8_target = TRUE;
- }
+ if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
+ *(s++) = (char) ender;
+ }
+ else {
+ U8 * new_s = uvchr_to_utf8((U8*)s, ender);
+ added_len = (char *) new_s - s;
+ s = (char *) new_s;
+
+ if (ender > 255) {
+ requires_utf8_target = TRUE;
}
+ }
}
else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
goto loopdone;
}
- if (UTF) { /* Use the folded value */
+ if (UTF) { /* Alway use the folded value for UTF-8
+ patterns */
if (UVCHR_IS_INVARIANT(ender)) {
+ if (UNLIKELY(len + 1 > max_string_len)) {
+ overflowed = TRUE;
+ break;
+ }
+
*(s)++ = (U8) toFOLD(ender);
}
else {
- ender = _to_uni_fold_flags(
+ UV folded = _to_uni_fold_flags(
ender,
- (U8 *) s,
+ (U8 *) s, /* We have allocated extra space
+ in 's' so can't run off the
+ end */
&added_len,
FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
? FOLD_FLAGS_NOMIX_ASCII
: 0));
+ if (UNLIKELY(len + added_len > max_string_len)) {
+ overflowed = TRUE;
+ break;
+ }
+
s += added_len;
- if ( ender > 255
- && LIKELY(ender != GREEK_SMALL_LETTER_MU))
+ if ( folded > 255
+ && LIKELY(folded != GREEK_SMALL_LETTER_MU))
{
/* U+B5 folds to the MU, so its possible for a
* non-UTF-8 target to match it */
}
}
}
- else {
-
- /* Here is non-UTF8. First, see if the character's
- * fold differs between /d and /u. */
- if (PL_fold[ender] != PL_fold_latin1[ender]) {
- maybe_exactfu = FALSE;
+ else { /* Here is non-UTF8. */
+
+ /* The fold will be one or (rarely) two characters.
+ * Check that there's room for at least a single one
+ * before setting any flags, etc. Because otherwise an
+ * overflowing character could cause a flag to be set
+ * even though it doesn't end up in this node. (For
+ * the two character fold, we check again, before
+ * setting any flags) */
+ if (UNLIKELY(len + 1 > max_string_len)) {
+ overflowed = TRUE;
+ break;
}
#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
|| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
|| UNICODE_DOT_DOT_VERSION > 0)
- /* On non-ancient Unicode versions, this includes the
- * multi-char fold SHARP S to 'ss' */
-
- if ( UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)
- || ( isALPHA_FOLD_EQ(ender, 's')
- && len > 0
- && isALPHA_FOLD_EQ(*(s-1), 's')))
- {
- /* Here, we have one of the following:
- * a) a SHARP S. This folds to 'ss' only under
- * /u rules. If we are in that situation,
- * fold the SHARP S to 'ss'. See the comments
- * for join_exact() as to why we fold this
- * non-UTF at compile time, and no others.
- * b) 'ss'. When under /u, there's nothing
- * special needed to be done here. The
- * previous iteration handled the first 's',
- * and this iteration will handle the second.
- * If, on the otherhand it's not /u, we have
- * to exclude the possibility of moving to /u,
- * so that we won't generate an unwanted
- * match, unless, at runtime, the target
- * string is in UTF-8.
- * */
+ /* On non-ancient Unicodes, check for the only possible
+ * multi-char fold */
+ if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
+ /* This potential multi-char fold means the node
+ * can't be simple (because it could match more
+ * than a single char). And in some cases it will
+ * match 'ss', so set that flag */
+ maybe_SIMPLE = 0;
has_ss = TRUE;
- maybe_exactfu = FALSE; /* Can't generate an
- EXACTFU node (unless we
- already are in one) */
- if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
- maybe_SIMPLE = 0;
- if (node_type == EXACTFU) {
- *(s++) = 's';
-
- /* Let the code below add in the extra 's' */
- ender = 's';
- added_len = 2;
+
+ /* It can't change to be an EXACTFU (unless already
+ * is one). We fold it iff under /u rules. */
+ if (node_type != EXACTFU) {
+ maybe_exactfu = FALSE;
+ }
+ else {
+ if (UNLIKELY(len + 2 > max_string_len)) {
+ overflowed = TRUE;
+ break;
}
+
+ *(s++) = 's';
+ *(s++) = 's';
+ added_len = 2;
+
+ goto done_with_this_char;
}
}
+ else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
+ && LIKELY(len > 0)
+ && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
+ {
+ /* Also, the sequence 'ss' is special when not
+ * under /u. If the target string is UTF-8, it
+ * should match SHARP S; otherwise it won't. So,
+ * here we have to exclude the possibility of this
+ * node moving to /u.*/
+ has_ss = TRUE;
+ maybe_exactfu = FALSE;
+ }
#endif
+ /* Here, the fold will be a single character */
- else if (UNLIKELY(ender == MICRO_SIGN)) {
+ if (UNLIKELY(ender == MICRO_SIGN)) {
has_micro_sign = TRUE;
}
+ else if (PL_fold[ender] != PL_fold_latin1[ender]) {
+
+ /* If the character's fold differs between /d and
+ * /u, this can't change to be an EXACTFU node */
+ maybe_exactfu = FALSE;
+ }
*(s++) = (DEPENDS_SEMANTICS)
? (char) toFOLD(ender)
}
} /* End of adding current character to the node */
+ done_with_this_char:
+
len += added_len;
if (next_is_quantifier) {
} /* End of loop through literal characters */
- /* Here we have either exhausted the input or ran out of room in
- * the node. (If we encountered a character that can't be in the
- * node, transfer is made directly to <loopdone>, and so we
- * wouldn't have fallen off the end of the loop.) In the latter
- * case, we artificially have to split the node into two, because
- * we just don't have enough space to hold everything. This
- * creates a problem if the final character participates in a
- * multi-character fold in the non-final position, as a match that
- * should have occurred won't, due to the way nodes are matched,
- * and our artificial boundary. So back off until we find a non-
- * problematic character -- one that isn't at the beginning or
- * middle of such a fold. (Either it doesn't participate in any
- * folds, or appears only in the final position of all the folds it
- * does participate in.) A better solution with far fewer false
- * positives, and that would fill the nodes more completely, would
- * be to actually have available all the multi-character folds to
- * test against, and to back-off only far enough to be sure that
- * this node isn't ending with a partial one. <upper_parse> is set
- * further below (if we need to reparse the node) to include just
- * up through that final non-problematic character that this code
- * identifies, so when it is set to less than the full node, we can
- * skip the rest of this */
- if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
- PERL_UINT_FAST8_T backup_count = 0;
-
- const STRLEN full_len = len;
-
- assert(len >= MAX_NODE_STRING_SIZE);
-
- /* Here, <s> points to just beyond where we have output the
- * final character of the node. Look backwards through the
- * string until find a non- problematic character */
-
- if (! UTF) {
-
- /* This has no multi-char folds to non-UTF characters */
- if (ASCII_FOLD_RESTRICTED) {
- goto loopdone;
+ /* Here we have either exhausted the input or run out of room in
+ * the node. If the former, we are done. (If we encountered a
+ * character that can't be in the node, transfer is made directly
+ * to <loopdone>, and so we wouldn't have fallen off the end of the
+ * loop.) */
+ if (LIKELY(! overflowed)) {
+ goto loopdone;
+ }
+
+ /* Here we have run out of room. We can grow plain EXACT and
+ * LEXACT nodes. If the pattern is gigantic enough, though,
+ * eventually we'll have to artificially chunk the pattern into
+ * multiple nodes. */
+ if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
+ Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
+ Size_t overhead_expansion = 0;
+ char temp[256];
+ Size_t max_nodes_for_string;
+ Size_t achievable;
+ SSize_t delta;
+
+ /* Here we couldn't fit the final character in the current
+ * node, so it will have to be reparsed, no matter what else we
+ * do */
+ p = oldp;
+
+ /* If would have overflowed a regular EXACT node, switch
+ * instead to an LEXACT. The code below is structured so that
+ * the actual growing code is common to changing from an EXACT
+ * or just increasing the LEXACT size. This means that we have
+ * to save the string in the EXACT case before growing, and
+ * then copy it afterwards to its new location */
+ if (node_type == EXACT) {
+ overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
+ RExC_emit += overhead_expansion;
+ Copy(s0, temp, len, char);
+ }
+
+ /* Ready to grow. If it was a plain EXACT, the string was
+ * saved, and the first few bytes of it overwritten by adding
+ * an argument field. We assume, as we do elsewhere in this
+ * file, that one byte of remaining input will translate into
+ * one byte of output, and if that's too small, we grow again,
+ * if too large the excess memory is freed at the end */
+
+ max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
+ achievable = MIN(max_nodes_for_string,
+ current_string_nodes + STR_SZ(RExC_end - p));
+ delta = achievable - current_string_nodes;
+
+ /* If there is just no more room, go finish up this chunk of
+ * the pattern. */
+ if (delta <= 0) {
+ goto loopdone;
+ }
+
+ change_engine_size(pRExC_state, delta + overhead_expansion);
+ current_string_nodes += delta;
+ max_string_len
+ = sizeof(struct regnode) * current_string_nodes;
+ upper_fill = max_string_len + 1;
+
+ /* If the length was small, we know this was originally an
+ * EXACT node now converted to LEXACT, and the string has to be
+ * restored. Otherwise the string was untouched. 260 is just
+ * a number safely above 255 so don't have to worry about
+ * getting it precise */
+ if (len < 260) {
+ node_type = LEXACT;
+ FILL_NODE(ret, node_type);
+ s0 = STRING(REGNODE_p(ret));
+ Copy(temp, s0, len, char);
+ s = s0 + len;
+ }
+
+ goto continue_parse;
+ }
+ else if (FOLD) {
+ bool splittable = FALSE;
+ bool backed_up = FALSE;
+ char * e; /* should this be U8? */
+ char * s_start; /* should this be U8? */
+
+ /* Here is /i. Running out of room creates a problem if we are
+ * folding, and the split happens in the middle of a
+ * multi-character fold, as a match that should have occurred,
+ * won't, due to the way nodes are matched, and our artificial
+ * boundary. So back off until we aren't splitting such a
+ * fold. If there is no such place to back off to, we end up
+ * taking the entire node as-is. This can happen if the node
+ * consists entirely of 'f' or entirely of 's' characters (or
+ * things that fold to them) as 'ff' and 'ss' are
+ * multi-character folds.
+ *
+ * The Unicode standard says that multi character folds consist
+ * of either two or three characters. That means we would be
+ * splitting one if the final character in the node is at the
+ * beginning of either type, or is the second of a three
+ * character fold.
+ *
+ * At this point:
+ * ender is the code point of the character that won't fit
+ * in the node
+ * s points to just beyond the final byte in the node.
+ * It's where we would place ender if there were
+ * room, and where in fact we do place ender's fold
+ * in the code below, as we've over-allocated space
+ * for s0 (hence s) to allow for this
+ * e starts at 's' and advances as we append things.
+ * old_s is the same as 's'. (If ender had fit, 's' would
+ * have been advanced to beyond it).
+ * old_old_s points to the beginning byte of the final
+ * character in the node
+ * p points to the beginning byte in the input of the
+ * character beyond 'ender'.
+ * oldp points to the beginning byte in the input of
+ * 'ender'.
+ *
+ * In the case of /il, we haven't folded anything that could be
+ * affected by the locale. That means only above-Latin1
+ * characters that fold to other above-latin1 characters get
+ * folded at compile time. To check where a good place to
+ * split nodes is, everything in it will have to be folded.
+ * The boolean 'maybe_exactfu' keeps track in /il if there are
+ * any unfolded characters in the node. */
+ bool need_to_fold_loc = LOC && ! maybe_exactfu;
+
+ /* If we do need to fold the node, we need a place to store the
+ * folded copy, and a way to map back to the unfolded original
+ * */
+ char * locfold_buf = NULL;
+ Size_t * loc_correspondence = NULL;
+
+ if (! need_to_fold_loc) { /* The normal case. Just
+ initialize to the actual node */
+ e = s;
+ s_start = s0;
+ s = old_old_s; /* Point to the beginning of the final char
+ that fits in the node */
+ }
+ else {
+
+ /* Here, we have filled a /il node, and there are unfolded
+ * characters in it. If the runtime locale turns out to be
+ * UTF-8, there are possible multi-character folds, just
+ * like when not under /l. The node hence can't terminate
+ * in the middle of such a fold. To determine this, we
+ * have to create a folded copy of this node. That means
+ * reparsing the node, folding everything assuming a UTF-8
+ * locale. (If at runtime it isn't such a locale, the
+ * actions here wouldn't have been necessary, but we have
+ * to assume the worst case.) If we find we need to back
+ * off the folded string, we do so, and then map that
+ * position back to the original unfolded node, which then
+ * gets output, truncated at that spot */
+
+ char * redo_p = RExC_parse;
+ char * redo_e;
+ char * old_redo_e;
+
+ /* Allow enough space assuming a single byte input folds to
+ * a single byte output, plus assume that the two unparsed
+ * characters (that we may need) fold to the largest number
+ * of bytes possible, plus extra for one more worst case
+ * scenario. In the loop below, if we start eating into
+ * that final spare space, we enlarge this initial space */
+ Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
+
+ Newxz(locfold_buf, size, char);
+ Newxz(loc_correspondence, size, Size_t);
+
+ /* Redo this node's parse, folding into 'locfold_buf' */
+ redo_p = RExC_parse;
+ old_redo_e = redo_e = locfold_buf;
+ while (redo_p <= oldp) {
+
+ old_redo_e = redo_e;
+ loc_correspondence[redo_e - locfold_buf]
+ = redo_p - RExC_parse;
+
+ if (UTF) {
+ Size_t added_len;
+
+ (void) _to_utf8_fold_flags((U8 *) redo_p,
+ (U8 *) RExC_end,
+ (U8 *) redo_e,
+ &added_len,
+ FOLD_FLAGS_FULL);
+ redo_e += added_len;
+ redo_p += UTF8SKIP(redo_p);
+ }
+ else {
+
+ /* Note that if this code is run on some ancient
+ * Unicode versions, SHARP S doesn't fold to 'ss',
+ * but rather than clutter the code with #ifdef's,
+ * as is done above, we ignore that possibility.
+ * This is ok because this code doesn't affect what
+ * gets matched, but merely where the node gets
+ * split */
+ if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
+ *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
+ }
+ else {
+ *redo_e++ = 's';
+ *redo_e++ = 's';
+ }
+ redo_p++;
+ }
+
+
+ /* If we're getting so close to the end that a
+ * worst-case fold in the next character would cause us
+ * to overflow, increase, assuming one byte output byte
+ * per one byte input one, plus room for another worst
+ * case fold */
+ if ( redo_p <= oldp
+ && redo_e > locfold_buf + size
+ - (UTF8_MAXBYTES_CASE + 1))
+ {
+ Size_t new_size = size
+ + (oldp - redo_p)
+ + UTF8_MAXBYTES_CASE + 1;
+ Ptrdiff_t e_offset = redo_e - locfold_buf;
+
+ Renew(locfold_buf, new_size, char);
+ Renew(loc_correspondence, new_size, Size_t);
+ size = new_size;
+
+ redo_e = locfold_buf + e_offset;
+ }
}
- while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) {
- backup_count++;
+ /* Set so that things are in terms of the folded, temporary
+ * string */
+ s = old_redo_e;
+ s_start = locfold_buf;
+ e = redo_e;
+
+ }
+
+ /* Here, we have 's', 's_start' and 'e' set up to point to the
+ * input that goes into the node, folded.
+ *
+ * If the final character of the node and the fold of ender
+ * form the first two characters of a three character fold, we
+ * need to peek ahead at the next (unparsed) character in the
+ * input to determine if the three actually do form such a
+ * fold. Just looking at that character is not generally
+ * sufficient, as it could be, for example, an escape sequence
+ * that evaluates to something else, and it needs to be folded.
+ *
+ * khw originally thought to just go through the parse loop one
+ * extra time, but that doesn't work easily as that iteration
+ * could cause things to think that the parse is over and to
+ * goto loopdone. The character could be a '$' for example, or
+ * the character beyond could be a quantifier, and other
+ * glitches as well.
+ *
+ * The solution used here for peeking ahead is to look at that
+ * next character. If it isn't ASCII punctuation, then it will
+ * be something that continues in an EXACTish node if there
+ * were space. We append the fold of it to s, having reserved
+ * enough room in s0 for the purpose. If we can't reasonably
+ * peek ahead, we instead assume the worst case: that it is
+ * something that would form the completion of a multi-char
+ * fold.
+ *
+ * If we can't split between s and ender, we work backwards
+ * character-by-character down to s0. At each current point
+ * see if we are at the beginning of a multi-char fold. If so,
+ * that means we would be splitting the fold across nodes, and
+ * so we back up one and try again.
+ *
+ * If we're not at the beginning, we still could be at the
+ * final two characters of a (rare) three character fold. We
+ * check if the sequence starting at the character before the
+ * current position (and including the current and next
+ * characters) is a three character fold. If not, the node can
+ * be split here. If it is, we have to backup two characters
+ * and try again.
+ *
+ * Otherwise, the node can be split at the current position.
+ *
+ * The same logic is used for UTF-8 patterns and not */
+ if (UTF) {
+ Size_t added_len;
+
+ /* Append the fold of ender */
+ (void) _to_uni_fold_flags(
+ ender,
+ (U8 *) e,
+ &added_len,
+ FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
+ e += added_len;
+
+ /* 's' and the character folded to by ender may be the
+ * first two of a three-character fold, in which case the
+ * node should not be split here. That may mean examining
+ * the so-far unparsed character starting at 'p'. But if
+ * ender folded to more than one character, we already have
+ * three characters to look at. Also, we first check if
+ * the sequence consisting of s and the next character form
+ * the first two of some three character fold. If not,
+ * there's no need to peek ahead. */
+ if ( added_len <= UTF8SKIP(e - added_len)
+ && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
+ {
+ /* Here, the two do form the beginning of a potential
+ * three character fold. The unexamined character may
+ * or may not complete it. Peek at it. It might be
+ * something that ends the node or an escape sequence,
+ * in which case we don't know without a lot of work
+ * what it evaluates to, so we have to assume the worst
+ * case: that it does complete the fold, and so we
+ * can't split here. All such instances will have
+ * that character be an ASCII punctuation character,
+ * like a backslash. So, for that case, backup one and
+ * drop down to try at that position */
+ if (isPUNCT(*p)) {
+ s = (char *) utf8_hop_back((U8 *) s, -1,
+ (U8 *) s_start);
+ backed_up = TRUE;
+ }
+ else {
+ /* Here, since it's not punctuation, it must be a
+ * real character, and we can append its fold to
+ * 'e' (having deliberately reserved enough space
+ * for this eventuality) and drop down to check if
+ * the three actually do form a folded sequence */
+ (void) _to_utf8_fold_flags(
+ (U8 *) p, (U8 *) RExC_end,
+ (U8 *) e,
+ &added_len,
+ FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
+ e += added_len;
+ }
}
- len = s - s0 + 1;
- }
- else {
- /* Point to the first byte of the final character */
- s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+ /* Here, we either have three characters available in
+ * sequence starting at 's', or we have two characters and
+ * know that the following one can't possibly be part of a
+ * three character fold. We go through the node backwards
+ * until we find a place where we can split it without
+ * breaking apart a multi-character fold. At any given
+ * point we have to worry about if such a fold begins at
+ * the current 's', and also if a three-character fold
+ * begins at s-1, (containing s and s+1). Splitting in
+ * either case would break apart a fold */
+ do {
+ char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
+ (U8 *) s_start);
+
+ /* If is a multi-char fold, can't split here. Backup
+ * one char and try again */
+ if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
+ s = prev_s;
+ backed_up = TRUE;
+ continue;
+ }
+
+ /* If the two characters beginning at 's' are part of a
+ * three character fold starting at the character
+ * before s, we can't split either before or after s.
+ * Backup two chars and try again */
+ if ( LIKELY(s > s_start)
+ && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
+ {
+ s = prev_s;
+ s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
+ backed_up = TRUE;
+ continue;
+ }
- while (s >= s0) { /* Search backwards until find
- a non-problematic char */
- if (UTF8_IS_INVARIANT(*s)) {
+ /* Here there's no multi-char fold between s and the
+ * next character following it. We can split */
+ splittable = TRUE;
+ break;
- /* There are no ascii characters that participate
- * in multi-char folds under /aa. In EBCDIC, the
- * non-ascii invariants are all control characters,
- * so don't ever participate in any folds. */
- if (ASCII_FOLD_RESTRICTED
- || ! IS_NON_FINAL_FOLD(*s))
- {
- break;
- }
+ } while (s > s_start); /* End of loops backing up through the node */
+
+ /* Here we either couldn't find a place to split the node,
+ * or else we broke out of the loop setting 'splittable' to
+ * true. In the latter case, the place to split is between
+ * the first and second characters in the sequence starting
+ * at 's' */
+ if (splittable) {
+ s += UTF8SKIP(s);
+ }
+ }
+ else { /* Pattern not UTF-8 */
+ if ( ender != LATIN_SMALL_LETTER_SHARP_S
+ || ASCII_FOLD_RESTRICTED)
+ {
+ assert( toLOWER_L1(ender) < 256 );
+ *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
+ }
+ else {
+ *e++ = 's';
+ *e++ = 's';
+ }
+
+ if ( e - s <= 1
+ && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
+ {
+ if (isPUNCT(*p)) {
+ s--;
+ backed_up = TRUE;
}
- else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
- if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
- *s, *(s+1))))
+ else {
+ if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
+ || ASCII_FOLD_RESTRICTED)
{
- break;
+ assert( toLOWER_L1(ender) < 256 );
+ *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
+ }
+ else {
+ *e++ = 's';
+ *e++ = 's';
}
}
- else if (! _invlist_contains_cp(
- PL_NonFinalFold,
- valid_utf8_to_uvchr((U8 *) s, NULL)))
- {
- break;
+ }
+
+ do {
+ if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
+ s--;
+ backed_up = TRUE;
+ continue;
}
- /* Here, the current character is problematic in that
- * it does occur in the non-final position of some
- * fold, so try the character before it, but have to
- * special case the very first byte in the string, so
- * we don't read outside the string */
- s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
- backup_count++;
- } /* End of loop backwards through the string */
-
- /* If there were only problematic characters in the string,
- * <s> will point to before s0, in which case the length
- * should be 0, otherwise include the length of the
- * non-problematic character just found */
- len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
- }
+ if ( LIKELY(s > s_start)
+ && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
+ {
+ s -= 2;
+ backed_up = TRUE;
+ continue;
+ }
- /* Here, have found the final character, if any, that is
- * non-problematic as far as ending the node without splitting
- * it across a potential multi-char fold. <len> contains the
- * number of bytes in the node up-to and including that
- * character, or is 0 if there is no such character, meaning
- * the whole node contains only problematic characters. In
- * this case, give up and just take the node as-is. We can't
- * do any better */
- if (len == 0) {
- len = full_len;
+ splittable = TRUE;
+ break;
- } else {
+ } while (s > s_start);
- /* Here, the node does contain some characters that aren't
- * problematic. If we didn't have to backup any, then the
- * final character in the node is non-problematic, and we
- * can take the node as-is */
- if (backup_count == 0) {
- goto loopdone;
+ if (splittable) {
+ s++;
}
- else if (backup_count == 1) {
+ }
- /* If the final character is problematic, but the
- * penultimate is not, back-off that last character to
- * later start a new node with it */
- p = oldp;
- goto loopdone;
+ /* Here, we are done backing up. If we didn't backup at all
+ * (the likely case), just proceed */
+ if (backed_up) {
+
+ /* If we did find a place to split, reparse the entire node
+ * stopping where we have calculated. */
+ if (splittable) {
+
+ /* If we created a temporary folded string under /l, we
+ * have to map that back to the original */
+ if (need_to_fold_loc) {
+ upper_fill = loc_correspondence[s - s_start];
+ Safefree(locfold_buf);
+ Safefree(loc_correspondence);
+
+ if (upper_fill == 0) {
+ FAIL2("panic: loc_correspondence[%d] is 0",
+ (int) (s - s_start));
+ }
+ }
+ else {
+ upper_fill = s - s0;
+ }
+ goto reparse;
+ }
+ else if (need_to_fold_loc) {
+ Safefree(locfold_buf);
+ Safefree(loc_correspondence);
}
- /* Here, the final non-problematic character is earlier
- * in the input than the penultimate character. What we do
- * is reparse from the beginning, going up only as far as
- * this final ok one, thus guaranteeing that the node ends
- * in an acceptable character. The reason we reparse is
- * that we know how far in the character is, but we don't
- * know how to correlate its position with the input parse.
- * An alternate implementation would be to build that
- * correlation as we go along during the original parse,
- * but that would entail extra work for every node, whereas
- * this code gets executed only when the string is too
- * large for the node, and the final two characters are
- * problematic, an infrequent occurrence. Yet another
- * possible strategy would be to save the tail of the
- * string, and the next time regatom is called, initialize
- * with that. The problem with this is that unless you
- * back off one more character, you won't be guaranteed
- * regatom will get called again, unless regbranch,
- * regpiece ... are also changed. If you do back off that
- * extra character, so that there is input guaranteed to
- * force calling regatom, you can't handle the case where
- * just the first character in the node is acceptable. I
- * (khw) decided to try this method which doesn't have that
- * pitfall; if performance issues are found, we can do a
- * combination of the current approach plus that one */
- upper_parse = len;
- len = 0;
- s = s0;
- goto reparse;
+ /* Here the node consists entirely of non-final multi-char
+ * folds. (Likely it is all 'f's or all 's's.) There's no
+ * decent place to split it, so give up and just take the
+ * whole thing */
+ len = old_s - s0;
}
} /* End of verifying node ends with an appropriate char */
+ /* We need to start the next node at the character that didn't fit
+ * in this one */
+ p = oldp;
+
loopdone: /* Jumped to when encounters something that shouldn't be
in the node */
/* Free up any over-allocated space; cast is to silence bogus
* warning in MS VC */
change_engine_size(pRExC_state,
- - (Ptrdiff_t) (initial_size - STR_SZ(len)));
+ - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
/* I (khw) don't know if you can get here with zero length, but the
* old code handled this situation by creating a zero-length EXACT
else {
/* If the node type is EXACT here, check to see if it
- * should be EXACTL, or EXACT_ONLY8. */
+ * should be EXACTL, or EXACT_REQ8. */
if (node_type == EXACT) {
if (LOC) {
node_type = EXACTL;
}
else if (requires_utf8_target) {
- node_type = EXACT_ONLY8;
+ node_type = EXACT_REQ8;
}
- } else if (FOLD) {
+ }
+ else if (node_type == LEXACT) {
+ if (requires_utf8_target) {
+ node_type = LEXACT_REQ8;
+ }
+ }
+ else if (FOLD) {
if ( UNLIKELY(has_micro_sign || has_ss)
&& (node_type == EXACTFU || ( node_type == EXACTF
&& maybe_exactfu)))
if (maybe_exactfu) {
node_type = EXACTFLU8;
}
+ else if (UNLIKELY(
+ _invlist_contains_cp(PL_HasMultiCharFold, ender)))
+ {
+ /* A character that folds to more than one will
+ * match multiple characters, so can't be SIMPLE.
+ * We don't have to worry about this with EXACTFLU8
+ * nodes just above, as they have already been
+ * folded (since the fold doesn't vary at run
+ * time). Here, if the final character in the node
+ * folds to multiple, it can't be simple. (This
+ * only has an effect if the node has only a single
+ * character, hence the final one, as elsewhere we
+ * turn off simple for nodes whose length > 1 */
+ maybe_SIMPLE = 0;
+ }
}
else if (node_type == EXACTF) { /* Means is /di */
+ /* This intermediate variable is needed solely because
+ * the asserts in the macro where used exceed Win32's
+ * literal string capacity */
+ char first_char = * STRING(REGNODE_p(ret));
+
/* If 'maybe_exactfu' is clear, then we need to stay
* /di. If it is set, it means there are no code
* points that match differently depending on UTF8ness
if (! maybe_exactfu) {
RExC_seen_d_op = TRUE;
}
- else if ( isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's')
+ else if ( isALPHA_FOLD_EQ(first_char, 's')
|| isALPHA_FOLD_EQ(ender, 's'))
{
/* But, if the node begins or ends in an 's' we
}
if (requires_utf8_target && node_type == EXACTFU) {
- node_type = EXACTFU_ONLY8;
+ node_type = EXACTFU_REQ8;
}
}
OP(REGNODE_p(ret)) = node_type;
- STR_LEN(REGNODE_p(ret)) = len;
+ setSTR_LEN(REGNODE_p(ret), len);
RExC_emit += STR_SZ(len);
/* If the node isn't a single character, it can't be SIMPLE */
- if (len > (Size_t) ((UTF) ? UVCHR_SKIP(ender) : 1)) {
+ if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
maybe_SIMPLE = 0;
}
assert(PL_regkind[OP(node)] == ANYOF);
/* There is no bitmap for this node type */
- if (inRANGE(OP(node), ANYOFH, ANYOFHr)) {
+ if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
return;
}
/* Recurse, with the meat of the embedded expression */
RExC_parse++;
- (void) handle_regex_sets(pRExC_state, ¤t, flagp,
- depth+1, oregcomp_parse);
+ if (! handle_regex_sets(pRExC_state, ¤t, flagp,
+ depth+1, oregcomp_parse))
+ {
+ RETURN_FAIL_ON_RESTART(*flagp, flagp);
+ }
/* Here, 'current' contains the embedded expression's
* inversion list, and RExC_parse points to the trailing
FALSE, /* Require return to be an ANYOF */
¤t))
{
+ RETURN_FAIL_ON_RESTART(*flagp, flagp);
goto regclass_failed;
}
FALSE, /* Require return to be an ANYOF */
¤t))
{
+ RETURN_FAIL_ON_RESTART(*flagp, flagp);
goto regclass_failed;
}
well have generated non-portable code points, but
they're valid on this machine */
FALSE, /* similarly, no need for strict */
- FALSE, /* Require return to be an ANYOF */
+
+ /* We can optimize into something besides an ANYOF, except
+ * under /l, which needs to be ANYOF because of runtime
+ * checks for locale sanity, etc */
+ ! in_locale,
NULL
);
RExC_flags |= RXf_PMf_FOLD;
}
- if (!node)
+ if (!node) {
+ RETURN_FAIL_ON_RESTART(*flagp, flagp);
goto regclass_failed;
+ }
/* Fix up the node type if we are in locale. (We have pretended we are
* under /u for the purposes of regclass(), as this construct will only
current Unicode version */
{
Size_t folds_count;
- unsigned int first_fold;
- const unsigned int * remaining_folds;
+ U32 first_fold;
+ const U32 * remaining_folds;
UV folded_cp;
if (isASCII(cp)) {
PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
+ CLEAR_POSIX_WARNINGS();
return;
}
UPDATE_WARNINGS_LOC(RExC_parse);
}
+PERL_STATIC_INLINE Size_t
+S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
+{
+ const U8 * const start = s1;
+ const U8 * const send = start + max;
+
+ PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
+
+ while (s1 < send && *s1 == *s2) {
+ s1++; s2++;
+ }
+
+ return s1 - start;
+}
+
+
STATIC AV *
S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
{
* is already in 'value'. Otherwise, need to translate the escape
* into what it signifies. */
if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
+ const char * message;
+ U32 packed_warn;
+ U8 grok_c_char;
case 'w': namedclass = ANYOF_WORDCHAR; break;
case 'W': namedclass = ANYOF_NWORDCHAR; break;
case 'a': value = '\a'; break;
case 'o':
RExC_parse--; /* function expects to be pointed at the 'o' */
- {
- const char* error_msg;
- bool valid = grok_bslash_o(&RExC_parse,
- RExC_end,
- &value,
- &error_msg,
- TO_OUTPUT_WARNINGS(RExC_parse),
- strict,
- silence_non_portable,
- UTF);
- if (! valid) {
- vFAIL(error_msg);
- }
- UPDATE_WARNINGS_LOC(RExC_parse - 1);
- }
- non_portable_endpoint++;
+ if (! grok_bslash_o(&RExC_parse,
+ RExC_end,
+ &value,
+ &message,
+ &packed_warn,
+ strict,
+ cBOOL(range), /* MAX_UV allowed for range
+ upper limit */
+ UTF))
+ {
+ vFAIL(message);
+ }
+ else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+ warn_non_literal_string(RExC_parse, packed_warn, message);
+ }
+
+ if (value < 256) {
+ non_portable_endpoint++;
+ }
break;
case 'x':
RExC_parse--; /* function expects to be pointed at the 'x' */
- {
- const char* error_msg;
- bool valid = grok_bslash_x(&RExC_parse,
- RExC_end,
- &value,
- &error_msg,
- TO_OUTPUT_WARNINGS(RExC_parse),
- strict,
- silence_non_portable,
- UTF);
- if (! valid) {
- vFAIL(error_msg);
- }
- UPDATE_WARNINGS_LOC(RExC_parse - 1);
- }
- non_portable_endpoint++;
+ if (! grok_bslash_x(&RExC_parse,
+ RExC_end,
+ &value,
+ &message,
+ &packed_warn,
+ strict,
+ cBOOL(range), /* MAX_UV allowed for range
+ upper limit */
+ UTF))
+ {
+ vFAIL(message);
+ }
+ else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+ warn_non_literal_string(RExC_parse, packed_warn, message);
+ }
+
+ if (value < 256) {
+ non_portable_endpoint++;
+ }
break;
case 'c':
- value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse));
- UPDATE_WARNINGS_LOC(RExC_parse);
- RExC_parse++;
+ if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
+ &packed_warn))
+ {
+ /* going to die anyway; point to exact spot of
+ * failure */
+ RExC_parse += (UTF)
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
+ vFAIL(message);
+ }
+
+ value = grok_c_char;
+ RExC_parse++;
+ if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+ warn_non_literal_string(RExC_parse, packed_warn, message);
+ }
+
non_portable_endpoint++;
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7':
{
/* Take 1-3 octal digits */
- I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
+ I32 flags = PERL_SCAN_SILENT_ILLDIGIT
+ | PERL_SCAN_NOTIFY_ILLDIGIT;
numlen = (strict) ? 4 : 3;
value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
RExC_parse += numlen;
: 1;
vFAIL("Need exactly 3 octal digits");
}
- else if ( numlen < 3 /* like \08, \178 */
+ else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
&& RExC_parse < RExC_end
&& isDIGIT(*RExC_parse)
&& ckWARN(WARN_REGEXP))
{
reg_warn_non_literal_string(
RExC_parse + 1,
- form_short_octal_warning(RExC_parse, numlen));
+ form_alien_digit_msg(8, numlen, RExC_parse,
+ RExC_end, UTF, FALSE));
}
}
- non_portable_endpoint++;
+ if (value < 256) {
+ non_portable_endpoint++;
+ }
break;
}
default:
/* non-Latin1 code point implies unicode semantics. */
if (value > 255) {
+ if (value > MAX_LEGAL_CP && ( value != UV_MAX
+ || prevvalue > MAX_LEGAL_CP))
+ {
+ vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
+ }
REQUIRE_UNI_RULES(flagp, 0);
+ if ( ! silence_non_portable
+ && UNICODE_IS_PERL_EXTENDED(value)
+ && TO_OUTPUT_WARNINGS(RExC_parse))
+ {
+ ckWARN2_non_literal_string(RExC_parse,
+ packWARN(WARN_PORTABLE),
+ PL_extended_cp_format,
+ value);
+ }
}
/* Ready to process either the single value, or the completed range.
STRLEN foldlen;
unsigned int k;
Size_t folds_count;
- unsigned int first_fold;
- const unsigned int * remaining_folds;
+ U32 first_fold;
+ const U32 * remaining_folds;
if (j < 256) {
/* Likewise for 'posixes' */
_invlist_union(posixes, cp_list, &cp_list);
+ SvREFCNT_dec(posixes);
/* Likewise for anything else in the range that matched only
* under UTF-8 */
|= ANYOFL_FOLD
| ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
}
- else if (cp_list) { /* Look to see if a 0-255 code point is in list */
- UV start, end;
- invlist_iterinit(cp_list);
- if (invlist_iternext(cp_list, &start, &end) && start < 256) {
- anyof_flags |= ANYOFL_FOLD;
- has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
- }
- invlist_iterfinish(cp_list);
+ else if (cp_list && invlist_lowest(cp_list) < 256) {
+ /* If nothing is below 256, has no locale dependency; otherwise it
+ * does */
+ anyof_flags |= ANYOFL_FOLD;
+ has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
}
}
else if ( DEPENDS_SEMANTICS
invert = FALSE;
}
+ /* All possible optimizations below still have these characteristics.
+ * (Multi-char folds aren't SIMPLE, but they don't get this far in this
+ * routine) */
+ *flagp |= HASWIDTH|SIMPLE;
+
if (ret_invlist) {
*ret_invlist = cp_list;
return RExC_emit;
}
- /* All possible optimizations below still have these characteristics.
- * (Multi-char folds aren't SIMPLE, but they don't get this far in this
- * routine) */
- *flagp |= HASWIDTH|SIMPLE;
-
if (anyof_flags & ANYOF_LOCALE_FLAGS) {
RExC_contains_locale = 1;
}
if (optimizable) {
PERL_UINT_FAST8_T i;
- Size_t partial_cp_count = 0;
+ UV partial_cp_count = 0;
UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
UV end[MAX_FOLD_FROMS+1] = { 0 };
+ bool single_range = FALSE;
if (cp_list) { /* Count the code points in enough ranges that we would
see all the ones possible in any fold in this version
partial_cp_count += end[i] - start[i] + 1;
}
+ if (i == 1) {
+ single_range = TRUE;
+ }
invlist_iterfinish(cp_list);
}
goto not_anyof;
}
}
+
/* For well-behaved locales, some classes are subsets of others,
* so complementing the subset and including the non-complemented
* superset should match everything, like [\D[:alnum:]], and
/* Next see if can optimize classes that contain just a few code points
* into an EXACTish node. The reason to do this is to let the
- * optimizer join this node with adjacent EXACTish ones.
+ * optimizer join this node with adjacent EXACTish ones, and ANYOF
+ * nodes require conversion to code point from UTF-8.
*
* An EXACTFish node can be generated even if not under /i, and vice
* versa. But care must be taken. An EXACTFish node has to be such
* participates in no fold whatsoever, and having it EXACT tells the
* optimizer the target string cannot match unless it has a colon in
* it.
- *
- * We don't typically generate an EXACTish node if doing so would
- * require changing the pattern to UTF-8, as that affects /d and
- * otherwise is slower. However, under /i, not changing to UTF-8 can
- * miss some potential multi-character folds. We calculate the
- * EXACTish node, and then decide if something would be missed if we
- * don't upgrade */
+ */
if ( ! posixl
&& ! invert
/* Only try if there are no more code points in the class than
* in the max possible fold */
- && partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1
-
- && (start[0] < 256 || UTF || FOLD))
+ && inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
{
if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
{
if (LOC) {
- /* Here is /l: Use EXACTL, except /li indicates EXACTFL,
- * as that means there is a fold not known until runtime so
- * shows as only a single code point here. */
- op = (FOLD) ? EXACTFL : EXACTL;
+ /* Here is /l: Use EXACTL, except if there is a fold not
+ * known until runtime so shows as only a single code point
+ * here. For code points above 255, we know which can
+ * cause problems by having a potential fold to the Latin1
+ * range. */
+ if ( ! FOLD
+ || ( start[0] > 255
+ && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
+ {
+ op = EXACTL;
+ }
+ else {
+ op = EXACTFL;
+ }
}
else if (! FOLD) { /* Not /l and not /i */
- op = (start[0] < 256) ? EXACT : EXACT_ONLY8;
+ op = (start[0] < 256) ? EXACT : EXACT_REQ8;
}
else if (start[0] < 256) { /* /i, not /l, and the code point is
small */
applies to it */
op = _invlist_contains_cp(PL_InMultiCharFold,
start[0])
- ? EXACTFU_ONLY8
- : EXACT_ONLY8;
+ ? EXACTFU_REQ8
+ : EXACT_REQ8;
}
value = start[0];
U8 foldbuf[UTF8_MAXBYTES_CASE];
UV folded = _to_uni_fold_flags(start[0],
foldbuf, &foldlen, 0);
- unsigned int first_fold;
- const unsigned int * remaining_folds;
+ U32 first_fold;
+ const U32 * remaining_folds;
Size_t folds_to_this_cp_count = _inverse_folds(
folded,
&first_fold,
? EXACTFLU8
: (ASCII_FOLD_RESTRICTED)
? EXACTFAA
- : EXACTFU_ONLY8;
+ : EXACTFU_REQ8;
value = folded;
}
} /* Below, the lowest code point < 256 */
}
if (op != END) {
+ U8 len;
- /* Here, we have calculated what EXACTish node we would use.
- * But we don't use it if it would require converting the
- * pattern to UTF-8, unless not using it could cause us to miss
- * some folds (hence be buggy) */
-
- if (! UTF && value > 255) {
- SV * in_multis = NULL;
-
- assert(FOLD);
-
- /* If there is no code point that is part of a multi-char
- * fold, then there aren't any matches, so we don't do this
- * optimization. Otherwise, it could match depending on
- * the context around us, so we do upgrade */
- _invlist_intersection(PL_InMultiCharFold, cp_list, &in_multis);
- if (UNLIKELY(_invlist_len(in_multis) != 0)) {
+ /* Here, we have calculated what EXACTish node to use. Have to
+ * convert to UTF-8 if not already there */
+ if (value > 255) {
+ if (! UTF) {
+ SvREFCNT_dec(cp_list);;
REQUIRE_UTF8(flagp);
}
- else {
- op = END;
+
+ /* This is a kludge to the special casing issues with this
+ * ligature under /aa. FB05 should fold to FB06, but the
+ * call above to _to_uni_fold_flags() didn't find this, as
+ * it didn't use the /aa restriction in order to not miss
+ * other folds that would be affected. This is the only
+ * instance likely to ever be a problem in all of Unicode.
+ * So special case it. */
+ if ( value == LATIN_SMALL_LIGATURE_LONG_S_T
+ && ASCII_FOLD_RESTRICTED)
+ {
+ value = LATIN_SMALL_LIGATURE_ST;
}
}
- if (op != END) {
- U8 len = (UTF) ? UVCHR_SKIP(value) : 1;
+ len = (UTF) ? UVCHR_SKIP(value) : 1;
- ret = regnode_guts(pRExC_state, op, len, "exact");
- FILL_NODE(ret, op);
- RExC_emit += 1 + STR_SZ(len);
- STR_LEN(REGNODE_p(ret)) = len;
- if (len == 1) {
- *STRING(REGNODE_p(ret)) = (U8) value;
- }
- else {
- uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
- }
- goto not_anyof;
+ ret = regnode_guts(pRExC_state, op, len, "exact");
+ FILL_NODE(ret, op);
+ RExC_emit += 1 + STR_SZ(len);
+ setSTR_LEN(REGNODE_p(ret), len);
+ if (len == 1) {
+ *STRINGs(REGNODE_p(ret)) = (U8) value;
}
+ else {
+ uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
+ }
+ goto not_anyof;
}
}
if (invlist_highest(cp_list) <= max_permissible) {
UV this_start, this_end;
- UV lowest_cp = UV_MAX; /* inited to suppress compiler warn */
+ UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */
U8 bits_differing = 0;
Size_t full_cp_count = 0;
bool first_time = TRUE;
if (op != END) {
goto not_anyof;
}
+
+ /* XXX We could create an ANYOFR_LOW node here if we saved above if
+ * all were invariants, it wasn't inverted, and there is a single
+ * range. This would be faster than some of the posix nodes we
+ * create below like /\d/a, but would be twice the size. Without
+ * having actually measured the gain, khw doesn't think the
+ * tradeoff is really worth it */
}
if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
SvREFCNT_dec(intersection);
}
+ /* If it is a single contiguous range, ANYOFR is an efficient regnode,
+ * both in size and speed. Currently, a 20 bit range base (smallest
+ * code point in the range), and a 12 bit maximum delta are packed into
+ * a 32 bit word. This allows for using it on all of the Unicode code
+ * points except for the highest plane, which is only for private use
+ * code points. khw doubts that a bigger delta is likely in real world
+ * applications */
+ if ( single_range
+ && ! has_runtime_dependency
+ && anyof_flags == 0
+ && start[0] < (1 << ANYOFR_BASE_BITS)
+ && end[0] - start[0]
+ < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
+ * CHARBITS - ANYOFR_BASE_BITS))))
+
+ {
+ U8 low_utf8[UTF8_MAXBYTES+1];
+ U8 high_utf8[UTF8_MAXBYTES+1];
+
+ ret = reganode(pRExC_state, ANYOFR,
+ (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
+
+ /* Place the lowest UTF-8 start byte in the flags field, so as to
+ * allow efficient ruling out at run time of many possible inputs.
+ * */
+ (void) uvchr_to_utf8(low_utf8, start[0]);
+ (void) uvchr_to_utf8(high_utf8, end[0]);
+
+ /* If all code points share the same first byte, this can be an
+ * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can
+ * quickly rule out many inputs at run-time without having to
+ * compute the code point from UTF-8. For EBCDIC, we use I8, as
+ * not doing that transformation would not rule out nearly so many
+ * things */
+ if (low_utf8[0] == high_utf8[0]) {
+ OP(REGNODE_p(ret)) = ANYOFRb;
+ ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
+ }
+ else {
+ ANYOF_FLAGS(REGNODE_p(ret))
+ = NATIVE_UTF8_TO_I8(low_utf8[0]);
+ }
+
+ goto not_anyof;
+ }
+
/* If didn't find an optimization and there is no need for a bitmap,
* optimize to indicate that */
if ( start[0] >= NUM_ANYOF_CODE_POINTS
U8 low_utf8[UTF8_MAXBYTES+1];
UV highest_cp = invlist_highest(cp_list);
- op = ANYOFH;
-
/* Currently the maximum allowed code point by the system is
* IV_MAX. Higher ones are reserved for future internal use. This
* particular regnode can be used for higher ones, but we can't
* calculate the code point of those. IV_MAX suffices though, as
* it will be a large first byte */
- (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
+ Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
+ - low_utf8;
/* We store the lowest possible first byte of the UTF-8
* representation, using the flags field. This allows for quick
* ruling out of some inputs without having to convert from UTF-8
- * to code point. For EBCDIC, this has to be I8. */
+ * to code point. For EBCDIC, we use I8, as not doing that
+ * transformation would not rule out nearly so many things */
anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
+ op = ANYOFH;
+
/* If the first UTF-8 start byte for the highest code point in the
* range is suitably small, we may be able to get an upper bound as
* well */
if (highest_cp <= IV_MAX) {
U8 high_utf8[UTF8_MAXBYTES+1];
-
- (void) uvchr_to_utf8(high_utf8, highest_cp);
+ Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
+ - high_utf8;
/* If the lowest and highest are the same, we can get an exact
- * first byte instead of a just minimum. We signal this with a
- * different regnode */
+ * first byte instead of a just minimum or even a sequence of
+ * exact leading bytes. We signal these with different
+ * regnodes */
if (low_utf8[0] == high_utf8[0]) {
+ Size_t len = find_first_differing_byte_pos(low_utf8,
+ high_utf8,
+ MIN(low_len, high_len));
- /* No need to convert to I8 for EBCDIC as this is an exact
- * match */
- anyof_flags = low_utf8[0];
- op = ANYOFHb;
+ if (len == 1) {
+
+ /* No need to convert to I8 for EBCDIC as this is an
+ * exact match */
+ anyof_flags = low_utf8[0];
+ op = ANYOFHb;
+ }
+ else {
+ op = ANYOFHs;
+ ret = regnode_guts(pRExC_state, op,
+ regarglen[op] + STR_SZ(len),
+ "anyofhs");
+ FILL_NODE(ret, op);
+ ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
+ = len;
+ Copy(low_utf8, /* Add the common bytes */
+ ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
+ len, U8);
+ RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
+ set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
+ NULL, only_utf8_locale_list);
+ goto not_anyof;
+ }
}
else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
{
set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
(HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
- ? listsv : NULL,
+ ? listsv
+ : NULL,
only_utf8_locale_list);
+ SvREFCNT_dec(cp_list);;
+ SvREFCNT_dec(only_utf8_locale_list);
return ret;
not_anyof:
Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
RExC_parse - orig_parse);;
SvREFCNT_dec(cp_list);;
+ SvREFCNT_dec(only_utf8_locale_list);
return ret;
}
SV *rv;
if (cp_list) {
- av_store(av, INVLIST_INDEX, cp_list);
+ av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
}
if (only_utf8_locale_list) {
- av_store(av, ONLY_LOCALE_MATCHES_INDEX, only_utf8_locale_list);
+ av_store(av, ONLY_LOCALE_MATCHES_INDEX,
+ SvREFCNT_inc_NN(only_utf8_locale_list));
}
if (runtime_defns) {
- av_store(av, DEFERRED_USER_DEFINED_INDEX, SvREFCNT_inc(runtime_defns));
+ av_store(av, DEFERRED_USER_DEFINED_INDEX,
+ SvREFCNT_inc_NN(runtime_defns));
}
rv = newRV_noinc(MUTABLE_SV(av));
STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
- av_store(av, INVLIST_INDEX, invlist);
+ ary[INVLIST_INDEX] = invlist;
av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
- ? ONLY_LOCALE_MATCHES_INDEX:
- INVLIST_INDEX);
+ ? ONLY_LOCALE_MATCHES_INDEX
+ : INVLIST_INDEX);
si = NULL;
}
}
UV prev_cp = 0;
U8 count = 0;
- /* Ignore everything before the first new-line */
- while (*si_string != '\n' && remaining > 0) {
- si_string++;
- remaining--;
- }
- assert(remaining > 0);
-
+ /* Ignore everything before and including the first new-line */
+ si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
+ assert (si_string != NULL);
si_string++;
- remaining--;
+ remaining = SvPVX(si) + SvCUR(si) - si_string;
while (remaining > 0) {
/* The data consists of just strings defining user-defined
* property names, but in prior incarnations, and perhaps
* somehow from pluggable regex engines, it could still
- * hold hex code point definitions. Each component of a
- * range would be separated by a tab, and each range by a
- * new-line. If these are found, instead add them to the
- * inversion list */
+ * hold hex code point definitions, all of which should be
+ * legal (or it wouldn't have gotten this far). Each
+ * component of a range would be separated by a tab, and
+ * each range by a new-line. If these are found, instead
+ * add them to the inversion list */
I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
|PERL_SCAN_SILENT_NON_PORTABLE;
STRLEN len = remaining;
continue;
}
- /* Here, didn't find a legal hex number. Just add it from
- * here to the next \n */
+ /* Here, didn't find a legal hex number. Just add the text
+ * from here up to the next \n, omitting any trailing
+ * markers. */
remaining -= len;
- while (*(si_string + len) != '\n' && remaining > 0) {
- remaining--;
- len++;
- }
- if (*(si_string + len) == '\n') {
- len++;
- remaining--;
- }
+ len = strcspn(si_string,
+ DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
+ remaining -= len;
if (matches_string) {
- sv_catpvn(matches_string, si_string, len - 1);
+ sv_catpvn(matches_string, si_string, len);
}
else {
- matches_string = newSVpvn(si_string, len - 1);
+ matches_string = newSVpvn(si_string, len);
}
- si_string += len;
sv_catpvs(matches_string, " ");
+
+ si_string += len;
+ if ( remaining
+ && UCHARAT(si_string)
+ == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
+ {
+ si_string++;
+ remaining--;
+ }
+ if (remaining && UCHARAT(si_string) == '\n') {
+ si_string++;
+ remaining--;
+ }
} /* end of loop through the text */
assert(matches_string);
STATIC void
S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
{
- /* 'size' is the delta to add or subtract from the current memory allocated
- * to the regex engine being constructed */
+ /* 'size' is the delta number of smallest regnode equivalents to add or
+ * subtract from the current memory allocated to the regex engine being
+ * constructed. */
PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
STATIC regnode_offset
S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
{
- /* Allocate a regnode for 'op', with 'extra_size' extra space. It aligns
- * and increments RExC_size and RExC_emit
+ /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
+ * equivalents space. It aligns and increments RExC_size
*
* It returns the regnode's offset into the regex engine program */
scan = REGNODE_OFFSET(temp);
}
+ assert(val >= scan);
if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
assert((UV) (val - scan) <= U32_MAX);
ARG_SET(REGNODE_p(scan), val - scan);
}
else {
if (val - scan > U16_MAX) {
- /* Since not all callers check the return value, populate this with
- * something that won't loop and will likely lead to a crash if
+ /* Populate this with something that won't loop and will likely
+ * lead to a crash if the caller ignores the failure return, and
* execution continues */
NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
return FALSE;
#endif
if ( exact ) {
switch (OP(REGNODE_p(scan))) {
+ case LEXACT:
case EXACT:
- case EXACT_ONLY8:
+ case LEXACT_REQ8:
+ case EXACT_REQ8:
case EXACTL:
case EXACTF:
case EXACTFU_S_EDGE:
case EXACTFAA_NO_TRIE:
case EXACTFAA:
case EXACTFU:
- case EXACTFU_ONLY8:
+ case EXACTFU_REQ8:
case EXACTFLU8:
case EXACTFUP:
case EXACTFL:
}
else {
if (val - scan > U16_MAX) {
+ /* Populate this with something that won't loop and will likely
+ * lead to a crash if the caller ignores the failure return, and
+ * execution continues */
NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
return FALSE;
}
NULL,
NULL,
NULL,
+ 0,
FALSE
);
sv_catpvs(sv, "]");
else if (k == LOGICAL)
/* 2: embedded, otherwise 1 */
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
- else if (k == ANYOF) {
- const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHr)
- ? 0
- : ANYOF_FLAGS(o);
+ else if (k == ANYOF || k == ANYOFR) {
+ U8 flags;
+ char * bitmap;
+ U32 arg;
bool do_sep = FALSE; /* Do we need to separate various components of
the output? */
/* Set if there is still an unresolved user-defined property */
/* And things that aren't in the bitmap, but are small enough to be */
SV* bitmap_range_not_in_bitmap = NULL;
- const bool inverted = flags & ANYOF_INVERT;
+ bool inverted;
+
+ if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
+ flags = 0;
+ bitmap = NULL;
+ arg = 0;
+ }
+ else {
+ flags = ANYOF_FLAGS(o);
+ bitmap = ANYOF_BITMAP(o);
+ arg = ARG(o);
+ }
if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
}
}
+ inverted = flags & ANYOF_INVERT;
+
/* If there is stuff outside the bitmap, get it */
- if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
- (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
+ if (arg != ANYOF_ONLY_HAS_BITMAP) {
+ if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
+ nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
+ ANYOFRbase(o),
+ ANYOFRbase(o) + ANYOFRdelta(o));
+ }
+ else {
+ (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
&unresolved,
&only_utf8_locale_invlist,
&nonbitmap_invlist);
+ }
+
/* The non-bitmap data may contain stuff that could fit in the
* bitmap. This could come from a user-defined property being
* finally resolved when this call was done; or much more likely
* because there are matches that require UTF-8 to be valid, and so
- * aren't in the bitmap. This is teased apart later */
+ * aren't in the bitmap (or ANYOFR). This is teased apart later */
_invlist_intersection(nonbitmap_invlist,
PL_InBitmap,
&bitmap_range_not_in_bitmap);
/* Ready to start outputting. First, the initial left bracket */
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
- if (! inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+ /* ANYOFH by definition doesn't have anything that will fit inside the
+ * bitmap; ANYOFR may or may not. */
+ if ( ! inRANGE(OP(o), ANYOFH, ANYOFHr)
+ && ( ! inRANGE(OP(o), ANYOFR, ANYOFRb)
+ || ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
+ {
/* Then all the things that could fit in the bitmap */
do_sep = put_charclass_bitmap_innards(sv,
- ANYOF_BITMAP(o),
+ bitmap,
bitmap_range_not_in_bitmap,
only_utf8_locale_invlist,
o,
+ flags,
/* Can't try inverting for a
* better display if there
* are things that haven't
* been resolved */
- unresolved != NULL);
+ unresolved != NULL
+ || inRANGE(OP(o), ANYOFR, ANYOFRb));
SvREFCNT_dec(bitmap_range_not_in_bitmap);
/* If there are user-defined properties which haven't been defined
/* And finally the matching, closing ']' */
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
- if (inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+ if (OP(o) == ANYOFHs) {
+ Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
+ }
+ else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
U8 lowest = (OP(o) != ANYOFHr)
? FLAGS(o)
: LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
- U8 highest = (OP(o) == ANYOFHb)
- ? lowest
- : OP(o) == ANYOFH
+ U8 highest = (OP(o) == ANYOFHr)
+ ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
+ : (OP(o) == ANYOFH || OP(o) == ANYOFR)
? 0xFF
- : HIGHEST_ANYOF_HRx_BYTE(FLAGS(o));
+ : lowest;
Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
if (lowest != highest) {
Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
_invlist_invert(cp_list);
}
- put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
+ put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
SvREFCNT_dec(cp_list);
/* As a final resort, output the range or subrange as hex. */
- this_end = (end < NUM_ANYOF_CODE_POINTS)
- ? end
- : NUM_ANYOF_CODE_POINTS - 1;
+ if (start >= NUM_ANYOF_CODE_POINTS) {
+ this_end = end;
+ }
+ else { /* Have to split range at the bitmap boundary */
+ this_end = (end < NUM_ANYOF_CODE_POINTS)
+ ? end
+ : NUM_ANYOF_CODE_POINTS - 1;
+ }
#if NUM_ANYOF_CODE_POINTS > 256
format = (this_end < 256)
? "\\x%02" UVXf "-\\x%02" UVXf
SV *nonbitmap_invlist,
SV *only_utf8_locale_invlist,
const regnode * const node,
+ const U8 flags,
const bool force_as_is_display)
{
/* Appends to 'sv' a displayable version of the innards of the bracketed
* 'node' is the regex pattern ANYOF node. It is needed only when the
* above two parameters are not null, and is passed so that this
* routine can tease apart the various reasons for them.
+ * 'flags' is the flags field of 'node'
* 'force_as_is_display' is TRUE if this routine should definitely NOT try
* to invert things to see if that leads to a cleaner display. If
* FALSE, this routine is free to use its judgment about doing this.
literally */
SV* inverted_display; /* The output string when we invert the inputs */
- U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
-
bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
to match? */
/* We are biased in favor of displaying things without them being inverted,
else if ( op == PLUS || op == STAR) {
DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
}
- else if (PL_regkind[(U8)op] == EXACT) {
+ else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
/* Literal string, where present. */
node += NODE_SZ_STR(node) - 1;
node = NEXTOPER(node);
{
dVAR;
+#ifdef DEBUGGING
+ char * dump_len_string;
+
+ dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
+ if ( ! dump_len_string
+ || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
+ {
+ PL_dump_re_max_len = 60; /* A reasonable default */
+ }
+#endif
+
PL_user_def_props = newHV();
#ifdef USE_ITHREADS
#endif
- /* Set up the inversion list global variables */
+ /* Set up the inversion list interpreter-level variables */
PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
+ PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
UNI__PERL_FOLDS_TO_MULTI_CHAR]);
PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
- PL_NonFinalFold = _new_invlist_C_array(uni_prop_ptrs[
- UNI__PERL_NON_FINAL_FOLDS]);
-
PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
* Other parameters will be set on return as described below */
const char * const name, /* The first non-blank in the \p{}, \P{} */
- const Size_t name_len, /* Its length in bytes, not including any
+ Size_t name_len, /* Its length in bytes, not including any
trailing space */
const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
const bool to_fold, /* ? Is this under /i */
user-defined property */
SV * msg, /* Any error or warning msg(s) are appended to
this */
- const STRLEN level) /* Recursion level of this call */
+ const STRLEN level) /* Recursion level of this call */
{
dVAR;
char* lookup_name; /* normalized name for lookup in our tables */
unsigned lookup_len; /* Its length */
- bool stricter = FALSE; /* Some properties have stricter name
- normalization rules, which we decide upon
- based on parsing */
+ enum { Not_Strict = 0, /* Some properties have stricter name */
+ Strict, /* normalization rules, which we decide */
+ As_Is /* upon based on parsing */
+ } stricter = Not_Strict;
/* nv= or numeric_value=, or possibly one of the cjk numeric properties
* (though it requires extra effort to download them from Unicode and
the normalized name in certain situations */
Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
part of a package name */
+ Size_t lun_non_pkg_begin = 0; /* Similarly for 'lookup_name' */
bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
property rather than a Unicode
one. */
qualified name */
bool invert_return = FALSE; /* ? Do we need to complement the result before
returning it */
+ bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
+ explicit utf8:: package that we strip
+ off */
+ /* The expansion of properties that could be either user-defined or
+ * official unicode ones is deferred until runtime, including a marker for
+ * those that might be in the latter category. This boolean indicates if
+ * we've seen that marker. If not, what we're parsing can't be such an
+ * official Unicode property whose expansion was deferred */
+ bool could_be_deferred_official = FALSE;
PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
break;
}
+ /* If this looks like it is a marker we inserted at compile time,
+ * set a flag and otherwise ignore it. If it isn't in the final
+ * position, keep it as it would have been user input. */
+ if ( UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
+ && ! deferrable
+ && could_be_user_defined
+ && i == name_len - 1)
+ {
+ name_len--;
+ could_be_deferred_official = TRUE;
+ continue;
+ }
+
/* Otherwise, this character is part of the name. */
lookup_name[j++] = cur;
i++;
non_pkg_begin = i + 1;
lookup_name[j++] = ':';
+ lun_non_pkg_begin = j;
}
else { /* Only word chars (and '::') can be in a user-defined name */
could_be_user_defined = FALSE;
lookup_name += STRLENs("utf8::");
j -= STRLENs("utf8::");
equals_pos -= STRLENs("utf8::");
+ stripped_utf8_pkg = TRUE;
}
/* Here, we are either done with the whole property name, if it was simple;
* or are positioned just after the '=' if it is compound. */
if (equals_pos >= 0) {
- assert(! stricter); /* We shouldn't have set this yet */
+ assert(stricter == Not_Strict); /* We shouldn't have set this yet */
/* Space immediately after the '=' is ignored */
i++;
/* Most punctuation after the equals indicates a subpattern, like
* \p{foo=/bar/} */
if ( isPUNCT_A(name[i])
- && name[i] != '-'
- && name[i] != '+'
- && name[i] != '_'
- && name[i] != '{')
+ && name[i] != '-'
+ && name[i] != '+'
+ && name[i] != '_'
+ && name[i] != '{'
+ /* A backslash means the real delimitter is the next character,
+ * but it must be punctuation */
+ && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
{
/* Find the property. The table includes the equals sign, so we
* use 'j' as-is */
const char * pos_in_brackets;
bool escaped = 0;
- /* A backslash means the real delimitter is the next character.
- * */
+ /* Backslash => delimitter is the character following. We
+ * already checked that it is punctuation */
if (open == '\\') {
open = name[i++];
escaped = 1;
* set of closing is so that if the opening is something like
* ']', the closing will be that as well. Something similar is
* done in toke.c */
- pos_in_brackets = strchr("([<)]>)]>", open);
+ pos_in_brackets = memCHRs("([<)]>)]>", open);
close = (pos_in_brackets) ? pos_in_brackets[3] : open;
if ( i >= name_len
|| name[name_len-1] != close
- || (escaped && name[name_len-2] != '\\'))
+ || (escaped && name[name_len-2] != '\\')
+ /* Also make sure that there are enough characters.
+ * e.g., '\\\' would show up incorrectly as legal even
+ * though it is too short */
+ || (SSize_t) (name_len - i - 1 - escaped) < 0)
{
sv_catpvs(msg, "Unicode property wildcard not terminated");
goto append_name_to_msg;
Titlecase Mapping (both full and simple)
Uppercase Mapping (both full and simple)
* Move the part that looks at the property values into a perl
- * script, like utf8_heavy.pl is done. This makes things somewhat
+ * script, like utf8_heavy.pl was done. This makes things somewhat
* easier, but most importantly, it avoids always adding all these
* strings to the memory usage when the feature is little-used.
*
* some constructs in their subpattern, like \A. */
} /* End of is a wildcard subppattern */
+ /* \p{name=...} is handled specially. Instead of using the normal
+ * mechanism involving charclass_invlists.h, it uses _charnames.pm
+ * which has the necessary (huge) data accessible to it, and which
+ * doesn't get loaded unless necessary. The legal syntax for names is
+ * somewhat different than other properties due both to the vagaries of
+ * a few outlier official names, and the fact that only a few ASCII
+ * characters are permitted in them */
+ if ( memEQs(lookup_name, j - 1, "name")
+ || memEQs(lookup_name, j - 1, "na"))
+ {
+ dSP;
+ HV * table;
+ SV * character;
+ const char * error_msg;
+ CV* lookup_loose;
+ SV * character_name;
+ STRLEN character_len;
+ UV cp;
+
+ stricter = As_Is;
+
+ /* Since the RHS (after skipping initial space) is passed unchanged
+ * to charnames, and there are different criteria for what are
+ * legal characters in the name, just parse it here. A character
+ * name must begin with an ASCII alphabetic */
+ if (! isALPHA(name[i])) {
+ goto failed;
+ }
+ lookup_name[j++] = name[i];
+
+ for (++i; i < name_len; i++) {
+ /* Official names can only be in the ASCII range, and only
+ * certain characters */
+ if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
+ goto failed;
+ }
+ lookup_name[j++] = name[i];
+ }
+
+ /* Finished parsing, save the name into an SV */
+ character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
+
+ /* Make sure _charnames is loaded. (The parameters give context
+ * for any errors generated */
+ table = load_charnames(character_name, name, name_len, &error_msg);
+ if (table == NULL) {
+ sv_catpv(msg, error_msg);
+ goto append_name_to_msg;
+ }
+
+ lookup_loose = get_cv("_charnames::_loose_regcomp_lookup", 0);
+ if (! lookup_loose) {
+ Perl_croak(aTHX_
+ "panic: Can't find '_charnames::_loose_regcomp_lookup");
+ }
+
+ PUSHSTACKi(PERLSI_OVERLOAD);
+ ENTER ;
+ SAVETMPS;
+
+ PUSHMARK(SP) ;
+ XPUSHs(character_name);
+ PUTBACK;
+ call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
+
+ SPAGAIN ;
+
+ character = POPs;
+ SvREFCNT_inc_simple_void_NN(character);
+
+ PUTBACK ;
+ FREETMPS ;
+ LEAVE ;
+ POPSTACK;
+
+ if (! SvOK(character)) {
+ goto failed;
+ }
+
+ cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
+ if (character_len < SvCUR(character)) {
+ goto failed;
+ }
+
+ prop_definition = add_cp_to_invlist(NULL, cp);
+ return prop_definition;
+ }
/* Certain properties whose values are numeric need special handling.
* They may optionally be prefixed by 'is'. Ignore that prefix for the
* But the numeric type properties can have the alphas [Ee] to
* signify an exponent, and it is still a number with stricter
* rules. So look for an alpha that signifies not-strict */
- stricter = TRUE;
+ stricter = Strict;
for (k = i; k < name_len; k++) {
if ( isALPHA_A(name[k])
&& (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
{
- stricter = FALSE;
+ stricter = Not_Strict;
break;
}
}
&& memNEs(lookup_name + 4, j - 4, "space")
&& memNEs(lookup_name + 4, j - 4, "word"))
{
- stricter = TRUE;
+ stricter = Strict;
/* We set the inputs back to 0 and the code below will reparse,
* using strict */
/* Here, the name could be for a user defined property, which are
* implemented as subs. */
user_sub = get_cvn_flags(name, name_len, 0);
- if (user_sub) {
+ if (! user_sub) {
+
+ /* Here, the property name could be a user-defined one, but there
+ * is no subroutine to handle it (as of now). Defer handling it
+ * until runtime. Otherwise, a block defined by Unicode in a later
+ * release would get the synonym InFoo added for it, and existing
+ * code that used that name would suddenly break if it referred to
+ * the property before the sub was declared. See [perl #134146] */
+ if (deferrable) {
+ goto definition_deferred;
+ }
+
+ /* Here, we are at runtime, and didn't find the user property. It
+ * could be an official property, but only if no package was
+ * specified, or just the utf8:: package. */
+ if (could_be_deferred_official) {
+ lookup_name += lun_non_pkg_begin;
+ j -= lun_non_pkg_begin;
+ }
+ else if (! stripped_utf8_pkg) {
+ goto unknown_user_defined;
+ }
+
+ /* Drop down to look up in the official properties */
+ }
+ else {
const char insecure[] = "Insecure user-defined property";
/* Here, there is a sub by the correct name. Normally we call it
* but not yet used. */
save_item(PL_subname);
+ /* G_SCALAR guarantees a single return value */
(void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
SPAGAIN;
(void) POPs;
prop_definition = NULL;
}
- else { /* G_SCALAR guarantees a single return value */
+ else {
SV * contents = POPs;
/* The contents is supposed to be the expansion of the property
* property hasn't been encountered yet, but at runtime, it's
* an error to try to use an undefined one */
if (! deferrable) {
- if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
- sv_catpvs(msg, "Unknown user-defined property name");
- goto append_name_to_msg;
+ goto unknown_user_defined;;
}
goto definition_deferred;
}
return prop_definition;
+ unknown_user_defined:
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Unknown user-defined property name");
+ goto append_name_to_msg;
failed:
if (non_pkg_begin != 0) {
definition_deferred:
- /* Here it could yet to be defined, so defer evaluation of this
- * until its needed at runtime. We need the fully qualified property name
- * to avoid ambiguity, and a trailing newline */
- if (! fq_name) {
- fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
- non_pkg_begin != 0 /* If has "::" */
- );
- }
- sv_catpvs(fq_name, "\n");
+ {
+ bool is_qualified = non_pkg_begin != 0; /* If has "::" */
- *user_defined_ptr = TRUE;
- return fq_name;
+ /* Here it could yet to be defined, so defer evaluation of this until
+ * its needed at runtime. We need the fully qualified property name to
+ * avoid ambiguity */
+ if (! fq_name) {
+ fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
+ is_qualified);
+ }
+
+ /* If it didn't come with a package, or the package is utf8::, this
+ * actually could be an official Unicode property whose inclusion we
+ * are deferring until runtime to make sure that it isn't overridden by
+ * a user-defined property of the same name (which we haven't
+ * encountered yet). Add a marker to indicate this possibility, for
+ * use at such time when we first need the definition during pattern
+ * matching execution */
+ if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
+ sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
+ }
+
+ /* We also need a trailing newline */
+ sv_catpvs(fq_name, "\n");
+
+ *user_defined_ptr = TRUE;
+ return fq_name;
+ }
}
#endif