# include "regcomp.h"
#endif
-#include "dquote_inline.h"
#include "invlist_inline.h"
#include "unicode_constants.h"
/* Certain characters are output as a sequence with the first being a
* backslash. */
-#define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
+#define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c)
struct RExC_state_t {
U32 seen;
SSize_t size; /* Number of regnode equivalents in
pattern */
+ Size_t sets_depth; /* Counts recursion depth of already-
+ compiled regex set patterns */
/* position beyond 'precomp' of the warning message furthest away from
* 'precomp'. During the parse, no warnings are raised for any problems
U32 frame_count;
AV *warn_text;
HV *unlexed_names;
-#ifdef ADD_TO_REGEXEC
- char *starttry; /* -Dr: where regtry was called. */
-#define RExC_starttry (pRExC_state->starttry)
-#endif
SV *runtime_code_qr; /* qr with the runtime code blocks */
#ifdef DEBUGGING
const char *lastparse;
bool study_started;
bool in_script_run;
bool use_BRANCHJ;
+ bool sWARN_EXPERIMENTAL__VLB;
+ bool sWARN_EXPERIMENTAL__REGEX_SETS;
};
#define RExC_flags (pRExC_state->flags)
#define RExC_paren_names (pRExC_state->paren_names)
#define RExC_recurse (pRExC_state->recurse)
#define RExC_recurse_count (pRExC_state->recurse_count)
+#define RExC_sets_depth (pRExC_state->sets_depth)
#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
#define RExC_study_chunk_recursed_bytes \
(pRExC_state->study_chunk_recursed_bytes)
#define RExC_warn_text (pRExC_state->warn_text)
#define RExC_in_script_run (pRExC_state->in_script_run)
#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
+#define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
+#define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
#define RExC_unlexed_names (pRExC_state->unlexed_names)
/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
#define PBITVAL(paren) (1 << ((paren) & 7))
-#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
-#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
-#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
+#define PAREN_OFFSET(depth) \
+ (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
+#define PAREN_TEST(depth, paren) \
+ (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
+#define PAREN_SET(depth, paren) \
+ (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
+#define PAREN_UNSET(depth, paren) \
+ (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
#define REQUIRE_UTF8(flagp) STMT_START { \
if (!UTF) { \
#define _invlist_intersection_complement_2nd(a, b, output) \
_invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
-/* We add a marker if we are deferring expansion of a potential user-defined
- * property until it is needed at runtime the first time it is encountered in a
- * pattern match. This marker that shouldn't conflict with any that could be
- * in a legal name is appended to its name to indicate this. There is a string
- * and character form */
-#define DEFERRED_PROP_EXPANSION_MARKERs "~"
-#define DEFERRED_PROP_EXPANSION_MARKERc '~'
+/* We add a marker if we are deferring expansion of a property that is both
+ * 1) potentiallly user-defined; and
+ * 2) could also be an official Unicode property.
+ *
+ * Without this marker, any deferred expansion can only be for a user-defined
+ * one. This marker shouldn't conflict with any that could be in a legal name,
+ * and is appended to its name to indicate this. There is a string and
+ * character form */
+#define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~"
+#define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~'
/* About scan_data_t.
} STMT_END
/* m is not necessarily a "literal string", in this macro */
-#define reg_warn_non_literal_string(loc, m) \
- _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
- Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
+#define warn_non_literal_string(loc, packed_warn, m) \
+ _WARN_HELPER(loc, packed_warn, \
+ Perl_warner(aTHX_ packed_warn, \
"%s" REPORT_LOCATION, \
m, REPORT_LOCATION_ARGS(loc)))
+#define reg_warn_non_literal_string(loc, m) \
+ warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
+
+#define ckWARN2_non_literal_string(loc, packwarn, m, a1) \
+ STMT_START { \
+ char * format; \
+ Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
+ Newx(format, format_size, char); \
+ my_strlcpy(format, m, format_size); \
+ my_strlcat(format, REPORT_LOCATION, format_size); \
+ SAVEFREEPV(format); \
+ _WARN_HELPER(loc, packwarn, \
+ Perl_ck_warner(aTHX_ packwarn, \
+ format, \
+ a1, REPORT_LOCATION_ARGS(loc))); \
+ } STMT_END
#define ckWARNreg(loc,m) \
_WARN_HELPER(loc, packWARN(WARN_REGEXP), \
REPORT_LOCATION_ARGS(loc)))
#define ckWARNexperimental(loc, class, m) \
- _WARN_HELPER(loc, packWARN(class), \
+ STMT_START { \
+ if (! RExC_warned_ ## class) { /* warn once per compilation */ \
+ RExC_warned_ ## class = 1; \
+ _WARN_HELPER(loc, packWARN(class), \
Perl_ck_warner_d(aTHX_ packWARN(class), \
m REPORT_LOCATION, \
- REPORT_LOCATION_ARGS(loc)))
+ REPORT_LOCATION_ARGS(loc)));\
+ } \
+ } STMT_END
/* Convert between a pointer to a node and its offset from the beginning of the
* program */
/* END of edit_distance() stuff
* ========================================================= */
-/* is c a control character for which we have a mnemonic? */
-#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
-
-STATIC const char *
-S_cntrl_to_mnemonic(const U8 c)
-{
- /* Returns the mnemonic string that represents character 'c', if one
- * exists; NULL otherwise. The only ones that exist for the purposes of
- * this routine are a few control characters */
-
- switch (c) {
- case '\a': return "\\a";
- case '\b': return "\\b";
- case ESC_NATIVE: return "\\e";
- case '\f': return "\\f";
- case '\n': return "\\n";
- case '\r': return "\\r";
- case '\t': return "\\t";
- }
-
- return NULL;
-}
-
/* Mark that we cannot extend a found fixed substring at this point.
Update the longest found anchored substring or the longest found
floating substrings if needed. */
if (OP(noper) == NOTHING) {
/* skip past a NOTHING at the start of an alternation
* eg, /(?:)a|(?:b)/ should be the same as /a|b/
+ *
+ * If the next node is not something we are supposed to process
+ * we will just ignore it due to the condition guarding the
+ * next block.
*/
+
regnode *noper_next= regnext(noper);
if (noper_next < tail)
noper= noper_next;
regnode *noper_next= regnext(noper);
if (noper_next < tail)
noper= noper_next;
+ /* we will undo this assignment if noper does not
+ * point at a trieable type in the else clause of
+ * the following statement. */
}
if ( noper < tail
Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
}
}
- }
+ } else {
+ /* If we end up here it is because we skipped past a NOTHING, but did not end up
+ * on a trieable type. So we need to reset noper back to point at the first regop
+ * in the branch before we call TRIE_HANDLE_WORD()
+ */
+ noper= NEXTOPER(cur);
+ }
TRIE_HANDLE_WORD(state);
} /* end second pass */
regnode *noper_next= regnext(noper);
if (noper_next < tail)
noper= noper_next;
+ /* we will undo this assignment if noper does not
+ * point at a trieable type in the else clause of
+ * the following statement. */
}
if ( noper < tail
/* charid is now 0 if we dont know the char read, or
* nonzero if we do */
}
+ } else {
+ /* If we end up here it is because we skipped past a NOTHING, but did not end up
+ * on a trieable type. So we need to reset noper back to point at the first regop
+ * in the branch before we call TRIE_HANDLE_WORD().
+ */
+ noper= NEXTOPER(cur);
}
accept_state = TRIE_NODENUM( state );
TRIE_HANDLE_WORD(accept_state);
str=STRING(convert);
setSTR_LEN(convert, 0);
}
- setSTR_LEN(convert, STR_LEN(convert) + len);
+ assert( ( STR_LEN(convert) + len ) < 256 );
+ setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
while (len--)
*str++ = *ch++;
} else {
trie->prefixlen = (state-1);
if (str) {
regnode *n = convert+NODE_SZ_STR(convert);
- NEXT_OFF(convert) = NODE_SZ_STR(convert);
+ assert( NODE_SZ_STR(convert) <= U16_MAX );
+ NEXT_OFF(convert) = (U16)(NODE_SZ_STR(convert));
trie->startstate = state;
trie->minlen -= (state - 1);
trie->maxlen -= (state - 1);
* using /iaa matching will be doing so almost entirely with ASCII
* strings, so this should rarely be encountered in practice */
-#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
- if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT \
- && OP(scan) != LEXACT_REQ8) \
- join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
-
STATIC U32
S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
UV *min_subtract, bool *unfolded_multi_char,
merged++;
NEXT_OFF(scan) += NEXT_OFF(n);
- setSTR_LEN(scan, STR_LEN(scan) + STR_LEN(n));
+ assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
+ setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
next = n + NODE_SZ_STR(n);
/* Now we can overwrite *n : */
Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
}
#endif
}
-
- if ( STR_LEN(scan) == 1
- && isALPHA_A(* STRING(scan))
- && ( OP(scan) == EXACTFAA
- || ( OP(scan) == EXACTFU
- && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
- {
- U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
-
- /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
- * with the mask set to the complement of the bit that differs
- * between upper and lower case, and the lowest code point of the
- * pair (which the '&' forces) */
- OP(scan) = ANYOFM;
- ARG_SET(scan, *STRING(scan) & mask);
- FLAGS(scan) = mask;
- }
}
#ifdef DEBUGGING
/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
{
dVAR;
+ SSize_t final_minlen;
/* There must be at least this number of characters to match */
SSize_t min = 0;
I32 pars = 0, code;
U32 j;
for ( j = 0 ; j < recursed_depth ; j++ ) {
for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
- if (
- PAREN_TEST(RExC_study_chunk_recursed +
- ( j * RExC_study_chunk_recursed_bytes), i )
- && (
- !j ||
- !PAREN_TEST(RExC_study_chunk_recursed +
- (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
- )
- ) {
+ if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
Perl_re_printf( aTHX_ " %d",(int)i);
break;
}
* parsing code, as each (?:..) is handled by a different invocation of
* reg() -- Yves
*/
- JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
+ if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT
+ && OP(scan) != LEXACT_REQ8)
+ join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
+ 0, NULL, depth + 1);
/* Follow the next-chain of the current node and optimize
away all the NOTHINGs from it. */
if (
!recursed_depth
- ||
- !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
+ || !PAREN_TEST(recursed_depth - 1, paren)
) {
/* it is quite possible that there are more efficient ways
* to do this. We maintain a bitmap per level of recursion
if (!recursed_depth) {
Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
} else {
- Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
- RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
+ Copy(PAREN_OFFSET(recursed_depth - 1),
+ PAREN_OFFSET(recursed_depth),
RExC_study_chunk_recursed_bytes, U8);
}
/* we havent recursed into this paren yet, so recurse into it */
DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
- PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
+ PAREN_SET(recursed_depth, paren);
my_recursed_depth= recursed_depth + 1;
} else {
DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
|| OP(scan) == LEXACT_REQ8
|| OP(scan) == EXACTL)
{
- SSize_t l = STR_LEN(scan);
+ SSize_t bytelen = STR_LEN(scan), charlen;
UV uc;
- assert(l);
+ assert(bytelen);
if (UTF) {
const U8 * const s = (U8*)STRING(scan);
- uc = utf8_to_uvchr_buf(s, s + l, NULL);
- l = utf8_length(s, s + l);
+ uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
+ charlen = utf8_length(s, s + bytelen);
} else {
uc = *((U8*)STRING(scan));
+ charlen = bytelen;
}
- min += l;
+ min += charlen;
if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
/* The code below prefers earlier match for fixed
offset, later match for variable offset. */
data->last_start_max = is_inf
? SSize_t_MAX : data->pos_min + data->pos_delta;
}
- sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
+ sv_catpvn(data->last_found, STRING(scan), bytelen);
if (UTF)
SvUTF8_on(data->last_found);
{
MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
mg_find(sv, PERL_MAGIC_utf8) : NULL;
if (mg && mg->mg_len >= 0)
- mg->mg_len += utf8_length((U8*)STRING(scan),
- (U8*)STRING(scan)+STR_LEN(scan));
+ mg->mg_len += charlen;
}
- data->last_end = data->pos_min + l;
- data->pos_min += l; /* As in the first entry. */
+ data->last_end = data->pos_min + charlen;
+ data->pos_min += charlen; /* As in the first entry. */
data->flags &= ~SF_BEFORE_EOL;
}
}
else if (PL_regkind[OP(scan)] == EXACT) {
/* But OP != EXACT!, so is EXACTFish */
- SSize_t l = STR_LEN(scan);
+ SSize_t bytelen = STR_LEN(scan), charlen;
const U8 * s = (U8*)STRING(scan);
+ /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
+ * with the mask set to the complement of the bit that differs
+ * between upper and lower case, and the lowest code point of the
+ * pair (which the '&' forces) */
+ if ( bytelen == 1
+ && isALPHA_A(*s)
+ && ( OP(scan) == EXACTFAA
+ || ( OP(scan) == EXACTFU
+ && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s))))
+ {
+ U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
+
+ OP(scan) = ANYOFM;
+ ARG_SET(scan, *s & mask);
+ FLAGS(scan) = mask;
+ /* we're not EXACTFish any more, so restudy */
+ continue;
+ }
+
/* Search for fixed substrings supports EXACT only. */
if (flags & SCF_DO_SUBSTR) {
assert(data);
scan_commit(pRExC_state, data, minlenp, is_inf);
}
- if (UTF) {
- l = utf8_length(s, s + l);
- }
+ charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
if (unfolded_multi_char) {
RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
}
- min += l - min_subtract;
+ min += charlen - min_subtract;
assert (min >= 0);
delta += min_subtract;
if (flags & SCF_DO_SUBSTR) {
- data->pos_min += l - min_subtract;
+ data->pos_min += charlen - min_subtract;
if (data->pos_min < 0) {
data->pos_min = 0;
}
continue;
default:
-#ifdef DEBUGGING
Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
OP(scan));
-#endif
case REF:
case CLUMP:
if (flags & SCF_DO_SUBSTR) {
if (trie->jump) /* no more substrings -- for now /grr*/
flags &= ~SCF_DO_SUBSTR;
}
+ else if (OP(scan) == REGEX_SET) {
+ Perl_croak(aTHX_ "panic: %s regnode should be resolved"
+ " before optimization", reg_name[REGEX_SET]);
+ }
+
#endif /* old or new */
#endif /* TRIE_STUDY_OPT */
DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
- {
- SSize_t final_minlen= min < stopmin ? min : stopmin;
-
- if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
- if (final_minlen > SSize_t_MAX - delta)
- RExC_maxlen = SSize_t_MAX;
- else if (RExC_maxlen < final_minlen + delta)
- RExC_maxlen = final_minlen + delta;
- }
- return final_minlen;
+ final_minlen = min < stopmin
+ ? min : stopmin;
+ if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
+ if (final_minlen > SSize_t_MAX - delta)
+ RExC_maxlen = SSize_t_MAX;
+ else if (RExC_maxlen < final_minlen + delta)
+ RExC_maxlen = final_minlen + delta;
}
- NOT_REACHED; /* NOTREACHED */
+ return final_minlen;
}
STATIC U32
REGEXP *
Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
{
- SV *pat = pattern; /* defeat constness! */
PERL_ARGS_ASSERT_RE_COMPILE;
+ return re_op_compile_wrapper(pattern, rx_flags, 0);
+}
+
+REGEXP *
+S_re_op_compile_wrapper(pTHX_ SV * const pattern, U32 rx_flags, const U32 pm_flags)
+{
+ SV *pat = pattern; /* defeat constness! */
+
+ PERL_ARGS_ASSERT_RE_OP_COMPILE_WRAPPER;
+
return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
#ifdef PERL_IN_XSUB_RE
&my_reg_engine,
#else
&PL_core_reg_engine,
#endif
- NULL, NULL, rx_flags, 0);
+ NULL, NULL, rx_flags, pm_flags);
}
*
* pm_flags contains the PMf_* flags, typically based on those from the
* pm_flags field of the related PMOP. Currently we're only interested in
- * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
+ * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
*
* For many years this code had an initial sizing pass that calculated
* (sometimes incorrectly, leading to security holes) the size needed for the
RExC_frame_count= 0;
RExC_latest_warn_offset = 0;
RExC_use_BRANCHJ = 0;
+ RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
+ RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
RExC_total_parens = 0;
RExC_open_parens = NULL;
RExC_close_parens = NULL;
RExC_study_chunk_recursed = NULL;
RExC_study_chunk_recursed_bytes= 0;
RExC_recurse_count = 0;
+ RExC_sets_depth = 0;
pRExC_state->code_index = 0;
/* Initialize the string in the compiled pattern. This is so that there is
i = t1 - s1;
if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
- i = el;
+ i = el;
}
return i;
}
}
else { /* Single char fold */
unsigned int k;
- unsigned int first_fold;
- const unsigned int * remaining_folds;
+ U32 first_fold;
+ const U32 * remaining_folds;
Size_t folds_count;
/* It matches itself */
}
while (RExC_parse < RExC_end) {
- /* && strchr("iogcmsx", *RExC_parse) */
+ /* && memCHRs("iogcmsx", *RExC_parse) */
/* (?g), (?gc) and (?o) are useless here
and must be globally applied -- japhy */
+ if ((RExC_pm_flags & PMf_WILDCARD)) {
+ if (flagsp == & negflags) {
+ if (*RExC_parse == 'm') {
+ RExC_parse++;
+ /* diag_listed_as: Use of %s is not allowed in Unicode
+ property wildcard subpatterns in regex; marked by <--
+ HERE in m/%s/ */
+ vFAIL("Use of modifier '-m' is not allowed in Unicode"
+ " property wildcard subpatterns");
+ }
+ }
+ else {
+ if (*RExC_parse == 's') {
+ goto modifier_illegal_in_wildcard;
+ }
+ }
+ }
+
switch (*RExC_parse) {
/* Code for the imsxn flags */
vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
*(RExC_parse - 1));
NOT_REACHED; /*NOTREACHED*/
- case ONCE_PAT_MOD: /* 'o' */
case GLOBAL_PAT_MOD: /* 'g' */
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ goto modifier_illegal_in_wildcard;
+ }
+ /*FALLTHROUGH*/
+ case ONCE_PAT_MOD: /* 'o' */
if (ckWARN(WARN_REGEXP)) {
const I32 wflagbit = *RExC_parse == 'o'
? WASTED_O
break;
case CONTINUE_PAT_MOD: /* 'c' */
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ goto modifier_illegal_in_wildcard;
+ }
if (ckWARN(WARN_REGEXP)) {
if (! (wastedflags & WASTED_C) ) {
wastedflags |= WASTED_GC;
}
break;
case KEEPCOPY_PAT_MOD: /* 'p' */
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ goto modifier_illegal_in_wildcard;
+ }
if (flagsp == &negflags) {
ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
} else {
case ':':
case ')':
+ if ( (RExC_pm_flags & PMf_WILDCARD)
+ && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
+ {
+ RExC_parse++;
+ /* diag_listed_as: Use of %s is not allowed in Unicode
+ property wildcard subpatterns in regex; marked by <--
+ HERE in m/%s/ */
+ vFAIL2("Use of modifier '%c' is not allowed in Unicode"
+ " property wildcard subpatterns",
+ has_charset_modifier);
+ }
+
if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
negflags |= RXf_PMf_EXTENDED_MORE;
}
}
vFAIL("Sequence (?... not terminated");
+
+ modifier_illegal_in_wildcard:
+ RExC_parse++;
+ /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
+ subpatterns in regex; marked by <-- HERE in m/%s/ */
+ vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
+ " subpatterns", *(RExC_parse - 1));
}
/*
RExC_parse-seqstart, seqstart);
NOT_REACHED; /*NOTREACHED*/
case '<': /* (?<...) */
+ /* If you want to support (?<*...), first reconcile with GH #17363 */
if (*RExC_parse == '!')
paren = ',';
else if (*RExC_parse != '=')
ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
}
/* FALLTHROUGH */
+ case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
+ /* FALLTHROUGH */
default: /* e.g., (?i) */
RExC_parse = (char *) seqstart + 1;
parse_flags:
}
break;
}
- DEBUG_PARSE_r(
+ DEBUG_PARSE_r({
DEBUG_PARSE_MSG("lsbr");
regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
(IV)ender,
(IV)(ender - lastbr)
);
- );
+ });
if (! REGTAIL(pRExC_state, lastbr, ender)) {
REQUIRE_BRANCHJ(flagp, 0);
}
br= PL_regkind[OP(ret_as_regnode)] != BRANCH
? regnext(ret_as_regnode)
: ret_as_regnode;
- DEBUG_PARSE_r(
+ DEBUG_PARSE_r({
DEBUG_PARSE_MSG("NADA");
regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
NULL, pRExC_state);
(IV)ender,
(IV)(ender - ret)
);
- );
+ });
OP(br)= NOTHING;
if (OP(REGNODE_p(ender)) == TAIL) {
NEXT_OFF(br)= 0;
do_curly:
if ((flags&SIMPLE)) {
if (min == 0 && max == REG_INFTY) {
+
+ /* Going from 0..inf is currently forbidden in wildcard
+ * subpatterns. The only reason is to make it harder to
+ * write patterns that take a long long time to halt, and
+ * because the use of this construct isn't necessary in
+ * matching Unicode property values */
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ RExC_parse++;
+ /* diag_listed_as: Use of %s is not allowed in Unicode
+ property wildcard subpatterns in regex; marked by
+ <-- HERE in m/%s/ */
+ vFAIL("Use of quantifier '*' is not allowed in"
+ " Unicode property wildcard subpatterns");
+ /* Note, don't need to worry about {0,}, as a '}' isn't
+ * legal at all in wildcards, so wouldn't get this far
+ * */
+ }
reginsert(pRExC_state, STAR, ret, depth+1);
MARK_NAUGHTY(4);
RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
}
nest_check:
if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
- ckWARN2reg(RExC_parse,
- "%" UTF8f " matches null string many times",
- UTF8fARG(UTF, (RExC_parse >= origparse
- ? RExC_parse - origparse
- : 0),
- origparse));
+ if (origparse[0] == '\\' && origparse[1] == 'K') {
+ vFAIL2utf8f(
+ "%" UTF8f " is forbidden - matches null string many times",
+ UTF8fARG(UTF, (RExC_parse >= origparse
+ ? RExC_parse - origparse
+ : 0),
+ origparse));
+ /* NOT-REACHED */
+ } else {
+ ckWARN2reg(RExC_parse,
+ "%" UTF8f " matches null string many times",
+ UTF8fARG(UTF, (RExC_parse >= origparse
+ ? RExC_parse - origparse
+ : 0),
+ origparse));
+ }
}
if (*RExC_parse == '?') {
* thing. */
do { /* Loop until the ending brace */
- UV cp = 0;
- char * start_digit; /* The first of the current code point */
- if (! isXDIGIT(*RExC_parse)) {
+ I32 flags = PERL_SCAN_SILENT_OVERFLOW
+ | PERL_SCAN_SILENT_ILLDIGIT
+ | PERL_SCAN_NOTIFY_ILLDIGIT
+ | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX;
+ STRLEN len = endbrace - RExC_parse;
+ NV overflow_value;
+ char * start_digit = RExC_parse;
+ UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
+
+ if (len == 0) {
RExC_parse++;
+ bad_NU:
vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
- start_digit = RExC_parse;
- count++;
-
- /* Loop through the hex digits of the current code point */
- do {
- /* Adding this digit will shift the result 4 bits. If that
- * result would be above the legal max, it's overflow */
- if (cp > MAX_LEGAL_CP >> 4) {
-
- /* Find the end of the code point */
- do {
- RExC_parse ++;
- } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_');
-
- /* Be sure to synchronize this message with the similar one
- * in utf8.c */
- vFAIL4("Use of code point 0x%.*s is not allowed; the"
- " permissible max is 0x%" UVxf,
- (int) (RExC_parse - start_digit), start_digit,
- MAX_LEGAL_CP);
- }
-
- /* Accumulate this (valid) digit into the running total */
- cp = (cp << 4) + READ_XDIGIT(RExC_parse);
+ RExC_parse += len;
- /* READ_XDIGIT advanced the input pointer. Ignore a single
- * underscore separator */
- if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) {
- RExC_parse++;
- }
- } while (isXDIGIT(*RExC_parse));
+ if (cp > MAX_LEGAL_CP) {
+ vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
+ }
- /* Here, have accumulated the next code point */
- if (RExC_parse >= endbrace) { /* If done ... */
- if (count != 1) {
+ if (RExC_parse >= endbrace) { /* Got to the closing '}' */
+ if (count) {
goto do_concat;
}
return TRUE;
}
- /* Here, the only legal thing would be a multiple character
- * sequence (of the form "\N{U+c1.c2. ... }". So the next
- * character must be a dot (and the one after that can't be the
- * endbrace, or we'd have something like \N{U+100.} ) */
+ /* Here, the parse stopped bfore the ending brace. This is legal
+ * only if that character is a dot separating code points, like a
+ * multiple character sequence (of the form "\N{U+c1.c2. ... }".
+ * So the next character must be a dot (and the one after that
+ * can't be the endbrace, or we'd have something like \N{U+100.} )
+ * */
if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
- ? UTF8SKIP(RExC_parse)
- : 1;
- if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */
- RExC_parse = endbrace;
- }
- vFAIL("Invalid hexadecimal number in \\N{U+...}");
+ ? UTF8SKIP(RExC_parse)
+ : 1;
+ RExC_parse = MIN(endbrace, RExC_parse);/* Guard against
+ malformed utf8 */
+ goto bad_NU;
}
/* Here, looks like its really a multiple character sequence. Fail
* but go through the motions of code point counting and error
* checking, if the caller doesn't want a node returned. */
- if (node_p && count == 1) {
+ if (node_p && ! substitute_parse) {
substitute_parse = newSVpvs("?:");
}
/* Special Escapes */
case 'A':
RExC_seen_zerolen++;
- ret = reg_node(pRExC_state, SBOL);
- /* SBOL is shared with /^/ so we set the flags so we can tell
- * /\A/ from /^/ in split. */
- FLAGS(REGNODE_p(ret)) = 1;
+ /* Under wildcards, this is changed to match \n; should be
+ * invisible to the user, as they have to compile under /m */
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ ret = reg_node(pRExC_state, MBOL);
+ }
+ else {
+ ret = reg_node(pRExC_state, SBOL);
+ /* SBOL is shared with /^/ so we set the flags so we can tell
+ * /\A/ from /^/ in split. */
+ FLAGS(REGNODE_p(ret)) = 1;
+ }
*flagp |= SIMPLE;
goto finish_meta_pat;
case 'G':
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ RExC_parse++;
+ /* diag_listed_as: Use of %s is not allowed in Unicode property
+ wildcard subpatterns in regex; marked by <-- HERE in m/%s/
+ */
+ vFAIL("Use of '\\G' is not allowed in Unicode property"
+ " wildcard subpatterns");
+ }
ret = reg_node(pRExC_state, GPOS);
RExC_seen |= REG_GPOS_SEEN;
*flagp |= SIMPLE;
vFAIL("\\K not permitted in lookahead/lookbehind");
}
case 'Z':
- ret = reg_node(pRExC_state, SEOL);
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ /* See comment under \A above */
+ ret = reg_node(pRExC_state, MEOL);
+ }
+ else {
+ ret = reg_node(pRExC_state, SEOL);
+ }
*flagp |= SIMPLE;
RExC_seen_zerolen++; /* Do not optimize RE away */
goto finish_meta_pat;
case 'z':
- ret = reg_node(pRExC_state, EOS);
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ /* See comment under \A above */
+ ret = reg_node(pRExC_state, MEOL);
+ }
+ else {
+ ret = reg_node(pRExC_state, EOS);
+ }
*flagp |= SIMPLE;
RExC_seen_zerolen++; /* Do not optimize RE away */
goto finish_meta_pat;
|| ! is_PATWS_safe((p), RExC_end, UTF));
switch ((U8)*p) {
+ const char* message;
+ U32 packed_warn;
+ U8 grok_c_char;
+
case '^':
case '$':
case '.':
p++;
break;
case 'o':
- {
- UV result;
- const char* error_msg;
-
- bool valid = grok_bslash_o(&p,
- RExC_end,
- &result,
- &error_msg,
- TO_OUTPUT_WARNINGS(p),
- (bool) RExC_strict,
- TRUE, /* Output warnings
- for non-
- portables */
- UTF);
- if (! valid) {
- RExC_parse = p; /* going to die anyway; point
- to exact spot of failure */
- vFAIL(error_msg);
- }
- UPDATE_WARNINGS_LOC(p - 1);
- ender = result;
- break;
- }
+ if (! grok_bslash_o(&p,
+ RExC_end,
+ &ender,
+ &message,
+ &packed_warn,
+ (bool) RExC_strict,
+ FALSE, /* No illegal cp's */
+ UTF))
+ {
+ RExC_parse = p; /* going to die anyway; point to
+ exact spot of failure */
+ vFAIL(message);
+ }
+
+ if (message && TO_OUTPUT_WARNINGS(p)) {
+ warn_non_literal_string(p, packed_warn, message);
+ }
+ break;
case 'x':
- {
- UV result = UV_MAX; /* initialize to erroneous
- value */
- const char* error_msg;
-
- bool valid = grok_bslash_x(&p,
- RExC_end,
- &result,
- &error_msg,
- TO_OUTPUT_WARNINGS(p),
- (bool) RExC_strict,
- TRUE, /* Silence warnings
- for non-
- portables */
- UTF);
- if (! valid) {
- RExC_parse = p; /* going to die anyway; point
- to exact spot of failure */
- vFAIL(error_msg);
- }
- UPDATE_WARNINGS_LOC(p - 1);
- ender = result;
+ if (! grok_bslash_x(&p,
+ RExC_end,
+ &ender,
+ &message,
+ &packed_warn,
+ (bool) RExC_strict,
+ FALSE, /* No illegal cp's */
+ UTF))
+ {
+ RExC_parse = p; /* going to die anyway; point
+ to exact spot of failure */
+ vFAIL(message);
+ }
+
+ if (message && TO_OUTPUT_WARNINGS(p)) {
+ warn_non_literal_string(p, packed_warn, message);
+ }
#ifdef EBCDIC
- if (ender < 0x100) {
- if (RExC_recode_x_to_native) {
- ender = LATIN1_TO_NATIVE(ender);
- }
- }
+ if (ender < 0x100) {
+ if (RExC_recode_x_to_native) {
+ ender = LATIN1_TO_NATIVE(ender);
+ }
+ }
#endif
- break;
- }
+ break;
case 'c':
- p++;
- ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p));
- UPDATE_WARNINGS_LOC(p);
p++;
+ if (! grok_bslash_c(*p, &grok_c_char,
+ &message, &packed_warn))
+ {
+ /* going to die anyway; point to exact spot of
+ * failure */
+ RExC_parse = p + ((UTF)
+ ? UTF8_SAFE_SKIP(p, RExC_end)
+ : 1);
+ vFAIL(message);
+ }
+
+ ender = grok_c_char;
+ p++;
+ if (message && TO_OUTPUT_WARNINGS(p)) {
+ warn_non_literal_string(p, packed_warn, message);
+ }
+
break;
case '8': case '9': /* must be a backreference */
--p;
/* FALLTHROUGH */
case '0':
{
- I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
+ I32 flags = PERL_SCAN_SILENT_ILLDIGIT
+ | PERL_SCAN_NOTIFY_ILLDIGIT;
STRLEN numlen = 3;
ender = grok_oct(p, &numlen, &flags, NULL);
p += numlen;
- if ( isDIGIT(*p) /* like \08, \178 */
- && ckWARN(WARN_REGEXP)
- && numlen < 3)
+ if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
+ && isDIGIT(*p) /* like \08, \178 */
+ && ckWARN(WARN_REGEXP))
{
reg_warn_non_literal_string(
- p + 1,
- form_short_octal_warning(p, numlen));
+ p + 1,
+ form_alien_digit_msg(8, numlen, p,
+ RExC_end, UTF, FALSE));
}
}
break;
if (ender > 255) {
REQUIRE_UTF8(flagp);
+ if ( UNICODE_IS_PERL_EXTENDED(ender)
+ && TO_OUTPUT_WARNINGS(p))
+ {
+ ckWARN2_non_literal_string(p,
+ packWARN(WARN_PORTABLE),
+ PL_extended_cp_format,
+ ender);
+ }
}
/* We need to check if the next non-ignored thing is a
else if (FOLD) {
bool splittable = FALSE;
bool backed_up = FALSE;
- char * e;
- char * s_start;
+ char * e; /* should this be U8? */
+ char * s_start; /* should this be U8? */
/* Here is /i. Running out of room creates a problem if we are
* folding, and the split happens in the middle of a
if ( ender != LATIN_SMALL_LETTER_SHARP_S
|| ASCII_FOLD_RESTRICTED)
{
- *e++ = toLOWER_L1(ender);
+ assert( toLOWER_L1(ender) < 256 );
+ *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
}
else {
*e++ = 's';
if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
|| ASCII_FOLD_RESTRICTED)
{
- *e++ = toLOWER_L1(ender);
+ assert( toLOWER_L1(ender) < 256 );
+ *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
}
else {
*e++ = 's';
with left paren in stack is; -1 if none.
*/
STRLEN len; /* Temporary */
- regnode_offset node; /* Temporary, and final regnode returned by
+ regnode_offset node; /* Temporary, and final regnode returned by
this function */
const bool save_fold = FOLD; /* Temporary */
char *save_end, *save_parse; /* Temporaries */
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
+ PERL_UNUSED_ARG(oregcomp_parse); /* Only for Set_Node_Length */
DEBUG_PARSE("xcls");
SV* rhs; /* Operand to the right of the operator */
SV* fence_ptr; /* Pointer to top element of the fence
stack */
-
case '(':
if ( RExC_parse < RExC_end - 2
&& UCHARAT(RExC_parse + 1) == '?'
&& UCHARAT(RExC_parse + 2) == '^')
{
- /* If is a '(?', could be an embedded '(?^flags:(?[...])'.
+ const regnode_offset orig_emit = RExC_emit;
+ SV * resultant_invlist;
+
+ /* If is a '(?^', could be an embedded '(?^flags:(?[...])'.
* This happens when we have some thing like
*
* my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
*
* Here we would be handling the interpolated
* '$thai_or_lao'. We handle this by a recursive call to
- * ourselves which returns the inversion list the
- * interpolated expression evaluates to. We use the flags
- * from the interpolated pattern. */
- U32 save_flags = RExC_flags;
- const char * save_parse;
-
- RExC_parse += 2; /* Skip past the '(?' */
- save_parse = RExC_parse;
-
- /* Parse the flags for the '(?'. We already know the first
- * flag to parse is a '^' */
- parse_lparen_question_flags(pRExC_state);
-
- if ( RExC_parse >= RExC_end - 4
- || UCHARAT(RExC_parse) != ':'
- || UCHARAT(++RExC_parse) != '('
- || UCHARAT(++RExC_parse) != '?'
- || UCHARAT(++RExC_parse) != '[')
- {
+ * reg which returns the inversion list the
+ * interpolated expression evaluates to. Actually, the
+ * return is a special regnode containing a pointer to that
+ * inversion list. If the return isn't that regnode alone,
+ * we know that this wasn't such an interpolation, which is
+ * an error: we need to get a single inversion list back
+ * from the recursion */
- /* In combination with the above, this moves the
- * pointer to the point just after the first erroneous
- * character. */
- if (RExC_parse >= RExC_end - 4) {
- RExC_parse = RExC_end;
- }
- else if (RExC_parse != save_parse) {
- RExC_parse += (UTF)
- ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
- : 1;
- }
- vFAIL("Expecting '(?flags:(?[...'");
- }
-
- /* Recurse, with the meat of the embedded expression */
RExC_parse++;
- if (! handle_regex_sets(pRExC_state, ¤t, flagp,
- depth+1, oregcomp_parse))
- {
- RETURN_FAIL_ON_RESTART(*flagp, flagp);
- }
+ RExC_sets_depth++;
- /* Here, 'current' contains the embedded expression's
- * inversion list, and RExC_parse points to the trailing
- * ']'; the next character should be the ')' */
- RExC_parse++;
- if (UCHARAT(RExC_parse) != ')')
- vFAIL("Expecting close paren for nested extended charclass");
+ node = reg(pRExC_state, 2, flagp, depth+1);
+ RETURN_FAIL_ON_RESTART(*flagp, flagp);
- /* Then the ')' matching the original '(' handled by this
- * case: statement */
- RExC_parse++;
- if (UCHARAT(RExC_parse) != ')')
- vFAIL("Expecting close paren for wrapper for nested extended charclass");
+ if ( OP(REGNODE_p(node)) != REGEX_SET
+ /* If more than a single node returned, the nested
+ * parens evaluated to more than just a (?[...]),
+ * which isn't legal */
+ || node != 1) {
+ vFAIL("Expecting interpolated extended charclass");
+ }
+ resultant_invlist = (SV *) ARGp(REGNODE_p(node));
+ current = invlist_clone(resultant_invlist, NULL);
+ SvREFCNT_dec(resultant_invlist);
- RExC_flags = save_flags;
+ RExC_sets_depth--;
+ RExC_emit = orig_emit;
goto handle_operand;
}
return END;
}
- /* Otherwise generate a resultant node, based on 'final'. regclass() is
- * expecting a string of ranges and individual code points */
- invlist_iterinit(final);
- result_string = newSVpvs("");
- while (invlist_iternext(final, &start, &end)) {
- if (start == end) {
- Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
- }
- else {
- Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}",
- start, end);
- }
+ if (RExC_sets_depth) { /* If within a recursive call, return in a special
+ regnode */
+ RExC_parse++;
+ node = regpnode(pRExC_state, REGEX_SET, (void *) final);
}
+ else {
- /* About to generate an ANYOF (or similar) node from the inversion list we
- * have calculated */
- save_parse = RExC_parse;
- RExC_parse = SvPV(result_string, len);
- save_end = RExC_end;
- RExC_end = RExC_parse + len;
- TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
+ /* Otherwise generate a resultant node, based on 'final'. regclass()
+ * is expecting a string of ranges and individual code points */
+ invlist_iterinit(final);
+ result_string = newSVpvs("");
+ while (invlist_iternext(final, &start, &end)) {
+ if (start == end) {
+ Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
+ }
+ else {
+ Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
+ UVXf "}", start, end);
+ }
+ }
+
+ /* About to generate an ANYOF (or similar) node from the inversion list
+ * we have calculated */
+ save_parse = RExC_parse;
+ RExC_parse = SvPV(result_string, len);
+ save_end = RExC_end;
+ RExC_end = RExC_parse + len;
+ TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
+
+ /* We turn off folding around the call, as the class we have
+ * constructed already has all folding taken into consideration, and we
+ * don't want regclass() to add to that */
+ RExC_flags &= ~RXf_PMf_FOLD;
+ /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
+ * folds are allowed. */
+ node = regclass(pRExC_state, flagp, depth+1,
+ FALSE, /* means parse the whole char class */
+ FALSE, /* don't allow multi-char folds */
+ TRUE, /* silence non-portable warnings. The above may
+ very well have generated non-portable code
+ points, but they're valid on this machine */
+ FALSE, /* similarly, no need for strict */
+
+ /* We can optimize into something besides an ANYOF,
+ * except under /l, which needs to be ANYOF because of
+ * runtime checks for locale sanity, etc */
+ ! in_locale,
+ NULL
+ );
- /* We turn off folding around the call, as the class we have constructed
- * already has all folding taken into consideration, and we don't want
- * regclass() to add to that */
- RExC_flags &= ~RXf_PMf_FOLD;
- /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
- * folds are allowed. */
- node = regclass(pRExC_state, flagp, depth+1,
- FALSE, /* means parse the whole char class */
- FALSE, /* don't allow multi-char folds */
- TRUE, /* silence non-portable warnings. The above may very
- well have generated non-portable code points, but
- they're valid on this machine */
- FALSE, /* similarly, no need for strict */
-
- /* We can optimize into something besides an ANYOF, except
- * under /l, which needs to be ANYOF because of runtime
- * checks for locale sanity, etc */
- ! in_locale,
- NULL
- );
+ RESTORE_WARNINGS;
+ RExC_parse = save_parse + 1;
+ RExC_end = save_end;
+ SvREFCNT_dec_NN(final);
+ SvREFCNT_dec_NN(result_string);
- RESTORE_WARNINGS;
- RExC_parse = save_parse + 1;
- RExC_end = save_end;
- SvREFCNT_dec_NN(final);
- SvREFCNT_dec_NN(result_string);
-
- if (save_fold) {
- RExC_flags |= RXf_PMf_FOLD;
- }
-
- if (!node) {
- RETURN_FAIL_ON_RESTART(*flagp, flagp);
- goto regclass_failed;
- }
-
- /* Fix up the node type if we are in locale. (We have pretended we are
- * under /u for the purposes of regclass(), as this construct will only
- * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so
- * as to cause any warnings about bad locales to be output in regexec.c),
- * and add the flag that indicates to check if not in a UTF-8 locale. The
- * reason we above forbid optimization into something other than an ANYOF
- * node is simply to minimize the number of code changes in regexec.c.
- * Otherwise we would have to create new EXACTish node types and deal with
- * them. This decision could be revisited should this construct become
- * popular.
- *
- * (One might think we could look at the resulting ANYOF node and suppress
- * the flag if everything is above 255, as those would be UTF-8 only,
- * but this isn't true, as the components that led to that result could
- * have been locale-affected, and just happen to cancel each other out
- * under UTF-8 locales.) */
- if (in_locale) {
- set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
+ if (save_fold) {
+ RExC_flags |= RXf_PMf_FOLD;
+ }
+
+ if (!node) {
+ RETURN_FAIL_ON_RESTART(*flagp, flagp);
+ goto regclass_failed;
+ }
+
+ /* Fix up the node type if we are in locale. (We have pretended we are
+ * under /u for the purposes of regclass(), as this construct will only
+ * work under UTF-8 locales. But now we change the opcode to be ANYOFL
+ * (so as to cause any warnings about bad locales to be output in
+ * regexec.c), and add the flag that indicates to check if not in a
+ * UTF-8 locale. The reason we above forbid optimization into
+ * something other than an ANYOF node is simply to minimize the number
+ * of code changes in regexec.c. Otherwise we would have to create new
+ * EXACTish node types and deal with them. This decision could be
+ * revisited should this construct become popular.
+ *
+ * (One might think we could look at the resulting ANYOF node and
+ * suppress the flag if everything is above 255, as those would be
+ * UTF-8 only, but this isn't true, as the components that led to that
+ * result could have been locale-affected, and just happen to cancel
+ * each other out under UTF-8 locales.) */
+ if (in_locale) {
+ set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
- assert(OP(REGNODE_p(node)) == ANYOF);
+ assert(OP(REGNODE_p(node)) == ANYOF);
- OP(REGNODE_p(node)) = ANYOFL;
- ANYOF_FLAGS(REGNODE_p(node))
- |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
+ OP(REGNODE_p(node)) = ANYOFL;
+ ANYOF_FLAGS(REGNODE_p(node))
+ |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
+ }
}
nextchar(pRExC_state);
current Unicode version */
{
Size_t folds_count;
- unsigned int first_fold;
- const unsigned int * remaining_folds;
+ U32 first_fold;
+ const U32 * remaining_folds;
UV folded_cp;
if (isASCII(cp)) {
PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
+ CLEAR_POSIX_WARNINGS();
return;
}
* is already in 'value'. Otherwise, need to translate the escape
* into what it signifies. */
if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
+ const char * message;
+ U32 packed_warn;
+ U8 grok_c_char;
case 'w': namedclass = ANYOF_WORDCHAR; break;
case 'W': namedclass = ANYOF_NWORDCHAR; break;
{
char *e;
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ RExC_parse++;
+ /* diag_listed_as: Use of %s is not allowed in Unicode
+ property wildcard subpatterns in regex; marked by <--
+ HERE in m/%s/ */
+ vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
+ " wildcard subpatterns", (char) value, *(RExC_parse - 1));
+ }
+
/* \p means they want Unicode semantics */
REQUIRE_UNI_RULES(flagp, 0);
case 'a': value = '\a'; break;
case 'o':
RExC_parse--; /* function expects to be pointed at the 'o' */
- {
- const char* error_msg;
- bool valid = grok_bslash_o(&RExC_parse,
- RExC_end,
- &value,
- &error_msg,
- TO_OUTPUT_WARNINGS(RExC_parse),
- strict,
- silence_non_portable,
- UTF);
- if (! valid) {
- vFAIL(error_msg);
- }
- UPDATE_WARNINGS_LOC(RExC_parse - 1);
- }
- non_portable_endpoint++;
+ if (! grok_bslash_o(&RExC_parse,
+ RExC_end,
+ &value,
+ &message,
+ &packed_warn,
+ strict,
+ cBOOL(range), /* MAX_UV allowed for range
+ upper limit */
+ UTF))
+ {
+ vFAIL(message);
+ }
+ else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+ warn_non_literal_string(RExC_parse, packed_warn, message);
+ }
+
+ if (value < 256) {
+ non_portable_endpoint++;
+ }
break;
case 'x':
RExC_parse--; /* function expects to be pointed at the 'x' */
- {
- const char* error_msg;
- bool valid = grok_bslash_x(&RExC_parse,
- RExC_end,
- &value,
- &error_msg,
- TO_OUTPUT_WARNINGS(RExC_parse),
- strict,
- silence_non_portable,
- UTF);
- if (! valid) {
- vFAIL(error_msg);
- }
- UPDATE_WARNINGS_LOC(RExC_parse - 1);
- }
- non_portable_endpoint++;
+ if (! grok_bslash_x(&RExC_parse,
+ RExC_end,
+ &value,
+ &message,
+ &packed_warn,
+ strict,
+ cBOOL(range), /* MAX_UV allowed for range
+ upper limit */
+ UTF))
+ {
+ vFAIL(message);
+ }
+ else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+ warn_non_literal_string(RExC_parse, packed_warn, message);
+ }
+
+ if (value < 256) {
+ non_portable_endpoint++;
+ }
break;
case 'c':
- value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse));
- UPDATE_WARNINGS_LOC(RExC_parse);
- RExC_parse++;
+ if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
+ &packed_warn))
+ {
+ /* going to die anyway; point to exact spot of
+ * failure */
+ RExC_parse += (UTF)
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
+ vFAIL(message);
+ }
+
+ value = grok_c_char;
+ RExC_parse++;
+ if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+ warn_non_literal_string(RExC_parse, packed_warn, message);
+ }
+
non_portable_endpoint++;
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7':
{
/* Take 1-3 octal digits */
- I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
+ I32 flags = PERL_SCAN_SILENT_ILLDIGIT
+ | PERL_SCAN_NOTIFY_ILLDIGIT;
numlen = (strict) ? 4 : 3;
value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
RExC_parse += numlen;
: 1;
vFAIL("Need exactly 3 octal digits");
}
- else if ( numlen < 3 /* like \08, \178 */
+ else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
&& RExC_parse < RExC_end
&& isDIGIT(*RExC_parse)
&& ckWARN(WARN_REGEXP))
{
reg_warn_non_literal_string(
RExC_parse + 1,
- form_short_octal_warning(RExC_parse, numlen));
+ form_alien_digit_msg(8, numlen, RExC_parse,
+ RExC_end, UTF, FALSE));
}
}
- non_portable_endpoint++;
+ if (value < 256) {
+ non_portable_endpoint++;
+ }
break;
}
default:
/* non-Latin1 code point implies unicode semantics. */
if (value > 255) {
+ if (value > MAX_LEGAL_CP && ( value != UV_MAX
+ || prevvalue > MAX_LEGAL_CP))
+ {
+ vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
+ }
REQUIRE_UNI_RULES(flagp, 0);
+ if ( ! silence_non_portable
+ && UNICODE_IS_PERL_EXTENDED(value)
+ && TO_OUTPUT_WARNINGS(RExC_parse))
+ {
+ ckWARN2_non_literal_string(RExC_parse,
+ packWARN(WARN_PORTABLE),
+ PL_extended_cp_format,
+ value);
+ }
}
/* Ready to process either the single value, or the completed range.
STRLEN foldlen;
unsigned int k;
Size_t folds_count;
- unsigned int first_fold;
- const unsigned int * remaining_folds;
+ U32 first_fold;
+ const U32 * remaining_folds;
if (j < 256) {
invert = FALSE;
}
+ /* All possible optimizations below still have these characteristics.
+ * (Multi-char folds aren't SIMPLE, but they don't get this far in this
+ * routine) */
+ *flagp |= HASWIDTH|SIMPLE;
+
if (ret_invlist) {
*ret_invlist = cp_list;
return RExC_emit;
}
- /* All possible optimizations below still have these characteristics.
- * (Multi-char folds aren't SIMPLE, but they don't get this far in this
- * routine) */
- *flagp |= HASWIDTH|SIMPLE;
-
if (anyof_flags & ANYOF_LOCALE_FLAGS) {
RExC_contains_locale = 1;
}
U8 foldbuf[UTF8_MAXBYTES_CASE];
UV folded = _to_uni_fold_flags(start[0],
foldbuf, &foldlen, 0);
- unsigned int first_fold;
- const unsigned int * remaining_folds;
+ U32 first_fold;
+ const U32 * remaining_folds;
Size_t folds_to_this_cp_count = _inverse_folds(
folded,
&first_fold,
/* The data consists of just strings defining user-defined
* property names, but in prior incarnations, and perhaps
* somehow from pluggable regex engines, it could still
- * hold hex code point definitions. Each component of a
- * range would be separated by a tab, and each range by a
- * new-line. If these are found, instead add them to the
- * inversion list */
+ * hold hex code point definitions, all of which should be
+ * legal (or it wouldn't have gotten this far). Each
+ * component of a range would be separated by a tab, and
+ * each range by a new-line. If these are found, instead
+ * add them to the inversion list */
I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
|PERL_SCAN_SILENT_NON_PORTABLE;
STRLEN len = remaining;
remaining -= len;
len = strcspn(si_string,
- DEFERRED_PROP_EXPANSION_MARKERs "\n");
+ DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
remaining -= len;
if (matches_string) {
sv_catpvn(matches_string, si_string, len);
si_string += len;
if ( remaining
&& UCHARAT(si_string)
- == DEFERRED_PROP_EXPANSION_MARKERc)
+ == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
{
si_string++;
remaining--;
return(ret);
}
+/*
+- regpnode - emit a temporary node with a void* argument
+*/
+STATIC regnode_offset /* Location. */
+S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, void * arg)
+{
+ const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regvnode");
+ regnode_offset ptr = ret;
+
+ PERL_ARGS_ASSERT_REGPNODE;
+
+ FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
+ RExC_emit = ptr;
+ return(ret);
+}
+
STATIC regnode_offset
S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
{
SV *
Perl_re_intuit_string(pTHX_ REGEXP * const r)
{ /* Assume that RE_INTUIT is set */
+ /* Returns an SV containing a string that must appear in the target for it
+ * to match */
+
struct regexp *const prog = ReANY(r);
GET_RE_DEBUG_FLAGS_DECL;
DEBUG_COMPILE_r(
{
- const char * const s = SvPV_nolen_const(RX_UTF8(r)
+ if (prog->maxlen > 0) {
+ const char * const s = SvPV_nolen_const(RX_UTF8(r)
? prog->check_utf8 : prog->check_substr);
- if (!PL_colorset) reginitcolors();
- Perl_re_printf( aTHX_
+ if (!PL_colorset) reginitcolors();
+ Perl_re_printf( aTHX_
"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
PL_colors[4],
RX_UTF8(r) ? "utf8 " : "",
s,
PL_colors[1],
(strlen(s) > PL_dump_re_max_len ? "..." : ""));
+ }
} );
/* use UTF8 check substring if regexp pattern itself is in UTF8 */
#ifndef PERL_IN_XSUB_RE
-#include "uni_keywords.h"
+# include "uni_keywords.h"
void
Perl_init_uniprops(pTHX)
{
dVAR;
-#ifdef DEBUGGING
+# ifdef DEBUGGING
char * dump_len_string;
dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
{
PL_dump_re_max_len = 60; /* A reasonable default */
}
-#endif
+# endif
PL_user_def_props = newHV();
-#ifdef USE_ITHREADS
+# ifdef USE_ITHREADS
HvSHAREKEYS_off(PL_user_def_props);
PL_user_def_props_aTHX = aTHX;
-#endif
+# endif
/* Set up the inversion list interpreter-level variables */
PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
- PL_InBitmap = _new_invlist_C_array(_Perl_InBitmap_invlist);
+ PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
-#ifdef UNI_XIDC
+# ifdef UNI_XIDC
/* The below are used only by deprecated functions. They could be removed */
PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
-#endif
+# endif
}
-#if 0
+# if 0
This code was mainly added for backcompat to give a warning for non-portable
code points in user-defined properties. But experiments showed that the
return SvPVX(*msg);
}
-#endif
+# endif
+
+STATIC REGEXP *
+S_compile_wildcard(pTHX_ const char * name, const STRLEN len,
+ const bool ignore_case)
+{
+ U32 flags = PMf_MULTILINE|PMf_WILDCARD;
+ REGEXP * subpattern_re;
+
+ PERL_ARGS_ASSERT_COMPILE_WILDCARD;
+
+ if (ignore_case) {
+ flags |= PMf_FOLD;
+ }
+ set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
+
+ subpattern_re = re_op_compile_wrapper(sv_2mortal(newSVpvn(name, len)),
+ /* Like in op.c, we copy the compile
+ * time pm flags to the rx ones */
+ (flags & RXf_PMf_COMPILETIME), flags);
+
+ assert(subpattern_re); /* Should have died if didn't compile successfully */
+ return subpattern_re;
+}
+
+STATIC I32
+S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
+ char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
+{
+ I32 result;
+
+ PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
+
+ result = pregexec(prog, stringarg, strend, strbeg, minend, screamer, nosave);
+
+ return result;
+}
SV *
Perl_handle_user_defined_property(pTHX_
goto return_failure;
}
-#if 0 /* See explanation at definition above of get_extended_utf8_msg() */
+# if 0 /* See explanation at definition above of get_extended_utf8_msg() */
if ( UNICODE_IS_PERL_EXTENDED(min)
|| UNICODE_IS_PERL_EXTENDED(max))
sv_catpvs(msg, "\"");
}
-#endif
+# endif
/* Here, this line contains a legal range */
this_definition = sv_2mortal(_new_invlist(2));
/* As explained below, certain operations need to take place in the first
* thread created. These macros switch contexts */
-#ifdef USE_ITHREADS
-# define DECLARATION_FOR_GLOBAL_CONTEXT \
+# ifdef USE_ITHREADS
+# define DECLARATION_FOR_GLOBAL_CONTEXT \
PerlInterpreter * save_aTHX = aTHX;
-# define SWITCH_TO_GLOBAL_CONTEXT \
+# define SWITCH_TO_GLOBAL_CONTEXT \
PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
-# define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
-# define CUR_CONTEXT aTHX
-# define ORIGINAL_CONTEXT save_aTHX
-#else
-# define DECLARATION_FOR_GLOBAL_CONTEXT
-# define SWITCH_TO_GLOBAL_CONTEXT NOOP
-# define RESTORE_CONTEXT NOOP
-# define CUR_CONTEXT NULL
-# define ORIGINAL_CONTEXT NULL
-#endif
+# define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
+# define CUR_CONTEXT aTHX
+# define ORIGINAL_CONTEXT save_aTHX
+# else
+# define DECLARATION_FOR_GLOBAL_CONTEXT
+# define SWITCH_TO_GLOBAL_CONTEXT NOOP
+# define RESTORE_CONTEXT NOOP
+# define CUR_CONTEXT NULL
+# define ORIGINAL_CONTEXT NULL
+# endif
STATIC void
S_delete_recursion_entry(pTHX_ void *key)
user-defined property */
SV * msg, /* Any error or warning msg(s) are appended to
this */
- const STRLEN level) /* Recursion level of this call */
+ const STRLEN level) /* Recursion level of this call */
{
dVAR;
char* lookup_name; /* normalized name for lookup in our tables */
unsigned lookup_len; /* Its length */
- bool stricter = FALSE; /* Some properties have stricter name
- normalization rules, which we decide upon
- based on parsing */
+ enum { Not_Strict = 0, /* Some properties have stricter name */
+ Strict, /* normalization rules, which we decide */
+ As_Is /* upon based on parsing */
+ } stricter = Not_Strict;
/* nv= or numeric_value=, or possibly one of the cjk numeric properties
* (though it requires extra effort to download them from Unicode and
the normalized name in certain situations */
Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
part of a package name */
+ Size_t lun_non_pkg_begin = 0; /* Similarly for 'lookup_name' */
bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
property rather than a Unicode
one. */
bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
explicit utf8:: package that we strip
off */
+ /* The expansion of properties that could be either user-defined or
+ * official unicode ones is deferred until runtime, including a marker for
+ * those that might be in the latter category. This boolean indicates if
+ * we've seen that marker. If not, what we're parsing can't be such an
+ * official Unicode property whose expansion was deferred */
+ bool could_be_deferred_official = FALSE;
PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
}
/* If this looks like it is a marker we inserted at compile time,
- * ignore it; otherwise keep it as it would have been user input. */
- if ( UNLIKELY(cur == DEFERRED_PROP_EXPANSION_MARKERc)
+ * set a flag and otherwise ignore it. If it isn't in the final
+ * position, keep it as it would have been user input. */
+ if ( UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
&& ! deferrable
&& could_be_user_defined
&& i == name_len - 1)
{
name_len--;
+ could_be_deferred_official = TRUE;
continue;
}
i++;
non_pkg_begin = i + 1;
lookup_name[j++] = ':';
+ lun_non_pkg_begin = j;
}
else { /* Only word chars (and '::') can be in a user-defined name */
could_be_user_defined = FALSE;
} /* End of parsing through the lhs of the property name (or all of it if
no rhs) */
-#define STRLENs(s) (sizeof("" s "") - 1)
+# define STRLENs(s) (sizeof("" s "") - 1)
/* If there is a single package name 'utf8::', it is ambiguous. It could
* be for a user-defined property, or it could be a Unicode property, as
* or are positioned just after the '=' if it is compound. */
if (equals_pos >= 0) {
- assert(! stricter); /* We shouldn't have set this yet */
+ assert(stricter == Not_Strict); /* We shouldn't have set this yet */
/* Space immediately after the '=' is ignored */
i++;
/* Most punctuation after the equals indicates a subpattern, like
* \p{foo=/bar/} */
if ( isPUNCT_A(name[i])
- && name[i] != '-'
- && name[i] != '+'
- && name[i] != '_'
- && name[i] != '{')
+ && name[i] != '-'
+ && name[i] != '+'
+ && name[i] != '_'
+ && name[i] != '{'
+ /* A backslash means the real delimitter is the next character,
+ * but it must be punctuation */
+ && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
{
/* Find the property. The table includes the equals sign, so we
* use 'j' as-is */
if (table_index) {
const char * const * prop_values
= UNI_prop_value_ptrs[table_index];
- SV * subpattern;
- Size_t subpattern_len;
REGEXP * subpattern_re;
char open = name[i++];
char close;
const char * pos_in_brackets;
bool escaped = 0;
- /* A backslash means the real delimitter is the next character.
- * */
+ /* Backslash => delimitter is the character following. We
+ * already checked that it is punctuation */
if (open == '\\') {
open = name[i++];
escaped = 1;
* set of closing is so that if the opening is something like
* ']', the closing will be that as well. Something similar is
* done in toke.c */
- pos_in_brackets = strchr("([<)]>)]>", open);
+ pos_in_brackets = memCHRs("([<)]>)]>", open);
close = (pos_in_brackets) ? pos_in_brackets[3] : open;
if ( i >= name_len
|| name[name_len-1] != close
- || (escaped && name[name_len-2] != '\\'))
+ || (escaped && name[name_len-2] != '\\')
+ /* Also make sure that there are enough characters.
+ * e.g., '\\\' would show up incorrectly as legal even
+ * though it is too short */
+ || (SSize_t) (name_len - i - 1 - escaped) < 0)
{
sv_catpvs(msg, "Unicode property wildcard not terminated");
goto append_name_to_msg;
* pattern fails to compile, our added text to the user's
* pattern will be displayed to the user, which is not so
* desirable. */
- subpattern_len = name_len - i - 1 - escaped;
- subpattern = Perl_newSVpvf(aTHX_ "(?iaa:%.*s)",
- (unsigned) subpattern_len,
- name + i);
- subpattern = sv_2mortal(subpattern);
- subpattern_re = re_compile(subpattern, 0);
- assert(subpattern_re); /* Should have died if didn't compile
- successfully */
+ subpattern_re = compile_wildcard(name + i,
+ name_len - i - 1 - escaped,
+ TRUE /* /i */
+ );
/* For each legal property value, see if the supplied pattern
* matches it. */
const Size_t len = strlen(entry);
SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
- if (pregexec(subpattern_re,
+ if (execute_wildcard(subpattern_re,
(char *) entry,
(char *) entry + len,
(char *) entry, 0,
* some constructs in their subpattern, like \A. */
} /* End of is a wildcard subppattern */
+ /* \p{name=...} is handled specially. Instead of using the normal
+ * mechanism involving charclass_invlists.h, it uses _charnames.pm
+ * which has the necessary (huge) data accessible to it, and which
+ * doesn't get loaded unless necessary. The legal syntax for names is
+ * somewhat different than other properties due both to the vagaries of
+ * a few outlier official names, and the fact that only a few ASCII
+ * characters are permitted in them */
+ if ( memEQs(lookup_name, j - 1, "name")
+ || memEQs(lookup_name, j - 1, "na"))
+ {
+ dSP;
+ HV * table;
+ SV * character;
+ const char * error_msg;
+ CV* lookup_loose;
+ SV * character_name;
+ STRLEN character_len;
+ UV cp;
+
+ stricter = As_Is;
+
+ /* Since the RHS (after skipping initial space) is passed unchanged
+ * to charnames, and there are different criteria for what are
+ * legal characters in the name, just parse it here. A character
+ * name must begin with an ASCII alphabetic */
+ if (! isALPHA(name[i])) {
+ goto failed;
+ }
+ lookup_name[j++] = name[i];
+
+ for (++i; i < name_len; i++) {
+ /* Official names can only be in the ASCII range, and only
+ * certain characters */
+ if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
+ goto failed;
+ }
+ lookup_name[j++] = name[i];
+ }
+
+ /* Finished parsing, save the name into an SV */
+ character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
+
+ /* Make sure _charnames is loaded. (The parameters give context
+ * for any errors generated */
+ table = load_charnames(character_name, name, name_len, &error_msg);
+ if (table == NULL) {
+ sv_catpv(msg, error_msg);
+ goto append_name_to_msg;
+ }
+
+ lookup_loose = get_cv("_charnames::_loose_regcomp_lookup", 0);
+ if (! lookup_loose) {
+ Perl_croak(aTHX_
+ "panic: Can't find '_charnames::_loose_regcomp_lookup");
+ }
+
+ PUSHSTACKi(PERLSI_OVERLOAD);
+ ENTER ;
+ SAVETMPS;
+ save_re_context();
+
+ PUSHMARK(SP) ;
+ XPUSHs(character_name);
+ PUTBACK;
+ call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
+
+ SPAGAIN ;
+
+ character = POPs;
+ SvREFCNT_inc_simple_void_NN(character);
+
+ PUTBACK ;
+ FREETMPS ;
+ LEAVE ;
+ POPSTACK;
+
+ if (! SvOK(character)) {
+ goto failed;
+ }
+
+ cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
+ if (character_len < SvCUR(character)) {
+ goto failed;
+ }
+
+ prop_definition = add_cp_to_invlist(NULL, cp);
+ return prop_definition;
+ }
/* Certain properties whose values are numeric need special handling.
* They may optionally be prefixed by 'is'. Ignore that prefix for the
* But the numeric type properties can have the alphas [Ee] to
* signify an exponent, and it is still a number with stricter
* rules. So look for an alpha that signifies not-strict */
- stricter = TRUE;
+ stricter = Strict;
for (k = i; k < name_len; k++) {
if ( isALPHA_A(name[k])
&& (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
{
- stricter = FALSE;
+ stricter = Not_Strict;
break;
}
}
&& memNEs(lookup_name + 4, j - 4, "space")
&& memNEs(lookup_name + 4, j - 4, "word"))
{
- stricter = TRUE;
+ stricter = Strict;
/* We set the inputs back to 0 and the code below will reparse,
* using strict */
goto definition_deferred;
}
- /* If we haven't already stripped the package name (if one), do so
- * now so can look for an official property with the stripped name.
- * */
- if (! stripped_utf8_pkg) {
- lookup_name += non_pkg_begin;
- j -= non_pkg_begin;
+ /* Here, we are at runtime, and didn't find the user property. It
+ * could be an official property, but only if no package was
+ * specified, or just the utf8:: package. */
+ if (could_be_deferred_official) {
+ lookup_name += lun_non_pkg_begin;
+ j -= lun_non_pkg_begin;
+ }
+ else if (! stripped_utf8_pkg) {
+ goto unknown_user_defined;
}
/* Drop down to look up in the official properties */
* but not yet used. */
save_item(PL_subname);
+ /* G_SCALAR guarantees a single return value */
(void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
SPAGAIN;
(void) POPs;
prop_definition = NULL;
}
- else { /* G_SCALAR guarantees a single return value */
+ else {
SV * contents = POPs;
/* The contents is supposed to be the expansion of the property
* property hasn't been encountered yet, but at runtime, it's
* an error to try to use an undefined one */
if (! deferrable) {
- if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
- sv_catpvs(msg, "Unknown user-defined property name");
- goto append_name_to_msg;
+ goto unknown_user_defined;;
}
goto definition_deferred;
}
return prop_definition;
+ unknown_user_defined:
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Unknown user-defined property name");
+ goto append_name_to_msg;
failed:
if (non_pkg_begin != 0) {
* use at such time when we first need the definition during pattern
* matching execution */
if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
- sv_catpvs(fq_name, DEFERRED_PROP_EXPANSION_MARKERs);
+ sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
}
/* We also need a trailing newline */
}
}
-#endif
+#endif /* end of ! PERL_IN_XSUB_RE */
/*
* ex: set ts=8 sts=4 sw=4 et: