#define PERL_IN_REGCOMP_C
#include "perl.h"
-#ifndef PERL_IN_XSUB_RE
-# include "INTERN.h"
-#endif
-
#define REG_COMP_C
#ifdef PERL_IN_XSUB_RE
# include "re_comp.h"
char *parse; /* Input-scan pointer. */
char *copy_start; /* start of copy of input within
constructed parse string */
+ char *save_copy_start; /* Provides one level of saving
+ and restoring 'copy_start' */
char *copy_start_in_input; /* Position in input string
corresponding to copy_start */
SSize_t whilem_seen; /* number of WHILEM in this expr */
I32 seen_zerolen;
regnode_offset *open_parens; /* offsets to open parens */
regnode_offset *close_parens; /* offsets to close parens */
+ I32 parens_buf_size; /* #slots malloced open/close_parens */
regnode *end_op; /* END node in program */
I32 utf8; /* whether the pattern is utf8 or not */
I32 orig_utf8; /* whether the pattern was originally in utf8 */
through */
U32 study_chunk_recursed_bytes; /* bytes in bitmap */
I32 in_lookbehind;
+ I32 in_lookahead;
I32 contains_locale;
I32 override_recoding;
-#ifdef EBCDIC
- I32 recode_x_to_native;
-#endif
+ I32 recode_x_to_native;
I32 in_multi_char_class;
struct reg_code_blocks *code_blocks;/* positions of literal (?{})
within pattern */
scan_frame *frame_last;
U32 frame_count;
AV *warn_text;
+ HV *unlexed_names;
#ifdef ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
#define RExC_precomp (pRExC_state->precomp)
#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
+#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
#define RExC_precomp_end (pRExC_state->precomp_end)
#define RExC_rx_sv (pRExC_state->rx_sv)
#define RExC_rx (pRExC_state->rx)
#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
under /d from /u ? */
-
#ifdef RE_TRACK_PATTERN_OFFSETS
# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
others */
#define RExC_maxlen (pRExC_state->maxlen)
#define RExC_npar (pRExC_state->npar)
#define RExC_total_parens (pRExC_state->total_par)
+#define RExC_parens_buf_size (pRExC_state->parens_buf_size)
#define RExC_nestroot (pRExC_state->nestroot)
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
#define RExC_utf8 (pRExC_state->utf8)
#define RExC_study_chunk_recursed_bytes \
(pRExC_state->study_chunk_recursed_bytes)
#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
+#define RExC_in_lookahead (pRExC_state->in_lookahead)
#define RExC_contains_locale (pRExC_state->contains_locale)
+#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
+
#ifdef EBCDIC
-# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
+# define SET_recode_x_to_native(x) \
+ STMT_START { RExC_recode_x_to_native = (x); } STMT_END
+#else
+# define SET_recode_x_to_native(x) NOOP
#endif
+
#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
#define RExC_frame_head (pRExC_state->frame_head)
#define RExC_frame_last (pRExC_state->frame_last)
#define RExC_warn_text (pRExC_state->warn_text)
#define RExC_in_script_run (pRExC_state->in_script_run)
#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
+#define RExC_unlexed_names (pRExC_state->unlexed_names)
/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
* a flag to disable back-off on the fixed/floating substrings - if it's
if (DEPENDS_SEMANTICS) { \
set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
RExC_uni_semantics = 1; \
- if (RExC_seen_d_op && LIKELY(RExC_total_parens >= 0)) { \
+ if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
/* No need to restart the parse if we haven't seen \
* anything that differs between /u and /d, and no need \
* to restart immediately if we're going to reparse \
} \
} STMT_END
-#define BRANCH_MAX_OFFSET U16_MAX
#define REQUIRE_BRANCHJ(flagp, restart_retval) \
STMT_START { \
RExC_use_BRANCHJ = 1; \
- if (LIKELY(RExC_total_parens >= 0)) { \
- /* No need to restart the parse immediately if we're \
- * going to reparse anyway to count parens */ \
- *flagp |= RESTART_PARSE; \
- return restart_retval; \
- } \
+ *flagp |= RESTART_PARSE; \
+ return restart_retval; \
} STMT_END
+/* Until we have completed the parse, we leave RExC_total_parens at 0 or
+ * less. After that, it must always be positive, because the whole re is
+ * considered to be surrounded by virtual parens. Setting it to negative
+ * indicates there is some construct that needs to know the actual number of
+ * parens to be properly handled. And that means an extra pass will be
+ * required after we've counted them all */
+#define ALL_PARENS_COUNTED (RExC_total_parens > 0)
#define REQUIRE_PARENS_PASS \
- STMT_START { \
- if (RExC_total_parens == 0) RExC_total_parens = -1; \
+ STMT_START { /* No-op if have completed a pass */ \
+ if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
} STMT_END
+#define IN_PARENS_PASS (RExC_total_parens < 0)
+
/* This is used to return failure (zero) early from the calling function if
* various flags in 'flags' are set. Two flags always cause a return:
/* Used to point after bad bytes for an error message, but avoid skipping
* past a nul byte. */
-#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
+#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
/* Set up to clean up after our imminent demise */
#define PREPARE_TO_DIE \
Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
+#define FAIL3(msg,arg1,arg2) _FAIL( \
+ Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
+ arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
+
/*
* Simple_vFAIL -- like FAIL, but marks the current location in the scan
*/
} STMT_END
/* Setting this to NULL is a signal to not output warnings */
-#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE RExC_copy_start_in_constructed = NULL
-#define RESTORE_WARNINGS RExC_copy_start_in_constructed = RExC_precomp
+#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
+ STMT_START { \
+ RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
+ RExC_copy_start_in_constructed = NULL; \
+ } STMT_END
+#define RESTORE_WARNINGS \
+ RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
/* Since a warning can be generated multiple times as the input is reparsed, we
* output it the first time we come to that point in the parse, but suppress it
return TRUE;
}
+#define INVLIST_INDEX 0
+#define ONLY_LOCALE_MATCHES_INDEX 1
+#define DEFERRED_USER_DEFINED_INDEX 2
+
STATIC SV*
S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
const regnode_charclass* const node)
* returned list must, and will, contain every code point that is a
* possibility. */
+ dVAR;
SV* invlist = NULL;
SV* only_utf8_locale_invlist = NULL;
unsigned int i;
const U32 n = ARG(node);
bool new_node_has_latin1 = FALSE;
+ const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr))
+ ? 0
+ : ANYOF_FLAGS(node);
PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
SV **const ary = AvARRAY(av);
assert(RExC_rxi->data->what[n] == 's');
- if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
- invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]), NULL));
- }
- else if (ary[0] && ary[0] != &PL_sv_undef) {
+ if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
- /* Here, no compile-time swash, and there are things that won't be
- * known until runtime -- we have to assume it could be anything */
+ /* Here there are things that won't be known until runtime -- we
+ * have to assume it could be anything */
invlist = sv_2mortal(_new_invlist(1));
return _add_range_to_invlist(invlist, 0, UV_MAX);
}
- else if (ary[3] && ary[3] != &PL_sv_undef) {
+ else if (ary[INVLIST_INDEX]) {
- /* Here no compile-time swash, and no run-time only data. Use the
- * node's inversion list */
- invlist = sv_2mortal(invlist_clone(ary[3], NULL));
+ /* Use the node's inversion list */
+ invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
}
/* Get the code points valid only under UTF-8 locales */
- if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
- && ary[2] && ary[2] != &PL_sv_undef)
+ if ( (flags & ANYOFL_FOLD)
+ && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
{
- only_utf8_locale_invlist = ary[2];
+ only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
}
}
* actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
* have to do this here before we add the unconditionally matched code
* points */
- if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
+ if (flags & ANYOF_INVERT) {
_invlist_intersection_complement_2nd(invlist,
PL_UpperLatin1,
&invlist);
}
/* Add in the points from the bit map */
- if (OP(node) != ANYOFH) {
+ if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) {
for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
if (ANYOF_BITMAP_TEST(node, i)) {
unsigned int start = i++;
* as well. But don't add them if inverting, as when that gets done below,
* it would exclude all these characters, including the ones it shouldn't
* that were added just above */
- if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
- && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
+ if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
+ && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
{
_invlist_union(invlist, PL_UpperLatin1, &invlist);
}
/* Similarly for these */
- if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
+ if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
_invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
}
- if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
+ if (flags & ANYOF_INVERT) {
_invlist_invert(invlist);
}
- else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
+ else if (flags & ANYOFL_FOLD) {
+ if (new_node_has_latin1) {
+
+ /* Under /li, any 0-255 could fold to any other 0-255, depending on
+ * the locale. We can skip this if there are no 0-255 at all. */
+ _invlist_union(invlist, PL_Latin1, &invlist);
- /* Under /li, any 0-255 could fold to any other 0-255, depending on the
- * locale. We can skip this if there are no 0-255 at all. */
- _invlist_union(invlist, PL_Latin1, &invlist);
+ invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
+ invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ }
+ else {
+ if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
+ invlist = add_cp_to_invlist(invlist, 'I');
+ }
+ if (_invlist_contains_cp(invlist,
+ LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
+ {
+ invlist = add_cp_to_invlist(invlist, 'i');
+ }
+ }
}
/* Similarly add the UTF-8 locale possible matches. These have to be
if (only_utf8_locale_invlist) {
_invlist_union_maybe_complement_2nd(invlist,
only_utf8_locale_invlist,
- ANYOF_FLAGS(node) & ANYOF_INVERT,
+ flags & ANYOF_INVERT,
&invlist);
}
* another SSC or a regular ANYOF class. Can create false positives. */
SV* anded_cp_list;
+ U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr)
+ ? 0
+ : ANYOF_FLAGS(and_with);
U8 anded_flags;
PERL_ARGS_ASSERT_SSC_AND;
* the code point inversion list and just the relevant flags */
if (is_ANYOF_SYNTHETIC(and_with)) {
anded_cp_list = ((regnode_ssc *)and_with)->invlist;
- anded_flags = ANYOF_FLAGS(and_with);
+ anded_flags = and_with_flags;
/* XXX This is a kludge around what appears to be deficiencies in the
* optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
else {
anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
if (OP(and_with) == ANYOFD) {
- anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
+ anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
}
else {
- anded_flags = ANYOF_FLAGS(and_with)
+ anded_flags = and_with_flags
&( ANYOF_COMMON_FLAGS
|ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
|ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
- if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
+ if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
anded_flags &=
ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
}
* <= (C1 & ~C2) | (P1 & ~P2)
* */
- if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
+ if ((and_with_flags & ANYOF_INVERT)
&& ! is_ANYOF_SYNTHETIC(and_with))
{
unsigned int i;
/* If either P1 or P2 is empty, the intersection will be also; can skip
* the loop */
- if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
+ if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
ANYOF_POSIXL_ZERO(ssc);
}
else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
else {
ssc->invlist = anded_cp_list;
ANYOF_POSIXL_ZERO(ssc);
- if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
+ if (and_with_flags & ANYOF_MATCHES_POSIXL) {
ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
}
}
}
else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
- || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
+ || (and_with_flags & ANYOF_MATCHES_POSIXL))
{
/* One or the other of P1, P2 is non-empty. */
- if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
+ if (and_with_flags & ANYOF_MATCHES_POSIXL) {
ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
}
ssc_union(ssc, anded_cp_list, FALSE);
SV* ored_cp_list;
U8 ored_flags;
+ U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr)
+ ? 0
+ : ANYOF_FLAGS(or_with);
PERL_ARGS_ASSERT_SSC_OR;
* the code point inversion list and just the relevant flags */
if (is_ANYOF_SYNTHETIC(or_with)) {
ored_cp_list = ((regnode_ssc*) or_with)->invlist;
- ored_flags = ANYOF_FLAGS(or_with);
+ ored_flags = or_with_flags;
}
else {
ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
- ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
+ ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
if (OP(or_with) != ANYOFD) {
ored_flags
- |= ANYOF_FLAGS(or_with)
+ |= or_with_flags
& ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
|ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
- if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
+ if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
ored_flags |=
ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
}
* (which results in actually simpler code than the non-inverted case)
* */
- if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
+ if ((or_with_flags & ANYOF_INVERT)
&& ! is_ANYOF_SYNTHETIC(or_with))
{
/* We ignore P2, leaving P1 going forward */
} /* else Not inverted */
- else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
+ else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
unsigned int i;
U32 count = 0; /* Running total of number of code points matched by
'ssc' */
UV start, end; /* Start and end points of current range in inversion
- list */
+ XXX outdated. UTF-8 locales are common, what about invert? list */
const U32 max_code_points = (LOC)
? 256
: (( ! UNI_SEMANTICS
populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
- set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
- NULL, NULL, NULL, FALSE);
+ set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
/* Make sure is clone-safe */
ssc->invlist = NULL;
if (UTF) { \
SV *zlopp = newSV(UTF8_MAXBYTES); \
unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
- unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
+ unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
+ *kapow = '\0'; \
SvCUR_set(zlopp, kapow - flrbbbbb); \
SvPOK_on(zlopp); \
SvUTF8_on(zlopp); \
trie_words = newAV();
});
- re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
+ re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
assert(re_trie_maxbuff);
if (!SvIOK(re_trie_maxbuff)) {
sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
if ( state==1 ) {
OP( convert ) = nodetype;
str=STRING(convert);
- STR_LEN(convert)=0;
+ setSTR_LEN(convert, 0);
}
- STR_LEN(convert) += len;
+ setSTR_LEN(convert, STR_LEN(convert) + len);
while (len--)
*str++ = *ch++;
} else {
* using /iaa matching will be doing so almost entirely with ASCII
* strings, so this should rarely be encountered in practice */
-#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
- if (PL_regkind[OP(scan)] == EXACT) \
+#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
+ if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT \
+ && OP(scan) != LEXACT_ONLY8) \
join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
STATIC U32
merged++;
NEXT_OFF(scan) += NEXT_OFF(n);
- STR_LEN(scan) += STR_LEN(n);
+ setSTR_LEN(scan, STR_LEN(scan) + STR_LEN(n));
next = n + NODE_SZ_STR(n);
/* Now we can overwrite *n : */
Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
/* recursed: which subroutines have we recursed into */
/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
{
+ dVAR;
/* There must be at least this number of characters to match */
SSize_t min = 0;
I32 pars = 0, code;
}
}
else if ( OP(scan) == EXACT
+ || OP(scan) == LEXACT
|| OP(scan) == EXACT_ONLY8
+ || OP(scan) == LEXACT_ONLY8
|| OP(scan) == EXACTL)
{
SSize_t l = STR_LEN(scan);
if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
next = NEXTOPER(scan);
if ( OP(next) == EXACT
+ || OP(next) == LEXACT
|| OP(next) == EXACT_ONLY8
+ || OP(next) == LEXACT_ONLY8
|| OP(next) == EXACTL
|| (flags & SCF_DO_STCLASS))
{
STRLEN l;
const char * const s = SvPV_const(data->last_found, l);
SSize_t old = b - data->last_start_min;
+ assert(old >= 0);
if (UTF)
- old = utf8_hop((U8*)s, old) - (U8*)s;
+ old = utf8_hop_forward((U8*)s, old,
+ (U8 *) SvEND(data->last_found))
+ - (U8*)s;
l -= old;
/* Get the added string: */
last_str = newSVpvn_utf8(s + old, l, UTF);
case ANYOFL:
case ANYOFPOSIXL:
case ANYOFH:
+ case ANYOFHb:
+ case ANYOFHr:
case ANYOF:
if (flags & SCF_DO_STCLASS_AND)
ssc_and(pRExC_state, data->start_class,
last, &data_fake, stopparen,
recursed_depth, NULL, f, depth+1);
if (scan->flags) {
- if (deltanext) {
- FAIL("Variable length lookbehind not implemented");
- }
- else if (minnext > (I32)U8_MAX) {
+ if ( deltanext < 0
+ || deltanext > (I32) U8_MAX
+ || minnext > (I32)U8_MAX
+ || minnext + deltanext > (I32)U8_MAX)
+ {
FAIL2("Lookbehind longer than %" UVuf " not implemented",
(UV)U8_MAX);
}
- scan->flags = (U8)minnext;
+
+ /* The 'next_off' field has been repurposed to count the
+ * additional starting positions to try beyond the initial
+ * one. (This leaves it at 0 for non-variable length
+ * matches to avoid breakage for those not using this
+ * extension) */
+ if (deltanext) {
+ scan->next_off = deltanext;
+ ckWARNexperimental(RExC_parse,
+ WARN_EXPERIMENTAL__VLB,
+ "Variable length lookbehind is experimental");
+ }
+ scan->flags = (U8)minnext + deltanext;
}
if (data) {
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
stopparen, recursed_depth, NULL,
f, depth+1);
if (scan->flags) {
- if (deltanext) {
- FAIL("Variable length lookbehind not implemented");
- }
- else if (*minnextp > (I32)U8_MAX) {
+ assert(0); /* This code has never been tested since this
+ is normally not compiled */
+ if ( deltanext < 0
+ || deltanext > (I32) U8_MAX
+ || *minnextp > (I32)U8_MAX
+ || *minnextp + deltanext > (I32)U8_MAX)
+ {
FAIL2("Lookbehind longer than %" UVuf " not implemented",
(UV)U8_MAX);
}
- scan->flags = (U8)*minnextp;
+
+ if (deltanext) {
+ scan->next_off = deltanext;
+ }
+ scan->flags = (U8)*minnextp + deltanext;
}
*minnextp += min;
const char* name;
name = get_regex_charset_name(RExC_rx->extflags, &len);
- if strEQ(name, DEPENDS_PAT_MODS) { /* /d under UTF-8 => /u */
+ if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
assert(RExC_utf8);
name = UNICODE_PAT_MODS;
len = sizeof(UNICODE_PAT_MODS) - 1;
OP *expr, const regexp_engine* eng, REGEXP *old_re,
bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
{
+ dVAR;
REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
STRLEN plen;
char *exp;
}
pRExC_state->warn_text = NULL;
+ pRExC_state->unlexed_names = NULL;
pRExC_state->code_blocks = NULL;
if (is_bare_re)
&& memEQ(RX_PRECOMP(old_re), exp, plen)
&& !runtime_code /* with runtime code, always recompile */ )
{
+ DEBUG_COMPILE_r({
+ SV *dsv= sv_newmortal();
+ RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
+ Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n",
+ PL_colors[4], PL_colors[5], s);
+ });
return old_re;
}
RExC_seen = 0;
RExC_maxlen = 0;
RExC_in_lookbehind = 0;
+ RExC_in_lookahead = 0;
RExC_seen_zerolen = *exp == '^' ? -1 : 0;
-#ifdef EBCDIC
RExC_recode_x_to_native = 0;
-#endif
RExC_in_multi_char_class = 0;
RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
RExC_naughty = 0;
RExC_npar = 1;
+ RExC_parens_buf_size = 0;
RExC_emit_start = RExC_rxi->program;
pRExC_state->code_index = 0;
/* Do the parse */
if (reg(pRExC_state, 0, &flags, 1)) {
- /* Success!, But if RExC_total_parens < 0, we need to redo the parse
- * knowing how many parens there actually are */
- if (RExC_total_parens < 0) {
+ /* Success!, But we may need to redo the parse knowing how many parens
+ * there actually are */
+ if (IN_PARENS_PASS) {
flags |= RESTART_PARSE;
}
DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
}
- if (RExC_total_parens > 0) {
+ if (ALL_PARENS_COUNTED) {
/* Make enough room for all the known parens, and zero it */
Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
SetProgLen(RExC_rxi,RExC_size);
#endif
+ DEBUG_DUMP_PRE_OPTIMIZE_r({
+ SV * const sv = sv_newmortal();
+ RXi_GET_DECL(RExC_rx, ri);
+ DEBUG_RExC_seen();
+ Perl_re_printf( aTHX_ "Program before optimization:\n");
+
+ (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
+ sv, 0, 0);
+ });
+
DEBUG_OPTIMISE_r(
Perl_re_printf( aTHX_ "Starting post parse optimization\n");
);
/* Ignore EXACT as we deal with it later. */
if (PL_regkind[OP(first)] == EXACT) {
if ( OP(first) == EXACT
+ || OP(first) == LEXACT
|| OP(first) == EXACT_ONLY8
+ || OP(first) == LEXACT_ONLY8
|| OP(first) == EXACTL)
{
NOOP; /* Empty, get anchored substr later. */
&& nop == END)
RExC_rx->extflags |= RXf_WHITE;
else if ( RExC_rx->extflags & RXf_SPLIT
- && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
+ && ( fop == EXACT || fop == LEXACT
+ || fop == EXACT_ONLY8 || fop == LEXACT_ONLY8
+ || fop == EXACTL)
&& STR_LEN(first) == 1
&& *(STRING(first)) == ' '
&& nop == END )
/* It might be a forward reference; we can't fail until we
* know, by completing the parse to get all the groups, and
* then reparsing */
- if (RExC_total_parens > 0) {
+ if (ALL_PARENS_COUNTED) {
vFAIL("Reference to nonexistent named group");
}
else {
initial_size = 10;
}
- /* Allocate the initial space */
new_list = newSV_type(SVt_INVLIST);
-
initialize_invlist_guts(new_list, initial_size);
return new_list;
}
void
-Perl__invlist_populate_swatch(SV* const invlist,
- const UV start, const UV end, U8* swatch)
-{
- /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
- * but is used when the swash has an inversion list. This makes this much
- * faster, as it uses a binary search instead of a linear one. This is
- * intimately tied to that function, and perhaps should be in utf8.c,
- * except it is intimately tied to inversion lists as well. It assumes
- * that <swatch> is all 0's on input */
-
- UV current = start;
- const IV len = _invlist_len(invlist);
- IV i;
- const UV * array;
-
- PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
-
- if (len == 0) { /* Empty inversion list */
- return;
- }
-
- array = invlist_array(invlist);
-
- /* Find which element it is */
- i = _invlist_search(invlist, start);
-
- /* We populate from <start> to <end> */
- while (current < end) {
- UV upper;
-
- /* The inversion list gives the results for every possible code point
- * after the first one in the list. Only those ranges whose index is
- * even are ones that the inversion list matches. For the odd ones,
- * and if the initial code point is not in the list, we have to skip
- * forward to the next element */
- if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
- i++;
- if (i >= len) { /* Finished if beyond the end of the array */
- return;
- }
- current = array[i];
- if (current >= end) { /* Finished if beyond the end of what we
- are populating */
- if (LIKELY(end < UV_MAX)) {
- return;
- }
-
- /* We get here when the upper bound is the maximum
- * representable on the machine, and we are looking for just
- * that code point. Have to special case it */
- i = len;
- goto join_end_of_list;
- }
- }
- assert(current >= start);
-
- /* The current range ends one below the next one, except don't go past
- * <end> */
- i++;
- upper = (i < len && array[i] < end) ? array[i] : end;
-
- /* Here we are in a range that matches. Populate a bit in the 3-bit U8
- * for each code point in it */
- for (; current < upper; current++) {
- const STRLEN offset = (STRLEN)(current - start);
- swatch[offset >> 3] |= 1 << (offset & 7);
- }
-
- join_end_of_list:
-
- /* Quit if at the end of the list */
- if (i >= len) {
-
- /* But first, have to deal with the highest possible code point on
- * the platform. The previous code assumes that <end> is one
- * beyond where we want to populate, but that is impossible at the
- * platform's infinity, so have to handle it specially */
- if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
- {
- const STRLEN offset = (STRLEN)(end - start);
- swatch[offset >> 3] |= 1 << (offset & 7);
- }
- return;
- }
-
- /* Advance to the next range, which will be for code points not in the
- * inversion list */
- current = array[i];
- }
-
- return;
-}
-
-void
Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
const bool complement_b, SV** output)
{
SV*
Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
{
-
/* Return a new inversion list that is a copy of the input one, which is
* unchanged. The new list will not be mortal even if the old one was. */
- const STRLEN nominal_length = _invlist_len(invlist); /* Why not +1 XXX */
+ const STRLEN nominal_length = _invlist_len(invlist);
const STRLEN physical_length = SvCUR(invlist);
const bool offset = *(get_invlist_offset_addr(invlist));
PERL_ARGS_ASSERT_INVLIST_CLONE;
- /* Need to allocate extra space to accommodate Perl's addition of a
- * trailing NUL to SvPV's, since it thinks they are always strings */
if (new_invlist == NULL) {
new_invlist = _new_invlist(nominal_length);
}
STATIC SV*
S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
{
+ dVAR;
const U8 * s = (U8*)STRING(node);
SSize_t bytelen = STR_LEN(node);
UV uc;
}
else {
/* Any Latin1 range character can potentially match any
- * other depending on the locale */
+ * other depending on the locale, and in Turkic locales, U+130 and
+ * U+131 */
if (OP(node) == EXACTFL) {
_invlist_union(invlist, PL_Latin1, &invlist);
+ invlist = add_cp_to_invlist(invlist,
+ LATIN_SMALL_LETTER_DOTLESS_I);
+ invlist = add_cp_to_invlist(invlist,
+ LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
}
else {
/* But otherwise, it matches at least itself. We can
invlist = add_cp_to_invlist(invlist, c);
}
+
+ if (OP(node) == EXACTFL) {
+
+ /* If either [iI] are present in an EXACTFL node the above code
+ * should have added its normal case pair, but under a Turkish
+ * locale they could match instead the case pairs from it. Add
+ * those as potential matches as well */
+ if (isALPHA_FOLD_EQ(fc, 'I')) {
+ invlist = add_cp_to_invlist(invlist,
+ LATIN_SMALL_LETTER_DOTLESS_I);
+ invlist = add_cp_to_invlist(invlist,
+ LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ }
+ else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
+ invlist = add_cp_to_invlist(invlist, 'I');
+ }
+ else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+ invlist = add_cp_to_invlist(invlist, 'i');
+ }
+ }
}
}
return;
default:
fail_modifiers:
- RExC_parse += SKIP_IF_CHAR(RExC_parse);
+ RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
RExC_sawback = 1;
ret = reganode(pRExC_state,
((! FOLD)
- ? NREF
+ ? REFN
: (ASCII_FOLD_RESTRICTED)
- ? NREFFA
+ ? REFFAN
: (AT_LEAST_UNI_SEMANTICS)
- ? NREFFU
+ ? REFFUN
: (LOC)
- ? NREFFL
- : NREFF),
+ ? REFFLN
+ : REFFN),
num);
*flagp |= HASWIDTH;
I32 freeze_paren = 0;
I32 after_freeze = 0;
I32 num; /* numeric backreferences */
+ SV * max_open; /* Max number of unclosed parens */
char * parse_start = RExC_parse; /* MJD */
char * const oregcomp_parse = RExC_parse;
PERL_ARGS_ASSERT_REG;
DEBUG_PARSE("reg ");
+
+ max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
+ assert(max_open);
+ if (!SvIOK(max_open)) {
+ sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
+ }
+ if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
+ open paren */
+ vFAIL("Too many nested open parens");
+ }
+
*flagp = 0; /* Tentatively. */
+ if (RExC_in_lookbehind) {
+ RExC_in_lookbehind++;
+ }
+ if (RExC_in_lookahead) {
+ RExC_in_lookahead++;
+ }
+
/* Having this true makes it feasible to have a lot fewer tests for the
* parse pointer being in scope. For example, we can write
* while(isFOO(*RExC_parse)) RExC_parse++;
return 0;
}
- REGTAIL(pRExC_state, ret, atomic);
+ if (! REGTAIL(pRExC_state, ret, atomic)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
- REGTAIL(pRExC_state, atomic,
- reg_node(pRExC_state, SRCLOSE));
+ if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
+ SRCLOSE)))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
RExC_in_script_run = 0;
return ret;
} /* End of switch */
if ( ! op ) {
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse += UTF
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
if (has_upper || verb_len == 0) {
vFAIL2utf8f(
"Unknown verb pattern '%" UTF8f "'",
return handle_named_backref(pRExC_state, flagp,
parse_start, ')');
}
- RExC_parse += SKIP_IF_CHAR(RExC_parse);
+ RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL3("Sequence (%.*s...) not recognized",
RExC_parse-seqstart, seqstart);
if (RExC_parse >= RExC_end) {
vFAIL("Sequence (?... not terminated");
}
-
- /* FALLTHROUGH */
+ RExC_seen_zerolen++;
+ break;
case '=': /* (?=...) */
RExC_seen_zerolen++;
+ RExC_in_lookahead++;
break;
case '!': /* (?!...) */
RExC_seen_zerolen++;
goto gen_recurse_regop;
/* NOTREACHED */
case '+':
- if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
+ if (! inRANGE(RExC_parse[0], '1', '9')) {
RExC_parse++;
vFAIL("Illegal pattern");
}
goto parse_recursion;
/* NOTREACHED*/
case '-': /* (?-1) */
- if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
+ if (! inRANGE(RExC_parse[0], '1', '9')) {
RExC_parse--; /* rewind to let it be handled later */
goto parse_flags;
}
/* It might be a forward reference; we can't fail until
* we know, by completing the parse to get all the
* groups, and then reparsing */
- if (RExC_total_parens > 0) {
+ if (ALL_PARENS_COUNTED) {
RExC_parse++;
vFAIL("Reference to nonexistent group");
}
/* It might be a forward reference; we can't fail until we
* know, by completing the parse to get all the groups, and
* then reparsing */
- if (RExC_total_parens > 0) {
+ if (ALL_PARENS_COUNTED) {
if (num >= RExC_total_parens) {
RExC_parse++;
vFAIL("Reference to nonexistent group");
case '?': /* (??...) */
is_logical = 1;
if (*RExC_parse != '{') {
- RExC_parse += SKIP_IF_CHAR(RExC_parse);
+ RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL2utf8f(
"Sequence (%" UTF8f "...) not recognized",
RExC_flags & RXf_PMf_COMPILETIME
);
FLAGS(REGNODE_p(ret)) = 2;
- REGTAIL(pRExC_state, ret, eval);
+ if (! REGTAIL(pRExC_state, ret, eval)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
/* deal with the length of this later - MJD */
return ret;
}
tail = reg(pRExC_state, 1, &flag, depth+1);
RETURN_FAIL_ON_RESTART(flag, flagp);
- REGTAIL(pRExC_state, ret, tail);
+ if (! REGTAIL(pRExC_state, ret, tail)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
goto insert_if;
}
else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
RExC_rxi->data->data[num]=(void*)sv_dat;
SvREFCNT_inc_simple_void_NN(sv_dat);
}
- ret = reganode(pRExC_state, NGROUPP, num);
+ ret = reganode(pRExC_state, GROUPPN, num);
goto insert_if_check_paren;
}
else if (memBEGINs(RExC_parse,
parno = 1;
RExC_parse++;
}
- else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
+ else if (inRANGE(RExC_parse[0], '1', '9')) {
UV uv;
endptr = RExC_end;
if (grok_atoUV(RExC_parse, &uv, &endptr)
ret = reganode(pRExC_state, INSUBP, parno);
goto insert_if_check_paren;
}
- else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
+ else if (inRANGE(RExC_parse[0], '1', '9')) {
/* (?(1)...) */
char c;
UV uv;
insert_if_check_paren:
if (UCHARAT(RExC_parse) != ')') {
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse += UTF
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
vFAIL("Switch condition not recognized");
}
nextchar(pRExC_state);
insert_if:
- REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
+ if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
+ IFTHEN, 0)))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
br = regbranch(pRExC_state, &flags, 1, depth+1);
if (br == 0) {
RETURN_FAIL_ON_RESTART(flags,flagp);
FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
(UV) flags);
} else
- REGTAIL(pRExC_state, br, reganode(pRExC_state,
- LONGJMP, 0));
+ if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
+ LONGJMP, 0)))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
c = UCHARAT(RExC_parse);
nextchar(pRExC_state);
if (flags&HASWIDTH)
FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
(UV) flags);
}
- REGTAIL(pRExC_state, ret, lastbr);
+ if (! REGTAIL(pRExC_state, ret, lastbr)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
if (flags&HASWIDTH)
*flagp |= HASWIDTH;
c = UCHARAT(RExC_parse);
vFAIL("Switch (?(condition)... contains too many branches");
}
ender = reg_node(pRExC_state, TAIL);
- REGTAIL(pRExC_state, br, ender);
+ if (! REGTAIL(pRExC_state, br, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
if (lastbr) {
- REGTAIL(pRExC_state, lastbr, ender);
- REGTAIL(pRExC_state, REGNODE_OFFSET(
- NEXTOPER(
- NEXTOPER(REGNODE_p(lastbr)))),
- ender);
+ if (! REGTAIL(pRExC_state, lastbr, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
+ if (! REGTAIL(pRExC_state,
+ REGNODE_OFFSET(
+ NEXTOPER(
+ NEXTOPER(REGNODE_p(lastbr)))),
+ ender))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
}
else
- REGTAIL(pRExC_state, ret, ender);
+ if (! REGTAIL(pRExC_state, ret, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
#if 0 /* Removing this doesn't cause failures in the test suite -- khw */
RExC_size++; /* XXX WHY do we need this?!!
For large programs it seems to be required
#endif
return ret;
}
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse += UTF
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
vFAIL("Unknown switch condition (?(...))");
}
case '[': /* (?[ ... ]) */
RExC_parse--; /* for vFAIL to print correctly */
vFAIL("Sequence (? incomplete");
break;
+
+ case ')':
+ if (RExC_strict) { /* [perl #132851] */
+ ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
+ }
+ /* FALLTHROUGH */
default: /* e.g., (?i) */
RExC_parse = (char *) seqstart + 1;
parse_flags:
capturing_parens:
parno = RExC_npar;
RExC_npar++;
- if (RExC_total_parens <= 0) {
+ if (! ALL_PARENS_COUNTED) {
/* If we are in our first pass through (and maybe only pass),
* we need to allocate memory for the capturing parentheses
- * data structures. Since we start at npar=1, when it reaches
- * 2, for the first time it has something to put in it. Above
- * 2 means we extend what we already have */
- if (RExC_npar == 2) {
+ * data structures.
+ */
+
+ if (!RExC_parens_buf_size) {
+ /* first guess at number of parens we might encounter */
+ RExC_parens_buf_size = 10;
+
/* setup RExC_open_parens, which holds the address of each
* OPEN tag, and to make things simpler for the 0 index the
* start of the program - this is used later for offsets */
- Newxz(RExC_open_parens, RExC_npar, regnode_offset);
+ Newxz(RExC_open_parens, RExC_parens_buf_size,
+ regnode_offset);
RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
/* setup RExC_close_parens, which holds the address of each
* CLOSE tag, and to make things simpler for the 0 index
* the end of the program - this is used later for offsets
* */
- Newxz(RExC_close_parens, RExC_npar, regnode_offset);
+ Newxz(RExC_close_parens, RExC_parens_buf_size,
+ regnode_offset);
/* we dont know where end op starts yet, so we dont need to
* set RExC_close_parens[0] like we do RExC_open_parens[0]
* above */
}
- else {
- Renew(RExC_open_parens, RExC_npar, regnode_offset);
- Zero(RExC_open_parens + RExC_npar - 1, 1, regnode_offset);
+ else if (RExC_npar > RExC_parens_buf_size) {
+ I32 old_size = RExC_parens_buf_size;
+
+ RExC_parens_buf_size *= 2;
+
+ Renew(RExC_open_parens, RExC_parens_buf_size,
+ regnode_offset);
+ Zero(RExC_open_parens + old_size,
+ RExC_parens_buf_size - old_size, regnode_offset);
- Renew(RExC_close_parens, RExC_npar, regnode_offset);
- Zero(RExC_close_parens + RExC_npar - 1, 1, regnode_offset);
+ Renew(RExC_close_parens, RExC_parens_buf_size,
+ regnode_offset);
+ Zero(RExC_close_parens + old_size,
+ RExC_parens_buf_size - old_size, regnode_offset);
}
}
*flagp |= flags&SIMPLE;
}
if (is_open) { /* Starts with OPEN. */
- REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
+ if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
}
else if (paren != '?') /* Not Conditional */
ret = br;
lastbr = br;
while (*RExC_parse == '|') {
if (RExC_use_BRANCHJ) {
+ bool shut_gcc_up;
+
ender = reganode(pRExC_state, LONGJMP, 0);
/* Append to the previous. */
- REGTAIL(pRExC_state,
- REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
- ender);
+ shut_gcc_up = REGTAIL(pRExC_state,
+ REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
+ ender);
+ PERL_UNUSED_VAR(shut_gcc_up);
}
nextchar(pRExC_state);
if (freeze_paren) {
RETURN_FAIL_ON_RESTART(flags, flagp);
FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
}
- REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
+ if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
lastbr = br;
*flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
}
(IV)(ender - lastbr)
);
);
- REGTAIL(pRExC_state, lastbr, ender);
+ if (! REGTAIL(pRExC_state, lastbr, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
if (have_branch) {
char is_nothing= 1;
for (br = REGNODE_p(ret); br; br = regnext(br)) {
const U8 op = PL_regkind[OP(br)];
if (op == BRANCH) {
- REGTAIL_STUDY(pRExC_state,
- REGNODE_OFFSET(NEXTOPER(br)),
- ender);
+ if (! REGTAIL_STUDY(pRExC_state,
+ REGNODE_OFFSET(NEXTOPER(br)),
+ ender))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
if ( OP(NEXTOPER(br)) != NOTHING
|| regnext(NEXTOPER(br)) != REGNODE_p(ender))
is_nothing= 0;
}
else if (op == BRANCHJ) {
- REGTAIL_STUDY(pRExC_state,
- REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
- ender);
+ bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
+ REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
+ ender);
+ PERL_UNUSED_VAR(shut_gcc_up);
/* for now we always disable this optimisation * /
if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
|| regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
FLAGS(REGNODE_p(ret)) = flag;
- REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
+ if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
}
}
if (RExC_in_lookbehind) {
RExC_in_lookbehind--;
}
+ if (RExC_in_lookahead) {
+ RExC_in_lookahead--;
+ }
if (after_freeze > RExC_npar)
RExC_npar = after_freeze;
return(ret);
/* FIXME adding one for every branch after the first is probably
* excessive now we have TRIE support. (hv) */
MARK_NAUGHTY(1);
- if ( chain > (SSize_t) BRANCH_MAX_OFFSET
- && ! RExC_use_BRANCHJ)
- {
+ if (! REGTAIL(pRExC_state, chain, latest)) {
/* XXX We could just redo this branch, but figuring out what
- * bookkeeping needs to be reset is a pain */
+ * bookkeeping needs to be reset is a pain, and it's likely
+ * that other branches that goto END will also be too large */
REQUIRE_BRANCHJ(flagp, 0);
}
- REGTAIL(pRExC_state, chain, latest);
}
chain = latest;
c++;
const regnode_offset w = reg_node(pRExC_state, WHILEM);
FLAGS(REGNODE_p(w)) = 0;
- REGTAIL(pRExC_state, ret, w);
+ if (! REGTAIL(pRExC_state, ret, w)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
if (RExC_use_BRANCHJ) {
reginsert(pRExC_state, LONGJMP, ret, depth+1);
reginsert(pRExC_state, NOTHING, ret, depth+1);
if (RExC_use_BRANCHJ)
NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
LONGJMP. */
- REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
+ if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
+ NOTHING)))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
RExC_whilem_seen++;
MARK_NAUGHTY_EXP(1, 4); /* compound interest */
}
if (*RExC_parse == '?') {
nextchar(pRExC_state);
reginsert(pRExC_state, MINMOD, ret, depth+1);
- REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
+ if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
}
else if (*RExC_parse == '+') {
regnode_offset ender;
nextchar(pRExC_state);
ender = reg_node(pRExC_state, SUCCEED);
- REGTAIL(pRExC_state, ret, ender);
+ if (! REGTAIL(pRExC_state, ret, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
reginsert(pRExC_state, SUSPEND, ret, depth+1);
ender = reg_node(pRExC_state, TAIL);
- REGTAIL(pRExC_state, ret, ender);
+ if (! REGTAIL(pRExC_state, ret, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
}
if (ISMULT2(RExC_parse)) {
* points) that this \N sequence matches. This is set, and the input is
* parsed for errors, even if the function returns FALSE, as detailed below.
*
- * There are 5 possibilities here, as detailed in the next 5 paragraphs.
+ * There are 6 possibilities here, as detailed in the next 6 paragraphs.
*
* Probably the most common case is for the \N to specify a single code point.
* *cp_count will be set to 1, and *code_point_p will be set to that code
* point.
*
- * Another possibility is for the input to be an empty \N{}, which for
- * backwards compatibility we accept. *cp_count will be set to 0. *node_p
- * will be set to a generated NOTHING node.
+ * Another possibility is for the input to be an empty \N{}. This is no
+ * longer accepted, and will generate a fatal error.
+ *
+ * Another possibility is for a custom charnames handler to be in effect which
+ * translates the input name to an empty string. *cp_count will be set to 0.
+ * *node_p will be set to a generated NOTHING node.
*
* Still another possibility is for the \N to mean [^\n]. *cp_count will be
* set to 0. *node_p will be set to a generated REG_ANY node.
*
- * The fourth possibility is that \N resolves to a sequence of more than one
+ * The fifth possibility is that \N resolves to a sequence of more than one
* code points. *cp_count will be set to the number of code points in the
* sequence. *node_p will be set to a generated node returned by this
* function calling S_reg().
* The final possibility is that it is premature to be calling this function;
* the parse needs to be restarted. This can happen when this changes from
* /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
- * latter occurs only when the fourth possibility would otherwise be in
+ * latter occurs only when the fifth possibility would otherwise be in
* effect, and is because one of those code points requires the pattern to be
* recompiled as UTF-8. The function returns FALSE, and sets the
* RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
* so we need a way to take a snapshot of what they resolve to at the time of
* the original parse. [perl #56444].
*
- * That parsing is skipped for single-quoted regexes, so we may here get
- * '\N{NAME}'. This is a fatal error. These names have to be resolved by the
- * parser. But if the single-quoted regex is something like '\N{U+41}', that
- * is legal and handled here. The code point is Unicode, and has to be
- * translated into the native character set for non-ASCII platforms.
- */
+ * That parsing is skipped for single-quoted regexes, so here we may get
+ * '\N{NAME}', which is parsed now. If the single-quoted regex is something
+ * like '\N{U+41}', that code point is Unicode, and has to be translated into
+ * the native character set for non-ASCII platforms. The other possibilities
+ * are already native, so no translation is done. */
char * endbrace; /* points to '}' following the name */
char* p = RExC_parse; /* Temporary */
char *orig_end;
char *save_start;
I32 flags;
- Size_t count = 0; /* code point count kept internally by this function */
GET_RE_DEBUG_FLAGS_DECL;
/* Disambiguate between \N meaning a named character versus \N meaning
* [^\n]. The latter is assumed when the {...} following the \N is a legal
- * quantifier, or there is no '{' at all */
+ * quantifier, or if there is no '{' at all */
if (*p != '{' || regcurly(p)) {
RExC_parse = p;
if (cp_count) {
vFAIL2("Missing right brace on \\%c{}", 'N');
}
- /* Here, we have decided it should be a named character or sequence */
- REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
- semantics */
+ /* Here, we have decided it should be a named character or sequence. These
+ * imply Unicode semantics */
+ REQUIRE_UNI_RULES(flagp, FALSE);
- if (endbrace == RExC_parse) { /* empty: \N{} */
+ /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
+ * nothing at all (not allowed under strict) */
+ if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
+ RExC_parse = endbrace;
if (strict) {
RExC_parse++; /* Position after the "}" */
vFAIL("Zero length \\N{}");
}
+
if (cp_count) {
*cp_count = 0;
}
return TRUE;
}
- /* If we haven't got something that begins with 'U+', then it didn't get lexed. */
- if ( endbrace - RExC_parse < 2
- || strnNE(RExC_parse, "U+", 2))
- {
- RExC_parse = endbrace; /* position msg's '<--HERE' */
- vFAIL("\\N{NAME} must be resolved by the lexer");
- }
+ if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
+
+ /* Here, the name isn't of the form U+.... This can happen if the
+ * pattern is single-quoted, so didn't get evaluated in toke.c. Now
+ * is the time to find out what the name means */
+
+ const STRLEN name_len = endbrace - RExC_parse;
+ SV * value_sv; /* What does this name evaluate to */
+ SV ** value_svp;
+ const U8 * value; /* string of name's value */
+ STRLEN value_len; /* and its length */
+
+ /* RExC_unlexed_names is a hash of names that weren't evaluated by
+ * toke.c, and their values. Make sure is initialized */
+ if (! RExC_unlexed_names) {
+ RExC_unlexed_names = newHV();
+ }
+
+ /* If we have already seen this name in this pattern, use that. This
+ * allows us to only call the charnames handler once per name per
+ * pattern. A broken or malicious handler could return something
+ * different each time, which could cause the results to vary depending
+ * on if something gets added or subtracted from the pattern that
+ * causes the number of passes to change, for example */
+ if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
+ name_len, 0)))
+ {
+ value_sv = *value_svp;
+ }
+ else { /* Otherwise we have to go out and get the name */
+ const char * error_msg = NULL;
+ value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
+ UTF,
+ &error_msg);
+ if (error_msg) {
+ RExC_parse = endbrace;
+ vFAIL(error_msg);
+ }
+
+ /* If no error message, should have gotten a valid return */
+ assert (value_sv);
+
+ /* Save the name's meaning for later use */
+ if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
+ value_sv, 0))
+ {
+ Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
+ }
+ }
+
+ /* Here, we have the value the name evaluates to in 'value_sv' */
+ value = (U8 *) SvPV(value_sv, value_len);
+
+ /* See if the result is one code point vs 0 or multiple */
+ if (value_len > 0 && value_len <= (UV) ((SvUTF8(value_sv))
+ ? UTF8SKIP(value)
+ : 1))
+ {
+ /* Here, exactly one code point. If that isn't what is wanted,
+ * fail */
+ if (! code_point_p) {
+ RExC_parse = p;
+ return FALSE;
+ }
+
+ /* Convert from string to numeric code point */
+ *code_point_p = (SvUTF8(value_sv))
+ ? valid_utf8_to_uvchr(value, NULL)
+ : *value;
- /* This code purposely indented below because of future changes coming */
+ /* Have parsed this entire single code point \N{...}. *cp_count
+ * has already been set to 1, so don't do it again. */
+ RExC_parse = endbrace;
+ nextchar(pRExC_state);
+ return TRUE;
+ } /* End of is a single code point */
+
+ /* Count the code points, if caller desires. The API says to do this
+ * even if we will later return FALSE */
+ if (cp_count) {
+ *cp_count = 0;
+
+ *cp_count = (SvUTF8(value_sv))
+ ? utf8_length(value, value + value_len)
+ : value_len;
+ }
+
+ /* Fail if caller doesn't want to handle a multi-code-point sequence.
+ * But don't back the pointer up if the caller wants to know how many
+ * code points there are (they need to handle it themselves in this
+ * case). */
+ if (! node_p) {
+ if (! cp_count) {
+ RExC_parse = p;
+ }
+ return FALSE;
+ }
+
+ /* Convert this to a sub-pattern of the form "(?: ... )", and then call
+ * reg recursively to parse it. That way, it retains its atomicness,
+ * while not having to worry about any special handling that some code
+ * points may have. */
+
+ substitute_parse = newSVpvs("?:");
+ sv_catsv(substitute_parse, value_sv);
+ sv_catpv(substitute_parse, ")");
+
+ /* The value should already be native, so no need to convert on EBCDIC
+ * platforms.*/
+ assert(! RExC_recode_x_to_native);
+
+ }
+ else { /* \N{U+...} */
+ Size_t count = 0; /* code point count kept internally */
/* We can get to here when the input is \N{U+...} or when toke.c has
* converted a name to the \N{U+...} form. This include changing a
sv_catpvs(substitute_parse, ")");
-#ifdef EBCDIC
/* The values are Unicode, and therefore have to be converted to native
* on a non-Unicode (meaning non-ASCII) platform. */
- RExC_recode_x_to_native = 1;
-#endif
+ SET_recode_x_to_native(1);
+ }
/* Here, we have the string the name evaluates to, ready to be parsed,
* stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
RExC_start = save_start;
RExC_parse = endbrace;
RExC_end = orig_end;
-#ifdef EBCDIC
- RExC_recode_x_to_native = 0;
-#endif
+ SET_recode_x_to_native(0);
SvREFCNT_dec_NN(substitute_parse);
STATIC regnode_offset
S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
+ dVAR;
regnode_offset ret = 0;
I32 flags = 0;
char *parse_start;
U8 op;
int invert = 0;
- U8 arg;
GET_RE_DEBUG_FLAGS_DECL;
*flagp |= SIMPLE;
goto finish_meta_pat;
case 'K':
- RExC_seen_zerolen++;
- ret = reg_node(pRExC_state, KEEPS);
- *flagp |= SIMPLE;
- /* XXX:dmq : disabling in-place substitution seems to
- * be necessary here to avoid cases of memory corruption, as
- * with: C<$_="x" x 80; s/x\K/y/> -- rgs
- */
- RExC_seen |= REG_LOOKBEHIND_SEEN;
- goto finish_meta_pat;
+ if (!RExC_in_lookbehind && !RExC_in_lookahead) {
+ RExC_seen_zerolen++;
+ ret = reg_node(pRExC_state, KEEPS);
+ *flagp |= SIMPLE;
+ /* XXX:dmq : disabling in-place substitution seems to
+ * be necessary here to avoid cases of memory corruption, as
+ * with: C<$_="x" x 80; s/x\K/y/> -- rgs
+ */
+ RExC_seen |= REG_LOOKBEHIND_SEEN;
+ goto finish_meta_pat;
+ }
+ else {
+ ++RExC_parse; /* advance past the 'K' */
+ vFAIL("\\K not permitted in lookahead/lookbehind");
+ }
case 'Z':
ret = reg_node(pRExC_state, SEOL);
*flagp |= SIMPLE;
*flagp |= HASWIDTH;
goto finish_meta_pat;
- case 'W':
- invert = 1;
- /* FALLTHROUGH */
- case 'w':
- arg = ANYOF_WORDCHAR;
- goto join_posix;
-
case 'B':
invert = 1;
/* FALLTHROUGH */
char name = *RExC_parse;
char * endbrace = NULL;
RExC_parse += 2;
- if (RExC_parse < RExC_end) {
- endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
- }
+ endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
if (! endbrace) {
vFAIL2("Missing right brace on \\%c{}", name);
goto finish_meta_pat;
}
- case 'D':
- invert = 1;
- /* FALLTHROUGH */
- case 'd':
- arg = ANYOF_DIGIT;
- if (! DEPENDS_SEMANTICS) {
- goto join_posix;
- }
-
- /* \d doesn't have any matches in the upper Latin1 range, hence /d
- * is equivalent to /u. Changing to /u saves some branches at
- * runtime */
- op = POSIXU;
- goto join_posix_op_known;
-
case 'R':
ret = reg_node(pRExC_state, LNBREAK);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
- case 'H':
- invert = 1;
- /* FALLTHROUGH */
+ case 'd':
+ case 'D':
case 'h':
- arg = ANYOF_BLANK;
- op = POSIXU;
- goto join_posix_op_known;
-
- case 'V':
- invert = 1;
- /* FALLTHROUGH */
- case 'v':
- arg = ANYOF_VERTWS;
- op = POSIXU;
- goto join_posix_op_known;
-
- case 'S':
- invert = 1;
- /* FALLTHROUGH */
+ case 'H':
+ case 'p':
+ case 'P':
case 's':
- arg = ANYOF_SPACE;
-
- join_posix:
-
- op = POSIXD + get_regex_charset(RExC_flags);
- if (op > POSIXA) { /* /aa is same as /a */
- op = POSIXA;
- }
- else if (op == POSIXL) {
- RExC_contains_locale = 1;
- }
- else if (op == POSIXD) {
- RExC_seen_d_op = TRUE;
- }
-
- join_posix_op_known:
-
- if (invert) {
- op += NPOSIXD - POSIXD;
- }
-
- ret = reg_node(pRExC_state, op);
- FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
-
- *flagp |= HASWIDTH|SIMPLE;
- /* FALLTHROUGH */
-
- finish_meta_pat:
- if ( UCHARAT(RExC_parse + 1) == '{'
- && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
- {
- RExC_parse += 2;
- vFAIL("Unescaped left brace in regex is illegal here");
- }
- nextchar(pRExC_state);
- Set_Node_Length(REGNODE_p(ret), 2); /* MJD */
- break;
- case 'p':
- case 'P':
- RExC_parse--;
+ case 'S':
+ case 'v':
+ case 'V':
+ case 'w':
+ case 'W':
+ /* These all have the same meaning inside [brackets], and it knows
+ * how to do the best optimizations for them. So, pretend we found
+ * these within brackets, and let it do the work */
+ RExC_parse--;
ret = regclass(pRExC_state, flagp, depth+1,
TRUE, /* means just parse this element */
FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
(UV) *flagp);
- RExC_parse--;
+ RExC_parse--; /* regclass() leaves this one too far ahead */
+ finish_meta_pat:
+ /* The escapes above that don't take a parameter can't be
+ * followed by a '{'. But 'pX', 'p{foo}' and
+ * correspondingly 'P' can be */
+ if ( RExC_parse - parse_start == 1
+ && UCHARAT(RExC_parse + 1) == '{'
+ && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
+ {
+ RExC_parse += 2;
+ vFAIL("Unescaped left brace in regex is illegal here");
+ }
Set_Node_Offset(REGNODE_p(ret), parse_start);
- Set_Node_Cur_Length(REGNODE_p(ret), parse_start - 2);
+ Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
nextchar(pRExC_state);
break;
case 'N':
&& num >= RExC_npar
/* cannot be an octal escape if it starts with 8 */
&& *RExC_parse != '8'
- /* cannot be an octal escape it it starts with 9 */
+ /* cannot be an octal escape if it starts with 9 */
&& *RExC_parse != '9'
) {
/* Probably not meant to be a backref, instead likely
/* It might be a forward reference; we can't fail until we
* know, by completing the parse to get all the groups, and
* then reparsing */
- if (RExC_total_parens > 0) {
+ if (ALL_PARENS_COUNTED) {
if (num >= RExC_total_parens) {
vFAIL("Reference to nonexistent group");
}
UV ender = 0;
char *p;
char *s;
-
-/* This allows us to fill a node with just enough spare so that if the final
- * character folds, its expansion is guaranteed to fit */
-#define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
-
char *s0;
- U8 upper_parse = MAX_NODE_STRING_SIZE;
+ U32 max_string_len = 255;
+
+ /* We may have to reparse the node, artificially stopping filling
+ * it early, based on info gleaned in the first parse. This
+ * variable gives where we stop. Make it above the normal stopping
+ * place first time through. */
+ U32 upper_fill = max_string_len + 1;
/* We start out as an EXACT node, even if under /i, until we find a
* character which is in a fold. The algorithm now segregates into
/* Assume the node will be fully used; the excess is given back at
* the end. We can't make any other length assumptions, as a byte
* input sequence could shrink down. */
- Ptrdiff_t initial_size = STR_SZ(256);
+ Ptrdiff_t current_string_nodes = STR_SZ(max_string_len);
bool next_is_quantifier;
char * oldp = NULL;
/* So is the MICRO SIGN */
bool has_micro_sign = FALSE;
+ /* Set when we fill up the current node and there is still more
+ * text to process */
+ bool overflowed;
+
/* Allocate an EXACT node. The node_type may change below to
* another EXACTish node, but since the size of the node doesn't
* change, it works */
- ret = regnode_guts(pRExC_state, node_type, initial_size, "exact");
+ ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
+ "exact");
FILL_NODE(ret, node_type);
RExC_emit++;
reparse:
+ p = RExC_parse;
+ len = 0;
+ s = s0;
+
+ continue_parse:
+
/* This breaks under rare circumstances. If folding, we do not
* want to split a node at a character that is a non-final in a
* multi-char fold, as an input string could just happen to want to
|| UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
|| UTF8_IS_START(UCHARAT(RExC_parse)));
+ overflowed = FALSE;
/* Here, we have a literal character. Find the maximal string of
* them in the input that we can fit into a single EXACTish node.
* We quit at the first non-literal or when the node gets full, or
* under /i the categorization of folding/non-folding character
* changes */
- for (p = RExC_parse; len < upper_parse && p < RExC_end; ) {
+ while (p < RExC_end && len < upper_fill) {
/* In most cases each iteration adds one byte to the output.
* The exceptions override this */
UPDATE_WARNINGS_LOC(p - 1);
ender = result;
- if (ender < 0x100) {
#ifdef EBCDIC
+ if (ender < 0x100) {
if (RExC_recode_x_to_native) {
ender = LATIN1_TO_NATIVE(ender);
}
-#endif
}
+#endif
break;
}
case 'c':
/* Ready to add 'ender' to the node */
if (! FOLD) { /* The simple case, just append the literal */
+ not_fold_common:
- not_fold_common:
- if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
- *(s++) = (char) ender;
- }
- else {
- U8 * new_s = uvchr_to_utf8((U8*)s, ender);
- added_len = (char *) new_s - s;
- s = (char *) new_s;
+ /* Don't output if it would overflow */
+ if (UNLIKELY(len > max_string_len - ((UTF)
+ ? UVCHR_SKIP(ender)
+ : 1)))
+ {
+ overflowed = TRUE;
+ break;
+ }
- if (ender > 255) {
- requires_utf8_target = TRUE;
- }
+ if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
+ *(s++) = (char) ender;
+ }
+ else {
+ U8 * new_s = uvchr_to_utf8((U8*)s, ender);
+ added_len = (char *) new_s - s;
+ s = (char *) new_s;
+
+ if (ender > 255) {
+ requires_utf8_target = TRUE;
}
+ }
}
else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
if (UTF) { /* Use the folded value */
if (UVCHR_IS_INVARIANT(ender)) {
+ if (UNLIKELY(len + 1 > max_string_len)) {
+ overflowed = TRUE;
+ break;
+ }
+
*(s)++ = (U8) toFOLD(ender);
}
else {
- ender = _to_uni_fold_flags(
+ U8 temp[UTF8_MAXBYTES_CASE+1];
+
+ UV folded = _to_uni_fold_flags(
ender,
- (U8 *) s,
+ temp,
&added_len,
FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
? FOLD_FLAGS_NOMIX_ASCII
: 0));
+ if (UNLIKELY(len + added_len > max_string_len)) {
+ overflowed = TRUE;
+ break;
+ }
+
+ Copy(temp, s, added_len, char);
s += added_len;
- if ( ender > 255
- && LIKELY(ender != GREEK_SMALL_LETTER_MU))
+ if ( folded > 255
+ && LIKELY(folded != GREEK_SMALL_LETTER_MU))
{
/* U+B5 folds to the MU, so its possible for a
* non-UTF-8 target to match it */
if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
maybe_SIMPLE = 0;
if (node_type == EXACTFU) {
+
+ if (UNLIKELY(len + 2 > max_string_len)) {
+ overflowed = TRUE;
+ break;
+ }
+
*(s++) = 's';
- /* Let the code below add in the extra 's' */
+ /* Let the code below add in the extra 's'
+ * */
ender = 's';
added_len = 2;
}
has_micro_sign = TRUE;
}
- *(s++) = (char) (DEPENDS_SEMANTICS)
- ? toFOLD(ender)
-
- /* Under /u, the fold of any
- * character in the 0-255 range
- * happens to be its lowercase
- * equivalent, except for LATIN SMALL
- * LETTER SHARP S, which was handled
- * above, and the MICRO SIGN, whose
- * fold requires UTF-8 to represent.
- * */
- : toLOWER_L1(ender);
+ if (UNLIKELY(len + 1 > max_string_len)) {
+ overflowed = TRUE;
+ break;
+ }
+
+ *(s++) = (DEPENDS_SEMANTICS)
+ ? (char) toFOLD(ender)
+
+ /* Under /u, the fold of any character in
+ * the 0-255 range happens to be its
+ * lowercase equivalent, except for LATIN
+ * SMALL LETTER SHARP S, which was handled
+ * above, and the MICRO SIGN, whose fold
+ * requires UTF-8 to represent. */
+ : (char) toLOWER_L1(ender);
}
} /* End of adding current character to the node */
} /* End of loop through literal characters */
- /* Here we have either exhausted the input or ran out of room in
- * the node. (If we encountered a character that can't be in the
- * node, transfer is made directly to <loopdone>, and so we
- * wouldn't have fallen off the end of the loop.) In the latter
- * case, we artificially have to split the node into two, because
- * we just don't have enough space to hold everything. This
- * creates a problem if the final character participates in a
- * multi-character fold in the non-final position, as a match that
- * should have occurred won't, due to the way nodes are matched,
- * and our artificial boundary. So back off until we find a non-
- * problematic character -- one that isn't at the beginning or
- * middle of such a fold. (Either it doesn't participate in any
- * folds, or appears only in the final position of all the folds it
- * does participate in.) A better solution with far fewer false
- * positives, and that would fill the nodes more completely, would
- * be to actually have available all the multi-character folds to
- * test against, and to back-off only far enough to be sure that
- * this node isn't ending with a partial one. <upper_parse> is set
- * further below (if we need to reparse the node) to include just
- * up through that final non-problematic character that this code
- * identifies, so when it is set to less than the full node, we can
- * skip the rest of this */
- if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
- PERL_UINT_FAST8_T backup_count = 0;
-
- const STRLEN full_len = len;
-
- assert(len >= MAX_NODE_STRING_SIZE);
-
- /* Here, <s> points to just beyond where we have output the
- * final character of the node. Look backwards through the
- * string until find a non- problematic character */
-
- if (! UTF) {
-
- /* This has no multi-char folds to non-UTF characters */
- if (ASCII_FOLD_RESTRICTED) {
- goto loopdone;
- }
+ /* Here we have either exhausted the input or run out of room in
+ * the node. If the former, we are done. (If we encountered a
+ * character that can't be in the node, transfer is made directly
+ * to <loopdone>, and so we wouldn't have fallen off the end of the
+ * loop.) */
+ if (LIKELY(! overflowed)) {
+ goto loopdone;
+ }
- while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) {
- backup_count++;
- }
- len = s - s0 + 1;
- }
- else {
+ /* Here we have run out of room. We can grow plain EXACT and
+ * LEXACT nodes. If the pattern is gigantic enough, though,
+ * eventually we'll have to artificially chunk the pattern into
+ * multiple nodes. */
+ if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
+ Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
+ Size_t overhead_expansion = 0;
+ char temp[256];
+ Size_t max_nodes_for_string;
+ Size_t achievable;
+ SSize_t delta;
+
+ /* Here we couldn't fit the final character in the current
+ * node, so it will have to be reparsed, no matter what else we
+ * do */
+ p = oldp;
+
+
+ /* If would have overflowed a regular EXACT node, switch
+ * instead to an LEXACT. The code below is structured so that
+ * the actual growing code is common to changing from an EXACT
+ * or just increasing the LEXACT size. This means that we have
+ * to save the string in the EXACT case before growing, and
+ * then copy it afterwards to its new location */
+ if (node_type == EXACT) {
+ overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
+ RExC_emit += overhead_expansion;
+ Copy(s0, temp, len, char);
+ }
+
+ /* Ready to grow. If it was a plain EXACT, the string was
+ * saved, and the first few bytes of it overwritten by adding
+ * an argument field. We assume, as we do elsewhere in this
+ * file, that one byte of remaining input will translate into
+ * one byte of output, and if that's too small, we grow again,
+ * if too large the excess memory is freed at the end */
+
+ max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
+ achievable = MIN(max_nodes_for_string,
+ current_string_nodes + STR_SZ(RExC_end - p));
+ delta = achievable - current_string_nodes;
+
+ /* If there is just no more room, go finish up this chunk of
+ * the pattern. */
+ if (delta <= 0) {
+ goto loopdone;
+ }
- /* Point to the first byte of the final character */
- s = (char *) utf8_hop((U8 *) s, -1);
+ change_engine_size(pRExC_state, delta + overhead_expansion);
+ current_string_nodes += delta;
+ max_string_len
+ = sizeof(struct regnode) * current_string_nodes;
+ upper_fill = max_string_len + 1;
- while (s >= s0) { /* Search backwards until find
- a non-problematic char */
- if (UTF8_IS_INVARIANT(*s)) {
+ /* If the length was small, we know this was originally an
+ * EXACT node now converted to LEXACT, and the string has to be
+ * restored. Otherwise the string was untouched. 260 is just
+ * a number safely above 255 so don't have to worry about
+ * getting it precise */
+ if (len < 260) {
+ node_type = LEXACT;
+ FILL_NODE(ret, node_type);
+ s0 = STRING(REGNODE_p(ret));
+ Copy(temp, s0, len, char);
+ s = s0 + len;
+ }
- /* There are no ascii characters that participate
- * in multi-char folds under /aa. In EBCDIC, the
- * non-ascii invariants are all control characters,
- * so don't ever participate in any folds. */
- if (ASCII_FOLD_RESTRICTED
- || ! IS_NON_FINAL_FOLD(*s))
- {
- break;
- }
- }
- else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
- if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
- *s, *(s+1))))
- {
- break;
- }
+ goto continue_parse;
+ }
+ else {
+
+ /* Here is /i. Running out of room creates a problem if we are
+ * folding, and the split happens in the middle of a
+ * multi-character fold, as a match that should have occurred,
+ * won't, due to the way nodes are matched, and our artificial
+ * boundary. So back off until we aren't splitting such a
+ * fold. If there is no such place to back off to, we end up
+ * taking the entire node as-is. This can happen if the node
+ * consists entirely of 'f' or entirely of 's' characters (or
+ * things that fold to them) as 'ff' and 'ss' are
+ * multi-character folds.
+ *
+ * At this point:
+ * oldp points to the beginning in the input of the
+ * final character in the node.
+ * p points to the beginning in the input of the
+ * next character in the input, the one that won't
+ * fit in the node.
+ *
+ * We aren't in the middle of a multi-char fold unless the
+ * final character in the node can appear in a non-final
+ * position in such a fold. Very few characters actually
+ * participate in multi-character folds, and fewer still can be
+ * in the non-final position. But it's complicated to know
+ * here if that final character is folded or not, so skip this
+ * check */
+
+ /* Make sure enough space for final char of node,
+ * first char of following node, and the fold of the
+ * following char (so we don't have to worry about
+ * that fold running off the end */
+ U8 foldbuf[UTF8_MAXBYTES_CASE * 5 + 1];
+ STRLEN fold_len;
+ UV folded;
+
+ assert(FOLD);
+
+ /* The Unicode standard says that multi character folds consist
+ * of either two or three characters. So we create a buffer
+ * containing a window of three. The first is the final
+ * character in the node (folded), and then the two that begin
+ * the following node. But if the first character of the
+ * following node can't be in a non-final fold position, there
+ * is no need to look at its successor character. The macros
+ * used below to check for multi character folds require folded
+ * inputs, so we have to fold these. (The fold of p was likely
+ * calculated in the loop above, but it hasn't beeen saved, and
+ * khw thinks it would be too entangled to change to do so) */
+
+ if (UTF || LIKELY(UCHARAT(p) != MICRO_SIGN)) {
+ folded = _to_uni_fold_flags(ender,
+ foldbuf,
+ &fold_len,
+ FOLD_FLAGS_FULL);
+ }
+ else {
+ foldbuf[0] = folded = MICRO_SIGN;
+ fold_len = 1;
+ }
+
+ /* Here, foldbuf contains the fold of the first character in
+ * the next node. We may also need the next one (if there is
+ * one) to get our third, but if the first character folded to
+ * more than one, those extra one(s) will serve as the third.
+ * Also, we don't need a third unless the previous one can
+ * appear in a non-final position in a fold */
+ if ( ((RExC_end - p) > ((UTF) ? UVCHR_SKIP(ender) : 1))
+ && (fold_len == 1 || ( UTF
+ && UVCHR_SKIP(folded) == fold_len))
+ && UNLIKELY(_invlist_contains_cp(PL_NonFinalFold, folded)))
+ {
+ if (UTF) {
+ STRLEN next_fold_len;
+
+ toFOLD_utf8_safe((U8*) p + UTF8SKIP(p),
+ (U8*) RExC_end, foldbuf + fold_len,
+ &next_fold_len);
+ fold_len += next_fold_len;
+ }
+ else {
+ if (UNLIKELY(p[1] == LATIN_SMALL_LETTER_SHARP_S)) {
+ foldbuf[fold_len] = 's';
}
- else if (! _invlist_contains_cp(
- PL_NonFinalFold,
- valid_utf8_to_uvchr((U8 *) s, NULL)))
- {
- break;
+ else {
+ foldbuf[fold_len] = toLOWER_L1(p[1]);
}
+ fold_len++;
+ }
+ }
- /* Here, the current character is problematic in that
- * it does occur in the non-final position of some
- * fold, so try the character before it, but have to
- * special case the very first byte in the string, so
- * we don't read outside the string */
- s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
- backup_count++;
- } /* End of loop backwards through the string */
-
- /* If there were only problematic characters in the string,
- * <s> will point to before s0, in which case the length
- * should be 0, otherwise include the length of the
- * non-problematic character just found */
- len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
- }
+ /* Here foldbuf contains the the fold of p, and if appropriate
+ * that of the character following p in the input. */
- /* Here, have found the final character, if any, that is
- * non-problematic as far as ending the node without splitting
- * it across a potential multi-char fold. <len> contains the
- * number of bytes in the node up-to and including that
- * character, or is 0 if there is no such character, meaning
- * the whole node contains only problematic characters. In
- * this case, give up and just take the node as-is. We can't
- * do any better */
- if (len == 0) {
- len = full_len;
+ /* Search backwards until find a place that doesn't split a
+ * multi-char fold */
+ while (1) {
+ STRLEN s_len;
+ char s_fold_buf[UTF8_MAXBYTES_CASE];
+ char * s_fold = s_fold_buf;
- } else {
+ if (s <= s0) {
- /* Here, the node does contain some characters that aren't
- * problematic. If we didn't have to backup any, then the
- * final character in the node is non-problematic, and we
- * can take the node as-is */
- if (backup_count == 0) {
- goto loopdone;
+ /* There's no safe place in the node to split. Quit so
+ * will take the whole node */
+ break;
}
- else if (backup_count == 1) {
- /* If the final character is problematic, but the
- * penultimate is not, back-off that last character to
- * later start a new node with it */
- p = oldp;
- goto loopdone;
+ /* Backup 1 character. The first time through this moves s
+ * to point to the final character in the node */
+ if (UTF) {
+ s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+ }
+ else {
+ s--;
+ }
+
+ /* 's' may or may not be folded; so make sure it is, and
+ * use just the final character in its fold (should there
+ * be more than one */
+ if (UTF) {
+ toFOLD_utf8_safe((U8*) s,
+ (U8*) s + UTF8SKIP(s),
+ (U8 *) s_fold_buf, &s_len);
+ while (s_fold + UTF8SKIP(s_fold) < s_fold_buf + s_len)
+ {
+ s_fold += UTF8SKIP(s_fold);
+ }
+ s_len = UTF8SKIP(s_fold);
+ }
+ else {
+ if (UNLIKELY(UCHARAT(s) == LATIN_SMALL_LETTER_SHARP_S))
+ {
+ s_fold_buf[0] = 's';
+ }
+ else { /* This works for all other non-UTF-8 folds
+ */
+ s_fold_buf[0] = toLOWER_L1(UCHARAT(s));
+ }
+ s_len = 1;
}
- /* Here, the final non-problematic character is earlier
- * in the input than the penultimate character. What we do
- * is reparse from the beginning, going up only as far as
- * this final ok one, thus guaranteeing that the node ends
- * in an acceptable character. The reason we reparse is
- * that we know how far in the character is, but we don't
- * know how to correlate its position with the input parse.
- * An alternate implementation would be to build that
- * correlation as we go along during the original parse,
- * but that would entail extra work for every node, whereas
- * this code gets executed only when the string is too
- * large for the node, and the final two characters are
- * problematic, an infrequent occurrence. Yet another
- * possible strategy would be to save the tail of the
- * string, and the next time regatom is called, initialize
- * with that. The problem with this is that unless you
- * back off one more character, you won't be guaranteed
- * regatom will get called again, unless regbranch,
- * regpiece ... are also changed. If you do back off that
- * extra character, so that there is input guaranteed to
- * force calling regatom, you can't handle the case where
- * just the first character in the node is acceptable. I
- * (khw) decided to try this method which doesn't have that
- * pitfall; if performance issues are found, we can do a
- * combination of the current approach plus that one */
- upper_parse = len;
- len = 0;
- s = s0;
- goto reparse;
+ /* Unshift this character to the beginning of the buffer,
+ * No longer needed trailing characters are overwritten.
+ * */
+ Move(foldbuf, foldbuf + s_len, sizeof(foldbuf) - s_len, U8);
+ Copy(s_fold, foldbuf, s_len, U8);
+
+ /* If this isn't a multi-character fold, we have found a
+ * splittable place. If this is the final character in the
+ * node, that means the node is valid as-is, and can quit.
+ * Otherwise, we note how much we can fill the node before
+ * coming to a non-splittable position, and go parse it
+ * again, stopping there. This is done because we know
+ * where in the output to stop, but we don't have a map to
+ * where that is in the input. One could be created, but
+ * it seems like overkill for such a rare event as we are
+ * dealing with here */
+ if (UTF) {
+ if (! is_MULTI_CHAR_FOLD_utf8_safe(foldbuf,
+ foldbuf + UTF8_MAXBYTES_CASE))
+ {
+ upper_fill = s + UTF8SKIP(s) - s0;
+ if (LIKELY(upper_fill == 255)) {
+ break;
+ }
+ goto reparse;
+ }
+ }
+ else if (! is_MULTI_CHAR_FOLD_latin1_safe(foldbuf,
+ foldbuf + UTF8_MAXBYTES_CASE))
+ {
+ upper_fill = s + 1 - s0;
+ if (LIKELY(upper_fill == 255)) {
+ break;
+ }
+ goto reparse;
+ }
}
+
+ /* Here the node consists entirely of non-final multi-char
+ * folds. (Likely it is all 'f's or all 's's.) There's no
+ * decent place to split it, so give up and just take the whole
+ * thing */
+
} /* End of verifying node ends with an appropriate char */
+ p = oldp;
+
loopdone: /* Jumped to when encounters something that shouldn't be
in the node */
- /* Free up any over-allocated space */
- change_engine_size(pRExC_state, - (initial_size - STR_SZ(len)));
+ /* Free up any over-allocated space; cast is to silence bogus
+ * warning in MS VC */
+ change_engine_size(pRExC_state,
+ - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
/* I (khw) don't know if you can get here with zero length, but the
* old code handled this situation by creating a zero-length EXACT
else if (requires_utf8_target) {
node_type = EXACT_ONLY8;
}
- } else if (FOLD) {
+ }
+ else if (node_type == LEXACT) {
+ if (requires_utf8_target) {
+ node_type = LEXACT_ONLY8;
+ }
+ }
+ else if (FOLD) {
if ( UNLIKELY(has_micro_sign || has_ss)
&& (node_type == EXACTFU || ( node_type == EXACTF
&& maybe_exactfu)))
}
else if (node_type == EXACTF) { /* Means is /di */
+ /* This intermediate variable is needed solely because
+ * the asserts in the macro where used exceed Win32's
+ * literal string capacity */
+ char first_char = * STRING(REGNODE_p(ret));
+
/* If 'maybe_exactfu' is clear, then we need to stay
* /di. If it is set, it means there are no code
* points that match differently depending on UTF8ness
if (! maybe_exactfu) {
RExC_seen_d_op = TRUE;
}
- else if ( isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's')
+ else if ( isALPHA_FOLD_EQ(first_char, 's')
|| isALPHA_FOLD_EQ(ender, 's'))
{
/* But, if the node begins or ends in an 's' we
}
OP(REGNODE_p(ret)) = node_type;
- STR_LEN(REGNODE_p(ret)) = len;
+ setSTR_LEN(REGNODE_p(ret), len);
RExC_emit += STR_SZ(len);
/* If the node isn't a single character, it can't be SIMPLE */
- if (len > ((UTF) ? UVCHR_SKIP(ender) : 1)) {
+ if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
maybe_SIMPLE = 0;
}
* sets up the bitmap and any flags, removing those code points from the
* inversion list, setting it to NULL should it become completely empty */
+ dVAR;
+
PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
assert(PL_regkind[OP(node)] == ANYOF);
/* There is no bitmap for this node type */
- if (OP(node) == ANYOFH) {
+ if (inRANGE(OP(node), ANYOFH, ANYOFHr)) {
return;
}
RExC_parse = RExC_end;
}
else if (RExC_parse != save_parse) {
- RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse += (UTF)
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
}
vFAIL("Expecting '(?flags:(?[...'");
}
FALSE, /* Require return to be an ANYOF */
¤t))
{
- FAIL2("panic: regclass returned failure to handle_sets, "
- "flags=%#" UVxf, (UV) *flagp);
+ goto regclass_failed;
}
/* regclass() will return with parsing just the \ sequence,
FALSE, /* Require return to be an ANYOF */
¤t))
{
- FAIL2("panic: regclass returned failure to handle_sets, "
- "flags=%#" UVxf, (UV) *flagp);
+ goto regclass_failed;
}
if (! current) {
}
if (!node)
- FAIL2("panic: regclass returned failure to handle_sets, flags=%#" UVxf,
- PTR2UV(flagp));
+ goto regclass_failed;
/* Fix up the node type if we are in locale. (We have pretended we are
* under /u for the purposes of regclass(), as this construct will only
nextchar(pRExC_state);
Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
return node;
+
+ regclass_failed:
+ FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
+ (UV) *flagp);
}
#ifdef ENABLE_REGEX_SETS_DEBUGGING
S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
const bool stop_at_1, /* Just parse the next thing, don't
look for a full character class */
- bool allow_multi_folds,
+ bool allow_mutiple_chars,
const bool silence_non_portable, /* Don't output warnings
about too large
characters */
*
* ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
* characters, with the corresponding bit set if that character is in the
- * list. For characters above this, a range list or swash is used. There
+ * list. For characters above this, an inversion list is used. There
* are extra bits for \w, etc. in locale ANYOFs, as what these match is not
* determinable at compile time
*
* UTF-8
*/
+ dVAR;
UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
IV range = 0;
UV value = OOB_UNICODE, save_value = OOB_UNICODE;
- regnode_offset ret;
+ regnode_offset ret = -1; /* Initialized to an illegal value */
STRLEN numlen;
int namedclass = OOB_NAMEDCLASS;
char *rangebegin = NULL;
- SV *listsv = NULL;
+ SV *listsv = NULL; /* List of \p{user-defined} whose definitions
+ aren't available at the time this was called */
STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
than just initialized. */
SV* properties = NULL; /* Code points that match \p{} \P{} */
const bool skip_white = cBOOL( ret_invlist
|| (RExC_flags & RXf_PMf_EXTENDED_MORE));
- /* Unicode properties are stored in a swash; this holds the current one
- * being parsed. If this swash is the only above-latin1 component of the
- * character class, an optimization is to pass it directly on to the
- * execution engine. Otherwise, it is set to NULL to indicate that there
- * are other things in the class that have to be dealt with at execution
- * time */
- SV* swash = NULL; /* Code points that match \p{} \P{} */
-
/* inversion list of code points this node matches only when the target
* string is in UTF-8. These are all non-ASCII, < 256. (Because is under
* /d) */
#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
|| (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
&& UNICODE_DOT_DOT_VERSION == 0)
- allow_multi_folds = FALSE;
+ allow_mutiple_chars = FALSE;
#endif
- listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
+ /* We include the /i status at the beginning of this so that we can
+ * know it at runtime */
+ listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
initial_listsv_len = SvCUR(listsv);
SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
RExC_parse++;
invert = TRUE;
- allow_multi_folds = FALSE;
+ allow_mutiple_chars = FALSE;
MARK_NAUGHTY(1);
SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
}
"Ignoring zero length \\N{} in character class");
}
else { /* cp_count > 1 */
+ assert(cp_count > 1);
if (! RExC_in_multi_char_class) {
- if (invert || range || *RExC_parse == '-') {
+ if ( ! allow_mutiple_chars
+ || invert
+ || range
+ || *RExC_parse == '-')
+ {
if (strict) {
RExC_parse--;
- vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
+ vFAIL("\\N{} here is restricted to one character");
}
ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
break; /* <value> contains the first code
case 'P':
{
char *e;
- char *i;
-
- /* We will handle any undefined properties ourselves */
- U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
- /* And we actually would prefer to get
- * the straight inversion list of the
- * swash, since we will be accessing it
- * anyway, to save a little time */
- |_CORE_SWASH_INIT_ACCEPT_INVLIST;
-
- SvREFCNT_dec(swash); /* Free any left-overs */
/* \p means they want Unicode semantics */
REQUIRE_UNI_RULES(flagp, 0);
} /* The \p isn't immediately followed by a '{' */
else if (! isALPHA(*RExC_parse)) {
- RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse += (UTF)
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
vFAIL2("Character following \\%c must be '{' or a "
"single-character Unicode property name",
(U8) value);
}
{
char* name = RExC_parse;
- char* base_name; /* name after any packages are stripped */
- char* lookup_name = NULL;
- const char * const colon_colon = "::";
- bool invert;
-
- SV* invlist;
-
- /* Temporary workaround for [perl #133136]. For this
- * precise input that is in the .t that is failing, load
- * utf8.pm, which is what the test wants, so that that
- * .t passes */
- if ( memEQs(RExC_start, e + 1 - RExC_start,
- "foo\\p{Alnum}")
- && ! hv_common(GvHVn(PL_incgv),
- NULL,
- "utf8.pm", sizeof("utf8.pm") - 1,
- 0, HV_FETCH_ISEXISTS, NULL, 0))
- {
- require_pv("utf8.pm");
- }
- invlist = parse_uniprop_string(name, n, FOLD, &invert);
- if (invlist) {
- if (invert) {
- value ^= 'P' ^ 'p';
- }
- }
- else {
- /* Try to get the definition of the property into
- * <invlist>. If /i is in effect, the effective property
- * will have its name be <__NAME_i>. The design is
- * discussed in commit
- * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
- name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
- SAVEFREEPV(name);
-
- for (i = RExC_parse; i < RExC_parse + n; i++) {
- if (isCNTRL(*i) && *i != '\t') {
- RExC_parse = e + 1;
- vFAIL2("Can't find Unicode property definition \"%s\"", name);
+ /* Any message returned about expanding the definition */
+ SV* msg = newSVpvs_flags("", SVs_TEMP);
+
+ /* If set TRUE, the property is user-defined as opposed to
+ * official Unicode */
+ bool user_defined = FALSE;
+
+ SV * prop_definition = parse_uniprop_string(
+ name, n, UTF, FOLD,
+ FALSE, /* This is compile-time */
+
+ /* We can't defer this defn when
+ * the full result is required in
+ * this call */
+ ! cBOOL(ret_invlist),
+
+ &user_defined,
+ msg,
+ 0 /* Base level */
+ );
+ if (SvCUR(msg)) { /* Assumes any error causes a msg */
+ assert(prop_definition == NULL);
+ RExC_parse = e + 1;
+ if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
+ thing so, or else the display is
+ mojibake */
+ RExC_utf8 = TRUE;
}
+ /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
+ vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
+ SvCUR(msg), SvPVX(msg)));
}
- if (FOLD) {
- lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
+ if (! is_invlist(prop_definition)) {
- /* The function call just below that uses this can fail
- * to return, leaking memory if we don't do this */
- SAVEFREEPV(lookup_name);
- }
-
- /* Look up the property name, and get its swash and
- * inversion list, if the property is found */
- swash = _core_swash_init("utf8",
- (lookup_name)
- ? lookup_name
- : name,
- &PL_sv_undef,
- 1, /* binary */
- 0, /* not tr/// */
- NULL, /* No inversion list */
- &swash_init_flags
- );
- if (! swash || ! (invlist = _get_swash_invlist(swash))) {
- HV* curpkg = (IN_PERL_COMPILETIME)
- ? PL_curstash
- : CopSTASH(PL_curcop);
- UV final_n = n;
- bool has_pkg;
-
- if (swash) { /* Got a swash but no inversion list.
- Something is likely wrong that will
- be sorted-out later */
- SvREFCNT_dec_NN(swash);
- swash = NULL;
- }
-
- /* Here didn't find it. It could be a an error (like a
- * typo) in specifying a Unicode property, or it could
- * be a user-defined property that will be available at
- * run-time. The names of these must begin with 'In'
- * or 'Is' (after any packages are stripped off). So
- * if not one of those, or if we accept only
- * compile-time properties, is an error; otherwise add
- * it to the list for run-time look up. */
- if ((base_name = rninstr(name, name + n,
- colon_colon, colon_colon + 2)))
- { /* Has ::. We know this must be a user-defined
- property */
- base_name += 2;
- final_n -= base_name - name;
- has_pkg = TRUE;
+ /* Here, the definition isn't known, so we have gotten
+ * returned a string that will be evaluated if and when
+ * encountered at runtime. We add it to the list of
+ * such properties, along with whether it should be
+ * complemented or not */
+ if (value == 'P') {
+ sv_catpvs(listsv, "!");
}
else {
- base_name = name;
- has_pkg = FALSE;
- }
-
- if ( final_n < 3
- || base_name[0] != 'I'
- || (base_name[1] != 's' && base_name[1] != 'n')
- || ret_invlist)
- {
- const char * const msg
- = (has_pkg)
- ? "Illegal user-defined property name"
- : "Can't find Unicode property definition";
- RExC_parse = e + 1;
-
- /* diag_listed_as: Can't find Unicode property definition "%s" */
- vFAIL3utf8f("%s \"%" UTF8f "\"",
- msg, UTF8fARG(UTF, n, name));
+ sv_catpvs(listsv, "+");
}
+ sv_catsv(listsv, prop_definition);
- /* If the property name doesn't already have a package
- * name, add the current one to it so that it can be
- * referred to outside it. [perl #121777] */
- if (! has_pkg && curpkg) {
- char* pkgname = HvNAME(curpkg);
- if (memNEs(pkgname, HvNAMELEN(curpkg), "main")) {
- char* full_name = Perl_form(aTHX_
- "%s::%s",
- pkgname,
- name);
- n = strlen(full_name);
- name = savepvn(full_name, n);
- SAVEFREEPV(name);
- }
- }
- Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%" UTF8f "%s\n",
- (value == 'p' ? '+' : '!'),
- (FOLD) ? "__" : "",
- UTF8fARG(UTF, n, name),
- (FOLD) ? "_i" : "");
has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
/* We don't know yet what this matches, so have to flag
anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
}
else {
+ assert (prop_definition && is_invlist(prop_definition));
- /* Here, did get the swash and its inversion list. If
- * the swash is from a user-defined property, then this
- * whole character class should be regarded as such */
- if (swash_init_flags
- & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
+ /* Here we do have the complete property definition
+ *
+ * Temporary workaround for [perl #133136]. For this
+ * precise input that is in the .t that is failing,
+ * load utf8.pm, which is what the test wants, so that
+ * that .t passes */
+ if ( memEQs(RExC_start, e + 1 - RExC_start,
+ "foo\\p{Alnum}")
+ && ! hv_common(GvHVn(PL_incgv),
+ NULL,
+ "utf8.pm", sizeof("utf8.pm") - 1,
+ 0, HV_FETCH_ISEXISTS, NULL, 0))
{
- has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
+ require_pv("utf8.pm");
}
- }
- }
- if (invlist) {
- if (! (has_runtime_dependency
- & HAS_USER_DEFINED_PROPERTY) &&
+
+ if (! user_defined &&
/* We warn on matching an above-Unicode code point
* if the match would return true, except don't
* warn for \p{All}, which has exactly one element
* = 0 */
- (_invlist_contains_cp(invlist, 0x110000)
- && (! (_invlist_len(invlist) == 1
- && *invlist_array(invlist) == 0))))
+ (_invlist_contains_cp(prop_definition, 0x110000)
+ && (! (_invlist_len(prop_definition) == 1
+ && *invlist_array(prop_definition) == 0))))
{
warn_super = TRUE;
}
/* Invert if asking for the complement */
if (value == 'P') {
_invlist_union_complement_2nd(properties,
- invlist,
+ prop_definition,
&properties);
-
- /* The swash can't be used as-is, because we've
- * inverted things; delay removing it to here after
- * have copied its invlist above */
- if (! swash) {
- SvREFCNT_dec_NN(invlist);
- }
- SvREFCNT_dec(swash);
- swash = NULL;
}
else {
- _invlist_union(properties, invlist, &properties);
- if (! swash) {
- SvREFCNT_dec_NN(invlist);
- }
+ _invlist_union(properties, prop_definition, &properties);
}
}
}
RExC_parse += numlen;
if (numlen != 3) {
if (strict) {
- RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse += (UTF)
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
vFAIL("Need exactly 3 octal digits");
}
else if ( numlen < 3 /* like \08, \178 */
) {
SV* scratch_list = NULL;
- /* What the Posix classes (like \w, [:space:]) match in locale
- * isn't knowable under locale until actual match time. A
+ /* What the Posix classes (like \w, [:space:]) match isn't
+ * generally knowable under locale until actual match time. A
* special node is used for these which has extra space for a
* bitmap, with a bit reserved for each named class that is to
- * be matched against. This isn't needed for \p{} and
+ * be matched against. (This isn't needed for \p{} and
* pseudo-classes, as they are not affected by locale, and
- * hence are dealt with separately */
- POSIXL_SET(posixl, namedclass);
- has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
- anyof_flags |= ANYOF_MATCHES_POSIXL;
-
- /* The above-Latin1 characters are not subject to locale rules.
- * Just add them to the unconditionally-matched list */
-
- /* Get the list of the above-Latin1 code points this matches */
- _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
- PL_XPosix_ptrs[classnum],
-
- /* Odd numbers are complements, like
- * NDIGIT, NASCII, ... */
- namedclass % 2 != 0,
- &scratch_list);
- /* Checking if 'cp_list' is NULL first saves an extra clone.
- * Its reference count will be decremented at the next union,
- * etc, or if this is the only instance, at the end of the
- * routine */
- if (! cp_list) {
- cp_list = scratch_list;
- }
- else {
- _invlist_union(cp_list, scratch_list, &cp_list);
- SvREFCNT_dec_NN(scratch_list);
+ * hence are dealt with separately.) However, if a named class
+ * and its complement are both present, then it matches
+ * everything, and there is no runtime dependency. Odd numbers
+ * are the complements of the next lower number, so xor works.
+ * (Note that something like [\w\D] should match everything,
+ * because \d should be a proper subset of \w. But rather than
+ * trust that the locale is well behaved, we leave this to
+ * runtime to sort out) */
+ if (POSIXL_TEST(posixl, namedclass ^ 1)) {
+ cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
+ POSIXL_ZERO(posixl);
+ has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
+ anyof_flags &= ~ANYOF_MATCHES_POSIXL;
+ continue; /* We could ignore the rest of the class, but
+ best to parse it for any errors */
+ }
+ else { /* Here, isn't the complement of any already parsed
+ class */
+ POSIXL_SET(posixl, namedclass);
+ has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
+ anyof_flags |= ANYOF_MATCHES_POSIXL;
+
+ /* The above-Latin1 characters are not subject to locale
+ * rules. Just add them to the unconditionally-matched
+ * list */
+
+ /* Get the list of the above-Latin1 code points this
+ * matches */
+ _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
+ PL_XPosix_ptrs[classnum],
+
+ /* Odd numbers are complements,
+ * like NDIGIT, NASCII, ... */
+ namedclass % 2 != 0,
+ &scratch_list);
+ /* Checking if 'cp_list' is NULL first saves an extra
+ * clone. Its reference count will be decremented at the
+ * next union, etc, or if this is the only instance, at the
+ * end of the routine */
+ if (! cp_list) {
+ cp_list = scratch_list;
+ }
+ else {
+ _invlist_union(cp_list, scratch_list, &cp_list);
+ SvREFCNT_dec_NN(scratch_list);
+ }
+ continue; /* Go get next character */
}
- continue; /* Go get next character */
}
else {
* "ss" =~ /^[^\xDF]+$/i => N
*
* See [perl #89750] */
- if (FOLD && allow_multi_folds && value == prevvalue) {
+ if (FOLD && allow_mutiple_chars && value == prevvalue) {
if ( value == LATIN_SMALL_LETTER_SHARP_S
|| (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
value)))
literal
);
}
- else if isMNEMONIC_CNTRL(value) {
+ else if (isMNEMONIC_CNTRL(value)) {
vWARN4(RExC_parse,
"\"%.*s\" is more clearly written simply as \"%s\"",
(int) (RExC_parse - rangebegin),
/* And combine the result (if any) with any inversion lists from posix
* classes. The lists are kept separate up to now because we don't want to
- * fold the classes (folding of those is automatically handled by the swash
- * fetching code) */
+ * fold the classes */
if (simple_posixes) { /* These are the classes known to be unaffected by
/a, /aa, and /d */
if (cp_list) {
* folded until runtime */
/* If we didn't do folding, it's because some information isn't available
- * until runtime; set the run-time fold flag for these. (We don't have to
- * worry about properties folding, as that is taken care of by the swash
- * fetching). We know to set the flag if we have a non-NULL list for UTF-8
- * locales, or the class matches at least one 0-255 range code point */
+ * until runtime; set the run-time fold flag for these We know to set the
+ * flag if we have a non-NULL list for UTF-8 locales, or the class matches
+ * at least one 0-255 range code point */
if (LOC && FOLD) {
/* Some things on the list might be unconditionally included because of
only_utf8_locale_list = NULL;
}
}
- if (only_utf8_locale_list) {
+ if ( only_utf8_locale_list
+ || (cp_list && ( _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
+ || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
+ {
has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
anyof_flags
|= ANYOFL_FOLD
{
_invlist_invert(cp_list);
- /* Any swash can't be used as-is, because we've inverted things */
- if (swash) {
- SvREFCNT_dec_NN(swash);
- swash = NULL;
- }
-
- invert = FALSE;
+ /* Clear the invert flag since have just done it here */
+ invert = FALSE;
}
if (ret_invlist) {
*ret_invlist = cp_list;
- SvREFCNT_dec(swash);
return RExC_emit;
}
invlist_iterinit(cp_list);
for (i = 0; i <= MAX_FOLD_FROMS; i++) {
- if (invlist_iternext(cp_list, &start[i], &end[i])) {
- partial_cp_count += end[i] - start[i] + 1;
+ if (! invlist_iternext(cp_list, &start[i], &end[i])) {
+ break;
}
+ partial_cp_count += end[i] - start[i] + 1;
}
invlist_iterfinish(cp_list);
* the only element in the character class (perluniprops.pod notes
* such properties). */
if (partial_cp_count == 0) {
- assert (! invert);
- ret = reganode(pRExC_state, OPFAIL, 0);
+ if (invert) {
+ ret = reg_node(pRExC_state, SANY);
+ }
+ else {
+ ret = reganode(pRExC_state, OPFAIL, 0);
+ }
+
goto not_anyof;
}
* inversion list, making sure everything is included. */
fold_list = add_cp_to_invlist(fold_list, start[0]);
fold_list = add_cp_to_invlist(fold_list, folded);
- fold_list = add_cp_to_invlist(fold_list, first_fold);
- for (i = 0; i < folds_to_this_cp_count - 1; i++) {
- fold_list = add_cp_to_invlist(fold_list,
+ if (folds_to_this_cp_count > 0) {
+ fold_list = add_cp_to_invlist(fold_list, first_fold);
+ for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
+ fold_list = add_cp_to_invlist(fold_list,
remaining_folds[i]);
+ }
}
/* If the fold list is identical to what's in this ANYOF
ret = regnode_guts(pRExC_state, op, len, "exact");
FILL_NODE(ret, op);
RExC_emit += 1 + STR_SZ(len);
- STR_LEN(REGNODE_p(ret)) = len;
+ setSTR_LEN(REGNODE_p(ret), len);
if (len == 1) {
- *STRING(REGNODE_p(ret)) = value;
+ *STRING(REGNODE_p(ret)) = (U8) value;
}
else {
uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
full_cp_count += this_end - this_start + 1;
}
- invlist_iterfinish(cp_list);
/* At the end of the loop, we count how many bits differ from
* the bits in lowest code point, call the count 'd'. If the
ret = reganode(pRExC_state, op, lowest_cp);
FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
}
+
+ done_anyofm:
+ invlist_iterfinish(cp_list);
}
- done_anyofm:
if (inverted) {
_invlist_invert(cp_list);
SvREFCNT_dec(intersection);
}
- /* If didn't find an optimization and there is no need for a
- * bitmap, optimize to indicate that */
+ /* If didn't find an optimization and there is no need for a bitmap,
+ * optimize to indicate that */
if ( start[0] >= NUM_ANYOF_CODE_POINTS
&& ! LOC
- && ! upper_latin1_only_utf8_matches)
+ && ! upper_latin1_only_utf8_matches
+ && anyof_flags == 0)
{
+ U8 low_utf8[UTF8_MAXBYTES+1];
+ UV highest_cp = invlist_highest(cp_list);
+
op = ANYOFH;
+
+ /* Currently the maximum allowed code point by the system is
+ * IV_MAX. Higher ones are reserved for future internal use. This
+ * particular regnode can be used for higher ones, but we can't
+ * calculate the code point of those. IV_MAX suffices though, as
+ * it will be a large first byte */
+ (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
+
+ /* We store the lowest possible first byte of the UTF-8
+ * representation, using the flags field. This allows for quick
+ * ruling out of some inputs without having to convert from UTF-8
+ * to code point. For EBCDIC, this has to be I8. */
+ anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
+
+ /* If the first UTF-8 start byte for the highest code point in the
+ * range is suitably small, we may be able to get an upper bound as
+ * well */
+ if (highest_cp <= IV_MAX) {
+ U8 high_utf8[UTF8_MAXBYTES+1];
+
+ (void) uvchr_to_utf8(high_utf8, highest_cp);
+
+ /* If the lowest and highest are the same, we can get an exact
+ * first byte instead of a just minimum. We signal this with a
+ * different regnode */
+ if (low_utf8[0] == high_utf8[0]) {
+
+ /* No need to convert to I8 for EBCDIC as this is an exact
+ * match */
+ anyof_flags = low_utf8[0];
+ op = ANYOFHb;
+ }
+ else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
+ {
+
+ /* Here, the high byte is not the same as the low, but is
+ * small enough that its reasonable to have a loose upper
+ * bound, which is packed in with the strict lower bound.
+ * See comments at the definition of MAX_ANYOF_HRx_BYTE.
+ * On EBCDIC platforms, I8 is used. On ASCII platforms I8
+ * is the same thing as UTF-8 */
+
+ U8 bits = 0;
+ U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
+ U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
+ - anyof_flags;
+
+ if (range_diff <= max_range_diff / 8) {
+ bits = 3;
+ }
+ else if (range_diff <= max_range_diff / 4) {
+ bits = 2;
+ }
+ else if (range_diff <= max_range_diff / 2) {
+ bits = 1;
+ }
+ anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
+ op = ANYOFHr;
+ }
+ }
+
+ goto done_finding_op;
}
} /* End of seeing if can optimize it into a different node */
is_anyof: /* It's going to be an ANYOF node. */
- if (op != ANYOFH) {
- op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
- ? ANYOFD
- : ((posixl)
- ? ANYOFPOSIXL
- : ((LOC)
- ? ANYOFL
- : ANYOF));
- }
+ op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
+ ? ANYOFD
+ : ((posixl)
+ ? ANYOFPOSIXL
+ : ((LOC)
+ ? ANYOFL
+ : ANYOF));
+
+ done_finding_op:
ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
FILL_NODE(ret, op); /* We set the argument later */
ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
}
- /* If there is a swash and more than one element, we can't use the swash in
- * the optimization below. */
- if (swash && element_count > 1) {
- SvREFCNT_dec_NN(swash);
- swash = NULL;
- }
-
- /* Note that the optimization of using 'swash' if it is the only thing in
- * the class doesn't have us change swash at all, so it can include things
- * that are also in the bitmap; otherwise we have purposely deleted that
- * duplicate information */
set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
(HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
? listsv : NULL,
- only_utf8_locale_list,
- swash, cBOOL(has_runtime_dependency
- & HAS_USER_DEFINED_PROPERTY));
+ only_utf8_locale_list);
return ret;
not_anyof:
regnode* const node,
SV* const cp_list,
SV* const runtime_defns,
- SV* const only_utf8_locale_list,
- SV* const swash,
- const bool has_user_defined_property)
+ SV* const only_utf8_locale_list)
{
/* Sets the arg field of an ANYOF-type node 'node', using information about
* the node passed-in. If there is nothing outside the node's bitmap, the
* arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
* the count returned by add_data(), having allocated and stored an array,
- * av, that that count references, as follows:
- * av[0] stores the character class description in its textual form.
- * This is used later (regexec.c:Perl_regclass_swash()) to
- * initialize the appropriate swash, and is also useful for dumping
- * the regnode. This is set to &PL_sv_undef if the textual
- * description is not needed at run-time (as happens if the other
- * elements completely define the class)
- * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
- * computed from av[0]. But if no further computation need be done,
- * the swash is stored here now (and av[0] is &PL_sv_undef).
- * av[2] stores the inversion list of code points that match only if the
- * current locale is UTF-8
- * av[3] stores the cp_list inversion list for use in addition or instead
- * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
- * (Otherwise everything needed is already in av[0] and av[1])
- * av[4] is set if any component of the class is from a user-defined
- * property; used only if av[3] exists */
+ * av, as follows:
+ *
+ * av[0] stores the inversion list defining this class as far as known at
+ * this time, or PL_sv_undef if nothing definite is now known.
+ * av[1] stores the inversion list of code points that match only if the
+ * current locale is UTF-8, or if none, PL_sv_undef if there is an
+ * av[2], or no entry otherwise.
+ * av[2] stores the list of user-defined properties whose subroutine
+ * definitions aren't known at this time, or no entry if none. */
UV n;
AV * const av = newAV();
SV *rv;
- av_store(av, 0, (runtime_defns)
- ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
- if (swash) {
- assert(cp_list);
- av_store(av, 1, swash);
- SvREFCNT_dec_NN(cp_list);
- }
- else {
- av_store(av, 1, &PL_sv_undef);
- if (cp_list) {
- av_store(av, 3, cp_list);
- av_store(av, 4, newSVuv(has_user_defined_property));
- }
- }
+ if (cp_list) {
+ av_store(av, INVLIST_INDEX, cp_list);
+ }
if (only_utf8_locale_list) {
- av_store(av, 2, only_utf8_locale_list);
+ av_store(av, ONLY_LOCALE_MATCHES_INDEX, only_utf8_locale_list);
}
- else {
- av_store(av, 2, &PL_sv_undef);
+
+ if (runtime_defns) {
+ av_store(av, DEFERRED_USER_DEFINED_INDEX, SvREFCNT_inc(runtime_defns));
}
rv = newRV_noinc(MUTABLE_SV(av));
{
/* For internal core use only.
- * Returns the swash for the input 'node' in the regex 'prog'.
- * If <doinit> is 'true', will attempt to create the swash if not already
- * done.
+ * Returns the inversion list for the input 'node' in the regex 'prog'.
+ * If <doinit> is 'true', will attempt to create the inversion list if not
+ * already done.
* If <listsvp> is non-null, will return the printable contents of the
- * swash. This can be used to get debugging information even before the
- * swash exists, by calling this function with 'doinit' set to false, in
- * which case the components that will be used to eventually create the
- * swash are returned (in a printable form).
+ * property definition. This can be used to get debugging information
+ * even before the inversion list exists, by calling this function with
+ * 'doinit' set to false, in which case the components that will be used
+ * to eventually create the inversion list are returned (in a printable
+ * form).
* If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
* store an inversion list of code points that should match only if the
* execution-time locale is a UTF-8 one.
* inversion list of the code points that would be instead returned in
* <listsvp> if this were NULL. Thus, what gets output in <listsvp>
* when this parameter is used, is just the non-code point data that
- * will go into creating the swash. This currently should be just
+ * will go into creating the inversion list. This currently should be just
* user-defined properties whose definitions were not known at compile
* time. Using this parameter allows for easier manipulation of the
- * swash's data by the caller. It is illegal to call this function with
- * this parameter set, but not <listsvp>
+ * inversion list's data by the caller. It is illegal to call this
+ * function with this parameter set, but not <listsvp>
*
* Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
- * that, in spite of this function's name, the swash it returns may include
- * the bitmap data as well */
+ * that, in spite of this function's name, the inversion list it returns
+ * may include the bitmap data as well */
- SV *sw = NULL;
- SV *si = NULL; /* Input swash initialization string */
+ SV *si = NULL; /* Input initialization string */
SV* invlist = NULL;
RXi_GET_DECL(prog, progi);
SV * const rv = MUTABLE_SV(data->data[n]);
AV * const av = MUTABLE_AV(SvRV(rv));
SV **const ary = AvARRAY(av);
- U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
- si = *ary; /* ary[0] = the string to initialize the swash with */
+ invlist = ary[INVLIST_INDEX];
- if (av_tindex_skip_len_mg(av) >= 2) {
- if (only_utf8_locale_ptr
- && ary[2]
- && ary[2] != &PL_sv_undef)
- {
- *only_utf8_locale_ptr = ary[2];
- }
- else {
- assert(only_utf8_locale_ptr);
- *only_utf8_locale_ptr = NULL;
- }
-
- /* Elements 3 and 4 are either both present or both absent. [3]
- * is any inversion list generated at compile time; [4]
- * indicates if that inversion list has any user-defined
- * properties in it. */
- if (av_tindex_skip_len_mg(av) >= 3) {
- invlist = ary[3];
- if (SvUV(ary[4])) {
- swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
+ if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
+ *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
+ }
+
+ if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
+ si = ary[DEFERRED_USER_DEFINED_INDEX];
+ }
+
+ if (doinit && (si || invlist)) {
+ if (si) {
+ bool user_defined;
+ SV * msg = newSVpvs_flags("", SVs_TEMP);
+
+ SV * prop_definition = handle_user_defined_property(
+ "", 0, FALSE, /* There is no \p{}, \P{} */
+ SvPVX_const(si)[1] - '0', /* /i or not has been
+ stored here for just
+ this occasion */
+ TRUE, /* run time */
+ FALSE, /* This call must find the defn */
+ si, /* The property definition */
+ &user_defined,
+ msg,
+ 0 /* base level call */
+ );
+
+ if (SvCUR(msg)) {
+ assert(prop_definition == NULL);
+
+ Perl_croak(aTHX_ "%" UTF8f,
+ UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
}
- }
- else {
- invlist = NULL;
- }
- }
- /* Element [1] is reserved for the set-up swash. If already there,
- * return it; if not, create it and store it there */
- if (ary[1] && SvROK(ary[1])) {
- sw = ary[1];
- }
- else if (doinit && ((si && si != &PL_sv_undef)
- || (invlist && invlist != &PL_sv_undef))) {
- assert(si);
- sw = _core_swash_init("utf8", /* the utf8 package */
- "", /* nameless */
- si,
- 1, /* binary */
- 0, /* not from tr/// */
- invlist,
- &swash_init_flags);
- (void)av_store(av, 1, sw);
+ if (invlist) {
+ _invlist_union(invlist, prop_definition, &invlist);
+ SvREFCNT_dec_NN(prop_definition);
+ }
+ else {
+ invlist = prop_definition;
+ }
+
+ STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
+ STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
+
+ av_store(av, INVLIST_INDEX, invlist);
+ av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
+ ? ONLY_LOCALE_MATCHES_INDEX:
+ INVLIST_INDEX);
+ si = NULL;
+ }
}
}
}
- /* If requested, return a printable version of what this swash matches */
+ /* If requested, return a printable version of what this ANYOF node matches
+ * */
if (listsvp) {
SV* matches_string = NULL;
- /* The swash should be used, if possible, to get the data, as it
- * contains the resolved data. But this function can be called at
- * compile-time, before everything gets resolved, in which case we
- * return the currently best available information, which is the string
- * that will eventually be used to do that resolving, 'si' */
- if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
- && (si && si != &PL_sv_undef))
- {
+ /* This function can be called at compile-time, before everything gets
+ * resolved, in which case we return the currently best available
+ * information, which is the string that will eventually be used to do
+ * that resolving, 'si' */
+ if (si) {
/* Here, we only have 'si' (and possibly some passed-in data in
* 'invlist', which is handled below) If the caller only wants
* 'si', use that. */
if (SvCUR(matches_string)) { /* Get rid of trailing blank */
SvCUR_set(matches_string, SvCUR(matches_string) - 1);
}
- } /* end of has an 'si' but no swash */
+ } /* end of has an 'si' */
}
- /* If we have a swash in place, its equivalent inversion list was above
- * placed into 'invlist'. If not, this variable may contain a stored
- * inversion list which is information beyond what is in 'si' */
+ /* Add the stuff that's already known */
if (invlist) {
/* Again, if the caller doesn't want the output inversion list, put
*listsvp = matches_string;
}
- return sw;
+ return invlist;
}
#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
|| UTF8_IS_INVARIANT(*RExC_parse)
|| UTF8_IS_START(*RExC_parse));
- RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse += (UTF)
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
skip_to_be_ignored_text(pRExC_state, &RExC_parse,
FALSE /* Don't force /x */ );
STATIC void
S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
{
- /* 'size' is the delta to add or subtract from the current memory allocated
- * to the regex engine being constructed */
+ /* 'size' is the delta number of smallest regnode equivalents to add or
+ * subtract from the current memory allocated to the regex engine being
+ * constructed. */
PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
STATIC regnode_offset
S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
{
- /* Allocate a regnode for 'op', with 'extra_size' extra space. It aligns
- * and increments RExC_size and RExC_emit
+ /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
+ * equivalents space. It aligns and increments RExC_size and RExC_emit
*
* It returns the regnode's offset into the regex engine program */
src = REGNODE_p(RExC_emit);
RExC_emit += size;
dst = REGNODE_p(RExC_emit);
- if (RExC_open_parens) {
+
+ /* If we are in a "count the parentheses" pass, the numbers are unreliable,
+ * and [perl #133871] shows this can lead to problems, so skip this
+ * realignment of parens until a later pass when they are reliable */
+ if (! IN_PARENS_PASS && RExC_open_parens) {
int paren;
/*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
/* remember that RExC_npar is rex->nparens + 1,
}
/*
-- regtail - set the next-pointer at the end of a node chain of p to val.
+- regtail - set the next-pointer at the end of a node chain of p to val. If
+ that value won't fit in the space available, instead returns FALSE.
+ (Except asserts if we can't fit in the largest space the regex
+ engine is designed for.)
- SEE ALSO: regtail_study
*/
-STATIC void
+STATIC bool
S_regtail(pTHX_ RExC_state_t * pRExC_state,
const regnode_offset p,
const regnode_offset val,
}
if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
+ assert((UV) (val - scan) <= U32_MAX);
ARG_SET(REGNODE_p(scan), val - scan);
}
else {
+ if (val - scan > U16_MAX) {
+ /* Populate this with something that won't loop and will likely
+ * lead to a crash if the caller ignores the failure return, and
+ * execution continues */
+ NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
+ return FALSE;
+ }
NEXT_OFF(REGNODE_p(scan)) = val - scan;
}
+
+ return TRUE;
}
#ifdef DEBUGGING
Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
to control which is which.
+This used to return a value that was ignored. It was a problem that it is
+#ifdef'd to be another function that didn't return a value. khw has changed it
+so both currently return a pass/fail return.
+
*/
/* TODO: All four parms should be const */
-STATIC U8
+STATIC bool
S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
const regnode_offset val, U32 depth)
{
bool unfolded_multi_char; /* Unexamined in this routine */
if (join_exact(pRExC_state, scan, &min,
&unfolded_multi_char, 1, REGNODE_p(val), depth+1))
- return EXACT;
+ return TRUE; /* Was return EXACT */
}
#endif
if ( exact ) {
switch (OP(REGNODE_p(scan))) {
+ case LEXACT:
case EXACT:
+ case LEXACT_ONLY8:
case EXACT_ONLY8:
case EXACTL:
case EXACTF:
);
});
if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
+ assert((UV) (val - scan) <= U32_MAX);
ARG_SET(REGNODE_p(scan), val - scan);
}
else {
+ if (val - scan > U16_MAX) {
+ /* Populate this with something that won't loop and will likely
+ * lead to a crash if the caller ignores the failure return, and
+ * execution continues */
+ NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
+ return FALSE;
+ }
NEXT_OFF(REGNODE_p(scan)) = val - scan;
}
- return exact;
+ return TRUE; /* Was 'return exact' */
}
#endif
Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
{
#ifdef DEBUGGING
+ dVAR;
int k;
RXi_GET_DECL(prog, progi);
GET_RE_DEBUG_FLAGS_DECL;
SvPVCLEAR(sv);
- if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
- /* It would be nice to FAIL() here, but this may be called from
- regexec.c, and it would be hard to supply pRExC_state. */
- Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
- (int)OP(o), (int)REGNODE_MAX);
+ if (OP(o) > REGNODE_MAX) { /* regnode.type is unsigned */
+ if (pRExC_state) { /* This gives more info, if we have it */
+ FAIL3("panic: corrupted regexp opcode %d > %d",
+ (int)OP(o), (int)REGNODE_MAX);
+ }
+ else {
+ Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
+ (int)OP(o), (int)REGNODE_MAX);
+ }
+ }
sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
k = PL_regkind[OP(o)];
name_list= RExC_paren_name_list;
}
if (name_list) {
- if ( k != REF || (OP(o) < NREF)) {
+ if ( k != REF || (OP(o) < REFN)) {
SV **name= av_fetch(name_list, parno, 0 );
if (name)
Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
/* 2: embedded, otherwise 1 */
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
else if (k == ANYOF) {
- const U8 flags = ANYOF_FLAGS(o);
+ const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHr)
+ ? 0
+ : ANYOF_FLAGS(o);
bool do_sep = FALSE; /* Do we need to separate various components of
the output? */
/* Set if there is still an unresolved user-defined property */
/* Ready to start outputting. First, the initial left bracket */
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
- if (OP(o) != ANYOFH) {
+ if (! inRANGE(OP(o), ANYOFH, ANYOFHr)) {
/* Then all the things that could fit in the bitmap */
do_sep = put_charclass_bitmap_innards(sv,
ANYOF_BITMAP(o),
/* And finally the matching, closing ']' */
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
+ if (inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+ U8 lowest = (OP(o) != ANYOFHr)
+ ? FLAGS(o)
+ : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
+ U8 highest = (OP(o) == ANYOFHb)
+ ? lowest
+ : OP(o) == ANYOFH
+ ? 0xFF
+ : HIGHEST_ANYOF_HRx_BYTE(FLAGS(o));
+ Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
+ if (lowest != highest) {
+ Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
+ }
+ Perl_sv_catpvf(aTHX_ sv, ")");
+ }
+
SvREFCNT_dec(unresolved);
}
else if (k == ANYOFM) {
assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
sv_catpv(sv, bounds[FLAGS(o)]);
}
- else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
- Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
+ else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
+ Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
+ if (o->next_off) {
+ Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
+ }
+ Perl_sv_catpvf(aTHX_ sv, "]");
+ }
else if (OP(o) == SBOL)
Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
if (!dsv)
dsv = (REGEXP*) newSV_type(SVt_REGEXP);
else {
- SvOK_off((SV *)dsv);
- if (islv) {
+ assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
+
+ /* our only valid caller, sv_setsv_flags(), should have done
+ * a SV_CHECK_THINKFIRST_COW_DROP() by now */
+ assert(!SvOOK(dsv));
+ assert(!SvIsCOW(dsv));
+ assert(!SvROK(dsv));
+
+ if (SvPVX_const(dsv)) {
+ if (SvLEN(dsv))
+ Safefree(SvPVX(dsv));
+ SvPVX(dsv) = NULL;
+ }
+ SvLEN_set(dsv, 0);
+ SvCUR_set(dsv, 0);
+ SvOK_off((SV *)dsv);
+
+ if (islv) {
/* For PVLVs, the head (sv_any) points to an XPVLV, while
* the LV's xpvlenu_rx will point to a regexp body, which
* we allocate here */
2: something we no longer hold a reference on
so we need to copy it locally. */
RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
+ /* set malloced length to a non-zero value so it will be freed
+ * (otherwise in combination with SVf_FAKE it looks like an alien
+ * buffer). It doesn't have to be the actual malloced size, since it
+ * should never be grown */
+ SvLEN_set(dstr, SvCUR(sstr)+1);
ret->mother_re = NULL;
}
#endif /* PERL_IN_XSUB_RE */
/* As a final resort, output the range or subrange as hex. */
- this_end = (end < NUM_ANYOF_CODE_POINTS)
- ? end
- : NUM_ANYOF_CODE_POINTS - 1;
+ if (start >= NUM_ANYOF_CODE_POINTS) {
+ this_end = end;
+ }
+ else {
+ this_end = (end < NUM_ANYOF_CODE_POINTS)
+ ? end
+ : NUM_ANYOF_CODE_POINTS - 1;
+ }
#if NUM_ANYOF_CODE_POINTS > 256
format = (this_end < 256)
? "\\x%02" UVXf "-\\x%02" UVXf
* output would have been only the inversion indicator '^', NULL is instead
* returned. */
+ dVAR;
SV * output;
PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
* whether the class itself is to be inverted. However, there are some
* cases where it can't try inverting, as what actually matches isn't known
* until runtime, and hence the inversion isn't either. */
+
+ dVAR;
bool inverting_allowed = ! force_as_is_display;
int i;
void
Perl_init_uniprops(pTHX)
{
+ dVAR;
+
+ PL_user_def_props = newHV();
+
+#ifdef USE_ITHREADS
+
+ HvSHAREKEYS_off(PL_user_def_props);
+ PL_user_def_props_aTHX = aTHX;
+
+#endif
+
/* Set up the inversion list global variables */
PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
+ PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
#ifdef UNI_XIDC
/* The below are used only by deprecated functions. They could be removed */
#endif
}
-SV *
-Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len,
- const bool to_fold, bool * invert)
+#if 0
+
+This code was mainly added for backcompat to give a warning for non-portable
+code points in user-defined properties. But experiments showed that the
+warning in earlier perls were only omitted on overflow, which should be an
+error, so there really isnt a backcompat issue, and actually adding the
+warning when none was present before might cause breakage, for little gain. So
+khw left this code in, but not enabled. Tests were never added.
+
+embed.fnc entry:
+Ei |const char *|get_extended_utf8_msg|const UV cp
+
+PERL_STATIC_INLINE const char *
+S_get_extended_utf8_msg(pTHX_ const UV cp)
{
- /* Parse the interior meat of \p{} passed to this in 'name' with length
- * 'name_len', and return an inversion list if a property with 'name' is
- * found, or NULL if not. 'name' point to the input with leading and
- * trailing space trimmed. 'to_fold' indicates if /i is in effect.
+ U8 dummy[UTF8_MAXBYTES + 1];
+ HV *msgs;
+ SV **msg;
+
+ uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
+ &msgs);
+
+ msg = hv_fetchs(msgs, "text", 0);
+ assert(msg);
+
+ (void) sv_2mortal((SV *) msgs);
+
+ return SvPVX(*msg);
+}
+
+#endif
+
+SV *
+Perl_handle_user_defined_property(pTHX_
+
+ /* Parses the contents of a user-defined property definition; returning the
+ * expanded definition if possible. If so, the return is an inversion
+ * list.
*
- * When the return is an inversion list, '*invert' will be set to a boolean
- * indicating if it should be inverted or not
+ * If there are subroutines that are part of the expansion and which aren't
+ * known at the time of the call to this function, this returns what
+ * parse_uniprop_string() returned for the first one encountered.
*
- * This currently doesn't handle all cases. A NULL return indicates the
- * caller should try a different approach
- */
+ * If an error was found, NULL is returned, and 'msg' gets a suitable
+ * message appended to it. (Appending allows the back trace of how we got
+ * to the faulty definition to be displayed through nested calls of
+ * user-defined subs.)
+ *
+ * The caller IS responsible for freeing any returned SV.
+ *
+ * The syntax of the contents is pretty much described in perlunicode.pod,
+ * but we also allow comments on each line */
+
+ const char * name, /* Name of property */
+ const STRLEN name_len, /* The name's length in bytes */
+ const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
+ const bool to_fold, /* ? Is this under /i */
+ const bool runtime, /* ? Are we in compile- or run-time */
+ const bool deferrable, /* Is it ok for this property's full definition
+ to be deferred until later? */
+ SV* contents, /* The property's definition */
+ bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be
+ getting called unless this is thought to be
+ a user-defined property */
+ SV * msg, /* Any error or warning msg(s) are appended to
+ this */
+ const STRLEN level) /* Recursion level of this call */
+{
+ STRLEN len;
+ const char * string = SvPV_const(contents, len);
+ const char * const e = string + len;
+ const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
+ const STRLEN msgs_length_on_entry = SvCUR(msg);
+
+ const char * s0 = string; /* Points to first byte in the current line
+ being parsed in 'string' */
+ const char overflow_msg[] = "Code point too large in \"";
+ SV* running_definition = NULL;
+
+ PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
+
+ *user_defined_ptr = TRUE;
+
+ /* Look at each line */
+ while (s0 < e) {
+ const char * s; /* Current byte */
+ char op = '+'; /* Default operation is 'union' */
+ IV min = 0; /* range begin code point */
+ IV max = -1; /* and range end */
+ SV* this_definition;
+
+ /* Skip comment lines */
+ if (*s0 == '#') {
+ s0 = strchr(s0, '\n');
+ if (s0 == NULL) {
+ break;
+ }
+ s0++;
+ continue;
+ }
- char* lookup_name;
- bool stricter = FALSE;
- bool is_nv_type = FALSE; /* nv= or numeric_value=, or possibly one
- of the cjk numeric properties (though
- it requires extra effort to compile
- them) */
- unsigned int i;
- unsigned int j = 0, lookup_len;
- int equals_pos = -1; /* Where the '=' is found, or negative if none */
- int slash_pos = -1; /* Where the '/' is found, or negative if none */
- int table_index = 0;
- bool starts_with_In_or_Is = FALSE;
- Size_t lookup_offset = 0;
+ /* For backcompat, allow an empty first line */
+ if (*s0 == '\n') {
+ s0++;
+ continue;
+ }
+
+ /* First character in the line may optionally be the operation */
+ if ( *s0 == '+'
+ || *s0 == '!'
+ || *s0 == '-'
+ || *s0 == '&')
+ {
+ op = *s0++;
+ }
+
+ /* If the line is one or two hex digits separated by blank space, its
+ * a range; otherwise it is either another user-defined property or an
+ * error */
+
+ s = s0;
+
+ if (! isXDIGIT(*s)) {
+ goto check_if_property;
+ }
+
+ do { /* Each new hex digit will add 4 bits. */
+ if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
+ s = strchr(s, '\n');
+ if (s == NULL) {
+ s = e;
+ }
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpv(msg, overflow_msg);
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
+ UTF8fARG(is_contents_utf8, s - s0, s0));
+ sv_catpvs(msg, "\"");
+ goto return_failure;
+ }
+
+ /* Accumulate this digit into the value */
+ min = (min << 4) + READ_XDIGIT(s);
+ } while (isXDIGIT(*s));
+
+ while (isBLANK(*s)) { s++; }
+
+ /* We allow comments at the end of the line */
+ if (*s == '#') {
+ s = strchr(s, '\n');
+ if (s == NULL) {
+ s = e;
+ }
+ s++;
+ }
+ else if (s < e && *s != '\n') {
+ if (! isXDIGIT(*s)) {
+ goto check_if_property;
+ }
+
+ /* Look for the high point of the range */
+ max = 0;
+ do {
+ if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
+ s = strchr(s, '\n');
+ if (s == NULL) {
+ s = e;
+ }
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpv(msg, overflow_msg);
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
+ UTF8fARG(is_contents_utf8, s - s0, s0));
+ sv_catpvs(msg, "\"");
+ goto return_failure;
+ }
+
+ max = (max << 4) + READ_XDIGIT(s);
+ } while (isXDIGIT(*s));
+
+ while (isBLANK(*s)) { s++; }
+
+ if (*s == '#') {
+ s = strchr(s, '\n');
+ if (s == NULL) {
+ s = e;
+ }
+ }
+ else if (s < e && *s != '\n') {
+ goto check_if_property;
+ }
+ }
+
+ if (max == -1) { /* The line only had one entry */
+ max = min;
+ }
+ else if (max < min) {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Illegal range in \"");
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
+ UTF8fARG(is_contents_utf8, s - s0, s0));
+ sv_catpvs(msg, "\"");
+ goto return_failure;
+ }
+
+#if 0 /* See explanation at definition above of get_extended_utf8_msg() */
+
+ if ( UNICODE_IS_PERL_EXTENDED(min)
+ || UNICODE_IS_PERL_EXTENDED(max))
+ {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+
+ /* If both code points are non-portable, warn only on the lower
+ * one. */
+ sv_catpv(msg, get_extended_utf8_msg(
+ (UNICODE_IS_PERL_EXTENDED(min))
+ ? min : max));
+ sv_catpvs(msg, " in \"");
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
+ UTF8fARG(is_contents_utf8, s - s0, s0));
+ sv_catpvs(msg, "\"");
+ }
+
+#endif
+
+ /* Here, this line contains a legal range */
+ this_definition = sv_2mortal(_new_invlist(2));
+ this_definition = _add_range_to_invlist(this_definition, min, max);
+ goto calculate;
+
+ check_if_property:
+
+ /* Here it isn't a legal range line. See if it is a legal property
+ * line. First find the end of the meat of the line */
+ s = strpbrk(s, "#\n");
+ if (s == NULL) {
+ s = e;
+ }
+
+ /* Ignore trailing blanks in keeping with the requirements of
+ * parse_uniprop_string() */
+ s--;
+ while (s > s0 && isBLANK_A(*s)) {
+ s--;
+ }
+ s++;
+
+ this_definition = parse_uniprop_string(s0, s - s0,
+ is_utf8, to_fold, runtime,
+ deferrable,
+ user_defined_ptr, msg,
+ (name_len == 0)
+ ? level /* Don't increase level
+ if input is empty */
+ : level + 1
+ );
+ if (this_definition == NULL) {
+ goto return_failure; /* 'msg' should have had the reason
+ appended to it by the above call */
+ }
+
+ if (! is_invlist(this_definition)) { /* Unknown at this time */
+ return newSVsv(this_definition);
+ }
+
+ if (*s != '\n') {
+ s = strchr(s, '\n');
+ if (s == NULL) {
+ s = e;
+ }
+ }
+
+ calculate:
+
+ switch (op) {
+ case '+':
+ _invlist_union(running_definition, this_definition,
+ &running_definition);
+ break;
+ case '-':
+ _invlist_subtract(running_definition, this_definition,
+ &running_definition);
+ break;
+ case '&':
+ _invlist_intersection(running_definition, this_definition,
+ &running_definition);
+ break;
+ case '!':
+ _invlist_union_complement_2nd(running_definition,
+ this_definition, &running_definition);
+ break;
+ default:
+ Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
+ __FILE__, __LINE__, op);
+ break;
+ }
+
+ /* Position past the '\n' */
+ s0 = s + 1;
+ } /* End of loop through the lines of 'contents' */
+
+ /* Here, we processed all the lines in 'contents' without error. If we
+ * didn't add any warnings, simply return success */
+ if (msgs_length_on_entry == SvCUR(msg)) {
+
+ /* If the expansion was empty, the answer isn't nothing: its an empty
+ * inversion list */
+ if (running_definition == NULL) {
+ running_definition = _new_invlist(1);
+ }
+
+ return running_definition;
+ }
+
+ /* Otherwise, add some explanatory text, but we will return success */
+ goto return_msg;
+
+ return_failure:
+ running_definition = NULL;
+
+ return_msg:
+
+ if (name_len > 0) {
+ sv_catpvs(msg, " in expansion of ");
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
+ }
+
+ return running_definition;
+}
+
+/* As explained below, certain operations need to take place in the first
+ * thread created. These macros switch contexts */
+#ifdef USE_ITHREADS
+# define DECLARATION_FOR_GLOBAL_CONTEXT \
+ PerlInterpreter * save_aTHX = aTHX;
+# define SWITCH_TO_GLOBAL_CONTEXT \
+ PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
+# define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
+# define CUR_CONTEXT aTHX
+# define ORIGINAL_CONTEXT save_aTHX
+#else
+# define DECLARATION_FOR_GLOBAL_CONTEXT
+# define SWITCH_TO_GLOBAL_CONTEXT NOOP
+# define RESTORE_CONTEXT NOOP
+# define CUR_CONTEXT NULL
+# define ORIGINAL_CONTEXT NULL
+#endif
+
+STATIC void
+S_delete_recursion_entry(pTHX_ void *key)
+{
+ /* Deletes the entry used to detect recursion when expanding user-defined
+ * properties. This is a function so it can be set up to be called even if
+ * the program unexpectedly quits */
+
+ dVAR;
+ SV ** current_entry;
+ const STRLEN key_len = strlen((const char *) key);
+ DECLARATION_FOR_GLOBAL_CONTEXT;
+
+ SWITCH_TO_GLOBAL_CONTEXT;
+
+ /* If the entry is one of these types, it is a permanent entry, and not the
+ * one used to detect recursions. This function should delete only the
+ * recursion entry */
+ current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
+ if ( current_entry
+ && ! is_invlist(*current_entry)
+ && ! SvPOK(*current_entry))
+ {
+ (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
+ G_DISCARD);
+ }
+
+ RESTORE_CONTEXT;
+}
+
+STATIC SV *
+S_get_fq_name(pTHX_
+ const char * const name, /* The first non-blank in the \p{}, \P{} */
+ const Size_t name_len, /* Its length in bytes, not including any trailing space */
+ const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
+ const bool has_colon_colon
+ )
+{
+ /* Returns a mortal SV containing the fully qualified version of the input
+ * name */
+
+ SV * fq_name;
+
+ fq_name = newSVpvs_flags("", SVs_TEMP);
+
+ /* Use the current package if it wasn't included in our input */
+ if (! has_colon_colon) {
+ const HV * pkg = (IN_PERL_COMPILETIME)
+ ? PL_curstash
+ : CopSTASH(PL_curcop);
+ const char* pkgname = HvNAME(pkg);
+
+ Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
+ UTF8fARG(is_utf8, strlen(pkgname), pkgname));
+ sv_catpvs(fq_name, "::");
+ }
+
+ Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
+ UTF8fARG(is_utf8, name_len, name));
+ return fq_name;
+}
+
+SV *
+Perl_parse_uniprop_string(pTHX_
+
+ /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
+ * now. If so, the return is an inversion list.
+ *
+ * If the property is user-defined, it is a subroutine, which in turn
+ * may call other subroutines. This function will call the whole nest of
+ * them to get the definition they return; if some aren't known at the time
+ * of the call to this function, the fully qualified name of the highest
+ * level sub is returned. It is an error to call this function at runtime
+ * without every sub defined.
+ *
+ * If an error was found, NULL is returned, and 'msg' gets a suitable
+ * message appended to it. (Appending allows the back trace of how we got
+ * to the faulty definition to be displayed through nested calls of
+ * user-defined subs.)
+ *
+ * The caller should NOT try to free any returned inversion list.
+ *
+ * Other parameters will be set on return as described below */
+
+ const char * const name, /* The first non-blank in the \p{}, \P{} */
+ const Size_t name_len, /* Its length in bytes, not including any
+ trailing space */
+ const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
+ const bool to_fold, /* ? Is this under /i */
+ const bool runtime, /* TRUE if this is being called at run time */
+ const bool deferrable, /* TRUE if it's ok for the definition to not be
+ known at this call */
+ bool *user_defined_ptr, /* Upon return from this function it will be
+ set to TRUE if any component is a
+ user-defined property */
+ SV * msg, /* Any error or warning msg(s) are appended to
+ this */
+ const STRLEN level) /* Recursion level of this call */
+{
+ dVAR;
+ char* lookup_name; /* normalized name for lookup in our tables */
+ unsigned lookup_len; /* Its length */
+ bool stricter = FALSE; /* Some properties have stricter name
+ normalization rules, which we decide upon
+ based on parsing */
+
+ /* nv= or numeric_value=, or possibly one of the cjk numeric properties
+ * (though it requires extra effort to download them from Unicode and
+ * compile perl to know about them) */
+ bool is_nv_type = FALSE;
+
+ unsigned int i, j = 0;
+ int equals_pos = -1; /* Where the '=' is found, or negative if none */
+ int slash_pos = -1; /* Where the '/' is found, or negative if none */
+ int table_index = 0; /* The entry number for this property in the table
+ of all Unicode property names */
+ bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
+ Size_t lookup_offset = 0; /* Used to ignore the first few characters of
+ the normalized name in certain situations */
+ Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
+ part of a package name */
+ bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
+ property rather than a Unicode
+ one. */
+ SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
+ if an error. If it is an inversion list,
+ it is the definition. Otherwise it is a
+ string containing the fully qualified sub
+ name of 'name' */
+ SV * fq_name = NULL; /* For user-defined properties, the fully
+ qualified name */
+ bool invert_return = FALSE; /* ? Do we need to complement the result before
+ returning it */
PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
- /* The input will be modified into 'lookup_name' */
+ /* The input will be normalized into 'lookup_name' */
Newx(lookup_name, name_len, char);
SAVEFREEPV(lookup_name);
for (i = 0; i < name_len; i++) {
char cur = name[i];
- /* These characters can be freely ignored in most situations. Later it
- * may turn out we shouldn't have ignored them, and we have to reparse,
- * but we don't have enough information yet to make that decision */
- if (cur == '-' || cur == '_' || isSPACE_A(cur)) {
+ /* Most of the characters in the input will be of this ilk, being parts
+ * of a name */
+ if (isIDCONT_A(cur)) {
+
+ /* Case differences are ignored. Our lookup routine assumes
+ * everything is lowercase, so normalize to that */
+ if (isUPPER_A(cur)) {
+ lookup_name[j++] = toLOWER_A(cur);
+ continue;
+ }
+
+ if (cur == '_') { /* Don't include these in the normalized name */
+ continue;
+ }
+
+ lookup_name[j++] = cur;
+
+ /* The first character in a user-defined name must be of this type.
+ * */
+ if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
+ could_be_user_defined = FALSE;
+ }
+
continue;
}
- /* Case differences are also ignored. Our lookup routine assumes
- * everything is lowercase */
- if (isUPPER_A(cur)) {
- lookup_name[j++] = toLOWER(cur);
+ /* Here, the character is not something typically in a name, But these
+ * two types of characters (and the '_' above) can be freely ignored in
+ * most situations. Later it may turn out we shouldn't have ignored
+ * them, and we have to reparse, but we don't have enough information
+ * yet to make that decision */
+ if (cur == '-' || isSPACE_A(cur)) {
+ could_be_user_defined = FALSE;
continue;
}
- /* A double colon is either an error, or a package qualifier to a
- * subroutine user-defined property; neither of which do we currently
- * handle
- *
- * But a single colon is a synonym for '=' */
- if (cur == ':') {
- if (i < name_len - 1 && name[i+1] == ':') {
- return NULL;
- }
- cur = '=';
+ /* An equals sign or single colon mark the end of the first part of
+ * the property name */
+ if ( cur == '='
+ || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
+ {
+ lookup_name[j++] = '='; /* Treat the colon as an '=' */
+ equals_pos = j; /* Note where it occurred in the input */
+ could_be_user_defined = FALSE;
+ break;
}
/* Otherwise, this character is part of the name. */
lookup_name[j++] = cur;
- /* Only the equals sign needs further processing */
- if (cur == '=') {
- equals_pos = j; /* Note where it occurred in the input */
- break;
+ /* Here it isn't a single colon, so if it is a colon, it must be a
+ * double colon */
+ if (cur == ':') {
+
+ /* A double colon should be a package qualifier. We note its
+ * position and continue. Note that one could have
+ * pkg1::pkg2::...::foo
+ * so that the position at the end of the loop will be just after
+ * the final qualifier */
+
+ i++;
+ non_pkg_begin = i + 1;
+ lookup_name[j++] = ':';
+ }
+ else { /* Only word chars (and '::') can be in a user-defined name */
+ could_be_user_defined = FALSE;
}
+ } /* End of parsing through the lhs of the property name (or all of it if
+ no rhs) */
+
+#define STRLENs(s) (sizeof("" s "") - 1)
+
+ /* If there is a single package name 'utf8::', it is ambiguous. It could
+ * be for a user-defined property, or it could be a Unicode property, as
+ * all of them are considered to be for that package. For the purposes of
+ * parsing the rest of the property, strip it off */
+ if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
+ lookup_name += STRLENs("utf8::");
+ j -= STRLENs("utf8::");
+ equals_pos -= STRLENs("utf8::");
}
/* Here, we are either done with the whole property name, if it was simple;
}
}
- /* Certain properties need special handling. They may optionally be
- * prefixed by 'is'. Ignore that prefix for the purposes of checking
- * if this is one of those properties */
- if (memBEGINPs(lookup_name, name_len, "is")) {
+ /* Most punctuation after the equals indicates a subpattern, like
+ * \p{foo=/bar/} */
+ if ( isPUNCT_A(name[i])
+ && name[i] != '-'
+ && name[i] != '+'
+ && name[i] != '_'
+ && name[i] != '{')
+ {
+ /* Find the property. The table includes the equals sign, so we
+ * use 'j' as-is */
+ table_index = match_uniprop((U8 *) lookup_name, j);
+ if (table_index) {
+ const char * const * prop_values
+ = UNI_prop_value_ptrs[table_index];
+ SV * subpattern;
+ Size_t subpattern_len;
+ REGEXP * subpattern_re;
+ char open = name[i++];
+ char close;
+ const char * pos_in_brackets;
+ bool escaped = 0;
+
+ /* A backslash means the real delimitter is the next character.
+ * */
+ if (open == '\\') {
+ open = name[i++];
+ escaped = 1;
+ }
+
+ /* This data structure is constructed so that the matching
+ * closing bracket is 3 past its matching opening. The second
+ * set of closing is so that if the opening is something like
+ * ']', the closing will be that as well. Something similar is
+ * done in toke.c */
+ pos_in_brackets = strchr("([<)]>)]>", open);
+ close = (pos_in_brackets) ? pos_in_brackets[3] : open;
+
+ if ( i >= name_len
+ || name[name_len-1] != close
+ || (escaped && name[name_len-2] != '\\'))
+ {
+ sv_catpvs(msg, "Unicode property wildcard not terminated");
+ goto append_name_to_msg;
+ }
+
+ Perl_ck_warner_d(aTHX_
+ packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
+ "The Unicode property wildcards feature is experimental");
+
+ /* Now create and compile the wildcard subpattern. Use /iaa
+ * because nothing outside of ASCII will match, and it the
+ * property values should all match /i. Note that when the
+ * pattern fails to compile, our added text to the user's
+ * pattern will be displayed to the user, which is not so
+ * desirable. */
+ subpattern_len = name_len - i - 1 - escaped;
+ subpattern = Perl_newSVpvf(aTHX_ "(?iaa:%.*s)",
+ (unsigned) subpattern_len,
+ name + i);
+ subpattern = sv_2mortal(subpattern);
+ subpattern_re = re_compile(subpattern, 0);
+ assert(subpattern_re); /* Should have died if didn't compile
+ successfully */
+
+ /* For each legal property value, see if the supplied pattern
+ * matches it. */
+ while (*prop_values) {
+ const char * const entry = *prop_values;
+ const Size_t len = strlen(entry);
+ SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
+
+ if (pregexec(subpattern_re,
+ (char *) entry,
+ (char *) entry + len,
+ (char *) entry, 0,
+ entry_sv,
+ 0))
+ { /* Here, matched. Add to the returned list */
+ Size_t total_len = j + len;
+ SV * sub_invlist = NULL;
+ char * this_string;
+
+ /* We know this is a legal \p{property=value}. Call
+ * the function to return the list of code points that
+ * match it */
+ Newxz(this_string, total_len + 1, char);
+ Copy(lookup_name, this_string, j, char);
+ my_strlcat(this_string, entry, total_len + 1);
+ SAVEFREEPV(this_string);
+ sub_invlist = parse_uniprop_string(this_string,
+ total_len,
+ is_utf8,
+ to_fold,
+ runtime,
+ deferrable,
+ user_defined_ptr,
+ msg,
+ level + 1);
+ _invlist_union(prop_definition, sub_invlist,
+ &prop_definition);
+ }
+
+ prop_values++; /* Next iteration, look at next propvalue */
+ } /* End of looking through property values; (the data
+ structure is terminated by a NULL ptr) */
+
+ SvREFCNT_dec_NN(subpattern_re);
+
+ if (prop_definition) {
+ return prop_definition;
+ }
+
+ sv_catpvs(msg, "No Unicode property value wildcard matches:");
+ goto append_name_to_msg;
+ }
+
+ /* Here's how khw thinks we should proceed to handle the properties
+ * not yet done: Bidi Mirroring Glyph
+ Bidi Paired Bracket
+ Case Folding (both full and simple)
+ Decomposition Mapping
+ Equivalent Unified Ideograph
+ Name
+ Name Alias
+ Lowercase Mapping (both full and simple)
+ NFKC Case Fold
+ Titlecase Mapping (both full and simple)
+ Uppercase Mapping (both full and simple)
+ * Move the part that looks at the property values into a perl
+ * script, like utf8_heavy.pl is done. This makes things somewhat
+ * easier, but most importantly, it avoids always adding all these
+ * strings to the memory usage when the feature is little-used.
+ *
+ * The property values would all be concatenated into a single
+ * string per property with each value on a separate line, and the
+ * code point it's for on alternating lines. Then we match the
+ * user's input pattern m//mg, without having to worry about their
+ * uses of '^' and '$'. Only the values that aren't the default
+ * would be in the strings. Code points would be in UTF-8. The
+ * search pattern that we would construct would look like
+ * (?: \n (code-point_re) \n (?aam: user-re ) \n )
+ * And so $1 would contain the code point that matched the user-re.
+ * For properties where the default is the code point itself, such
+ * as any of the case changing mappings, the string would otherwise
+ * consist of all Unicode code points in UTF-8 strung together.
+ * This would be impractical. So instead, examine their compiled
+ * pattern, looking at the ssc. If none, reject the pattern as an
+ * error. Otherwise run the pattern against every code point in
+ * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
+ * And it might be good to create an API to return the ssc.
+ *
+ * For the name properties, a new function could be created in
+ * charnames which essentially does the same thing as above,
+ * sharing Name.pl with the other charname functions. Don't know
+ * about loose name matching, or algorithmically determined names.
+ * Decomposition.pl similarly.
+ *
+ * It might be that a new pattern modifier would have to be
+ * created, like /t for resTricTed, which changed the behavior of
+ * some constructs in their subpattern, like \A. */
+ } /* End of is a wildcard subppattern */
+
+
+ /* Certain properties whose values are numeric need special handling.
+ * They may optionally be prefixed by 'is'. Ignore that prefix for the
+ * purposes of checking if this is one of those properties */
+ if (memBEGINPs(lookup_name, j, "is")) {
lookup_offset = 2;
}
- /* Then check if it is one of these properties. This is hard-coded
- * because easier this way, and the list is unlikely to change. There
- * are several properties like this in the Unihan DB, which is unlikely
- * to be compiled, and they all end with 'numeric'. The interiors
+ /* Then check if it is one of these specially-handled properties. The
+ * possibilities are hard-coded because easier this way, and the list
+ * is unlikely to change.
+ *
+ * All numeric value type properties are of this ilk, and are also
+ * special in a different way later on. So find those first. There
+ * are several numeric value type properties in the Unihan DB (which is
+ * unlikely to be compiled with perl, but we handle it here in case it
+ * does get compiled). They all end with 'numeric'. The interiors
* aren't checked for the precise property. This would stop working if
* a cjk property were to be created that ended with 'numeric' and
* wasn't a numeric type */
{
unsigned int k;
- /* What makes these properties special is that the stuff after the
- * '=' is a number. Therefore, we can't throw away '-'
- * willy-nilly, as those could be a minus sign. Other stricter
+ /* Since the stuff after the '=' is a number, we can't throw away
+ * '-' willy-nilly, as those could be a minus sign. Other stricter
* rules also apply. However, these properties all can have the
* rhs not be a number, in which case they contain at least one
* alphabetic. In those cases, the stricter rules don't apply.
* But the numeric type properties can have the alphas [Ee] to
* signify an exponent, and it is still a number with stricter
- * rules. So look for an alpha that signifys not-strict */
+ * rules. So look for an alpha that signifies not-strict */
stricter = TRUE;
for (k = i; k < name_len; k++) {
if ( isALPHA_A(name[k])
* zeros, or between the final leading zero and the first other
* digit */
for (; i < name_len - 1; i++) {
- if ( name[i] != '0'
+ if ( name[i] != '0'
&& (name[i] != '_' || ! isDIGIT_A(name[i+1])))
{
break;
}
else { /* No '=' */
- /* We are now in a position to determine if this property should have
- * been parsed using stricter rules. Only a few are like that, and
- * unlikely to change. */
+ /* Only a few properties without an '=' should be parsed with stricter
+ * rules. The list is unlikely to change. */
if ( memBEGINPs(lookup_name, j, "perl")
&& memNEs(lookup_name + 4, j - 4, "space")
&& memNEs(lookup_name + 4, j - 4, "word"))
}
/* Store the first real character in the denominator */
- lookup_name[j++] = name[i];
+ if (i < name_len) {
+ lookup_name[j++] = name[i];
+ }
}
}
{
lookup_name[j++] = '&';
}
- else if (name_len > 2 && name[0] == 'I' && ( name[1] == 'n'
- || name[1] == 's'))
- {
- /* Also, if the original input began with 'In' or 'Is', it could be a
- * subroutine call instead of a property names, which currently isn't
- * handled by this function. Subroutine calls can't happen if there is
- * an '=' in the name */
- if (equals_pos < 0 && get_cvn_flags(name, name_len, GV_NOTQUAL) != NULL)
- {
- return NULL;
+ /* If the original input began with 'In' or 'Is', it could be a subroutine
+ * call to a user-defined property instead of a Unicode property name. */
+ if ( name_len - non_pkg_begin > 2
+ && name[non_pkg_begin+0] == 'I'
+ && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
+ {
+ /* Names that start with In have different characterstics than those
+ * that start with Is */
+ if (name[non_pkg_begin+1] == 's') {
+ starts_with_Is = TRUE;
}
-
- starts_with_In_or_Is = TRUE;
}
+ else {
+ could_be_user_defined = FALSE;
+ }
+
+ if (could_be_user_defined) {
+ CV* user_sub;
+
+ /* If the user defined property returns the empty string, it could
+ * easily be because the pattern is being compiled before the data it
+ * actually needs to compile is available. This could be argued to be
+ * a bug in the perl code, but this is a change of behavior for Perl,
+ * so we handle it. This means that intentionally returning nothing
+ * will not be resolved until runtime */
+ bool empty_return = FALSE;
+
+ /* Here, the name could be for a user defined property, which are
+ * implemented as subs. */
+ user_sub = get_cvn_flags(name, name_len, 0);
+ if (user_sub) {
+ const char insecure[] = "Insecure user-defined property";
+
+ /* Here, there is a sub by the correct name. Normally we call it
+ * to get the property definition */
+ dSP;
+ SV * user_sub_sv = MUTABLE_SV(user_sub);
+ SV * error; /* Any error returned by calling 'user_sub' */
+ SV * key; /* The key into the hash of user defined sub names
+ */
+ SV * placeholder;
+ SV ** saved_user_prop_ptr; /* Hash entry for this property */
+
+ /* How many times to retry when another thread is in the middle of
+ * expanding the same definition we want */
+ PERL_INT_FAST8_T retry_countdown = 10;
+
+ DECLARATION_FOR_GLOBAL_CONTEXT;
+
+ /* If we get here, we know this property is user-defined */
+ *user_defined_ptr = TRUE;
+
+ /* We refuse to call a potentially tainted subroutine; returning an
+ * error instead */
+ if (TAINT_get) {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvn(msg, insecure, sizeof(insecure) - 1);
+ goto append_name_to_msg;
+ }
+
+ /* In principal, we only call each subroutine property definition
+ * once during the life of the program. This guarantees that the
+ * property definition never changes. The results of the single
+ * sub call are stored in a hash, which is used instead for future
+ * references to this property. The property definition is thus
+ * immutable. But, to allow the user to have a /i-dependent
+ * definition, we call the sub once for non-/i, and once for /i,
+ * should the need arise, passing the /i status as a parameter.
+ *
+ * We start by constructing the hash key name, consisting of the
+ * fully qualified subroutine name, preceded by the /i status, so
+ * that there is a key for /i and a different key for non-/i */
+ key = newSVpvn(((to_fold) ? "1" : "0"), 1);
+ fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
+ non_pkg_begin != 0);
+ sv_catsv(key, fq_name);
+ sv_2mortal(key);
+
+ /* We only call the sub once throughout the life of the program
+ * (with the /i, non-/i exception noted above). That means the
+ * hash must be global and accessible to all threads. It is
+ * created at program start-up, before any threads are created, so
+ * is accessible to all children. But this creates some
+ * complications.
+ *
+ * 1) The keys can't be shared, or else problems arise; sharing is
+ * turned off at hash creation time
+ * 2) All SVs in it are there for the remainder of the life of the
+ * program, and must be created in the same interpreter context
+ * as the hash, or else they will be freed from the wrong pool
+ * at global destruction time. This is handled by switching to
+ * the hash's context to create each SV going into it, and then
+ * immediately switching back
+ * 3) All accesses to the hash must be controlled by a mutex, to
+ * prevent two threads from getting an unstable state should
+ * they simultaneously be accessing it. The code below is
+ * crafted so that the mutex is locked whenever there is an
+ * access and unlocked only when the next stable state is
+ * achieved.
+ *
+ * The hash stores either the definition of the property if it was
+ * valid, or, if invalid, the error message that was raised. We
+ * use the type of SV to distinguish.
+ *
+ * There's also the need to guard against the definition expansion
+ * from infinitely recursing. This is handled by storing the aTHX
+ * of the expanding thread during the expansion. Again the SV type
+ * is used to distinguish this from the other two cases. If we
+ * come to here and the hash entry for this property is our aTHX,
+ * it means we have recursed, and the code assumes that we would
+ * infinitely recurse, so instead stops and raises an error.
+ * (Any recursion has always been treated as infinite recursion in
+ * this feature.)
+ *
+ * If instead, the entry is for a different aTHX, it means that
+ * that thread has gotten here first, and hasn't finished expanding
+ * the definition yet. We just have to wait until it is done. We
+ * sleep and retry a few times, returning an error if the other
+ * thread doesn't complete. */
+
+ re_fetch:
+ USER_PROP_MUTEX_LOCK;
+
+ /* If we have an entry for this key, the subroutine has already
+ * been called once with this /i status. */
+ saved_user_prop_ptr = hv_fetch(PL_user_def_props,
+ SvPVX(key), SvCUR(key), 0);
+ if (saved_user_prop_ptr) {
+
+ /* If the saved result is an inversion list, it is the valid
+ * definition of this property */
+ if (is_invlist(*saved_user_prop_ptr)) {
+ prop_definition = *saved_user_prop_ptr;
+
+ /* The SV in the hash won't be removed until global
+ * destruction, so it is stable and we can unlock */
+ USER_PROP_MUTEX_UNLOCK;
+
+ /* The caller shouldn't try to free this SV */
+ return prop_definition;
+ }
+
+ /* Otherwise, if it is a string, it is the error message
+ * that was returned when we first tried to evaluate this
+ * property. Fail, and append the message */
+ if (SvPOK(*saved_user_prop_ptr)) {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catsv(msg, *saved_user_prop_ptr);
+
+ /* The SV in the hash won't be removed until global
+ * destruction, so it is stable and we can unlock */
+ USER_PROP_MUTEX_UNLOCK;
+
+ return NULL;
+ }
- lookup_len = j; /* Use a more mnemonic name starting here */
+ assert(SvIOK(*saved_user_prop_ptr));
+
+ /* Here, we have an unstable entry in the hash. Either another
+ * thread is in the middle of expanding the property's
+ * definition, or we are ourselves recursing. We use the aTHX
+ * in it to distinguish */
+ if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
+
+ /* Here, it's another thread doing the expanding. We've
+ * looked as much as we are going to at the contents of the
+ * hash entry. It's safe to unlock. */
+ USER_PROP_MUTEX_UNLOCK;
+
+ /* Retry a few times */
+ if (retry_countdown-- > 0) {
+ PerlProc_sleep(1);
+ goto re_fetch;
+ }
+
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Timeout waiting for another thread to "
+ "define");
+ goto append_name_to_msg;
+ }
+
+ /* Here, we are recursing; don't dig any deeper */
+ USER_PROP_MUTEX_UNLOCK;
+
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg,
+ "Infinite recursion in user-defined property");
+ goto append_name_to_msg;
+ }
+
+ /* Here, this thread has exclusive control, and there is no entry
+ * for this property in the hash. So we have the go ahead to
+ * expand the definition ourselves. */
+
+ PUSHSTACKi(PERLSI_MAGIC);
+ ENTER;
+
+ /* Create a temporary placeholder in the hash to detect recursion
+ * */
+ SWITCH_TO_GLOBAL_CONTEXT;
+ placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
+ (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
+ RESTORE_CONTEXT;
+
+ /* Now that we have a placeholder, we can let other threads
+ * continue */
+ USER_PROP_MUTEX_UNLOCK;
+
+ /* Make sure the placeholder always gets destroyed */
+ SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
+
+ PUSHMARK(SP);
+ SAVETMPS;
+
+ /* Call the user's function, with the /i status as a parameter.
+ * Note that we have gone to a lot of trouble to keep this call
+ * from being within the locked mutex region. */
+ XPUSHs(boolSV(to_fold));
+ PUTBACK;
+
+ /* The following block was taken from swash_init(). Presumably
+ * they apply to here as well, though we no longer use a swash --
+ * khw */
+ SAVEHINTS();
+ save_re_context();
+ /* We might get here via a subroutine signature which uses a utf8
+ * parameter name, at which point PL_subname will have been set
+ * but not yet used. */
+ save_item(PL_subname);
+
+ (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
+
+ SPAGAIN;
+
+ error = ERRSV;
+ if (TAINT_get || SvTRUE(error)) {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ if (SvTRUE(error)) {
+ sv_catpvs(msg, "Error \"");
+ sv_catsv(msg, error);
+ sv_catpvs(msg, "\"");
+ }
+ if (TAINT_get) {
+ if (SvTRUE(error)) sv_catpvs(msg, "; ");
+ sv_catpvn(msg, insecure, sizeof(insecure) - 1);
+ }
+
+ if (name_len > 0) {
+ sv_catpvs(msg, " in expansion of ");
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
+ name_len,
+ name));
+ }
+
+ (void) POPs;
+ prop_definition = NULL;
+ }
+ else { /* G_SCALAR guarantees a single return value */
+ SV * contents = POPs;
+
+ /* The contents is supposed to be the expansion of the property
+ * definition. If the definition is deferrable, and we got an
+ * empty string back, set a flag to later defer it (after clean
+ * up below). */
+ if ( deferrable
+ && (! SvPOK(contents) || SvCUR(contents) == 0))
+ {
+ empty_return = TRUE;
+ }
+ else { /* Otherwise, call a function to check for valid syntax,
+ and handle it */
+
+ prop_definition = handle_user_defined_property(
+ name, name_len,
+ is_utf8, to_fold, runtime,
+ deferrable,
+ contents, user_defined_ptr,
+ msg,
+ level);
+ }
+ }
+
+ /* Here, we have the results of the expansion. Delete the
+ * placeholder, and if the definition is now known, replace it with
+ * that definition. We need exclusive access to the hash, and we
+ * can't let anyone else in, between when we delete the placeholder
+ * and add the permanent entry */
+ USER_PROP_MUTEX_LOCK;
+
+ S_delete_recursion_entry(aTHX_ SvPVX(key));
+
+ if ( ! empty_return
+ && (! prop_definition || is_invlist(prop_definition)))
+ {
+ /* If we got success we use the inversion list defining the
+ * property; otherwise use the error message */
+ SWITCH_TO_GLOBAL_CONTEXT;
+ (void) hv_store_ent(PL_user_def_props,
+ key,
+ ((prop_definition)
+ ? newSVsv(prop_definition)
+ : newSVsv(msg)),
+ 0);
+ RESTORE_CONTEXT;
+ }
+
+ /* All done, and the hash now has a permanent entry for this
+ * property. Give up exclusive control */
+ USER_PROP_MUTEX_UNLOCK;
+
+ FREETMPS;
+ LEAVE;
+ POPSTACK;
+
+ if (empty_return) {
+ goto definition_deferred;
+ }
+
+ if (prop_definition) {
+
+ /* If the definition is for something not known at this time,
+ * we toss it, and go return the main property name, as that's
+ * the one the user will be aware of */
+ if (! is_invlist(prop_definition)) {
+ SvREFCNT_dec_NN(prop_definition);
+ goto definition_deferred;
+ }
+
+ sv_2mortal(prop_definition);
+ }
+
+ /* And return */
+ return prop_definition;
+
+ } /* End of calling the subroutine for the user-defined property */
+ } /* End of it could be a user-defined property */
+
+ /* Here it wasn't a user-defined property that is known at this time. See
+ * if it is a Unicode property */
+
+ lookup_len = j; /* This is a more mnemonic name than 'j' */
/* Get the index into our pointer table of the inversion list corresponding
* to the property */
table_index = match_uniprop((U8 *) lookup_name, lookup_len);
- /* If it didn't find the property */
+ /* If it didn't find the property ... */
if (table_index == 0) {
- /* If didn't find the property, we try again stripping off any initial
- * 'In' or 'Is' */
- if (starts_with_In_or_Is) {
+ /* Try again stripping off any initial 'Is'. This is because we
+ * promise that an initial Is is optional. The same isn't true of
+ * names that start with 'In'. Those can match only blocks, and the
+ * lookup table already has those accounted for. */
+ if (starts_with_Is) {
lookup_name += 2;
lookup_len -= 2;
equals_pos -= 2;
if (table_index == 0) {
char * canonical;
- /* If not found, and not a numeric type property, isn't a legal
- * property */
+ /* Here, we didn't find it. If not a numeric type property, and
+ * can't be a user-defined one, it isn't a legal property */
if (! is_nv_type) {
- return NULL;
- }
+ if (! could_be_user_defined) {
+ goto failed;
+ }
- /* But the numeric type properties need more work to decide. What
- * we do is make sure we have the number in canonical form and look
+ /* Here, the property name is legal as a user-defined one. At
+ * compile time, it might just be that the subroutine for that
+ * property hasn't been encountered yet, but at runtime, it's
+ * an error to try to use an undefined one */
+ if (! deferrable) {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Unknown user-defined property name");
+ goto append_name_to_msg;
+ }
+
+ goto definition_deferred;
+ } /* End of isn't a numeric type property */
+
+ /* The numeric type properties need more work to decide. What we
+ * do is make sure we have the number in canonical form and look
* that up. */
if (slash_pos < 0) { /* No slash */
* NV. */
NV value;
+ SSize_t value_len = lookup_len - equals_pos;
/* Get the value */
- if (my_atof3(lookup_name + equals_pos, &value,
- lookup_len - equals_pos)
+ if ( value_len <= 0
+ || my_atof3(lookup_name + equals_pos, &value,
+ value_len)
!= lookup_name + lookup_len)
{
- return NULL;
+ goto failed;
}
- /* If the value is an integer, the canonical value is integral */
+ /* If the value is an integer, the canonical value is integral
+ * */
if (Perl_ceil(value) == value) {
canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
- equals_pos, lookup_name, value);
+ equals_pos, lookup_name, value);
}
else { /* Otherwise, it is %e with a known precision */
char * exp_ptr;
/* Convert the numerator to numeric */
end_ptr = this_lookup_name + slash_pos;
if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
- return NULL;
+ goto failed;
}
/* It better have included all characters before the slash */
if (*end_ptr != '/') {
- return NULL;
+ goto failed;
}
/* Set to look at just the denominator */
/* Convert the denominator to numeric */
if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
- return NULL;
+ goto failed;
}
/* It better be the rest of the characters, and don't divide by
if ( end_ptr != this_lookup_name + lookup_len
|| denominator == 0)
{
- return NULL;
+ goto failed;
}
/* Get the greatest common denominator using
/* If already in lowest possible terms, we have already tried
* looking this up */
if (gcd == 1) {
- return NULL;
+ goto failed;
}
- /* Reduce the rational, which should put it in canonical form.
- * Then look it up */
+ /* Reduce the rational, which should put it in canonical form
+ * */
numerator /= gcd;
denominator /= gcd;
/* Here, we have the number in canonical form. Try that */
table_index = match_uniprop((U8 *) canonical, strlen(canonical));
if (table_index == 0) {
- return NULL;
+ goto failed;
}
- }
- }
+ } /* End of still didn't find the property in our table */
+ } /* End of didn't find the property in our table */
- /* The return is an index into a table of ptrs. A negative return
- * signifies that the real index is the absolute value, but the result
- * needs to be inverted */
+ /* Here, we have a non-zero return, which is an index into a table of ptrs.
+ * A negative return signifies that the real index is the absolute value,
+ * but the result needs to be inverted */
if (table_index < 0) {
- *invert = TRUE;
+ invert_return = TRUE;
table_index = -table_index;
}
- else {
- *invert = FALSE;
- }
/* Out-of band indices indicate a deprecated property. The proper index is
* modulo it with the table size. And dividing by the table size yields
- * an offset into a table constructed to contain the corresponding warning
- * message */
+ * an offset into a table constructed by regen/mk_invlists.pl to contain
+ * the corresponding warning message */
if (table_index > MAX_UNI_KEYWORD_INDEX) {
Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
table_index %= MAX_UNI_KEYWORD_INDEX;
}
/* Create and return the inversion list */
- return _new_invlist_C_array(uni_prop_ptrs[table_index]);
+ prop_definition =_new_invlist_C_array(uni_prop_ptrs[table_index]);
+ sv_2mortal(prop_definition);
+
+
+ /* See if there is a private use override to add to this definition */
+ {
+ COPHH * hinthash = (IN_PERL_COMPILETIME)
+ ? CopHINTHASH_get(&PL_compiling)
+ : CopHINTHASH_get(PL_curcop);
+ SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
+
+ if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
+
+ /* See if there is an element in the hints hash for this table */
+ SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
+ const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
+
+ if (pos) {
+ bool dummy;
+ SV * pu_definition;
+ SV * pu_invlist;
+ SV * expanded_prop_definition =
+ sv_2mortal(invlist_clone(prop_definition, NULL));
+
+ /* If so, it's definition is the string from here to the next
+ * \a character. And its format is the same as a user-defined
+ * property */
+ pos += SvCUR(pu_lookup);
+ pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
+ pu_invlist = handle_user_defined_property(lookup_name,
+ lookup_len,
+ 0, /* Not UTF-8 */
+ 0, /* Not folded */
+ runtime,
+ deferrable,
+ pu_definition,
+ &dummy,
+ msg,
+ level);
+ if (TAINT_get) {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Insecure private-use override");
+ goto append_name_to_msg;
+ }
+
+ /* For now, as a safety measure, make sure that it doesn't
+ * override non-private use code points */
+ _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
+
+ /* Add it to the list to be returned */
+ _invlist_union(prop_definition, pu_invlist,
+ &expanded_prop_definition);
+ prop_definition = expanded_prop_definition;
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
+ }
+ }
+ }
+
+ if (invert_return) {
+ _invlist_invert(prop_definition);
+ }
+ return prop_definition;
+
+
+ failed:
+ if (non_pkg_begin != 0) {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Illegal user-defined property name");
+ }
+ else {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Can't find Unicode property definition");
+ }
+ /* FALLTHROUGH */
+
+ append_name_to_msg:
+ {
+ const char * prefix = (runtime && level == 0) ? " \\p{" : " \"";
+ const char * suffix = (runtime && level == 0) ? "}" : "\"";
+
+ sv_catpv(msg, prefix);
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
+ sv_catpv(msg, suffix);
+ }
+
+ return NULL;
+
+ definition_deferred:
+
+ /* Here it could yet to be defined, so defer evaluation of this
+ * until its needed at runtime. We need the fully qualified property name
+ * to avoid ambiguity, and a trailing newline */
+ if (! fq_name) {
+ fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
+ non_pkg_begin != 0 /* If has "::" */
+ );
+ }
+ sv_catpvs(fq_name, "\n");
+
+ *user_defined_ptr = TRUE;
+ return fq_name;
}
#endif