#define RExC_mysv2 (pRExC_state->mysv2)
#endif
- bool seen_unfolded_sharp_s;
+ bool seen_d_op;
bool strict;
bool study_started;
bool in_script_run;
#define RExC_parse (pRExC_state->parse)
#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
#define RExC_whilem_seen (pRExC_state->whilem_seen)
+#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
+ under /d from /u ? */
-/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
- * EXACTF node, hence was parsed under /di rules. If later in the parse,
- * something forces the pattern into using /ui rules, the sharp s should be
- * folded into the sequence 'ss', which takes up more space than previously
- * calculated. This means that the sizing pass needs to be restarted. (The
- * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
- * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
- * so there is no need to resize [perl #125990]. */
-#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
#ifdef RE_TRACK_PATTERN_OFFSETS
# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
* Flags to be passed up and down.
*/
#define WORST 0 /* Worst case. */
-#define HASWIDTH 0x01 /* Known to match non-null strings. */
+#define HASWIDTH 0x01 /* Known to not match null strings, could match
+ non-null ones. */
/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
* character. (There needs to be a case: in the switch statement in regexec.c
} \
} STMT_END
-/* Change from /d into /u rules, and restart the parse if we've already seen
- * something whose size would increase as a result, by setting *flagp and
- * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
- * we've changed to /u during the parse. */
+/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
+ * a flag that indicates we need to override /d with /u as a result of
+ * something in the pattern. It should only be used in regards to calling
+ * set_regex_charset() or get_regex_charse() */
#define REQUIRE_UNI_RULES(flagp, restart_retval) \
STMT_START { \
if (DEPENDS_SEMANTICS) { \
set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
RExC_uni_semantics = 1; \
+ if (RExC_seen_d_op && LIKELY(RExC_total_parens >= 0)) { \
+ /* No need to restart the parse if we haven't seen \
+ * anything that differs between /u and /d, and no need \
+ * to restart immediately if we're going to reparse \
+ * anyway to count parens */ \
*flagp |= RESTART_PARSE; \
return restart_retval; \
+ } \
} \
} STMT_END
#define REQUIRE_BRANCHJ(flagp, restart_retval) \
STMT_START { \
RExC_use_BRANCHJ = 1; \
- *flagp |= RESTART_PARSE; \
- return restart_retval; \
+ if (LIKELY(RExC_total_parens >= 0)) { \
+ /* No need to restart the parse immediately if we're \
+ * going to reparse anyway to count parens */ \
+ *flagp |= RESTART_PARSE; \
+ return restart_retval; \
+ } \
} STMT_END
#define REQUIRE_PARENS_PASS \
if (RExC_total_parens == 0) RExC_total_parens = -1; \
} STMT_END
-/* Executes a return statement with the value 'X', if 'flags' contains any of
- * 'RESTART_PARSE', 'NEED_UTF8', or 'extra'. If so, *flagp is set to those
- * flags */
-#define RETURN_X_ON_RESTART_OR_FLAGS(X, flags, flagp, extra) \
+/* This is used to return failure (zero) early from the calling function if
+ * various flags in 'flags' are set. Two flags always cause a return:
+ * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
+ * additional flags that should cause a return; 0 if none. If the return will
+ * be done, '*flagp' is first set to be all of the flags that caused the
+ * return. */
+#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
STMT_START { \
if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
*(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
- return X; \
+ return 0; \
} \
} STMT_END
-#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
- RETURN_X_ON_RESTART_OR_FLAGS(0,flags,flagp,extra)
-
-#define RETURN_X_ON_RESTART(X, flags,flagp) \
- RETURN_X_ON_RESTART_OR_FLAGS( X, flags, flagp, 0)
-
-
-#define RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp,extra) \
- if (*(flagp) & (RESTART_PARSE|(extra))) return 0
-
#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
#define RETURN_FAIL_ON_RESTART(flags,flagp) \
- RETURN_X_ON_RESTART(0, flags,flagp)
+ RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
- RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp, 0)
+ if (MUST_RESTART(*(flagp))) return 0
/* This converts the named class defined in regcomp.h to its equivalent class
* number defined in handy.h. */
return TRUE;
}
+#define INVLIST_INDEX 0
+#define ONLY_LOCALE_MATCHES_INDEX 1
+#define DEFERRED_USER_DEFINED_INDEX 2
+
STATIC SV*
S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
const regnode_charclass* const node)
SV **const ary = AvARRAY(av);
assert(RExC_rxi->data->what[n] == 's');
- if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
- invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]), NULL));
- }
- else if (ary[0] && ary[0] != &PL_sv_undef) {
+ if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
- /* Here, no compile-time swash, and there are things that won't be
- * known until runtime -- we have to assume it could be anything */
+ /* Here there are things that won't be known until runtime -- we
+ * have to assume it could be anything */
invlist = sv_2mortal(_new_invlist(1));
return _add_range_to_invlist(invlist, 0, UV_MAX);
}
- else if (ary[3] && ary[3] != &PL_sv_undef) {
+ else if (ary[INVLIST_INDEX]) {
- /* Here no compile-time swash, and no run-time only data. Use the
- * node's inversion list */
- invlist = sv_2mortal(invlist_clone(ary[3], NULL));
+ /* Use the node's inversion list */
+ invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
}
/* Get the code points valid only under UTF-8 locales */
- if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
- && ary[2] && ary[2] != &PL_sv_undef)
+ if ( (ANYOF_FLAGS(node) & ANYOFL_FOLD)
+ && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
{
- only_utf8_locale_invlist = ary[2];
+ only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
}
}
}
/* Add in the points from the bit map */
- for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
- if (ANYOF_BITMAP_TEST(node, i)) {
- unsigned int start = i++;
+ if (OP(node) != ANYOFH) {
+ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
+ if (ANYOF_BITMAP_TEST(node, i)) {
+ unsigned int start = i++;
- for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
- /* empty */
+ for (; i < NUM_ANYOF_CODE_POINTS
+ && ANYOF_BITMAP_TEST(node, i); ++i)
+ {
+ /* empty */
+ }
+ invlist = _add_range_to_invlist(invlist, start, i-1);
+ new_node_has_latin1 = TRUE;
}
- invlist = _add_range_to_invlist(invlist, start, i-1);
- new_node_has_latin1 = TRUE;
}
}
if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
_invlist_invert(invlist);
}
- else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
+ else if (ANYOF_FLAGS(node) & ANYOFL_FOLD) {
+ if (new_node_has_latin1) {
- /* Under /li, any 0-255 could fold to any other 0-255, depending on the
- * locale. We can skip this if there are no 0-255 at all. */
- _invlist_union(invlist, PL_Latin1, &invlist);
+ /* Under /li, any 0-255 could fold to any other 0-255, depending on
+ * the locale. We can skip this if there are no 0-255 at all. */
+ _invlist_union(invlist, PL_Latin1, &invlist);
+
+ invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
+ invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ }
+ else {
+ if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
+ invlist = add_cp_to_invlist(invlist, 'I');
+ }
+ if (_invlist_contains_cp(invlist,
+ LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
+ {
+ invlist = add_cp_to_invlist(invlist, 'i');
+ }
+ }
}
/* Similarly add the UTF-8 locale possible matches. These have to be
U32 count = 0; /* Running total of number of code points matched by
'ssc' */
UV start, end; /* Start and end points of current range in inversion
- list */
+ XXX outdated. UTF-8 locales are common, what about invert? list */
const U32 max_code_points = (LOC)
? 256
- : (( ! UNI_SEMANTICS
- || invlist_highest(ssc->invlist) < 256)
+ : (( ! UNI_SEMANTICS
+ || invlist_highest(ssc->invlist) < 256)
? 128
: NON_OTHER_COUNT);
const U32 max_match = max_code_points / 2;
populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
- set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
- NULL, NULL, NULL, FALSE);
+ set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
/* Make sure is clone-safe */
ssc->invlist = NULL;
#endif
switch (flags) {
- case EXACT: case EXACTL: break;
+ case EXACT: case EXACT_ONLY8: case EXACTL: break;
case EXACTFAA:
- case EXACTFU_SS:
+ case EXACTFUP:
case EXACTFU:
case EXACTFLU8: folder = PL_fold_latin1; break;
case EXACTF: folder = PL_fold; break;
trie->wordcount = word_count;
RExC_rxi->data->data[ data_slot ] = (void*)trie;
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
- if (flags == EXACT || flags == EXACTL)
+ if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
trie->wordcount+1, sizeof(reg_trie_wordinfo));
noper= noper_next;
}
- if ( noper < tail &&
- (
- OP(noper) == flags ||
- (
- flags == EXACTFU &&
- OP(noper) == EXACTFU_SS
- )
- )
- ) {
+ if ( noper < tail
+ && ( OP(noper) == flags
+ || (flags == EXACT && OP(noper) == EXACT_ONLY8)
+ || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
+ || OP(noper) == EXACTFUP))))
+ {
uc= (U8*)STRING(noper);
e= uc + STR_LEN(noper);
} else {
if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
regardless of encoding */
- if (OP( noper ) == EXACTFU_SS) {
+ if (OP( noper ) == EXACTFUP) {
/* false positives are ok, so just set this */
TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
}
noper= noper_next;
}
- if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
+ if ( noper < tail
+ && ( OP(noper) == flags
+ || (flags == EXACT && OP(noper) == EXACT_ONLY8)
+ || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
+ || OP(noper) == EXACTFUP))))
+ {
const U8 *uc= (U8*)STRING(noper);
const U8 *e= uc + STR_LEN(noper);
noper= noper_next;
}
- if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
+ if ( noper < tail
+ && ( OP(noper) == flags
+ || (flags == EXACT && OP(noper) == EXACT_ONLY8)
+ || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
+ || OP(noper) == EXACTFUP))))
+ {
const U8 *uc= (U8*)STRING(noper);
const U8 *e= uc + STR_LEN(noper);
* require special handling. The joining is only done if:
* 1) there is room in the current conglomerated node to entirely contain the
* next one.
- * 2) they are the exact same node type
+ * 2) they are compatible node types
*
* The adjacent nodes actually may be separated by NOTHING-kind nodes, and
* these get optimized out
* Both sides fold to "sss", but if the pattern is parsed to create a node that
* would match just the \xDF, it won't be able to handle the case where a
* successful match would have to cross the node's boundary. The new approach
- * that hopefully generally solves the problem generates an EXACTFU_SS node
+ * that hopefully generally solves the problem generates an EXACTFUP node
* that is "sss" in this case.
*
* It turns out that there are problems with all multi-character folds, and not
* adjust the min length of the match, and the delta between min and max,
* so that the optimizer doesn't reject these possibilities based on size
* constraints.
- * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
- * is used for an EXACTFU node that contains at least one "ss" sequence in
- * it. For non-UTF-8 patterns and strings, this is the only case where
- * there is a possible fold length change. That means that a regular
- * EXACTFU node without UTF-8 involvement doesn't have to concern itself
- * with length changes, and so can be processed faster. regexec.c takes
- * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
- * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
- * known until runtime). This saves effort in regex matching. However,
- * the pre-folding isn't done for non-UTF8 patterns because the fold of
- * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
- * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
- * again, EXACTFL) nodes fold to isn't known until runtime. The fold
- * possibilities for the non-UTF8 patterns are quite simple, except for
- * the sharp s. All the ones that don't involve a UTF-8 target string are
- * members of a fold-pair, and arrays are set up for all of them so that
- * the other member of the pair can be found quickly. Code elsewhere in
- * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
- * 'ss', even if the pattern isn't UTF-8. This avoids the issues
- * described in the next item.
+ *
+ * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
+ * under /u, we fold it to 'ss' in regatom(), and in this routine, after
+ * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
+ * EXACTFU nodes. The node type of such nodes is then changed to
+ * EXACTFUP, indicating it is problematic, and needs careful handling.
+ * (The procedures in step 1) above are sufficient to handle this case in
+ * UTF-8 encoded nodes.) The reason this is problematic is that this is
+ * the only case where there is a possible fold length change in non-UTF-8
+ * patterns. By reserving a special node type for problematic cases, the
+ * far more common regular EXACTFU nodes can be processed faster.
+ * regexec.c takes advantage of this.
+ *
+ * EXACTFUP has been created as a grab-bag for (hopefully uncommon)
+ * problematic cases. These all only occur when the pattern is not
+ * UTF-8. In addition to the 'ss' sequence where there is a possible fold
+ * length change, it handles the situation where the string cannot be
+ * entirely folded. The strings in an EXACTFish node are folded as much
+ * as possible during compilation in regcomp.c. This saves effort in
+ * regex matching. By using an EXACTFUP node when it is not possible to
+ * fully fold at compile time, regexec.c can know that everything in an
+ * EXACTFU node is folded, so folding can be skipped at runtime. The only
+ * case where folding in EXACTFU nodes can't be done at compile time is
+ * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This
+ * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes
+ * handle two very different cases. Alternatively, there could have been
+ * a node type where there are length changes, one for unfolded, and one
+ * for both. If yet another special case needed to be created, the number
+ * of required node types would have to go to 7. khw figures that even
+ * though there are plenty of node types to spare, that the maintenance
+ * cost wasn't worth the small speedup of doing it that way, especially
+ * since he thinks the MICRO SIGN is rarely encountered in practice.
+ *
+ * There are other cases where folding isn't done at compile time, but
+ * none of them are under /u, and hence not for EXACTFU nodes. The folds
+ * in EXACTFL nodes aren't known until runtime, and vary as the locale
+ * changes. Some folds in EXACTF depend on if the runtime target string
+ * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di
+ * when no fold in it depends on the UTF-8ness of the target string.)
+ *
* 3) A problem remains for unfolded multi-char folds. (These occur when the
* validity of the fold won't be known until runtime, and so must remain
* unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
* that a character in the pattern corresponds to at most a single
* character in the target string. (And I do mean character, and not byte
* here, unlike other parts of the documentation that have never been
- * updated to account for multibyte Unicode.) sharp s in EXACTF and
+ * updated to account for multibyte Unicode.) Sharp s in EXACTF and
* EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
* nodes it can match "\x{17F}\x{17F}". These, along with other ones in
* EXACTFL nodes, violate the assumption, and they are the only instances
U32 flags, regnode *val, U32 depth)
{
/* Merge several consecutive EXACTish nodes into one. */
+
regnode *n = regnext(scan);
U32 stringok = 1;
regnode *next = scan + NODE_SZ_STR(scan);
#endif
DEBUG_PEEP("join", scan, depth, 0);
+ assert(PL_regkind[OP(scan)] == EXACT);
+
/* Look through the subsequent nodes in the chain. Skip NOTHING, merge
* EXACT ones that are mergeable to the current one. */
- while (n
- && (PL_regkind[OP(n)] == NOTHING
- || (stringok && OP(n) == OP(scan)))
+ while ( n
+ && ( PL_regkind[OP(n)] == NOTHING
+ || (stringok && PL_regkind[OP(n)] == EXACT))
&& NEXT_OFF(n)
&& NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
{
if (oldl + STR_LEN(n) > U8_MAX)
break;
+ /* Joining something that requires UTF-8 with something that
+ * doesn't, means the result requires UTF-8. */
+ if (OP(scan) == EXACT && (OP(n) == EXACT_ONLY8)) {
+ OP(scan) = EXACT_ONLY8;
+ }
+ else if (OP(scan) == EXACT_ONLY8 && (OP(n) == EXACT)) {
+ ; /* join is compatible, no need to change OP */
+ }
+ else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_ONLY8)) {
+ OP(scan) = EXACTFU_ONLY8;
+ }
+ else if ((OP(scan) == EXACTFU_ONLY8) && (OP(n) == EXACTFU)) {
+ ; /* join is compatible, no need to change OP */
+ }
+ else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
+ ; /* join is compatible, no need to change OP */
+ }
+ else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
+
+ /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
+ * which can join with EXACTFU ones. We check for this case
+ * here. These need to be resolved to either EXACTFU or
+ * EXACTF at joining time. They have nothing in them that
+ * would forbid them from being the more desirable EXACTFU
+ * nodes except that they begin and/or end with a single [Ss].
+ * The reason this is problematic is because they could be
+ * joined in this loop with an adjacent node that ends and/or
+ * begins with [Ss] which would then form the sequence 'ss',
+ * which matches differently under /di than /ui, in which case
+ * EXACTFU can't be used. If the 'ss' sequence doesn't get
+ * formed, the nodes get absorbed into any adjacent EXACTFU
+ * node. And if the only adjacent node is EXACTF, they get
+ * absorbed into that, under the theory that a longer node is
+ * better than two shorter ones, even if one is EXACTFU. Note
+ * that EXACTFU_ONLY8 is generated only for UTF-8 patterns,
+ * and the EXACTFU_S_EDGE ones only for non-UTF-8. */
+
+ if (STRING(n)[STR_LEN(n)-1] == 's') {
+
+ /* Here the joined node would end with 's'. If the node
+ * following the combination is an EXACTF one, it's better to
+ * join this trailing edge 's' node with that one, leaving the
+ * current one in 'scan' be the more desirable EXACTFU */
+ if (OP(nnext) == EXACTF) {
+ break;
+ }
+
+ OP(scan) = EXACTFU_S_EDGE;
+
+ } /* Otherwise, the beginning 's' of the 2nd node just
+ becomes an interior 's' in 'scan' */
+ }
+ else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
+ ; /* join is compatible, no need to change OP */
+ }
+ else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
+
+ /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
+ * nodes. But the latter nodes can be also joined with EXACTFU
+ * ones, and that is a better outcome, so if the node following
+ * 'n' is EXACTFU, quit now so that those two can be joined
+ * later */
+ if (OP(nnext) == EXACTFU) {
+ break;
+ }
+
+ /* The join is compatible, and the combined node will be
+ * EXACTF. (These don't care if they begin or end with 's' */
+ }
+ else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
+ if ( STRING(scan)[STR_LEN(scan)-1] == 's'
+ && STRING(n)[0] == 's')
+ {
+ /* When combined, we have the sequence 'ss', which means we
+ * have to remain /di */
+ OP(scan) = EXACTF;
+ }
+ }
+ else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
+ if (STRING(n)[0] == 's') {
+ ; /* Here the join is compatible and the combined node
+ starts with 's', no need to change OP */
+ }
+ else { /* Now the trailing 's' is in the interior */
+ OP(scan) = EXACTFU;
+ }
+ }
+ else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
+
+ /* The join is compatible, and the combined node will be
+ * EXACTF. (These don't care if they begin or end with 's' */
+ OP(scan) = EXACTF;
+ }
+ else if (OP(scan) != OP(n)) {
+
+ /* The only other compatible joinings are the same node type */
+ break;
+ }
+
DEBUG_PEEP("merg", n, depth, 0);
merged++;
#endif
}
+ /* This temporary node can now be turned into EXACTFU, and must, as
+ * regexec.c doesn't handle it */
+ if (OP(scan) == EXACTFU_S_EDGE) {
+ OP(scan) = EXACTFU;
+ }
+
*min_subtract = 0;
*unfolded_multi_char = FALSE;
* this final joining, sequences could have been split over boundaries, and
* hence missed). The sequences only happen in folding, hence for any
* non-EXACT EXACTish node */
- if (OP(scan) != EXACT && OP(scan) != EXACTL) {
+ if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
U8* s0 = (U8*) STRING(scan);
U8* s = s0;
U8* s_end = s0 + STR_LEN(scan);
continue;
}
- /* Nodes with 'ss' require special handling, except for
- * EXACTFAA-ish for which there is no multi-char fold to this */
- if (len == 2 && *s == 's' && *(s+1) == 's'
- && OP(scan) != EXACTFAA
- && OP(scan) != EXACTFAA_NO_TRIE)
- {
- count = 2;
- if (OP(scan) != EXACTFL) {
- OP(scan) = EXACTFU_SS;
- }
- s += 2;
- }
- else { /* Here is a generic multi-char fold. */
+ { /* Here is a generic multi-char fold. */
U8* multi_end = s + len;
/* Count how many characters are in it. In the case of
* which we don't know until runtime. EXACTFL nodes can't
* transform into EXACTFU nodes */
if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
- OP(scan) = EXACTFU_SS;
+ OP(scan) = EXACTFUP;
}
}
}
#endif
}
+
+ if ( STR_LEN(scan) == 1
+ && isALPHA_A(* STRING(scan))
+ && ( OP(scan) == EXACTFAA
+ || ( OP(scan) == EXACTFU
+ && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
+ {
+ U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
+
+ /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
+ * with the mask set to the complement of the bit that differs
+ * between upper and lower case, and the lowest code point of the
+ * pair (which the '&' forces) */
+ OP(scan) = ANYOFM;
+ ARG_SET(scan, *STRING(scan) & mask);
+ FLAGS(scan) = mask;
+ }
}
#ifdef DEBUGGING
----------------+-----------
NOTHING | NOTHING
EXACT | EXACT
+ EXACT_ONLY8 | EXACT
EXACTFU | EXACTFU
- EXACTFU_SS | EXACTFU
- EXACTFAA | EXACTFAA
+ EXACTFU_ONLY8 | EXACTFU
+ EXACTFUP | EXACTFU
+ EXACTFAA | EXACTFAA
EXACTL | EXACTL
EXACTFLU8 | EXACTFLU8
*/
#define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
? NOTHING \
- : ( EXACT == (X) ) \
+ : ( EXACT == (X) || EXACT_ONLY8 == (X) ) \
? EXACT \
- : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
+ : ( EXACTFU == (X) \
+ || EXACTFU_ONLY8 == (X) \
+ || EXACTFUP == (X) ) \
? EXACTFU \
- : ( EXACTFAA == (X) ) \
- ? EXACTFAA \
+ : ( EXACTFAA == (X) ) \
+ ? EXACTFAA \
: ( EXACTL == (X) ) \
? EXACTL \
- : ( EXACTFLU8 == (X) ) \
- ? EXACTFLU8 \
+ : ( EXACTFLU8 == (X) ) \
+ ? EXACTFLU8 \
: 0 )
/* dont use tail as the end marker for this traverse */
continue;
}
}
- else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
+ else if ( OP(scan) == EXACT
+ || OP(scan) == EXACT_ONLY8
+ || OP(scan) == EXACTL)
+ {
SSize_t l = STR_LEN(scan);
UV uc;
assert(l);
case PLUS:
if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
next = NEXTOPER(scan);
- if (OP(next) == EXACT
+ if ( OP(next) == EXACT
+ || OP(next) == EXACT_ONLY8
|| OP(next) == EXACTL
|| (flags & SCF_DO_STCLASS))
{
min++;
/* FALLTHROUGH */
case STAR:
+ next = NEXTOPER(scan);
+
+ /* This temporary node can now be turned into EXACTFU, and
+ * must, as regexec.c doesn't handle it */
+ if (OP(next) == EXACTFU_S_EDGE) {
+ OP(next) = EXACTFU;
+ }
+
+ if ( STR_LEN(next) == 1
+ && isALPHA_A(* STRING(next))
+ && ( OP(next) == EXACTFAA
+ || ( OP(next) == EXACTFU
+ && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))))
+ {
+ /* These differ in just one bit */
+ U8 mask = ~ ('A' ^ 'a');
+
+ assert(isALPHA_A(* STRING(next)));
+
+ /* Then replace it by an ANYOFM node, with
+ * the mask set to the complement of the
+ * bit that differs between upper and lower
+ * case, and the lowest code point of the
+ * pair (which the '&' forces) */
+ OP(next) = ANYOFM;
+ ARG_SET(next, *STRING(next) & mask);
+ FLAGS(next) = mask;
+ }
+
if (flags & SCF_DO_STCLASS) {
mincount = 0;
maxcount = REG_INFTY;
case ANYOFD:
case ANYOFL:
case ANYOFPOSIXL:
+ case ANYOFH:
case ANYOF:
if (flags & SCF_DO_STCLASS_AND)
ssc_and(pRExC_state, data->start_class,
(regnode_charclass *) scan);
break;
+ case NANYOFM:
case ANYOFM:
{
SV* cp_list = get_ANYOFM_contents(scan);
if (flags & SCF_DO_STCLASS_OR) {
- ssc_union(data->start_class,
- cp_list,
- FALSE /* don't invert */
- );
+ ssc_union(data->start_class, cp_list, invert);
}
else if (flags & SCF_DO_STCLASS_AND) {
- ssc_intersection(data->start_class,
- cp_list,
- FALSE /* don't invert */
- );
+ ssc_intersection(data->start_class, cp_list, invert);
}
SvREFCNT_dec_NN(cp_list);
}
break;
- case NASCII:
- invert = 1;
- /* FALLTHROUGH */
- case ASCII:
- my_invlist = invlist_clone(PL_Posix_ptrs[_CC_ASCII], NULL);
-
- /* This can be handled as a Posix class */
- goto join_posix_and_ascii;
-
case NPOSIXA: /* For these, we always know the exact set of
what's matched */
invert = 1;
/* FALLTHROUGH */
case POSIXA:
- assert(FLAGS(scan) != _CC_ASCII);
my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
goto join_posix_and_ascii;
&& n < pRExC_state->code_blocks->count
&& s == pRExC_state->code_blocks->cb[n].start)
{
- /* blank out literal code block */
- assert(pat[s] == '(');
- while (s <= pRExC_state->code_blocks->cb[n].end) {
- *p++ = '_';
+ /* blank out literal code block so that they aren't
+ * recompiled: eg change from/to:
+ * /(?{xyz})/
+ * /(?=====)/
+ * and
+ * /(??{xyz})/
+ * /(?======)/
+ * and
+ * /(?(?{xyz}))/
+ * /(?(?=====))/
+ */
+ assert(pat[s] == '(');
+ assert(pat[s+1] == '?');
+ *p++ = '(';
+ *p++ = '?';
+ s += 2;
+ while (s < pRExC_state->code_blocks->cb[n].end) {
+ *p++ = '=';
s++;
}
- s--;
+ *p++ = ')';
n++;
continue;
}
* pm_flags field of the related PMOP. Currently we're only interested in
* PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
*
- * We can't allocate space until we know how big the compiled form will be,
- * but we can't compile it (and thus know how big it is) until we've got a
- * place to put the code. So we cheat: we compile it twice, once with code
- * generation turned off and size counting turned on, and once "for real".
- * This also means that we don't allocate space until we are sure that the
- * thing really will compile successfully, and we never have to move the
- * code and thus invalidate pointers into it. (Note that it has to be in
- * one piece because free() must be able to free it all.) [NB: not true in perl]
+ * For many years this code had an initial sizing pass that calculated
+ * (sometimes incorrectly, leading to security holes) the size needed for the
+ * compiled pattern. That was changed by commit
+ * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
+ * node at a time, as parsing goes along. Patches welcome to fix any obsolete
+ * references to this sizing pass.
+ *
+ * Now, an initial crude guess as to the size needed is made, based on the
+ * length of the pattern. Patches welcome to improve that guess. That amount
+ * of space is malloc'd and then immediately freed, and then clawed back node
+ * by node. This design is to minimze, to the extent possible, memory churn
+ * when doing the the reallocs.
+ *
+ * A separate parentheses counting pass may be needed in some cases.
+ * (Previously the sizing pass did this.) Patches welcome to reduce the number
+ * of these cases.
+ *
+ * The existence of a sizing pass necessitated design decisions that are no
+ * longer needed. There are potential areas of simplification.
*
* Beware that the optimization-preparation code in here knows about some
* of the structure of the compiled regexp. [I'll say.]
/* ignore the utf8ness if the pattern is 0 length */
RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
-
- RExC_uni_semantics = RExC_utf8; /* UTF-8 implies unicode semantics;
- otherwise we may find later this should
- be 1 */
- RExC_seen_unfolded_sharp_s = 0;
+ RExC_uni_semantics = 0;
RExC_contains_locale = 0;
RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
RExC_in_script_run = 0;
RExC_close_parens = NULL;
RExC_paren_names = NULL;
RExC_size = 0;
+ RExC_seen_d_op = FALSE;
#ifdef DEBUGGING
RExC_paren_name_list = NULL;
#endif
rx_flags = orig_rx_flags;
- if ( initial_charset == REGEX_DEPENDS_CHARSET
- && (RExC_uni_semantics))
+ if ( (UTF || RExC_uni_semantics)
+ && initial_charset == REGEX_DEPENDS_CHARSET)
{
/* Set to use unicode semantics if the pattern is in utf8 and has the
* 'depends' charset specified, as it means unicode when utf8 */
set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
+ RExC_uni_semantics = 1;
}
RExC_pm_flags = pm_flags;
goto redo_parse;
}
- /* In a stable state, as here, this must be true */
- assert(RExC_size = RExC_emit + 1);
-
/* Here, we have successfully parsed and generated the pattern's program
* for the regex engine. We are ready to finish things up and look for
* optimizations. */
DEBUG_PEEP("first:", first, 0, 0);
/* Ignore EXACT as we deal with it later. */
if (PL_regkind[OP(first)] == EXACT) {
- if (OP(first) == EXACT || OP(first) == EXACTL)
+ if ( OP(first) == EXACT
+ || OP(first) == EXACT_ONLY8
+ || OP(first) == EXACTL)
+ {
NOOP; /* Empty, get anchored substr later. */
+ }
else
RExC_rxi->regstclass = first;
}
&& nop == END)
RExC_rx->extflags |= RXf_WHITE;
else if ( RExC_rx->extflags & RXf_SPLIT
- && (fop == EXACT || fop == EXACTL)
+ && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
&& STR_LEN(first) == 1
&& *(STRING(first)) == ' '
&& nop == END )
}
#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
- int num; \
if (RExC_lastparse!=RExC_parse) { \
Perl_re_printf( aTHX_ "%s", \
Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
} else \
Perl_re_printf( aTHX_ "%16s",""); \
\
- num=REG_NODE_NUM(REGNODE_p(RExC_emit)); \
- if (RExC_lastnum!=num) \
- Perl_re_printf( aTHX_ "|%4d", num); \
+ if (RExC_lastnum!=RExC_emit) \
+ Perl_re_printf( aTHX_ "|%4d", RExC_emit); \
else \
Perl_re_printf( aTHX_ "|%4s",""); \
Perl_re_printf( aTHX_ "|%*s%-4s", \
(int)((depth*2)), "", \
(funcname) \
); \
- RExC_lastnum=num; \
+ RExC_lastnum=RExC_emit; \
RExC_lastparse=RExC_parse; \
})
initial_size = 10;
}
- /* Allocate the initial space */
new_list = newSV_type(SVt_INVLIST);
-
initialize_invlist_guts(new_list, initial_size);
return new_list;
}
void
-Perl__invlist_populate_swatch(SV* const invlist,
- const UV start, const UV end, U8* swatch)
-{
- /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
- * but is used when the swash has an inversion list. This makes this much
- * faster, as it uses a binary search instead of a linear one. This is
- * intimately tied to that function, and perhaps should be in utf8.c,
- * except it is intimately tied to inversion lists as well. It assumes
- * that <swatch> is all 0's on input */
-
- UV current = start;
- const IV len = _invlist_len(invlist);
- IV i;
- const UV * array;
-
- PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
-
- if (len == 0) { /* Empty inversion list */
- return;
- }
-
- array = invlist_array(invlist);
-
- /* Find which element it is */
- i = _invlist_search(invlist, start);
-
- /* We populate from <start> to <end> */
- while (current < end) {
- UV upper;
-
- /* The inversion list gives the results for every possible code point
- * after the first one in the list. Only those ranges whose index is
- * even are ones that the inversion list matches. For the odd ones,
- * and if the initial code point is not in the list, we have to skip
- * forward to the next element */
- if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
- i++;
- if (i >= len) { /* Finished if beyond the end of the array */
- return;
- }
- current = array[i];
- if (current >= end) { /* Finished if beyond the end of what we
- are populating */
- if (LIKELY(end < UV_MAX)) {
- return;
- }
-
- /* We get here when the upper bound is the maximum
- * representable on the machine, and we are looking for just
- * that code point. Have to special case it */
- i = len;
- goto join_end_of_list;
- }
- }
- assert(current >= start);
-
- /* The current range ends one below the next one, except don't go past
- * <end> */
- i++;
- upper = (i < len && array[i] < end) ? array[i] : end;
-
- /* Here we are in a range that matches. Populate a bit in the 3-bit U8
- * for each code point in it */
- for (; current < upper; current++) {
- const STRLEN offset = (STRLEN)(current - start);
- swatch[offset >> 3] |= 1 << (offset & 7);
- }
-
- join_end_of_list:
-
- /* Quit if at the end of the list */
- if (i >= len) {
-
- /* But first, have to deal with the highest possible code point on
- * the platform. The previous code assumes that <end> is one
- * beyond where we want to populate, but that is impossible at the
- * platform's infinity, so have to handle it specially */
- if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
- {
- const STRLEN offset = (STRLEN)(end - start);
- swatch[offset >> 3] |= 1 << (offset & 7);
- }
- return;
- }
-
- /* Advance to the next range, which will be for code points not in the
- * inversion list */
- current = array[i];
- }
-
- return;
-}
-
-void
Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
const bool complement_b, SV** output)
{
SV*
Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
{
-
/* Return a new inversion list that is a copy of the input one, which is
* unchanged. The new list will not be mortal even if the old one was. */
- const STRLEN nominal_length = _invlist_len(invlist); /* Why not +1 XXX */
+ const STRLEN nominal_length = _invlist_len(invlist);
const STRLEN physical_length = SvCUR(invlist);
const bool offset = *(get_invlist_offset_addr(invlist));
PERL_ARGS_ASSERT_INVLIST_CLONE;
- /* Need to allocate extra space to accommodate Perl's addition of a
- * trailing NUL to SvPV's, since it thinks they are always strings */
if (new_invlist == NULL) {
new_invlist = _new_invlist(nominal_length);
}
invlist_iterinit(invlist);
while (invlist_iternext(invlist, &start, &end)) {
if (end == UV_MAX) {
- Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFINITY%c",
+ Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
start, intra_range_delimiter,
inter_range_delimiter);
}
[0] 0x000A .. 0x000D
[2] 0x0085
[4] 0x2028 .. 0x2029
- [6] 0x3104 .. INFINITY
+ [6] 0x3104 .. INFTY
* This means that the first range of code points matched by the list are
* 0xA through 0xD; the second range contains only the single code point
* 0x85, etc. An inversion list is an array of UVs. Two array elements
while (invlist_iternext(invlist, &start, &end)) {
if (end == UV_MAX) {
Perl_dump_indent(aTHX_ level, file,
- "%s[%" UVuf "] 0x%04" UVXf " .. INFINITY\n",
+ "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
indent, (UV)count, start);
}
else if (end != start) {
* identical. The final argument, if TRUE, says to take the complement of
* the second inversion list before doing the comparison */
- const UV* array_a = invlist_array(a);
- const UV* array_b = invlist_array(b);
- UV len_a = _invlist_len(a);
+ const UV len_a = _invlist_len(a);
UV len_b = _invlist_len(b);
+ const UV* array_a = NULL;
+ const UV* array_b = NULL;
+
PERL_ARGS_ASSERT__INVLISTEQ;
+ /* This code avoids accessing the arrays unless it knows the length is
+ * non-zero */
+
+ if (len_a == 0) {
+ if (len_b == 0) {
+ return ! complement_b;
+ }
+ }
+ else {
+ array_a = invlist_array(a);
+ }
+
+ if (len_b != 0) {
+ array_b = invlist_array(b);
+ }
+
/* If are to compare 'a' with the complement of b, set it
* up so are looking at b's complement. */
if (complement_b) {
if (len_b == 0) {
return (len_a == 1 && array_a[0] == 0);
}
- else if (array_b[0] == 0) {
+ if (array_b[0] == 0) {
/* Otherwise, to complement, we invert. Here, the first element is
* 0, just remove it. To do this, we just pretend the array starts
/*
* As best we can, determine the characters that can match the start of
- * the given EXACTF-ish node.
+ * the given EXACTF-ish node. This is for use in creating ssc nodes, so there
+ * can be false positive matches
*
* Returns the invlist as a new SV*; it is the caller's responsibility to
* call SvREFCNT_dec() when done with it.
}
else {
/* Any Latin1 range character can potentially match any
- * other depending on the locale */
+ * other depending on the locale, and in Turkic locales, U+130 and
+ * U+131 */
if (OP(node) == EXACTFL) {
_invlist_union(invlist, PL_Latin1, &invlist);
+ invlist = add_cp_to_invlist(invlist,
+ LATIN_SMALL_LETTER_DOTLESS_I);
+ invlist = add_cp_to_invlist(invlist,
+ LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
}
else {
/* But otherwise, it matches at least itself. We can
}
else { /* Single char fold */
unsigned int k;
- unsigned int first_folds_to;
- const unsigned int * remaining_folds_to_list;
- Size_t folds_to_count;
+ unsigned int first_fold;
+ const unsigned int * remaining_folds;
+ Size_t folds_count;
/* It matches itself */
invlist = add_cp_to_invlist(invlist, fc);
/* ... plus all the things that fold to it, which are found in
* PL_utf8_foldclosures */
- folds_to_count = _inverse_folds(fc, &first_folds_to,
- &remaining_folds_to_list);
- for (k = 0; k < folds_to_count; k++) {
- UV c = (k == 0) ? first_folds_to : remaining_folds_to_list[k-1];
+ folds_count = _inverse_folds(fc, &first_fold,
+ &remaining_folds);
+ for (k = 0; k < folds_count; k++) {
+ UV c = (k == 0) ? first_fold : remaining_folds[k-1];
/* /aa doesn't allow folds between ASCII and non- */
if ( (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
invlist = add_cp_to_invlist(invlist, c);
}
+
+ if (OP(node) == EXACTFL) {
+
+ /* If either [iI] are present in an EXACTFL node the above code
+ * should have added its normal case pair, but under a Turkish
+ * locale they could match instead the case pairs from it. Add
+ * those as potential matches as well */
+ if (isALPHA_FOLD_EQ(fc, 'I')) {
+ invlist = add_cp_to_invlist(invlist,
+ LATIN_SMALL_LETTER_DOTLESS_I);
+ invlist = add_cp_to_invlist(invlist,
+ LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ }
+ else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
+ invlist = add_cp_to_invlist(invlist, 'I');
+ }
+ else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+ invlist = add_cp_to_invlist(invlist, 'i');
+ }
+ }
}
}
RExC_parse++;
has_use_defaults = TRUE;
STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
- set_regex_charset(&RExC_flags, (RExC_uni_semantics)
- ? REGEX_UNICODE_CHARSET
- : REGEX_DEPENDS_CHARSET);
+ cs = (RExC_uni_semantics)
+ ? REGEX_UNICODE_CHARSET
+ : REGEX_DEPENDS_CHARSET;
+ set_regex_charset(&RExC_flags, cs);
}
-
- cs = get_regex_charset(RExC_flags);
- if (cs == REGEX_DEPENDS_CHARSET
- && (RExC_uni_semantics))
- {
- cs = REGEX_UNICODE_CHARSET;
+ else {
+ cs = get_regex_charset(RExC_flags);
+ if ( cs == REGEX_DEPENDS_CHARSET
+ && RExC_uni_semantics)
+ {
+ cs = REGEX_UNICODE_CHARSET;
+ }
}
while (RExC_parse < RExC_end) {
*
* Returns 0 otherwise, with *flagp set to indicate why:
* TRYAGAIN at the end of (?) that only sets flags.
- * RESTART_PARSE if the sizing scan needs to be restarted, or'd with
+ * RESTART_PARSE if the parse needs to be restarted, or'd with
* NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
* Otherwise would only return 0 if regbranch() returns 0, which cannot
* happen. */
DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
"%*s%*s Setting open paren #%" IVdf " to %d\n",
22, "| |", (int)(depth * 2 + 1), "",
- (IV)parno, REG_NODE_NUM(REGNODE_p(ret))));
+ (IV)parno, ret));
RExC_open_parens[parno]= ret;
}
DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
"%*s%*s Setting close paren #%" IVdf " to %d\n",
22, "| |", (int)(depth * 2 + 1), "",
- (IV)parno, REG_NODE_NUM(REGNODE_p(ender))));
+ (IV)parno, ender));
RExC_close_parens[parno]= ender;
if (RExC_nestroot == parno)
RExC_nestroot = 0;
DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
"%*s%*s Setting close paren #0 (END) to %d\n",
22, "| |", (int)(depth * 2 + 1), "",
- REG_NODE_NUM(REGNODE_p(ender))));
+ ender));
RExC_close_parens[0]= ender;
}
regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
SvPV_nolen_const(RExC_mysv1),
- (IV)REG_NODE_NUM(REGNODE_p(lastbr)),
+ (IV)lastbr,
SvPV_nolen_const(RExC_mysv2),
- (IV)REG_NODE_NUM(REGNODE_p(ender)),
+ (IV)ender,
(IV)(ender - lastbr)
);
);
SvPV_nolen_const(RExC_mysv1),
(IV)REG_NODE_NUM(ret_as_regnode),
SvPV_nolen_const(RExC_mysv2),
- (IV)REG_NODE_NUM(REGNODE_p(ender)),
+ (IV)ender,
(IV)(ender - ret)
);
);
/* Check for proper termination. */
if (paren) {
- /* restore original flags, but keep (?p) and, if we've changed from /d
- * rules to /u, keep the /u */
+ /* restore original flags, but keep (?p) and, if we've encountered
+ * something in the parse that changes /d rules into /u, keep the /u */
RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
* On success, returns the offset at which any next node should be placed into
* the regex engine program being compiled.
*
- * Returns 0 otherwise, setting flagp to RESTART_PARSE if the sizing scan needs
+ * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
* to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
* UTF-8
*/
if ( chain > (SSize_t) BRANCH_MAX_OFFSET
&& ! RExC_use_BRANCHJ)
{
+ /* XXX We could just redo this branch, but figuring out what
+ * bookkeeping needs to be reset is a pain */
REQUIRE_BRANCHJ(flagp, 0);
}
REGTAIL(pRExC_state, chain, latest);
*
* Returns 0 otherwise, with *flagp set to indicate why:
* TRYAGAIN if regatom() returns 0 with TRYAGAIN.
- * RESTART_PARSE if the sizing scan needs to be restarted, or'd with
+ * RESTART_PARSE if the parse needs to be restarted, or'd with
* NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
*/
STATIC regnode_offset
* function calling S_reg().
*
* The final possibility is that it is premature to be calling this function;
- * that pass1 needs to be restarted. This can happen when this changes from
+ * the parse needs to be restarted. This can happen when this changes from
* /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
* latter occurs only when the fourth possibility would otherwise be in
* effect, and is because one of those code points requires the pattern to be
SvREFCNT_dec_NN(substitute_parse);
if (! *node_p) {
- RETURN_X_ON_RESTART(FALSE, flags, flagp);
+ RETURN_FAIL_ON_RESTART(flags, flagp);
FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
(UV) flags);
}
return op + EXACTF;
}
-PERL_STATIC_INLINE void
-S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
- regnode_offset node, I32* flagp, STRLEN len,
- UV code_point, bool downgradable)
-{
- /* This knows the details about sizing an EXACTish node, setting flags for
- * it (by setting <*flagp>, and potentially populating it with a single
- * character.
- *
- * If <len> (the length in bytes) is non-zero, this function assumes that
- * the node has already been populated, and just does the sizing. In this
- * case <code_point> should be the final code point that has already been
- * placed into the node. This value will be ignored except that under some
- * circumstances <*flagp> is set based on it.
- *
- * If <len> is zero, the function assumes that the node is to contain only
- * the single character given by <code_point> and calculates what <len>
- * should be. In pass 1, it sizes the node appropriately. In pass 2, it
- * additionally will populate the node's STRING with <code_point> or its
- * fold if folding.
- *
- * In both cases <*flagp> is appropriately set
- *
- * It knows that under FOLD, the Latin Sharp S and UTF characters above
- * 255, must be folded (the former only when the rules indicate it can
- * match 'ss')
- *
- * When it does the populating, it looks at the flag 'downgradable'. If
- * true with a node that folds, it checks if the single code point
- * participates in a fold, and if not downgrades the node to an EXACT.
- * This helps the optimizer */
-
- bool len_passed_in = cBOOL(len != 0);
- U8 character[UTF8_MAXBYTES_CASE+1];
-
- PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
-
- if (! len_passed_in) {
- if (UTF) {
- if (UVCHR_IS_INVARIANT(code_point)) {
- if (LOC || ! FOLD) { /* /l defers folding until runtime */
- *character = (U8) code_point;
- }
- else { /* Here is /i and not /l. */
- *character = toFOLD((U8) code_point);
-
- /* We can downgrade to an EXACT node if this character
- * isn't a folding one. Note that this assumes that
- * nothing above Latin1 folds to some other invariant than
- * one of these alphabetics; otherwise we would also have
- * to check:
- * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
- * || ASCII_FOLD_RESTRICTED))
- */
- if (downgradable && PL_fold[code_point] == code_point) {
- OP(REGNODE_p(node)) = EXACT;
- }
- }
- len = 1;
- }
- else if (FOLD && (! LOC
- || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
- { /* Folding, and ok to do so now */
- UV folded = _to_uni_fold_flags(
- code_point,
- character,
- &len,
- FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
- ? FOLD_FLAGS_NOMIX_ASCII
- : 0));
- if (downgradable
- && folded == code_point /* This quickly rules out many
- cases, avoiding the
- _invlist_contains_cp() overhead
- for those. */
- && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
- {
- OP(REGNODE_p(node)) = (LOC)
- ? EXACTL
- : EXACT;
- }
- }
- else if (code_point <= MAX_UTF8_TWO_BYTE) {
-
- /* Not folding this cp, and can output it directly */
- *character = UTF8_TWO_BYTE_HI(code_point);
- *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
- len = 2;
- }
- else {
- uvchr_to_utf8( character, code_point);
- len = UTF8SKIP(character);
- }
- } /* Else pattern isn't UTF8. */
- else if (! FOLD) {
- *character = (U8) code_point;
- len = 1;
- } /* Else is folded non-UTF8 */
-#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
- || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
- || UNICODE_DOT_DOT_VERSION > 0)
- else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
-#else
- else if (1) {
-#endif
- /* We don't fold any non-UTF8 except possibly the Sharp s (see
- * comments at join_exact()); */
- *character = (U8) code_point;
- len = 1;
-
- /* Can turn into an EXACT node if we know the fold at compile time,
- * and it folds to itself and doesn't particpate in other folds */
- if (downgradable
- && ! LOC
- && PL_fold_latin1[code_point] == code_point
- && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
- || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
- {
- OP(REGNODE_p(node)) = EXACT;
- }
- } /* else is Sharp s. May need to fold it */
- else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
- *character = 's';
- *(character + 1) = 's';
- len = 2;
- }
- else {
- *character = LATIN_SMALL_LETTER_SHARP_S;
- len = 1;
- }
- }
-
- if (downgradable) {
- change_engine_size(pRExC_state, STR_SZ(len));
- }
-
- RExC_emit += STR_SZ(len);
- STR_LEN(REGNODE_p(node)) = len;
- if (! len_passed_in) {
- Copy((char *) character, STRING(REGNODE_p(node)), len, char);
- }
-
- *flagp |= HASWIDTH;
-
- /* A single character node is SIMPLE, except for the special-cased SHARP S
- * under /di. */
- if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
-#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
- || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
- || UNICODE_DOT_DOT_VERSION > 0)
- && ( code_point != LATIN_SMALL_LETTER_SHARP_S
- || ! FOLD || ! DEPENDS_SEMANTICS)
-#endif
- ) {
- *flagp |= SIMPLE;
- }
-
- if (OP(REGNODE_p(node)) == EXACTFL) {
- RExC_contains_locale = 1;
- }
-}
-
STATIC bool
S_new_regcurly(const char *s, const char *e)
{
at which any next regnode should be placed.
Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
- Returns 0, setting *flagp to RESTART_PARSE if the sizing scan needs to be
+ Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
Otherwise does not return 0.
FALSE, /* don't silence non-portable warnings. */
(bool) RExC_strict,
TRUE, /* Allow an optimized regnode result */
- NULL,
NULL);
if (ret == 0) {
- RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp, NEED_UTF8);
+ RETURN_FAIL_ON_RESTART_FLAGP(flagp);
FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
(UV) *flagp);
}
/* Special Escapes
This switch handles escape sequences that resolve to some kind
- of special regop and not to literal text. Escape sequnces that
+ of special regop and not to literal text. Escape sequences that
resolve to literal text are handled below in the switch marked
"Literal Escapes".
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, SBOL);
/* SBOL is shared with /^/ so we set the flags so we can tell
- * /\A/ from /^/ in split. We check ret because first pass we
- * have no regop struct to set the flags on. */
+ * /\A/ from /^/ in split. */
FLAGS(REGNODE_p(ret)) = 1;
*flagp |= SIMPLE;
goto finish_meta_pat;
/* FALLTHROUGH */
case 'b':
{
+ U8 flags = 0;
regex_charset charset = get_regex_charset(RExC_flags);
RExC_seen_zerolen++;
RExC_seen |= REG_LOOKBEHIND_SEEN;
op = BOUND + charset;
- if (op == BOUNDL) {
- RExC_contains_locale = 1;
- }
-
- ret = reg_node(pRExC_state, op);
- *flagp |= SIMPLE;
if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
- FLAGS(REGNODE_p(ret)) = TRADITIONAL_BOUND;
+ flags = TRADITIONAL_BOUND;
if (op > BOUNDA) { /* /aa is same as /a */
- OP(REGNODE_p(ret)) = BOUNDA;
+ op = BOUNDA;
}
}
else {
{
goto bad_bound_type;
}
- FLAGS(REGNODE_p(ret)) = GCB_BOUND;
+ flags = GCB_BOUND;
break;
case 'l':
if (length != 2 || *(RExC_parse + 1) != 'b') {
goto bad_bound_type;
}
- FLAGS(REGNODE_p(ret)) = LB_BOUND;
+ flags = LB_BOUND;
break;
case 's':
if (length != 2 || *(RExC_parse + 1) != 'b') {
goto bad_bound_type;
}
- FLAGS(REGNODE_p(ret)) = SB_BOUND;
+ flags = SB_BOUND;
break;
case 'w':
if (length != 2 || *(RExC_parse + 1) != 'b') {
goto bad_bound_type;
}
- FLAGS(REGNODE_p(ret)) = WB_BOUND;
+ flags = WB_BOUND;
break;
default:
bad_bound_type:
RExC_parse = endbrace;
REQUIRE_UNI_RULES(flagp, 0);
- if (op >= BOUNDA) { /* /aa is same as /a */
- OP(REGNODE_p(ret)) = BOUNDU;
+ if (op == BOUND) {
+ op = BOUNDU;
+ }
+ else if (op >= BOUNDA) { /* /aa is same as /a */
+ op = BOUNDU;
length += 4;
/* Don't have to worry about UTF-8, in this message because
}
}
+ if (op == BOUND) {
+ RExC_seen_d_op = TRUE;
+ }
+ else if (op == BOUNDL) {
+ RExC_contains_locale = 1;
+ }
+
if (invert) {
- OP(REGNODE_p(ret)) += NBOUND - BOUND;
+ op += NBOUND - BOUND;
}
+
+ ret = reg_node(pRExC_state, op);
+ FLAGS(REGNODE_p(ret)) = flags;
+
+ *flagp |= SIMPLE;
+
goto finish_meta_pat;
}
else if (op == POSIXL) {
RExC_contains_locale = 1;
}
+ else if (op == POSIXD) {
+ RExC_seen_d_op = TRUE;
+ }
join_posix_op_known:
non-portables */
(bool) RExC_strict,
TRUE, /* Allow an optimized regnode result */
- NULL,
NULL);
RETURN_FAIL_ON_RESTART_FLAGP(flagp);
/* regclass() can only return RESTART_PARSE and NEED_UTF8 if
? REFFL
: REFF),
num);
+ if (OP(REGNODE_p(ret)) == REFF) {
+ RExC_seen_d_op = TRUE;
+ }
*flagp |= HASWIDTH;
/* override incorrect value set in reganode MJD */
/* We start out as an EXACT node, even if under /i, until we find a
* character which is in a fold. The algorithm now segregates into
* separate nodes, characters that fold from those that don't under
- * /i. (This hopefull will create nodes that are fixed strings
- * even under /i, giving the optimizer something to grab onto to.)
+ * /i. (This hopefully will create nodes that are fixed strings
+ * even under /i, giving the optimizer something to grab on to.)
* So, if a node has something in it and the next character is in
* the opposite category, that node is closed up, and the function
* returns. Then regatom is called again, and a new node is
* created for the new category. */
U8 node_type = EXACT;
- /* Assume node will be fully used; the excess is given back at the end */
+ /* Assume the node will be fully used; the excess is given back at
+ * the end. We can't make any other length assumptions, as a byte
+ * input sequence could shrink down. */
Ptrdiff_t initial_size = STR_SZ(256);
bool next_is_quantifier;
/* We can convert EXACTF nodes to EXACTFU if they contain only
* characters that match identically regardless of the target
* string's UTF8ness. The reason to do this is that EXACTF is not
- * trie-able, EXACTFU is.
+ * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
+ * runtime.
*
* Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
* contain only above-Latin1 characters (hence must be in UTF8),
* which don't participate in folds with Latin1-range characters,
- * as the latter's folds aren't known until runtime. (We don't
- * need to figure this out until pass 2) */
- bool maybe_exactfu = TRUE;
+ * as the latter's folds aren't known until runtime. */
+ bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
- /* To see if RExC_uni_semantics changes during parsing of the node.
- * */
- bool uni_semantics_at_node_start;
+ /* Single-character EXACTish nodes are almost always SIMPLE. This
+ * allows us to override this as encountered */
+ U8 maybe_SIMPLE = SIMPLE;
+
+ /* Does this node contain something that can't match unless the
+ * target string is (also) in UTF-8 */
+ bool requires_utf8_target = FALSE;
+
+ /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
+ bool has_ss = FALSE;
+
+ /* So is the MICRO SIGN */
+ bool has_micro_sign = FALSE;
/* Allocate an EXACT node. The node_type may change below to
* another EXACTish node, but since the size of the node doesn't
|| UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
|| UTF8_IS_START(UCHARAT(RExC_parse)));
- uni_semantics_at_node_start = cBOOL(RExC_uni_semantics);
/* Here, we have a literal character. Find the maximal string of
* them in the input that we can fit into a single EXACTish node.
}
p = RExC_parse;
RExC_parse = parse_start;
- if (ender > 0xff) {
- REQUIRE_UTF8(flagp);
+
+ /* The \N{} means the pattern, if previously /d,
+ * becomes /u. That means it can't be an EXACTF node,
+ * but an EXACTFU */
+ if (node_type == EXACTF) {
+ node_type = EXACTFU;
+
+ /* If the node already contains something that
+ * differs between EXACTF and EXACTFU, reparse it
+ * as EXACTFU */
+ if (! maybe_exactfu) {
+ len = 0;
+ s = s0;
+ goto reparse;
+ }
}
+
break;
case 'r':
ender = '\r';
}
UPDATE_WARNINGS_LOC(p - 1);
ender = result;
- if (ender > 0xff) {
- REQUIRE_UTF8(flagp);
- }
break;
}
case 'x':
}
#endif
}
- else {
- REQUIRE_UTF8(flagp);
- }
break;
}
case 'c':
I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
STRLEN numlen = 3;
ender = grok_oct(p, &numlen, &flags, NULL);
- if (ender > 0xff) {
- REQUIRE_UTF8(flagp);
- }
p += numlen;
if ( isDIGIT(*p) /* like \08, \178 */
&& ckWARN(WARN_REGEXP)
/* Here, have looked at the literal character, and <ender>
* contains its ordinal; <p> points to the character after it.
- * We need to check if the next non-ignored thing is a
+ * */
+
+ if (ender > 255) {
+ REQUIRE_UTF8(flagp);
+ }
+
+ /* We need to check if the next non-ignored thing is a
* quantifier. Move <p> to after anything that should be
* ignored, which, as a side effect, positions <p> for the next
* loop iteration */
if (! FOLD) { /* The simple case, just append the literal */
not_fold_common:
- if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
+ if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
+ *(s++) = (char) ender;
+ }
+ else {
U8 * new_s = uvchr_to_utf8((U8*)s, ender);
added_len = (char *) new_s - s;
s = (char *) new_s;
- }
- else {
- *(s++) = (char) ender;
+
+ if (ender > 255) {
+ requires_utf8_target = TRUE;
+ }
}
}
else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
* existing node, so can start a new node with this one */
if (! len) {
node_type = EXACTFL;
+ RExC_contains_locale = 1;
}
else if (node_type == EXACT) {
p = oldp;
goto loopdone;
}
- /* This code point means we can't simplify things */
+ /* This problematic code point means we can't simplify
+ * things */
maybe_exactfu = FALSE;
- /* A problematic code point in this context means that its
- * fold isn't known until runtime, so we can't fold it now.
- * (The non-problematic code points are the above-Latin1
- * ones that fold to also all above-Latin1. Their folds
- * don't vary no matter what the locale is.) But here we
- * have characters whose fold depends on the locale.
- * Unlike the non-folding case above, we have to keep track
- * of these in the sizing pass, so that we can make sure we
- * don't split too-long nodes in the middle of a potential
- * multi-char fold. And unlike the regular fold case
- * handled in the else clauses below, we don't actually
- * fold and don't have special cases to consider. What we
- * do for both passes is the PASS2 code for non-folding */
+ /* Here, we are adding a problematic fold character.
+ * "Problematic" in this context means that its fold isn't
+ * known until runtime. (The non-problematic code points
+ * are the above-Latin1 ones that fold to also all
+ * above-Latin1. Their folds don't vary no matter what the
+ * locale is.) But here we have characters whose fold
+ * depends on the locale. We just add in the unfolded
+ * character, and wait until runtime to fold it */
goto not_fold_common;
}
- else /* A regular FOLD code point */
- if (! UTF)
+ else /* regular fold; see if actually is in a fold */
+ if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
+ || (ender > 255
+ && ! _invlist_contains_cp(PL_in_some_fold, ender)))
{
- /* Here, are folding and are not UTF-8 encoded; therefore
- * the character must be in the range 0-255, and is not /l.
- * (Not /l because we already handled these under /l in
- * is_PROBLEMATIC_LOCALE_FOLD_cp) */
- if (! IS_IN_SOME_FOLD_L1(ender)) {
-
- /* Start a new node for this non-folding character if
- * previous ones in the node were folded */
- if (len && node_type != EXACT) {
- p = oldp;
- goto loopdone;
- }
-
- *(s++) = (char) ender;
+ /* Here, folding, but the character isn't in a fold.
+ *
+ * Start a new node if previous characters in the node were
+ * folded */
+ if (len && node_type != EXACT) {
+ p = oldp;
+ goto loopdone;
}
- else { /* Here, does participate in some fold */
-
- /* if this is the first character in the node, change
- * its type to folding. Otherwise, if this is the
- * first folding character in the node, close up the
- * existing node, so can start a new node with this
- * one. */
- if (! len) {
- node_type = compute_EXACTish(pRExC_state);
- }
- else if (node_type == EXACT) {
- p = oldp;
- goto loopdone;
- }
- /* See if the character's fold differs between /d and
- * /u. On non-ancient Unicode versions, this includes
- * the multi-char fold SHARP S to 'ss' */
+ /* Here, continuing a node with non-folded characters. Add
+ * this one */
+ goto not_fold_common;
+ }
+ else { /* Here, does participate in some fold */
-#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
- || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
- || UNICODE_DOT_DOT_VERSION > 0)
-
- if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
-
- /* See comments for join_exact() as to why we fold
- * this non-UTF at compile time */
- if (node_type == EXACTFU) {
- *(s++) = 's';
+ /* If this is the first character in the node, change its
+ * type to folding. Otherwise, if this is the first
+ * folding character in the node, close up the existing
+ * node, so can start a new node with this one. */
+ if (! len) {
+ node_type = compute_EXACTish(pRExC_state);
+ }
+ else if (node_type == EXACT) {
+ p = oldp;
+ goto loopdone;
+ }
- /* Let the code below add in the extra 's' */
- ender = 's';
- added_len = 2;
- }
- else if ( uni_semantics_at_node_start
- != RExC_uni_semantics)
+ if (UTF) { /* Use the folded value */
+ if (UVCHR_IS_INVARIANT(ender)) {
+ *(s)++ = (U8) toFOLD(ender);
+ }
+ else {
+ ender = _to_uni_fold_flags(
+ ender,
+ (U8 *) s,
+ &added_len,
+ FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
+ s += added_len;
+
+ if ( ender > 255
+ && LIKELY(ender != GREEK_SMALL_LETTER_MU))
{
- /* Here, we are supossed to be using Unicode
- * rules, but this folding node is not. This
- * happens during pass 1 when the node started
- * out not under Unicode rules, but a \N{} was
- * encountered during the processing of it,
- * causing Unicode rules to be switched into.
- * Pass 1 continues uninterrupted, as by the
- * time we get to pass 2, we will know enough
- * to generate the correct folds. Except in
- * this one case, we need to restart the node,
- * because the fold of the sharp s requires 2
- * characters, and the sizing needs to account
- * for that. */
- p = oldp;
- goto loopdone;
- }
- else {
- RExC_seen_unfolded_sharp_s = 1;
- maybe_exactfu = FALSE;
+ /* U+B5 folds to the MU, so its possible for a
+ * non-UTF-8 target to match it */
+ requires_utf8_target = TRUE;
}
}
- else if ( len
- && isALPHA_FOLD_EQ(ender, 's')
- && isALPHA_FOLD_EQ(*(s-1), 's'))
- {
- maybe_exactfu = FALSE;
- }
- else
-#endif
+ }
+ else {
+ /* Here is non-UTF8. First, see if the character's
+ * fold differs between /d and /u. */
if (PL_fold[ender] != PL_fold_latin1[ender]) {
maybe_exactfu = FALSE;
}
- /* Even when folding, we store just the input
- * character, as we have an array that finds its fold
- * quickly */
- *(s++) = (char) ender;
- }
- }
- else { /* FOLD, and UTF */
- /* Unlike the non-fold case, we do actually have to
- * calculate the fold in pass 1. This is for two reasons,
- * the folded length may be longer than the unfolded, and
- * we have to calculate how many EXACTish nodes it will
- * take; and we may run out of room in a node in the middle
- * of a potential multi-char fold, and have to back off
- * accordingly. */
-
- if (isASCII_uni(ender)) {
-
- /* As above, we close up and start a new node if the
- * previous characters don't match the fold/non-fold
- * state of this one. And if this is the first
- * character in the node, and it folds, we change the
- * node away from being EXACT */
- if (! IS_IN_SOME_FOLD_L1(ender)) {
- if (len && node_type != EXACT) {
- p = oldp;
- goto loopdone;
- }
+#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
+ || UNICODE_DOT_DOT_VERSION > 0)
- *(s)++ = (U8) ender;
- }
- else { /* Is in a fold */
+ /* On non-ancient Unicode versions, this includes the
+ * multi-char fold SHARP S to 'ss' */
- if (! len) {
- node_type = compute_EXACTish(pRExC_state);
- }
- else if (node_type == EXACT) {
- p = oldp;
- goto loopdone;
- }
+ if ( UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)
+ || ( isALPHA_FOLD_EQ(ender, 's')
+ && len > 0
+ && isALPHA_FOLD_EQ(*(s-1), 's')))
+ {
+ /* Here, we have one of the following:
+ * a) a SHARP S. This folds to 'ss' only under
+ * /u rules. If we are in that situation,
+ * fold the SHARP S to 'ss'. See the comments
+ * for join_exact() as to why we fold this
+ * non-UTF at compile time, and no others.
+ * b) 'ss'. When under /u, there's nothing
+ * special needed to be done here. The
+ * previous iteration handled the first 's',
+ * and this iteration will handle the second.
+ * If, on the otherhand it's not /u, we have
+ * to exclude the possibility of moving to /u,
+ * so that we won't generate an unwanted
+ * match, unless, at runtime, the target
+ * string is in UTF-8.
+ * */
- *(s)++ = (U8) toFOLD(ender);
- }
- }
- else { /* Not ASCII */
- STRLEN foldlen;
-
- /* As above, we close up and start a new node if the
- * previous characters don't match the fold/non-fold
- * state of this one. And if this is the first
- * character in the node, and it folds, we change the
- * node away from being EXACT */
- if (! _invlist_contains_cp(PL_utf8_foldable, ender)) {
- if (len && node_type != EXACT) {
- p = oldp;
- goto loopdone;
+ has_ss = TRUE;
+ maybe_exactfu = FALSE; /* Can't generate an
+ EXACTFU node (unless we
+ already are in one) */
+ if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
+ maybe_SIMPLE = 0;
+ if (node_type == EXACTFU) {
+ *(s++) = 's';
+
+ /* Let the code below add in the extra 's' */
+ ender = 's';
+ added_len = 2;
+ }
}
-
- s = (char *) uvchr_to_utf8((U8 *) s, ender);
- added_len = UVCHR_SKIP(ender);
}
- else {
-
- if (! len) {
- node_type = compute_EXACTish(pRExC_state);
- }
- else if (node_type == EXACT) {
- p = oldp;
- goto loopdone;
- }
+#endif
- ender = _to_uni_fold_flags(
- ender,
- (U8 *) s,
- &foldlen,
- FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
- ? FOLD_FLAGS_NOMIX_ASCII
- : 0));
- s += foldlen;
- added_len = foldlen;
+ else if (UNLIKELY(ender == MICRO_SIGN)) {
+ has_micro_sign = TRUE;
}
+
+ *(s++) = (char) (DEPENDS_SEMANTICS)
+ ? toFOLD(ender)
+
+ /* Under /u, the fold of any
+ * character in the 0-255 range
+ * happens to be its lowercase
+ * equivalent, except for LATIN SMALL
+ * LETTER SHARP S, which was handled
+ * above, and the MICRO SIGN, whose
+ * fold requires UTF-8 to represent.
+ * */
+ : toLOWER_L1(ender);
}
- }
+ } /* End of adding current character to the node */
len += added_len;
* identifies, so when it is set to less than the full node, we can
* skip the rest of this */
if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
+ PERL_UINT_FAST8_T backup_count = 0;
const STRLEN full_len = len;
assert(len >= MAX_NODE_STRING_SIZE);
- /* Here, <s> points to the final byte of the final character.
- * Look backwards through the string until find a non-
- * problematic character */
+ /* Here, <s> points to just beyond where we have output the
+ * final character of the node. Look backwards through the
+ * string until find a non- problematic character */
if (! UTF) {
goto loopdone;
}
- while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
+ while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) {
+ backup_count++;
+ }
len = s - s0 + 1;
}
else {
}
}
else if (! _invlist_contains_cp(
- PL_NonL1NonFinalFold,
+ PL_NonFinalFold,
valid_utf8_to_uvchr((U8 *) s, NULL)))
{
break;
* special case the very first byte in the string, so
* we don't read outside the string */
s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
+ backup_count++;
} /* End of loop backwards through the string */
/* If there were only problematic characters in the string,
if (len == 0) {
len = full_len;
- /* If the node ends in an 's' we make sure it stays EXACTF,
- * as if it turns into an EXACTFU, it could later get
- * joined with another 's' that would then wrongly match
- * the sharp s */
- if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
- {
- maybe_exactfu = FALSE;
- }
} else {
/* Here, the node does contain some characters that aren't
- * problematic. If one such is the final character in the
- * node, we are done */
- if (len == full_len) {
+ * problematic. If we didn't have to backup any, then the
+ * final character in the node is non-problematic, and we
+ * can take the node as-is */
+ if (backup_count == 0) {
goto loopdone;
}
- else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
+ else if (backup_count == 1) {
/* If the final character is problematic, but the
* penultimate is not, back-off that last character to
loopdone: /* Jumped to when encounters something that shouldn't be
in the node */
+ /* Free up any over-allocated space */
change_engine_size(pRExC_state, - (initial_size - STR_SZ(len)));
/* I (khw) don't know if you can get here with zero length, but the
OP(REGNODE_p(ret)) = NOTHING;
}
else {
- OP(REGNODE_p(ret)) = node_type;
/* If the node type is EXACT here, check to see if it
- * should be EXACTL. */
+ * should be EXACTL, or EXACT_ONLY8. */
if (node_type == EXACT) {
if (LOC) {
- OP(REGNODE_p(ret)) = EXACTL;
+ node_type = EXACTL;
}
- }
+ else if (requires_utf8_target) {
+ node_type = EXACT_ONLY8;
+ }
+ } else if (FOLD) {
+ if ( UNLIKELY(has_micro_sign || has_ss)
+ && (node_type == EXACTFU || ( node_type == EXACTF
+ && maybe_exactfu)))
+ { /* These two conditions are problematic in non-UTF-8
+ EXACTFU nodes. */
+ assert(! UTF);
+ node_type = EXACTFUP;
+ }
+ else if (node_type == EXACTFL) {
- if (FOLD) {
- /* If 'maybe_exactfu' is set, then there are no code points
- * that match differently depending on UTF8ness of the
- * target string (for /u), or depending on locale for /l */
- if (maybe_exactfu) {
- if (node_type == EXACTF) {
- OP(REGNODE_p(ret)) = EXACTFU;
+ /* 'maybe_exactfu' is deliberately set above to
+ * indicate this node type, where all code points in it
+ * are above 255 */
+ if (maybe_exactfu) {
+ node_type = EXACTFLU8;
}
- else if (node_type == EXACTFL) {
- OP(REGNODE_p(ret)) = EXACTFLU8;
+ }
+ else if (node_type == EXACTF) { /* Means is /di */
+
+ /* If 'maybe_exactfu' is clear, then we need to stay
+ * /di. If it is set, it means there are no code
+ * points that match differently depending on UTF8ness
+ * of the target string, so it can become an EXACTFU
+ * node */
+ if (! maybe_exactfu) {
+ RExC_seen_d_op = TRUE;
+ }
+ else if ( isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's')
+ || isALPHA_FOLD_EQ(ender, 's'))
+ {
+ /* But, if the node begins or ends in an 's' we
+ * have to defer changing it into an EXACTFU, as
+ * the node could later get joined with another one
+ * that ends or begins with 's' creating an 'ss'
+ * sequence which would then wrongly match the
+ * sharp s without the target being UTF-8. We
+ * create a special node that we resolve later when
+ * we join nodes together */
+
+ node_type = EXACTFU_S_EDGE;
+ }
+ else {
+ node_type = EXACTFU;
}
}
+
+ if (requires_utf8_target && node_type == EXACTFU) {
+ node_type = EXACTFU_ONLY8;
+ }
+ }
+
+ OP(REGNODE_p(ret)) = node_type;
+ STR_LEN(REGNODE_p(ret)) = len;
+ RExC_emit += STR_SZ(len);
+
+ /* If the node isn't a single character, it can't be SIMPLE */
+ if (len > ((UTF) ? UVCHR_SKIP(ender) : 1)) {
+ maybe_SIMPLE = 0;
}
- alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
- FALSE /* Don't look to see if could
- be turned into an EXACT
- node, as we have already
- computed that */
- );
+ *flagp |= HASWIDTH | maybe_SIMPLE;
}
- RExC_parse = p - 1;
- Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
- RExC_parse = p;
+ Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
+ RExC_parse = p;
+
{
/* len is STRLEN which is unsigned, need to copy to signed */
IV iv = len;
PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
assert(PL_regkind[OP(node)] == ANYOF);
+ /* There is no bitmap for this node type */
+ if (OP(node) == ANYOFH) {
+ return;
+ }
+
ANYOF_BITMAP_ZERO(node);
if (*invlist_ptr) {
* In b) there may be errors or warnings generated. If 'check_only' is
* TRUE, then any errors are discarded. Warnings are returned to the
* caller via an AV* created into '*posix_warnings' if it is not NULL. If
- * instead it is NULL, warnings are suppressed. This is done in all
- * passes. The reason for this is that the rest of the parsing is heavily
- * dependent on whether this routine found a valid posix class or not. If
- * it did, the closing ']' is absorbed as part of the class. If no class,
- * or an invalid one is found, any ']' will be considered the terminator of
- * the outer bracketed character class, leading to very different results.
- * In particular, a '(?[ ])' construct will likely have a syntax error if
- * the class is parsed other than intended, and this will happen in pass1,
- * before the warnings would normally be output. This mechanism allows the
- * caller to output those warnings in pass1 just before dieing, giving a
- * much better clue as to what is wrong.
+ * instead it is NULL, warnings are suppressed.
*
* The reason for this function, and its complexity is that a bracketed
* character class can contain just about anything. But it's easy to
case '(':
- if ( RExC_parse < RExC_end - 1
- && (UCHARAT(RExC_parse + 1) == '?'))
+ if ( RExC_parse < RExC_end - 2
+ && UCHARAT(RExC_parse + 1) == '?'
+ && UCHARAT(RExC_parse + 2) == '^')
{
- /* If is a '(?', could be an embedded '(?flags:(?[...])'.
+ /* If is a '(?', could be an embedded '(?^flags:(?[...])'.
* This happens when we have some thing like
*
* my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
RExC_parse += 2; /* Skip past the '(?' */
save_parse = RExC_parse;
- /* Parse any flags for the '(?' */
+ /* Parse the flags for the '(?'. We already know the first
+ * flag to parse is a '^' */
parse_lparen_question_flags(pRExC_state);
- if (RExC_parse == save_parse /* Makes sure there was at
- least one flag (or else
- this embedding wasn't
- compiled) */
- || RExC_parse >= RExC_end - 4
+ if ( RExC_parse >= RExC_end - 4
|| UCHARAT(RExC_parse) != ':'
|| UCHARAT(++RExC_parse) != '('
|| UCHARAT(++RExC_parse) != '?'
/* In combination with the above, this moves the
* pointer to the point just after the first erroneous
- * character (or if there are no flags, to where they
- * should have been) */
+ * character. */
if (RExC_parse >= RExC_end - 4) {
RExC_parse = RExC_end;
}
if (UCHARAT(RExC_parse) != ')')
vFAIL("Expecting close paren for wrapper for nested extended charclass");
- RExC_parse++;
RExC_flags = save_flags;
goto handle_operand;
}
FALSE, /* don't silence non-portable warnings. */
TRUE, /* strict */
FALSE, /* Require return to be an ANYOF */
- ¤t,
- NULL))
+ ¤t))
{
FAIL2("panic: regclass returned failure to handle_sets, "
"flags=%#" UVxf, (UV) *flagp);
TRUE, /* silence non-portable warnings. */
TRUE, /* strict */
FALSE, /* Require return to be an ANYOF */
- ¤t,
- NULL
- ))
+ ¤t))
{
FAIL2("panic: regclass returned failure to handle_sets, "
"flags=%#" UVxf, (UV) *flagp);
they're valid on this machine */
FALSE, /* similarly, no need for strict */
FALSE, /* Require return to be an ANYOF */
- NULL,
NULL
);
default: /* Other code points are checked against the data for the
current Unicode version */
{
- Size_t folds_to_count;
- unsigned int first_folds_to;
- const unsigned int * remaining_folds_to_list;
+ Size_t folds_count;
+ unsigned int first_fold;
+ const unsigned int * remaining_folds;
UV folded_cp;
if (isASCII(cp)) {
*invlist = add_cp_to_invlist(*invlist, folded_cp);
}
- folds_to_count = _inverse_folds(folded_cp, &first_folds_to,
- &remaining_folds_to_list);
- if (folds_to_count == 0) {
+ folds_count = _inverse_folds(folded_cp, &first_fold,
+ &remaining_folds);
+ if (folds_count == 0) {
/* Use deprecated warning to increase the chances of this being
* output */
else {
unsigned int i;
- if (first_folds_to > 255) {
- *invlist = add_cp_to_invlist(*invlist, first_folds_to);
+ if (first_fold > 255) {
+ *invlist = add_cp_to_invlist(*invlist, first_fold);
}
- for (i = 0; i < folds_to_count - 1; i++) {
- if (remaining_folds_to_list[i] > 255) {
+ for (i = 0; i < folds_count - 1; i++) {
+ if (remaining_folds[i] > 255) {
*invlist = add_cp_to_invlist(*invlist,
- remaining_folds_to_list[i]);
+ remaining_folds[i]);
}
}
}
}
STATIC void
-S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings)
+S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
{
- /* If the final parameter is NULL, output the elements of the array given
- * by '*posix_warnings' as REGEXP warnings. Otherwise, the elements are
- * pushed onto it, (creating if necessary) */
+ /* Output the elements of the array given by '*posix_warnings' as REGEXP
+ * warnings. */
SV * msg;
- const bool first_is_fatal = ! return_posix_warnings
- && ckDEAD(packWARN(WARN_REGEXP));
+ const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
+
+ PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
- PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS;
+ if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
+ return;
+ }
while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
- if (return_posix_warnings) {
- if (! *return_posix_warnings) { /* mortalize to not leak if
- warnings are fatal */
- *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV());
- }
- av_push(*return_posix_warnings, msg);
- }
- else {
- if (first_is_fatal) { /* Avoid leaking this */
- av_undef(posix_warnings); /* This isn't necessary if the
- array is mortal, but is a
- fail-safe */
- (void) sv_2mortal(msg);
- if (ckDEAD(packWARN(WARN_REGEXP))) {
- PREPARE_TO_DIE;
- }
- }
- if (TO_OUTPUT_WARNINGS(RExC_parse)) {
- Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s",
- SvPVX(msg));
- }
- SvREFCNT_dec_NN(msg);
+ if (first_is_fatal) { /* Avoid leaking this */
+ av_undef(posix_warnings); /* This isn't necessary if the
+ array is mortal, but is a
+ fail-safe */
+ (void) sv_2mortal(msg);
+ PREPARE_TO_DIE;
}
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
+ SvREFCNT_dec_NN(msg);
}
- if (! return_posix_warnings) {
- UPDATE_WARNINGS_LOC(RExC_parse);
- }
+ UPDATE_WARNINGS_LOC(RExC_parse);
}
STATIC AV *
const bool strict,
bool optimizable, /* ? Allow a non-ANYOF return
node */
- SV** ret_invlist, /* Return an inversion list, not a node */
- AV** return_posix_warnings
+ SV** ret_invlist /* Return an inversion list, not a node */
)
{
/* parse a bracketed class specification. Most of these will produce an
*
* ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
* characters, with the corresponding bit set if that character is in the
- * list. For characters above this, a range list or swash is used. There
+ * list. For characters above this, an inversion list is used. There
* are extra bits for \w, etc. in locale ANYOFs, as what these match is not
* determinable at compile time
*
* On success, returns the offset at which any next node should be placed
* into the regex engine program being compiled.
*
- * Returns 0 otherwise, setting flagp to RESTART_PARSE if the sizing scan needs
+ * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
* to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
* UTF-8
*/
STRLEN numlen;
int namedclass = OOB_NAMEDCLASS;
char *rangebegin = NULL;
- bool need_class = 0;
- SV *listsv = NULL;
+ SV *listsv = NULL; /* List of \p{user-defined} whose definitions
+ aren't available at the time this was called */
STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
than just initialized. */
SV* properties = NULL; /* Code points that match \p{} \P{} */
const bool skip_white = cBOOL( ret_invlist
|| (RExC_flags & RXf_PMf_EXTENDED_MORE));
- /* Unicode properties are stored in a swash; this holds the current one
- * being parsed. If this swash is the only above-latin1 component of the
- * character class, an optimization is to pass it directly on to the
- * execution engine. Otherwise, it is set to NULL to indicate that there
- * are other things in the class that have to be dealt with at execution
- * time */
- SV* swash = NULL; /* Code points that match \p{} \P{} */
-
- /* Set if a component of this character class is user-defined; just passed
- * on to the engine */
- bool has_user_defined_property = FALSE;
-
/* inversion list of code points this node matches only when the target
* string is in UTF-8. These are all non-ASCII, < 256. (Because is under
* /d) */
- SV* has_upper_latin1_only_utf8_matches = NULL;
+ SV* upper_latin1_only_utf8_matches = NULL;
/* Inversion list of code points this node matches regardless of things
* like locale, folding, utf8ness of the target string */
bool warn_super = ALWAYS_WARN_SUPER;
- const regnode_offset orig_emit = RExC_emit; /* Save the original RExC_emit in
- case we need to change the emitted regop to an EXACT. */
const char * orig_parse = RExC_parse;
- bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
/* This variable is used to mark where the end in the input is of something
* that looks like a POSIX construct but isn't. During the parse, when
char *not_posix_region_end = RExC_parse - 1;
AV* posix_warnings = NULL;
- const bool do_posix_warnings = return_posix_warnings || ckWARN(WARN_REGEXP);
+ const bool do_posix_warnings = ckWARN(WARN_REGEXP);
U8 op = END; /* The returned node-type, initialized to an impossible
one. */
U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
U32 posixl = 0; /* bit field of posix classes matched under /l */
- bool use_anyofd = FALSE; /* ? Is this to be an ANYOFD node */
+
+
+/* Flags as to what things aren't knowable until runtime. (Note that these are
+ * mutually exclusive.) */
+#define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that
+ haven't been defined as of yet */
+#define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is
+ UTF-8 or not */
+#define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and
+ what gets folded */
+ U32 has_runtime_dependency = 0; /* OR of the above flags */
GET_RE_DEBUG_FLAGS_DECL;
PERL_UNUSED_ARG(depth);
#endif
+
+ /* If wants an inversion list returned, we can't optimize to something
+ * else. */
+ if (ret_invlist) {
+ optimizable = FALSE;
+ }
+
DEBUG_PARSE("clas");
#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
allow_multi_folds = FALSE;
#endif
- listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
+ /* We include the /i status at the beginning of this so that we can
+ * know it at runtime */
+ listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
initial_listsv_len = SvCUR(listsv);
SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
{
/* Warnings about posix class issues are considered tentative until
* we are far enough along in the parse that we can no longer
- * change our mind, at which point we either output them or add
- * them, if it has so specified, to what gets returned to the
- * caller. This is done each time through the loop so that a later
- * class won't zap them before they have been dealt with. */
- output_or_return_posix_warnings(pRExC_state, posix_warnings,
- return_posix_warnings);
+ * change our mind, at which point we output them. This is done
+ * each time through the loop so that a later class won't zap them
+ * before they have been dealt with. */
+ output_posix_warnings(pRExC_state, posix_warnings);
}
if (RExC_parse >= stop_ptr) {
case 'P':
{
char *e;
- char *i;
-
- /* We will handle any undefined properties ourselves */
- U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
- /* And we actually would prefer to get
- * the straight inversion list of the
- * swash, since we will be accessing it
- * anyway, to save a little time */
- |_CORE_SWASH_INIT_ACCEPT_INVLIST;
-
- SvREFCNT_dec(swash); /* Free any left-overs */
/* \p means they want Unicode semantics */
REQUIRE_UNI_RULES(flagp, 0);
}
{
char* name = RExC_parse;
- char* base_name; /* name after any packages are stripped */
- char* lookup_name = NULL;
- const char * const colon_colon = "::";
- bool invert;
-
- SV* invlist;
-
- /* Temporary workaround for [perl #133136]. For this
- * precise input that is in the .t that is failing, load
- * utf8.pm, which is what the test wants, so that that
- * .t passes */
- if ( memEQs(RExC_start, e + 1 - RExC_start,
- "foo\\p{Alnum}")
- && ! hv_common(GvHVn(PL_incgv),
- NULL,
- "utf8.pm", sizeof("utf8.pm") - 1,
- 0, HV_FETCH_ISEXISTS, NULL, 0))
- {
- require_pv("utf8.pm");
- }
- invlist = parse_uniprop_string(name, n, FOLD, &invert);
- if (invlist) {
- if (invert) {
- value ^= 'P' ^ 'p';
- }
- }
- else {
- /* Try to get the definition of the property into
- * <invlist>. If /i is in effect, the effective property
- * will have its name be <__NAME_i>. The design is
- * discussed in commit
- * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
- name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
- SAVEFREEPV(name);
-
- for (i = RExC_parse; i < RExC_parse + n; i++) {
- if (isCNTRL(*i) && *i != '\t') {
- RExC_parse = e + 1;
- vFAIL2("Can't find Unicode property definition \"%s\"", name);
+ /* Any message returned about expanding the definition */
+ SV* msg = newSVpvs_flags("", SVs_TEMP);
+
+ /* If set TRUE, the property is user-defined as opposed to
+ * official Unicode */
+ bool user_defined = FALSE;
+
+ SV * prop_definition = parse_uniprop_string(
+ name, n, UTF, FOLD,
+ FALSE, /* This is compile-time */
+ &user_defined,
+ msg,
+ 0 /* Base level */
+ );
+ if (SvCUR(msg)) { /* Assumes any error causes a msg */
+ assert(prop_definition == NULL);
+ RExC_parse = e + 1;
+ if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
+ thing so, or else the display is
+ mojibake */
+ RExC_utf8 = TRUE;
}
+ /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
+ vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
+ SvCUR(msg), SvPVX(msg)));
}
- if (FOLD) {
- lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
-
- /* The function call just below that uses this can fail
- * to return, leaking memory if we don't do this */
- SAVEFREEPV(lookup_name);
- }
-
- /* Look up the property name, and get its swash and
- * inversion list, if the property is found */
- swash = _core_swash_init("utf8",
- (lookup_name)
- ? lookup_name
- : name,
- &PL_sv_undef,
- 1, /* binary */
- 0, /* not tr/// */
- NULL, /* No inversion list */
- &swash_init_flags
- );
- if (! swash || ! (invlist = _get_swash_invlist(swash))) {
- HV* curpkg = (IN_PERL_COMPILETIME)
- ? PL_curstash
- : CopSTASH(PL_curcop);
- UV final_n = n;
- bool has_pkg;
-
- if (swash) { /* Got a swash but no inversion list.
- Something is likely wrong that will
- be sorted-out later */
- SvREFCNT_dec_NN(swash);
- swash = NULL;
- }
+ if (! is_invlist(prop_definition)) {
- /* Here didn't find it. It could be a an error (like a
- * typo) in specifying a Unicode property, or it could
- * be a user-defined property that will be available at
- * run-time. The names of these must begin with 'In'
- * or 'Is' (after any packages are stripped off). So
- * if not one of those, or if we accept only
- * compile-time properties, is an error; otherwise add
- * it to the list for run-time look up. */
- if ((base_name = rninstr(name, name + n,
- colon_colon, colon_colon + 2)))
- { /* Has ::. We know this must be a user-defined
- property */
- base_name += 2;
- final_n -= base_name - name;
- has_pkg = TRUE;
+ /* Here, the definition isn't known, so we have gotten
+ * returned a string that will be evaluated if and when
+ * encountered at runtime. We add it to the list of
+ * such properties, along with whether it should be
+ * complemented or not */
+ if (value == 'P') {
+ sv_catpvs(listsv, "!");
}
else {
- base_name = name;
- has_pkg = FALSE;
- }
-
- if ( final_n < 3
- || base_name[0] != 'I'
- || (base_name[1] != 's' && base_name[1] != 'n')
- || ret_invlist)
- {
- const char * const msg
- = (has_pkg)
- ? "Illegal user-defined property name"
- : "Can't find Unicode property definition";
- RExC_parse = e + 1;
-
- /* diag_listed_as: Can't find Unicode property definition "%s" */
- vFAIL3utf8f("%s \"%" UTF8f "\"",
- msg, UTF8fARG(UTF, n, name));
+ sv_catpvs(listsv, "+");
}
+ sv_catsv(listsv, prop_definition);
- /* If the property name doesn't already have a package
- * name, add the current one to it so that it can be
- * referred to outside it. [perl #121777] */
- if (! has_pkg && curpkg) {
- char* pkgname = HvNAME(curpkg);
- if (memNEs(pkgname, HvNAMELEN(curpkg), "main")) {
- char* full_name = Perl_form(aTHX_
- "%s::%s",
- pkgname,
- name);
- n = strlen(full_name);
- name = savepvn(full_name, n);
- SAVEFREEPV(name);
- }
- }
- Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%" UTF8f "%s\n",
- (value == 'p' ? '+' : '!'),
- (FOLD) ? "__" : "",
- UTF8fARG(UTF, n, name),
- (FOLD) ? "_i" : "");
- has_user_defined_property = TRUE;
- optimizable = FALSE; /* Will have to leave this an
- ANYOF node */
+ has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
/* We don't know yet what this matches, so have to flag
* it */
anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
}
else {
+ assert (prop_definition && is_invlist(prop_definition));
- /* Here, did get the swash and its inversion list. If
- * the swash is from a user-defined property, then this
- * whole character class should be regarded as such */
- if (swash_init_flags
- & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
+ /* Here we do have the complete property definition
+ *
+ * Temporary workaround for [perl #133136]. For this
+ * precise input that is in the .t that is failing,
+ * load utf8.pm, which is what the test wants, so that
+ * that .t passes */
+ if ( memEQs(RExC_start, e + 1 - RExC_start,
+ "foo\\p{Alnum}")
+ && ! hv_common(GvHVn(PL_incgv),
+ NULL,
+ "utf8.pm", sizeof("utf8.pm") - 1,
+ 0, HV_FETCH_ISEXISTS, NULL, 0))
{
- has_user_defined_property = TRUE;
+ require_pv("utf8.pm");
}
- }
- }
- if (invlist) {
- if (! has_user_defined_property &&
+
+ if (! user_defined &&
/* We warn on matching an above-Unicode code point
* if the match would return true, except don't
* warn for \p{All}, which has exactly one element
* = 0 */
- (_invlist_contains_cp(invlist, 0x110000)
- && (! (_invlist_len(invlist) == 1
- && *invlist_array(invlist) == 0))))
+ (_invlist_contains_cp(prop_definition, 0x110000)
+ && (! (_invlist_len(prop_definition) == 1
+ && *invlist_array(prop_definition) == 0))))
{
warn_super = TRUE;
}
/* Invert if asking for the complement */
if (value == 'P') {
_invlist_union_complement_2nd(properties,
- invlist,
+ prop_definition,
&properties);
-
- /* The swash can't be used as-is, because we've
- * inverted things; delay removing it to here after
- * have copied its invlist above */
- if (! swash) {
- SvREFCNT_dec_NN(invlist);
- }
- SvREFCNT_dec(swash);
- swash = NULL;
}
else {
- _invlist_union(properties, invlist, &properties);
- if (! swash) {
- SvREFCNT_dec_NN(invlist);
- }
+ _invlist_union(properties, prop_definition, &properties);
}
}
}
SV* scratch_list = NULL;
/* What the Posix classes (like \w, [:space:]) match in locale
- * isn't knowable under locale until actual match time. Room
- * must be reserved (one time per outer bracketed class) to
- * store such classes. The space will contain a bit for each
- * named class that is to be matched against. This isn't
- * needed for \p{} and pseudo-classes, as they are not affected
- * by locale, and hence are dealt with separately */
- if (! need_class) {
- need_class = 1;
- anyof_flags |= ANYOF_MATCHES_POSIXL;
-
- /* We can't change this into some other type of node
- * (unless this is the only element, in which case there
- * are nodes that mean exactly this) as has runtime
- * dependencies */
- optimizable = FALSE;
- }
-
- /* Coverity thinks it is possible for this to be negative; both
- * jhi and khw think it's not, but be safer */
- assert(! (anyof_flags & ANYOF_MATCHES_POSIXL)
- || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
-
- /* See if it already matches the complement of this POSIX
- * class */
- if ( (anyof_flags & ANYOF_MATCHES_POSIXL)
- && POSIXL_TEST(posixl, namedclass + ((namedclass % 2)
- ? -1
- : 1)))
- {
- posixl_matches_all = TRUE;
- break; /* No need to continue. Since it matches both
- e.g., \w and \W, it matches everything, and the
- bracketed class can be optimized into qr/./s */
- }
-
- /* Add this class to those that should be checked at runtime */
+ * isn't knowable under locale until actual match time. A
+ * special node is used for these which has extra space for a
+ * bitmap, with a bit reserved for each named class that is to
+ * be matched against. This isn't needed for \p{} and
+ * pseudo-classes, as they are not affected by locale, and
+ * hence are dealt with separately */
POSIXL_SET(posixl, namedclass);
+ has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
+ anyof_flags |= ANYOF_MATCHES_POSIXL;
/* The above-Latin1 characters are not subject to locale rules.
- * Just add them, in the second pass, to the
- * unconditionally-matched list */
+ * Just add them to the unconditionally-matched list */
- /* Get the list of the above-Latin1 code points this
- * matches */
+ /* Get the list of the above-Latin1 code points this matches */
_invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
PL_XPosix_ptrs[classnum],
* NDIGIT, NASCII, ... */
namedclass % 2 != 0,
&scratch_list);
- /* Checking if 'cp_list' is NULL first saves an extra
- * clone. Its reference count will be decremented at the
- * next union, etc, or if this is the only instance, at the
- * end of the routine */
+ /* Checking if 'cp_list' is NULL first saves an extra clone.
+ * Its reference count will be decremented at the next union,
+ * etc, or if this is the only instance, at the end of the
+ * routine */
if (! cp_list) {
cp_list = scratch_list;
}
}
else {
- /* Here, not in pass1 (in that pass we skip calculating the
- * contents of this class), and is not /l, or is a POSIX class
- * for which /l doesn't matter (or is a Unicode property, which
- * is skipped here). */
+ /* Here, is not /l, or is a POSIX class for which /l doesn't
+ * matter (or is a Unicode property, which is skipped here). */
if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
&cp_list);
}
}
- else if ( UNI_SEMANTICS
- || AT_LEAST_ASCII_RESTRICTED
- || classnum == _CC_ASCII
- || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT
- || classnum == _CC_XDIGIT)))
+ else if ( AT_LEAST_UNI_SEMANTICS
+ || classnum == _CC_ASCII
+ || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT
+ || classnum == _CC_XDIGIT)))
{
/* We usually have to worry about /d affecting what POSIX
* classes match, with special code needed because we won't
* <prevvalue> is the beginning of the range, if any; or <value> if
* not. */
- /* non-Latin1 code point implies unicode semantics. Must be set in
- * pass1 so is there for the whole of pass 2 */
+ /* non-Latin1 code point implies unicode semantics. */
if (value > 255) {
REQUIRE_UNI_RULES(flagp, 0);
}
*
* See [perl #89750] */
if (FOLD && allow_multi_folds && value == prevvalue) {
- if (value == LATIN_SMALL_LETTER_SHARP_S
+ if ( value == LATIN_SMALL_LETTER_SHARP_S
|| (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
value)))
{
range = 0; /* this range (if it was one) is done now */
} /* End of loop through all the text within the brackets */
-
if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
- output_or_return_posix_warnings(pRExC_state, posix_warnings,
- return_posix_warnings);
+ output_posix_warnings(pRExC_state, posix_warnings);
}
/* If anything in the class expands to more than one character, we have to
/* Our calculated list will be for Unicode rules. For locale
* matching, we have to keep a separate list that is consulted at
- * runtime only when the locale indicates Unicode rules. For
- * non-locale, we just use the general list */
+ * runtime only when the locale indicates Unicode rules (and we
+ * don't include potential matches in the ASCII/Latin1 range, as
+ * any code point could fold to any other, based on the run-time
+ * locale). For non-locale, we just use the general list */
if (LOC) {
use_list = &only_utf8_locale_list;
}
* be checked. Get the intersection of this class and all the
* possible characters that are foldable. This can quickly narrow
* down a large class */
- _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
+ _invlist_intersection(PL_in_some_fold, cp_foldable_list,
&fold_intersection);
/* Now look at the foldable characters in this class individually */
U8 foldbuf[UTF8_MAXBYTES_CASE+1];
STRLEN foldlen;
unsigned int k;
- Size_t folds_to_count;
- unsigned int first_folds_to;
- const unsigned int * remaining_folds_to_list;
+ Size_t folds_count;
+ unsigned int first_fold;
+ const unsigned int * remaining_folds;
if (j < 256) {
- if (IS_IN_SOME_FOLD_L1(j)) {
+ /* Under /l, we don't know what code points below 256
+ * fold to, except we do know the MICRO SIGN folds to
+ * an above-255 character if the locale is UTF-8, so we
+ * add it to the special list (in *use_list) Otherwise
+ * we know now what things can match, though some folds
+ * are valid under /d only if the target is UTF-8.
+ * Those go in a separate list */
+ if ( IS_IN_SOME_FOLD_L1(j)
+ && ! (LOC && j != MICRO_SIGN))
+ {
/* ASCII is always matched; non-ASCII is matched
* only under Unicode rules (which could happen
*use_list = add_cp_to_invlist(*use_list,
PL_fold_latin1[j]);
}
- else {
- has_upper_latin1_only_utf8_matches
- = add_cp_to_invlist(
- has_upper_latin1_only_utf8_matches,
- PL_fold_latin1[j]);
+ else if (j != PL_fold_latin1[j]) {
+ upper_latin1_only_utf8_matches
+ = add_cp_to_invlist(
+ upper_latin1_only_utf8_matches,
+ PL_fold_latin1[j]);
}
}
/* Single character fold of above Latin1. Add everything
* in its fold closure to the list that this node should
* match. */
- folds_to_count = _inverse_folds(folded, &first_folds_to,
- &remaining_folds_to_list);
- for (k = 0; k <= folds_to_count; k++) {
+ folds_count = _inverse_folds(folded, &first_fold,
+ &remaining_folds);
+ for (k = 0; k <= folds_count; k++) {
UV c = (k == 0) /* First time through use itself */
? folded
: (k == 1) /* 2nd time use, the first fold */
- ? first_folds_to
+ ? first_fold
/* Then the remaining ones */
- : remaining_folds_to_list[k-2];
+ : remaining_folds[k-2];
/* /aa doesn't allow folds between ASCII and non- */
if (( ASCII_FOLD_RESTRICTED
else {
/* Similarly folds involving non-ascii Latin1
* characters under /d are added to their list */
- has_upper_latin1_only_utf8_matches
- = add_cp_to_invlist(
- has_upper_latin1_only_utf8_matches,
- c);
+ upper_latin1_only_utf8_matches
+ = add_cp_to_invlist(
+ upper_latin1_only_utf8_matches,
+ c);
}
}
}
/* And combine the result (if any) with any inversion lists from posix
* classes. The lists are kept separate up to now because we don't want to
- * fold the classes (folding of those is automatically handled by the swash
- * fetching code) */
+ * fold the classes */
if (simple_posixes) { /* These are the classes known to be unaffected by
/a, /aa, and /d */
if (cp_list) {
* the target string is in UTF-8. But things like \W match all the
* upper Latin1 characters if the target string is not in UTF-8.
*
- * Handle the case where there something like \W separately */
+ * Handle the case with something like \W separately */
if (nposixes) {
SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
/* Likewise for anything else in the range that matched only
* under UTF-8 */
- if (has_upper_latin1_only_utf8_matches) {
+ if (upper_latin1_only_utf8_matches) {
_invlist_union(cp_list,
- has_upper_latin1_only_utf8_matches,
+ upper_latin1_only_utf8_matches,
&cp_list);
- SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
- has_upper_latin1_only_utf8_matches = NULL;
+ SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
+ upper_latin1_only_utf8_matches = NULL;
}
/* If we don't match all the upper Latin1 characters regardless
&nonascii_but_latin1_properties);
/* And add them to the final list of such characters. */
- _invlist_union(has_upper_latin1_only_utf8_matches,
+ _invlist_union(upper_latin1_only_utf8_matches,
nonascii_but_latin1_properties,
- &has_upper_latin1_only_utf8_matches);
+ &upper_latin1_only_utf8_matches);
/* Remove them from what now becomes the unconditional list */
_invlist_subtract(posixes, nonascii_but_latin1_properties,
SvREFCNT_dec(nonascii_but_latin1_properties);
- /* Get rid of any characters that we now know are matched
- * unconditionally from the conditional list, which may make
- * that list empty */
- _invlist_subtract(has_upper_latin1_only_utf8_matches,
+ /* Get rid of any characters from the conditional list that we
+ * now know are matched unconditionally, which may make that
+ * list empty */
+ _invlist_subtract(upper_latin1_only_utf8_matches,
cp_list,
- &has_upper_latin1_only_utf8_matches);
- if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
- SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
- has_upper_latin1_only_utf8_matches = NULL;
+ &upper_latin1_only_utf8_matches);
+ if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
+ SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
+ upper_latin1_only_utf8_matches = NULL;
}
}
}
* class that isn't a Unicode property, and which matches above Unicode, \W
* or [\x{110000}] for example.
* (Note that in this case, unlike the Posix one above, there is no
- * <has_upper_latin1_only_utf8_matches>, because having a Unicode property
+ * <upper_latin1_only_utf8_matches>, because having a Unicode property
* forces Unicode semantics */
if (properties) {
if (cp_list) {
* folded until runtime */
/* If we didn't do folding, it's because some information isn't available
- * until runtime; set the run-time fold flag for these. (We don't have to
- * worry about properties folding, as that is taken care of by the swash
- * fetching). We know to set the flag if we have a non-NULL list for UTF-8
- * locales, or the class matches at least one 0-255 range code point */
+ * until runtime; set the run-time fold flag for these We know to set the
+ * flag if we have a non-NULL list for UTF-8 locales, or the class matches
+ * at least one 0-255 range code point */
if (LOC && FOLD) {
/* Some things on the list might be unconditionally included because of
only_utf8_locale_list = NULL;
}
}
- if (only_utf8_locale_list) {
+ if ( only_utf8_locale_list
+ || (cp_list && ( _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
+ || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
+ {
+ has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
anyof_flags
|= ANYOFL_FOLD
| ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
invlist_iterinit(cp_list);
if (invlist_iternext(cp_list, &start, &end) && start < 256) {
anyof_flags |= ANYOFL_FOLD;
+ has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
}
invlist_iterfinish(cp_list);
}
}
else if ( DEPENDS_SEMANTICS
- && ( has_upper_latin1_only_utf8_matches
+ && ( upper_latin1_only_utf8_matches
|| (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
{
- use_anyofd = TRUE;
- optimizable = FALSE;
+ RExC_seen_d_op = TRUE;
+ has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
}
-
- /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
- * at compile time. Besides not inverting folded locale now, we can't
- * invert if there are things such as \w, which aren't known until runtime
- * */
+ /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
+ * compile time. */
if ( cp_list
&& invert
- && ! use_anyofd
- && ! (anyof_flags & (ANYOF_LOCALE_FLAGS))
- && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
+ && ! has_runtime_dependency)
{
_invlist_invert(cp_list);
- /* Any swash can't be used as-is, because we've inverted things */
- if (swash) {
- SvREFCNT_dec_NN(swash);
- swash = NULL;
- }
-
/* Clear the invert flag since have just done it here */
invert = FALSE;
}
if (ret_invlist) {
*ret_invlist = cp_list;
- SvREFCNT_dec(swash);
return RExC_emit;
}
+ /* All possible optimizations below still have these characteristics.
+ * (Multi-char folds aren't SIMPLE, but they don't get this far in this
+ * routine) */
+ *flagp |= HASWIDTH|SIMPLE;
+
+ if (anyof_flags & ANYOF_LOCALE_FLAGS) {
+ RExC_contains_locale = 1;
+ }
+
/* Some character classes are equivalent to other nodes. Such nodes take
- * up less room and generally fewer operations to execute than ANYOF nodes.
- * */
+ * up less room, and some nodes require fewer operations to execute, than
+ * ANYOF nodes. EXACTish nodes may be joinable with adjacent nodes to
+ * improve efficiency. */
if (optimizable) {
- int posix_class = -1; /* Illegal value */
- const char * cur_parse= RExC_parse;
- U8 ANYOFM_mask = 0xFF;
- U32 anode_arg = 0;
- UV start, end;
+ PERL_UINT_FAST8_T i;
+ Size_t partial_cp_count = 0;
+ UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
+ UV end[MAX_FOLD_FROMS+1] = { 0 };
- if (UNLIKELY(posixl_matches_all)) {
- op = SANY;
- }
- else if (cp_list && ! invert) {
+ if (cp_list) { /* Count the code points in enough ranges that we would
+ see all the ones possible in any fold in this version
+ of Unicode */
- invlist_iterinit(cp_list);
- if (! invlist_iternext(cp_list, &start, &end)) {
+ invlist_iterinit(cp_list);
+ for (i = 0; i <= MAX_FOLD_FROMS; i++) {
+ if (invlist_iternext(cp_list, &start[i], &end[i])) {
+ partial_cp_count += end[i] - start[i] + 1;
+ }
+ }
- /* Here, the list is empty. This happens, for example, when a
- * Unicode property that doesn't match anything is the only element
- * in the character class (perluniprops.pod notes such properties).
- * */
- op = OPFAIL;
- *flagp |= HASWIDTH|SIMPLE;
+ invlist_iterfinish(cp_list);
}
- else if (start == end) { /* The range is a single code point */
- if (! invlist_iternext(cp_list, &start, &end)
-
- /* Don't do this optimization if it would require changing
- * the pattern to UTF-8 */
- && (start < 256 || UTF))
- {
- /* Here, the list contains a single code point. Can optimize
- * into an EXACTish node */
-
- value = start;
- if (! FOLD) {
- op = (LOC)
- ? EXACTL
- : EXACT;
- }
- else if (LOC) {
-
- /* A locale node under folding with one code point can be
- * an EXACTFL, as its fold won't be calculated until
- * runtime */
- op = EXACTFL;
- }
- else {
+ /* If we know at compile time that this matches every possible code
+ * point, any run-time dependencies don't matter */
+ if (start[0] == 0 && end[0] == UV_MAX) {
+ if (invert) {
+ ret = reganode(pRExC_state, OPFAIL, 0);
+ }
+ else {
+ ret = reg_node(pRExC_state, SANY);
+ MARK_NAUGHTY(1);
+ }
+ goto not_anyof;
+ }
- /* Here, we are generally folding, but there is only one
- * code point to match. If we have to, we use an EXACT
- * node, but it would be better for joining with adjacent
- * nodes in the optimization pass if we used the same
- * EXACTFish node that any such are likely to be. We can
- * do this iff the code point doesn't participate in any
- * folds. For example, an EXACTF of a colon is the same as
- * an EXACT one, since nothing folds to or from a colon. */
- if (value < 256) {
- if (IS_IN_SOME_FOLD_L1(value)) {
- op = EXACT;
- }
+ /* Similarly, for /l posix classes, if both a class and its
+ * complement match, any run-time dependencies don't matter */
+ if (posixl) {
+ for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
+ namedclass += 2)
+ {
+ if ( POSIXL_TEST(posixl, namedclass) /* class */
+ && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
+ {
+ if (invert) {
+ ret = reganode(pRExC_state, OPFAIL, 0);
}
else {
- if (_invlist_contains_cp(PL_utf8_foldable, value)) {
- op = EXACT;
- }
- }
-
- /* If we haven't found the node type, above, it means we
- * can use the prevailing one */
- if (op == END) {
- op = compute_EXACTish(pRExC_state);
+ ret = reg_node(pRExC_state, SANY);
+ MARK_NAUGHTY(1);
}
+ goto not_anyof;
}
}
- } /* End of first range contains just a single code point */
- else if (start == 0) {
- if (end == UV_MAX) {
- op = SANY;
- *flagp |= HASWIDTH|SIMPLE;
- MARK_NAUGHTY(1);
- }
- else if (end == '\n' - 1
- && invlist_iternext(cp_list, &start, &end)
- && start == '\n' + 1 && end == UV_MAX)
+ /* For well-behaved locales, some classes are subsets of others,
+ * so complementing the subset and including the non-complemented
+ * superset should match everything, like [\D[:alnum:]], and
+ * [[:^alpha:][:alnum:]], but some implementations of locales are
+ * buggy, and khw thinks its a bad idea to have optimization change
+ * behavior, even if it avoids an OS bug in a given case */
+
+#define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
+
+ /* If is a single posix /l class, can optimize to just that op.
+ * Such a node will not match anything in the Latin1 range, as that
+ * is not determinable until runtime, but will match whatever the
+ * class does outside that range. (Note that some classes won't
+ * match anything outside the range, like [:ascii:]) */
+ if ( isSINGLE_BIT_SET(posixl)
+ && (partial_cp_count == 0 || start[0] > 255))
{
- op = REG_ANY;
- *flagp |= HASWIDTH|SIMPLE;
- MARK_NAUGHTY(1);
+ U8 classnum;
+ SV * class_above_latin1 = NULL;
+ bool already_inverted;
+ bool are_equivalent;
+
+ /* Compute which bit is set, which is the same thing as, e.g.,
+ * ANYOF_CNTRL. From
+ * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
+ * */
+ static const int MultiplyDeBruijnBitPosition2[32] =
+ {
+ 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+ 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+ };
+
+ namedclass = MultiplyDeBruijnBitPosition2[(posixl
+ * 0x077CB531U) >> 27];
+ classnum = namedclass_to_classnum(namedclass);
+
+ /* The named classes are such that the inverted number is one
+ * larger than the non-inverted one */
+ already_inverted = namedclass
+ - classnum_to_namedclass(classnum);
+
+ /* Create an inversion list of the official property, inverted
+ * if the constructed node list is inverted, and restricted to
+ * only the above latin1 code points, which are the only ones
+ * known at compile time */
+ _invlist_intersection_maybe_complement_2nd(
+ PL_AboveLatin1,
+ PL_XPosix_ptrs[classnum],
+ already_inverted,
+ &class_above_latin1);
+ are_equivalent = _invlistEQ(class_above_latin1, cp_list,
+ FALSE);
+ SvREFCNT_dec_NN(class_above_latin1);
+
+ if (are_equivalent) {
+
+ /* Resolve the run-time inversion flag with this possibly
+ * inverted class */
+ invert = invert ^ already_inverted;
+
+ ret = reg_node(pRExC_state,
+ POSIXL + invert * (NPOSIXL - POSIXL));
+ FLAGS(REGNODE_p(ret)) = classnum;
+ goto not_anyof;
+ }
}
}
- invlist_iterfinish(cp_list);
- if (op == END) {
+ /* khw can't think of any other possible transformation involving
+ * these. */
+ if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
+ goto is_anyof;
+ }
- /* Here, didn't find an optimization. See if this matches any of
- * the POSIX classes. First try ASCII */
+ if (! has_runtime_dependency) {
- if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 0)) {
- op = ASCII;
- *flagp |= HASWIDTH|SIMPLE;
+ /* If the list is empty, nothing matches. This happens, for
+ * example, when a Unicode property that doesn't match anything is
+ * the only element in the character class (perluniprops.pod notes
+ * such properties). */
+ if (partial_cp_count == 0) {
+ assert (! invert);
+ ret = reganode(pRExC_state, OPFAIL, 0);
+ goto not_anyof;
}
- else if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 1)) {
- op = NASCII;
- *flagp |= HASWIDTH|SIMPLE;
+
+ /* If matches everything but \n */
+ if ( start[0] == 0 && end[0] == '\n' - 1
+ && start[1] == '\n' + 1 && end[1] == UV_MAX)
+ {
+ assert (! invert);
+ ret = reg_node(pRExC_state, REG_ANY);
+ MARK_NAUGHTY(1);
+ goto not_anyof;
}
- else {
+ }
- /* Then try the other POSIX classes. The POSIXA ones are about
- * the same speed as ANYOF ops, but take less room; the ones
- * that have above-Latin1 code point matches are somewhat
- * faster than ANYOF. */
+ /* Next see if can optimize classes that contain just a few code points
+ * into an EXACTish node. The reason to do this is to let the
+ * optimizer join this node with adjacent EXACTish ones.
+ *
+ * An EXACTFish node can be generated even if not under /i, and vice
+ * versa. But care must be taken. An EXACTFish node has to be such
+ * that it only matches precisely the code points in the class, but we
+ * want to generate the least restrictive one that does that, to
+ * increase the odds of being able to join with an adjacent node. For
+ * example, if the class contains [kK], we have to make it an EXACTFAA
+ * node to prevent the KELVIN SIGN from matching. Whether we are under
+ * /i or not is irrelevant in this case. Less obvious is the pattern
+ * qr/[\x{02BC}]n/i. U+02BC is MODIFIER LETTER APOSTROPHE. That is
+ * supposed to match the single character U+0149 LATIN SMALL LETTER N
+ * PRECEDED BY APOSTROPHE. And so even though there is no simple fold
+ * that includes \X{02BC}, there is a multi-char fold that does, and so
+ * the node generated for it must be an EXACTFish one. On the other
+ * hand qr/:/i should generate a plain EXACT node since the colon
+ * participates in no fold whatsoever, and having it EXACT tells the
+ * optimizer the target string cannot match unless it has a colon in
+ * it.
+ *
+ * We don't typically generate an EXACTish node if doing so would
+ * require changing the pattern to UTF-8, as that affects /d and
+ * otherwise is slower. However, under /i, not changing to UTF-8 can
+ * miss some potential multi-character folds. We calculate the
+ * EXACTish node, and then decide if something would be missed if we
+ * don't upgrade */
+ if ( ! posixl
+ && ! invert
+
+ /* Only try if there are no more code points in the class than
+ * in the max possible fold */
+ && partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1
+
+ && (start[0] < 256 || UTF || FOLD))
+ {
+ if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
+ {
+ /* We can always make a single code point class into an
+ * EXACTish node. */
+
+ if (LOC) {
+
+ /* Here is /l: Use EXACTL, except /li indicates EXACTFL,
+ * as that means there is a fold not known until runtime so
+ * shows as only a single code point here. */
+ op = (FOLD) ? EXACTFL : EXACTL;
+ }
+ else if (! FOLD) { /* Not /l and not /i */
+ op = (start[0] < 256) ? EXACT : EXACT_ONLY8;
+ }
+ else if (start[0] < 256) { /* /i, not /l, and the code point is
+ small */
+
+ /* Under /i, it gets a little tricky. A code point that
+ * doesn't participate in a fold should be an EXACT node.
+ * We know this one isn't the result of a simple fold, or
+ * there'd be more than one code point in the list, but it
+ * could be part of a multi- character fold. In that case
+ * we better not create an EXACT node, as we would wrongly
+ * be telling the optimizer that this code point must be in
+ * the target string, and that is wrong. This is because
+ * if the sequence around this code point forms a
+ * multi-char fold, what needs to be in the string could be
+ * the code point that folds to the sequence.
+ *
+ * This handles the case of below-255 code points, as we
+ * have an easy look up for those. The next clause handles
+ * the above-256 one */
+ op = IS_IN_SOME_FOLD_L1(start[0])
+ ? EXACTFU
+ : EXACT;
+ }
+ else { /* /i, larger code point. Since we are under /i, and
+ have just this code point, we know that it can't
+ fold to something else, so PL_InMultiCharFold
+ applies to it */
+ op = _invlist_contains_cp(PL_InMultiCharFold,
+ start[0])
+ ? EXACTFU_ONLY8
+ : EXACT_ONLY8;
+ }
- for (posix_class = 0;
- posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
- posix_class++)
+ value = start[0];
+ }
+ else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
+ && _invlist_contains_cp(PL_in_some_fold, start[0]))
+ {
+ /* Here, the only runtime dependency, if any, is from /d, and
+ * the class matches more than one code point, and the lowest
+ * code point participates in some fold. It might be that the
+ * other code points are /i equivalent to this one, and hence
+ * they would representable by an EXACTFish node. Above, we
+ * eliminated classes that contain too many code points to be
+ * EXACTFish, with the test for MAX_FOLD_FROMS
+ *
+ * First, special case the ASCII fold pairs, like 'B' and 'b'.
+ * We do this because we have EXACTFAA at our disposal for the
+ * ASCII range */
+ if (partial_cp_count == 2 && isASCII(start[0])) {
+
+ /* The only ASCII characters that participate in folds are
+ * alphabetics */
+ assert(isALPHA(start[0]));
+ if ( end[0] == start[0] /* First range is a single
+ character, so 2nd exists */
+ && isALPHA_FOLD_EQ(start[0], start[1]))
+ {
+
+ /* Here, is part of an ASCII fold pair */
+
+ if ( ASCII_FOLD_RESTRICTED
+ || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
+ {
+ /* If the second clause just above was true, it
+ * means we can't be under /i, or else the list
+ * would have included more than this fold pair.
+ * Therefore we have to exclude the possibility of
+ * whatever else it is that folds to these, by
+ * using EXACTFAA */
+ op = EXACTFAA;
+ }
+ else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
+
+ /* Here, there's no simple fold that start[0] is part
+ * of, but there is a multi-character one. If we
+ * are not under /i, we want to exclude that
+ * possibility; if under /i, we want to include it
+ * */
+ op = (FOLD) ? EXACTFU : EXACTFAA;
+ }
+ else {
+
+ /* Here, the only possible fold start[0] particpates in
+ * is with start[1]. /i or not isn't relevant */
+ op = EXACTFU;
+ }
+
+ value = toFOLD(start[0]);
+ }
+ }
+ else if ( ! upper_latin1_only_utf8_matches
+ || ( _invlist_len(upper_latin1_only_utf8_matches)
+ == 2
+ && PL_fold_latin1[
+ invlist_highest(upper_latin1_only_utf8_matches)]
+ == start[0]))
{
- int try_inverted;
+ /* Here, the smallest character is non-ascii or there are
+ * more than 2 code points matched by this node. Also, we
+ * either don't have /d UTF-8 dependent matches, or if we
+ * do, they look like they could be a single character that
+ * is the fold of the lowest one in the always-match list.
+ * This test quickly excludes most of the false positives
+ * when there are /d UTF-8 depdendent matches. These are
+ * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
+ * SMALL LETTER A WITH GRAVE iff the target string is
+ * UTF-8. (We don't have to worry above about exceeding
+ * the array bounds of PL_fold_latin1[] because any code
+ * point in 'upper_latin1_only_utf8_matches' is below 256.)
+ *
+ * EXACTFAA would apply only to pairs (hence exactly 2 code
+ * points) in the ASCII range, so we can't use it here to
+ * artificially restrict the fold domain, so we check if
+ * the class does or does not match some EXACTFish node.
+ * Further, if we aren't under /i, and and the folded-to
+ * character is part of a multi-character fold, we can't do
+ * this optimization, as the sequence around it could be
+ * that multi-character fold, and we don't here know the
+ * context, so we have to assume it is that multi-char
+ * fold, to prevent potential bugs.
+ *
+ * To do the general case, we first find the fold of the
+ * lowest code point (which may be higher than the lowest
+ * one), then find everything that folds to it. (The data
+ * structure we have only maps from the folded code points,
+ * so we have to do the earlier step.) */
+
+ Size_t foldlen;
+ U8 foldbuf[UTF8_MAXBYTES_CASE];
+ UV folded = _to_uni_fold_flags(start[0],
+ foldbuf, &foldlen, 0);
+ unsigned int first_fold;
+ const unsigned int * remaining_folds;
+ Size_t folds_to_this_cp_count = _inverse_folds(
+ folded,
+ &first_fold,
+ &remaining_folds);
+ Size_t folds_count = folds_to_this_cp_count + 1;
+ SV * fold_list = _new_invlist(folds_count);
+ unsigned int i;
+
+ /* If there are UTF-8 dependent matches, create a temporary
+ * list of what this node matches, including them. */
+ SV * all_cp_list = NULL;
+ SV ** use_this_list = &cp_list;
+
+ if (upper_latin1_only_utf8_matches) {
+ all_cp_list = _new_invlist(0);
+ use_this_list = &all_cp_list;
+ _invlist_union(cp_list,
+ upper_latin1_only_utf8_matches,
+ use_this_list);
+ }
- for (try_inverted = 0; try_inverted < 2; try_inverted++) {
+ /* Having gotten everything that participates in the fold
+ * containing the lowest code point, we turn that into an
+ * inversion list, making sure everything is included. */
+ fold_list = add_cp_to_invlist(fold_list, start[0]);
+ fold_list = add_cp_to_invlist(fold_list, folded);
+ fold_list = add_cp_to_invlist(fold_list, first_fold);
+ for (i = 0; i < folds_to_this_cp_count - 1; i++) {
+ fold_list = add_cp_to_invlist(fold_list,
+ remaining_folds[i]);
+ }
+
+ /* If the fold list is identical to what's in this ANYOF
+ * node, the node can be represented by an EXACTFish one
+ * instead */
+ if (_invlistEQ(*use_this_list, fold_list,
+ 0 /* Don't complement */ )
+ ) {
- /* Check if matches POSIXA, normal or inverted */
- if (PL_Posix_ptrs[posix_class]) {
- if (_invlistEQ(cp_list,
- PL_Posix_ptrs[posix_class],
- try_inverted))
+ /* But, we have to be careful, as mentioned above.
+ * Just the right sequence of characters could match
+ * this if it is part of a multi-character fold. That
+ * IS what we want if we are under /i. But it ISN'T
+ * what we want if not under /i, as it could match when
+ * it shouldn't. So, when we aren't under /i and this
+ * character participates in a multi-char fold, we
+ * don't optimize into an EXACTFish node. So, for each
+ * case below we have to check if we are folding
+ * and if not, if it is not part of a multi-char fold.
+ * */
+ if (start[0] > 255) { /* Highish code point */
+ if (FOLD || ! _invlist_contains_cp(
+ PL_InMultiCharFold, folded))
{
- op = (try_inverted)
- ? NPOSIXA
- : POSIXA;
- *flagp |= HASWIDTH|SIMPLE;
- goto found_posix;
+ op = (LOC)
+ ? EXACTFLU8
+ : (ASCII_FOLD_RESTRICTED)
+ ? EXACTFAA
+ : EXACTFU_ONLY8;
+ value = folded;
}
+ } /* Below, the lowest code point < 256 */
+ else if ( FOLD
+ && folded == 's'
+ && DEPENDS_SEMANTICS)
+ { /* An EXACTF node containing a single character
+ 's', can be an EXACTFU if it doesn't get
+ joined with an adjacent 's' */
+ op = EXACTFU_S_EDGE;
+ value = folded;
}
-
- /* Check if matches POSIXU, normal or inverted */
- if (_invlistEQ(cp_list,
- PL_XPosix_ptrs[posix_class],
- try_inverted))
+ else if ( FOLD
+ || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
{
- op = (try_inverted)
- ? NPOSIXU
- : POSIXU;
- *flagp |= HASWIDTH|SIMPLE;
- goto found_posix;
+ if (upper_latin1_only_utf8_matches) {
+ op = EXACTF;
+
+ /* We can't use the fold, as that only matches
+ * under UTF-8 */
+ value = start[0];
+ }
+ else if ( UNLIKELY(start[0] == MICRO_SIGN)
+ && ! UTF)
+ { /* EXACTFUP is a special node for this
+ character */
+ op = (ASCII_FOLD_RESTRICTED)
+ ? EXACTFAA
+ : EXACTFUP;
+ value = MICRO_SIGN;
+ }
+ else if ( ASCII_FOLD_RESTRICTED
+ && ! isASCII(start[0]))
+ { /* For ASCII under /iaa, we can use EXACTFU
+ below */
+ op = EXACTFAA;
+ value = folded;
+ }
+ else {
+ op = EXACTFU;
+ value = folded;
+ }
}
}
+
+ SvREFCNT_dec_NN(fold_list);
+ SvREFCNT_dec(all_cp_list);
+ }
+ }
+
+ if (op != END) {
+
+ /* Here, we have calculated what EXACTish node we would use.
+ * But we don't use it if it would require converting the
+ * pattern to UTF-8, unless not using it could cause us to miss
+ * some folds (hence be buggy) */
+
+ if (! UTF && value > 255) {
+ SV * in_multis = NULL;
+
+ assert(FOLD);
+
+ /* If there is no code point that is part of a multi-char
+ * fold, then there aren't any matches, so we don't do this
+ * optimization. Otherwise, it could match depending on
+ * the context around us, so we do upgrade */
+ _invlist_intersection(PL_InMultiCharFold, cp_list, &in_multis);
+ if (UNLIKELY(_invlist_len(in_multis) != 0)) {
+ REQUIRE_UTF8(flagp);
+ }
+ else {
+ op = END;
+ }
+ }
+
+ if (op != END) {
+ U8 len = (UTF) ? UVCHR_SKIP(value) : 1;
+
+ ret = regnode_guts(pRExC_state, op, len, "exact");
+ FILL_NODE(ret, op);
+ RExC_emit += 1 + STR_SZ(len);
+ STR_LEN(REGNODE_p(ret)) = len;
+ if (len == 1) {
+ *STRING(REGNODE_p(ret)) = value;
+ }
+ else {
+ uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
+ }
+ goto not_anyof;
}
- found_posix: ;
- }
-
- /* If it didn't match a POSIX class, it might be able to be turned
- * into an ANYOFM node. Compare two different bytes, bit-by-bit.
- * In some positions, the bits in each will be 1; and in other
- * positions both will be 0; and in some positions the bit will be
- * 1 in one byte, and 0 in the other. Let 'n' be the number of
- * positions where the bits differ. We create a mask which has
- * exactly 'n' 0 bits, each in a position where the two bytes
- * differ. Now take the set of all bytes that when ANDed with the
- * mask yield the same result. That set has 2**n elements, and is
- * representable by just two 8 bit numbers: the result and the
- * mask. Importantly, matching the set can be vectorized by
- * creating a word full of the result bytes, and a word full of the
- * mask bytes, yielding a significant speed up. Here, see if this
- * node matches such a set. As a concrete example consider [01],
- * and the byte representing '0' which is 0x30 on ASCII machines.
- * It has the bits 0011 0000. Take the mask 1111 1110. If we AND
- * 0x31 and 0x30 with that mask we get 0x30. Any other bytes ANDed
- * yield something else. So [01], which is a common usage, is
- * optimizable into ANYOFM, and can benefit from the speed up. We
- * can only do this on UTF-8 invariant bytes, because the variance
- * would throw this off. */
- if ( op == END
- && invlist_highest(cp_list) <=
+ }
+ }
+
+ if (! has_runtime_dependency) {
+
+ /* See if this can be turned into an ANYOFM node. Think about the
+ * bit patterns in two different bytes. In some positions, the
+ * bits in each will be 1; and in other positions both will be 0;
+ * and in some positions the bit will be 1 in one byte, and 0 in
+ * the other. Let 'n' be the number of positions where the bits
+ * differ. We create a mask which has exactly 'n' 0 bits, each in
+ * a position where the two bytes differ. Now take the set of all
+ * bytes that when ANDed with the mask yield the same result. That
+ * set has 2**n elements, and is representable by just two 8 bit
+ * numbers: the result and the mask. Importantly, matching the set
+ * can be vectorized by creating a word full of the result bytes,
+ * and a word full of the mask bytes, yielding a significant speed
+ * up. Here, see if this node matches such a set. As a concrete
+ * example consider [01], and the byte representing '0' which is
+ * 0x30 on ASCII machines. It has the bits 0011 0000. Take the
+ * mask 1111 1110. If we AND 0x31 and 0x30 with that mask we get
+ * 0x30. Any other bytes ANDed yield something else. So [01],
+ * which is a common usage, is optimizable into ANYOFM, and can
+ * benefit from the speed up. We can only do this on UTF-8
+ * invariant bytes, because they have the same bit patterns under
+ * UTF-8 as not. */
+ PERL_UINT_FAST8_T inverted = 0;
#ifdef EBCDIC
- 0xFF
+ const PERL_UINT_FAST8_T max_permissible = 0xFF;
#else
- 0x7F
+ const PERL_UINT_FAST8_T max_permissible = 0x7F;
#endif
- ) {
- Size_t cp_count = 0;
- bool first_time = TRUE;
- unsigned int lowest_cp = 0xFF;
- U8 bits_differing = 0;
+ /* If doesn't fit the criteria for ANYOFM, invert and try again.
+ * If that works we will instead later generate an NANYOFM, and
+ * invert back when through */
+ if (invlist_highest(cp_list) > max_permissible) {
+ _invlist_invert(cp_list);
+ inverted = 1;
+ }
- /* Only needed on EBCDIC, as there, variants and non- are mixed
- * together. Could #ifdef it out on ASCII, but probably the
- * compiler will optimize it out */
- bool has_variant = FALSE;
+ if (invlist_highest(cp_list) <= max_permissible) {
+ UV this_start, this_end;
+ UV lowest_cp = UV_MAX; /* inited to suppress compiler warn */
+ U8 bits_differing = 0;
+ Size_t full_cp_count = 0;
+ bool first_time = TRUE;
- /* Go through the bytes and find the bit positions that differ */
+ /* Go through the bytes and find the bit positions that differ
+ * */
invlist_iterinit(cp_list);
- while (invlist_iternext(cp_list, &start, &end)) {
- unsigned int i = start;
-
- cp_count += end - start + 1;
+ while (invlist_iternext(cp_list, &this_start, &this_end)) {
+ unsigned int i = this_start;
if (first_time) {
if (! UVCHR_IS_INVARIANT(i)) {
- has_variant = TRUE;
- continue;
+ goto done_anyofm;
}
first_time = FALSE;
- lowest_cp = start;
+ lowest_cp = this_start;
+ /* We have set up the code point to compare with.
+ * Don't compare it with itself */
i++;
}
/* Find the bit positions that differ from the lowest code
* point in the node. Keep track of all such positions by
* OR'ing */
- for (; i <= end; i++) {
+ for (; i <= this_end; i++) {
if (! UVCHR_IS_INVARIANT(i)) {
- has_variant = TRUE;
- continue;
+ goto done_anyofm;
}
bits_differing |= i ^ lowest_cp;
}
+
+ full_cp_count += this_end - this_start + 1;
}
invlist_iterfinish(cp_list);
* other in some other position. That means one code point has
* a 1 in that position, and another has a 0. But that would
* mean that one of them differs from the lowest code point in
- * that position, which possibility we've already excluded. */
- if ( ! has_variant
- && cp_count == 1U << PL_bitcount[bits_differing])
+ * that position, which possibility we've already excluded. */
+ if ( (inverted || full_cp_count > 1)
+ && full_cp_count == 1U << PL_bitcount[bits_differing])
{
- assert(cp_count > 1);
- op = ANYOFM;
+ U8 ANYOFM_mask;
+
+ op = ANYOFM + inverted;;
/* We need to make the bits that differ be 0's */
ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
/* The argument is the lowest code point */
- anode_arg = lowest_cp;
- *flagp |= HASWIDTH|SIMPLE;
+ ret = reganode(pRExC_state, op, lowest_cp);
+ FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
}
}
- }
- }
+ done_anyofm:
- if (op != END) {
- RExC_parse = (char *)orig_parse;
- RExC_emit = orig_emit;
+ if (inverted) {
+ _invlist_invert(cp_list);
+ }
- if (regarglen[op]) {
- ret = reganode(pRExC_state, op, anode_arg);
- } else {
- ret = reg_node(pRExC_state, op);
+ if (op != END) {
+ goto not_anyof;
}
+ }
- RExC_parse = (char *)cur_parse;
+ if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
+ PERL_UINT_FAST8_T type;
+ SV * intersection = NULL;
+ SV* d_invlist = NULL;
- if (PL_regkind[op] == EXACT) {
- alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
- TRUE /* downgradable to EXACT */
- );
- }
- else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
- FLAGS(REGNODE_p(ret)) = posix_class;
- }
- else if (PL_regkind[op] == ANYOFM) {
- FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
+ /* See if this matches any of the POSIX classes. The POSIXA and
+ * POSIXD ones are about the same speed as ANYOF ops, but take less
+ * room; the ones that have above-Latin1 code point matches are
+ * somewhat faster than ANYOF. */
+
+ for (type = POSIXA; type >= POSIXD; type--) {
+ int posix_class;
+
+ if (type == POSIXL) { /* But not /l posix classes */
+ continue;
+ }
+
+ for (posix_class = 0;
+ posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
+ posix_class++)
+ {
+ SV** our_code_points = &cp_list;
+ SV** official_code_points;
+ int try_inverted;
+
+ if (type == POSIXA) {
+ official_code_points = &PL_Posix_ptrs[posix_class];
+ }
+ else {
+ official_code_points = &PL_XPosix_ptrs[posix_class];
+ }
+
+ /* Skip non-existent classes of this type. e.g. \v only
+ * has an entry in PL_XPosix_ptrs */
+ if (! *official_code_points) {
+ continue;
+ }
+
+ /* Try both the regular class, and its inversion */
+ for (try_inverted = 0; try_inverted < 2; try_inverted++) {
+ bool this_inverted = invert ^ try_inverted;
+
+ if (type != POSIXD) {
+
+ /* This class that isn't /d can't match if we have
+ * /d dependencies */
+ if (has_runtime_dependency
+ & HAS_D_RUNTIME_DEPENDENCY)
+ {
+ continue;
+ }
+ }
+ else /* is /d */ if (! this_inverted) {
+
+ /* /d classes don't match anything non-ASCII below
+ * 256 unconditionally (which cp_list contains) */
+ _invlist_intersection(cp_list, PL_UpperLatin1,
+ &intersection);
+ if (_invlist_len(intersection) != 0) {
+ continue;
+ }
+
+ SvREFCNT_dec(d_invlist);
+ d_invlist = invlist_clone(cp_list, NULL);
+
+ /* But under UTF-8 it turns into using /u rules.
+ * Add the things it matches under these conditions
+ * so that we check below that these are identical
+ * to what the tested class should match */
+ if (upper_latin1_only_utf8_matches) {
+ _invlist_union(
+ d_invlist,
+ upper_latin1_only_utf8_matches,
+ &d_invlist);
+ }
+ our_code_points = &d_invlist;
+ }
+ else { /* POSIXD, inverted. If this doesn't have this
+ flag set, it isn't /d. */
+ if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
+ {
+ continue;
+ }
+ our_code_points = &cp_list;
+ }
+
+ /* Here, have weeded out some things. We want to see
+ * if the list of characters this node contains
+ * ('*our_code_points') precisely matches those of the
+ * class we are currently checking against
+ * ('*official_code_points'). */
+ if (_invlistEQ(*our_code_points,
+ *official_code_points,
+ try_inverted))
+ {
+ /* Here, they precisely match. Optimize this ANYOF
+ * node into its equivalent POSIX one of the
+ * correct type, possibly inverted */
+ ret = reg_node(pRExC_state, (try_inverted)
+ ? type + NPOSIXA
+ - POSIXA
+ : type);
+ FLAGS(REGNODE_p(ret)) = posix_class;
+ SvREFCNT_dec(d_invlist);
+ SvREFCNT_dec(intersection);
+ goto not_anyof;
+ }
+ }
+ }
}
+ SvREFCNT_dec(d_invlist);
+ SvREFCNT_dec(intersection);
+ }
- SvREFCNT_dec_NN(cp_list);
- return ret;
+ /* If didn't find an optimization and there is no need for a
+ * bitmap, optimize to indicate that */
+ if ( start[0] >= NUM_ANYOF_CODE_POINTS
+ && ! LOC
+ && ! upper_latin1_only_utf8_matches)
+ {
+ op = ANYOFH;
}
+ } /* End of seeing if can optimize it into a different node */
+
+ is_anyof: /* It's going to be an ANYOF node. */
+ if (op != ANYOFH) {
+ op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
+ ? ANYOFD
+ : ((posixl)
+ ? ANYOFPOSIXL
+ : ((LOC)
+ ? ANYOFL
+ : ANYOF));
}
- /* It's going to be an ANYOF node. */
- op = (use_anyofd)
- ? ANYOFD
- : ((posixl)
- ? ANYOFPOSIXL
- : ((LOC)
- ? ANYOFL
- : ANYOF));
ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
FILL_NODE(ret, op); /* We set the argument later */
RExC_emit += 1 + regarglen[op];
/* Here, the bitmap has been populated with all the Latin1 code points that
* always match. Can now add to the overall list those that match only
- * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>).
+ * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
* */
- if (has_upper_latin1_only_utf8_matches) {
+ if (upper_latin1_only_utf8_matches) {
if (cp_list) {
_invlist_union(cp_list,
- has_upper_latin1_only_utf8_matches,
+ upper_latin1_only_utf8_matches,
&cp_list);
- SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
+ SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
}
else {
- cp_list = has_upper_latin1_only_utf8_matches;
+ cp_list = upper_latin1_only_utf8_matches;
}
ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
}
- /* If there is a swash and more than one element, we can't use the swash in
- * the optimization below. */
- if (swash && element_count > 1) {
- SvREFCNT_dec_NN(swash);
- swash = NULL;
- }
-
- /* Note that the optimization of using 'swash' if it is the only thing in
- * the class doesn't have us change swash at all, so it can include things
- * that are also in the bitmap; otherwise we have purposely deleted that
- * duplicate information */
set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
(HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
? listsv : NULL,
- only_utf8_locale_list,
- swash, has_user_defined_property);
+ only_utf8_locale_list);
+ return ret;
- *flagp |= HASWIDTH|SIMPLE;
+ not_anyof:
- if (ANYOF_FLAGS(REGNODE_p(ret)) & ANYOF_LOCALE_FLAGS) {
- RExC_contains_locale = 1;
- }
+ /* Here, the node is getting optimized into something that's not an ANYOF
+ * one. Finish up. */
+ Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
+ RExC_parse - orig_parse);;
+ SvREFCNT_dec(cp_list);;
return ret;
}
regnode* const node,
SV* const cp_list,
SV* const runtime_defns,
- SV* const only_utf8_locale_list,
- SV* const swash,
- const bool has_user_defined_property)
+ SV* const only_utf8_locale_list)
{
/* Sets the arg field of an ANYOF-type node 'node', using information about
* the node passed-in. If there is nothing outside the node's bitmap, the
* arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
* the count returned by add_data(), having allocated and stored an array,
- * av, that that count references, as follows:
- * av[0] stores the character class description in its textual form.
- * This is used later (regexec.c:Perl_regclass_swash()) to
- * initialize the appropriate swash, and is also useful for dumping
- * the regnode. This is set to &PL_sv_undef if the textual
- * description is not needed at run-time (as happens if the other
- * elements completely define the class)
- * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
- * computed from av[0]. But if no further computation need be done,
- * the swash is stored here now (and av[0] is &PL_sv_undef).
- * av[2] stores the inversion list of code points that match only if the
- * current locale is UTF-8
- * av[3] stores the cp_list inversion list for use in addition or instead
- * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
- * (Otherwise everything needed is already in av[0] and av[1])
- * av[4] is set if any component of the class is from a user-defined
- * property; used only if av[3] exists */
+ * av, as follows:
+ *
+ * av[0] stores the inversion list defining this class as far as known at
+ * this time, or PL_sv_undef if nothing definite is now known.
+ * av[1] stores the inversion list of code points that match only if the
+ * current locale is UTF-8, or if none, PL_sv_undef if there is an
+ * av[2], or no entry otherwise.
+ * av[2] stores the list of user-defined properties whose subroutine
+ * definitions aren't known at this time, or no entry if none. */
UV n;
AV * const av = newAV();
SV *rv;
- av_store(av, 0, (runtime_defns)
- ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
- if (swash) {
- assert(cp_list);
- av_store(av, 1, swash);
- SvREFCNT_dec_NN(cp_list);
- }
- else {
- av_store(av, 1, &PL_sv_undef);
- if (cp_list) {
- av_store(av, 3, cp_list);
- av_store(av, 4, newSVuv(has_user_defined_property));
- }
- }
+ if (cp_list) {
+ av_store(av, INVLIST_INDEX, cp_list);
+ }
if (only_utf8_locale_list) {
- av_store(av, 2, only_utf8_locale_list);
+ av_store(av, ONLY_LOCALE_MATCHES_INDEX, only_utf8_locale_list);
}
- else {
- av_store(av, 2, &PL_sv_undef);
+
+ if (runtime_defns) {
+ av_store(av, DEFERRED_USER_DEFINED_INDEX, SvREFCNT_inc(runtime_defns));
}
rv = newRV_noinc(MUTABLE_SV(av));
{
/* For internal core use only.
- * Returns the swash for the input 'node' in the regex 'prog'.
- * If <doinit> is 'true', will attempt to create the swash if not already
- * done.
+ * Returns the inversion list for the input 'node' in the regex 'prog'.
+ * If <doinit> is 'true', will attempt to create the inversion list if not
+ * already done.
* If <listsvp> is non-null, will return the printable contents of the
- * swash. This can be used to get debugging information even before the
- * swash exists, by calling this function with 'doinit' set to false, in
- * which case the components that will be used to eventually create the
- * swash are returned (in a printable form).
+ * property definition. This can be used to get debugging information
+ * even before the inversion list exists, by calling this function with
+ * 'doinit' set to false, in which case the components that will be used
+ * to eventually create the inversion list are returned (in a printable
+ * form).
* If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
* store an inversion list of code points that should match only if the
* execution-time locale is a UTF-8 one.
* inversion list of the code points that would be instead returned in
* <listsvp> if this were NULL. Thus, what gets output in <listsvp>
* when this parameter is used, is just the non-code point data that
- * will go into creating the swash. This currently should be just
+ * will go into creating the inversion list. This currently should be just
* user-defined properties whose definitions were not known at compile
* time. Using this parameter allows for easier manipulation of the
- * swash's data by the caller. It is illegal to call this function with
- * this parameter set, but not <listsvp>
+ * inversion list's data by the caller. It is illegal to call this
+ * function with this parameter set, but not <listsvp>
*
* Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
- * that, in spite of this function's name, the swash it returns may include
- * the bitmap data as well */
+ * that, in spite of this function's name, the inversion list it returns
+ * may include the bitmap data as well */
- SV *sw = NULL;
- SV *si = NULL; /* Input swash initialization string */
+ SV *si = NULL; /* Input initialization string */
SV* invlist = NULL;
RXi_GET_DECL(prog, progi);
SV * const rv = MUTABLE_SV(data->data[n]);
AV * const av = MUTABLE_AV(SvRV(rv));
SV **const ary = AvARRAY(av);
- U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
- si = *ary; /* ary[0] = the string to initialize the swash with */
+ invlist = ary[INVLIST_INDEX];
- if (av_tindex_skip_len_mg(av) >= 2) {
- if (only_utf8_locale_ptr
- && ary[2]
- && ary[2] != &PL_sv_undef)
- {
- *only_utf8_locale_ptr = ary[2];
- }
- else {
- assert(only_utf8_locale_ptr);
- *only_utf8_locale_ptr = NULL;
- }
-
- /* Elements 3 and 4 are either both present or both absent. [3]
- * is any inversion list generated at compile time; [4]
- * indicates if that inversion list has any user-defined
- * properties in it. */
- if (av_tindex_skip_len_mg(av) >= 3) {
- invlist = ary[3];
- if (SvUV(ary[4])) {
- swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
+ if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
+ *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
+ }
+
+ if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
+ si = ary[DEFERRED_USER_DEFINED_INDEX];
+ }
+
+ if (doinit && (si || invlist)) {
+ if (si) {
+ bool user_defined;
+ SV * msg = newSVpvs_flags("", SVs_TEMP);
+
+ SV * prop_definition = handle_user_defined_property(
+ "", 0, FALSE, /* There is no \p{}, \P{} */
+ SvPVX_const(si)[1] - '0', /* /i or not has been
+ stored here for just
+ this occasion */
+ TRUE, /* run time */
+ si, /* The property definition */
+ &user_defined,
+ msg,
+ 0 /* base level call */
+ );
+
+ if (SvCUR(msg)) {
+ assert(prop_definition == NULL);
+
+ Perl_croak(aTHX_ "%" UTF8f,
+ UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
}
- }
- else {
- invlist = NULL;
- }
- }
- /* Element [1] is reserved for the set-up swash. If already there,
- * return it; if not, create it and store it there */
- if (ary[1] && SvROK(ary[1])) {
- sw = ary[1];
- }
- else if (doinit && ((si && si != &PL_sv_undef)
- || (invlist && invlist != &PL_sv_undef))) {
- assert(si);
- sw = _core_swash_init("utf8", /* the utf8 package */
- "", /* nameless */
- si,
- 1, /* binary */
- 0, /* not from tr/// */
- invlist,
- &swash_init_flags);
- (void)av_store(av, 1, sw);
+ if (invlist) {
+ _invlist_union(invlist, prop_definition, &invlist);
+ SvREFCNT_dec_NN(prop_definition);
+ }
+ else {
+ invlist = prop_definition;
+ }
+
+ assert(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
+ assert(DEFERRED_USER_DEFINED_INDEX == 1
+ + ONLY_LOCALE_MATCHES_INDEX);
+
+ av_store(av, INVLIST_INDEX, invlist);
+ av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
+ ? ONLY_LOCALE_MATCHES_INDEX:
+ INVLIST_INDEX);
+ si = NULL;
+ }
}
}
}
- /* If requested, return a printable version of what this swash matches */
+ /* If requested, return a printable version of what this ANYOF node matches
+ * */
if (listsvp) {
SV* matches_string = NULL;
- /* The swash should be used, if possible, to get the data, as it
- * contains the resolved data. But this function can be called at
- * compile-time, before everything gets resolved, in which case we
- * return the currently best available information, which is the string
- * that will eventually be used to do that resolving, 'si' */
- if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
- && (si && si != &PL_sv_undef))
- {
+ /* This function can be called at compile-time, before everything gets
+ * resolved, in which case we return the currently best available
+ * information, which is the string that will eventually be used to do
+ * that resolving, 'si' */
+ if (si) {
/* Here, we only have 'si' (and possibly some passed-in data in
* 'invlist', which is handled below) If the caller only wants
* 'si', use that. */
if (SvCUR(matches_string)) { /* Get rid of trailing blank */
SvCUR_set(matches_string, SvCUR(matches_string) - 1);
}
- } /* end of has an 'si' but no swash */
+ } /* end of has an 'si' */
}
- /* If we have a swash in place, its equivalent inversion list was above
- * placed into 'invlist'. If not, this variable may contain a stored
- * inversion list which is information beyond what is in 'si' */
+ /* Add the stuff that's already known */
if (invlist) {
/* Again, if the caller doesn't want the output inversion list, put
*listsvp = matches_string;
}
- return sw;
+ return invlist;
}
#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
STATIC void
S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
{
+ /* 'size' is the delta to add or subtract from the current memory allocated
+ * to the regex engine being constructed */
+
PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
RExC_size += size;
STATIC regnode_offset
S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
{
- /* Allocate a regnode for 'op', with 'extra_size' extra space. In pass1,
- * it aligns and increments RExC_size; in pass2, RExC_emit
+ /* Allocate a regnode for 'op', with 'extra_size' extra space. It aligns
+ * and increments RExC_size and RExC_emit
*
- * It returns the renode's offset into the regex engine program (meaningful
- * only in pass2 */
+ * It returns the regnode's offset into the regex engine program */
const regnode_offset ret = RExC_emit;
DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
Perl_re_printf( aTHX_ "~ %s (%d) %s %s\n",
- SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(REGNODE_p(scan)),
+ SvPV_nolen_const(RExC_mysv), scan,
(temp == NULL ? "->" : ""),
(temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
);
if ( exact ) {
switch (OP(REGNODE_p(scan))) {
case EXACT:
+ case EXACT_ONLY8:
case EXACTL:
case EXACTF:
+ case EXACTFU_S_EDGE:
case EXACTFAA_NO_TRIE:
case EXACTFAA:
case EXACTFU:
+ case EXACTFU_ONLY8:
case EXACTFLU8:
- case EXACTFU_SS:
+ case EXACTFUP:
case EXACTFL:
if( exact == PSEUDO )
exact= OP(REGNODE_p(scan));
regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
Perl_re_printf( aTHX_ "~ %s (%d) -> %s\n",
SvPV_nolen_const(RExC_mysv),
- REG_NODE_NUM(REGNODE_p(scan)),
+ scan,
PL_reg_name[exact]);
});
if (temp == NULL)
Perl_re_printf( aTHX_
"~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
SvPV_nolen_const(RExC_mysv),
- (IV)REG_NODE_NUM(REGNODE_p(val)),
+ (IV)val,
(IV)(val - scan)
);
});
STATIC SV*
S_get_ANYOFM_contents(pTHX_ const regnode * n) {
- /* Returns an inversion list of all the code points matched by the ANYOFM
- * node 'n' */
+ /* Returns an inversion list of all the code points matched by the
+ * ANYOFM/NANYOFM node 'n' */
SV * cp_list = _new_invlist(-1);
const U8 lowest = (U8) ARG(n);
}
}
+ if (OP(n) == NANYOFM) {
+ _invlist_invert(cp_list);
+ }
return cp_list;
}
/* Ready to start outputting. First, the initial left bracket */
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
- /* Then all the things that could fit in the bitmap */
- do_sep = put_charclass_bitmap_innards(sv,
- ANYOF_BITMAP(o),
- bitmap_range_not_in_bitmap,
- only_utf8_locale_invlist,
- o,
-
- /* Can't try inverting for a
- * better display if there are
- * things that haven't been
- * resolved */
- unresolved != NULL);
- SvREFCNT_dec(bitmap_range_not_in_bitmap);
-
- /* If there are user-defined properties which haven't been defined yet,
- * output them. If the result is not to be inverted, it is clearest to
- * output them in a separate [] from the bitmap range stuff. If the
- * result is to be complemented, we have to show everything in one [],
- * as the inversion applies to the whole thing. Use {braces} to
- * separate them from anything in the bitmap and anything above the
- * bitmap. */
- if (unresolved) {
- if (inverted) {
- if (! do_sep) { /* If didn't output anything in the bitmap */
- sv_catpvs(sv, "^");
+ if (OP(o) != ANYOFH) {
+ /* Then all the things that could fit in the bitmap */
+ do_sep = put_charclass_bitmap_innards(sv,
+ ANYOF_BITMAP(o),
+ bitmap_range_not_in_bitmap,
+ only_utf8_locale_invlist,
+ o,
+
+ /* Can't try inverting for a
+ * better display if there
+ * are things that haven't
+ * been resolved */
+ unresolved != NULL);
+ SvREFCNT_dec(bitmap_range_not_in_bitmap);
+
+ /* If there are user-defined properties which haven't been defined
+ * yet, output them. If the result is not to be inverted, it is
+ * clearest to output them in a separate [] from the bitmap range
+ * stuff. If the result is to be complemented, we have to show
+ * everything in one [], as the inversion applies to the whole
+ * thing. Use {braces} to separate them from anything in the
+ * bitmap and anything above the bitmap. */
+ if (unresolved) {
+ if (inverted) {
+ if (! do_sep) { /* If didn't output anything in the bitmap
+ */
+ sv_catpvs(sv, "^");
+ }
+ sv_catpvs(sv, "{");
}
- sv_catpvs(sv, "{");
- }
- else if (do_sep) {
- Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
- }
- sv_catsv(sv, unresolved);
- if (inverted) {
- sv_catpvs(sv, "}");
+ else if (do_sep) {
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
+ PL_colors[0]);
+ }
+ sv_catsv(sv, unresolved);
+ if (inverted) {
+ sv_catpvs(sv, "}");
+ }
+ do_sep = ! inverted;
}
- do_sep = ! inverted;
}
/* And, finally, add the above-the-bitmap stuff */
SV * cp_list = get_ANYOFM_contents(o);
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+ if (OP(o) == NANYOFM) {
+ _invlist_invert(cp_list);
+ }
+
put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
void
Perl_init_uniprops(pTHX)
{
+ PL_user_def_props = newHV();
+
+#ifdef USE_ITHREADS
+
+ HvSHAREKEYS_off(PL_user_def_props);
+ PL_user_def_props_aTHX = aTHX;
+
+#endif
+
/* Set up the inversion list global variables */
PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
- PL_utf8_foldable = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
+ PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
UNI__PERL_FOLDS_TO_MULTI_CHAR]);
- PL_NonL1NonFinalFold = _new_invlist_C_array(
- NonL1_Perl_Non_Final_Folds_invlist);
+ PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
+ UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
+ PL_NonFinalFold = _new_invlist_C_array(uni_prop_ptrs[
+ UNI__PERL_NON_FINAL_FOLDS]);
PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
+ PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
+#ifdef UNI_XIDC
/* The below are used only by deprecated functions. They could be removed */
PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
+#endif
}
-SV *
-Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len,
- const bool to_fold, bool * invert)
+#if 0
+
+This code was mainly added for backcompat to give a warning for non-portable
+code points in user-defined properties. But experiments showed that the
+warning in earlier perls were only omitted on overflow, which should be an
+error, so there really isnt a backcompat issue, and actually adding the
+warning when none was present before might cause breakage, for little gain. So
+khw left this code in, but not enabled. Tests were never added.
+
+embed.fnc entry:
+Ei |const char *|get_extended_utf8_msg|const UV cp
+
+PERL_STATIC_INLINE const char *
+S_get_extended_utf8_msg(pTHX_ const UV cp)
{
- /* Parse the interior meat of \p{} passed to this in 'name' with length
- * 'name_len', and return an inversion list if a property with 'name' is
- * found, or NULL if not. 'name' point to the input with leading and
- * trailing space trimmed. 'to_fold' indicates if /i is in effect.
+ U8 dummy[UTF8_MAXBYTES + 1];
+ HV *msgs;
+ SV **msg;
+
+ uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
+ &msgs);
+
+ msg = hv_fetchs(msgs, "text", 0);
+ assert(msg);
+
+ (void) sv_2mortal((SV *) msgs);
+
+ return SvPVX(*msg);
+}
+
+#endif
+
+SV *
+Perl_handle_user_defined_property(pTHX_
+
+ /* Parses the contents of a user-defined property definition; returning the
+ * expanded definition if possible. If so, the return is an inversion
+ * list.
*
- * When the return is an inversion list, '*invert' will be set to a boolean
- * indicating if it should be inverted or not
+ * If there are subroutines that are part of the expansion and which aren't
+ * known at the time of the call to this function, this returns what
+ * parse_uniprop_string() returned for the first one encountered.
*
- * This currently doesn't handle all cases. A NULL return indicates the
- * caller should try a different approach
- */
+ * If an error was found, NULL is returned, and 'msg' gets a suitable
+ * message appended to it. (Appending allows the back trace of how we got
+ * to the faulty definition to be displayed through nested calls of
+ * user-defined subs.)
+ *
+ * The caller IS responsible for freeing any returned SV.
+ *
+ * The syntax of the contents is pretty much described in perlunicode.pod,
+ * but we also allow comments on each line */
+
+ const char * name, /* Name of property */
+ const STRLEN name_len, /* The name's length in bytes */
+ const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
+ const bool to_fold, /* ? Is this under /i */
+ const bool runtime, /* ? Are we in compile- or run-time */
+ SV* contents, /* The property's definition */
+ bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be
+ getting called unless this is thought to be
+ a user-defined property */
+ SV * msg, /* Any error or warning msg(s) are appended to
+ this */
+ const STRLEN level) /* Recursion level of this call */
+{
+ STRLEN len;
+ const char * string = SvPV_const(contents, len);
+ const char * const e = string + len;
+ const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
+ const STRLEN msgs_length_on_entry = SvCUR(msg);
+
+ const char * s0 = string; /* Points to first byte in the current line
+ being parsed in 'string' */
+ const char overflow_msg[] = "Code point too large in \"";
+ SV* running_definition = NULL;
+
+ PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
+
+ *user_defined_ptr = TRUE;
+
+ /* Look at each line */
+ while (s0 < e) {
+ const char * s; /* Current byte */
+ char op = '+'; /* Default operation is 'union' */
+ IV min = 0; /* range begin code point */
+ IV max = -1; /* and range end */
+ SV* this_definition;
+
+ /* Skip comment lines */
+ if (*s0 == '#') {
+ s0 = strchr(s0, '\n');
+ if (s0 == NULL) {
+ break;
+ }
+ s0++;
+ continue;
+ }
- char* lookup_name;
- bool stricter = FALSE;
- bool is_nv_type = FALSE; /* nv= or numeric_value=, or possibly one
- of the cjk numeric properties (though
- it requires extra effort to compile
- them) */
- unsigned int i;
- unsigned int j = 0, lookup_len;
- int equals_pos = -1; /* Where the '=' is found, or negative if none */
- int slash_pos = -1; /* Where the '/' is found, or negative if none */
- int table_index = 0;
- bool starts_with_In_or_Is = FALSE;
- Size_t lookup_offset = 0;
+ /* For backcompat, allow an empty first line */
+ if (*s0 == '\n') {
+ s0++;
+ continue;
+ }
+
+ /* First character in the line may optionally be the operation */
+ if ( *s0 == '+'
+ || *s0 == '!'
+ || *s0 == '-'
+ || *s0 == '&')
+ {
+ op = *s0++;
+ }
+
+ /* If the line is one or two hex digits separated by blank space, its
+ * a range; otherwise it is either another user-defined property or an
+ * error */
+
+ s = s0;
+
+ if (! isXDIGIT(*s)) {
+ goto check_if_property;
+ }
+
+ do { /* Each new hex digit will add 4 bits. */
+ if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
+ s = strchr(s, '\n');
+ if (s == NULL) {
+ s = e;
+ }
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpv(msg, overflow_msg);
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
+ UTF8fARG(is_contents_utf8, s - s0, s0));
+ sv_catpvs(msg, "\"");
+ goto return_msg;
+ }
+
+ /* Accumulate this digit into the value */
+ min = (min << 4) + READ_XDIGIT(s);
+ } while (isXDIGIT(*s));
+
+ while (isBLANK(*s)) { s++; }
+
+ /* We allow comments at the end of the line */
+ if (*s == '#') {
+ s = strchr(s, '\n');
+ if (s == NULL) {
+ s = e;
+ }
+ s++;
+ }
+ else if (s < e && *s != '\n') {
+ if (! isXDIGIT(*s)) {
+ goto check_if_property;
+ }
+
+ /* Look for the high point of the range */
+ max = 0;
+ do {
+ if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
+ s = strchr(s, '\n');
+ if (s == NULL) {
+ s = e;
+ }
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpv(msg, overflow_msg);
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
+ UTF8fARG(is_contents_utf8, s - s0, s0));
+ sv_catpvs(msg, "\"");
+ goto return_msg;
+ }
+
+ max = (max << 4) + READ_XDIGIT(s);
+ } while (isXDIGIT(*s));
+
+ while (isBLANK(*s)) { s++; }
+
+ if (*s == '#') {
+ s = strchr(s, '\n');
+ if (s == NULL) {
+ s = e;
+ }
+ }
+ else if (s < e && *s != '\n') {
+ goto check_if_property;
+ }
+ }
+
+ if (max == -1) { /* The line only had one entry */
+ max = min;
+ }
+ else if (max < min) {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Illegal range in \"");
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
+ UTF8fARG(is_contents_utf8, s - s0, s0));
+ sv_catpvs(msg, "\"");
+ goto return_msg;
+ }
+
+#if 0 /* See explanation at definition above of get_extended_utf8_msg() */
+
+ if ( UNICODE_IS_PERL_EXTENDED(min)
+ || UNICODE_IS_PERL_EXTENDED(max))
+ {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+
+ /* If both code points are non-portable, warn only on the lower
+ * one. */
+ sv_catpv(msg, get_extended_utf8_msg(
+ (UNICODE_IS_PERL_EXTENDED(min))
+ ? min : max));
+ sv_catpvs(msg, " in \"");
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
+ UTF8fARG(is_contents_utf8, s - s0, s0));
+ sv_catpvs(msg, "\"");
+ }
+
+#endif
+
+ /* Here, this line contains a legal range */
+ this_definition = sv_2mortal(_new_invlist(2));
+ this_definition = _add_range_to_invlist(this_definition, min, max);
+ goto calculate;
+
+ check_if_property:
+
+ /* Here it isn't a legal range line. See if it is a legal property
+ * line. First find the end of the meat of the line */
+ s = strpbrk(s, "#\n");
+ if (s == NULL) {
+ s = e;
+ }
+
+ /* Ignore trailing blanks in keeping with the requirements of
+ * parse_uniprop_string() */
+ s--;
+ while (s > s0 && isBLANK_A(*s)) {
+ s--;
+ }
+ s++;
+
+ this_definition = parse_uniprop_string(s0, s - s0,
+ is_utf8, to_fold, runtime,
+ user_defined_ptr, msg,
+ (name_len == 0)
+ ? level /* Don't increase level
+ if input is empty */
+ : level + 1
+ );
+ if (this_definition == NULL) {
+ goto return_msg; /* 'msg' should have had the reason appended to
+ it by the above call */
+ }
+
+ if (! is_invlist(this_definition)) { /* Unknown at this time */
+ return newSVsv(this_definition);
+ }
+
+ if (*s != '\n') {
+ s = strchr(s, '\n');
+ if (s == NULL) {
+ s = e;
+ }
+ }
+
+ calculate:
+
+ switch (op) {
+ case '+':
+ _invlist_union(running_definition, this_definition,
+ &running_definition);
+ break;
+ case '-':
+ _invlist_subtract(running_definition, this_definition,
+ &running_definition);
+ break;
+ case '&':
+ _invlist_intersection(running_definition, this_definition,
+ &running_definition);
+ break;
+ case '!':
+ _invlist_union_complement_2nd(running_definition,
+ this_definition, &running_definition);
+ break;
+ default:
+ Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
+ __FILE__, __LINE__, op);
+ break;
+ }
+
+ /* Position past the '\n' */
+ s0 = s + 1;
+ } /* End of loop through the lines of 'contents' */
+
+ /* Here, we processed all the lines in 'contents' without error. If we
+ * didn't add any warnings, simply return success */
+ if (msgs_length_on_entry == SvCUR(msg)) {
+
+ /* If the expansion was empty, the answer isn't nothing: its an empty
+ * inversion list */
+ if (running_definition == NULL) {
+ running_definition = _new_invlist(1);
+ }
+
+ return running_definition;
+ }
+
+ /* Otherwise, add some explanatory text, but we will return success */
+
+ return_msg:
+
+ if (name_len > 0) {
+ sv_catpvs(msg, " in expansion of ");
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
+ }
+
+ return running_definition;
+}
+
+/* As explained below, certain operations need to take place in the first
+ * thread created. These macros switch contexts */
+#ifdef USE_ITHREADS
+# define DECLARATION_FOR_GLOBAL_CONTEXT \
+ PerlInterpreter * save_aTHX = aTHX;
+# define SWITCH_TO_GLOBAL_CONTEXT \
+ PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
+# define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
+# define CUR_CONTEXT aTHX
+# define ORIGINAL_CONTEXT save_aTHX
+#else
+# define DECLARATION_FOR_GLOBAL_CONTEXT
+# define SWITCH_TO_GLOBAL_CONTEXT NOOP
+# define RESTORE_CONTEXT NOOP
+# define CUR_CONTEXT NULL
+# define ORIGINAL_CONTEXT NULL
+#endif
+
+STATIC void
+S_delete_recursion_entry(pTHX_ void *key)
+{
+ /* Deletes the entry used to detect recursion when expanding user-defined
+ * properties. This is a function so it can be set up to be called even if
+ * the program unexpectedly quits */
+
+ SV ** current_entry;
+ const STRLEN key_len = strlen((const char *) key);
+ DECLARATION_FOR_GLOBAL_CONTEXT;
+
+ SWITCH_TO_GLOBAL_CONTEXT;
+
+ /* If the entry is one of these types, it is a permanent entry, and not the
+ * one used to detect recursions. This function should delete only the
+ * recursion entry */
+ current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
+ if ( current_entry
+ && ! is_invlist(*current_entry)
+ && ! SvPOK(*current_entry))
+ {
+ (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
+ G_DISCARD);
+ }
+
+ RESTORE_CONTEXT;
+}
+
+SV *
+Perl_parse_uniprop_string(pTHX_
+
+ /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
+ * now. If so, the return is an inversion list.
+ *
+ * If the property is user-defined, it is a subroutine, which in turn
+ * may call other subroutines. This function will call the whole nest of
+ * them to get the definition they return; if some aren't known at the time
+ * of the call to this function, the fully qualified name of the highest
+ * level sub is returned. It is an error to call this function at runtime
+ * without every sub defined.
+ *
+ * If an error was found, NULL is returned, and 'msg' gets a suitable
+ * message appended to it. (Appending allows the back trace of how we got
+ * to the faulty definition to be displayed through nested calls of
+ * user-defined subs.)
+ *
+ * The caller should NOT try to free any returned inversion list.
+ *
+ * Other parameters will be set on return as described below */
+
+ const char * const name, /* The first non-blank in the \p{}, \P{} */
+ const Size_t name_len, /* Its length in bytes, not including any
+ trailing space */
+ const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
+ const bool to_fold, /* ? Is this under /i */
+ const bool runtime, /* TRUE if this is being called at run time */
+ bool *user_defined_ptr, /* Upon return from this function it will be
+ set to TRUE if any component is a
+ user-defined property */
+ SV * msg, /* Any error or warning msg(s) are appended to
+ this */
+ const STRLEN level) /* Recursion level of this call */
+{
+ char* lookup_name; /* normalized name for lookup in our tables */
+ unsigned lookup_len; /* Its length */
+ bool stricter = FALSE; /* Some properties have stricter name
+ normalization rules, which we decide upon
+ based on parsing */
+
+ /* nv= or numeric_value=, or possibly one of the cjk numeric properties
+ * (though it requires extra effort to download them from Unicode and
+ * compile perl to know about them) */
+ bool is_nv_type = FALSE;
+
+ unsigned int i, j = 0;
+ int equals_pos = -1; /* Where the '=' is found, or negative if none */
+ int slash_pos = -1; /* Where the '/' is found, or negative if none */
+ int table_index = 0; /* The entry number for this property in the table
+ of all Unicode property names */
+ bool starts_with_In_or_Is = FALSE; /* ? Does the name start with 'In' or
+ 'Is' */
+ Size_t lookup_offset = 0; /* Used to ignore the first few characters of
+ the normalized name in certain situations */
+ Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
+ part of a package name */
+ bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
+ property rather than a Unicode
+ one. */
+ SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
+ if an error. If it is an inversion list,
+ it is the definition. Otherwise it is a
+ string containing the fully qualified sub
+ name of 'name' */
+ bool invert_return = FALSE; /* ? Do we need to complement the result before
+ returning it */
PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
- /* The input will be modified into 'lookup_name' */
+ /* The input will be normalized into 'lookup_name' */
Newx(lookup_name, name_len, char);
SAVEFREEPV(lookup_name);
for (i = 0; i < name_len; i++) {
char cur = name[i];
- /* These characters can be freely ignored in most situations. Later it
- * may turn out we shouldn't have ignored them, and we have to reparse,
- * but we don't have enough information yet to make that decision */
- if (cur == '-' || cur == '_' || isSPACE_A(cur)) {
+ /* Most of the characters in the input will be of this ilk, being parts
+ * of a name */
+ if (isIDCONT_A(cur)) {
+
+ /* Case differences are ignored. Our lookup routine assumes
+ * everything is lowercase, so normalize to that */
+ if (isUPPER_A(cur)) {
+ lookup_name[j++] = toLOWER_A(cur);
+ continue;
+ }
+
+ if (cur == '_') { /* Don't include these in the normalized name */
+ continue;
+ }
+
+ lookup_name[j++] = cur;
+
+ /* The first character in a user-defined name must be of this type.
+ * */
+ if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
+ could_be_user_defined = FALSE;
+ }
+
continue;
}
- /* Case differences are also ignored. Our lookup routine assumes
- * everything is lowercase */
- if (isUPPER_A(cur)) {
- lookup_name[j++] = toLOWER(cur);
+ /* Here, the character is not something typically in a name, But these
+ * two types of characters (and the '_' above) can be freely ignored in
+ * most situations. Later it may turn out we shouldn't have ignored
+ * them, and we have to reparse, but we don't have enough information
+ * yet to make that decision */
+ if (cur == '-' || isSPACE_A(cur)) {
+ could_be_user_defined = FALSE;
continue;
}
- /* A double colon is either an error, or a package qualifier to a
- * subroutine user-defined property; neither of which do we currently
- * handle
- *
- * But a single colon is a synonym for '=' */
- if (cur == ':') {
- if (i < name_len - 1 && name[i+1] == ':') {
- return NULL;
- }
- cur = '=';
+ /* An equals sign or single colon mark the end of the first part of
+ * the property name */
+ if ( cur == '='
+ || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
+ {
+ lookup_name[j++] = '='; /* Treat the colon as an '=' */
+ equals_pos = j; /* Note where it occurred in the input */
+ could_be_user_defined = FALSE;
+ break;
}
/* Otherwise, this character is part of the name. */
lookup_name[j++] = cur;
- /* Only the equals sign needs further processing */
- if (cur == '=') {
- equals_pos = j; /* Note where it occurred in the input */
- break;
+ /* Here it isn't a single colon, so if it is a colon, it must be a
+ * double colon */
+ if (cur == ':') {
+
+ /* A double colon should be a package qualifier. We note its
+ * position and continue. Note that one could have
+ * pkg1::pkg2::...::foo
+ * so that the position at the end of the loop will be just after
+ * the final qualifier */
+
+ i++;
+ non_pkg_begin = i + 1;
+ lookup_name[j++] = ':';
+ }
+ else { /* Only word chars (and '::') can be in a user-defined name */
+ could_be_user_defined = FALSE;
}
+ } /* End of parsing through the lhs of the property name (or all of it if
+ no rhs) */
+
+#define STRLENs(s) (sizeof("" s "") - 1)
+
+ /* If there is a single package name 'utf8::', it is ambiguous. It could
+ * be for a user-defined property, or it could be a Unicode property, as
+ * all of them are considered to be for that package. For the purposes of
+ * parsing the rest of the property, strip it off */
+ if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
+ lookup_name += STRLENs("utf8::");
+ j -= STRLENs("utf8::");
+ equals_pos -= STRLENs("utf8::");
}
/* Here, we are either done with the whole property name, if it was simple;
}
}
- /* Certain properties need special handling. They may optionally be
- * prefixed by 'is'. Ignore that prefix for the purposes of checking
- * if this is one of those properties */
+ /* Certain properties whose values are numeric need special handling.
+ * They may optionally be prefixed by 'is'. Ignore that prefix for the
+ * purposes of checking if this is one of those properties */
if (memBEGINPs(lookup_name, name_len, "is")) {
lookup_offset = 2;
}
- /* Then check if it is one of these properties. This is hard-coded
- * because easier this way, and the list is unlikely to change. There
- * are several properties like this in the Unihan DB, which is unlikely
- * to be compiled, and they all end with 'numeric'. The interiors
+ /* Then check if it is one of these specially-handled properties. The
+ * possibilities are hard-coded because easier this way, and the list
+ * is unlikely to change.
+ *
+ * All numeric value type properties are of this ilk, and are also
+ * special in a different way later on. So find those first. There
+ * are several numeric value type properties in the Unihan DB (which is
+ * unlikely to be compiled with perl, but we handle it here in case it
+ * does get compiled). They all end with 'numeric'. The interiors
* aren't checked for the precise property. This would stop working if
* a cjk property were to be created that ended with 'numeric' and
* wasn't a numeric type */
{
unsigned int k;
- /* What makes these properties special is that the stuff after the
- * '=' is a number. Therefore, we can't throw away '-'
- * willy-nilly, as those could be a minus sign. Other stricter
+ /* Since the stuff after the '=' is a number, we can't throw away
+ * '-' willy-nilly, as those could be a minus sign. Other stricter
* rules also apply. However, these properties all can have the
* rhs not be a number, in which case they contain at least one
* alphabetic. In those cases, the stricter rules don't apply.
* But the numeric type properties can have the alphas [Ee] to
* signify an exponent, and it is still a number with stricter
- * rules. So look for an alpha that signifys not-strict */
+ * rules. So look for an alpha that signifies not-strict */
stricter = TRUE;
for (k = i; k < name_len; k++) {
if ( isALPHA_A(name[k])
* zeros, or between the final leading zero and the first other
* digit */
for (; i < name_len - 1; i++) {
- if ( name[i] != '0'
+ if ( name[i] != '0'
&& (name[i] != '_' || ! isDIGIT_A(name[i+1])))
{
break;
}
else { /* No '=' */
- /* We are now in a position to determine if this property should have
- * been parsed using stricter rules. Only a few are like that, and
- * unlikely to change. */
+ /* Only a few properties without an '=' should be parsed with stricter
+ * rules. The list is unlikely to change. */
if ( memBEGINPs(lookup_name, j, "perl")
&& memNEs(lookup_name + 4, j - 4, "space")
&& memNEs(lookup_name + 4, j - 4, "word"))
{
lookup_name[j++] = '&';
}
- else if (name_len > 2 && name[0] == 'I' && ( name[1] == 'n'
- || name[1] == 's'))
- {
-
- /* Also, if the original input began with 'In' or 'Is', it could be a
- * subroutine call instead of a property names, which currently isn't
- * handled by this function. Subroutine calls can't happen if there is
- * an '=' in the name */
- if (equals_pos < 0 && get_cvn_flags(name, name_len, GV_NOTQUAL) != NULL)
- {
- return NULL;
- }
+ /* If the original input began with 'In' or 'Is', it could be a subroutine
+ * call to a user-defined property instead of a Unicode property name. */
+ if ( non_pkg_begin + name_len > 2
+ && name[non_pkg_begin+0] == 'I'
+ && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
+ {
starts_with_In_or_Is = TRUE;
}
+ else {
+ could_be_user_defined = FALSE;
+ }
+
+ if (could_be_user_defined) {
+ CV* user_sub;
+
+ /* Here, the name could be for a user defined property, which are
+ * implemented as subs. */
+ user_sub = get_cvn_flags(name, name_len, 0);
+ if (user_sub) {
+
+ /* Here, there is a sub by the correct name. Normally we call it
+ * to get the property definition */
+ dSP;
+ SV * user_sub_sv = MUTABLE_SV(user_sub);
+ SV * error; /* Any error returned by calling 'user_sub' */
+ SV * fq_name; /* Fully qualified property name */
+ SV * placeholder;
+ char to_fold_string[] = "0:"; /* The 0 gets overwritten with the
+ actual value */
+ SV ** saved_user_prop_ptr; /* Hash entry for this property */
+
+ /* How many times to retry when another thread is in the middle of
+ * expanding the same definition we want */
+ PERL_INT_FAST8_T retry_countdown = 10;
+
+ DECLARATION_FOR_GLOBAL_CONTEXT;
+
+ /* If we get here, we know this property is user-defined */
+ *user_defined_ptr = TRUE;
+
+ /* We refuse to call a tainted subroutine; returning an error
+ * instead */
+ if (TAINT_get) {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Insecure user-defined property");
+ goto append_name_to_msg;
+ }
+
+ /* In principal, we only call each subroutine property definition
+ * once during the life of the program. This guarantees that the
+ * property definition never changes. The results of the single
+ * sub call are stored in a hash, which is used instead for future
+ * references to this property. The property definition is thus
+ * immutable. But, to allow the user to have a /i-dependent
+ * definition, we call the sub once for non-/i, and once for /i,
+ * should the need arise, passing the /i status as a parameter.
+ *
+ * We start by constructing the hash key name, consisting of the
+ * fully qualified subroutine name */
+ fq_name = sv_2mortal(newSV(10)); /* 10 is just a guess */
+ (void) cv_name(user_sub, fq_name, 0);
+
+ /* But precede the sub name in the key with the /i status, so that
+ * there is a key for /i and a different key for non-/i */
+ to_fold_string[0] = to_fold + '0';
+ sv_insert(fq_name, 0, 0, to_fold_string, 2);
+
+ /* We only call the sub once throughout the life of the program
+ * (with the /i, non-/i exception noted above). That means the
+ * hash must be global and accessible to all threads. It is
+ * created at program start-up, before any threads are created, so
+ * is accessible to all children. But this creates some
+ * complications.
+ *
+ * 1) The keys can't be shared, or else problems arise; sharing is
+ * turned off at hash creation time
+ * 2) All SVs in it are there for the remainder of the life of the
+ * program, and must be created in the same interpreter context
+ * as the hash, or else they will be freed from the wrong pool
+ * at global destruction time. This is handled by switching to
+ * the hash's context to create each SV going into it, and then
+ * immediately switching back
+ * 3) All accesses to the hash must be controlled by a mutex, to
+ * prevent two threads from getting an unstable state should
+ * they simultaneously be accessing it. The code below is
+ * crafted so that the mutex is locked whenever there is an
+ * access and unlocked only when the next stable state is
+ * achieved.
+ *
+ * The hash stores either the definition of the property if it was
+ * valid, or, if invalid, the error message that was raised. We
+ * use the type of SV to distinguish.
+ *
+ * There's also the need to guard against the definition expansion
+ * from infinitely recursing. This is handled by storing the aTHX
+ * of the expanding thread during the expansion. Again the SV type
+ * is used to distinguish this from the other two cases. If we
+ * come to here and the hash entry for this property is our aTHX,
+ * it means we have recursed, and the code assumes that we would
+ * infinitely recurse, so instead stops and raises an error.
+ * (Any recursion has always been treated as infinite recursion in
+ * this feature.)
+ *
+ * If instead, the entry is for a different aTHX, it means that
+ * that thread has gotten here first, and hasn't finished expanding
+ * the definition yet. We just have to wait until it is done. We
+ * sleep and retry a few times, returning an error if the other
+ * thread doesn't complete. */
+
+ re_fetch:
+ USER_PROP_MUTEX_LOCK;
+
+ /* If we have an entry for this key, the subroutine has already
+ * been called once with this /i status. */
+ saved_user_prop_ptr = hv_fetch(PL_user_def_props,
+ SvPVX(fq_name), SvCUR(fq_name), 0);
+ if (saved_user_prop_ptr) {
+
+ /* If the saved result is an inversion list, it is the valid
+ * definition of this property */
+ if (is_invlist(*saved_user_prop_ptr)) {
+ prop_definition = *saved_user_prop_ptr;
+
+ /* The SV in the hash won't be removed until global
+ * destruction, so it is stable and we can unlock */
+ USER_PROP_MUTEX_UNLOCK;
+
+ /* The caller shouldn't try to free this SV */
+ return prop_definition;
+ }
+
+ /* Otherwise, if it is a string, it is the error message
+ * that was returned when we first tried to evaluate this
+ * property. Fail, and append the message */
+ if (SvPOK(*saved_user_prop_ptr)) {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catsv(msg, *saved_user_prop_ptr);
+
+ /* The SV in the hash won't be removed until global
+ * destruction, so it is stable and we can unlock */
+ USER_PROP_MUTEX_UNLOCK;
+
+ return NULL;
+ }
+
+ assert(SvIOK(*saved_user_prop_ptr));
+
+ /* Here, we have an unstable entry in the hash. Either another
+ * thread is in the middle of expanding the property's
+ * definition, or we are ourselves recursing. We use the aTHX
+ * in it to distinguish */
+ if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
+
+ /* Here, it's another thread doing the expanding. We've
+ * looked as much as we are going to at the contents of the
+ * hash entry. It's safe to unlock. */
+ USER_PROP_MUTEX_UNLOCK;
+
+ /* Retry a few times */
+ if (retry_countdown-- > 0) {
+ PerlProc_sleep(1);
+ goto re_fetch;
+ }
+
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Timeout waiting for another thread to "
+ "define");
+ goto append_name_to_msg;
+ }
+
+ /* Here, we are recursing; don't dig any deeper */
+ USER_PROP_MUTEX_UNLOCK;
+
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg,
+ "Infinite recursion in user-defined property");
+ goto append_name_to_msg;
+ }
+
+ /* Here, this thread has exclusive control, and there is no entry
+ * for this property in the hash. So we have the go ahead to
+ * expand the definition ourselves. */
+
+ ENTER;
+
+ /* Create a temporary placeholder in the hash to detect recursion
+ * */
+ SWITCH_TO_GLOBAL_CONTEXT;
+ placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
+ (void) hv_store_ent(PL_user_def_props, fq_name, placeholder, 0);
+ RESTORE_CONTEXT;
+
+ /* Now that we have a placeholder, we can let other threads
+ * continue */
+ USER_PROP_MUTEX_UNLOCK;
+
+ /* Make sure the placeholder always gets destroyed */
+ SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(fq_name));
+
+ PUSHMARK(SP);
+ SAVETMPS;
+
+ /* Call the user's function, with the /i status as a parameter.
+ * Note that we have gone to a lot of trouble to keep this call
+ * from being within the locked mutex region. */
+ XPUSHs(boolSV(to_fold));
+ PUTBACK;
+
+ (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
+
+ SPAGAIN;
+
+ error = ERRSV;
+ if (SvTRUE(error)) {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Error \"");
+ sv_catsv(msg, error);
+ sv_catpvs(msg, "\"");
+ if (name_len > 0) {
+ sv_catpvs(msg, " in expansion of ");
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
+ name_len,
+ name));
+ }
+
+ (void) POPs;
+ prop_definition = NULL;
+ }
+ else { /* G_SCALAR guarantees a single return value */
+
+ /* The contents is supposed to be the expansion of the property
+ * definition. Call a function to check for valid syntax and
+ * handle it */
+ prop_definition = handle_user_defined_property(name, name_len,
+ is_utf8, to_fold, runtime,
+ POPs, user_defined_ptr,
+ msg,
+ level);
+ }
+
+ /* Here, we have the results of the expansion. Replace the
+ * placeholder with them. We need exclusive access to the hash,
+ * and we can't let anyone else in, between when we delete the
+ * placeholder and add the permanent entry */
+ USER_PROP_MUTEX_LOCK;
+
+ S_delete_recursion_entry(aTHX_ SvPVX(fq_name));
+
+ if (! prop_definition || is_invlist(prop_definition)) {
+
+ /* If we got success we use the inversion list defining the
+ * property; otherwise use the error message */
+ SWITCH_TO_GLOBAL_CONTEXT;
+ (void) hv_store_ent(PL_user_def_props,
+ fq_name,
+ ((prop_definition)
+ ? newSVsv(prop_definition)
+ : newSVsv(msg)),
+ 0);
+ RESTORE_CONTEXT;
+ }
+
+ /* All done, and the hash now has a permanent entry for this
+ * property. Give up exclusive control */
+ USER_PROP_MUTEX_UNLOCK;
+
+ FREETMPS;
+ LEAVE;
+
+ if (prop_definition) {
- lookup_len = j; /* Use a more mnemonic name starting here */
+ /* If the definition is for something not known at this time,
+ * we toss it, and go return the main property name, as that's
+ * the one the user will be aware of */
+ if (! is_invlist(prop_definition)) {
+ SvREFCNT_dec_NN(prop_definition);
+ goto definition_deferred;
+ }
+
+ sv_2mortal(prop_definition);
+ }
+
+ /* And return */
+ return prop_definition;
+
+ } /* End of calling the subroutine for the user-defined property */
+ } /* End of it could be a user-defined property */
+
+ /* Here it wasn't a user-defined property that is known at this time. See
+ * if it is a Unicode property */
+
+ lookup_len = j; /* This is a more mnemonic name than 'j' */
/* Get the index into our pointer table of the inversion list corresponding
* to the property */
table_index = match_uniprop((U8 *) lookup_name, lookup_len);
- /* If it didn't find the property */
+ /* If it didn't find the property ... */
if (table_index == 0) {
- /* If didn't find the property, we try again stripping off any initial
- * 'In' or 'Is' */
+ /* Try again stripping off any initial 'In' or 'Is' */
if (starts_with_In_or_Is) {
lookup_name += 2;
lookup_len -= 2;
if (table_index == 0) {
char * canonical;
- /* If not found, and not a numeric type property, isn't a legal
- * property */
+ /* Here, we didn't find it. If not a numeric type property, and
+ * can't be a user-defined one, it isn't a legal property */
if (! is_nv_type) {
- return NULL;
- }
+ if (! could_be_user_defined) {
+ goto failed;
+ }
+
+ /* Here, the property name is legal as a user-defined one. At
+ * compile time, it might just be that the subroutine for that
+ * property hasn't been encountered yet, but at runtime, it's
+ * an error to try to use an undefined one */
+ if (runtime) {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Unknown user-defined property name");
+ goto append_name_to_msg;
+ }
- /* But the numeric type properties need more work to decide. What
- * we do is make sure we have the number in canonical form and look
+ goto definition_deferred;
+ } /* End of isn't a numeric type property */
+
+ /* The numeric type properties need more work to decide. What we
+ * do is make sure we have the number in canonical form and look
* that up. */
if (slash_pos < 0) { /* No slash */
lookup_len - equals_pos)
!= lookup_name + lookup_len)
{
- return NULL;
+ goto failed;
}
- /* If the value is an integer, the canonical value is integral */
+ /* If the value is an integer, the canonical value is integral
+ * */
if (Perl_ceil(value) == value) {
canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
- equals_pos, lookup_name, value);
+ equals_pos, lookup_name, value);
}
else { /* Otherwise, it is %e with a known precision */
char * exp_ptr;
/* Convert the numerator to numeric */
end_ptr = this_lookup_name + slash_pos;
if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
- return NULL;
+ goto failed;
}
/* It better have included all characters before the slash */
if (*end_ptr != '/') {
- return NULL;
+ goto failed;
}
/* Set to look at just the denominator */
/* Convert the denominator to numeric */
if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
- return NULL;
+ goto failed;
}
/* It better be the rest of the characters, and don't divide by
if ( end_ptr != this_lookup_name + lookup_len
|| denominator == 0)
{
- return NULL;
+ goto failed;
}
/* Get the greatest common denominator using
/* If already in lowest possible terms, we have already tried
* looking this up */
if (gcd == 1) {
- return NULL;
+ goto failed;
}
- /* Reduce the rational, which should put it in canonical form.
- * Then look it up */
+ /* Reduce the rational, which should put it in canonical form
+ * */
numerator /= gcd;
denominator /= gcd;
/* Here, we have the number in canonical form. Try that */
table_index = match_uniprop((U8 *) canonical, strlen(canonical));
if (table_index == 0) {
- return NULL;
+ goto failed;
}
- }
- }
+ } /* End of still didn't find the property in our table */
+ } /* End of didn't find the property in our table */
- /* The return is an index into a table of ptrs. A negative return
- * signifies that the real index is the absolute value, but the result
- * needs to be inverted */
+ /* Here, we have a non-zero return, which is an index into a table of ptrs.
+ * A negative return signifies that the real index is the absolute value,
+ * but the result needs to be inverted */
if (table_index < 0) {
- *invert = TRUE;
+ invert_return = TRUE;
table_index = -table_index;
}
- else {
- *invert = FALSE;
- }
/* Out-of band indices indicate a deprecated property. The proper index is
* modulo it with the table size. And dividing by the table size yields
- * an offset into a table constructed to contain the corresponding warning
- * message */
+ * an offset into a table constructed by regen/mk_invlists.pl to contain
+ * the corresponding warning message */
if (table_index > MAX_UNI_KEYWORD_INDEX) {
Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
table_index %= MAX_UNI_KEYWORD_INDEX;
}
/* Create and return the inversion list */
- return _new_invlist_C_array(uni_prop_ptrs[table_index]);
+ prop_definition =_new_invlist_C_array(uni_prop_ptrs[table_index]);
+ if (invert_return) {
+ _invlist_invert(prop_definition);
+ }
+ sv_2mortal(prop_definition);
+ return prop_definition;
+
+
+ failed:
+ if (non_pkg_begin != 0) {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Illegal user-defined property name");
+ }
+ else {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Can't find Unicode property definition");
+ }
+ /* FALLTHROUGH */
+
+ append_name_to_msg:
+ {
+ const char * prefix = (runtime && level == 0) ? " \\p{" : " \"";
+ const char * suffix = (runtime && level == 0) ? "}" : "\"";
+
+ sv_catpv(msg, prefix);
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
+ sv_catpv(msg, suffix);
+ }
+
+ return NULL;
+
+ definition_deferred:
+
+ /* Here it could yet to be defined, so defer evaluation of this
+ * until its needed at runtime. */
+ prop_definition = newSVpvs_flags("", SVs_TEMP);
+
+ /* To avoid any ambiguity, the package is always specified.
+ * Use the current one if it wasn't included in our input */
+ if (non_pkg_begin == 0) {
+ const HV * pkg = (IN_PERL_COMPILETIME)
+ ? PL_curstash
+ : CopSTASH(PL_curcop);
+ const char* pkgname = HvNAME(pkg);
+
+ Perl_sv_catpvf(aTHX_ prop_definition, "%" UTF8f,
+ UTF8fARG(is_utf8, strlen(pkgname), pkgname));
+ sv_catpvs(prop_definition, "::");
+ }
+
+ Perl_sv_catpvf(aTHX_ prop_definition, "%" UTF8f,
+ UTF8fARG(is_utf8, name_len, name));
+ sv_catpvs(prop_definition, "\n");
+
+ *user_defined_ptr = TRUE;
+ return prop_definition;
}
#endif