May be the same as tail.
tail : item following the branch sequence
count : words in the sequence
- flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
+ flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
depth : indent depth
Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
else { /* Here is a generic multi-char fold. */
U8* multi_end = s + len;
- /* Count how many characters in it. In the case of /aa, no
- * folds which contain ASCII code points are allowed, so
- * check for those, and skip if found. */
+ /* Count how many characters are in it. In the case of
+ * /aa, no folds which contain ASCII code points are
+ * allowed, so check for those, and skip if found. */
if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
count = utf8_length(s, multi_end);
s = multi_end;
}
flags &= ~SCF_DO_STCLASS;
}
- else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
+ else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
+ EXACTFish */
SSize_t l = STR_LEN(scan);
UV uc = *((U8*)STRING(scan));
SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
separate code points */
+ const U8 * s = (U8*)STRING(scan);
/* Search for fixed substrings supports EXACT only. */
if (flags & SCF_DO_SUBSTR) {
scan_commit(pRExC_state, data, minlenp, is_inf);
}
if (UTF) {
- const U8 * const s = (U8 *)STRING(scan);
uc = utf8_to_uvchr_buf(s, s + l, NULL);
l = utf8_length(s, s + l);
}
data->longest = &(data->longest_float);
}
}
- if (OP(scan) == EXACTFL) {
- /* We don't know what the folds are; it could be anything. XXX
- * Actually, we only support UTF-8 encoding for code points
- * above Latin1, so we could know what those folds are. */
- EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
- 0,
- UV_MAX);
+ if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
+ ssc_clear_locale(data->start_class);
}
- else { /* Non-locale EXACTFish */
- EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
- if (flags & SCF_DO_STCLASS_AND) {
- ssc_clear_locale(data->start_class);
+
+ if (! UTF) {
+
+ /* We punt and assume can match anything if the node begins
+ * with a multi-character fold. Things are complicated. For
+ * example, /ffi/i could match any of:
+ * "\N{LATIN SMALL LIGATURE FFI}"
+ * "\N{LATIN SMALL LIGATURE FF}I"
+ * "F\N{LATIN SMALL LIGATURE FI}"
+ * plus several other things; and making sure we have all the
+ * possibilities is hard. */
+ if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
+ EXACTF_invlist =
+ _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
}
- if (uc < 256) { /* We know what the Latin1 folds are ... */
- if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we
- know if anything folds
- with this */
- EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
+ else {
+
+ /* Any Latin1 range character can potentially match any
+ * other depending on the locale */
+ if (OP(scan) == EXACTFL) {
+ _invlist_union(EXACTF_invlist, PL_Latin1,
+ &EXACTF_invlist);
+ }
+ else {
+ /* But otherwise, it matches at least itself. We can
+ * quickly tell if it has a distinct fold, and if so,
+ * it matches that as well */
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
+ if (IS_IN_SOME_FOLD_L1(uc)) {
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
PL_fold_latin1[uc]);
- if (OP(scan) != EXACTFA) { /* The folds below aren't
- legal under /iaa */
- if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
- EXACTF_invlist
- = add_cp_to_invlist(EXACTF_invlist,
- LATIN_SMALL_LETTER_SHARP_S);
- }
- else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
- EXACTF_invlist
- = add_cp_to_invlist(EXACTF_invlist, 's');
- EXACTF_invlist
- = add_cp_to_invlist(EXACTF_invlist, 'S');
- }
}
+ }
- /* We also know if there are above-Latin1 code points
- * that fold to this (none legal for ASCII and /iaa) */
- if ((! isASCII(uc) || OP(scan) != EXACTFA)
- && HAS_NONLATIN1_FOLD_CLOSURE(uc))
- {
- /* XXX We could know exactly what does fold to this
- * if the reverse folds are loaded, as currently in
- * S_regclass() */
- _invlist_union(EXACTF_invlist,
- PL_AboveLatin1,
- &EXACTF_invlist);
+ /* Some characters match above-Latin1 ones under /i. This
+ * is true of EXACTFL ones when the locale is UTF-8 */
+ if (HAS_NONLATIN1_FOLD_CLOSURE(uc)
+ && (! isASCII(uc) || (OP(scan) != EXACTFA
+ && OP(scan) != EXACTFA_NO_TRIE)))
+ {
+ add_above_Latin1_folds(pRExC_state,
+ (U8) uc,
+ &EXACTF_invlist);
+ }
+ }
+ }
+ else { /* Pattern is UTF-8 */
+ U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
+ STRLEN foldlen = UTF8SKIP(s);
+ const U8* e = s + STR_LEN(scan);
+ SV** listp;
+
+ /* The only code points that aren't folded in a UTF EXACTFish
+ * node are are the problematic ones in EXACTFL nodes */
+ if (OP(scan) == EXACTFL
+ && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
+ {
+ /* We need to check for the possibility that this EXACTFL
+ * node begins with a multi-char fold. Therefore we fold
+ * the first few characters of it so that we can make that
+ * check */
+ U8 *d = folded;
+ int i;
+
+ for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
+ if (isASCII(*s)) {
+ *(d++) = (U8) toFOLD(*s);
+ s++;
+ }
+ else {
+ STRLEN len;
+ to_utf8_fold(s, d, &len);
+ d += len;
+ s += UTF8SKIP(s);
}
}
+
+ /* And set up so the code below that looks in this folded
+ * buffer instead of the node's string */
+ e = d;
+ foldlen = UTF8SKIP(folded);
+ s = folded;
+ }
+
+ /* When we reach here 's' points to the fold of the first
+ * character(s) of the node; and 'e' points to far enough along
+ * the folded string to be just past any possible multi-char
+ * fold. 'foldlen' is the length in bytes of the first
+ * character in 's'
+ *
+ * Unlike the non-UTF-8 case, the macro for determining if a
+ * string is a multi-char fold requires all the characters to
+ * already be folded. This is because of all the complications
+ * if not. Note that they are folded anyway, except in EXACTFL
+ * nodes. Like the non-UTF case above, we punt if the node
+ * begins with a multi-char fold */
+
+ if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
+ EXACTF_invlist =
+ _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
}
- else { /* Non-locale, above Latin1. XXX We don't currently
- know what participates in folds with this, so have
- to assume anything could */
-
- /* XXX We could know exactly what does fold to this if the
- * reverse folds are loaded, as currently in S_regclass().
- * But we do know that under /iaa nothing in the ASCII
- * range can participate */
- if (OP(scan) == EXACTFA) {
- _invlist_union_complement_2nd(EXACTF_invlist,
- PL_XPosix_ptrs[_CC_ASCII],
- &EXACTF_invlist);
+ else { /* Single char fold */
+
+ /* It matches all the things that fold to it, which are
+ * found in PL_utf8_foldclosures (including itself) */
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
+ if (! PL_utf8_foldclosures) {
+ _load_PL_utf8_foldclosures();
}
- else {
- EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
- 0, UV_MAX);
+ if ((listp = hv_fetch(PL_utf8_foldclosures,
+ (char *) s, foldlen, FALSE)))
+ {
+ AV* list = (AV*) *listp;
+ IV k;
+ for (k = 0; k <= av_tindex(list); k++) {
+ SV** c_p = av_fetch(list, k, FALSE);
+ UV c;
+ assert(c_p);
+
+ c = SvUV(*c_p);
+
+ /* /aa doesn't allow folds between ASCII and non- */
+ if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
+ && isASCII(c) != isASCII(uc))
+ {
+ continue;
+ }
+
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
+ }
}
}
}
return count;
}
-/*XXX: todo make this not included in a non debugging perl */
+/*XXX: todo make this not included in a non debugging perl, but appears to be
+ * used anyway there, in 'use re' */
#ifndef PERL_IN_XSUB_RE
void
Perl_reginitcolors(pTHX)
count += 2;
}
}
+
+void
+Perl__load_PL_utf8_foldclosures (pTHX)
+{
+ assert(! PL_utf8_foldclosures);
+
+ /* If the folds haven't been read in, call a fold function
+ * to force that */
+ if (! PL_utf8_tofold) {
+ U8 dummy[UTF8_MAXBYTES_CASE+1];
+
+ /* This string is just a short named one above \xff */
+ to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
+ assert(PL_utf8_tofold); /* Verify that worked */
+ }
+ PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
+}
#endif
#ifdef PERL_ARGS_ASSERT__INVLISTEQ
case '@': /* (?@...) */
vFAIL2("Sequence (?%c...) not implemented", (int)paren);
break;
- case '#': /* (?#...) */
- /* XXX As soon as we disallow separating the '?' and '*' (by
- * spaces or (?#...) comment), it is believed that this case
- * will be unreachable and can be removed. See
- * [perl #117327] */
- while (*RExC_parse && *RExC_parse != ')')
- RExC_parse++;
- if (*RExC_parse != ')')
- FAIL("Sequence (?#... not terminated");
- nextchar(pRExC_state);
- *flagp = TRYAGAIN;
- return NULL;
case '0' : /* (?0) */
case 'R' : /* (?R) */
if (*RExC_parse != ')')
}
#undef IS_OPERAND
+STATIC void
+S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
+{
+ /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
+ * innocent-looking character class, like /[ks]/i won't have to go out to
+ * disk to find the possible matches.
+ *
+ * This should be called only for a Latin1-range code points, cp, which is
+ * known to be involved in a fold with other code points above Latin1. It
+ * would give false results if /aa has been specified. Multi-char folds
+ * are outside the scope of this, and must be handled specially.
+ *
+ * XXX It would be better to generate these via regen, in case a new
+ * version of the Unicode standard adds new mappings, though that is not
+ * really likely, and may be caught by the default: case of the switch
+ * below. */
+
+ PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
+
+ switch (cp) {
+ case 'k':
+ case 'K':
+ *invlist =
+ add_cp_to_invlist(*invlist, KELVIN_SIGN);
+ break;
+ case 's':
+ case 'S':
+ *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
+ break;
+ case MICRO_SIGN:
+ *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
+ *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
+ break;
+ case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
+ case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
+ *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
+ break;
+ case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
+ *invlist = add_cp_to_invlist(*invlist,
+ LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
+ break;
+ case LATIN_SMALL_LETTER_SHARP_S:
+ *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
+ break;
+ case 'F': case 'f':
+ case 'I': case 'i':
+ case 'L': case 'l':
+ case 'T': case 't':
+ case 'A': case 'a':
+ case 'H': case 'h':
+ case 'J': case 'j':
+ case 'N': case 'n':
+ case 'W': case 'w':
+ case 'Y': case 'y':
+ /* These all are targets of multi-character folds from code points
+ * that require UTF8 to express, so they can't match unless the
+ * target string is in UTF-8, so no action here is necessary, as
+ * regexec.c properly handles the general case for UTF-8 matching
+ * and multi-char folds */
+ break;
+ default:
+ /* Use deprecated warning to increase the chances of this being
+ * output */
+ ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
+ break;
+ }
+}
+
/* The names of properties whose definitions are not known at compile time are
* stored in this SV, after a constant heading. So if the length has been
* changed since initialization, then there is a run-time definition. */
}
if (!SIZE_ONLY) {
SV* invlist;
- char* formatted;
char* name;
if (UCHARAT(RExC_parse) == '^') {
* will have its name be <__NAME_i>. The design is
* discussed in commit
* 2f833f5208e26b208886e51e09e2c072b5eabb46 */
- formatted = Perl_form(aTHX_
+ name = savepv(Perl_form(aTHX_
"%s%.*s%s\n",
(FOLD) ? "__" : "",
(int)n,
RExC_parse,
(FOLD) ? "_i" : ""
- );
- name = savepvn(formatted, strlen(formatted));
+ ));
/* Look up the property name, and get its swash and
* inversion list, if the property is found */
"Property '%"UTF8f"' is unknown",
UTF8fARG(UTF, n, name));
}
+
+ /* If the property name doesn't already have a package
+ * name, add the current one to it so that it can be
+ * referred to outside it. [perl #121777] */
+ if (! instr(name, "::") && PL_curstash) {
+ char* full_name = Perl_form(aTHX_
+ "%s::%s",
+ HvNAME(PL_curstash),
+ name);
+ n = strlen(full_name);
+ Safefree(name);
+ name = savepvn(full_name, n);
+ }
Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
(value == 'p' ? '+' : '!'),
UTF8fARG(UTF, n, name));
ANYOF_POSIXL_ZERO(ret);
}
+ /* Coverity thinks it is possible for this to be negative; both
+ * jhi and khw think it's not, but be safer */
+ assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
+ || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
+
/* See if it already matches the complement of this POSIX
* class */
if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
&& ((prevvalue >= 'a' && value <= 'z')
|| (prevvalue >= 'A' && value <= 'Z')))
{
- _invlist_intersection(this_range, PL_ASCII,
+ _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
&this_range);
/* Since this above only contains ascii, the intersection of it
/* This is a hash that for a particular fold gives all
* characters that are involved in it */
if (! PL_utf8_foldclosures) {
-
- /* If the folds haven't been read in, call a fold function
- * to force that */
- if (! PL_utf8_tofold) {
- U8 dummy[UTF8_MAXBYTES_CASE+1];
-
- /* This string is just a short named one above \xff */
- to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
- assert(PL_utf8_tofold); /* Verify that worked */
- }
- PL_utf8_foldclosures
- = _swash_inversion_hash(PL_utf8_tofold);
+ _load_PL_utf8_foldclosures();
}
}
if (j < 256) {
- /* We have the latin1 folding rules hard-coded here so
- * that an innocent-looking character class, like
- * /[ks]/i won't have to go out to disk to find the
- * possible matches. XXX It would be better to
- * generate these via regen, in case a new version of
- * the Unicode standard adds new mappings, though that
- * is not really likely, and may be caught by the
- * default: case of the switch below. */
-
if (IS_IN_SOME_FOLD_L1(j)) {
/* ASCII is always matched; non-ASCII is matched
if (HAS_NONLATIN1_FOLD_CLOSURE(j)
&& (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
{
- /* Certain Latin1 characters have matches outside
- * Latin1. To get here, <j> is one of those
- * characters. None of these matches is valid for
- * ASCII characters under /aa, which is why the 'if'
- * just above excludes those. These matches only
- * happen when the target string is utf8. The code
- * below adds the single fold closures for <j> to the
- * inversion list. */
-
- switch (j) {
- case 'k':
- case 'K':
- *use_list =
- add_cp_to_invlist(*use_list, KELVIN_SIGN);
- break;
- case 's':
- case 'S':
- *use_list = add_cp_to_invlist(*use_list,
- LATIN_SMALL_LETTER_LONG_S);
- break;
- case MICRO_SIGN:
- *use_list = add_cp_to_invlist(*use_list,
- GREEK_CAPITAL_LETTER_MU);
- *use_list = add_cp_to_invlist(*use_list,
- GREEK_SMALL_LETTER_MU);
- break;
- case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
- case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
- *use_list =
- add_cp_to_invlist(*use_list, ANGSTROM_SIGN);
- break;
- case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
- *use_list = add_cp_to_invlist(*use_list,
- LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
- break;
- case LATIN_SMALL_LETTER_SHARP_S:
- *use_list = add_cp_to_invlist(*use_list,
- LATIN_CAPITAL_LETTER_SHARP_S);
- break;
- case 'F': case 'f':
- case 'I': case 'i':
- case 'L': case 'l':
- case 'T': case 't':
- case 'A': case 'a':
- case 'H': case 'h':
- case 'J': case 'j':
- case 'N': case 'n':
- case 'W': case 'w':
- case 'Y': case 'y':
- /* These all are targets of multi-character
- * folds from code points that require UTF8
- * to express, so they can't match unless
- * the target string is in UTF-8, so no
- * action here is necessary, as regexec.c
- * properly handles the general case for
- * UTF-8 matching and multi-char folds */
- break;
- default:
- /* Use deprecated warning to increase the
- * chances of this being output */
- ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
- break;
- }
+ add_above_Latin1_folds(pRExC_state,
+ (U8) j,
+ use_list);
}
continue;
}
for (k = 0; k <= av_tindex(list); k++) {
SV** c_p = av_fetch(list, k, FALSE);
UV c;
- if (c_p == NULL) {
- Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
- }
+ assert(c_p);
+
c = SvUV(*c_p);
/* /aa doesn't allow folds between ASCII and non- */