#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
+#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
+ _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
STATIC SV*
S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
- const regnode_charclass_posixl* const node)
+ const regnode_charclass* const node)
{
/* Returns a mortal inversion list defining which code points are matched
* by 'node', which is of type ANYOF. Handles complementing the result if
}
}
else {
- anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
- (regnode_charclass_posixl*) and_with);
+ anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
}
|| (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
{
/* One or the other of P1, P2 is non-empty. */
- ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
+ if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
+ ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
+ }
ssc_union(ssc, anded_cp_list, FALSE);
}
else { /* P1 = P2 = empty */
ored_flags = ANYOF_FLAGS(or_with);
}
else {
- ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
- (regnode_charclass_posixl*) or_with);
+ ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
}
&& ! is_ANYOF_SYNTHETIC(or_with))
{
/* We ignore P2, leaving P1 going forward */
- }
- else { /* Not inverted */
+ } /* else Not inverted */
+ else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
unsigned int i;
set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
NULL, NULL, NULL, FALSE);
+ /* Make sure is clone-safe */
+ ssc->invlist = NULL;
+
if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
}
May be the same as tail.
tail : item following the branch sequence
count : words in the sequence
- flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
+ flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
depth : indent depth
Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
});
re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
+ assert(re_trie_maxbuff);
if (!SvIOK(re_trie_maxbuff)) {
sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
}
regnode *noper = NEXTOPER( cur );
const U8 *uc = (U8*)STRING( noper );
const U8 *e = uc + STR_LEN( noper );
- STRLEN foldlen = 0;
+ int foldlen = 0;
U32 wordlen = 0; /* required init */
- STRLEN minbytes = 0;
- STRLEN maxbytes = 0;
+ STRLEN minchars = 0;
+ STRLEN maxchars = 0;
bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
bitmap?*/
TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
}
}
- for ( ; uc < e ; uc += len ) {
+ for ( ; uc < e ; uc += len ) { /* Look at each char in the current
+ branch */
TRIE_CHARCOUNT(trie)++;
TRIE_READ_CHAR;
- /* Acummulate to the current values, the range in the number of
- * bytes that this character could match. The max is presumed to
- * be the same as the folded input (which TRIE_READ_CHAR returns),
- * except that when this is not in UTF-8, it could be matched
- * against a string which is UTF-8, and the variant characters
- * could be 2 bytes instead of the 1 here. Likewise, for the
- * minimum number of bytes when not folded. When folding, the min
- * is assumed to be 1 byte could fold to match the single character
- * here, or in the case of a multi-char fold, 1 byte can fold to
- * the whole sequence. 'foldlen' is used to denote whether we are
- * in such a sequence, skipping the min setting if so. XXX TODO
- * Use the exact list of what folds to each character, from
- * PL_utf8_foldclosures */
- if (UTF) {
- maxbytes += UTF8SKIP(uc);
- if (! folder) {
- /* A non-UTF-8 string could be 1 byte to match our 2 */
- minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
- ? 1
- : UTF8SKIP(uc);
- }
- else {
- if (foldlen) {
- foldlen -= UTF8SKIP(uc);
- }
- else {
- foldlen = is_MULTI_CHAR_FOLD_utf8(uc);
- minbytes++;
- }
- }
+ /* TRIE_READ_CHAR returns the current character, or its fold if /i
+ * is in effect. Under /i, this character can match itself, or
+ * anything that folds to it. If not under /i, it can match just
+ * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
+ * all fold to k, and all are single characters. But some folds
+ * expand to more than one character, so for example LATIN SMALL
+ * LIGATURE FFI folds to the three character sequence 'ffi'. If
+ * the string beginning at 'uc' is 'ffi', it could be matched by
+ * three characters, or just by the one ligature character. (It
+ * could also be matched by two characters: LATIN SMALL LIGATURE FF
+ * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
+ * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
+ * match.) The trie needs to know the minimum and maximum number
+ * of characters that could match so that it can use size alone to
+ * quickly reject many match attempts. The max is simple: it is
+ * the number of folded characters in this branch (since a fold is
+ * never shorter than what folds to it. */
+
+ maxchars++;
+
+ /* And the min is equal to the max if not under /i (indicated by
+ * 'folder' being NULL), or there are no multi-character folds. If
+ * there is a multi-character fold, the min is incremented just
+ * once, for the character that folds to the sequence. Each
+ * character in the sequence needs to be added to the list below of
+ * characters in the trie, but we count only the first towards the
+ * min number of characters needed. This is done through the
+ * variable 'foldlen', which is returned by the macros that look
+ * for these sequences as the number of bytes the sequence
+ * occupies. Each time through the loop, we decrement 'foldlen' by
+ * how many bytes the current char occupies. Only when it reaches
+ * 0 do we increment 'minchars' or look for another multi-character
+ * sequence. */
+ if (folder == NULL) {
+ minchars++;
+ }
+ else if (foldlen > 0) {
+ foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
}
else {
- maxbytes += (UNI_IS_INVARIANT(*uc))
- ? 1
- : 2;
- if (! folder) {
- minbytes++;
- }
- else {
- if (foldlen) {
- foldlen--;
- }
- else {
- foldlen = is_MULTI_CHAR_FOLD_latin1(uc);
- minbytes++;
+ minchars++;
+
+ /* See if *uc is the beginning of a multi-character fold. If
+ * so, we decrement the length remaining to look at, to account
+ * for the current character this iteration. (We can use 'uc'
+ * instead of the fold returned by TRIE_READ_CHAR because for
+ * non-UTF, the latin1_safe macro is smart enough to account
+ * for all the unfolded characters, and because for UTF, the
+ * string will already have been folded earlier in the
+ * compilation process */
+ if (UTF) {
+ if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
+ foldlen -= UTF8SKIP(uc);
}
}
+ else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
+ foldlen--;
+ }
}
+
+ /* The current character (and any potential folds) should be added
+ * to the possible matching characters for this position in this
+ * branch */
if ( uvc < 256 ) {
if ( folder ) {
U8 folded= folder[ (U8) uvc ];
set_bit = 0; /* We've done our bit :-) */
}
} else {
+
+ /* XXX We could come up with the list of code points that fold
+ * to this using PL_utf8_foldclosures, except not for
+ * multi-char folds, as there may be multiple combinations
+ * there that could work, which needs to wait until runtime to
+ * resolve (The comment about LIGATURE FFI above is such an
+ * example */
+
SV** svpp;
if ( !widecharmap )
widecharmap = newHV();
TRIE_STORE_REVCHAR(uvc);
}
}
- }
+ } /* end loop through characters in this branch of the trie */
+
+ /* We take the min and max for this branch and combine to find the min
+ * and max for all branches processed so far */
if( cur == first ) {
- trie->minlen = minbytes;
- trie->maxlen = maxbytes;
- } else if (minbytes < trie->minlen) {
- trie->minlen = minbytes;
- } else if (maxbytes > trie->maxlen) {
- trie->maxlen = maxbytes;
+ trie->minlen = minchars;
+ trie->maxlen = maxchars;
+ } else if (minchars < trie->minlen) {
+ trie->minlen = minchars;
+ } else if (maxchars > trie->maxlen) {
+ trie->maxlen = maxchars;
}
} /* end first pass */
DEBUG_TRIE_COMPILE_r(
: MADE_TRIE;
}
-STATIC void
-S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
+STATIC regnode *
+S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
{
/* The Trie is constructed and compressed now so we can build a fail array if
* it's needed
U32 *fail;
reg_ac_data *aho;
const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
+ regnode *stclass;
GET_RE_DEBUG_FLAGS_DECL;
- PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
+ PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
#ifndef DEBUGGING
PERL_UNUSED_ARG(depth);
#endif
+ if ( OP(source) == TRIE ) {
+ struct regnode_1 *op = (struct regnode_1 *)
+ PerlMemShared_calloc(1, sizeof(struct regnode_1));
+ StructCopy(source,op,struct regnode_1);
+ stclass = (regnode *)op;
+ } else {
+ struct regnode_charclass *op = (struct regnode_charclass *)
+ PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
+ StructCopy(source,op,struct regnode_charclass);
+ stclass = (regnode *)op;
+ }
+ OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
ARG_SET( stclass, data_slot );
aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
});
Safefree(q);
/*RExC_seen |= REG_TRIEDFA_SEEN;*/
+ return stclass;
}
length sequence we are looking for is 2 */
{
int count = 0; /* How many characters in a multi-char fold */
- int len = is_MULTI_CHAR_FOLD_utf8(s);
+ int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
if (! len) { /* Not a multi-char fold: get next char */
s += UTF8SKIP(s);
continue;
else { /* Here is a generic multi-char fold. */
U8* multi_end = s + len;
- /* Count how many characters in it. In the case of /aa, no
- * folds which contain ASCII code points are allowed, so
- * check for those, and skip if found. */
+ /* Count how many characters are in it. In the case of
+ * /aa, no folds which contain ASCII code points are
+ * allowed, so check for those, and skip if found. */
if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
count = utf8_length(s, multi_end);
s = multi_end;
: s_end -1;
while (s < upper) {
- int len = is_MULTI_CHAR_FOLD_latin1(s);
+ int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
if (! len) { /* Not a multi-char fold. */
if (*s == LATIN_SMALL_LETTER_SHARP_S
&& (OP(scan) == EXACTF || OP(scan) == EXACTFL))
}
flags &= ~SCF_DO_STCLASS;
}
- else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
+ else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
+ EXACTFish */
SSize_t l = STR_LEN(scan);
UV uc = *((U8*)STRING(scan));
SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
separate code points */
+ const U8 * s = (U8*)STRING(scan);
/* Search for fixed substrings supports EXACT only. */
if (flags & SCF_DO_SUBSTR) {
scan_commit(pRExC_state, data, minlenp, is_inf);
}
if (UTF) {
- const U8 * const s = (U8 *)STRING(scan);
uc = utf8_to_uvchr_buf(s, s + l, NULL);
l = utf8_length(s, s + l);
}
data->longest = &(data->longest_float);
}
}
- if (OP(scan) == EXACTFL) {
- /* We don't know what the folds are; it could be anything. XXX
- * Actually, we only support UTF-8 encoding for code points
- * above Latin1, so we could know what those folds are. */
- EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
- 0,
- UV_MAX);
+ if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
+ ssc_clear_locale(data->start_class);
}
- else { /* Non-locale EXACTFish */
- EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
- if (flags & SCF_DO_STCLASS_AND) {
- ssc_clear_locale(data->start_class);
+
+ if (! UTF) {
+
+ /* We punt and assume can match anything if the node begins
+ * with a multi-character fold. Things are complicated. For
+ * example, /ffi/i could match any of:
+ * "\N{LATIN SMALL LIGATURE FFI}"
+ * "\N{LATIN SMALL LIGATURE FF}I"
+ * "F\N{LATIN SMALL LIGATURE FI}"
+ * plus several other things; and making sure we have all the
+ * possibilities is hard. */
+ if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
+ EXACTF_invlist =
+ _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
}
- if (uc < 256) { /* We know what the Latin1 folds are ... */
- if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we
- know if anything folds
- with this */
- EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
+ else {
+
+ /* Any Latin1 range character can potentially match any
+ * other depending on the locale */
+ if (OP(scan) == EXACTFL) {
+ _invlist_union(EXACTF_invlist, PL_Latin1,
+ &EXACTF_invlist);
+ }
+ else {
+ /* But otherwise, it matches at least itself. We can
+ * quickly tell if it has a distinct fold, and if so,
+ * it matches that as well */
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
+ if (IS_IN_SOME_FOLD_L1(uc)) {
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
PL_fold_latin1[uc]);
- if (OP(scan) != EXACTFA) { /* The folds below aren't
- legal under /iaa */
- if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
- EXACTF_invlist
- = add_cp_to_invlist(EXACTF_invlist,
- LATIN_SMALL_LETTER_SHARP_S);
- }
- else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
- EXACTF_invlist
- = add_cp_to_invlist(EXACTF_invlist, 's');
- EXACTF_invlist
- = add_cp_to_invlist(EXACTF_invlist, 'S');
- }
}
+ }
- /* We also know if there are above-Latin1 code points
- * that fold to this (none legal for ASCII and /iaa) */
- if ((! isASCII(uc) || OP(scan) != EXACTFA)
- && HAS_NONLATIN1_FOLD_CLOSURE(uc))
- {
- /* XXX We could know exactly what does fold to this
- * if the reverse folds are loaded, as currently in
- * S_regclass() */
- _invlist_union(EXACTF_invlist,
- PL_AboveLatin1,
- &EXACTF_invlist);
+ /* Some characters match above-Latin1 ones under /i. This
+ * is true of EXACTFL ones when the locale is UTF-8 */
+ if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
+ && (! isASCII(uc) || (OP(scan) != EXACTFA
+ && OP(scan) != EXACTFA_NO_TRIE)))
+ {
+ add_above_Latin1_folds(pRExC_state,
+ (U8) uc,
+ &EXACTF_invlist);
+ }
+ }
+ }
+ else { /* Pattern is UTF-8 */
+ U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
+ STRLEN foldlen = UTF8SKIP(s);
+ const U8* e = s + STR_LEN(scan);
+ SV** listp;
+
+ /* The only code points that aren't folded in a UTF EXACTFish
+ * node are are the problematic ones in EXACTFL nodes */
+ if (OP(scan) == EXACTFL
+ && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
+ {
+ /* We need to check for the possibility that this EXACTFL
+ * node begins with a multi-char fold. Therefore we fold
+ * the first few characters of it so that we can make that
+ * check */
+ U8 *d = folded;
+ int i;
+
+ for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
+ if (isASCII(*s)) {
+ *(d++) = (U8) toFOLD(*s);
+ s++;
+ }
+ else {
+ STRLEN len;
+ to_utf8_fold(s, d, &len);
+ d += len;
+ s += UTF8SKIP(s);
}
}
+
+ /* And set up so the code below that looks in this folded
+ * buffer instead of the node's string */
+ e = d;
+ foldlen = UTF8SKIP(folded);
+ s = folded;
+ }
+
+ /* When we reach here 's' points to the fold of the first
+ * character(s) of the node; and 'e' points to far enough along
+ * the folded string to be just past any possible multi-char
+ * fold. 'foldlen' is the length in bytes of the first
+ * character in 's'
+ *
+ * Unlike the non-UTF-8 case, the macro for determining if a
+ * string is a multi-char fold requires all the characters to
+ * already be folded. This is because of all the complications
+ * if not. Note that they are folded anyway, except in EXACTFL
+ * nodes. Like the non-UTF case above, we punt if the node
+ * begins with a multi-char fold */
+
+ if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
+ EXACTF_invlist =
+ _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
}
- else { /* Non-locale, above Latin1. XXX We don't currently
- know what participates in folds with this, so have
- to assume anything could */
-
- /* XXX We could know exactly what does fold to this if the
- * reverse folds are loaded, as currently in S_regclass().
- * But we do know that under /iaa nothing in the ASCII
- * range can participate */
- if (OP(scan) == EXACTFA) {
- _invlist_union_complement_2nd(EXACTF_invlist,
- PL_XPosix_ptrs[_CC_ASCII],
- &EXACTF_invlist);
+ else { /* Single char fold */
+
+ /* It matches all the things that fold to it, which are
+ * found in PL_utf8_foldclosures (including itself) */
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
+ if (! PL_utf8_foldclosures) {
+ _load_PL_utf8_foldclosures();
}
- else {
- EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
- 0, UV_MAX);
+ if ((listp = hv_fetch(PL_utf8_foldclosures,
+ (char *) s, foldlen, FALSE)))
+ {
+ AV* list = (AV*) *listp;
+ IV k;
+ for (k = 0; k <= av_tindex(list); k++) {
+ SV** c_p = av_fetch(list, k, FALSE);
+ UV c;
+ assert(c_p);
+
+ c = SvUV(*c_p);
+
+ /* /aa doesn't allow folds between ASCII and non- */
+ if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
+ && isASCII(c) != isASCII(uc))
+ {
+ continue;
+ }
+
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
+ }
}
}
}
if (flags & SCF_DO_SUBSTR)
data->pos_min++;
min++;
- /* Fall through. */
+ /* FALLTHROUGH */
case STAR:
if (flags & SCF_DO_STCLASS) {
mincount = 0;
/* It is counted once already... */
data->pos_min += minnext * (mincount - counted);
#if 0
-PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
- " SSize_t_MAX=%"UVdf" minnext=%"UVdf
- " maxcount=%"UVdf" mincount=%"UVdf"\n",
+PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
+ " SSize_t_MAX=%"UVuf" minnext=%"UVuf
+ " maxcount=%"UVuf" mincount=%"UVuf"\n",
(UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
(UV)mincount);
if (deltanext != SSize_t_MAX)
-PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
+PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
(UV)(-counted * deltanext + (minnext + deltanext) * maxcount
- minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
#endif
case NPOSIXL:
invert = 1;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case POSIXL:
namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
case NPOSIXA: /* For these, we always know the exact set of
what's matched */
invert = 1;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case POSIXA:
if (FLAGS(scan) == _CC_ASCII) {
my_invlist = PL_XPosix_ptrs[_CC_ASCII];
case NPOSIXD:
case NPOSIXU:
invert = 1;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case POSIXD:
case POSIXU:
my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
return count;
}
-/*XXX: todo make this not included in a non debugging perl */
+/*XXX: todo make this not included in a non debugging perl, but appears to be
+ * used anyway there, in 'use re' */
#ifndef PERL_IN_XSUB_RE
void
Perl_reginitcolors(pTHX)
/* if we know we have at least two args, create an empty string,
* then concatenate args to that. For no args, return an empty string */
if (!pat && pat_count != 1) {
- pat = newSVpvn("", 0);
+ pat = newSVpvs("");
SAVEFREESV(pat);
alloced = TRUE;
}
else if (PL_regkind[OP(first)] == TRIE &&
((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
{
- regnode *trie_op;
- /* this can happen only on restudy */
- if ( OP(first) == TRIE ) {
- struct regnode_1 *trieop = (struct regnode_1 *)
- PerlMemShared_calloc(1, sizeof(struct regnode_1));
- StructCopy(first,trieop,struct regnode_1);
- trie_op=(regnode *)trieop;
- } else {
- struct regnode_charclass *trieop = (struct regnode_charclass *)
- PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
- StructCopy(first,trieop,struct regnode_charclass);
- trie_op=(regnode *)trieop;
- }
- OP(trie_op)+=2;
- make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
- ri->regstclass = trie_op;
+ /* this can happen only on restudy */
+ ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
}
#endif
else if (REGNODE_SIMPLE(OP(first)))
}
if (RExC_contains_locale) {
- RXp_EXTFLAGS(r) |= RXf_TAINTED_SEEN;
+ RXp_EXTFLAGS(r) |= RXf_TAINTED;
}
#ifdef DEBUGGING
count += 2;
}
}
+
+void
+Perl__load_PL_utf8_foldclosures (pTHX)
+{
+ assert(! PL_utf8_foldclosures);
+
+ /* If the folds haven't been read in, call a fold function
+ * to force that */
+ if (! PL_utf8_tofold) {
+ U8 dummy[UTF8_MAXBYTES_CASE+1];
+
+ /* This string is just a short named one above \xff */
+ to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
+ assert(PL_utf8_tofold); /* Verify that worked */
+ }
+ PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
+}
#endif
#ifdef PERL_ARGS_ASSERT__INVLISTEQ
int internal_argval = 0; /* internal_argval is only useful if
!argok */
- if (has_intervening_patws && SIZE_ONLY) {
- ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
+ if (has_intervening_patws) {
+ RExC_parse++;
+ vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
}
while ( *RExC_parse && *RExC_parse != ')' ) {
if ( *RExC_parse == ':' ) {
else if (*RExC_parse == '?') { /* (?...) */
bool is_logical = 0;
const char * const seqstart = RExC_parse;
- if (has_intervening_patws && SIZE_ONLY) {
- ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
+ if (has_intervening_patws) {
+ RExC_parse++;
+ vFAIL("In '(?...)', the '(' and '?' must be adjacent");
}
RExC_parse++;
RExC_seen |= REG_LOOKBEHIND_SEEN;
RExC_in_lookbehind++;
RExC_parse++;
+ /* FALLTHROUGH */
case '=': /* (?=...) */
RExC_seen_zerolen++;
break;
case '@': /* (?@...) */
vFAIL2("Sequence (?%c...) not implemented", (int)paren);
break;
- case '#': /* (?#...) */
- /* XXX As soon as we disallow separating the '?' and '*' (by
- * spaces or (?#...) comment), it is believed that this case
- * will be unreachable and can be removed. See
- * [perl #117327] */
- while (*RExC_parse && *RExC_parse != ')')
- RExC_parse++;
- if (*RExC_parse != ')')
- FAIL("Sequence (?#... not terminated");
- nextchar(pRExC_state);
- *flagp = TRYAGAIN;
- return NULL;
case '0' : /* (?0) */
case 'R' : /* (?R) */
if (*RExC_parse != ')')
RExC_parse--; /* rewind to let it be handled later */
goto parse_flags;
}
- /*FALLTHROUGH */
+ /* FALLTHROUGH */
case '1': case '2': case '3': case '4': /* (?1) */
case '5': case '6': case '7': case '8': case '9':
RExC_parse--;
}
*flagp |= POSTPONED;
paren = *RExC_parse++;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case '{': /* (?{...}) */
{
U32 n = 0;
case '=':
case '!':
*flagp &= ~HASWIDTH;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case '>':
ender = reg_node(pRExC_state, SUCCEED);
break;
* modifier. The other meaning does not, so use a temporary until we find
* out which we are being called with */
p = (RExC_flags & RXf_PMf_EXTENDED)
- ? regwhite( pRExC_state, RExC_parse )
+ ? regpatws(pRExC_state, RExC_parse,
+ TRUE) /* means recognize comments */
: RExC_parse;
/* Disambiguate between \N meaning a named character versus \N meaning
{
char *q = p;
- for (;isDIGIT(*q); q++); /* calculate length of num */
+ for (;isDIGIT(*q); q++) {} /* calculate length of num */
if (q - p == 0 || q - p > 9)
return I32_MAX;
return atoi(p);
RExC_parse++;
goto defchar;
}
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case '?':
case '+':
case '*':
}
*flagp |= HASWIDTH|SIMPLE;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
finish_meta_pat:
nextchar(pRExC_state);
case '\0':
if (RExC_parse >= RExC_end)
FAIL("Trailing \\");
- /* FALL THROUGH */
+ /* FALLTHROUGH */
default:
/* Do not generate "unrecognized" warnings here, we fall
back into the quick-grab loop below */
case '#':
if (RExC_flags & RXf_PMf_EXTENDED) {
- if ( reg_skipcomment( pRExC_state ) )
+ RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
+ if (RExC_parse < RExC_end)
goto tryagain;
}
- /* FALL THROUGH */
+ /* FALLTHROUGH */
default:
oldp = p;
if (RExC_flags & RXf_PMf_EXTENDED)
- p = regwhite( pRExC_state, p );
+ p = regpatws(pRExC_state, p,
+ TRUE); /* means recognize comments */
switch ((U8)*p) {
case '^':
case '$':
--p;
goto loopdone;
}
+ /* FALLTHROUGH */
case '0':
{
I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
case '\0':
if (p >= RExC_end)
FAIL("Trailing \\");
- /* FALL THROUGH */
+ /* FALLTHROUGH */
default:
if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
/* Include any { following the alpha to emphasize
break;
default: /* A literal character */
- if (! SIZE_ONLY
- && RExC_flags & RXf_PMf_EXTENDED
- && ckWARN_d(WARN_DEPRECATED)
- && is_PATWS_non_low(p, UTF))
- {
- vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
- "Escape literal pattern white space under /x");
- }
-
normal_default:
if (UTF8_IS_START(*p) && UTF) {
STRLEN numlen;
*/
if ( RExC_flags & RXf_PMf_EXTENDED)
- p = regwhite( pRExC_state, p );
+ p = regpatws(pRExC_state, p,
+ TRUE); /* means recognize comments */
/* If the next thing is a quantifier, it applies to this
* character only, which means that this character has to be in
}
STATIC char *
-S_regwhite( RExC_state_t *pRExC_state, char *p )
-{
- const char *e = RExC_end;
-
- PERL_ARGS_ASSERT_REGWHITE;
-
- while (p < e) {
- if (isSPACE(*p))
- ++p;
- else if (*p == '#') {
- bool ended = 0;
- do {
- if (*p++ == '\n') {
- ended = 1;
- break;
- }
- } while (p < e);
- if (!ended)
- RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
- }
- else
- break;
- }
- return p;
-}
-
-STATIC char *
-S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
+S_regpatws(pTHX_ RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
{
/* Returns the next non-pattern-white space, non-comment character (the
* latter only if 'recognize_comment is true) in the string p, which is
- * ended by RExC_end. If there is no line break ending a comment,
- * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */
+ * ended by RExC_end. See also reg_skipcomment */
const char *e = RExC_end;
PERL_ARGS_ASSERT_REGPATWS;
p += len;
}
else if (recognize_comment && *p == '#') {
- bool ended = 0;
- do {
- p++;
- if (is_LNBREAK_safe(p, e, UTF)) {
- ended = 1;
- break;
- }
- } while (p < e);
- if (!ended)
- RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
+ p = reg_skipcomment(pRExC_state, p);
}
else
break;
while (RExC_parse < RExC_end) {
SV* current = NULL;
RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE); /* means recognize comments */
+ TRUE); /* means recognize comments */
switch (*RExC_parse) {
case '?':
if (RExC_parse[1] == '[') depth++, RExC_parse++;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
default:
break;
case '\\':
/* Skip white space */
RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE); /* means recognize comments */
+ TRUE /* means recognize comments */ );
if (RExC_parse >= RExC_end) {
Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
}
RExC_flags = save_flags;
goto handle_operand;
}
- /* FALL THROUGH */
+ /* FALLTHROUGH */
default:
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
top_index -= 2;
SvREFCNT_dec_NN(lparen);
- /* FALL THROUGH */
+ /* FALLTHROUGH */
}
handle_operand:
}
#undef IS_OPERAND
+STATIC void
+S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
+{
+ /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
+ * innocent-looking character class, like /[ks]/i won't have to go out to
+ * disk to find the possible matches.
+ *
+ * This should be called only for a Latin1-range code points, cp, which is
+ * known to be involved in a fold with other code points above Latin1. It
+ * would give false results if /aa has been specified. Multi-char folds
+ * are outside the scope of this, and must be handled specially.
+ *
+ * XXX It would be better to generate these via regen, in case a new
+ * version of the Unicode standard adds new mappings, though that is not
+ * really likely, and may be caught by the default: case of the switch
+ * below. */
+
+ PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
+
+ switch (cp) {
+ case 'k':
+ case 'K':
+ *invlist =
+ add_cp_to_invlist(*invlist, KELVIN_SIGN);
+ break;
+ case 's':
+ case 'S':
+ *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
+ break;
+ case MICRO_SIGN:
+ *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
+ *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
+ break;
+ case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
+ case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
+ *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
+ break;
+ case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
+ *invlist = add_cp_to_invlist(*invlist,
+ LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
+ break;
+ case LATIN_SMALL_LETTER_SHARP_S:
+ *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
+ break;
+ case 'F': case 'f':
+ case 'I': case 'i':
+ case 'L': case 'l':
+ case 'T': case 't':
+ case 'A': case 'a':
+ case 'H': case 'h':
+ case 'J': case 'j':
+ case 'N': case 'n':
+ case 'W': case 'w':
+ case 'Y': case 'y':
+ /* These all are targets of multi-character folds from code points
+ * that require UTF8 to express, so they can't match unless the
+ * target string is in UTF-8, so no action here is necessary, as
+ * regexec.c properly handles the general case for UTF-8 matching
+ * and multi-char folds */
+ break;
+ default:
+ /* Use deprecated warning to increase the chances of this being
+ * output */
+ ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
+ break;
+ }
+}
+
/* The names of properties whose definitions are not known at compile time are
* stored in this SV, after a constant heading. So if the length has been
* changed since initialization, then there is a run-time definition. */
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
RExC_naughty++;
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
}
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
if (UCHARAT(RExC_parse) == ']') {
e = strchr(RExC_parse++, '}');
if (!e)
vFAIL2("Missing right brace on \\%c{}", c);
- while (isSPACE(UCHARAT(RExC_parse)))
+ while (isSPACE(*RExC_parse))
RExC_parse++;
if (e == RExC_parse)
vFAIL2("Empty \\%c{}", c);
n = e - RExC_parse;
- while (isSPACE(UCHARAT(RExC_parse + n - 1)))
+ while (isSPACE(*(RExC_parse + n - 1)))
n--;
}
else {
}
if (!SIZE_ONLY) {
SV* invlist;
- char* formatted;
char* name;
if (UCHARAT(RExC_parse) == '^') {
* that bit) */
value ^= 'P' ^ 'p';
- while (isSPACE(UCHARAT(RExC_parse))) {
+ while (isSPACE(*RExC_parse)) {
RExC_parse++;
n--;
}
* will have its name be <__NAME_i>. The design is
* discussed in commit
* 2f833f5208e26b208886e51e09e2c072b5eabb46 */
- formatted = Perl_form(aTHX_
+ name = savepv(Perl_form(aTHX_
"%s%.*s%s\n",
(FOLD) ? "__" : "",
(int)n,
RExC_parse,
(FOLD) ? "_i" : ""
- );
- name = savepvn(formatted, strlen(formatted));
+ ));
/* Look up the property name, and get its swash and
* inversion list, if the property is found */
"Property '%"UTF8f"' is unknown",
UTF8fARG(UTF, n, name));
}
+
+ /* If the property name doesn't already have a package
+ * name, add the current one to it so that it can be
+ * referred to outside it. [perl #121777] */
+ if (! instr(name, "::") && PL_curstash) {
+ char* full_name = Perl_form(aTHX_
+ "%s::%s",
+ HvNAME(PL_curstash),
+ name);
+ n = strlen(full_name);
+ Safefree(name);
+ name = savepvn(full_name, n);
+ }
Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
(value == 'p' ? '+' : '!'),
UTF8fARG(UTF, n, name));
ANYOF_POSIXL_ZERO(ret);
}
+ /* Coverity thinks it is possible for this to be negative; both
+ * jhi and khw think it's not, but be safer */
+ assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
+ || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
+
/* See if it already matches the complement of this POSIX
* class */
if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
if (range) {
AV* this_array;
STRLEN cp_count = utf8_length(foldbuf,
foldbuf + foldlen);
- SV* multi_fold = sv_2mortal(newSVpvn("", 0));
+ SV* multi_fold = sv_2mortal(newSVpvs(""));
Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
&& ((prevvalue >= 'a' && value <= 'z')
|| (prevvalue >= 'A' && value <= 'Z')))
{
- _invlist_intersection(this_range, PL_ASCII,
+ _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
&this_range);
/* Since this above only contains ascii, the intersection of it
op = POSIXA;
}
}
+ else if (prevvalue == 'A') {
+ if (value == 'Z'
+#ifdef EBCDIC
+ && literal_endpoint == 2
+#endif
+ ) {
+ arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
+ op = POSIXA;
+ }
+ }
+ else if (prevvalue == 'a') {
+ if (value == 'z'
+#ifdef EBCDIC
+ && literal_endpoint == 2
+#endif
+ ) {
+ arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
+ op = POSIXA;
+ }
+ }
}
/* Here, we have changed <op> away from its initial value iff we found
/* This is a hash that for a particular fold gives all
* characters that are involved in it */
if (! PL_utf8_foldclosures) {
-
- /* If the folds haven't been read in, call a fold function
- * to force that */
- if (! PL_utf8_tofold) {
- U8 dummy[UTF8_MAXBYTES_CASE+1];
-
- /* This string is just a short named one above \xff */
- to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
- assert(PL_utf8_tofold); /* Verify that worked */
- }
- PL_utf8_foldclosures
- = _swash_inversion_hash(PL_utf8_tofold);
+ _load_PL_utf8_foldclosures();
}
}
if (j < 256) {
- /* We have the latin1 folding rules hard-coded here so
- * that an innocent-looking character class, like
- * /[ks]/i won't have to go out to disk to find the
- * possible matches. XXX It would be better to
- * generate these via regen, in case a new version of
- * the Unicode standard adds new mappings, though that
- * is not really likely, and may be caught by the
- * default: case of the switch below. */
-
if (IS_IN_SOME_FOLD_L1(j)) {
/* ASCII is always matched; non-ASCII is matched
}
}
- if (HAS_NONLATIN1_FOLD_CLOSURE(j)
+ if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
&& (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
{
- /* Certain Latin1 characters have matches outside
- * Latin1. To get here, <j> is one of those
- * characters. None of these matches is valid for
- * ASCII characters under /aa, which is why the 'if'
- * just above excludes those. These matches only
- * happen when the target string is utf8. The code
- * below adds the single fold closures for <j> to the
- * inversion list. */
-
- switch (j) {
- case 'k':
- case 'K':
- *use_list =
- add_cp_to_invlist(*use_list, KELVIN_SIGN);
- break;
- case 's':
- case 'S':
- *use_list = add_cp_to_invlist(*use_list,
- LATIN_SMALL_LETTER_LONG_S);
- break;
- case MICRO_SIGN:
- *use_list = add_cp_to_invlist(*use_list,
- GREEK_CAPITAL_LETTER_MU);
- *use_list = add_cp_to_invlist(*use_list,
- GREEK_SMALL_LETTER_MU);
- break;
- case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
- case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
- *use_list =
- add_cp_to_invlist(*use_list, ANGSTROM_SIGN);
- break;
- case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
- *use_list = add_cp_to_invlist(*use_list,
- LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
- break;
- case LATIN_SMALL_LETTER_SHARP_S:
- *use_list = add_cp_to_invlist(*use_list,
- LATIN_CAPITAL_LETTER_SHARP_S);
- break;
- case 'F': case 'f':
- case 'I': case 'i':
- case 'L': case 'l':
- case 'T': case 't':
- case 'A': case 'a':
- case 'H': case 'h':
- case 'J': case 'j':
- case 'N': case 'n':
- case 'W': case 'w':
- case 'Y': case 'y':
- /* These all are targets of multi-character
- * folds from code points that require UTF8
- * to express, so they can't match unless
- * the target string is in UTF-8, so no
- * action here is necessary, as regexec.c
- * properly handles the general case for
- * UTF-8 matching and multi-char folds */
- break;
- default:
- /* Use deprecated warning to increase the
- * chances of this being output */
- ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
- break;
- }
+ add_above_Latin1_folds(pRExC_state,
+ (U8) j,
+ use_list);
}
continue;
}
for (k = 0; k <= av_tindex(list); k++) {
SV** c_p = av_fetch(list, k, FALSE);
UV c;
- if (c_p == NULL) {
- Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
- }
+ assert(c_p);
+
c = SvUV(*c_p);
/* /aa doesn't allow folds between ASCII and non- */
* at compile time. Besides not inverting folded locale now, we can't
* invert if there are things such as \w, which aren't known until runtime
* */
- if (invert
+ if (cp_list
+ && invert
&& ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
&& ! depends_list
&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
/* We don't optimize if we are supposed to make sure all non-Unicode
* code points raise a warning, as only ANYOF nodes have this check.
* */
- && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
+ && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
{
UV start, end;
U8 op = END; /* The optimzation node-type */
av_store(av, 0, (runtime_defns)
? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
if (swash) {
+ assert(cp_list);
av_store(av, 1, swash);
SvREFCNT_dec_NN(cp_list);
}
/* reg_skipcomment()
- Absorbs an /x style # comments from the input stream.
- Returns true if there is more text remaining in the stream.
- Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment
- terminates the pattern without including a newline.
+ Absorbs an /x style # comment from the input stream,
+ returning a pointer to the first character beyond the comment, or if the
+ comment terminates the pattern without anything following it, this returns
+ one past the final character of the pattern (in other words, RExC_end) and
+ sets the REG_RUN_ON_COMMENT_SEEN flag.
- Note its the callers responsibility to ensure that we are
+ Note it's the callers responsibility to ensure that we are
actually in /x mode
*/
-STATIC bool
-S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
+PERL_STATIC_INLINE char*
+S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state, char* p)
{
- bool ended = 0;
-
PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
- while (RExC_parse < RExC_end)
- if (*RExC_parse++ == '\n') {
- ended = 1;
- break;
+ assert(*p = '#');
+
+ while (p < RExC_end) {
+ if (*(++p) == '\n') {
+ return p+1;
}
- if (!ended) {
- /* we ran off the end of the pattern without ending
- the comment, so we have to add an \n when wrapping */
- RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
- return 0;
- } else
- return 1;
+ }
+
+ /* we ran off the end of the pattern without ending the comment, so we have
+ * to add an \n when wrapping */
+ RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
+ return p;
}
/* nextchar()
continue;
}
if (RExC_flags & RXf_PMf_EXTENDED) {
- if (isSPACE(*RExC_parse)) {
- RExC_parse++;
- continue;
- }
- else if (*RExC_parse == '#') {
- if ( reg_skipcomment( pRExC_state ) )
- continue;
- }
+ char * p = regpatws(pRExC_state, RExC_parse,
+ TRUE); /* means recognize comments */
+ if (p != RExC_parse) {
+ RExC_parse = p;
+ continue;
+ }
}
- return retval;
+ return retval;
}
}
int bit;
int set=0;
- for (bit=0; bit<32; bit++) {
+ ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
+
+ for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
if (flags & (1<<bit)) {
if (!set++ && lead)
PerlIO_printf(Perl_debug_log, "%s",lead);
int set=0;
regex_charset cs;
- for (bit=0; bit<32; bit++) {
+ ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
+
+ for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
if (flags & (1<<bit)) {
if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
continue;
}
else if (k == POSIXD || k == NPOSIXD) {
U8 index = FLAGS(o) * 2;
- if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
- Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
- }
- else {
+ if (index < C_ARRAY_LENGTH(anyofs)) {
if (*anyofs[index] != '[') {
sv_catpv(sv, "[");
}
sv_catpv(sv, "]");
}
}
+ else {
+ Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
+ }
}
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
PerlMemShared_free(aho->fail);
/* do this last!!!! */
PerlMemShared_free(ri->data->data[n]);
- PerlMemShared_free(ri->regstclass);
+ /* we should only ever get called once, so
+ * assert as much, and also guard the free
+ * which /might/ happen twice. At the least
+ * it will make code anlyzers happy and it
+ * doesn't cost much. - Yves */
+ assert(ri->regstclass);
+ if (ri->regstclass) {
+ PerlMemShared_free(ri->regstclass);
+ ri->regstclass = 0;
+ }
}
}
break;
* when the corresponding reg_ac_data struct is freed.
*/
reti->regstclass= ri->regstclass;
- /* Fall through */
+ /* FALLTHROUGH */
case 't':
OP_REFCNT_LOCK;
((reg_trie_data*)ri->data->data[i])->refcount++;
OP_REFCNT_UNLOCK;
- /* Fall through */
+ /* FALLTHROUGH */
case 'l':
case 'L':
d->data[i] = ri->data->data[i];
last= plast;
while (PL_regkind[op] != END && (!last || node < last)) {
+ assert(node);
/* While that wasn't END last time... */
NODE_ALIGN(node);
op = OP(node);