if ( SvIV(re_trie_maxbuff)>=0 ) {
regnode *cur;
regnode *first = (regnode *)NULL;
- regnode *last = (regnode *)NULL;
+ regnode *prev = (regnode *)NULL;
regnode *tail = scan;
U8 trietype = 0;
U32 count=0;
REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
}
Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
- REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
+ REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
);
});
} else {
if ( trietype == NOTHING )
trietype = noper_trietype;
- last = cur;
+ prev = cur;
}
if (first)
count++;
* noper may either be a triable node which can
* not be tried together with the current trie,
* or a non triable node */
- if ( last ) {
+ if ( prev ) {
/* If last is set and trietype is not
* NOTHING then we have found at least two
* triable branch sequences in a row of a
make_trie( pRExC_state,
startbranch, first, cur, tail,
count, trietype, depth+1 );
- last = NULL; /* note: we clear/update
+ prev = NULL; /* note: we clear/update
first, trietype etc below,
so we dont do it here */
}
Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
- REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
+ REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
PL_reg_name[trietype]
);
});
- if ( last && trietype ) {
+ if ( prev && trietype ) {
if ( trietype != NOTHING ) {
/* the last branch of the sequence was part of
* a trie, so we have to construct it here
OP(opt)= OPTIMIZED;
}
}
- } /* end if ( last) */
+ } /* end if ( prev) */
} /* TRIE_MAXBUF is non zero */
-
} /* do trie */
}
DEBUG_r(if (!PL_colorset) reginitcolors());
- /* Initialize these here instead of as-needed, as is quick and avoids
- * having to test them each time otherwise */
- if (! PL_InBitmap) {
-#ifdef DEBUGGING
- char * dump_len_string;
-#endif
-
- /* This is calculated here, because the Perl program that generates the
- * static global ones doesn't currently have access to
- * NUM_ANYOF_CODE_POINTS */
- PL_InBitmap = _new_invlist(2);
- PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
- NUM_ANYOF_CODE_POINTS - 1);
-#ifdef DEBUGGING
- dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
- if ( ! dump_len_string
- || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
- {
- PL_dump_re_max_len = 60; /* A reasonable default */
- }
-#endif
- }
pRExC_state->warn_text = NULL;
pRExC_state->unlexed_names = NULL;
i = rx->sublen + rx->suboffset - rx->offs[0].end;
}
else
- if ( 0 <= n && n <= (I32)rx->nparens &&
- (s1 = rx->offs[n].start) != -1 &&
+ if (inRANGE(n, 0, (I32)rx->nparens) &&
+ (s1 = rx->offs[n].start) != -1 &&
(t1 = rx->offs[n].end) != -1)
{
/* $&, ${^MATCH}, $1 ... */
value = (U8 *) SvPV(value_sv, value_len);
/* See if the result is one code point vs 0 or multiple */
- if (value_len > 0 && value_len <= (UV) ((SvUTF8(value_sv))
- ? UTF8SKIP(value)
- : 1))
+ if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
+ ? UTF8SKIP(value)
+ : 1)))
{
/* Here, exactly one code point. If that isn't what is wanted,
* fail */
goto continue_parse;
}
- else if (! LOC) { /* XXX shouldn't /l assume could be a UTF-8
- locale, and prepare for that? */
+ else if (FOLD) {
bool splittable = FALSE;
bool backed_up = FALSE;
- char * e = s;
-
- assert(FOLD);
+ char * e;
+ char * s_start;
/* Here is /i. Running out of room creates a problem if we are
* folding, and the split happens in the middle of a
* oldp points to the beginning byte in the input of
* 'ender'.
*
+ * In the case of /il, we haven't folded anything that could be
+ * affected by the locale. That means only above-Latin1
+ * characters that fold to other above-Latin1 characters get
+ * folded at compile time. To find a good place to split the
+ * node, everything in it will have to be folded.
+ * The boolean 'maybe_exactfu' keeps track, under /il, of whether
+ * there are any unfolded characters in the node. */
+ bool need_to_fold_loc = LOC && ! maybe_exactfu;
+
+ /* If we do need to fold the node, we need a place to store the
+ * folded copy, and a way to map back to the unfolded original
+ * */
+ char * locfold_buf = NULL;
+ Size_t * loc_correspondence = NULL;
+
+ if (! need_to_fold_loc) { /* The normal case. Just
+ initialize to the actual node */
+ e = s;
+ s_start = s0;
+ s = old_old_s; /* Point to the beginning of the final char
+ that fits in the node */
+ }
+ else {
+
+ /* Here, we have filled a /il node, and there are unfolded
+ * characters in it. If the runtime locale turns out to be
+ * UTF-8, there are possible multi-character folds, just
+ * like when not under /l. The node hence can't terminate
+ * in the middle of such a fold. To determine this, we
+ * have to create a folded copy of this node. That means
+ * reparsing the node, folding everything assuming a UTF-8
+ * locale. (If at runtime it isn't such a locale, the
+ * actions here wouldn't have been necessary, but we have
+ * to assume the worst case.) If we find we need to back
+ * off the folded string, we do so, and then map that
+ * position back to the original unfolded node, which then
+ * gets output, truncated at that spot */
+
+ char * redo_p = RExC_parse;
+ char * redo_e;
+ char * old_redo_e;
+
+ /* Allow enough space assuming a single byte input folds to
+ * a single byte output, plus assume that the two unparsed
+ * characters (that we may need) fold to the largest number
+ * of bytes possible, plus extra for one more worst case
+ * scenario. In the loop below, if we start eating into
+ * that final spare space, we enlarge this initial space */
+ Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
+
+ Newxz(locfold_buf, size, char);
+ Newxz(loc_correspondence, size, Size_t);
+
+ /* Redo this node's parse, folding into 'locfold_buf' */
+ redo_p = RExC_parse;
+ old_redo_e = redo_e = locfold_buf;
+ while (redo_p <= oldp) {
+
+ old_redo_e = redo_e;
+ loc_correspondence[redo_e - locfold_buf]
+ = redo_p - RExC_parse;
+
+ if (UTF) {
+ Size_t added_len;
+
+ (void) _to_utf8_fold_flags((U8 *) redo_p,
+ (U8 *) RExC_end,
+ (U8 *) redo_e,
+ &added_len,
+ FOLD_FLAGS_FULL);
+ redo_e += added_len;
+ redo_p += UTF8SKIP(redo_p);
+ }
+ else {
+
+ /* Note that if this code is run on some ancient
+ * Unicode versions, SHARP S doesn't fold to 'ss',
+ * but rather than clutter the code with #ifdef's,
+ * as is done above, we ignore that possibility.
+ * This is ok because this code doesn't affect what
+ * gets matched, but merely where the node gets
+ * split */
+ if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
+ *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
+ }
+ else {
+ *redo_e++ = 's';
+ *redo_e++ = 's';
+ }
+ redo_p++;
+ }
+
+
+ /* If we're getting so close to the end of the buffer that
+ * a worst-case fold of the next character would cause us
+ * to overflow, enlarge the buffers, assuming one output
+ * byte per remaining input byte, plus room for another
+ * worst-case fold */
+ if ( redo_p <= oldp
+ && redo_e > locfold_buf + size
+ - (UTF8_MAXBYTES_CASE + 1))
+ {
+ Size_t new_size = size
+ + (oldp - redo_p)
+ + UTF8_MAXBYTES_CASE + 1;
+ Ptrdiff_t e_offset = redo_e - locfold_buf;
+
+ Renew(locfold_buf, new_size, char);
+ Renew(loc_correspondence, new_size, Size_t);
+ size = new_size;
+
+ redo_e = locfold_buf + e_offset;
+ }
+ }
+
+ /* Set so that things are in terms of the folded, temporary
+ * string */
+ s = old_redo_e;
+ s_start = locfold_buf;
+ e = redo_e;
+
+ }
+
+ /* Here, we have 's', 's_start' and 'e' set up to point to the
+ * input that goes into the node, folded.
+ *
* If the final character of the node and the fold of ender
* form the first two characters of a three character fold, we
* need to peek ahead at the next (unparsed) character in the
* and try again.
*
* Otherwise, the node can be split at the current position.
- */
- s = old_old_s; /* Point to the beginning of the final char
- that fits in the node */
-
- /* The same logic is used for UTF-8 patterns and not */
+ *
+ * The same logic is used for UTF-8 patterns and not */
if (UTF) {
Size_t added_len;
* drop down to try at that position */
if (isPUNCT(*p)) {
s = (char *) utf8_hop_back((U8 *) s, -1,
- (U8 *) s0);
+ (U8 *) s_start);
backed_up = TRUE;
}
else {
* either case would break apart a fold */
do {
char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
- (U8 *) s0);
+ (U8 *) s_start);
/* If is a multi-char fold, can't split here. Backup
* one char and try again */
* three character fold starting at the character
* before s, we can't split either before or after s.
* Backup two chars and try again */
- if ( LIKELY(s > s0)
+ if ( LIKELY(s > s_start)
&& UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
{
s = prev_s;
- s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+ s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
backed_up = TRUE;
continue;
}
splittable = TRUE;
break;
- } while (s > s0); /* End of loops backing up through the node */
+ } while (s > s_start); /* End of loops backing up through the node */
/* Here we either couldn't find a place to split the node,
* or else we broke out of the loop setting 'splittable' to
continue;
}
- if ( LIKELY(s > s0)
+ if ( LIKELY(s > s_start)
&& UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
{
s -= 2;
splittable = TRUE;
break;
- } while (s > s0);
+ } while (s > s_start);
if (splittable) {
s++;
/* If we did find a place to split, reparse the entire node
* stopping where we have calculated. */
if (splittable) {
- upper_fill = s - s0;
+
+ /* If we created a temporary folded string under /l, we
+ * have to map that back to the original */
+ if (need_to_fold_loc) {
+ upper_fill = loc_correspondence[s - s_start];
+ Safefree(locfold_buf);
+ Safefree(loc_correspondence);
+
+ if (upper_fill == 0) {
+ FAIL2("panic: loc_correspondence[%d] is 0",
+ (int) (s - s_start));
+ }
+ }
+ else {
+ upper_fill = s - s0;
+ }
goto reparse;
}
+ else if (need_to_fold_loc) {
+ Safefree(locfold_buf);
+ Safefree(loc_correspondence);
+ }
/* Here the node consists entirely of non-final multi-char
* folds. (Likely it is all 'f's or all 's's.) There's no
UPDATE_WARNINGS_LOC(RExC_parse);
}
-Size_t PERL_STATIC_INLINE
+PERL_STATIC_INLINE Size_t
S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
{
const U8 * const start = s1;
/* Only try if there are no more code points in the class than
* in the max possible fold */
- && partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1)
+ && inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
{
if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
{
regarglen[op] + STR_SZ(len),
"anyofhs");
FILL_NODE(ret, op);
- RExC_emit += 1 + regarglen[op]
- - 1 + STR_SZ(len); /* Replace the [1]
- element of the struct
- by the real value */
- REGNODE_p(ret)->flags = len;
+ ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
+ = len;
Copy(low_utf8, /* Add the common bytes */
((struct regnode_anyofhs *) REGNODE_p(ret))->string,
len, U8);
- NEXT_OFF(REGNODE_p(ret)) = regarglen[op] + STR_SZ(len);
+ RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
NULL, only_utf8_locale_list);
goto not_anyof;
STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
- av_store(av, INVLIST_INDEX, invlist);
+ ary[INVLIST_INDEX] = invlist;
av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
- ? ONLY_LOCALE_MATCHES_INDEX:
- INVLIST_INDEX);
+ ? ONLY_LOCALE_MATCHES_INDEX
+ : INVLIST_INDEX);
si = NULL;
}
}
UV prev_cp = 0;
U8 count = 0;
- /* Ignore everything before the first new-line */
- while (*si_string != '\n' && remaining > 0) {
- si_string++;
- remaining--;
- }
- assert(remaining > 0);
-
+ /* Ignore everything before and including the first new-line */
+ si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
+ assert (si_string != NULL);
si_string++;
- remaining--;
+ remaining = SvPVX(si) + SvCUR(si) - si_string;
while (remaining > 0) {
* here to the next \n */
remaining -= len;
- while (*(si_string + len) != '\n' && remaining > 0) {
- remaining--;
- len++;
- }
- if (*(si_string + len) == '\n') {
- len++;
- remaining--;
- }
+ len = strcspn(si_string, "\n");
+ remaining -= len;
if (matches_string) {
- sv_catpvn(matches_string, si_string, len - 1);
+ sv_catpvn(matches_string, si_string, len);
}
else {
- matches_string = newSVpvn(si_string, len - 1);
+ matches_string = newSVpvn(si_string, len);
}
- si_string += len;
sv_catpvs(matches_string, " ");
+
+ si_string += len;
+ if (remaining && UCHARAT(si_string) == '\n') {
+ si_string++;
+ remaining--;
+ }
} /* end of loop through the text */
assert(matches_string);
scan = REGNODE_OFFSET(temp);
}
+ assert(val >= scan);
if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
assert((UV) (val - scan) <= U32_MAX);
ARG_SET(REGNODE_p(scan), val - scan);
else if ( op == PLUS || op == STAR) {
DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
}
- else if (PL_regkind[(U8)op] == EXACT) {
+ else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
/* Literal string, where present. */
node += NODE_SZ_STR(node) - 1;
node = NEXTOPER(node);
{
dVAR;
+#ifdef DEBUGGING
+ char * dump_len_string;
+
+ dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
+ if ( ! dump_len_string
+ || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
+ {
+ PL_dump_re_max_len = 60; /* A reasonable default */
+ }
+#endif
+
PL_user_def_props = newHV();
#ifdef USE_ITHREADS
#endif
- /* Set up the inversion list global variables */
+ /* Set up the inversion list interpreter-level variables */
PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
+ PL_InBitmap = _new_invlist_C_array(_Perl_InBitmap_invlist);
PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);