#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
-#ifndef PERL_IN_XSUB_RE
-
-bool
-Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
+STATIC bool
+S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
{
/* Returns a boolean as to whether or not 'character' is a member of the
* Posix character class given by 'classnum' that should be equivalent to a
return FALSE;
}
-#endif
-
PERL_STATIC_INLINE I32
S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
{
* On the one hand you'd expect rare substrings to appear less
* often than \n's. On the other hand, searching for \n means
* we're effectively flipping between check_substr and "\n" on each
- * iteration as the current "rarest" string candidate, which
+ * iteration as the current "rarest" candidate string, which
* means for example that we'll quickly reject the whole string if
* it hasn't got a \n, rather than trying every substr position
* first
* string can match, it returns FALSE; otherwise TRUE. (The FALSE
* situation occurs if the first character in <text_node> requires UTF-8 to
* represent, and the target string isn't in UTF-8.)
+ *
+ * Some analysis is in GH #18414, located at the time of this writing at:
+ * https://github.com/Perl/perl5/issues/18414
*/
const bool utf8_target = reginfo->is_utf8_target;
/* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e
*/
- U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { 0 };
+ U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } };
U8 lengths[MAX_MATCHES] = { 0 };
U8 index_of_longest = 0;
*
* Everything generally matches at least itself. But if there is a
* UTF8ness mismatch, we have to convert to that of the target string. */
- if (utf8_pat == utf8_target || UTF8_IS_INVARIANT(*pat)) {
- lengths[0] = MIN(pat_len, C_ARRAY_LENGTH(matches[0]));
- Copy(pat, matches[0], lengths[0], U8);
+ if (UTF8_IS_INVARIANT(*pat)) { /* Immaterial if either is in UTF-8 */
+ matches[0][0] = pat[0];
+ lengths[0] = 1;
m->count++;
}
- else if (utf8_target) { /* target is UTF-8; pattern isn't */
- matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
- matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
- lengths[0] = 2;
- m->count++;
- }
- else { /* pattern is UTF-8, target isn't */
- if (UTF8_IS_DOWNGRADEABLE_START(*pat)) {
- matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
- lengths[0] = 1;
+ else if (utf8_target) {
+ if (utf8_pat) {
+ lengths[0] = UTF8SKIP(pat);
+ Copy(pat, matches[0], lengths[0], U8);
m->count++;
}
+ else { /* target is UTF-8, pattern isn't */
+ matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
+ matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
+ lengths[0] = 2;
+ m->count++;
+ }
+ }
+ else if (! utf8_pat) { /* Neither is UTF-8 */
+ matches[0][0] = pat[0];
+ lengths[0] = 1;
+ m->count++;
+ }
+ else /* target isn't UTF-8; pattern is. No match possible unless the
+ pattern's first character can fit in a byte */
+ if (UTF8_IS_DOWNGRADEABLE_START(*pat))
+ {
+ matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
+ lengths[0] = 1;
+ m->count++;
}
/* Here we have taken care of any necessary node-type changes */
fold_from = remaining_fold_froms[i-1];
}
- if (folded == fold_from) { /* We already added the character itself */
+ if (folded == fold_from) { /* We already added the character
+ itself */
continue;
}
lengths[m->count] = UVCHR_SKIP(fold_from);
m->count++;
}
- else { /* Non-UTF8 target: any code point above 255
- can't appear in it */
+ else { /* Non-UTF8 target: no code point above 255 can appear in it
+ */
if (fold_from > 255) {
continue;
}
if (m->count > 1) { /* No need to sort a single entry */
for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) {
- /* Keep the same order for all but the longest */
+ /* Keep the same order for all but the longest. (If the
+ * asserts fail, it could be because m->matches is declared too
+ * short, either because of a new Unicode release, or an
+ * overlooked test case, or it could be a bug.) */
if (i != index_of_longest) {
assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches));
Copy(matches[i], m->matches + cur_pos, lengths[i], U8);
m->lengths[output_index] = lengths[index_of_longest];
}
+
return TRUE;
}
curly_try_B_min_known:
/* find the next place where 'B' could work, then call B */
if (locinput + ST.Binfo.initial_exact < loceol) {
- if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
-
- /* Here, the mask is all 1's for the entire length of
- * any possible match. (That actually means that there
- * is only one possible match.) Look for the next
- * occurrence */
- locinput = ninstr(locinput, loceol,
- (char *) ST.Binfo.matches,
- (char *) ST.Binfo.matches
- + ST.Binfo.initial_exact);
- if (locinput == NULL) {
- sayNO;
- }
- }
- else do {
- /* If the first byte(s) of the mask are all ones, it
- * means those bytes must match identically, so can use
- * ninstr() to find the next possible matchpoint */
- if (ST.Binfo.initial_exact > 0) {
+ if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
+
+ /* Here, the mask is all 1's for the entire length of
+ * any possible match. (That actually means that there
+ * is only one possible match.) Look for the next
+ * occurrence */
locinput = ninstr(locinput, loceol,
- (char *) ST.Binfo.matches,
- (char *) ST.Binfo.matches
+ (char *) ST.Binfo.matches,
+ (char *) ST.Binfo.matches
+ ST.Binfo.initial_exact);
+ if (locinput == NULL) {
+ sayNO;
+ }
}
- else { /* Otherwise find the next byte that matches,
- masked */
- locinput = (char *) find_next_masked(
- (U8 *) locinput, (U8 *) loceol,
- ST.Binfo.first_byte_anded,
- ST.Binfo.first_byte_mask);
- /* Advance to the end of a multi-byte character */
- if (utf8_target) {
- while ( locinput < loceol
- && UTF8_IS_CONTINUATION(*locinput))
- {
- locinput++;
+ else do {
+ /* If the first byte(s) of the mask are all ones, it
+ * means those bytes must match identically, so can use
+ * ninstr() to find the next possible matchpoint */
+ if (ST.Binfo.initial_exact > 0) {
+ locinput = ninstr(locinput, loceol,
+ (char *) ST.Binfo.matches,
+ (char *) ST.Binfo.matches
+ + ST.Binfo.initial_exact);
+ }
+ else { /* Otherwise find the next byte that matches,
+ masked */
+ locinput = (char *) find_next_masked(
+ (U8 *) locinput, (U8 *) loceol,
+ ST.Binfo.first_byte_anded,
+ ST.Binfo.first_byte_mask);
+ /* Advance to the end of a multi-byte character */
+ if (utf8_target) {
+ while ( locinput < loceol
+ && UTF8_IS_CONTINUATION(*locinput))
+ {
+ locinput++;
+ }
}
}
- }
- if ( locinput == NULL
- || locinput + ST.Binfo.min_length > loceol)
- {
- sayNO;
- }
+ if ( locinput == NULL
+ || locinput + ST.Binfo.min_length > loceol)
+ {
+ sayNO;
+ }
- /* Here, we have found a possible match point; if can't
- * rule it out, quit the loop so can check fully */
- if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
- break;
- }
+ /* Here, we have found a possible match point; if we can't
+ * rule it out, quit the loop so we can check fully */
+ if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
+ break;
+ }
- locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
+ locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
- } while (locinput <= ST.maxpos);
+ } while (locinput <= ST.maxpos);
}
if (locinput > ST.maxpos)
n = (utf8_target)
? utf8_length((U8 *) ST.oldloc, (U8 *) locinput)
- : locinput - ST.oldloc;
+ : (STRLEN) (locinput - ST.oldloc);
/* Here is at the beginning of a character that meets the mask
if (definitive_len == 1) {
const char * orig_scan = scan;
- this_eol = MIN(this_eol, scan + max - hardcount);
+ if (this_eol - (scan - hardcount) > max) {
+ this_eol = scan - hardcount + max;
+ }
/* Use different routines depending on whether it's an
* exact match or matches with a mask */