* This just calls isFOO_lc on the code point for the character if it is in
* the range 0-255. Outside that range, all characters use Unicode
* rules, ignoring any locale. So use the Unicode function if this class
- * requires a swash, and use the Unicode macro otherwise. */
+ * requires an inversion list, and use the Unicode macro otherwise. */
PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
return FALSE; /* Things like CNTRL are always below 256 */
}
-STATIC char *
-S_find_next_ascii(char * s, const char * send, const bool utf8_target)
-{
- /* Returns the position of the first ASCII byte in the sequence between 's'
- * and 'send-1' inclusive; returns 'send' if none found */
-
- PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
-
-#ifndef EBCDIC
-
- if ((STRLEN) (send - s) >= PERL_WORDSIZE
-
- /* This term is wordsize if subword; 0 if not */
- + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
-
- /* 'offset' */
- - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
- {
-
- /* Process per-byte until reach word boundary. XXX This loop could be
- * eliminated if we knew that this platform had fast unaligned reads */
- while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
- if (isASCII(*s)) {
- return s;
- }
- s++; /* khw didn't bother creating a separate loop for
- utf8_target */
- }
-
- /* Here, we know we have at least one full word to process. Process
- * per-word as long as we have at least a full word left */
- do {
- PERL_UINTMAX_T complemented = ~ * (PERL_UINTMAX_T *) s;
- if (complemented & PERL_VARIANTS_WORD_MASK) {
-
-# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
- || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
-
- s += _variant_byte_number(complemented);
- return s;
-
-# else /* If weird byte order, drop into next loop to do byte-at-a-time
- checks. */
-
- break;
-# endif
- }
-
- s += PERL_WORDSIZE;
-
- } while (s + PERL_WORDSIZE <= send);
- }
-
-#endif
-
- /* Process per-character */
- if (utf8_target) {
- while (s < send) {
- if (isASCII(*s)) {
- return s;
- }
- s += UTF8SKIP(s);
- }
- }
- else {
- while (s < send) {
- if (isASCII(*s)) {
- return s;
- }
- s++;
- }
- }
-
- return s;
-}
-
-STATIC char *
-S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
-{
- /* Returns the position of the first non-ASCII byte in the sequence between
- * 's' and 'send-1' inclusive; returns 'send' if none found */
-
-#ifdef EBCDIC
-
- PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
-
- if (utf8_target) {
- while (s < send) {
- if ( ! isASCII(*s)) {
- return s;
- }
- s += UTF8SKIP(s);
- }
- }
- else {
- while (s < send) {
- if ( ! isASCII(*s)) {
- return s;
- }
- s++;
- }
- }
-
- return s;
-
-#else
-
- const U8 * next_non_ascii = NULL;
-
- PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
- PERL_UNUSED_ARG(utf8_target);
-
- /* On ASCII platforms invariants and ASCII are identical, so if the string
- * is entirely invariants, there is no non-ASCII character */
- return (is_utf8_invariant_string_loc((U8 *) s,
- (STRLEN) (send - s),
- &next_non_ascii))
- ? (char *) send
- : (char *) next_non_ascii;
-
-#endif
-
-}
-
STATIC U8 *
S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
{
(U8) ARG(c), FLAGS(c)));
break;
+ case ANYOFH:
+ if (utf8_target) REXEC_FBC_CLASS_SCAN(TRUE,
+ reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
+ break;
+
case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
assert(! is_utf8_pat);
/* FALLTHROUGH */
case EXACTF: /* This node only generated for non-utf8 patterns */
assert(! is_utf8_pat);
if (utf8_target) {
- utf8_fold_flags = 0;
goto do_exactf_utf8;
}
fold_array = PL_fold;
folder = foldEQ_locale;
goto do_exactf_non_utf8;
- case EXACTFU_SS:
+ case EXACTFUP: /* Problematic even though pattern isn't UTF-8. Use
+ full functionality normally not done except for
+ UTF-8 */
assert(! is_utf8_pat);
goto do_exactf_utf8;
case EXACTFU:
if (is_utf8_pat || utf8_target) {
- utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
+ utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
goto do_exactf_utf8;
}
* so we don't have to worry here about this single special case
* in the Latin1 range */
fold_array = PL_fold_latin1;
- folder = foldEQ_latin1;
+ folder = foldEQ_latin1_s2_folded;
/* FALLTHROUGH */
);
break;
- case ASCII:
- REXEC_FBC_FIND_NEXT_SCAN(0, find_next_ascii(s, strend, utf8_target));
- break;
-
- case NASCII:
- if (utf8_target) {
- REXEC_FBC_FIND_NEXT_SCAN(1, find_next_non_ascii(s, strend,
- utf8_target));
- }
- else {
- REXEC_FBC_FIND_NEXT_SCAN(0, find_next_non_ascii(s, strend,
- utf8_target));
- }
-
- break;
-
/* The argument to all the POSIX node types is the class number to pass to
* _generic_isCC() to build a mask for searching in PL_charclass[] */
int i;
for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
- if (isASCII(*s)) {
+ if (isASCII(*s) && LIKELY(! PL_in_utf8_turkic_locale)) {
*(d++) = (U8) toFOLD_LC(*s);
s++;
}
}
else { /* an EXACTFish node which doesn't begin with a multi-char fold */
c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
- if (c1 > 255) {
+
+ if ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && OP(text_node) == EXACTFL
+ && UNLIKELY( c1 == 'i' || c1 == 'I'
+ || c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE
+ || c1 == LATIN_SMALL_LETTER_DOTLESS_I))
+ { /* Hard-coded Turkish locale rules for these 4 characters
+ override normal rules */
+ if (c1 == 'i') {
+ c2 = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
+ }
+ else if (c1 == 'I') {
+ c2 = LATIN_SMALL_LETTER_DOTLESS_I;
+ }
+ else if (c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+ c2 = 'i';
+ }
+ else if (c1 == LATIN_SMALL_LETTER_DOTLESS_I) {
+ c2 = 'I';
+ }
+ }
+ else if (c1 > 255) {
const unsigned int * remaining_folds;
unsigned int first_fold;
assert(! is_utf8_pat);
/* FALLTHROUGH */
case EXACTFAA:
- case EXACTFU_SS:
+ case EXACTFUP:
case EXACTFU:
c2 = PL_fold_latin1[c1];
break;
}
fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
| FOLDEQ_S2_FOLDS_SANE;
- folder = foldEQ_latin1;
+ folder = foldEQ_latin1_s2_folded;
fold_array = PL_fold_latin1;
goto do_exactf;
fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
goto do_exactf;
- case EXACTFU_SS: /* /\x{df}/iu */
+ case EXACTFUP: /* /foo/iu, and something is problematic in
+ 'foo' so can't take shortcuts. */
assert(! is_utf8_pat);
- /* FALLTHROUGH */
+ folder = foldEQ_latin1;
+ fold_array = PL_fold_latin1;
+ fold_utf8_flags = 0;
+ goto do_exactf;
+
case EXACTFU: /* /abc/iu */
- folder = foldEQ_latin1;
+ folder = foldEQ_latin1_s2_folded;
fold_array = PL_fold_latin1;
- fold_utf8_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
+ fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
goto do_exactf;
case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8
if ( utf8_target
|| is_utf8_pat
- || state_num == EXACTFU_SS
+ || state_num == EXACTFUP
|| (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
{
/* Either target or the pattern are utf8, or has the issue where
goto increment_locinput;
break;
- case ASCII:
- if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) {
- sayNO;
- }
-
- locinput++; /* ASCII is always single byte */
- break;
-
- case NASCII:
- if (NEXTCHR_IS_EOS || isASCII(UCHARAT(locinput))) {
+ case ANYOFH:
+ if ( ! utf8_target
+ || NEXTCHR_IS_EOS
+ || ! reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
+ utf8_target))
+ {
sayNO;
}
-
goto increment_locinput;
break;
PL_op = NULL;
re_sv = NULL;
- if (logical == 0) /* (?{})/ */
- sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
+ if (logical == 0) { /* (?{})/ */
+ SV *replsv = save_scalar(PL_replgv);
+ sv_setsv(replsv, ret); /* $^R */
+ SvSETMAGIC(replsv);
+ }
else if (logical == 1) { /* /(?(?{...})X|Y)/ */
sw = cBOOL(SvTRUE_NN(ret));
logical = 0;
{
/* preserve $^R across LEAVE's. See Bug 121070. */
SV *save_sv= GvSV(PL_replgv);
+ SV *replsv;
SvREFCNT_inc(save_sv);
regcpblow(ST.cp); /* LEAVE in disguise */
- sv_setsv(GvSV(PL_replgv), save_sv);
+ /* must stay after regcpblow(): that LEAVE can change GvSV(PL_replgv) */
+ replsv = GvSV(PL_replgv);
+ sv_setsv(replsv, save_sv);
+ SvSETMAGIC(replsv);
SvREFCNT_dec(save_sv);
}
cur_eval = ST.prev_eval;
* see code related to PL_replgv elsewhere in this file.
* Yves
*/
- if (oreplsv != GvSV(PL_replgv))
+ if (oreplsv != GvSV(PL_replgv)) {
sv_setsv(oreplsv, GvSV(PL_replgv));
+ SvSETMAGIC(oreplsv);
+ }
}
result = 1;
goto final_exit;
goto do_exactf;
case EXACTFU:
- if (reginfo->is_utf8_pat) {
- utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
- }
+ utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
/* FALLTHROUGH */
- case EXACTFU_SS:
+ case EXACTFUP:
do_exactf: {
int c1, c2;
}
break;
- case ASCII:
- if (utf8_target && loceol - scan > max) {
- loceol = scan + max;
+ case ANYOFH:
+ if (utf8_target) while ( hardcount < max
+ && scan < loceol
+ && reginclass(prog, p, (U8*)scan, (U8*) loceol,
+ TRUE))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
}
-
- scan = find_next_non_ascii(scan, loceol, utf8_target);
- break;
-
- case NASCII:
- if (utf8_target) {
- while ( hardcount < max
- && scan < loceol
- && ! isASCII_utf8_safe(scan, loceol))
- {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- }
- else {
- scan = find_next_ascii(scan, loceol, utf8_target);
- }
- break;
+ break;
/* The argument (FLAGS) to all the POSIX node types is the class number */
return(c);
}
-
-#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
-/*
-- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
-create a copy so that changes the caller makes won't change the shared one.
-If <altsvp> is non-null, will return NULL in it, for back-compat.
- */
-SV *
-Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
-{
- PERL_ARGS_ASSERT_REGCLASS_SWASH;
-
- if (altsvp) {
- *altsvp = NULL;
- }
-
- return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
-}
-
-#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
-
/*
- reginclass - determine if a character falls into a character class
}
/* If this character is potentially in the bitmap, check it */
- if (c < NUM_ANYOF_CODE_POINTS) {
+ if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH) {
if (ANYOF_BITMAP_TEST(n, c))
match = TRUE;
else if ((flags
&& IN_UTF8_CTYPE_LOCALE)))
{
SV* only_utf8_locale = NULL;
- SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
- &only_utf8_locale, NULL);
- if (sw) {
+ SV * const definition = _get_regclass_nonbitmap_data(prog, n, TRUE,
+ 0, &only_utf8_locale, NULL);
+ if (definition) {
U8 utf8_buffer[2];
U8 * utf8_p;
if (utf8_target) {
utf8_p = utf8_buffer;
}
- if (swash_fetch(sw, utf8_p, TRUE)) {
+ /* Turkic locales have these hard-coded rules overriding
+ * normal ones */
+ if ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && isALPHA_FOLD_EQ(*p, 'i'))
+ {
+ if (*p == 'i') {
+ if (_invlist_contains_cp(definition,
+ LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
+ {
+ match = TRUE;
+ }
+ }
+ else if (*p == 'I') {
+ if (_invlist_contains_cp(definition,
+ LATIN_SMALL_LETTER_DOTLESS_I))
+ {
+ match = TRUE;
+ }
+ }
+ }
+ else if (_invlist_contains_cp(definition, c)) {
match = TRUE;
}
}
}
}
+ /* In a Turkic locale under folding, hard-code that capital I with dot above
+ * matches a class containing 'i', and dotless small i one containing 'I' */
+ if ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && ! match
+ && (flags & ANYOFL_FOLD)
+ && utf8_target)
+ {
+ if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+ if (ANYOF_BITMAP_TEST(n, 'i')) {
+ match = TRUE;
+ }
+ }
+ else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
+ if (ANYOF_BITMAP_TEST(n, 'I')) {
+ match = TRUE;
+ }
+ }
+ }
+
if (UNICODE_IS_SUPER(c)
&& (flags
& ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)