#include "re_top.h"
#endif
-/* At least one required character in the target string is expressible only in
- * UTF-8. */
-static const char* const non_utf8_target_but_utf8_required
- = "Can't match, because target string needs to be in UTF-8\n";
-
-#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
- goto target; \
-} STMT_END
-
/*
* pregcomp and pregexec -- regsub and regerror are not used in perl
*
#include "inline_invlist.c"
#include "unicode_constants.h"
+#ifdef DEBUGGING
+/* At least one required character in the target string is expressible only in
+ * UTF-8. */
+static const char* const non_utf8_target_but_utf8_required
+ = "Can't match, because target string needs to be in UTF-8\n";
+#endif
+
+#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
+ goto target; \
+} STMT_END
+
#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
#ifndef STATIC
#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
PL_utf8_swash_ptrs[_CC_WORDCHAR], \
swash_property_names[_CC_WORDCHAR], \
- GREEK_SMALL_LETTER_IOTA_UTF8)
+ LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
STMT_START { \
LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
"_X_regular_begin", \
- GREEK_SMALL_LETTER_IOTA_UTF8); \
+ LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
"_X_extend", \
COMBINING_GRAVE_ACCENT_UTF8); \
/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
we don't need this definition. */
#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
-#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
+#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
#else
/* ... so we use this as its faster. */
#define IS_TEXT(rn) ( OP(rn)==EXACT )
-#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
+#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
}
check = prog->check_substr;
}
- if ((prog->extflags & RXf_ANCH) /* Match at beg-of-str or after \n */
- && !(prog->extflags & RXf_ANCH_GPOS)) /* \G isn't a BOS or \n */
- {
- ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
+ if (prog->extflags & RXf_ANCH) { /* Match at \G, beg-of-str or after \n */
+ ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
|| ( (prog->extflags & RXf_ANCH_BOL)
&& !multiline ) ); /* Check after \n? */
if (!ml_anch) {
- if ( !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
- && (strpos != strbeg)) {
+ /* we are only allowed to match at BOS or \G */
+
+ if (prog->extflags & RXf_ANCH_GPOS) {
+ /* in this case, we hope(!) that the caller has already
+ * set strpos to pos()-gofs, and will already have checked
+ * that this anchor position is legal
+ */
+ ;
+ }
+ else if (!(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
+ && (strpos != strbeg))
+ {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
goto fail;
}
/* end shift should be non negative here */
}
-#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
+#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
if (end_shift < 0)
Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
(IV)end_shift, RX_PRECOMP(prog));
}
#define DECL_TRIE_TYPE(scan) \
- const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
+ const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
+ trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
trie_type = ((scan->flags == EXACT) \
? (utf8_target ? trie_utf8 : trie_plain) \
- : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
+ : (scan->flags == EXACTFA) \
+ ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
+ : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
STMT_START { \
STRLEN skiplen; \
+ U8 flags = FOLD_FLAGS_FULL; \
switch (trie_type) { \
+ case trie_utf8_exactfa_fold: \
+ flags |= FOLD_FLAGS_NOMIX_ASCII; \
+ /* FALL THROUGH */ \
case trie_utf8_fold: \
if ( foldlen>0 ) { \
uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
uscan += len; \
len=0; \
} else { \
- uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
+ uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags, NULL); \
len = UTF8SKIP(uc); \
skiplen = UNISKIP( uvc ); \
foldlen -= skiplen; \
uscan = foldbuf + skiplen; \
} \
break; \
+ case trie_latin_utf8_exactfa_fold: \
+ flags |= FOLD_FLAGS_NOMIX_ASCII; \
+ /* FALL THROUGH */ \
case trie_latin_utf8_fold: \
if ( foldlen>0 ) { \
uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
len=0; \
} else { \
len = 1; \
- uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \
+ uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
skiplen = UNISKIP( uvc ); \
foldlen -= skiplen; \
uscan = foldbuf + skiplen; \
switch (OP(c)) {
case ANYOF:
case ANYOF_SYNTHETIC:
- case ANYOF_WARN_SUPER:
if (utf8_target) {
REXEC_FBC_UTF8_CLASS_SCAN(
reginclass(prog, c, (U8*)s, utf8_target));
);
break;
+ case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
+ assert(! is_utf8_pat);
+ /* FALL THROUGH */
case EXACTFA:
if (is_utf8_pat || utf8_target) {
utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
folder = foldEQ_latin1; /* /a, except the sharp s one which */
goto do_exactf_non_utf8; /* isn't dealt with by these */
- case EXACTF:
+ case EXACTF: /* This node only generated for non-utf8 patterns */
+ assert(! is_utf8_pat);
if (utf8_target) {
-
- /* regcomp.c already folded this if pattern is in UTF-8 */
utf8_fold_flags = 0;
goto do_exactf_utf8;
}
}
goto do_exactf_utf8;
- case EXACTFU_TRICKYFOLD:
case EXACTFU:
if (is_utf8_pat || utf8_target) {
utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
: strbeg; /* pos() not defined; use start of string */
DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
- "GPOS ganch set to strbeg[%"IVdf"]\n", reginfo->ganch - strbeg));
+ "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
/* in the presence of \G, we may need to start looking earlier in
* the string than the suggested start point of stringarg:
- * if gofs->prog is set, then that's a known, fixed minimum
+ * if prog->gofs is set, then that's a known, fixed minimum
* offset, such as
* /..\G/: gofs = 2
* /ab|c\G/: gofs = 1
Not newSVsv, either, as it does not COW.
*/
reginfo->sv = newSV(0);
- sv_setsv(reginfo->sv, sv);
+ SvSetSV_nosteal(reginfo->sv, sv);
SAVEFREESV(reginfo->sv);
}
SV** listp;
if (! PL_utf8_foldclosures) {
if (! PL_utf8_tofold) {
- U8 dummy[UTF8_MAXBYTES+1];
+ U8 dummy[UTF8_MAXBYTES_CASE+1];
/* Force loading this by folding an above-Latin1 char */
to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
* which is the one above 255 */
if ((c1 < 256) != (c2 < 256)) {
if (OP(text_node) == EXACTFL
- || (OP(text_node) == EXACTFA
+ || ((OP(text_node) == EXACTFA
+ || OP(text_node) == EXACTFA_NO_TRIE)
&& (isASCII(c1) || isASCII(c2))))
{
if (c1 < 256) {
if (utf8_target
&& HAS_NONLATIN1_FOLD_CLOSURE(c1)
&& OP(text_node) != EXACTFL
- && (OP(text_node) != EXACTFA || ! isASCII(c1)))
+ && ((OP(text_node) != EXACTFA
+ && OP(text_node) != EXACTFA_NO_TRIE)
+ || ! isASCII(c1)))
{
/* Here, there could be something above Latin1 in the target which
* folds to this character in the pattern. All such cases except
c2 = PL_fold_locale[c1];
break;
- case EXACTF:
+ case EXACTF: /* This node only generated for non-utf8
+ patterns */
+ assert(! is_utf8_pat);
if (! utf8_target) { /* /d rules */
c2 = PL_fold[c1];
break;
/* FALLTHROUGH */
/* /u rules for all these. This happens to work for
* EXACTFA as nothing in Latin1 folds to ASCII */
+ case EXACTFA_NO_TRIE: /* This node only generated for
+ non-utf8 patterns */
+ assert(! is_utf8_pat);
+ /* FALL THROUGH */
case EXACTFA:
- case EXACTFU_TRICKYFOLD:
case EXACTFU_SS:
case EXACTFU:
c2 = PL_fold_latin1[c1];
during a successful match */
U32 lastopen = 0; /* last open we saw */
bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
- SV* const oreplsv = GvSV(PL_replgv);
+ SV* const oreplsv = GvSVn(PL_replgv);
/* these three flags are set by various ops to signal information to
* the very next op. They have a useful lifetime of exactly one loop
* iteration, and are not preserved or restored by state pushes/pops
sayNO_SILENT;
assert(0); /*NOTREACHED*/
- case EOL: /* /..$/ */
- goto seol;
-
case MEOL: /* /..$/m */
if (!NEXTCHR_IS_EOS && nextchr != '\n')
sayNO;
break;
+ case EOL: /* /..$/ */
+ /* FALL THROUGH */
case SEOL: /* /..$/s */
- seol:
if (!NEXTCHR_IS_EOS && nextchr != '\n')
sayNO;
if (reginfo->strend - locinput > 1)
goto do_exactf;
case EXACTFU_SS: /* /\x{df}/iu */
- case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
case EXACTFU: /* /abc/iu */
folder = foldEQ_latin1;
fold_array = PL_fold_latin1;
fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
goto do_exactf;
+ case EXACTFA_NO_TRIE: /* This node only generated for non-utf8
+ patterns */
+ assert(! is_utf8_pat);
+ /* FALL THROUGH */
case EXACTFA: /* /abc/iaa */
folder = foldEQ_latin1;
fold_array = PL_fold_latin1;
fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
goto do_exactf;
- case EXACTF: /* /abc/i */
+ case EXACTF: /* /abc/i This node only generated for
+ non-utf8 patterns */
+ assert(! is_utf8_pat);
folder = foldEQ;
fold_array = PL_fold;
fold_utf8_flags = 0;
break;
case ANYOF: /* /[abc]/ */
- case ANYOF_WARN_SUPER:
if (NEXTCHR_IS_EOS)
sayNO;
if (utf8_target) {
else { /* /(??{}) */
/* if its overloaded, let the regex compiler handle
* it; otherwise extract regex, or stringify */
+ if (SvGMAGICAL(ret))
+ ret = sv_mortalcopy(ret);
if (!SvAMAGIC(ret)) {
SV *sv = ret;
if (SvROK(sv))
sv = SvRV(sv);
if (SvTYPE(sv) == SVt_REGEXP)
re_sv = (REGEXP*) sv;
- else if (SvSMAGICAL(sv)) {
- MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
+ else if (SvSMAGICAL(ret)) {
+ MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
if (mg)
re_sv = (REGEXP *) mg->mg_obj;
}
- /* force any magic, undef warnings here */
- if (!re_sv) {
+ /* force any undef warnings here */
+ if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
ret = sv_mortalcopy(ret);
(void) SvPV_force_nolen(ret);
}
pm_flags);
if (!(SvFLAGS(ret)
- & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
- | SVs_GMG))) {
+ & (SVs_TEMP | SVs_GMG | SVf_ROK))
+ && (!SvPADTMP(ret) || SvREADONLY(ret))) {
/* This isn't a first class regexp. Instead, it's
caching a regexp onto an existing, Perl visible
scalar. */
sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
}
- /* safe to do now that any $1 etc has been
- * interpolated into the new pattern string and
- * compiled */
- S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
}
SAVEFREESV(re_sv);
re = ReANY(re_sv);
}
break;
+ case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
+ assert(! reginfo->is_utf8_pat);
+ /* FALL THROUGH */
case EXACTFA:
- utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
+ utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
goto do_exactf;
case EXACTFL:
utf8_flags = FOLDEQ_UTF8_LOCALE;
goto do_exactf;
- case EXACTF:
- utf8_flags = 0;
- goto do_exactf;
+ case EXACTF: /* This node only generated for non-utf8 patterns */
+ assert(! reginfo->is_utf8_pat);
+ utf8_flags = 0;
+ goto do_exactf;
case EXACTFU_SS:
- case EXACTFU_TRICKYFOLD:
case EXACTFU:
utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
break;
}
case ANYOF:
- case ANYOF_WARN_SUPER:
if (utf8_target) {
while (hardcount < max
&& scan < loceol
match = TRUE;
}
else if (flags & ANYOF_LOCALE) {
- RXp_MATCH_TAINTED_on(prog);
-
- if ((flags & ANYOF_LOC_FOLD)
- && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
- {
- match = TRUE;
- }
- else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
+ if (flags & ANYOF_LOC_FOLD) {
+ RXp_MATCH_TAINTED_on(prog);
+ if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) {
+ match = TRUE;
+ }
+ }
+ else if (ANYOF_POSIXL_TEST_ANY_SET(n)) {
/* The data structure is arranged so bits 0, 2, 4, ... are set
* if the class includes the Posix character class given by
int count = 0;
int to_complement = 0;
+
+ RXp_MATCH_TAINTED_on(prog);
while (count < ANYOF_MAX) {
- if (ANYOF_CLASS_TEST(n, count)
+ if (ANYOF_POSIXL_TEST(n, count)
&& to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
{
match = TRUE;
* positive that will be resolved when the match is done again as not part
* of the synthetic start class */
if (!match) {
- if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
+ if (utf8_target && (flags & ANYOF_ABOVE_LATIN1_ALL) && c >= 256) {
match = TRUE; /* Everything above 255 matches */
}
else if (ANYOF_NONBITMAP(n)
}
if (UNICODE_IS_SUPER(c)
- && OP(n) == ANYOF_WARN_SUPER
+ && (flags & ANYOF_WARN_SUPER)
&& ckWARN_d(WARN_NON_UNICODE))
{
Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
- "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
+ "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
}
}
+#if ANYOF_INVERT != 1
+ /* Depending on compiler optimization cBOOL takes time, so if don't have to
+ * use it, don't */
+# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
+#endif
+
/* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
- return cBOOL(flags & ANYOF_INVERT) ^ match;
+ return (flags & ANYOF_INVERT) ^ match;
}
STATIC U8 *