#include "re_top.h"
#endif
-/* At least one required character in the target string is expressible only in
- * UTF-8. */
-static const char* const non_utf8_target_but_utf8_required
- = "Can't match, because target string needs to be in UTF-8\n";
-
-#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
- goto target; \
-} STMT_END
-
/*
* pregcomp and pregexec -- regsub and regerror are not used in perl
*
#include "inline_invlist.c"
#include "unicode_constants.h"
+#ifdef DEBUGGING
+/* At least one required character in the target string is expressible only in
+ * UTF-8. */
+static const char* const non_utf8_target_but_utf8_required
+ = "Can't match, because target string needs to be in UTF-8\n";
+#endif
+
+#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
+ goto target; \
+} STMT_END
+
#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
#ifndef STATIC
#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
PL_utf8_swash_ptrs[_CC_WORDCHAR], \
swash_property_names[_CC_WORDCHAR], \
- GREEK_SMALL_LETTER_IOTA_UTF8)
+ LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
STMT_START { \
LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
"_X_regular_begin", \
- GREEK_SMALL_LETTER_IOTA_UTF8); \
+ LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
"_X_extend", \
COMBINING_GRAVE_ACCENT_UTF8); \
/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
we don't need this definition. */
#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
-#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
+#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
#else
/* ... so we use this as its faster. */
#define IS_TEXT(rn) ( OP(rn)==EXACT )
-#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
+#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
);
for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
- SSPUSHINT(rex->offs[p].end);
- SSPUSHINT(rex->offs[p].start);
+ SSPUSHIV(rex->offs[p].end);
+ SSPUSHIV(rex->offs[p].start);
SSPUSHINT(rex->offs[p].start_tmp);
DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
" \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
);
paren = *maxopenparen_p;
for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
- I32 tmps;
+ SSize_t tmps;
rex->offs[paren].start_tmp = SSPOPINT;
- rex->offs[paren].start = SSPOPINT;
- tmps = SSPOPINT;
+ rex->offs[paren].start = SSPOPIV;
+ tmps = SSPOPIV;
if (paren <= rex->lastparen)
rex->offs[paren].end = tmps;
DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
}
else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
return isFOO_lc(classnum,
- TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
+ TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
}
if (classnum < _FIRST_NON_SWASH_CC) {
*/
I32
Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
- char *strbeg, I32 minend, SV *screamer, U32 nosave)
+ char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
/* stringarg: the point in the string at which to begin matching */
/* strend: pointer to null at end of string */
/* strbeg: real beginning of string */
{
dVAR;
struct regexp *const prog = ReANY(rx);
- I32 start_shift = 0;
+ SSize_t start_shift = 0;
/* Should be nonnegative! */
- I32 end_shift = 0;
+ SSize_t end_shift = 0;
char *s;
SV *check;
char *t;
}
check = prog->check_substr;
}
- if ((prog->extflags & RXf_ANCH) /* Match at beg-of-str or after \n */
- && !(prog->extflags & RXf_ANCH_GPOS)) /* \G isn't a BOS or \n */
- {
- ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
+ if (prog->extflags & RXf_ANCH) { /* Match at \G, beg-of-str or after \n */
+ ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
|| ( (prog->extflags & RXf_ANCH_BOL)
&& !multiline ) ); /* Check after \n? */
if (!ml_anch) {
- if ( !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
- && (strpos != strbeg)) {
+ /* we are only allowed to match at BOS or \G */
+
+ if (prog->extflags & RXf_ANCH_GPOS) {
+ /* in this case, we hope(!) that the caller has already
+ * set strpos to pos()-gofs, and will already have checked
+ * that this anchor position is legal
+ */
+ ;
+ }
+ else if (!(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
+ && (strpos != strbeg))
+ {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
goto fail;
}
See [perl #115242] */
{
/* Substring at constant offset from beg-of-str... */
- I32 slen;
+ SSize_t slen;
s = HOP3c(strpos, prog->check_offset_min, strend);
end_shift = prog->check_end_shift;
if (!ml_anch) {
- const I32 end = prog->check_offset_max + CHR_SVLEN(check)
+ const SSize_t end = prog->check_offset_max + CHR_SVLEN(check)
- (SvTAIL(check) != 0);
- const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
+ const SSize_t eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
if (end_shift < eshift)
end_shift = eshift;
/* end shift should be non negative here */
}
-#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
+#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
if (end_shift < 0)
Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
(IV)end_shift, RX_PRECOMP(prog));
the "check" substring in the region corrected by start/end_shift. */
{
- I32 srch_start_shift = start_shift;
- I32 srch_end_shift = end_shift;
+ SSize_t srch_start_shift = start_shift;
+ SSize_t srch_end_shift = end_shift;
U8* start_point;
U8* end_point;
if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
}
#define DECL_TRIE_TYPE(scan) \
- const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
+ const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
+ trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
trie_type = ((scan->flags == EXACT) \
? (utf8_target ? trie_utf8 : trie_plain) \
- : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
+ : (scan->flags == EXACTFA) \
+ ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
+ : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
STMT_START { \
STRLEN skiplen; \
+ U8 flags = FOLD_FLAGS_FULL; \
switch (trie_type) { \
+ case trie_utf8_exactfa_fold: \
+ flags |= FOLD_FLAGS_NOMIX_ASCII; \
+ /* FALL THROUGH */ \
case trie_utf8_fold: \
if ( foldlen>0 ) { \
- uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
+ uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
foldlen -= len; \
uscan += len; \
len=0; \
} else { \
- uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
+ uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags, NULL); \
len = UTF8SKIP(uc); \
skiplen = UNISKIP( uvc ); \
foldlen -= skiplen; \
uscan = foldbuf + skiplen; \
} \
break; \
+ case trie_latin_utf8_exactfa_fold: \
+ flags |= FOLD_FLAGS_NOMIX_ASCII; \
+ /* FALL THROUGH */ \
case trie_latin_utf8_fold: \
if ( foldlen>0 ) { \
- uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
+ uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
foldlen -= len; \
uscan += len; \
len=0; \
} else { \
len = 1; \
- uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \
+ uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
skiplen = UNISKIP( uvc ); \
foldlen -= skiplen; \
uscan = foldbuf + skiplen; \
} \
break; \
case trie_utf8: \
- uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
+ uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
break; \
case trie_plain: \
uvc = (UV)*uc; \
switch (OP(c)) {
case ANYOF:
case ANYOF_SYNTHETIC:
- case ANYOF_WARN_SUPER:
if (utf8_target) {
REXEC_FBC_UTF8_CLASS_SCAN(
reginclass(prog, c, (U8*)s, utf8_target));
);
break;
+ case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
+ assert(! is_utf8_pat);
+ /* FALL THROUGH */
case EXACTFA:
if (is_utf8_pat || utf8_target) {
utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
folder = foldEQ_latin1; /* /a, except the sharp s one which */
goto do_exactf_non_utf8; /* isn't dealt with by these */
- case EXACTF:
+ case EXACTF: /* This node only generated for non-utf8 patterns */
+ assert(! is_utf8_pat);
if (utf8_target) {
-
- /* regcomp.c already folded this if pattern is in UTF-8 */
utf8_fold_flags = 0;
goto do_exactf_utf8;
}
}
goto do_exactf_utf8;
- case EXACTFU_TRICKYFOLD:
case EXACTFU:
if (is_utf8_pat || utf8_target) {
utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
* characters, and there are only 2 availabe, we know without
* trying that it will fail; so don't start a match past the
* required minimum number from the far end */
- e = HOP3c(strend, -((I32)ln), s);
+ e = HOP3c(strend, -((SSize_t)ln), s);
if (reginfo->intuit && e < s) {
e = s; /* Due to minlen logic of intuit() */
* only 2 are left, it's guaranteed to fail, so don't start a
* match that would require us to go beyond the end of the string
*/
- e = HOP3c(strend, -((I32)lnc), s);
+ e = HOP3c(strend, -((SSize_t)lnc), s);
if (reginfo->intuit && e < s) {
e = s; /* Due to minlen logic of intuit() */
case BOUNDL:
RXp_MATCH_TAINTED_on(prog);
FBC_BOUND(isWORDCHAR_LC,
- isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
+ isWORDCHAR_LC_uvchr(tmp),
isWORDCHAR_LC_utf8((U8*)s));
break;
case NBOUNDL:
RXp_MATCH_TAINTED_on(prog);
FBC_NBOUND(isWORDCHAR_LC,
- isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
+ isWORDCHAR_LC_uvchr(tmp),
isWORDCHAR_LC_utf8((U8*)s));
break;
case BOUND:
classnum)))
|| (UTF8_IS_DOWNGRADEABLE_START(*s)
&& to_complement ^ cBOOL(
- _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
+ _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
+ *(s + 1)),
classnum))))
{
if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
"Copy on write: regexp capture, type %d\n",
(int) SvTYPE(sv));
}
- /* skip creating new COW SV if a valid one already exists */
- if (! ( prog->saved_copy
- && SvIsCOW(sv)
- && SvPOKp(sv)
- && SvIsCOW(prog->saved_copy)
- && SvPOKp(prog->saved_copy)
- && SvPVX(sv) == SvPVX(prog->saved_copy)))
+ /* Create a new COW SV to share the match string and store
+ * in saved_copy, unless the current COW SV in saved_copy
+ * is valid and suitable for our purpose */
+ if (( prog->saved_copy
+ && SvIsCOW(prog->saved_copy)
+ && SvPOKp(prog->saved_copy)
+ && SvIsCOW(sv)
+ && SvPOKp(sv)
+ && SvPVX(sv) == SvPVX(prog->saved_copy)))
{
+ /* just reuse saved_copy SV */
+ if (RXp_MATCH_COPIED(prog)) {
+ Safefree(prog->subbeg);
+ RXp_MATCH_COPIED_off(prog);
+ }
+ }
+ else {
+ /* create new COW SV to share string */
RX_MATCH_COPY_FREE(rx);
prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
- prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
- assert (SvPOKp(prog->saved_copy));
}
+ prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
+ assert (SvPOKp(prog->saved_copy));
prog->sublen = strend - strbeg;
prog->suboffset = 0;
prog->subcoffset = 0;
} else
#endif
{
- I32 min = 0;
- I32 max = strend - strbeg;
- I32 sublen;
+ SSize_t min = 0;
+ SSize_t max = strend - strbeg;
+ SSize_t sublen;
if ( (flags & REXEC_COPY_SKIP_POST)
- && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
+ && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
&& !(PL_sawampersand & SAWAMPERSAND_RIGHT)
) { /* don't copy $' part of string */
U32 n = 0;
}
if ( (flags & REXEC_COPY_SKIP_PRE)
- && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
+ && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
&& !(PL_sawampersand & SAWAMPERSAND_LEFT)
) { /* don't copy $` part of string */
U32 n = 0;
* $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
* from going quadratic */
if (SvPOKp(sv) && SvPVX(sv) == strbeg)
- sv_pos_b2u(sv, &(prog->subcoffset));
+ prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
+ SV_GMAGIC|SV_CONST_RETURN);
else
prog->subcoffset = utf8_length((U8*)strbeg,
(U8*)(strbeg+prog->suboffset));
*/
I32
Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
- char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
+ char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
/* stringarg: the point in the string at which to begin matching */
/* strend: pointer to null at end of string */
/* strbeg: real beginning of string */
* itself is accessed via the pointers above */
/* data: May be used for some additional optimizations.
Currently unused. */
-/* nosave: For optimizations. */
+/* flags: For optimizations. See REXEC_* in regexp.h */
{
dVAR;
char *s;
regnode *c;
char *startpos;
- I32 minlen; /* must match at least this many chars */
- I32 dontbother = 0; /* how many characters not to try at end */
- I32 end_shift = 0; /* Same for the end. */ /* CC */
+ SSize_t minlen; /* must match at least this many chars */
+ SSize_t dontbother = 0; /* how many characters not to try at end */
const bool utf8_target = cBOOL(DO_UTF8(sv));
I32 multiline;
RXi_GET_DECL(prog,progi);
if (prog->extflags & RXf_GPOS_SEEN) {
MAGIC *mg;
+ /* set reginfo->ganch, the position where \G can match */
+
+ reginfo->ganch =
+ (flags & REXEC_IGNOREPOS)
+ ? stringarg /* use start pos rather than pos() */
+ : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
+ /* Defined pos(): */
+ ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
+ : strbeg; /* pos() not defined; use start of string */
+
+ DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
+ "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
+
/* in the presence of \G, we may need to start looking earlier in
* the string than the suggested start point of stringarg:
- * if gofs->prog is set, then that's a known, fixed minimum
+ * if prog->gofs is set, then that's a known, fixed minimum
* offset, such as
* /..\G/: gofs = 2
* /ab|c\G/: gofs = 1
* or if the minimum offset isn't known, then we have to go back
* to the start of the string, e.g. /w+\G/
*/
- if (prog->gofs) {
+
+ if (prog->extflags & RXf_ANCH_GPOS) {
+ startpos = reginfo->ganch - prog->gofs;
+ if (startpos <
+ ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
+ {
+ DEBUG_r(PerlIO_printf(Perl_debug_log,
+ "fail: ganch-gofs before earliest possible start\n"));
+ return 0;
+ }
+ }
+ else if (prog->gofs) {
if (startpos - prog->gofs < strbeg)
startpos = strbeg;
else
}
else if (prog->extflags & RXf_GPOS_FLOAT)
startpos = strbeg;
-
- /* set reginfo->ganch, the position where \G can match */
-
- reginfo->ganch =
- (flags & REXEC_IGNOREPOS)
- ? stringarg /* use start pos rather than pos() */
- : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
- ? strbeg + mg->mg_len /* Defined pos() */
- : strbeg; /* pos() not defined; use start of string */
-
- DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
- "GPOS ganch set to strbeg[%"IVdf"]\n", reginfo->ganch - strbeg));
}
minlen = prog->minlen;
return 0;
}
+ /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
+ * which will call destuctors to reset PL_regmatch_state, free higher
+ * PL_regmatch_slabs, and clean up regmatch_info_aux and
+ * regmatch_info_aux_eval */
+
+ oldsave = PL_savestack_ix;
+
s = startpos;
- if ((RX_EXTFLAGS(rx) & RXf_USE_INTUIT)
+ if ((prog->extflags & RXf_USE_INTUIT)
&& !(flags & REXEC_CHECKED))
{
s = re_intuit_start(rx, sv, strbeg, startpos, strend,
if (!s)
return 0;
- if (RX_EXTFLAGS(rx) & RXf_CHECK_ALL) {
+ if (prog->extflags & RXf_CHECK_ALL) {
/* we can match based purely on the result of INTUIT.
* Set up captures etc just for $& and $-[0]
* (an intuit-only match wont have $1,$2,..) */
* Let @-, @+, $^N know */
prog->lastparen = prog->lastcloseparen = 0;
RX_MATCH_UTF8_set(rx, utf8_target);
+ prog->offs[0].start = s - strbeg;
+ prog->offs[0].end = utf8_target
+ ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
+ : s - strbeg + prog->minlenret;
if ( !(flags & REXEC_NOT_FIRST) )
S_reg_set_capture_string(aTHX_ rx,
strbeg, strend,
sv, flags, utf8_target);
- prog->offs[0].start = s - strbeg;
- prog->offs[0].end = utf8_target
- ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
- : s - strbeg + prog->minlenret;
return 1;
}
}
-
- /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
- * which will call destuctors to reset PL_regmatch_state, free higher
- * PL_regmatch_slabs, and clean up regmatch_info_aux and
- * regmatch_info_aux_eval */
-
- oldsave = PL_savestack_ix;
-
multiline = prog->extflags & RXf_PMf_MULTILINE;
if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
Not newSVsv, either, as it does not COW.
*/
reginfo->sv = newSV(0);
- sv_setsv(reginfo->sv, sv);
+ SvSetSV_nosteal(reginfo->sv, sv);
SAVEFREESV(reginfo->sv);
}
goto phooey;
} else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
{
- /* the warning about reginfo->ganch being used without initialization
- is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
- and we only enter this block when the same bit is set. */
- char *tmp_s = reginfo->ganch - prog->gofs;
-
- if (s <= tmp_s && regtry(reginfo, &tmp_s))
+ /* For anchored \G, the only position it can match from is
+ * (ganch-gofs); we already set startpos to this above; if intuit
+ * moved us on from there, we can't possibly succeed */
+ assert(startpos == reginfo->ganch - prog->gofs);
+ if (s == startpos && regtry(reginfo, &s))
goto got_it;
goto phooey;
}
|| ((prog->float_substr != NULL || prog->float_utf8 != NULL)
&& prog->float_max_offset < strend - s)) {
SV *must;
- I32 back_max;
- I32 back_min;
+ SSize_t back_max;
+ SSize_t back_min;
char *last;
char *last1; /* Last position checked before */
#ifdef DEBUGGING
last = strend;
} else {
last = HOP3c(strend, /* Cannot start after this */
- -(I32)(CHR_SVLEN(must)
+ -(SSize_t)(CHR_SVLEN(must)
- (SvTAIL(must) != 0) + back_min), strbeg);
}
if (s > reginfo->strbeg)
/* XXXX check_substr already used to find "s", can optimize if
check_substr==must. */
- dontbother = end_shift;
+ dontbother = 0;
strend = HOPc(strend, -dontbother);
while ( (s <= last) &&
(s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
CHECKPOINT lastcp;
REGEXP *const rx = reginfo->prog;
regexp *const prog = ReANY(rx);
- I32 result;
+ SSize_t result;
RXi_GET_DECL(prog,progi);
GET_RE_DEBUG_FLAGS_DECL;
SV** listp;
if (! PL_utf8_foldclosures) {
if (! PL_utf8_tofold) {
- U8 dummy[UTF8_MAXBYTES+1];
+ U8 dummy[UTF8_MAXBYTES_CASE+1];
/* Force loading this by folding an above-Latin1 char */
to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
* which is the one above 255 */
if ((c1 < 256) != (c2 < 256)) {
if (OP(text_node) == EXACTFL
- || (OP(text_node) == EXACTFA
+ || ((OP(text_node) == EXACTFA
+ || OP(text_node) == EXACTFA_NO_TRIE)
&& (isASCII(c1) || isASCII(c2))))
{
if (c1 < 256) {
if (utf8_target
&& HAS_NONLATIN1_FOLD_CLOSURE(c1)
&& OP(text_node) != EXACTFL
- && (OP(text_node) != EXACTFA || ! isASCII(c1)))
+ && ((OP(text_node) != EXACTFA
+ && OP(text_node) != EXACTFA_NO_TRIE)
+ || ! isASCII(c1)))
{
/* Here, there could be something above Latin1 in the target which
* folds to this character in the pattern. All such cases except
c2 = PL_fold_locale[c1];
break;
- case EXACTF:
+ case EXACTF: /* This node only generated for non-utf8
+ patterns */
+ assert(! is_utf8_pat);
if (! utf8_target) { /* /d rules */
c2 = PL_fold[c1];
break;
/* FALLTHROUGH */
/* /u rules for all these. This happens to work for
* EXACTFA as nothing in Latin1 folds to ASCII */
+ case EXACTFA_NO_TRIE: /* This node only generated for
+ non-utf8 patterns */
+ assert(! is_utf8_pat);
+ /* FALL THROUGH */
case EXACTFA:
- case EXACTFU_TRICKYFOLD:
case EXACTFU_SS:
case EXACTFU:
c2 = PL_fold_latin1[c1];
}
/* returns -1 on failure, $+[0] on success */
-STATIC I32
+STATIC SSize_t
S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
{
#if PERL_VERSION < 9 && !defined(PERL_CORE)
regnode *scan;
regnode *next;
U32 n = 0; /* general value; init to avoid compiler warning */
- I32 ln = 0; /* len or last; init to avoid compiler warning */
+ SSize_t ln = 0; /* len or last; init to avoid compiler warning */
char *locinput = startpos;
char *pushinput; /* where to continue after a PUSH */
I32 nextchr; /* is always set to UCHARAT(locinput) */
during a successful match */
U32 lastopen = 0; /* last open we saw */
bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
- SV* const oreplsv = GvSV(PL_replgv);
+ SV* const oreplsv = GvSVn(PL_replgv);
/* these three flags are set by various ops to signal information to
* the very next op. They have a useful lifetime of exactly one loop
* iteration, and are not preserved or restored by state pushes/pops
sayNO_SILENT;
assert(0); /*NOTREACHED*/
- case EOL: /* /..$/ */
- goto seol;
-
case MEOL: /* /..$/m */
if (!NEXTCHR_IS_EOS && nextchr != '\n')
sayNO;
break;
+ case EOL: /* /..$/ */
+ /* FALL THROUGH */
case SEOL: /* /..$/s */
- seol:
if (!NEXTCHR_IS_EOS && nextchr != '\n')
sayNO;
if (reginfo->strend - locinput > 1)
while (chars) {
if (utf8_target) {
- uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
+ uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
uniflags);
uc += len;
}
while (foldlen) {
if (!--chars)
break;
- uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
+ uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
uniflags);
uscan += len;
foldlen -= len;
l++;
}
else {
- if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
+ if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
+ {
sayNO;
}
l += 2;
s++;
}
else {
- if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
+ if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
+ {
sayNO;
}
s += 2;
goto do_exactf;
case EXACTFU_SS: /* /\x{df}/iu */
- case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
case EXACTFU: /* /abc/iu */
folder = foldEQ_latin1;
fold_array = PL_fold_latin1;
fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
goto do_exactf;
+ case EXACTFA_NO_TRIE: /* This node only generated for non-utf8
+ patterns */
+ assert(! is_utf8_pat);
+ /* FALL THROUGH */
case EXACTFA: /* /abc/iaa */
folder = foldEQ_latin1;
fold_array = PL_fold_latin1;
fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
goto do_exactf;
- case EXACTF: /* /abc/i */
+ case EXACTF: /* /abc/i This node only generated for
+ non-utf8 patterns */
+ assert(! is_utf8_pat);
folder = foldEQ;
fold_array = PL_fold;
fold_utf8_flags = 0;
}
}
else {
- ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
+ ln = isWORDCHAR_LC_uvchr(ln);
n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
}
}
break;
case ANYOF: /* /[abc]/ */
- case ANYOF_WARN_SUPER:
if (NEXTCHR_IS_EOS)
sayNO;
if (utf8_target) {
}
else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
- (U8) TWO_BYTE_UTF8_TO_UNI(nextchr,
+ (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
*(locinput + 1))))))
{
sayNO;
}
else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
if (! (to_complement
- ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
+ ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
*(locinput + 1)),
- FLAGS(scan)))))
+ FLAGS(scan)))))
{
sayNO;
}
rex->offs[0].end = locinput - reginfo->strbeg;
if (reginfo->info_aux_eval->pos_magic)
- reginfo->info_aux_eval->pos_magic->mg_len
- = locinput - reginfo->strbeg;
+ MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
+ reginfo->sv, reginfo->strbeg,
+ locinput - reginfo->strbeg);
if (sv_yes_mark) {
SV *sv_mrk = get_sv("REGMARK", 1);
else { /* /(??{}) */
/* if its overloaded, let the regex compiler handle
* it; otherwise extract regex, or stringify */
+ if (SvGMAGICAL(ret))
+ ret = sv_mortalcopy(ret);
if (!SvAMAGIC(ret)) {
SV *sv = ret;
if (SvROK(sv))
sv = SvRV(sv);
if (SvTYPE(sv) == SVt_REGEXP)
re_sv = (REGEXP*) sv;
- else if (SvSMAGICAL(sv)) {
- MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
+ else if (SvSMAGICAL(ret)) {
+ MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
if (mg)
re_sv = (REGEXP *) mg->mg_obj;
}
- /* force any magic, undef warnings here */
- if (!re_sv) {
+ /* force any undef warnings here */
+ if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
ret = sv_mortalcopy(ret);
(void) SvPV_force_nolen(ret);
}
pm_flags);
if (!(SvFLAGS(ret)
- & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
- | SVs_GMG))) {
+ & (SVs_TEMP | SVs_GMG | SVf_ROK))
+ && (!SvPADTMP(ret) || SvREADONLY(ret))) {
/* This isn't a first class regexp. Instead, it's
caching a regexp onto an existing, Perl visible
scalar. */
sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
}
- /* safe to do now that any $1 etc has been
- * interpolated into the new pattern string and
- * compiled */
- S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
}
SAVEFREESV(re_sv);
re = ReANY(re_sv);
if (reginfo->poscache_iter-- == 0) {
/* initialise cache */
- const I32 size = (reginfo->poscache_maxiter + 7)/8;
+ const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
regmatch_info_aux *const aux = reginfo->info_aux;
if (aux->poscache) {
- if ((I32)reginfo->poscache_size < size) {
+ if ((SSize_t)reginfo->poscache_size < size) {
Renew(aux->poscache, size, char);
reginfo->poscache_size = size;
}
if (reginfo->poscache_iter < 0) {
/* have we already failed at this position? */
- I32 offset, mask;
+ SSize_t offset, mask;
reginfo->poscache_iter = -1; /* stop eventual underflow */
offset = (scan->flags & 0xf) - 1
/* simulate B failing */
DEBUG_OPTIMISE_r(
PerlIO_printf(Perl_debug_log,
- "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
+ "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
(int)(REPORT_CODE_OFF+(depth*2)),"",
valid_utf8_to_uvchr((U8 *) locinput, NULL),
valid_utf8_to_uvchr(ST.c1_utf8, NULL),
/* simulate B failing */
DEBUG_OPTIMISE_r(
PerlIO_printf(Perl_debug_log,
- "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
+ "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
(int)(REPORT_CODE_OFF+(depth*2)),"",
(int) nextchr, ST.c1, ST.c2)
);
/* Target isn't utf8; convert the character in the UTF-8
* pattern to non-UTF8, and do a simple loop */
- c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
+ c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
while (scan < loceol && UCHARAT(scan) == c) {
scan++;
}
}
break;
+ case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
+ assert(! reginfo->is_utf8_pat);
+ /* FALL THROUGH */
case EXACTFA:
- utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
+ utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
goto do_exactf;
case EXACTFL:
utf8_flags = FOLDEQ_UTF8_LOCALE;
goto do_exactf;
- case EXACTF:
- utf8_flags = 0;
- goto do_exactf;
+ case EXACTF: /* This node only generated for non-utf8 patterns */
+ assert(! reginfo->is_utf8_pat);
+ utf8_flags = 0;
+ goto do_exactf;
case EXACTFU_SS:
- case EXACTFU_TRICKYFOLD:
case EXACTFU:
utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
break;
}
case ANYOF:
- case ANYOF_WARN_SUPER:
if (utf8_target) {
while (hardcount < max
&& scan < loceol
}
else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
if (! (to_complement
- ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan,
- *(scan + 1)),
+ ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
+ *(scan + 1)),
classnum))))
{
break;
S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
{
/* Returns the swash for the input 'node' in the regex 'prog'.
- * If <doinit> is true, will attempt to create the swash if not already
+ * If <doinit> is 'true', will attempt to create the swash if not already
* done.
- * If <listsvp> is non-null, will return the swash initialization string in
- * it.
+ * If <listsvp> is non-null, will return the printable contents of the
+ * swash. This can be used to get debugging information even before the
+ * swash exists, by calling this function with 'doinit' set to false, in
+ * which case the components that will be used to eventually create the
+ * swash are returned (in a printable form).
* Tied intimately to how regcomp.c sets up the data structure */
dVAR;
SV *sw = NULL;
- SV *si = NULL;
+ SV *si = NULL; /* Input swash initialization string */
SV* invlist = NULL;
RXi_GET_DECL(prog,progi);
/* Element [1] is reserved for the set-up swash. If already there,
* return it; if not, create it and store it there */
- if (SvROK(ary[1])) {
+ if (ary[1] && SvROK(ary[1])) {
sw = ary[1];
}
else if (si && doinit) {
}
}
+ /* If requested, return a printable version of what this swash matches */
if (listsvp) {
SV* matches_string = newSVpvn("", 0);
- /* Use the swash, if any, which has to have incorporated into it all
- * possibilities */
+ /* The swash should be used, if possible, to get the data, as it
+ * contains the resolved data. But this function can be called at
+ * compile-time, before everything gets resolved, in which case we
+ * return the currently best available information, which is the string
+ * that will eventually be used to do that resolving, 'si' */
if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
&& (si && si != &PL_sv_undef))
{
-
- /* If no swash, use the input initialization string, if available */
sv_catsv(matches_string, si);
}
match = TRUE;
}
else if (flags & ANYOF_LOCALE) {
- RXp_MATCH_TAINTED_on(prog);
-
- if ((flags & ANYOF_LOC_FOLD)
- && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
- {
- match = TRUE;
- }
- else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
+ if (flags & ANYOF_LOC_FOLD) {
+ RXp_MATCH_TAINTED_on(prog);
+ if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) {
+ match = TRUE;
+ }
+ }
+ else if (ANYOF_POSIXL_TEST_ANY_SET(n)) {
/* The data structure is arranged so bits 0, 2, 4, ... are set
* if the class includes the Posix character class given by
int count = 0;
int to_complement = 0;
+
+ RXp_MATCH_TAINTED_on(prog);
while (count < ANYOF_MAX) {
- if (ANYOF_CLASS_TEST(n, count)
+ if (ANYOF_POSIXL_TEST(n, count)
&& to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
{
match = TRUE;
* positive that will be resolved when the match is done again as not part
* of the synthetic start class */
if (!match) {
- if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
+ if (utf8_target && (flags & ANYOF_ABOVE_LATIN1_ALL) && c >= 256) {
match = TRUE; /* Everything above 255 matches */
}
else if (ANYOF_NONBITMAP(n)
}
if (UNICODE_IS_SUPER(c)
- && OP(n) == ANYOF_WARN_SUPER
+ && (flags & ANYOF_WARN_SUPER)
&& ckWARN_d(WARN_NON_UNICODE))
{
Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
- "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
+ "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
}
}
+#if ANYOF_INVERT != 1
+ /* Depending on compiler optimization cBOOL takes time, so if don't have to
+ * use it, don't */
+# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
+#endif
+
/* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
- return cBOOL(flags & ANYOF_INVERT) ^ match;
+ return (flags & ANYOF_INVERT) ^ match;
}
STATIC U8 *
-S_reghop3(U8 *s, I32 off, const U8* lim)
+S_reghop3(U8 *s, SSize_t off, const U8* lim)
{
/* return the position 'off' UTF-8 characters away from 's', forward if
* 'off' >= 0, backwards if negative. But don't go outside of position
we ifdef it out - dmq
*/
STATIC U8 *
-S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
+S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
{
dVAR;
#endif
STATIC U8 *
-S_reghopmaybe3(U8* s, I32 off, const U8* lim)
+S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
{
dVAR;
}
eval_state->pos_magic = mg;
eval_state->pos = mg->mg_len;
+ eval_state->pos_flags = mg->mg_flags;
}
else
eval_state->pos_magic = NULL;
RXp_MATCH_COPIED_on(rex);
}
if (eval_state->pos_magic)
+ {
eval_state->pos_magic->mg_len = eval_state->pos;
+ eval_state->pos_magic->mg_flags =
+ (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
+ | (eval_state->pos_flags & MGf_BYTES);
+ }
PL_curpm = eval_state->curpm;
}