*/
/*
- * One Ring to rule them all, One Ring to find them
- &
+ * One Ring to rule them all, One Ring to find them
+ *
* [p.v of _The Lord of the Rings_, opening poem]
* [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
* [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
# include "regcomp.h"
#endif
-#include "inline_invlist.c"
+#include "invlist_inline.h"
#include "unicode_constants.h"
+#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
+ "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
+
+static const char utf8_locale_required[] =
+ "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
+
#ifdef DEBUGGING
/* At least one required character in the target string is expressible only in
* UTF-8. */
= "Can't match, because target string needs to be in UTF-8\n";
#endif
-#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
- goto target; \
+#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
+ goto target; \
} STMT_END
#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
#define STATIC static
#endif
-/* Valid only for non-utf8 strings: avoids the reginclass
- * call if there are no complications: i.e., if everything matchable is
- * straight forward in the bitmap */
-#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \
- : ANYOF_BITMAP_TEST(p,*(c)))
+/* Valid only if 'c', the character being looke-up, is an invariant under
+ * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
+ * everything matchable is straight forward in the bitmap */
+#define REGINCLASS(prog,p,c,u) (ANYOF_FLAGS(p) \
+ ? reginclass(prog,p,c,c+1,u) \
+ : ANYOF_BITMAP_TEST(p,*(c)))
/*
* Forwards.
#define HOPBACKc(pos, off) \
(char*)(reginfo->is_utf8_target \
- ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
+ ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(reginfo->strbeg)) \
: (pos - off >= reginfo->strbeg) \
? (U8*)pos - off \
: NULL)
PL_utf8_swash_ptrs[_CC_WORDCHAR], \
"", \
PL_XPosix_ptrs[_CC_WORDCHAR], \
- LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
-
-#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
- STMT_START { \
- LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
- "_X_regular_begin", \
- NULL, \
- LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
- LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
- "_X_extend", \
- NULL, \
- COMBINING_GRAVE_ACCENT_UTF8); \
- } STMT_END
+ LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
#define PLACEHOLDER /* Something for the preprocessor to grab onto */
/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
*/
#define JUMPABLE(rn) ( \
OP(rn) == OPEN || \
- (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
+ (OP(rn) == CLOSE && \
+ !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
OP(rn) == EVAL || \
OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
OP(rn) == PLUS || OP(rn) == MINMOD || \
#if 0
/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
- we don't need this definition. */
+ we don't need this definition. XXX These are now out-of-sync*/
#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
#else
/* ... so we use this as its faster. */
-#define IS_TEXT(rn) ( OP(rn)==EXACT )
-#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
+#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==EXACTL )
+#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
} \
} STMT_END
-/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
- * These are for the pre-composed Hangul syllables, which are all in a
- * contiguous block and arranged there in such a way so as to facilitate
- * alorithmic determination of their characteristics. As such, they don't need
- * a swash, but can be determined by simple arithmetic. Almost all are
- * GCB=LVT, but every 28th one is a GCB=LV */
-#define SBASE 0xAC00 /* Start of block */
-#define SCount 11172 /* Length of block */
-#define TCount 28
-
#define SLAB_FIRST(s) (&(s)->states[0])
#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
STATIC CHECKPOINT
S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
{
- dVAR;
const int retval = PL_savestack_ix;
const int paren_elems_to_push =
(maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
PERL_ARGS_ASSERT_REGCPPUSH;
if (paren_elems_to_push < 0)
- Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %i",
- paren_elems_to_push, maxopenparen, parenfloor, REGCP_PAREN_ELEMS);
+ Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
+ (int)paren_elems_to_push, (int)maxopenparen,
+ (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
DEBUG_BUFFERS_r(
if ((int)maxopenparen > (int)parenfloor)
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
PTR2UV(rex),
PTR2UV(rex->offs)
SSPUSHIV(rex->offs[p].end);
SSPUSHIV(rex->offs[p].start);
SSPUSHINT(rex->offs[p].start_tmp);
- DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_BUFFERS_r(Perl_re_printf( aTHX_
" \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
(UV)p,
(IV)rex->offs[p].start,
/* These are needed since we do not localize EVAL nodes: */
#define REGCP_SET(cp) \
DEBUG_STATE_r( \
- PerlIO_printf(Perl_debug_log, \
- " Setting an EVAL scope, savestack=%"IVdf"\n", \
- (IV)PL_savestack_ix)); \
+ Perl_re_exec_indentf( aTHX_ \
+ "Setting an EVAL scope, savestack=%"IVdf",\n", \
+ depth, (IV)PL_savestack_ix \
+ ) \
+ ); \
cp = PL_savestack_ix
#define REGCP_UNWIND(cp) \
DEBUG_STATE_r( \
- if (cp != PL_savestack_ix) \
- PerlIO_printf(Perl_debug_log, \
- " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
- (IV)(cp), (IV)PL_savestack_ix)); \
+ if (cp != PL_savestack_ix) \
+ Perl_re_exec_indentf( aTHX_ \
+ "Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n",\
+ depth, (IV)(cp), (IV)PL_savestack_ix \
+ ) \
+ ); \
regcpblow(cp)
#define UNWIND_PAREN(lp, lcp) \
STATIC void
S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
{
- dVAR;
UV i;
U32 paren;
GET_RE_DEBUG_FLAGS_DECL;
/* Now restore the parentheses context. */
DEBUG_BUFFERS_r(
if (i || rex->lastparen + 1 <= rex->nparens)
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
PTR2UV(rex),
PTR2UV(rex->offs)
tmps = SSPOPIV;
if (paren <= rex->lastparen)
rex->offs[paren].end = tmps;
- DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
+ DEBUG_BUFFERS_r( Perl_re_printf( aTHX_
" \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
(UV)paren,
(IV)rex->offs[paren].start,
if (i > *maxopenparen_p)
rex->offs[i].start = -1;
rex->offs[i].end = -1;
- DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
+ DEBUG_BUFFERS_r( Perl_re_printf( aTHX_
" \\%"UVuf": %s ..-1 undeffing\n",
(UV)i,
(i > *maxopenparen_p) ? "-1" : " "
case _CC_ENUM_ALPHA: return isALPHA_LC(character);
case _CC_ENUM_ASCII: return isASCII_LC(character);
case _CC_ENUM_BLANK: return isBLANK_LC(character);
- case _CC_ENUM_CASED: return isLOWER_LC(character)
+ case _CC_ENUM_CASED: return isLOWER_LC(character)
|| isUPPER_LC(character);
case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
case _CC_ENUM_LOWER: return isLOWER_LC(character);
case _CC_ENUM_PRINT: return isPRINT_LC(character);
- case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
case _CC_ENUM_SPACE: return isSPACE_LC(character);
case _CC_ENUM_UPPER: return isUPPER_LC(character);
Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
}
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
return FALSE;
}
* '_char_class_number'.
*
* This just calls isFOO_lc on the code point for the character if it is in
- * the range 0-255. Outside that range, all characters avoid Unicode
+ * the range 0-255. Outside that range, all characters use Unicode
* rules, ignoring any locale. So use the Unicode function if this class
* requires a swash, and use the Unicode macro otherwise. */
}
else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
return isFOO_lc(classnum,
- TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
+ EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
}
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
+
if (classnum < _FIRST_NON_SWASH_CC) {
/* Initialize the swash unless done already */
}
switch ((_char_class_number) classnum) {
- case _CC_ENUM_SPACE:
- case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
-
+ case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
- default: return 0; /* Things like CNTRL are always
- below 256 */
+ default: break;
}
- assert(0); /* NOTREACHED */
- return FALSE;
+ return FALSE; /* Things like CNTRL are always below 256 */
}
/*
const U32 flags,
re_scream_pos_data *data)
{
- dVAR;
struct regexp *const prog = ReANY(rx);
SSize_t start_shift = prog->check_offset_min;
/* Should be nonnegative! */
PERL_UNUSED_ARG(flags);
PERL_UNUSED_ARG(data);
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
"Intuit: trying to determine minimum start position...\n"));
/* for now, assume that all substr offsets are positive. If at some point
- * in the future someone wants to do clever things with look-behind and
+ * in the future someone wants to do clever things with lookbehind and
* -ve offsets, they'll need to fix up any code in this function
* which uses these offsets. See the thread beginning
* <20140113145929.GF27210@iabyn.com>
* to quickly reject some cases that can't match, but will reject
* them later after doing full char arithmetic */
if (prog->minlen > strend - strpos) {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
" String too short...\n"));
goto fail;
}
+ RX_MATCH_UTF8_set(rx,utf8_target);
reginfo->is_utf8_target = cBOOL(utf8_target);
reginfo->info_aux = NULL;
reginfo->strbeg = strbeg;
if (!sv)
continue;
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
" substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
" useful=%"IVdf" utf8=%d [%s]\n",
i,
});
if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
- /* Check after \n? */
- ml_anch = (prog->intflags & PREGf_ANCH_MBOL);
- if (!ml_anch) {
+ /* ml_anch: check after \n?
+ *
+ * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
+ * with /.*.../, these flags will have been added by the
+ * compiler:
+ * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
+ * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
+ */
+ ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
+ && !(prog->intflags & PREGf_IMPLICIT);
+
+ if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
/* we are only allowed to match at BOS or \G */
/* trivially reject if there's a BOS anchor and we're not at BOS.
- * In the case of \G, we hope(!) that the caller has already
- * set strpos to pos()-gofs, and will already have checked
- * that this anchor position is legal. So we can skip it here.
+ *
+ * Note that we don't try to do a similar quick reject for
+ * \G, since generally the caller will have calculated strpos
+ * based on pos() and gofs, so the string is already correctly
+ * anchored by definition; and handling the exceptions would
+ * be too fiddly (e.g. REXEC_IGNOREPOS).
*/
- if ( !(prog->intflags & PREGf_ANCH_GPOS)
- && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
- && (strpos != strbeg))
+ if ( strpos != strbeg
+ && (prog->intflags & PREGf_ANCH_SBOL))
{
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
" Not at start...\n"));
goto fail;
}
* caller will have set strpos=pos()-4; we look for the substr
* at position pos()-4+1, which lines up with the "a" */
- if (prog->check_offset_min == prog->check_offset_max
- && !(prog->intflags & PREGf_CANY_SEEN))
- {
+ if (prog->check_offset_min == prog->check_offset_max) {
/* Substring at constant offset from beg-of-str... */
SSize_t slen = SvCUR(check);
- char *s;
-
- s = HOP3c(strpos, prog->check_offset_min, strend);
+ char *s = HOP3c(strpos, prog->check_offset_min, strend);
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
" Looking for check substr at fixed offset %"IVdf"...\n",
(IV)prog->check_offset_min));
- if (SvTAIL(check) && !multiline) {
- /* In this case, the regex is anchored at the end too,
- * so the lengths must match exactly, give or take a \n.
- * NB: slen >= 1 since the last char of check is \n */
- if ( strend - s > slen || strend - s < slen - 1
- || (strend - s == slen && strend[-1] != '\n'))
+ if (SvTAIL(check)) {
+ /* In this case, the regex is anchored at the end too.
+ * Unless it's a multiline match, the lengths must match
+ * exactly, give or take a \n. NB: slen >= 1 since
+ * the last char of check is \n */
+ if (!multiline
+ && ( strend - s > slen
+ || strend - s < slen - 1
+ || (strend - s == slen && strend[-1] != '\n')))
{
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
" String too long...\n"));
goto fail_finish;
}
if (slen && (*SvPVX_const(check) != *s
|| (slen > 1 && memNE(SvPVX_const(check), s, slen))))
{
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
" String not equal...\n"));
goto fail_finish;
}
U8* end_point;
DEBUG_OPTIMISE_MORE_r({
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
" At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
" Start shift: %"IVdf" End shift %"IVdf
" Real end Shift: %"IVdf"\n",
- (IV)(rx_origin - strpos),
+ (IV)(rx_origin - strbeg),
(IV)prog->check_offset_min,
(IV)start_shift,
(IV)end_shift,
(IV)prog->check_end_shift);
});
- if (prog->intflags & PREGf_CANY_SEEN) {
- start_point= (U8*)(rx_origin + start_shift);
- end_point= (U8*)(strend - end_shift);
- if (start_point > end_point)
- goto fail_finish;
- } else {
- end_point = HOP3(strend, -end_shift, strbeg);
- start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
- if (!start_point)
- goto fail_finish;
- }
+ end_point = HOP3(strend, -end_shift, strbeg);
+ start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
+ if (!start_point)
+ goto fail_finish;
- /* if the regex is absolutely anchored to the start of the string,
- * then check_offset_max represents an upper bound on the string
- * where the substr could start */
+ /* If the regex is absolutely anchored to either the start of the
+ * string (SBOL) or to pos() (ANCH_GPOS), then
+ * check_offset_max represents an upper bound on the string where
+ * the substr could start. For the ANCH_GPOS case, we assume that
+ * the caller of intuit will have already set strpos to
+ * pos()-gofs, so in this case strpos + offset_max will still be
+ * an upper bound on the substr.
+ */
if (!ml_anch
&& prog->intflags & PREGf_ANCH
- && prog->check_offset_max != SSize_t_MAX
- && start_shift < prog->check_offset_max)
+ && prog->check_offset_max != SSize_t_MAX)
{
SSize_t len = SvCUR(check) - !!SvTAIL(check);
- end_point = HOP3lim(start_point,
- prog->check_offset_max - start_shift,
- end_point -len)
- + len;
- }
+ const char * const anchor =
+ (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
- DEBUG_OPTIMISE_MORE_r({
- PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n",
- (int)(end_point - start_point),
- (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
- start_point);
- });
+ /* do a bytes rather than chars comparison. It's conservative;
+ * so it skips doing the HOP if the result can't possibly end
+ * up earlier than the old value of end_point.
+ */
+ if ((char*)end_point - anchor > prog->check_offset_max) {
+ end_point = HOP3lim((U8*)anchor,
+ prog->check_offset_max,
+ end_point -len)
+ + len;
+ }
+ }
check_at = fbm_instr( start_point, end_point,
check, multiline ? FBMrf_MULTILINE : 0);
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
+ " doing 'check' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
+ (IV)((char*)start_point - strbeg),
+ (IV)((char*)end_point - strbeg),
+ (IV)(check_at ? check_at - strbeg : -1)
+ ));
+
/* Update the count-of-usability, remove useless subpatterns,
unshift s. */
DEBUG_EXECUTE_r({
RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
- PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s",
+ Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
(check_at ? "Found" : "Did not find"),
(check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
? "anchored" : "floating"),
if (!check_at)
goto fail_finish;
- /* Finish the diagnostic message */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
-
/* set rx_origin to the minimum position where the regex could start
* matching, given the constraint of the just-matched check substring.
* But don't set it lower than previously.
if (check_at - rx_origin > prog->check_offset_max)
rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
+ /* Finish the diagnostic message */
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
+ "%ld (rx_origin now %"IVdf")...\n",
+ (long)(check_at - strbeg),
+ (IV)(rx_origin - strbeg)
+ ));
}
must = utf8_target ? other->utf8_substr : other->substr;
assert(SvPOK(must));
- s = fbm_instr(
- (unsigned char*)s,
- (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
- must,
- multiline ? FBMrf_MULTILINE : 0
- );
+ {
+ char *from = s;
+ char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
+
+ if (to > strend)
+ to = strend;
+ if (from > to) {
+ s = NULL;
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
+ " skipping 'other' fbm scan: %"IVdf" > %"IVdf"\n",
+ (IV)(from - strbeg),
+ (IV)(to - strbeg)
+ ));
+ }
+ else {
+ s = fbm_instr(
+ (unsigned char*)from,
+ (unsigned char*)to,
+ must,
+ multiline ? FBMrf_MULTILINE : 0
+ );
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
+ " doing 'other' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
+ (IV)(from - strbeg),
+ (IV)(to - strbeg),
+ (IV)(s ? s - strbeg : -1)
+ ));
+ }
+ }
+
DEBUG_EXECUTE_r({
RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
- PerlIO_printf(Perl_debug_log, " %s %s substr %s%s",
+ Perl_re_printf( aTHX_ " %s %s substr %s%s",
s ? "Found" : "Contradicts",
other_ix ? "floating" : "anchored",
quoted, RE_SV_TAIL(must));
/* last1 is latest possible substr location. If we didn't
* find it before there, we never will */
if (last >= last1) {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- ", giving up...\n"));
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
+ "; giving up...\n"));
goto fail_finish;
}
/* try to find the check substr again at a later
* position. Maybe next time we'll find the "other" substr
* in range too */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- ", trying %s at offset %ld...\n",
- (other_ix ? "floating" : "anchored"),
- (long)(HOP3c(check_at, 1, strend) - strpos)));
-
other_last = HOP3c(last, 1, strend) /* highest failure */;
rx_origin =
other_ix /* i.e. if other-is-float */
? HOP3c(rx_origin, 1, strend)
: HOP4c(last, 1 - other->min_offset, strbeg, strend);
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
+ "; about to retry %s at offset %ld (rx_origin now %"IVdf")...\n",
+ (other_ix ? "floating" : "anchored"),
+ (long)(HOP3c(check_at, 1, strend) - strbeg),
+ (IV)(rx_origin - strbeg)
+ ));
goto restart;
}
else {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
- (long)(s - strpos)));
-
if (other_ix) { /* if (other-is-float) */
/* other_last is set to s, not s+1, since its possible for
* a floating substr to fail first time, then succeed
rx_origin = HOP3c(s, -other->min_offset, strbeg);
other_last = HOP3c(s, 1, strend);
}
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
+ " at offset %ld (rx_origin now %"IVdf")...\n",
+ (long)(s - strbeg),
+ (IV)(rx_origin - strbeg)
+ ));
+
}
}
else {
DEBUG_OPTIMISE_MORE_r(
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
" Check-only match: offset min:%"IVdf" max:%"IVdf
" check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
- " strend-strpos:%"IVdf"\n",
+ " strend:%"IVdf"\n",
(IV)prog->check_offset_min,
(IV)prog->check_offset_max,
- (IV)(check_at-strpos),
- (IV)(rx_origin-strpos),
+ (IV)(check_at-strbeg),
+ (IV)(rx_origin-strbeg),
(IV)(rx_origin-check_at),
- (IV)(strend-strpos)
+ (IV)(strend-strbeg)
)
);
}
/* handle the extra constraint of /^.../m if present */
- if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n'
- /* May be due to an implicit anchor of m{.*foo} */
- && !(prog->intflags & PREGf_IMPLICIT))
- {
+ if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
char *s;
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
" looking for /^/m anchor"));
/* we have failed the constraint of a \n before rx_origin.
* scanning ahead for the next \n or the next substr is debatable.
* On the one hand you'd expect rare substrings to appear less
* often than \n's. On the other hand, searching for \n means
- * we're effectively flipping been check_substr and "\n" on each
+ * we're effectively flipping between check_substr and "\n" on each
* iteration as the current "rarest" string candidate, which
* means for example that we'll quickly reject the whole string if
* hasn't got a \n, rather than trying every substr position
if (s <= rx_origin ||
! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
{
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
" Did not find /%s^%s/m...\n",
PL_colors[0], PL_colors[1]));
goto fail_finish;
/* Position contradicts check-string; either because
* check was anchored (and thus has no wiggle room),
* or check was float and rx_origin is above the float range */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
- PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
+ " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
+ PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
goto restart;
}
* contradict. On the other hand, the float "check" substr
* didn't contradict, so just retry the anchored "other"
* substr */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
+ " Found /%s^%s/m, rescanning for anchored from offset %"IVdf" (rx_origin now %"IVdf")...\n",
PL_colors[0], PL_colors[1],
- (long)(rx_origin - strpos),
- (long)(rx_origin - strpos + prog->anchored_offset)));
+ (IV)(rx_origin - strbeg + prog->anchored_offset),
+ (IV)(rx_origin - strbeg)
+ ));
goto do_other_substr;
}
/* success: we don't contradict the found floating substring
* (and there's no anchored substr). */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- " Found /%s^%s/m at offset %ld...\n",
- PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
+ " Found /%s^%s/m with rx_origin %ld...\n",
+ PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
}
else {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
" (multiline anchor test skipped)\n"));
}
else
endpos= strend;
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
" looking for class: start_shift: %"IVdf" check_at: %"IVdf
" rx_origin: %"IVdf" endpos: %"IVdf"\n",
(IV)start_shift, (IV)(check_at - strbeg),
reginfo);
if (!s) {
if (endpos == strend) {
- DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
" Could not match STCLASS...\n") );
goto fail;
}
- DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
" This position contradicts STCLASS...\n") );
- if ((prog->intflags & PREGf_ANCH) && !ml_anch)
+ if ((prog->intflags & PREGf_ANCH) && !ml_anch
+ && !(prog->intflags & PREGf_IMPLICIT))
goto fail;
/* Contradict one of substrings */
* The condition above is in bytes rather than
* chars for efficiency. It's conservative, in
* that it errs on the side of doing 'goto
- * do_other_substr', where a more accurate
- * char-based calculation will be done */
- DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
- " Looking for anchored substr starting at offset %ld...\n",
- (long)(other_last - strpos)) );
+ * do_other_substr'. In this case, at worst,
+ * an extra anchored search may get done, but in
+ * practice the extra fbm_instr() is likely to
+ * get skipped anyway. */
+ DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
+ " about to retry anchored at offset %ld (rx_origin now %"IVdf")...\n",
+ (long)(other_last - strbeg),
+ (IV)(rx_origin - strbeg)
+ ));
goto do_other_substr;
}
}
* but since we goto a block of code that's going to
* search for the next \n if any, its safe here */
rx_origin++;
- DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
- " Looking for /%s^%s/m starting at offset %ld...\n",
+ DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
+ " about to look for /%s^%s/m starting at rx_origin %ld...\n",
PL_colors[0], PL_colors[1],
- (long)(rx_origin - strpos)) );
+ (long)(rx_origin - strbeg)) );
goto postprocess_substr_matches;
}
* It's conservative: it errs on the side of doing 'goto restart',
* where there is code that does a proper char-based test */
if (rx_origin + start_shift + end_shift > strend) {
- DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
" Could not match STCLASS...\n") );
goto fail;
}
- DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
- " Looking for %s substr starting at offset %ld...\n",
+ DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
+ " about to look for %s substr starting at offset %ld (rx_origin now %"IVdf")...\n",
(prog->substrs->check_ix ? "floating" : "anchored"),
- (long)(rx_origin + start_shift - strpos)) );
+ (long)(rx_origin + start_shift - strbeg),
+ (IV)(rx_origin - strbeg)
+ ));
goto restart;
}
/* Success !!! */
if (rx_origin != s) {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
" By STCLASS: moving %ld --> %ld\n",
- (long)(rx_origin - strpos), (long)(s - strpos))
+ (long)(rx_origin - strbeg), (long)(s - strbeg))
);
}
else {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
" Does not contradict STCLASS...\n");
);
}
/* Fixed substring is found far enough so that the match
cannot start at strpos. */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n"));
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
}
else {
)))
{
/* If flags & SOMETHING - do not do it many times on the same match */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n"));
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
/* XXX Does the destruction order has to change with utf8_target? */
SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
prog->check_substr = prog->check_utf8 = NULL; /* disable */
prog->float_substr = prog->float_utf8 = NULL; /* clear */
check = NULL; /* abort */
- /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
- see http://bugs.activestate.com/show_bug.cgi?id=87173 */
- if (prog->intflags & PREGf_IMPLICIT) {
- prog->intflags &= ~PREGf_ANCH_MBOL;
- /* maybe we have no anchors left after this... */
- if (!(prog->intflags & PREGf_ANCH))
- prog->extflags &= ~RXf_IS_ANCHORED;
- }
/* XXXX This is a remnant of the old implementation. It
looks wasteful, since now INTUIT can use many
other heuristics. */
prog->extflags &= ~RXf_USE_INTUIT;
- /* XXXX What other flags might need to be cleared in this branch? */
}
}
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
"Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
- PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) );
+ PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
return rx_origin;
if (prog->check_substr || prog->check_utf8) /* could be removed already */
BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
fail:
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
PL_colors[4], PL_colors[5]));
return NULL;
}
#define DECL_TRIE_TYPE(scan) \
- const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
- trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
- trie_type = ((scan->flags == EXACT) \
- ? (utf8_target ? trie_utf8 : trie_plain) \
- : (scan->flags == EXACTFA) \
- ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
- : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
+ const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
+ trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
+ trie_utf8l, trie_flu8 } \
+ trie_type = ((scan->flags == EXACT) \
+ ? (utf8_target ? trie_utf8 : trie_plain) \
+ : (scan->flags == EXACTL) \
+ ? (utf8_target ? trie_utf8l : trie_plain) \
+ : (scan->flags == EXACTFA) \
+ ? (utf8_target \
+ ? trie_utf8_exactfa_fold \
+ : trie_latin_utf8_exactfa_fold) \
+ : (scan->flags == EXACTFLU8 \
+ ? trie_flu8 \
+ : (utf8_target \
+ ? trie_utf8_fold \
+ : trie_latin_utf8_fold)))
#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
STMT_START { \
STRLEN skiplen; \
U8 flags = FOLD_FLAGS_FULL; \
switch (trie_type) { \
+ case trie_flu8: \
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
+ if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
+ } \
+ goto do_trie_utf8_fold; \
case trie_utf8_exactfa_fold: \
flags |= FOLD_FLAGS_NOMIX_ASCII; \
- /* FALL THROUGH */ \
+ /* FALLTHROUGH */ \
case trie_utf8_fold: \
+ do_trie_utf8_fold: \
if ( foldlen>0 ) { \
uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
foldlen -= len; \
} else { \
uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
len = UTF8SKIP(uc); \
- skiplen = UNISKIP( uvc ); \
+ skiplen = UVCHR_SKIP( uvc ); \
foldlen -= skiplen; \
uscan = foldbuf + skiplen; \
} \
break; \
case trie_latin_utf8_exactfa_fold: \
flags |= FOLD_FLAGS_NOMIX_ASCII; \
- /* FALL THROUGH */ \
+ /* FALLTHROUGH */ \
case trie_latin_utf8_fold: \
if ( foldlen>0 ) { \
uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
} else { \
len = 1; \
uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
- skiplen = UNISKIP( uvc ); \
+ skiplen = UVCHR_SKIP( uvc ); \
foldlen -= skiplen; \
uscan = foldbuf + skiplen; \
} \
break; \
+ case trie_utf8l: \
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
+ if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
+ } \
+ /* FALLTHROUGH */ \
case trie_utf8: \
uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
break; \
} \
} STMT_END
-#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
+#define DUMP_EXEC_POS(li,s,doutf8,depth) \
+ dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
+ startpos, doutf8, depth)
+
+#define REXEC_FBC_EXACTISH_SCAN(COND) \
STMT_START { \
while (s <= e) { \
- if ( (CoNd) \
+ if ( (COND) \
&& (ln == 1 || folder(s, pat_string, ln)) \
&& (reginfo->intuit || regtry(reginfo, &s)) )\
goto got_it; \
} \
} STMT_END
-#define REXEC_FBC_UTF8_SCAN(CoDe) \
+#define REXEC_FBC_UTF8_SCAN(CODE) \
STMT_START { \
while (s < strend) { \
- CoDe \
+ CODE \
s += UTF8SKIP(s); \
} \
} STMT_END
-#define REXEC_FBC_SCAN(CoDe) \
+#define REXEC_FBC_SCAN(CODE) \
STMT_START { \
while (s < strend) { \
- CoDe \
+ CODE \
s++; \
} \
} STMT_END
-#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
-REXEC_FBC_UTF8_SCAN( \
- if (CoNd) { \
- if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
- goto got_it; \
- else \
- tmp = doevery; \
- } \
- else \
- tmp = 1; \
+#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
+REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \
+ if (COND) { \
+ if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
+ goto got_it; \
+ else \
+ tmp = doevery; \
+ } \
+ else \
+ tmp = 1; \
)
-#define REXEC_FBC_CLASS_SCAN(CoNd) \
-REXEC_FBC_SCAN( \
- if (CoNd) { \
- if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
- goto got_it; \
- else \
- tmp = doevery; \
- } \
- else \
- tmp = 1; \
+#define REXEC_FBC_CLASS_SCAN(COND) \
+REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
+ if (COND) { \
+ if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
+ goto got_it; \
+ else \
+ tmp = doevery; \
+ } \
+ else \
+ tmp = 1; \
)
-#define REXEC_FBC_TRYIT \
-if ((reginfo->intuit || regtry(reginfo, &s))) \
- goto got_it
-
-#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
+#define REXEC_FBC_CSCAN(CONDUTF8,COND) \
if (utf8_target) { \
- REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
+ REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \
} \
else { \
- REXEC_FBC_CLASS_SCAN(CoNd); \
+ REXEC_FBC_CLASS_SCAN(COND); \
}
-
-#define DUMP_EXEC_POS(li,s,doutf8) \
- dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
- startpos, doutf8)
-
-
-#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
- tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
- tmp = TEST_NON_UTF8(tmp); \
- REXEC_FBC_UTF8_SCAN( \
- if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
- tmp = !tmp; \
- IF_SUCCESS; \
- } \
- else { \
- IF_FAIL; \
- } \
- ); \
-#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
- if (s == reginfo->strbeg) { \
- tmp = '\n'; \
- } \
- else { \
- U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
- tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
+/* The three macros below are slightly different versions of the same logic.
+ *
+ * The first is for /a and /aa when the target string is UTF-8. This can only
+ * match ascii, but it must advance based on UTF-8. The other two handle the
+ * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
+ * for the boundary (or non-boundary) between a word and non-word character.
+ * The utf8 and non-utf8 cases have the same logic, but the details must be
+ * different. Find the "wordness" of the character just prior to this one, and
+ * compare it with the wordness of this one. If they differ, we have a
+ * boundary. At the beginning of the string, pretend that the previous
+ * character was a new-line.
+ *
+ * All these macros uncleanly have side-effects with each other and outside
+ * variables. So far it's been too much trouble to clean-up
+ *
+ * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
+ * a word character or not.
+ * IF_SUCCESS is code to do if it finds that we are at a boundary between
+ * word/non-word
+ * IF_FAIL is code to do if we aren't at a boundary between word/non-word
+ *
+ * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
+ * are looking for a boundary or for a non-boundary. If we are looking for a
+ * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
+ * see if this tentative match actually works, and if so, to quit the loop
+ * here. And vice-versa if we are looking for a non-boundary.
+ *
+ * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
+ * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
+ * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
+ * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
+ * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
+ * complement. But in that branch we complement tmp, meaning that at the
+ * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
+ * which means at the top of the loop in the next iteration, it is
+ * TEST_NON_UTF8(s-1) */
+#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
+ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
+ tmp = TEST_NON_UTF8(tmp); \
+ REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
+ if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
+ tmp = !tmp; \
+ IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
+ } \
+ else { \
+ IF_FAIL; \
+ } \
+ ); \
+
+/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
+ * TEST_UTF8 is a macro that for the same input code points returns identically
+ * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
+#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
+ if (s == reginfo->strbeg) { \
+ tmp = '\n'; \
+ } \
+ else { /* Back-up to the start of the previous character */ \
+ U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
+ tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
0, UTF8_ALLOW_DEFAULT); \
- } \
- tmp = TeSt1_UtF8; \
- LOAD_UTF8_CHARCLASS_ALNUM(); \
- REXEC_FBC_UTF8_SCAN( \
- if (tmp == ! (TeSt2_UtF8)) { \
- tmp = !tmp; \
- IF_SUCCESS; \
- } \
- else { \
- IF_FAIL; \
- } \
- ); \
+ } \
+ tmp = TEST_UV(tmp); \
+ LOAD_UTF8_CHARCLASS_ALNUM(); \
+ REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
+ if (tmp == ! (TEST_UTF8((U8 *) s))) { \
+ tmp = !tmp; \
+ IF_SUCCESS; \
+ } \
+ else { \
+ IF_FAIL; \
+ } \
+ );
-/* The only difference between the BOUND and NBOUND cases is that
- * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
- * NBOUND. This is accomplished by passing it in either the if or else clause,
- * with the other one being empty */
-#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
- FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
-
-#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
- FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
-
-#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
- FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
-
-#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
- FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
-
-
-/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
- * be passed in completely with the variable name being tested, which isn't
- * such a clean interface, but this is easier to read than it was before. We
- * are looking for the boundary (or non-boundary between a word and non-word
- * character. The utf8 and non-utf8 cases have the same logic, but the details
- * must be different. Find the "wordness" of the character just prior to this
- * one, and compare it with the wordness of this one. If they differ, we have
- * a boundary. At the beginning of the string, pretend that the previous
- * character was a new-line */
+/* Like the above two macros. UTF8_CODE is the complete code for handling
+ * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
+ * macros below */
#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
if (utf8_target) { \
- UTF8_CODE \
+ UTF8_CODE \
} \
else { /* Not utf8 */ \
tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
tmp = TEST_NON_UTF8(tmp); \
- REXEC_FBC_SCAN( \
+ REXEC_FBC_SCAN( /* advances s while s < strend */ \
if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
- tmp = !tmp; \
IF_SUCCESS; \
+ tmp = !tmp; \
} \
else { \
IF_FAIL; \
} \
); \
} \
- if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
- goto got_it;
+ /* Here, things have been set up by the previous code so that tmp is the \
+ * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
+ * utf8ness of the target). We also have to check if this matches against \
+ * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
+ * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
+ * string */ \
+ if (tmp == ! TEST_NON_UTF8('\n')) { \
+ IF_SUCCESS; \
+ } \
+ else { \
+ IF_FAIL; \
+ }
+
+/* This is the macro to use when we want to see if something that looks like it
+ * could match, actually does, and if so exits the loop */
+#define REXEC_FBC_TRYIT \
+ if ((reginfo->intuit || regtry(reginfo, &s))) \
+ goto got_it
+
+/* The only difference between the BOUND and NBOUND cases is that
+ * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
+ * NBOUND. This is accomplished by passing it as either the if or else clause,
+ * with the other one being empty (PLACEHOLDER is defined as empty).
+ *
+ * The TEST_FOO parameters are for operating on different forms of input, but
+ * all should be ones that return identically for the same underlying code
+ * points */
+#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
+ FBC_BOUND_COMMON( \
+ FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
+ TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
+
+#define FBC_BOUND_A(TEST_NON_UTF8) \
+ FBC_BOUND_COMMON( \
+ FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
+ TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
+
+#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
+ FBC_BOUND_COMMON( \
+ FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
+ TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
+
+#define FBC_NBOUND_A(TEST_NON_UTF8) \
+ FBC_BOUND_COMMON( \
+ FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
+ TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
+
+#ifdef DEBUGGING
+static IV
+S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
+ IV cp_out = Perl__invlist_search(invlist, cp_in);
+ assert(cp_out >= 0);
+ return cp_out;
+}
+# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
+ invmap[S_get_break_val_cp_checked(invlist, cp)]
+#else
+# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
+ invmap[_invlist_search(invlist, cp)]
+#endif
+
+/* Takes a pointer to an inversion list, a pointer to its corresponding
+ * inversion map, and a code point, and returns the code point's value
+ * according to the two arrays. It assumes that all code points have a value.
+ * This is used as the base macro for macros for particular properties */
+#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
+ _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
+
+/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
+ * of a code point, returning the value for the first code point in the string.
+ * And it takes the particular macro name that finds the desired value given a
+ * code point. Merely convert the UTF-8 to code point and call the cp macro */
+#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
+ (__ASSERT_(pos < strend) \
+ /* Note assumes is valid UTF-8 */ \
+ (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
+
+/* Returns the GCB value for the input code point */
+#define getGCB_VAL_CP(cp) \
+ _generic_GET_BREAK_VAL_CP( \
+ PL_GCB_invlist, \
+ _Perl_GCB_invmap, \
+ (cp))
+
+/* Returns the GCB value for the first code point in the UTF-8 encoded string
+ * bounded by pos and strend */
+#define getGCB_VAL_UTF8(pos, strend) \
+ _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
+
+/* Returns the LB value for the input code point */
+#define getLB_VAL_CP(cp) \
+ _generic_GET_BREAK_VAL_CP( \
+ PL_LB_invlist, \
+ _Perl_LB_invmap, \
+ (cp))
+
+/* Returns the LB value for the first code point in the UTF-8 encoded string
+ * bounded by pos and strend */
+#define getLB_VAL_UTF8(pos, strend) \
+ _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
+
+
+/* Returns the SB value for the input code point */
+#define getSB_VAL_CP(cp) \
+ _generic_GET_BREAK_VAL_CP( \
+ PL_SB_invlist, \
+ _Perl_SB_invmap, \
+ (cp))
+
+/* Returns the SB value for the first code point in the UTF-8 encoded string
+ * bounded by pos and strend */
+#define getSB_VAL_UTF8(pos, strend) \
+ _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
+
+/* Returns the WB value for the input code point */
+#define getWB_VAL_CP(cp) \
+ _generic_GET_BREAK_VAL_CP( \
+ PL_WB_invlist, \
+ _Perl_WB_invmap, \
+ (cp))
+
+/* Returns the WB value for the first code point in the UTF-8 encoded string
+ * bounded by pos and strend */
+#define getWB_VAL_UTF8(pos, strend) \
+ _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
/* We know what class REx starts with. Try to find this position... */
/* if reginfo->intuit, its a dryrun */
/* annoyingly all the vars in this routine have different names from their counterparts
in regmatch. /grrr */
-
STATIC char *
S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
const char *strend, regmatch_info *reginfo)
/* We know what class it must start with. */
switch (OP(c)) {
+ case ANYOFL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+ }
+
+ /* FALLTHROUGH */
+ case ANYOFD:
case ANYOF:
if (utf8_target) {
REXEC_FBC_UTF8_CLASS_SCAN(
reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
}
else {
- REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
+ REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s, 0));
}
break;
- case CANY:
- REXEC_FBC_SCAN(
- if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
- goto got_it;
- else
- tmp = doevery;
- );
- break;
case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
assert(! is_utf8_pat);
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case EXACTFA:
if (is_utf8_pat || utf8_target) {
utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
goto do_exactf_non_utf8;
case EXACTFL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
utf8_fold_flags = FOLDEQ_LOCALE;
goto do_exactf_utf8;
}
goto do_exactf_utf8;
+ case EXACTFLU8:
+ if (! utf8_target) { /* All code points in this node require
+ UTF-8 to express. */
+ break;
+ }
+ utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
+ | FOLDEQ_S2_FOLDS_SANE;
+ goto do_exactf_utf8;
+
case EXACTFU:
if (is_utf8_pat || utf8_target) {
utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
fold_array = PL_fold_latin1;
folder = foldEQ_latin1;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
- do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
+ do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
are no glitches with fold-length differences
between the target string and pattern */
}
break;
- do_exactf_utf8:
- {
+ do_exactf_utf8:
+ {
unsigned expansion;
/* If one of the operands is in utf8, we can't use the simpler folding
}
break;
}
+
case BOUNDL:
- FBC_BOUND(isWORDCHAR_LC,
- isWORDCHAR_LC_uvchr(tmp),
- isWORDCHAR_LC_utf8((U8*)s));
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (FLAGS(c) != TRADITIONAL_BOUND) {
+ if (! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ }
+ goto do_boundu;
+ }
+
+ FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
break;
+
case NBOUNDL:
- FBC_NBOUND(isWORDCHAR_LC,
- isWORDCHAR_LC_uvchr(tmp),
- isWORDCHAR_LC_utf8((U8*)s));
- break;
- case BOUND:
- FBC_BOUND(isWORDCHAR,
- isWORDCHAR_uni(tmp),
- cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (FLAGS(c) != TRADITIONAL_BOUND) {
+ if (! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ }
+ goto do_nboundu;
+ }
+
+ FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
break;
- case BOUNDA:
- FBC_BOUND_NOLOAD(isWORDCHAR_A,
- isWORDCHAR_A(tmp),
- isWORDCHAR_A((U8*)s));
+
+ case BOUND: /* regcomp.c makes sure that this only has the traditional \b
+ meaning */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
+ FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
break;
- case NBOUND:
- FBC_NBOUND(isWORDCHAR,
- isWORDCHAR_uni(tmp),
- cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
+
+ case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
+ meaning */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
+ FBC_BOUND_A(isWORDCHAR_A);
break;
- case NBOUNDA:
- FBC_NBOUND_NOLOAD(isWORDCHAR_A,
- isWORDCHAR_A(tmp),
- isWORDCHAR_A((U8*)s));
+
+ case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
+ meaning */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
+ FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
break;
- case BOUNDU:
- FBC_BOUND(isWORDCHAR_L1,
- isWORDCHAR_uni(tmp),
- cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
+
+ case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
+ meaning */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
+ FBC_NBOUND_A(isWORDCHAR_A);
break;
+
case NBOUNDU:
- FBC_NBOUND(isWORDCHAR_L1,
- isWORDCHAR_uni(tmp),
- cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
+ if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
+ FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
+ break;
+ }
+
+ do_nboundu:
+
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case BOUNDU:
+ do_boundu:
+ switch((bound_type) FLAGS(c)) {
+ case TRADITIONAL_BOUND:
+ FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
+ break;
+ case GCB_BOUND:
+ if (s == reginfo->strbeg) {
+ if (reginfo->intuit || regtry(reginfo, &s))
+ {
+ goto got_it;
+ }
+
+ /* Didn't match. Try at the next position (if there is one) */
+ s += (utf8_target) ? UTF8SKIP(s) : 1;
+ if (UNLIKELY(s >= reginfo->strend)) {
+ break;
+ }
+ }
+
+ if (utf8_target) {
+ GCB_enum before = getGCB_VAL_UTF8(
+ reghop3((U8*)s, -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend);
+ while (s < strend) {
+ GCB_enum after = getGCB_VAL_UTF8((U8*) s,
+ (U8*) reginfo->strend);
+ if ( (to_complement ^ isGCB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ utf8_target))
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ before = after;
+ s += UTF8SKIP(s);
+ }
+ }
+ else { /* Not utf8. Everything is a GCB except between CR and
+ LF */
+ while (s < strend) {
+ if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
+ || UCHARAT(s) != '\n'))
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ s++;
+ }
+ }
+
+ /* And, since this is a bound, it can match after the final
+ * character in the string */
+ if ((reginfo->intuit || regtry(reginfo, &s))) {
+ goto got_it;
+ }
+ break;
+
+ case LB_BOUND:
+ if (s == reginfo->strbeg) {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+ s += (utf8_target) ? UTF8SKIP(s) : 1;
+ if (UNLIKELY(s >= reginfo->strend)) {
+ break;
+ }
+ }
+
+ if (utf8_target) {
+ LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend);
+ while (s < strend) {
+ LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
+ if (to_complement ^ isLB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target)
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ before = after;
+ s += UTF8SKIP(s);
+ }
+ }
+ else { /* Not utf8. */
+ LB_enum before = getLB_VAL_CP((U8) *(s -1));
+ while (s < strend) {
+ LB_enum after = getLB_VAL_CP((U8) *s);
+ if (to_complement ^ isLB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target)
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ before = after;
+ s++;
+ }
+ }
+
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+
+ break;
+
+ case SB_BOUND:
+ if (s == reginfo->strbeg) {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+ s += (utf8_target) ? UTF8SKIP(s) : 1;
+ if (UNLIKELY(s >= reginfo->strend)) {
+ break;
+ }
+ }
+
+ if (utf8_target) {
+ SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend);
+ while (s < strend) {
+ SB_enum after = getSB_VAL_UTF8((U8*) s,
+ (U8*) reginfo->strend);
+ if ((to_complement ^ isSB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target))
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ before = after;
+ s += UTF8SKIP(s);
+ }
+ }
+ else { /* Not utf8. */
+ SB_enum before = getSB_VAL_CP((U8) *(s -1));
+ while (s < strend) {
+ SB_enum after = getSB_VAL_CP((U8) *s);
+ if ((to_complement ^ isSB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target))
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ before = after;
+ s++;
+ }
+ }
+
+ /* Here are at the final position in the target string. The SB
+ * value is always true here, so matches, depending on other
+ * constraints */
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+
+ break;
+
+ case WB_BOUND:
+ if (s == reginfo->strbeg) {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+ s += (utf8_target) ? UTF8SKIP(s) : 1;
+ if (UNLIKELY(s >= reginfo->strend)) {
+ break;
+ }
+ }
+
+ if (utf8_target) {
+ /* We are at a boundary between char_sub_0 and char_sub_1.
+ * We also keep track of the value for char_sub_-1 as we
+ * loop through the line. Context may be needed to make a
+ * determination, and if so, this can save having to
+ * recalculate it */
+ WB_enum previous = WB_UNKNOWN;
+ WB_enum before = getWB_VAL_UTF8(
+ reghop3((U8*)s,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend);
+ while (s < strend) {
+ WB_enum after = getWB_VAL_UTF8((U8*) s,
+ (U8*) reginfo->strend);
+ if ((to_complement ^ isWB(previous,
+ before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target))
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ previous = before;
+ before = after;
+ s += UTF8SKIP(s);
+ }
+ }
+ else { /* Not utf8. */
+ WB_enum previous = WB_UNKNOWN;
+ WB_enum before = getWB_VAL_CP((U8) *(s -1));
+ while (s < strend) {
+ WB_enum after = getWB_VAL_CP((U8) *s);
+ if ((to_complement ^ isWB(previous,
+ before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target))
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ previous = before;
+ before = after;
+ s++;
+ }
+ }
+
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+ }
break;
+
case LNBREAK:
REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
is_LNBREAK_latin1_safe(s, strend)
/* FALLTHROUGH */
case POSIXL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
break;
case NPOSIXA:
if (utf8_target) {
/* The complement of something that matches only ASCII matches all
- * UTF-8 variant code points, plus everything in ASCII that isn't
- * in the class */
- REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
+ * non-ASCII, plus everything in ASCII that isn't in the class. */
+ REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
|| ! _generic_isCC_A(*s, FLAGS(c)));
break;
}
}
else {
- posix_utf8:
+ posix_utf8:
classnum = (_char_class_number) FLAGS(c);
if (classnum < _FIRST_NON_SWASH_CC) {
while (s < strend) {
classnum)))
|| (UTF8_IS_DOWNGRADEABLE_START(*s)
&& to_complement ^ cBOOL(
- _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
+ _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
*(s + 1)),
classnum))))
{
}
else switch (classnum) { /* These classes are implemented as
macros */
- case _CC_ENUM_SPACE: /* XXX would require separate code if we
- revert the change of \v matching this */
- /* FALL THROUGH */
-
- case _CC_ENUM_PSXSPC:
+ case _CC_ENUM_SPACE:
REXEC_FBC_UTF8_CLASS_SCAN(
to_complement ^ cBOOL(isSPACE_utf8(s)));
break;
default:
Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
}
}
break;
DEBUG_TRIE_EXECUTE_r(
if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
dump_exec_pos( (char *)uc, c, strend, real_start,
- (char *)uc, utf8_target );
- PerlIO_printf( Perl_debug_log,
+ (char *)uc, utf8_target, 0 );
+ Perl_re_printf( aTHX_
" Scanning for legal start char...\n");
}
);
foldbuf, uniflags);
DEBUG_TRIE_EXECUTE_r({
dump_exec_pos( (char *)uc, c, strend,
- real_start, s, utf8_target);
- PerlIO_printf(Perl_debug_log,
+ real_start, s, utf8_target, 0);
+ Perl_re_printf( aTHX_
" Charid:%3u CP:%4"UVxf" ",
charid, uvc);
});
DEBUG_TRIE_EXECUTE_r({
if (failed)
dump_exec_pos( (char *)uc, c, strend, real_start,
- s, utf8_target );
- PerlIO_printf( Perl_debug_log,
+ s, utf8_target, 0 );
+ Perl_re_printf( aTHX_
"%sState: %4"UVxf", word=%"UVxf,
failed ? " Fail transition to " : "",
(UV)state, (UV)word);
&& (tmp=trie->trans[offset].next))
{
DEBUG_TRIE_EXECUTE_r(
- PerlIO_printf( Perl_debug_log," - legal\n"));
+ Perl_re_printf( aTHX_ " - legal\n"));
state = tmp;
break;
}
else {
DEBUG_TRIE_EXECUTE_r(
- PerlIO_printf( Perl_debug_log," - fail\n"));
+ Perl_re_printf( aTHX_ " - fail\n"));
failed = 1;
state = aho->fail[state];
}
else {
/* we must be accepting here */
DEBUG_TRIE_EXECUTE_r(
- PerlIO_printf( Perl_debug_log," - accepting\n"));
+ Perl_re_printf( aTHX_ " - accepting\n"));
failed = 1;
break;
}
if (leftmost) {
s = (char*)leftmost;
DEBUG_TRIE_EXECUTE_r({
- PerlIO_printf(
- Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
+ Perl_re_printf( aTHX_ "Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
(UV)accepted_word, (IV)(s - real_start)
);
});
}
s = HOPc(s,1);
DEBUG_TRIE_EXECUTE_r({
- PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
+ Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
});
} else {
DEBUG_TRIE_EXECUTE_r(
- PerlIO_printf( Perl_debug_log,"No match.\n"));
+ Perl_re_printf( aTHX_ "No match.\n"));
break;
}
}
break;
default:
Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
- break;
}
return 0;
got_it:
if (flags & REXEC_COPY_STR) {
#ifdef PERL_ANY_COW
if (SvCANCOW(sv)) {
- if (DEBUG_C_TEST) {
- PerlIO_printf(Perl_debug_log,
+ DEBUG_C(Perl_re_printf( aTHX_
"Copy on write: regexp capture, type %d\n",
- (int) SvTYPE(sv));
- }
+ (int) SvTYPE(sv)));
/* Create a new COW SV to share the match string and store
* in saved_copy, unless the current COW SV in saved_copy
* is valid and suitable for our purpose */
U32 n = 0;
max = -1;
/* calculate the right-most part of the string covered
- * by a capture. Due to look-ahead, this may be to
+ * by a capture. Due to lookahead, this may be to
* the right of $&, so we have to scan all captures */
while (n <= prog->lastparen) {
if (prog->offs[n].end > max)
U32 n = 0;
min = max;
/* calculate the left-most part of the string covered
- * by a capture. Due to look-behind, this may be to
+ * by a capture. Due to lookbehind, this may be to
* the left of $&, so we have to scan all captures */
while (min && n <= prog->lastparen) {
if ( prog->offs[n].start != -1
/* flags: For optimizations. See REXEC_* in regexp.h */
{
- dVAR;
struct regexp *const prog = ReANY(rx);
char *s;
regnode *c;
PERL_UNUSED_ARG(data);
/* Be paranoid... */
- if (prog == NULL || stringarg == NULL) {
+ if (prog == NULL) {
Perl_croak(aTHX_ "NULL regexp parameter");
- return 0;
}
DEBUG_EXECUTE_r(
startpos = stringarg;
+ /* set these early as they may be used by the HOP macros below */
+ reginfo->strbeg = strbeg;
+ reginfo->strend = strend;
+ reginfo->is_utf8_target = cBOOL(utf8_target);
+
if (prog->intflags & PREGf_GPOS_SEEN) {
MAGIC *mg;
reginfo->ganch =
(flags & REXEC_IGNOREPOS)
? stringarg /* use start pos rather than pos() */
- : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
+ : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
/* Defined pos(): */
? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
: strbeg; /* pos() not defined; use start of string */
- DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_GPOS_r(Perl_re_printf( aTHX_
"GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
/* in the presence of \G, we may need to start looking earlier in
*/
if (prog->intflags & PREGf_ANCH_GPOS) {
- startpos = reginfo->ganch - prog->gofs;
- if (startpos <
- ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
- {
- DEBUG_r(PerlIO_printf(Perl_debug_log,
- "fail: ganch-gofs before earliest possible start\n"));
- return 0;
+ if (prog->gofs) {
+ startpos = HOPBACKc(reginfo->ganch, prog->gofs);
+ if (!startpos ||
+ ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
+ {
+ DEBUG_r(Perl_re_printf( aTHX_
+ "fail: ganch-gofs before earliest possible start\n"));
+ return 0;
+ }
}
+ else
+ startpos = reginfo->ganch;
}
else if (prog->gofs) {
- if (startpos - prog->gofs < strbeg)
+ startpos = HOPBACKc(startpos, prog->gofs);
+ if (!startpos)
startpos = strbeg;
- else
- startpos -= prog->gofs;
}
else if (prog->intflags & PREGf_GPOS_FLOAT)
startpos = strbeg;
minlen = prog->minlen;
if ((startpos + minlen) > strend || startpos < strbeg) {
- DEBUG_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_r(Perl_re_printf( aTHX_
"Regex match can't succeed, so not even tried\n"));
return 0;
}
{
/* this should only be possible under \G */
assert(prog->intflags & PREGf_GPOS_SEEN);
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
"matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
goto phooey;
}
multiline = prog->extflags & RXf_PMf_MULTILINE;
if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
"String too short [regexec_flags]...\n"));
goto phooey;
}
Perl_croak(aTHX_ "corrupted regexp program");
}
+ RX_MATCH_TAINTED_off(rx);
+ RX_MATCH_UTF8_set(rx, utf8_target);
+
reginfo->prog = rx; /* Yes, sorry that this is confusing. */
reginfo->intuit = 0;
- reginfo->is_utf8_target = cBOOL(utf8_target);
reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
reginfo->warned = FALSE;
- reginfo->strbeg = strbeg;
reginfo->sv = sv;
reginfo->poscache_maxiter = 0; /* not yet started a countdown */
- reginfo->strend = strend;
/* see how far we have to get to not match where we matched before */
reginfo->till = stringarg + minend;
magic belonging to this SV.
Not newSVsv, either, as it does not COW.
*/
- assert(!IS_PADGV(sv));
reginfo->sv = newSV(0);
SvSetSV_nosteal(reginfo->sv, sv);
SAVEFREESV(reginfo->sv);
swap = prog->offs;
/* do we need a save destructor here for eval dies? */
Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
- DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_BUFFERS_r(Perl_re_printf( aTHX_
"rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
PTR2UV(prog),
PTR2UV(swap),
));
}
- /* Simplest case: anchored match need be tried only once. */
- /* [unless only anchor is BOL and multiline is set] */
+ if (prog->recurse_locinput)
+ Zero(prog->recurse_locinput,prog->nparens + 1, char *);
+
+ /* Simplest case: anchored match need be tried only once, or with
+ * MBOL, only at the beginning of each line.
+ *
+ * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
+ * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
+ * match at the start of the string then it won't match anywhere else
+ * either; while with /.*.../, if it doesn't match at the beginning,
+ * the earliest it could match is at the start of the next line */
+
if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
- if (s == startpos && regtry(reginfo, &s))
+ char *end;
+
+ if (regtry(reginfo, &s))
goto got_it;
- else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
- {
- char *end;
-
- if (minlen)
- dontbother = minlen - 1;
- end = HOP3c(strend, -dontbother, strbeg) - 1;
- /* for multiline we only have to try after newlines */
- if (prog->check_substr || prog->check_utf8) {
- /* because of the goto we can not easily reuse the macros for bifurcating the
- unicode/non-unicode match modes here like we do elsewhere - demerphq */
- if (utf8_target) {
- if (s == startpos)
- goto after_try_utf8;
- while (1) {
- if (regtry(reginfo, &s)) {
- goto got_it;
- }
- after_try_utf8:
- if (s > end) {
- goto phooey;
- }
- if (prog->extflags & RXf_USE_INTUIT) {
- s = re_intuit_start(rx, sv, strbeg,
- s + UTF8SKIP(s), strend, flags, NULL);
- if (!s) {
- goto phooey;
- }
- }
- else {
- s += UTF8SKIP(s);
- }
- }
- } /* end search for check string in unicode */
- else {
- if (s == startpos) {
- goto after_try_latin;
- }
- while (1) {
- if (regtry(reginfo, &s)) {
- goto got_it;
- }
- after_try_latin:
- if (s > end) {
- goto phooey;
- }
- if (prog->extflags & RXf_USE_INTUIT) {
- s = re_intuit_start(rx, sv, strbeg,
- s + 1, strend, flags, NULL);
- if (!s) {
- goto phooey;
- }
- }
- else {
- s++;
- }
- }
- } /* end search for check string in latin*/
- } /* end search for check string */
- else { /* search for newline */
- if (s > startpos) {
- /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
- s--;
- }
- /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
- while (s <= end) { /* note it could be possible to match at the end of the string */
- if (*s++ == '\n') { /* don't need PL_utf8skip here */
- if (regtry(reginfo, &s))
- goto got_it;
- }
- }
- } /* end search for newline */
- } /* end anchored/multiline check string search */
- goto phooey;
- } else if (prog->intflags & PREGf_ANCH_GPOS)
+
+ if (!(prog->intflags & PREGf_ANCH_MBOL))
+ goto phooey;
+
+ /* didn't match at start, try at other newline positions */
+
+ if (minlen)
+ dontbother = minlen - 1;
+ end = HOP3c(strend, -dontbother, strbeg) - 1;
+
+ /* skip to next newline */
+
+ while (s <= end) { /* note it could be possible to match at the end of the string */
+ /* NB: newlines are the same in unicode as they are in latin */
+ if (*s++ != '\n')
+ continue;
+ if (prog->check_substr || prog->check_utf8) {
+ /* note that with PREGf_IMPLICIT, intuit can only fail
+ * or return the start position, so it's of limited utility.
+ * Nevertheless, I made the decision that the potential for
+ * quick fail was still worth it - DAPM */
+ s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
+ if (!s)
+ goto phooey;
+ }
+ if (regtry(reginfo, &s))
+ goto got_it;
+ }
+ goto phooey;
+ } /* end anchored search */
+
+ if (prog->intflags & PREGf_ANCH_GPOS)
{
/* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
assert(prog->intflags & PREGf_GPOS_SEEN);
/* For anchored \G, the only position it can match from is
* (ganch-gofs); we already set startpos to this above; if intuit
* moved us on from there, we can't possibly succeed */
- assert(startpos == reginfo->ganch - prog->gofs);
+ assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
if (s == startpos && regtry(reginfo, &s))
goto got_it;
goto phooey;
);
}
DEBUG_EXECUTE_r(if (!did_match)
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"Did not find anchored character...\n")
);
}
DEBUG_EXECUTE_r(if (!did_match) {
RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
- PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
+ Perl_re_printf( aTHX_ "Did not find %s substr %s%s...\n",
((must == prog->anchored_substr || must == prog->anchored_utf8)
? "anchored" : "floating"),
quoted, RE_SV_TAIL(must));
if (minlen) {
const OPCODE op = OP(progi->regstclass);
/* don't bother with what can't match */
- if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
+ if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
strend = HOPc(strend, -(minlen - 1));
}
DEBUG_EXECUTE_r({
SV * const prop = sv_newmortal();
- regprop(prog, prop, c, reginfo);
+ regprop(prog, prop, c, reginfo, NULL);
{
RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
s,strend-s,60);
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"Matching stclass %.*s against %s (%d bytes)\n",
(int)SvCUR(prop), SvPVX_const(prop),
quoted, (int)(strend - s));
});
if (find_byclass(prog, c, s, strend, reginfo))
goto got_it;
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Contradicts stclass... [regexec_flags]\n"));
}
else {
dontbother = 0;
* the \n. */
char *checkpos= strend - len;
DEBUG_OPTIMISE_r(
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"%sChecking for float_real.%s\n",
PL_colors[4], PL_colors[5]));
if (checkpos + 1 < strbeg) {
/* can't match, even if we remove the trailing \n
* string is too short to match */
DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"%sString shorter than required trailing substring, cannot match.%s\n",
PL_colors[4], PL_colors[5]));
goto phooey;
/* cant match, string is too short when the "\n" is
* included */
DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"%sString does not contain required trailing substring, cannot match.%s\n",
PL_colors[4], PL_colors[5]));
goto phooey;
last= checkpos;
} else {
DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"%sString does not contain required trailing substring, cannot match.%s\n",
PL_colors[4], PL_colors[5]));
goto phooey;
* pretty sure it is not anymore, so I have removed the comment
* and replaced it with this one. Yves */
DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
- "String does not contain required substring, cannot match.\n"
+ Perl_re_printf( aTHX_
+ "%sString does not contain required substring, cannot match.%s\n",
+ PL_colors[4], PL_colors[5]
));
goto phooey;
}
/* Failure. */
goto phooey;
-got_it:
+ got_it:
/* s/// doesn't like it if $& is earlier than where we asked it to
* start searching (which can happen on something like /.\G/) */
if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
{
/* this should only be possible under \G */
assert(prog->intflags & PREGf_GPOS_SEEN);
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
"matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
goto phooey;
}
DEBUG_BUFFERS_r(
if (swap)
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
PTR2UV(prog),
PTR2UV(swap)
if (RXp_PAREN_NAMES(prog))
(void)hv_iterinit(RXp_PAREN_NAMES(prog));
- RX_MATCH_UTF8_set(rx, utf8_target);
-
/* make sure $`, $&, $', and $digit will work later */
if ( !(flags & REXEC_NOT_FIRST) )
S_reg_set_capture_string(aTHX_ rx,
return 1;
-phooey:
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
+ phooey:
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n",
PL_colors[4], PL_colors[5]));
/* clean up; this will trigger destructors that will free all slabs
if (swap) {
/* we failed :-( roll it back */
- DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_BUFFERS_r(Perl_re_printf( aTHX_
"rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
PTR2UV(prog),
PTR2UV(prog->offs),
/*
- regtry - try match at specific point
*/
-STATIC I32 /* 0 failure, 1 success */
+STATIC bool /* 0 failure, 1 success */
S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
{
- dVAR;
CHECKPOINT lastcp;
REGEXP *const rx = reginfo->prog;
regexp *const prog = ReANY(rx);
SSize_t result;
+#ifdef DEBUGGING
+ U32 depth = 0; /* used by REGCP_SET */
+#endif
RXi_GET_DECL(prog,progi);
GET_RE_DEBUG_FLAGS_DECL;
sayNO
/* this is used to determine how far from the left messages like
- 'failed...' are printed. It should be set such that messages
- are inline with the regop output that created them.
+ 'failed...' are printed in regexec.c. It should be set such that
+ messages are inline with the regop output that created them.
*/
-#define REPORT_CODE_OFF 32
+#define REPORT_CODE_OFF 29
+#define INDENT_CHARS(depth) ((int)(depth) % 20)
+#ifdef DEBUGGING
+int
+Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
+{
+ va_list ap;
+ int result;
+ PerlIO *f= Perl_debug_log;
+ PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
+ va_start(ap, depth);
+ PerlIO_printf(f, "%*s|%4"UVuf"| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
+ result = PerlIO_vprintf(f, fmt, ap);
+ va_end(ap);
+ return result;
+}
+#endif /* DEBUGGING */
#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
*/
-#define DEBUG_STATE_pp(pp) \
- DEBUG_STATE_r({ \
- DUMP_EXEC_POS(locinput, scan, utf8_target); \
- PerlIO_printf(Perl_debug_log, \
- " %*s"pp" %s%s%s%s%s\n", \
- depth*2, "", \
- PL_reg_name[st->resume_state], \
- ((st==yes_state||st==mark_state) ? "[" : ""), \
- ((st==yes_state) ? "Y" : ""), \
- ((st==mark_state) ? "M" : ""), \
- ((st==yes_state||st==mark_state) ? "]" : "") \
- ); \
+#define DEBUG_STATE_pp(pp) \
+ DEBUG_STATE_r({ \
+ DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \
+ Perl_re_printf( aTHX_ \
+ "%*s" pp " %s%s%s%s%s\n", \
+ INDENT_CHARS(depth), "", \
+ PL_reg_name[st->resume_state], \
+ ((st==yes_state||st==mark_state) ? "[" : ""), \
+ ((st==yes_state) ? "Y" : ""), \
+ ((st==mark_state) ? "M" : ""), \
+ ((st==yes_state||st==mark_state) ? "]" : "") \
+ ); \
});
RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
start, end - start, 60);
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"%s%s REx%s %s against %s\n",
PL_colors[4], blurb, PL_colors[5], s0, s1);
if (utf8_target||utf8_pat)
- PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
+ Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n",
utf8_pat ? "pattern" : "",
utf8_pat && utf8_target ? " and " : "",
utf8_target ? "string" : ""
const char *loc_regeol,
const char *loc_bostr,
const char *loc_reg_starttry,
- const bool utf8_target)
+ const bool utf8_target,
+ const U32 depth
+ )
{
const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
if (pref0_len > pref_len)
pref0_len = pref_len;
{
- const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
+ const int is_uni = utf8_target ? 1 : 0;
RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
(locinput - pref_len),pref0_len, 60, 4, 5);
locinput, loc_regeol - locinput, 10, 0, 1);
const STRLEN tlen=len0+len1+len2;
- PerlIO_printf(Perl_debug_log,
- "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
+ Perl_re_printf( aTHX_
+ "%4"IVdf" <%.*s%.*s%s%.*s>%*s|%4u| ",
(IV)(locinput - loc_bostr),
len0, s0,
len1, s1,
(docolor ? "" : "> <"),
len2, s2,
(int)(tlen > 19 ? 0 : 19 - tlen),
- "");
+ "",
+ depth);
}
}
* or 0 if non of the buffers matched.
*/
STATIC I32
-S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
+S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
{
I32 n;
RXi_GET_DECL(rex,rexi);
const bool utf8_target = reginfo->is_utf8_target;
- UV c1 = CHRTEST_NOT_A_CP_1;
- UV c2 = CHRTEST_NOT_A_CP_2;
+ UV c1 = (UV)CHRTEST_NOT_A_CP_1;
+ UV c2 = (UV)CHRTEST_NOT_A_CP_2;
bool use_chrtest_void = FALSE;
const bool is_utf8_pat = reginfo->is_utf8_pat;
U8 *pat = (U8*)STRING(text_node);
U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
- if (OP(text_node) == EXACT) {
+ if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
/* In an exact node, only one thing can be matched, that first
* character. If both the pat and the target are UTF-8, we can just
}
else { /* an EXACTFish node which doesn't begin with a multi-char fold */
c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
- if (c1 > 256) {
+ if (c1 > 255) {
/* Load the folds hash, if not already done */
SV** listp;
if (! PL_utf8_foldclosures) {
- if (! PL_utf8_tofold) {
- U8 dummy[UTF8_MAXBYTES_CASE+1];
-
- /* Force loading this by folding an above-Latin1 char */
- to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
- assert(PL_utf8_tofold); /* Verify that worked */
- }
- PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
+ _load_PL_utf8_foldclosures();
}
/* The fold closures data structure is a hash with the keys
}
else { /* Does participate in folds */
AV* list = (AV*) *listp;
- if (av_tindex(list) != 1) {
+ if (av_tindex_nomg(list) != 1) {
/* If there aren't exactly two folds to this, it is
* outside the scope of this function */
/* Folds that cross the 255/256 boundary are forbidden
* if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
* one is ASCIII. Since the pattern character is above
- * 256, and its only other match is below 256, the only
+ * 255, and its only other match is below 256, the only
* legal match will be to itself. We have thrown away
* the original, so have to compute which is the one
- * above 255 */
+ * above 255. */
if ((c1 < 256) != (c2 < 256)) {
if ((OP(text_node) == EXACTFL
&& ! IN_UTF8_CTYPE_LOCALE)
}
}
}
- else /* Here, c1 is < 255 */
+ else /* Here, c1 is <= 255 */
if (utf8_target
&& HAS_NONLATIN1_FOLD_CLOSURE(c1)
&& ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
case EXACTFA_NO_TRIE: /* This node only generated for
non-utf8 patterns */
assert(! is_utf8_pat);
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case EXACTFA:
case EXACTFU_SS:
case EXACTFU:
default:
Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
}
}
}
return TRUE;
}
+STATIC bool
+S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
+{
+ /* returns a boolean indicating if there is a Grapheme Cluster Boundary
+ * between the inputs. See http://www.unicode.org/reports/tr29/. */
+
+ PERL_ARGS_ASSERT_ISGCB;
+
+ switch (GCB_table[before][after]) {
+ case GCB_BREAKABLE:
+ return TRUE;
+
+ case GCB_NOBREAK:
+ return FALSE;
+
+ case GCB_RI_then_RI:
+ {
+ int RI_count = 1;
+ U8 * temp_pos = (U8 *) curpos;
+
+ /* Do not break within emoji flag sequences. That is, do not
+ * break between regional indicator (RI) symbols if there is an
+ * odd number of RI characters before the break point.
+ * GB12 ^ (RI RI)* RI × RI
+ * GB13 [^RI] (RI RI)* RI × RI */
+
+ while (backup_one_GCB(strbeg,
+ &temp_pos,
+ utf8_target) == GCB_Regional_Indicator)
+ {
+ RI_count++;
+ }
+
+ return RI_count % 2 != 1;
+ }
+
+ case GCB_EX_then_EM:
+
+ /* GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier */
+ {
+ U8 * temp_pos = (U8 *) curpos;
+ GCB_enum prev;
+
+ do {
+ prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
+ }
+ while (prev == GCB_Extend);
+
+ return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
+ }
+
+ default:
+ break;
+ }
+
+#ifdef DEBUGGING
+ Perl_re_printf( aTHX_ "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
+ before, after, GCB_table[before][after]);
+ assert(0);
+#endif
+ return TRUE;
+}
+
+STATIC GCB_enum
+S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
+{
+ GCB_enum gcb;
+
+ PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
+
+ if (*curpos < strbeg) {
+ return GCB_EDGE;
+ }
+
+ if (utf8_target) {
+ U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
+ U8 * prev_prev_char_pos;
+
+ if (! prev_char_pos) {
+ return GCB_EDGE;
+ }
+
+ if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
+ gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
+ *curpos = prev_char_pos;
+ prev_char_pos = prev_prev_char_pos;
+ }
+ else {
+ *curpos = (U8 *) strbeg;
+ return GCB_EDGE;
+ }
+ }
+ else {
+ if (*curpos - 2 < strbeg) {
+ *curpos = (U8 *) strbeg;
+ return GCB_EDGE;
+ }
+ (*curpos)--;
+ gcb = getGCB_VAL_CP(*(*curpos - 1));
+ }
+
+ return gcb;
+}
+
+/* Combining marks attach to most classes that precede them, but this defines
+ * the exceptions (from TR14) */
+#define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \
+ || prev == LB_Mandatory_Break \
+ || prev == LB_Carriage_Return \
+ || prev == LB_Line_Feed \
+ || prev == LB_Next_Line \
+ || prev == LB_Space \
+ || prev == LB_ZWSpace))
+
+STATIC bool
+S_isLB(pTHX_ LB_enum before,
+ LB_enum after,
+ const U8 * const strbeg,
+ const U8 * const curpos,
+ const U8 * const strend,
+ const bool utf8_target)
+{
+ U8 * temp_pos = (U8 *) curpos;
+ LB_enum prev = before;
+
+ /* Is the boundary between 'before' and 'after' line-breakable?
+ * Most of this is just a table lookup of a generated table from Unicode
+ * rules. But some rules require context to decide, and so have to be
+ * implemented in code */
+
+ PERL_ARGS_ASSERT_ISLB;
+
+ /* Rule numbers in the comments below are as of Unicode 9.0 */
+
+ redo:
+ before = prev;
+ switch (LB_table[before][after]) {
+ case LB_BREAKABLE:
+ return TRUE;
+
+ case LB_NOBREAK:
+ case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
+ return FALSE;
+
+ case LB_SP_foo + LB_BREAKABLE:
+ case LB_SP_foo + LB_NOBREAK:
+ case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
+
+ /* When we have something following a SP, we have to look at the
+ * context in order to know what to do.
+ *
+ * SP SP should not reach here because LB7: Do not break before
+ * spaces. (For two spaces in a row there is nothing that
+ * overrides that) */
+ assert(after != LB_Space);
+
+ /* Here we have a space followed by a non-space. Mostly this is a
+ * case of LB18: "Break after spaces". But there are complications
+ * as the handling of spaces is somewhat tricky. They are in a
+ * number of rules, which have to be applied in priority order, but
+ * something earlier in the string can cause a rule to be skipped
+ * and a lower priority rule invoked. A prime example is LB7 which
+ * says don't break before a space. But rule LB8 (lower priority)
+ * says that the first break opportunity after a ZW is after any
+ * span of spaces immediately after it. If a ZW comes before a SP
+ * in the input, rule LB8 applies, and not LB7. Other such rules
+ * involve combining marks which are rules 9 and 10, but they may
+ * override higher priority rules if they come earlier in the
+ * string. Since we're doing random access into the middle of the
+ * string, we have to look for rules that should get applied based
+ * on both string position and priority. Combining marks do not
+ * attach to either ZW nor SP, so we don't have to consider them
+ * until later.
+ *
+ * To check for LB8, we have to find the first non-space character
+ * before this span of spaces */
+ do {
+ prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ while (prev == LB_Space);
+
+ /* LB8 Break before any character following a zero-width space,
+ * even if one or more spaces intervene.
+ * ZW SP* ÷
+ * So if we have a ZW just before this span, and to get here this
+ * is the final space in the span. */
+ if (prev == LB_ZWSpace) {
+ return TRUE;
+ }
+
+ /* Here, not ZW SP+. There are several rules that have higher
+ * priority than LB18 and can be resolved now, as they don't depend
+ * on anything earlier in the string (except ZW, which we have
+ * already handled). One of these rules is LB11 Do not break
+ * before Word joiner, but we have specially encoded that in the
+ * lookup table so it is caught by the single test below which
+ * catches the other ones. */
+ if (LB_table[LB_Space][after] - LB_SP_foo
+ == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
+ {
+ return FALSE;
+ }
+
+ /* If we get here, we have to XXX consider combining marks. */
+ if (prev == LB_Combining_Mark) {
+
+ /* What happens with these depends on the character they
+ * follow. */
+ do {
+ prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ while (prev == LB_Combining_Mark);
+
+ /* Most times these attach to and inherit the characteristics
+ * of that character, but not always, and when not, they are to
+ * be treated as AL by rule LB10. */
+ if (! LB_CM_ATTACHES_TO(prev)) {
+ prev = LB_Alphabetic;
+ }
+ }
+
+ /* Here, we have the character preceding the span of spaces all set
+ * up. We follow LB18: "Break after spaces" unless the table shows
+ * that is overriden */
+ return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
+
+ case LB_CM_ZWJ_foo:
+
+ /* We don't know how to treat the CM except by looking at the first
+ * non-CM character preceding it. ZWJ is treated as CM */
+ do {
+ prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ while (prev == LB_Combining_Mark || prev == LB_ZWJ);
+
+ /* Here, 'prev' is that first earlier non-CM character. If the CM
+ * attatches to it, then it inherits the behavior of 'prev'. If it
+ * doesn't attach, it is to be treated as an AL */
+ if (! LB_CM_ATTACHES_TO(prev)) {
+ prev = LB_Alphabetic;
+ }
+
+ goto redo;
+
+ case LB_HY_or_BA_then_foo + LB_BREAKABLE:
+ case LB_HY_or_BA_then_foo + LB_NOBREAK:
+
+ /* LB21a Don't break after Hebrew + Hyphen.
+ * HL (HY | BA) × */
+
+ if (backup_one_LB(strbeg, &temp_pos, utf8_target)
+ == LB_Hebrew_Letter)
+ {
+ return FALSE;
+ }
+
+ return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
+
+ case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
+ case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
+
+ /* LB25a (PR | PO) × ( OP | HY )? NU */
+ if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
+ return FALSE;
+ }
+
+ return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
+ == LB_BREAKABLE;
+
+ case LB_SY_or_IS_then_various + LB_BREAKABLE:
+ case LB_SY_or_IS_then_various + LB_NOBREAK:
+ {
+ /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
+
+ LB_enum temp = prev;
+ do {
+ temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
+ if (temp == LB_Numeric) {
+ return FALSE;
+ }
+
+ return LB_table[prev][after] - LB_SY_or_IS_then_various
+ == LB_BREAKABLE;
+ }
+
+ case LB_various_then_PO_or_PR + LB_BREAKABLE:
+ case LB_various_then_PO_or_PR + LB_NOBREAK:
+ {
+ /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
+
+ LB_enum temp = prev;
+ if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
+ {
+ temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
+ temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ if (temp == LB_Numeric) {
+ return FALSE;
+ }
+ return LB_various_then_PO_or_PR;
+ }
+
+ case LB_RI_then_RI + LB_NOBREAK:
+ case LB_RI_then_RI + LB_BREAKABLE:
+ {
+ int RI_count = 1;
+
+ /* LB30a Break between two regional indicator symbols if and
+ * only if there are an even number of regional indicators
+ * preceding the position of the break.
+ *
+ * sot (RI RI)* RI × RI
+ * [^RI] (RI RI)* RI × RI */
+
+ while (backup_one_LB(strbeg,
+ &temp_pos,
+ utf8_target) == LB_Regional_Indicator)
+ {
+ RI_count++;
+ }
+
+ return RI_count % 2 == 0;
+ }
+
+ default:
+ break;
+ }
+
+#ifdef DEBUGGING
+ Perl_re_printf( aTHX_ "Unhandled LB pair: LB_table[%d, %d] = %d\n",
+ before, after, LB_table[before][after]);
+ assert(0);
+#endif
+ return TRUE;
+}
+
+STATIC LB_enum
+S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
+{
+ LB_enum lb;
+
+ PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
+
+ if (*curpos >= strend) {
+ return LB_EDGE;
+ }
+
+ if (utf8_target) {
+ *curpos += UTF8SKIP(*curpos);
+ if (*curpos >= strend) {
+ return LB_EDGE;
+ }
+ lb = getLB_VAL_UTF8(*curpos, strend);
+ }
+ else {
+ (*curpos)++;
+ if (*curpos >= strend) {
+ return LB_EDGE;
+ }
+ lb = getLB_VAL_CP(**curpos);
+ }
+
+ return lb;
+}
+
+STATIC LB_enum
+S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
+{
+ LB_enum lb;
+
+ PERL_ARGS_ASSERT_BACKUP_ONE_LB;
+
+ if (*curpos < strbeg) {
+ return LB_EDGE;
+ }
+
+ if (utf8_target) {
+ U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
+ U8 * prev_prev_char_pos;
+
+ if (! prev_char_pos) {
+ return LB_EDGE;
+ }
+
+ if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
+ lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
+ *curpos = prev_char_pos;
+ prev_char_pos = prev_prev_char_pos;
+ }
+ else {
+ *curpos = (U8 *) strbeg;
+ return LB_EDGE;
+ }
+ }
+ else {
+ if (*curpos - 2 < strbeg) {
+ *curpos = (U8 *) strbeg;
+ return LB_EDGE;
+ }
+ (*curpos)--;
+ lb = getLB_VAL_CP(*(*curpos - 1));
+ }
+
+ return lb;
+}
+
+STATIC bool
+S_isSB(pTHX_ SB_enum before,
+ SB_enum after,
+ const U8 * const strbeg,
+ const U8 * const curpos,
+ const U8 * const strend,
+ const bool utf8_target)
+{
+ /* returns a boolean indicating if there is a Sentence Boundary Break
+ * between the inputs. See http://www.unicode.org/reports/tr29/ */
+
+ U8 * lpos = (U8 *) curpos;
+ bool has_para_sep = FALSE;
+ bool has_sp = FALSE;
+
+ PERL_ARGS_ASSERT_ISSB;
+
+ /* Break at the start and end of text.
+ SB1. sot ÷
+ SB2. ÷ eot
+ But unstated in Unicode is don't break if the text is empty */
+ if (before == SB_EDGE || after == SB_EDGE) {
+ return before != after;
+ }
+
+ /* SB 3: Do not break within CRLF. */
+ if (before == SB_CR && after == SB_LF) {
+ return FALSE;
+ }
+
+ /* Break after paragraph separators. CR and LF are considered
+ * so because Unicode views text as like word processing text where there
+ * are no newlines except between paragraphs, and the word processor takes
+ * care of wrapping without there being hard line-breaks in the text *./
+ SB4. Sep | CR | LF ÷ */
+ if (before == SB_Sep || before == SB_CR || before == SB_LF) {
+ return TRUE;
+ }
+
+ /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
+ * (See Section 6.2, Replacing Ignore Rules.)
+ SB5. X (Extend | Format)* → X */
+ if (after == SB_Extend || after == SB_Format) {
+
+ /* Implied is that the these characters attach to everything
+ * immediately prior to them except for those separator-type
+ * characters. And the rules earlier have already handled the case
+ * when one of those immediately precedes the extend char */
+ return FALSE;
+ }
+
+ if (before == SB_Extend || before == SB_Format) {
+ U8 * temp_pos = lpos;
+ const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
+ if ( backup != SB_EDGE
+ && backup != SB_Sep
+ && backup != SB_CR
+ && backup != SB_LF)
+ {
+ before = backup;
+ lpos = temp_pos;
+ }
+
+ /* Here, both 'before' and 'backup' are these types; implied is that we
+ * don't break between them */
+ if (backup == SB_Extend || backup == SB_Format) {
+ return FALSE;
+ }
+ }
+
+ /* Do not break after ambiguous terminators like period, if they are
+ * immediately followed by a number or lowercase letter, if they are
+ * between uppercase letters, if the first following letter (optionally
+ * after certain punctuation) is lowercase, or if they are followed by
+ * "continuation" punctuation such as comma, colon, or semicolon. For
+ * example, a period may be an abbreviation or numeric period, and thus may
+ * not mark the end of a sentence.
+
+ * SB6. ATerm × Numeric */
+ if (before == SB_ATerm && after == SB_Numeric) {
+ return FALSE;
+ }
+
+ /* SB7. (Upper | Lower) ATerm × Upper */
+ if (before == SB_ATerm && after == SB_Upper) {
+ U8 * temp_pos = lpos;
+ SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
+ if (backup == SB_Upper || backup == SB_Lower) {
+ return FALSE;
+ }
+ }
+
+ /* The remaining rules that aren't the final one, all require an STerm or
+ * an ATerm after having backed up over some Close* Sp*, and in one case an
+ * optional Paragraph separator, although one rule doesn't have any Sp's in it.
+ * So do that backup now, setting flags if either Sp or a paragraph
+ * separator are found */
+
+ if (before == SB_Sep || before == SB_CR || before == SB_LF) {
+ has_para_sep = TRUE;
+ before = backup_one_SB(strbeg, &lpos, utf8_target);
+ }
+
+ if (before == SB_Sp) {
+ has_sp = TRUE;
+ do {
+ before = backup_one_SB(strbeg, &lpos, utf8_target);
+ }
+ while (before == SB_Sp);
+ }
+
+ while (before == SB_Close) {
+ before = backup_one_SB(strbeg, &lpos, utf8_target);
+ }
+
+ /* The next few rules apply only when the backed-up-to is an ATerm, and in
+ * most cases an STerm */
+ if (before == SB_STerm || before == SB_ATerm) {
+
+ /* So, here the lhs matches
+ * (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
+ * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
+ * The rules that apply here are:
+ *
+ * SB8 ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR
+ | LF | STerm | ATerm) )* Lower
+ SB8a (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)
+ SB9 (STerm | ATerm) Close* × (Close | Sp | Sep | CR | LF)
+ SB10 (STerm | ATerm) Close* Sp* × (Sp | Sep | CR | LF)
+ SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? ÷
+ */
+
+ /* And all but SB11 forbid having seen a paragraph separator */
+ if (! has_para_sep) {
+ if (before == SB_ATerm) { /* SB8 */
+ U8 * rpos = (U8 *) curpos;
+ SB_enum later = after;
+
+ while ( later != SB_OLetter
+ && later != SB_Upper
+ && later != SB_Lower
+ && later != SB_Sep
+ && later != SB_CR
+ && later != SB_LF
+ && later != SB_STerm
+ && later != SB_ATerm
+ && later != SB_EDGE)
+ {
+ later = advance_one_SB(&rpos, strend, utf8_target);
+ }
+ if (later == SB_Lower) {
+ return FALSE;
+ }
+ }
+
+ if ( after == SB_SContinue /* SB8a */
+ || after == SB_STerm
+ || after == SB_ATerm)
+ {
+ return FALSE;
+ }
+
+ if (! has_sp) { /* SB9 applies only if there was no Sp* */
+ if ( after == SB_Close
+ || after == SB_Sp
+ || after == SB_Sep
+ || after == SB_CR
+ || after == SB_LF)
+ {
+ return FALSE;
+ }
+ }
+
+ /* SB10. This and SB9 could probably be combined some way, but khw
+ * has decided to follow the Unicode rule book precisely for
+ * simplified maintenance */
+ if ( after == SB_Sp
+ || after == SB_Sep
+ || after == SB_CR
+ || after == SB_LF)
+ {
+ return FALSE;
+ }
+ }
+
+ /* SB11. */
+ return TRUE;
+ }
+
+ /* Otherwise, do not break.
+ SB12. Any × Any */
+
+ return FALSE;
+}
+
+STATIC SB_enum
+S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
+{
+ SB_enum sb;
+
+ PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
+
+ if (*curpos >= strend) {
+ return SB_EDGE;
+ }
+
+ if (utf8_target) {
+ do {
+ *curpos += UTF8SKIP(*curpos);
+ if (*curpos >= strend) {
+ return SB_EDGE;
+ }
+ sb = getSB_VAL_UTF8(*curpos, strend);
+ } while (sb == SB_Extend || sb == SB_Format);
+ }
+ else {
+ do {
+ (*curpos)++;
+ if (*curpos >= strend) {
+ return SB_EDGE;
+ }
+ sb = getSB_VAL_CP(**curpos);
+ } while (sb == SB_Extend || sb == SB_Format);
+ }
+
+ return sb;
+}
+
+STATIC SB_enum
+S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
+{
+ SB_enum sb;
+
+ PERL_ARGS_ASSERT_BACKUP_ONE_SB;
+
+ if (*curpos < strbeg) {
+ return SB_EDGE;
+ }
+
+ if (utf8_target) {
+ U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
+ if (! prev_char_pos) {
+ return SB_EDGE;
+ }
+
+ /* Back up over Extend and Format. curpos is always just to the right
+ * of the characater whose value we are getting */
+ do {
+ U8 * prev_prev_char_pos;
+ if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
+ strbeg)))
+ {
+ sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
+ *curpos = prev_char_pos;
+ prev_char_pos = prev_prev_char_pos;
+ }
+ else {
+ *curpos = (U8 *) strbeg;
+ return SB_EDGE;
+ }
+ } while (sb == SB_Extend || sb == SB_Format);
+ }
+ else {
+ do {
+ if (*curpos - 2 < strbeg) {
+ *curpos = (U8 *) strbeg;
+ return SB_EDGE;
+ }
+ (*curpos)--;
+ sb = getSB_VAL_CP(*(*curpos - 1));
+ } while (sb == SB_Extend || sb == SB_Format);
+ }
+
+ return sb;
+}
+
+STATIC bool
+S_isWB(pTHX_ WB_enum previous,
+ WB_enum before,
+ WB_enum after,
+ const U8 * const strbeg,
+ const U8 * const curpos,
+ const U8 * const strend,
+ const bool utf8_target)
+{
+ /* Return a boolean as to if the boundary between 'before' and 'after' is
+ * a Unicode word break, using their published algorithm, but tailored for
+ * Perl by treating spans of white space as one unit. Context may be
+ * needed to make this determination. If the value for the character
+ * before 'before' is known, it is passed as 'previous'; otherwise that
+ * should be set to WB_UNKNOWN. The other input parameters give the
+ * boundaries and current position in the matching of the string. That
+ * is, 'curpos' marks the position where the character whose wb value is
+ * 'after' begins. See http://www.unicode.org/reports/tr29/ */
+
+ U8 * before_pos = (U8 *) curpos;
+ U8 * after_pos = (U8 *) curpos;
+ WB_enum prev = before;
+ WB_enum next;
+
+ PERL_ARGS_ASSERT_ISWB;
+
+ /* Rule numbers in the comments below are as of Unicode 9.0 */
+
+ redo:
+ before = prev;
+ switch (WB_table[before][after]) {
+ case WB_BREAKABLE:
+ return TRUE;
+
+ case WB_NOBREAK:
+ return FALSE;
+
+ case WB_hs_then_hs: /* 2 horizontal spaces in a row */
+ next = advance_one_WB(&after_pos, strend, utf8_target,
+ FALSE /* Don't skip Extend nor Format */ );
+ /* A space immediately preceeding an Extend or Format is attached
+ * to by them, and hence gets separated from previous spaces.
+ * Otherwise don't break between horizontal white space */
+ return next == WB_Extend || next == WB_Format;
+
+ /* WB4 Ignore Format and Extend characters, except when they appear at
+ * the beginning of a region of text. This code currently isn't
+ * general purpose, but it works as the rules are currently and likely
+ * to be laid out. The reason it works is that when 'they appear at
+ * the beginning of a region of text', the rule is to break before
+ * them, just like any other character. Therefore, the default rule
+ * applies and we don't have to look in more depth. Should this ever
+ * change, we would have to have 2 'case' statements, like in the rules
+ * below, and backup a single character (not spacing over the extend
+ * ones) and then see if that is one of the region-end characters and
+ * go from there */
+ case WB_Ex_or_FO_or_ZWJ_then_foo:
+ prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
+ goto redo;
+
+ case WB_DQ_then_HL + WB_BREAKABLE:
+ case WB_DQ_then_HL + WB_NOBREAK:
+
+ /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */
+
+ if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
+ == WB_Hebrew_Letter)
+ {
+ return FALSE;
+ }
+
+ return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
+
+ case WB_HL_then_DQ + WB_BREAKABLE:
+ case WB_HL_then_DQ + WB_NOBREAK:
+
+ /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */
+
+ if (advance_one_WB(&after_pos, strend, utf8_target,
+ TRUE /* Do skip Extend and Format */ )
+ == WB_Hebrew_Letter)
+ {
+ return FALSE;
+ }
+
+ return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
+
+ case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
+ case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
+
+ /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
+ * | Single_Quote) (ALetter | Hebrew_Letter) */
+
+ next = advance_one_WB(&after_pos, strend, utf8_target,
+ TRUE /* Do skip Extend and Format */ );
+
+ if (next == WB_ALetter || next == WB_Hebrew_Letter)
+ {
+ return FALSE;
+ }
+
+ return WB_table[before][after]
+ - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
+
+ case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
+ case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
+
+ /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
+ * | Single_Quote) × (ALetter | Hebrew_Letter) */
+
+ prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
+ if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
+ {
+ return FALSE;
+ }
+
+ return WB_table[before][after]
+ - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
+
+ case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
+ case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
+
+ /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
+ * */
+
+ if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
+ == WB_Numeric)
+ {
+ return FALSE;
+ }
+
+ return WB_table[before][after]
+ - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
+
+ case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
+ case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
+
+ /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */
+
+ if (advance_one_WB(&after_pos, strend, utf8_target,
+ TRUE /* Do skip Extend and Format */ )
+ == WB_Numeric)
+ {
+ return FALSE;
+ }
+
+ return WB_table[before][after]
+ - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
+
+ case WB_RI_then_RI + WB_NOBREAK:
+ case WB_RI_then_RI + WB_BREAKABLE:
+ {
+ int RI_count = 1;
+
+ /* Do not break within emoji flag sequences. That is, do not
+ * break between regional indicator (RI) symbols if there is an
+ * odd number of RI characters before the potential break
+ * point.
+ *
+ * WB15 ^ (RI RI)* RI × RI
+ * WB16 [^RI] (RI RI)* RI × RI */
+
+ while (backup_one_WB(&previous,
+ strbeg,
+ &before_pos,
+ utf8_target) == WB_Regional_Indicator)
+ {
+ RI_count++;
+ }
+
+ return RI_count % 2 != 1;
+ }
+
+ default:
+ break;
+ }
+
+#ifdef DEBUGGING
+ Perl_re_printf( aTHX_ "Unhandled WB pair: WB_table[%d, %d] = %d\n",
+ before, after, WB_table[before][after]);
+ assert(0);
+#endif
+ return TRUE;
+}
+
+STATIC WB_enum
+S_advance_one_WB(pTHX_ U8 ** curpos,
+ const U8 * const strend,
+ const bool utf8_target,
+ const bool skip_Extend_Format)
+{
+ WB_enum wb;
+
+ PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
+
+ if (*curpos >= strend) {
+ return WB_EDGE;
+ }
+
+ if (utf8_target) {
+
+ /* Advance over Extend and Format */
+ do {
+ *curpos += UTF8SKIP(*curpos);
+ if (*curpos >= strend) {
+ return WB_EDGE;
+ }
+ wb = getWB_VAL_UTF8(*curpos, strend);
+ } while ( skip_Extend_Format
+ && (wb == WB_Extend || wb == WB_Format));
+ }
+ else {
+ do {
+ (*curpos)++;
+ if (*curpos >= strend) {
+ return WB_EDGE;
+ }
+ wb = getWB_VAL_CP(**curpos);
+ } while ( skip_Extend_Format
+ && (wb == WB_Extend || wb == WB_Format));
+ }
+
+ return wb;
+}
+
+STATIC WB_enum
+S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
+{
+ WB_enum wb;
+
+ PERL_ARGS_ASSERT_BACKUP_ONE_WB;
+
+ /* If we know what the previous character's break value is, don't have
+ * to look it up */
+ if (*previous != WB_UNKNOWN) {
+ wb = *previous;
+
+ /* But we need to move backwards by one */
+ if (utf8_target) {
+ *curpos = reghopmaybe3(*curpos, -1, strbeg);
+ if (! *curpos) {
+ *previous = WB_EDGE;
+ *curpos = (U8 *) strbeg;
+ }
+ else {
+ *previous = WB_UNKNOWN;
+ }
+ }
+ else {
+ (*curpos)--;
+ *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
+ }
+
+ /* And we always back up over these three types */
+ if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
+ return wb;
+ }
+ }
+
+ if (*curpos < strbeg) {
+ return WB_EDGE;
+ }
+
+ if (utf8_target) {
+ U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
+ if (! prev_char_pos) {
+ return WB_EDGE;
+ }
+
+ /* Back up over Extend and Format. curpos is always just to the right
+ * of the characater whose value we are getting */
+ do {
+ U8 * prev_prev_char_pos;
+ if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
+ -1,
+ strbeg)))
+ {
+ wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
+ *curpos = prev_char_pos;
+ prev_char_pos = prev_prev_char_pos;
+ }
+ else {
+ *curpos = (U8 *) strbeg;
+ return WB_EDGE;
+ }
+ } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
+ }
+ else {
+ do {
+ if (*curpos - 2 < strbeg) {
+ *curpos = (U8 *) strbeg;
+ return WB_EDGE;
+ }
+ (*curpos)--;
+ wb = getWB_VAL_CP(*(*curpos - 1));
+ } while (wb == WB_Extend || wb == WB_Format);
+ }
+
+ return wb;
+}
+
+#define EVAL_CLOSE_PAREN_IS(st,expr) \
+( \
+ ( ( st ) ) && \
+ ( ( st )->u.eval.close_paren ) && \
+ ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
+)
+
+#define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \
+( \
+ ( ( st ) ) && \
+ ( ( st )->u.eval.close_paren ) && \
+ ( ( expr ) ) && \
+ ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
+)
+
+
+#define EVAL_CLOSE_PAREN_SET(st,expr) \
+ (st)->u.eval.close_paren = ( (expr) + 1 )
+
+#define EVAL_CLOSE_PAREN_CLEAR(st) \
+ (st)->u.eval.close_paren = 0
+
/* returns -1 on failure, $+[0] on success */
STATIC SSize_t
S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
{
+
#if PERL_VERSION < 9 && !defined(PERL_CORE)
dMY_CXT;
#endif
SSize_t ln = 0; /* len or last; init to avoid compiler warning */
char *locinput = startpos;
char *pushinput; /* where to continue after a PUSH */
- I32 nextchr; /* is always set to UCHARAT(locinput) */
+ I32 nextchr; /* is always set to UCHARAT(locinput), or -1 at EOS */
bool result = 0; /* return value of S_regmatch */
int depth = 0; /* depth of backtrack stack */
*/
PAD* last_pad = NULL;
dMULTICALL;
- I32 gimme = G_SCALAR;
+ U8 gimme = G_SCALAR;
CV *caller_cv = NULL; /* who called us */
CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
CHECKPOINT runops_cp; /* savestack position before executing EVAL */
int to_complement; /* Invert the result? */
_char_class_number classnum;
bool is_utf8_pat = reginfo->is_utf8_pat;
+ bool match = FALSE;
+
+/* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
+#if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
+# define SOLARIS_BAD_OPTIMIZER
+ const U32 *pl_charclass_dup = PL_charclass;
+# define PL_charclass pl_charclass_dup
+#endif
#ifdef DEBUGGING
GET_RE_DEBUG_FLAGS_DECL;
/* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
multicall_oldcatch = 0;
- multicall_cv = NULL;
- cx = NULL;
PERL_UNUSED_VAR(multicall_cop);
- PERL_UNUSED_VAR(newsp);
-
PERL_ARGS_ASSERT_REGMATCH;
DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
- PerlIO_printf(Perl_debug_log,"regmatch start\n");
+ Perl_re_printf( aTHX_ "regmatch start\n");
}));
st = PL_regmatch_state;
scan = prog;
while (scan != NULL) {
- DEBUG_EXECUTE_r( {
- SV * const prop = sv_newmortal();
- regnode *rnext=regnext(scan);
- DUMP_EXEC_POS( locinput, scan, utf8_target );
- regprop(rex, prop, scan, reginfo);
-
- PerlIO_printf(Perl_debug_log,
- "%3"IVdf":%*s%s(%"IVdf")\n",
- (IV)(scan - rexi->program), depth*2, "",
- SvPVX_const(prop),
- (PL_regkind[OP(scan)] == END || !rnext) ?
- 0 : (IV)(rnext - rexi->program));
- });
next = scan + NEXT_OFF(scan);
if (next == scan)
state_num = OP(scan);
reenter_switch:
+ DEBUG_EXECUTE_r(
+ if (state_num <= REGNODE_MAX) {
+ SV * const prop = sv_newmortal();
+ regnode *rnext = regnext(scan);
+
+ DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
+ regprop(rex, prop, scan, reginfo, NULL);
+ Perl_re_printf( aTHX_
+ "%*s%"IVdf":%s(%"IVdf")\n",
+ INDENT_CHARS(depth), "",
+ (IV)(scan - rexi->program),
+ SvPVX_const(prop),
+ (PL_regkind[OP(scan)] == END || !rnext) ?
+ 0 : (IV)(rnext - rexi->program));
+ }
+ );
+
to_complement = 0;
SET_nextchr;
assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
switch (state_num) {
- case BOL: /* /^../ */
- case SBOL: /* /^../s */
+ case SBOL: /* /^../ and /\A../ */
if (locinput == reginfo->strbeg)
break;
sayNO;
st->u.keeper.val = rex->offs[0].start;
rex->offs[0].start = locinput - reginfo->strbeg;
PUSH_STATE_GOTO(KEEPS_next, next, locinput);
- assert(0); /*NOTREACHED*/
+ NOT_REACHED; /* NOTREACHED */
+
case KEEPS_next_fail:
/* rollback the start point change */
rex->offs[0].start = st->u.keeper.val;
sayNO_SILENT;
- assert(0); /*NOTREACHED*/
+ NOT_REACHED; /* NOTREACHED */
case MEOL: /* /..$/m */
if (!NEXTCHR_IS_EOS && nextchr != '\n')
sayNO;
break;
- case EOL: /* /..$/ */
- /* FALL THROUGH */
- case SEOL: /* /..$/s */
+ case SEOL: /* /..$/ */
if (!NEXTCHR_IS_EOS && nextchr != '\n')
sayNO;
if (reginfo->strend - locinput > 1)
sayNO;
goto increment_locinput;
- case CANY: /* \C */
- if (NEXTCHR_IS_EOS)
- sayNO;
- locinput++;
- break;
-
case REG_ANY: /* /./ */
if ((NEXTCHR_IS_EOS) || nextchr == '\n')
sayNO;
*/
if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
- "%*s %sfailed to match trie start class...%s\n",
- REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
+ Perl_re_exec_indentf( aTHX_ "%sfailed to match trie start class...%s\n",
+ depth, PL_colors[4], PL_colors[5])
);
sayNO_SILENT;
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
}
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case TRIE: /* (ab|cd) */
/* the basic plan of execution of the trie is:
* At the beginning, run though all the states, and
HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
U32 state = trie->startstate;
+ if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (utf8_target
+ && UTF8_IS_ABOVE_LATIN1(nextchr)
+ && scan->flags == EXACTL)
+ {
+ /* We only output for EXACTL, as we let the folder
+ * output this message for EXACTFLU8 to avoid
+ * duplication */
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
+ reginfo->strend);
+ }
+ }
if ( trie->bitmap
&& (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
{
if (trie->states[ state ].wordnum) {
DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
- "%*s %smatched empty string...%s\n",
- REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
+ Perl_re_exec_indentf( aTHX_ "%smatched empty string...%s\n",
+ depth, PL_colors[4], PL_colors[5])
);
if (!trie->jump)
break;
} else {
DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
- "%*s %sfailed to match trie start class...%s\n",
- REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
+ Perl_re_exec_indentf( aTHX_ "%sfailed to match trie start class...%s\n",
+ depth, PL_colors[4], PL_colors[5])
);
sayNO_SILENT;
}
}
DEBUG_TRIE_EXECUTE_r({
- DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
- PerlIO_printf( Perl_debug_log,
- "%*s %sState: %4"UVxf" Accepted: %c ",
- 2+depth * 2, "", PL_colors[4],
+ DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
+ Perl_re_exec_indentf( aTHX_
+ "%sState: %4"UVxf" Accepted: %c ",
+ depth, PL_colors[4],
(UV)state, (accepted ? 'Y' : 'N'));
});
state = 0;
}
DEBUG_TRIE_EXECUTE_r(
- PerlIO_printf( Perl_debug_log,
+ Perl_re_printf( aTHX_
"Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
charid, uvc, (UV)state, PL_colors[5] );
);
}
DEBUG_EXECUTE_r(
- PerlIO_printf( Perl_debug_log,
- "%*s %sgot %"IVdf" possible matches%s\n",
- REPORT_CODE_OFF + depth * 2, "",
+ Perl_re_exec_indentf( aTHX_ "%sgot %"IVdf" possible matches%s\n",
+ depth,
PL_colors[4], (IV)ST.accepted, PL_colors[5] );
);
goto trie_first_try; /* jump into the fail handler */
}}
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case TRIE_next_fail: /* we failed - try next alternative */
{
}
if (!--ST.accepted) {
DEBUG_EXECUTE_r({
- PerlIO_printf( Perl_debug_log,
- "%*s %sTRIE failed...%s\n",
- REPORT_CODE_OFF+depth*2, "",
+ Perl_re_exec_indentf( aTHX_ "%sTRIE failed...%s\n",
+ depth,
PL_colors[4],
PL_colors[5] );
});
: NEXT_OFF(ST.me));
DEBUG_EXECUTE_r({
- PerlIO_printf( Perl_debug_log,
- "%*s %sTRIE matched word #%d, continuing%s\n",
- REPORT_CODE_OFF+depth*2, "",
+ Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n",
+ depth,
PL_colors[4],
ST.nextword,
PL_colors[5]
if (ST.accepted > 1 || has_cutgroup) {
PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
}
/* only one choice left - just continue */
DEBUG_EXECUTE_r({
AV *const trie_words
= MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
- SV ** const tmp = av_fetch( trie_words,
- ST.nextword-1, 0 );
+ SV ** const tmp = trie_words
+ ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
SV *sv= tmp ? sv_newmortal() : NULL;
- PerlIO_printf( Perl_debug_log,
- "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
- REPORT_CODE_OFF+depth*2, "", PL_colors[4],
+ Perl_re_exec_indentf( aTHX_ "%sonly one match left, short-circuiting: #%d <%s>%s\n",
+ depth, PL_colors[4],
ST.nextword,
tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
PL_colors[0], PL_colors[1],
locinput = (char*)uc;
continue; /* execute rest of RE */
- assert(0); /* NOTREACHED */
+ /* NOTREACHED */
}
#undef ST
+ case EXACTL: /* /abc/l */
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ /* Complete checking would involve going through every character
+ * matched by the string to see if any is above latin1. But the
+ * comparision otherwise might very well be a fast assembly
+ * language routine, and I (khw) don't think slowing things down
+ * just to check for this warning is worth it. So this just checks
+ * the first character */
+ if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
+ }
+ /* FALLTHROUGH */
case EXACT: { /* /abc/ */
char *s = STRING(scan);
ln = STR_LEN(scan);
l++;
}
else {
- if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
+ if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
{
sayNO;
}
s++;
}
else {
- if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
+ if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
{
sayNO;
}
const char * s;
U32 fold_utf8_flags;
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
folder = foldEQ_locale;
fold_array = PL_fold_locale;
fold_utf8_flags = FOLDEQ_LOCALE;
goto do_exactf;
+ case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so
+ is effectively /u; hence to match, target
+ must be UTF-8. */
+ if (! utf8_target) {
+ sayNO;
+ }
+ fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
+ | FOLDEQ_S1_FOLDS_SANE;
+ folder = foldEQ_latin1;
+ fold_array = PL_fold_latin1;
+ goto do_exactf;
+
case EXACTFU_SS: /* /\x{df}/iu */
case EXACTFU: /* /abc/iu */
folder = foldEQ_latin1;
case EXACTFA_NO_TRIE: /* This node only generated for non-utf8
patterns */
assert(! is_utf8_pat);
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case EXACTFA: /* /abc/iaa */
folder = foldEQ_latin1;
fold_array = PL_fold_latin1;
break;
}
- /* XXX Could improve efficiency by separating these all out using a
- * macro or in-line function. At that point regcomp.c would no longer
- * have to set the FLAGS fields of these */
- case BOUNDL: /* /\b/l */
- case NBOUNDL: /* /\B/l */
- case BOUND: /* /\b/ */
- case BOUNDU: /* /\b/u */
+ case NBOUNDL: /* /\B/l */
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case BOUNDL: /* /\b/l */
+ {
+ bool b1, b2;
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if (FLAGS(scan) != TRADITIONAL_BOUND) {
+ if (! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ }
+ goto boundu;
+ }
+
+ if (utf8_target) {
+ if (locinput == reginfo->strbeg)
+ b1 = isWORDCHAR_LC('\n');
+ else {
+ b1 = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
+ (U8*)(reginfo->strbeg)));
+ }
+ b2 = (NEXTCHR_IS_EOS)
+ ? isWORDCHAR_LC('\n')
+ : isWORDCHAR_LC_utf8((U8*)locinput);
+ }
+ else { /* Here the string isn't utf8 */
+ b1 = (locinput == reginfo->strbeg)
+ ? isWORDCHAR_LC('\n')
+ : isWORDCHAR_LC(UCHARAT(locinput - 1));
+ b2 = (NEXTCHR_IS_EOS)
+ ? isWORDCHAR_LC('\n')
+ : isWORDCHAR_LC(nextchr);
+ }
+ if (to_complement ^ (b1 == b2)) {
+ sayNO;
+ }
+ break;
+ }
+
+ case NBOUND: /* /\B/ */
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case BOUND: /* /\b/ */
+ if (utf8_target) {
+ goto bound_utf8;
+ }
+ goto bound_ascii_match_only;
+
+ case NBOUNDA: /* /\B/a */
+ to_complement = 1;
+ /* FALLTHROUGH */
+
case BOUNDA: /* /\b/a */
- case NBOUND: /* /\B/ */
+ {
+ bool b1, b2;
+
+ bound_ascii_match_only:
+ /* Here the string isn't utf8, or is utf8 and only ascii characters
+ * are to match \w. In the latter case looking at the byte just
+ * prior to the current one may be just the final byte of a
+ * multi-byte character. This is ok. There are two cases:
+ * 1) it is a single byte character, and then the test is doing
+ * just what it's supposed to.
+ * 2) it is a multi-byte character, in which case the final byte is
+ * never mistakable for ASCII, and so the test will say it is
+ * not a word character, which is the correct answer. */
+ b1 = (locinput == reginfo->strbeg)
+ ? isWORDCHAR_A('\n')
+ : isWORDCHAR_A(UCHARAT(locinput - 1));
+ b2 = (NEXTCHR_IS_EOS)
+ ? isWORDCHAR_A('\n')
+ : isWORDCHAR_A(nextchr);
+ if (to_complement ^ (b1 == b2)) {
+ sayNO;
+ }
+ break;
+ }
+
case NBOUNDU: /* /\B/u */
- case NBOUNDA: /* /\B/a */
- /* was last char in word? */
- if (utf8_target
- && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
- && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
- {
- if (locinput == reginfo->strbeg)
- ln = '\n';
- else {
- const U8 * const r =
- reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
+ to_complement = 1;
+ /* FALLTHROUGH */
- ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,
- 0, uniflags);
- }
- if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
- ln = isWORDCHAR_uni(ln);
- if (NEXTCHR_IS_EOS)
- n = 0;
- else {
- LOAD_UTF8_CHARCLASS_ALNUM();
- n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
- utf8_target);
+ case BOUNDU: /* /\b/u */
+
+ boundu:
+ if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
+ match = FALSE;
+ }
+ else if (utf8_target) {
+ bound_utf8:
+ switch((bound_type) FLAGS(scan)) {
+ case TRADITIONAL_BOUND:
+ {
+ bool b1, b2;
+ b1 = (locinput == reginfo->strbeg)
+ ? 0 /* isWORDCHAR_L1('\n') */
+ : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
+ (U8*)(reginfo->strbeg)));
+ b2 = (NEXTCHR_IS_EOS)
+ ? 0 /* isWORDCHAR_L1('\n') */
+ : isWORDCHAR_utf8((U8*)locinput);
+ match = cBOOL(b1 != b2);
+ break;
}
- }
- else {
- ln = isWORDCHAR_LC_uvchr(ln);
- n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
- }
+ case GCB_BOUND:
+ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
+ match = TRUE; /* GCB always matches at begin and
+ end */
+ }
+ else {
+ /* Find the gcb values of previous and current
+ * chars, then see if is a break point */
+ match = isGCB(getGCB_VAL_UTF8(
+ reghop3((U8*)locinput,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend),
+ getGCB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ utf8_target);
+ }
+ break;
+
+ case LB_BOUND:
+ if (locinput == reginfo->strbeg) {
+ match = FALSE;
+ }
+ else if (NEXTCHR_IS_EOS) {
+ match = TRUE;
+ }
+ else {
+ match = isLB(getLB_VAL_UTF8(
+ reghop3((U8*)locinput,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend),
+ getLB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ (U8*) reginfo->strend,
+ utf8_target);
+ }
+ break;
+
+ case SB_BOUND: /* Always matches at begin and end */
+ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
+ match = TRUE;
+ }
+ else {
+ match = isSB(getSB_VAL_UTF8(
+ reghop3((U8*)locinput,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend),
+ getSB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ (U8*) reginfo->strend,
+ utf8_target);
+ }
+ break;
+
+ case WB_BOUND:
+ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
+ match = TRUE;
+ }
+ else {
+ match = isWB(WB_UNKNOWN,
+ getWB_VAL_UTF8(
+ reghop3((U8*)locinput,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend),
+ getWB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ (U8*) reginfo->strend,
+ utf8_target);
+ }
+ break;
+ }
}
- else {
+ else { /* Not utf8 target */
+ switch((bound_type) FLAGS(scan)) {
+ case TRADITIONAL_BOUND:
+ {
+ bool b1, b2;
+ b1 = (locinput == reginfo->strbeg)
+ ? 0 /* isWORDCHAR_L1('\n') */
+ : isWORDCHAR_L1(UCHARAT(locinput - 1));
+ b2 = (NEXTCHR_IS_EOS)
+ ? 0 /* isWORDCHAR_L1('\n') */
+ : isWORDCHAR_L1(nextchr);
+ match = cBOOL(b1 != b2);
+ break;
+ }
- /* Here the string isn't utf8, or is utf8 and only ascii
- * characters are to match \w. In the latter case looking at
- * the byte just prior to the current one may be just the final
- * byte of a multi-byte character. This is ok. There are two
- * cases:
- * 1) it is a single byte character, and then the test is doing
- * just what it's supposed to.
- * 2) it is a multi-byte character, in which case the final
- * byte is never mistakable for ASCII, and so the test
- * will say it is not a word character, which is the
- * correct answer. */
- ln = (locinput != reginfo->strbeg) ?
- UCHARAT(locinput - 1) : '\n';
- switch (FLAGS(scan)) {
- case REGEX_UNICODE_CHARSET:
- ln = isWORDCHAR_L1(ln);
- n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
- break;
- case REGEX_LOCALE_CHARSET:
- ln = isWORDCHAR_LC(ln);
- n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
- break;
- case REGEX_DEPENDS_CHARSET:
- ln = isWORDCHAR(ln);
- n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
- break;
- case REGEX_ASCII_RESTRICTED_CHARSET:
- case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
- ln = isWORDCHAR_A(ln);
- n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
- break;
- default:
- Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
- break;
- }
+ case GCB_BOUND:
+ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
+ match = TRUE; /* GCB always matches at begin and
+ end */
+ }
+ else { /* Only CR-LF combo isn't a GCB in 0-255
+ range */
+ match = UCHARAT(locinput - 1) != '\r'
+ || UCHARAT(locinput) != '\n';
+ }
+ break;
+
+ case LB_BOUND:
+ if (locinput == reginfo->strbeg) {
+ match = FALSE;
+ }
+ else if (NEXTCHR_IS_EOS) {
+ match = TRUE;
+ }
+ else {
+ match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
+ getLB_VAL_CP(UCHARAT(locinput)),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ (U8*) reginfo->strend,
+ utf8_target);
+ }
+ break;
+
+ case SB_BOUND: /* Always matches at begin and end */
+ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
+ match = TRUE;
+ }
+ else {
+ match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
+ getSB_VAL_CP(UCHARAT(locinput)),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ (U8*) reginfo->strend,
+ utf8_target);
+ }
+ break;
+
+ case WB_BOUND:
+ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
+ match = TRUE;
+ }
+ else {
+ match = isWB(WB_UNKNOWN,
+ getWB_VAL_CP(UCHARAT(locinput -1)),
+ getWB_VAL_CP(UCHARAT(locinput)),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ (U8*) reginfo->strend,
+ utf8_target);
+ }
+ break;
+ }
}
- /* Note requires that all BOUNDs be lower than all NBOUNDs in
- * regcomp.sym */
- if (((!ln) == (!n)) == (OP(scan) < NBOUND))
- sayNO;
+
+ if (to_complement ^ ! match) {
+ sayNO;
+ }
break;
- case ANYOF: /* /[abc]/ */
+ case ANYOFL: /* /[abc]/l */
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
+ {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+ }
+ /* FALLTHROUGH */
+ case ANYOFD: /* /[abc]/d */
+ case ANYOF: /* /[abc]/ */
if (NEXTCHR_IS_EOS)
sayNO;
- if (utf8_target) {
+ if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
utf8_target))
sayNO;
locinput += UTF8SKIP(locinput);
}
else {
- if (!REGINCLASS(rex, scan, (U8*)locinput))
+ if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
sayNO;
locinput++;
}
/* FALLTHROUGH */
case POSIXL: /* \w or [:punct:] etc. under /l */
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (NEXTCHR_IS_EOS)
sayNO;
if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
sayNO;
}
+
+ locinput++;
+ break;
}
- else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
- if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
- (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
- *(locinput + 1))))))
- {
- sayNO;
- }
+
+ if (! UTF8_IS_DOWNGRADEABLE_START(nextchr)) { /* An above Latin-1 code point */
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
+ goto utf8_posix_above_latin1;
}
- else { /* Here, must be an above Latin-1 code point */
- goto utf8_posix_not_eos;
+
+ /* Here is a UTF-8 variant code point below 256 and the target is
+ * UTF-8 */
+ if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
+ EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
+ *(locinput + 1))))))
+ {
+ sayNO;
}
- /* Here, must be utf8 */
- locinput += UTF8SKIP(locinput);
- break;
+ goto increment_locinput;
case NPOSIXD: /* \W or [:^punct:] etc. under /d */
to_complement = 1;
}
to_complement = 1;
- /* FALLTHROUGH */
+ goto join_nposixa;
case POSIXA: /* \w or [:punct:] etc. under /a */
* UTF-8, and also from NPOSIXA even in UTF-8 when the current
* character is a single byte */
- if (NEXTCHR_IS_EOS
- || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
- FLAGS(scan)))))
+ if (NEXTCHR_IS_EOS) {
+ sayNO;
+ }
+
+ join_nposixa:
+
+ if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
+ FLAGS(scan)))))
{
sayNO;
}
if (NEXTCHR_IS_EOS) {
sayNO;
}
- utf8_posix_not_eos:
/* Use _generic_isCC() for characters within Latin1. (Note that
* UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
}
else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
if (! (to_complement
- ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
+ ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
*(locinput + 1)),
FLAGS(scan)))))
{
locinput += 2;
}
else { /* Handle above Latin-1 code points */
+ utf8_posix_above_latin1:
classnum = (_char_class_number) FLAGS(scan);
if (classnum < _FIRST_NON_SWASH_CC) {
}
else { /* Here, uses macros to find above Latin-1 code points */
switch (classnum) {
- case _CC_ENUM_SPACE: /* XXX would require separate
- code if we revert the change
- of \v matching this */
- case _CC_ENUM_PSXSPC:
+ case _CC_ENUM_SPACE:
if (! (to_complement
^ cBOOL(is_XPERLSPACE_high(locinput))))
{
case CLUMP: /* Match \X: logical Unicode character. This is defined as
a Unicode extended Grapheme Cluster */
- /* From http://www.unicode.org/reports/tr29 (5.2 version). An
- extended Grapheme Cluster is:
-
- CR LF
- | Prepend* Begin Extend*
- | .
-
- Begin is: ( Special_Begin | ! Control )
- Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
- Extend is: ( Grapheme_Extend | Spacing_Mark )
- Control is: [ GCB_Control | CR | LF ]
- Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
-
- If we create a 'Regular_Begin' = Begin - Special_Begin, then
- we can rewrite
-
- Begin is ( Regular_Begin + Special Begin )
-
- It turns out that 98.4% of all Unicode code points match
- Regular_Begin. Doing it this way eliminates a table match in
- the previous implementation for almost all Unicode code points.
-
- There is a subtlety with Prepend* which showed up in testing.
- Note that the Begin, and only the Begin is required in:
- | Prepend* Begin Extend*
- Also, Begin contains '! Control'. A Prepend must be a
- '! Control', which means it must also be a Begin. What it
- comes down to is that if we match Prepend* and then find no
- suitable Begin afterwards, that if we backtrack the last
- Prepend, that one will be a suitable Begin.
- */
-
if (NEXTCHR_IS_EOS)
sayNO;
if (! utf8_target) {
}
else {
- /* Utf8: See if is ( CR LF ); already know that locinput <
- * reginfo->strend, so locinput+1 is in bounds */
- if ( nextchr == '\r' && locinput+1 < reginfo->strend
- && UCHARAT(locinput + 1) == '\n')
- {
- locinput += 2;
- }
- else {
- STRLEN len;
-
- /* In case have to backtrack to beginning, then match '.' */
- char *starting = locinput;
-
- /* In case have to backtrack the last prepend */
- char *previous_prepend = NULL;
-
- LOAD_UTF8_CHARCLASS_GCB();
+ /* Get the gcb type for the current character */
+ GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend);
- /* Match (prepend)* */
- while (locinput < reginfo->strend
- && (len = is_GCB_Prepend_utf8(locinput)))
+ /* Then scan through the input until we get to the first
+ * character whose type is supposed to be a gcb with the
+ * current character. (There is always a break at the
+ * end-of-input) */
+ locinput += UTF8SKIP(locinput);
+ while (locinput < reginfo->strend) {
+ GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend);
+ if (isGCB(prev_gcb, cur_gcb,
+ (U8*) reginfo->strbeg, (U8*) locinput,
+ utf8_target))
{
- previous_prepend = locinput;
- locinput += len;
- }
-
- /* As noted above, if we matched a prepend character, but
- * the next thing won't match, back off the last prepend we
- * matched, as it is guaranteed to match the begin */
- if (previous_prepend
- && (locinput >= reginfo->strend
- || (! swash_fetch(PL_utf8_X_regular_begin,
- (U8*)locinput, utf8_target)
- && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
- )
- {
- locinput = previous_prepend;
- }
-
- /* Note that here we know reginfo->strend > locinput, as we
- * tested that upon input to this switch case, and if we
- * moved locinput forward, we tested the result just above
- * and it either passed, or we backed off so that it will
- * now pass */
- if (swash_fetch(PL_utf8_X_regular_begin,
- (U8*)locinput, utf8_target)) {
- locinput += UTF8SKIP(locinput);
+ break;
}
- else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
-
- /* Here did not match the required 'Begin' in the
- * second term. So just match the very first
- * character, the '.' of the final term of the regex */
- locinput = starting + UTF8SKIP(starting);
- goto exit_utf8;
- } else {
-
- /* Here is a special begin. It can be composed of
- * several individual characters. One possibility is
- * RI+ */
- if ((len = is_GCB_RI_utf8(locinput))) {
- locinput += len;
- while (locinput < reginfo->strend
- && (len = is_GCB_RI_utf8(locinput)))
- {
- locinput += len;
- }
- } else if ((len = is_GCB_T_utf8(locinput))) {
- /* Another possibility is T+ */
- locinput += len;
- while (locinput < reginfo->strend
- && (len = is_GCB_T_utf8(locinput)))
- {
- locinput += len;
- }
- } else {
-
- /* Here, neither RI+ nor T+; must be some other
- * Hangul. That means it is one of the others: L,
- * LV, LVT or V, and matches:
- * L* (L | LVT T* | V * V* T* | LV V* T*) */
-
- /* Match L* */
- while (locinput < reginfo->strend
- && (len = is_GCB_L_utf8(locinput)))
- {
- locinput += len;
- }
-
- /* Here, have exhausted L*. If the next character
- * is not an LV, LVT nor V, it means we had to have
- * at least one L, so matches L+ in the original
- * equation, we have a complete hangul syllable.
- * Are done. */
- if (locinput < reginfo->strend
- && is_GCB_LV_LVT_V_utf8(locinput))
- {
- /* Otherwise keep going. Must be LV, LVT or V.
- * See if LVT, by first ruling out V, then LV */
- if (! is_GCB_V_utf8(locinput)
- /* All but every TCount one is LV */
- && (valid_utf8_to_uvchr((U8 *) locinput,
- NULL)
- - SBASE)
- % TCount != 0)
- {
- locinput += UTF8SKIP(locinput);
- } else {
-
- /* Must be V or LV. Take it, then match
- * V* */
- locinput += UTF8SKIP(locinput);
- while (locinput < reginfo->strend
- && (len = is_GCB_V_utf8(locinput)))
- {
- locinput += len;
- }
- }
+ prev_gcb = cur_gcb;
+ locinput += UTF8SKIP(locinput);
+ }
- /* And any of LV, LVT, or V can be followed
- * by T* */
- while (locinput < reginfo->strend
- && (len = is_GCB_T_utf8(locinput)))
- {
- locinput += len;
- }
- }
- }
- }
- /* Match any extender */
- while (locinput < reginfo->strend
- && swash_fetch(PL_utf8_X_extend,
- (U8*)locinput, utf8_target))
- {
- locinput += UTF8SKIP(locinput);
- }
- }
- exit_utf8:
- if (locinput > reginfo->strend) sayNO;
}
break;
const U8 *fold_array;
UV utf8_fold_flags;
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
folder = foldEQ_locale;
fold_array = PL_fold_locale;
type = REFFL;
goto do_nref_ref_common;
case REFFL: /* /\1/il */
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
folder = foldEQ_locale;
fold_array = PL_fold_locale;
utf8_fold_flags = FOLDEQ_LOCALE;
case TAIL: /* placeholder while compiling (A|B|C) */
break;
- case BACK: /* ??? doesn't appear to be used ??? */
- break;
-
#undef ST
#define ST st->u.eval
+#define CUR_EVAL cur_eval->u.eval
+
{
SV *ret;
REGEXP *re_sv;
regexp *re;
regexp_internal *rei;
regnode *startpoint;
+ U32 arg;
- case GOSTART: /* (?R) */
case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
- if (cur_eval && cur_eval->locinput==locinput) {
- if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
- Perl_croak(aTHX_ "Infinite recursion in regex");
+ arg= (U32)ARG(scan);
+ if (cur_eval && cur_eval->locinput == locinput) {
if ( ++nochange_depth > max_nochange_depth )
Perl_croak(aTHX_
"Pattern subroutine nesting without pos change"
re_sv = rex_sv;
re = rex;
rei = rexi;
- if (OP(scan)==GOSUB) {
- startpoint = scan + ARG2L(scan);
- ST.close_paren = ARG(scan);
+ startpoint = scan + ARG2L(scan);
+ EVAL_CLOSE_PAREN_SET( st, arg );
+ /* Detect infinite recursion
+ *
+ * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
+ * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
+ * So we track the position in the string we are at each time
+ * we recurse and if we try to enter the same routine twice from
+ * the same position we throw an error.
+ */
+ if ( rex->recurse_locinput[arg] == locinput ) {
+ /* FIXME: we should show the regop that is failing as part
+ * of the error message. */
+ Perl_croak(aTHX_ "Infinite recursion in regex");
} else {
- startpoint = rei->program+1;
- ST.close_paren = 0;
+ ST.prev_recurse_locinput= rex->recurse_locinput[arg];
+ rex->recurse_locinput[arg]= locinput;
+
+ DEBUG_r({
+ GET_RE_DEBUG_FLAGS_DECL;
+ DEBUG_STACK_r({
+ Perl_re_exec_indentf( aTHX_
+ "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
+ depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
+ );
+ });
+ });
}
/* Save all the positions seen so far. */
/* and then jump to the code we share with EVAL */
goto eval_recurse_doit;
-
- assert(0); /* NOTREACHED */
+ /* NOTREACHED */
case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
if (cur_eval && cur_eval->locinput==locinput) {
n = ARG(scan);
if (rexi->data->what[n] == 'r') { /* code from an external qr */
- newcv = (ReANY(
- (REGEXP*)(rexi->data->data[n])
- ))->qr_anoncv
- ;
+ newcv = (ReANY(
+ (REGEXP*)(rexi->data->data[n])
+ ))->qr_anoncv;
nop = (OP*)rexi->data->data[n+1];
}
else if (rexi->data->what[n] == 'l') { /* literal code */
U8 flags = (CXp_SUB_RE |
((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
if (last_pushed_cv) {
+ /* PUSH/POP_MULTICALL save and restore the
+ * caller's PL_comppad; if we call multiple subs
+ * using the same CX block, we have to save and
+ * unwind the varying PL_comppad's ourselves,
+ * especially restoring the right PL_comppad on
+ * backtrack - so save it on the save stack */
+ SAVECOMPPAD();
CHANGE_MULTICALL_FLAGS(newcv, flags);
}
else {
/* these assignments are just to silence compiler
* warnings */
multicall_cop = NULL;
- newsp = NULL;
}
last_pad = PL_comppad;
assert(o->op_targ == OP_LEAVE);
o = cUNOPo->op_first;
assert(o->op_type == OP_ENTER);
- o = o->op_sibling;
+ o = OpSIBLING(o);
}
if (o->op_type != OP_STUB) {
}
nop = nop->op_next;
- DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
+ DEBUG_STATE_r( Perl_re_printf( aTHX_
" re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
rex->offs[0].end = locinput - reginfo->strbeg;
/* we don't use MULTICALL here as we want to call the
* first op of the block of interest, rather than the
- * first op of the sub */
+ * first op of the sub. Also, we don't want to free
+ * the savestack frame */
before = (IV)(SP-PL_stack_base);
PL_op = nop;
CALLRUNOPS(aTHX); /* Scalar context. */
assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
rex->engine, NULL, NULL,
- /* copy /msix etc to inner pattern */
- scan->flags,
+ /* copy /msixn etc to inner pattern */
+ ARG2L(scan),
pm_flags);
if (!(SvFLAGS(ret)
reginfo->strend, "Matching embedded");
);
startpoint = rei->program + 1;
- ST.close_paren = 0; /* only used for GOSUB */
+ EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
+ * close_paren only for GOSUB */
+ ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
/* Save all the seen positions so far. */
ST.cp = regcppush(rex, 0, maxopenparen);
REGCP_SET(ST.lastcp);
maxopenparen = 0;
/* run the pattern returned from (??{...}) */
- eval_recurse_doit: /* Share code with GOSUB below this line
+ eval_recurse_doit: /* Share code with GOSUB below this line
* At this point we expect the stack context to be
* set up correctly */
cur_eval = st;
/* now continue from first node in postoned RE */
PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
}
case EVAL_AB: /* cleanup after a successful (??{A})B */
- /* note: this is called twice; first after popping B, then A */
+ /* note: this is called twice; first after popping B, then A */
+ DEBUG_STACK_r({
+ Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n",
+ depth, cur_eval, ST.prev_eval);
+ });
+
+#define SET_RECURSE_LOCINPUT(STR,VAL)\
+ if ( cur_eval && CUR_EVAL.close_paren ) {\
+ DEBUG_STACK_r({ \
+ Perl_re_exec_indentf( aTHX_ STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
+ depth, \
+ CUR_EVAL.close_paren - 1,\
+ cur_eval, \
+ VAL); \
+ }); \
+ rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
+ }
+
+ SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
+
rex_sv = ST.prev_rex;
is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
SET_reg_curpm(rex_sv);
reginfo->poscache_maxiter = 0;
if ( nochange_depth )
nochange_depth--;
+
+ SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
sayYES;
case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
/* note: this is called twice; first after popping B, then A */
+ DEBUG_STACK_r({
+ Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
+ depth, cur_eval, ST.prev_eval);
+ });
+
+ SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
+
rex_sv = ST.prev_rex;
is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
SET_reg_curpm(rex_sv);
regcppop(rex, &maxopenparen);
cur_eval = ST.prev_eval;
cur_curlyx = ST.prev_curlyx;
+
/* Invalidate cache. See "invalidate" comment above. */
reginfo->poscache_maxiter = 0;
if ( nochange_depth )
nochange_depth--;
- sayNO_SILENT;
+
+ SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
+ sayNO_SILENT;
#undef ST
case OPEN: /* ( */
rex->offs[n].start_tmp = locinput - reginfo->strbeg;
if (n > maxopenparen)
maxopenparen = n;
- DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_BUFFERS_r(Perl_re_printf( aTHX_
"rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
PTR2UV(rex),
PTR2UV(rex->offs),
break;
/* XXX really need to log other places start/end are set too */
-#define CLOSE_CAPTURE \
- rex->offs[n].start = rex->offs[n].start_tmp; \
- rex->offs[n].end = locinput - reginfo->strbeg; \
- DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
- "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
- PTR2UV(rex), \
- PTR2UV(rex->offs), \
- (UV)n, \
- (IV)rex->offs[n].start, \
- (IV)rex->offs[n].end \
+#define CLOSE_CAPTURE \
+ rex->offs[n].start = rex->offs[n].start_tmp; \
+ rex->offs[n].end = locinput - reginfo->strbeg; \
+ DEBUG_BUFFERS_r(Perl_re_printf( aTHX_ \
+ "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
+ PTR2UV(rex), \
+ PTR2UV(rex->offs), \
+ (UV)n, \
+ (IV)rex->offs[n].start, \
+ (IV)rex->offs[n].end \
))
case CLOSE: /* ) */
if (n > rex->lastparen)
rex->lastparen = n;
rex->lastcloseparen = n;
- if (cur_eval && cur_eval->u.eval.close_paren == n) {
+ if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
goto fake_end;
- }
+
break;
case ACCEPT: /* (*ACCEPT) */
- if (ARG(scan)){
+ if (scan->flags)
+ sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
+ if (ARG2L(scan)){
regnode *cursor;
for (cursor=scan;
cursor && OP(cursor)!=END;
if (n > rex->lastparen)
rex->lastparen = n;
rex->lastcloseparen = n;
- if ( n == ARG(scan) || (cur_eval &&
- cur_eval->u.eval.close_paren == n))
+ if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
break;
}
}
}
}
goto fake_end;
- /*NOTREACHED*/
+ /* NOTREACHED */
case GROUPP: /* (?(1)) */
n = ARG(scan); /* which paren pair */
case INSUBP: /* (?(R)) */
n = ARG(scan);
- sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
+ /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
+ * of SCAN is already set up as matches a eval.close_paren */
+ sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
break;
case DEFINEP: /* (?(DEFINE)) */
ST.lastloc = NULL; /* this will be updated by WHILEM */
PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
}
case CURLYX_end: /* just finished matching all of A*B */
cur_curlyx = ST.prev_curlyx;
sayYES;
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case CURLYX_end_fail: /* just failed to match all of A*B */
regcpblow(ST.cp);
cur_curlyx = ST.prev_curlyx;
sayNO;
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
#undef ST
{
/* see the discussion above about CURLYX/WHILEM */
I32 n;
- int min = ARG1(cur_curlyx->u.curlyx.me);
- int max = ARG2(cur_curlyx->u.curlyx.me);
- regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
+ int min, max;
+ regnode *A;
assert(cur_curlyx); /* keep Coverity happy */
+
+ min = ARG1(cur_curlyx->u.curlyx.me);
+ max = ARG2(cur_curlyx->u.curlyx.me);
+ A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
ST.cache_offset = 0;
ST.cache_mask = 0;
- DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
- "%*s whilem: matched %ld out of %d..%d\n",
- REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
+ DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "whilem: matched %ld out of %d..%d\n",
+ depth, (long)n, min, max)
);
/* First just match a string of min A's. */
REGCP_SET(ST.lastcp);
PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
}
/* If degenerate A matches "", assume A done. */
if (locinput == cur_curlyx->u.curlyx.lastloc) {
- DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
- "%*s whilem: empty match detected, trying continuation...\n",
- REPORT_CODE_OFF+depth*2, "")
+ DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "whilem: empty match detected, trying continuation...\n",
+ depth)
);
goto do_whilem_B_max;
}
reginfo->poscache_size = size;
Newxz(aux->poscache, size, char);
}
- DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
+ DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
"%swhilem: Detected a super-linear match, switching on caching%s...\n",
PL_colors[4], PL_colors[5])
);
mask = 1 << (offset % 8);
offset /= 8;
if (reginfo->info_aux->poscache[offset] & mask) {
- DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
- "%*s whilem: (cache) already tried at this position...\n",
- REPORT_CODE_OFF+depth*2, "")
+ DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "whilem: (cache) already tried at this position...\n",
+ depth)
);
sayNO; /* cache records failure */
}
REGCP_SET(ST.lastcp);
PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
locinput);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
}
/* Prefer A over B for maximal matching. */
cur_curlyx->u.curlyx.lastloc = locinput;
REGCP_SET(ST.lastcp);
PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
}
goto do_whilem_B_max;
}
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case WHILEM_B_min: /* just matched B in a minimal match */
case WHILEM_B_max: /* just matched B in a maximal match */
cur_curlyx = ST.save_curlyx;
sayYES;
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
cur_curlyx = ST.save_curlyx;
cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
cur_curlyx->u.curlyx.count--;
CACHEsayNO;
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case WHILEM_A_pre_fail: /* just failed to match even minimal A */
REGCP_UNWIND(ST.lastcp);
regcppop(rex, &maxopenparen);
cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
cur_curlyx->u.curlyx.count--;
CACHEsayNO;
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
REGCP_UNWIND(ST.lastcp);
regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- "%*s whilem: failed, trying continuation...\n",
- REPORT_CODE_OFF+depth*2, "")
+ DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "whilem: failed, trying continuation...\n",
+ depth)
);
do_whilem_B_max:
if (cur_curlyx->u.curlyx.count >= REG_INFTY
cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
locinput);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
cur_curlyx = ST.save_curlyx;
CACHEsayNO;
}
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
+ DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "trying longer...\n", depth)
);
/* Try grabbing another A and see if it helps. */
cur_curlyx->u.curlyx.lastloc = locinput;
PUSH_STATE_GOTO(WHILEM_A_min,
/*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
locinput);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
#undef ST
#define ST st->u.branch
if (next == scan)
next = NULL;
scan = NEXTOPER(scan);
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case BRANCH: /* /(...|A|...)/ */
scan = NEXTOPER(scan); /* scan now points to inner node */
} else {
PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
}
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case CUTGROUP: /* /(*THEN)/ */
- sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
- MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
+ sv_yes_mark = st->u.mark.mark_name = scan->flags
+ ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
+ : NULL;
PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case CUTGROUP_next_fail:
do_cutgroup = 1;
if (st->u.mark.mark_name)
sv_commit = st->u.mark.mark_name;
sayNO;
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case BRANCH_next:
sayYES;
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case BRANCH_next_fail: /* that branch failed; try the next, if any */
if (do_cutgroup) {
/* no more branches? */
if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
DEBUG_EXECUTE_r({
- PerlIO_printf( Perl_debug_log,
- "%*s %sBRANCH failed...%s\n",
- REPORT_CODE_OFF+depth*2, "",
+ Perl_re_exec_indentf( aTHX_ "%sBRANCH failed...%s\n",
+ depth,
PL_colors[4],
PL_colors[5] );
});
sayNO_SILENT;
}
continue; /* execute next BRANCH[J] op */
- assert(0); /* NOTREACHED */
+ /* NOTREACHED */
case MINMOD: /* next op will be non-greedy, e.g. A*? */
minmod = 1;
curlym_do_A: /* execute the A in /A{m,n}B/ */
PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case CURLYM_A: /* we've just matched an A */
ST.count++;
ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
}
DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
- "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
- (int)(REPORT_CODE_OFF+(depth*2)), "",
- (IV) ST.count, (IV)ST.alen)
+ Perl_re_exec_indentf( aTHX_ "CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
+ depth, (IV) ST.count, (IV)ST.alen)
);
- if (cur_eval && cur_eval->u.eval.close_paren &&
- cur_eval->u.eval.close_paren == (U32)ST.me->flags)
+ if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
goto fake_end;
{
case CURLYM_A_fail: /* just failed to match an A */
REGCP_UNWIND(ST.cp);
+
if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
- || (cur_eval && cur_eval->u.eval.close_paren &&
- cur_eval->u.eval.close_paren == (U32)ST.me->flags))
+ || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
sayNO;
curlym_do_B: /* execute the B in /A{m,n}B/ */
/* calculate c1 and c2 for possible match of 1st char
* following curly */
ST.c1 = ST.c2 = CHRTEST_VOID;
+ assert(ST.B);
if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
regnode *text_node = ST.B;
if (! HAS_TEXT(text_node))
}
DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
- "%*s CURLYM trying tail with matches=%"IVdf"...\n",
- (int)(REPORT_CODE_OFF+(depth*2)),
- "", (IV)ST.count)
+ Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%"IVdf"...\n",
+ depth, (IV)ST.count)
);
if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
{
/* simulate B failing */
DEBUG_OPTIMISE_r(
- PerlIO_printf(Perl_debug_log,
- "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
- (int)(REPORT_CODE_OFF+(depth*2)),"",
+ Perl_re_exec_indentf( aTHX_ "CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
+ depth,
valid_utf8_to_uvchr((U8 *) locinput, NULL),
valid_utf8_to_uvchr(ST.c1_utf8, NULL),
valid_utf8_to_uvchr(ST.c2_utf8, NULL))
else if (nextchr != ST.c1 && nextchr != ST.c2) {
/* simulate B failing */
DEBUG_OPTIMISE_r(
- PerlIO_printf(Perl_debug_log,
- "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
- (int)(REPORT_CODE_OFF+(depth*2)),"",
+ Perl_re_exec_indentf( aTHX_ "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
+ depth,
(int) nextchr, ST.c1, ST.c2)
);
state_num = CURLYM_B_fail;
}
else
rex->offs[paren].end = -1;
- if (cur_eval && cur_eval->u.eval.close_paren &&
- cur_eval->u.eval.close_paren == (U32)ST.me->flags)
+
+ if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
{
if (ST.count)
goto fake_end;
}
PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case CURLYM_B_fail: /* just failed to match a B */
REGCP_UNWIND(ST.cp);
maxopenparen = ST.paren;
ST.min = ARG1(scan); /* min to match */
ST.max = ARG2(scan); /* max to match */
- if (cur_eval && cur_eval->u.eval.close_paren &&
- cur_eval->u.eval.close_paren == (U32)ST.paren) {
+ if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
+ {
ST.min=1;
ST.max=1;
}
REGCP_SET(ST.cp);
goto curly_try_B_max;
}
- assert(0); /* NOTREACHED */
-
+ NOT_REACHED; /* NOTREACHED */
case CURLY_B_min_known_fail:
/* failed to find B in a non-greedy match where c1,c2 valid */
assert(n == REG_INFTY || locinput == li);
}
CURLY_SETPAREN(ST.paren, ST.count);
- if (cur_eval && cur_eval->u.eval.close_paren &&
- cur_eval->u.eval.close_paren == (U32)ST.paren) {
+ if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
goto fake_end;
- }
PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
}
- assert(0); /* NOTREACHED */
-
+ NOT_REACHED; /* NOTREACHED */
case CURLY_B_min_fail:
/* failed to find B in a non-greedy match where c1,c2 invalid */
{
curly_try_B_min:
CURLY_SETPAREN(ST.paren, ST.count);
- if (cur_eval && cur_eval->u.eval.close_paren &&
- cur_eval->u.eval.close_paren == (U32)ST.paren) {
+ if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
goto fake_end;
- }
PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
}
}
sayNO;
- assert(0); /* NOTREACHED */
-
+ NOT_REACHED; /* NOTREACHED */
- curly_try_B_max:
+ curly_try_B_max:
/* a successful greedy match: now try to match B */
- if (cur_eval && cur_eval->u.eval.close_paren &&
- cur_eval->u.eval.close_paren == (U32)ST.paren) {
+ if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
goto fake_end;
- }
{
bool could_match = locinput < reginfo->strend;
if (ST.c1 == CHRTEST_VOID || could_match) {
CURLY_SETPAREN(ST.paren, ST.count);
PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
}
}
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case CURLY_B_max_fail:
/* failed to find B in a greedy match */
#undef ST
case END: /* last op of main pattern */
- fake_end:
+ fake_end:
if (cur_eval) {
/* we've just finished A in /(??{A})B/; now continue with B */
-
+ SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
st->u.eval.prev_rex = rex_sv; /* inner */
/* Save *all* the positions. */
st->u.eval.cp = regcppush(rex, 0, maxopenparen);
- rex_sv = cur_eval->u.eval.prev_rex;
+ rex_sv = CUR_EVAL.prev_rex;
is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
SET_reg_curpm(rex_sv);
rex = ReANY(rex_sv);
rexi = RXi_GET(rex);
- cur_curlyx = cur_eval->u.eval.prev_curlyx;
+
+ st->u.eval.prev_curlyx = cur_curlyx;
+ cur_curlyx = CUR_EVAL.prev_curlyx;
REGCP_SET(st->u.eval.lastcp);
/* Restore parens of the outer rex without popping the
* savestack */
- S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
+ S_regcp_restore(aTHX_ rex, CUR_EVAL.lastcp,
&maxopenparen);
st->u.eval.prev_eval = cur_eval;
- cur_eval = cur_eval->u.eval.prev_eval;
+ cur_eval = CUR_EVAL.prev_eval;
DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
- REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
+ Perl_re_exec_indentf( aTHX_ "EVAL trying tail ... (cur_eval=%p)\n",
+ depth, cur_eval););
if ( nochange_depth )
nochange_depth--;
+ SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
+
PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
locinput); /* match B */
}
if (locinput < reginfo->till) {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
+ "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
PL_colors[4],
(long)(locinput - startpos),
(long)(reginfo->till - startpos),
case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
- "%*s %ssubpattern success...%s\n",
- REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
+ Perl_re_exec_indentf( aTHX_ "%ssubpattern success...%s\n",
+ depth, PL_colors[4], PL_colors[5]));
sayYES; /* Success! */
#undef ST
/* execute body of (?...A) */
PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
}
case IFMATCH_A_fail: /* body of (?...A) failed */
ST.wanted = !ST.wanted;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case IFMATCH_A: /* body of (?...A) succeeded */
if (ST.logical) {
/* FALLTHROUGH */
case PRUNE: /* (*PRUNE) */
- if (!scan->flags)
+ if (scan->flags)
sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
PUSH_STATE_GOTO(COMMIT_next, next, locinput);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case COMMIT_next_fail:
no_final = 1;
/* FALLTHROUGH */
+ sayNO;
+ NOT_REACHED; /* NOTREACHED */
case OPFAIL: /* (*FAIL) */
- sayNO;
- assert(0); /* NOTREACHED */
+ if (scan->flags)
+ sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
+ if (logical) {
+ /* deal with (?(?!)X|Y) properly,
+ * make sure we trigger the no branch
+ * of the trailing IFTHEN structure*/
+ sw= 0;
+ break;
+ } else {
+ sayNO;
+ }
+ NOT_REACHED; /* NOTREACHED */
#define ST st->u.mark
case MARKPOINT: /* (*MARK:foo) */
mark_state = st;
ST.mark_loc = locinput;
PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case MARKPOINT_next:
mark_state = ST.prev_mark;
sayYES;
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case MARKPOINT_next_fail:
if (popmark && sv_eq(ST.mark_name,popmark))
sv_commit = ST.mark_name;
DEBUG_EXECUTE_r({
- PerlIO_printf(Perl_debug_log,
- "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
- REPORT_CODE_OFF+depth*2, "",
+ Perl_re_exec_indentf( aTHX_ "%ssetting cutpoint to mark:%"SVf"...%s\n",
+ depth,
PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
});
}
sv_yes_mark = mark_state ?
mark_state->u.mark.mark_name : NULL;
sayNO;
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
case SKIP: /* (*SKIP) */
- if (scan->flags) {
+ if (!scan->flags) {
/* (*SKIP) : if we fail we cut here*/
ST.mark_name = NULL;
ST.mark_loc = locinput;
}
no_final = 1;
sayNO;
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
#undef ST
case LNBREAK: /* \R */
/* this is a point to jump to in order to increment
* locinput by one character */
- increment_locinput:
+ increment_locinput:
assert(!NEXTCHR_IS_EOS);
if (utf8_target) {
locinput += PL_utf8skip[nextchr];
/* switch break jumps here */
scan = next; /* prepare to execute the next op and ... */
continue; /* ... jump back to the top, reusing st */
- assert(0); /* NOTREACHED */
+ /* NOTREACHED */
push_yes_state:
/* push a state that backtracks on success */
st->u.yes.prev_yes_state = yes_state;
yes_state = st;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
push_state:
/* push a new regex state, then continue at scan */
{
regmatch_state *curyes = yes_state;
int curd = depth;
regmatch_slab *slab = PL_regmatch_slab;
- for (;curd > -1;cur--,curd--) {
+ for (;curd > -1 && (depth-curd < 3);cur--,curd--) {
if (cur < SLAB_FIRST(slab)) {
slab = slab->prev;
cur = SLAB_LAST(slab);
}
- PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
- REPORT_CODE_OFF + 2 + depth * 2,"",
+ Perl_re_exec_indentf( aTHX_ "#%-3d %-10s %s\n",
+ depth,
curd, PL_reg_name[cur->resume_state],
(curyes == cur) ? "yes" : ""
);
locinput = pushinput;
st = newst;
continue;
- assert(0); /* NOTREACHED */
+ /* NOTREACHED */
}
}
+#ifdef SOLARIS_BAD_OPTIMIZER
+# undef PL_charclass
+#endif
/*
* We get here only if there's trouble -- normally "case END" is
* the terminating point.
*/
Perl_croak(aTHX_ "corrupted regexp pointers");
- /*NOTREACHED*/
- sayNO;
+ NOT_REACHED; /* NOTREACHED */
-yes:
+ yes:
if (yes_state) {
/* we have successfully completed a subexpression, but we must now
* pop to the state marked by yes_state and continue from there */
goto reenter_switch;
}
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch successful!%s\n",
PL_colors[4], PL_colors[5]));
if (reginfo->info_aux_eval) {
result = 1;
goto final_exit;
-no:
+ no:
DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
- "%*s %sfailed...%s\n",
- REPORT_CODE_OFF+depth*2, "",
+ Perl_re_exec_indentf( aTHX_ "%sfailed...%s\n",
+ depth,
PL_colors[4], PL_colors[5])
);
-no_silent:
+ no_silent:
if (no_final) {
if (yes_state) {
goto yes;
yes_state = st->u.yes.prev_yes_state;
state_num = st->resume_state + 1; /* failure = success + 1 */
+ PERL_ASYNC_CHECK();
goto reenter_switch;
}
result = 0;
sv_commit = &PL_sv_yes;
sv_yes_mark = &PL_sv_no;
}
+ assert(sv_err);
+ assert(sv_mrk);
sv_setsv(sv_err, sv_commit);
sv_setsv(sv_mrk, sv_yes_mark);
}
S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
regmatch_info *const reginfo, I32 max, int depth)
{
- dVAR;
char *scan; /* Pointer to current position in target string */
I32 c;
char *loceol = reginfo->strend; /* local version */
I32 hardcount = 0; /* How many matches so far */
bool utf8_target = reginfo->is_utf8_target;
- int to_complement = 0; /* Invert the result? */
+ unsigned int to_complement = 0; /* Invert the result? */
UV utf8_flags;
_char_class_number classnum;
#ifndef DEBUGGING
else
scan = loceol;
break;
- case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
- if (utf8_target && loceol - scan > max) {
-
- /* <loceol> hadn't been adjusted in the UTF-8 case */
- scan += max;
- }
- else {
- scan = loceol;
+ case EXACTL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
}
- break;
+ /* FALLTHROUGH */
case EXACT:
assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
/* Target isn't utf8; convert the character in the UTF-8
* pattern to non-UTF8, and do a simple loop */
- c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
+ c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
while (scan < loceol && UCHARAT(scan) == c) {
scan++;
}
case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
assert(! reginfo->is_utf8_pat);
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case EXACTFA:
utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
goto do_exactf;
case EXACTFL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
utf8_flags = FOLDEQ_LOCALE;
goto do_exactf;
utf8_flags = 0;
goto do_exactf;
+ case EXACTFLU8:
+ if (! utf8_target) {
+ break;
+ }
+ utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
+ | FOLDEQ_S2_FOLDS_SANE;
+ goto do_exactf;
+
case EXACTFU_SS:
case EXACTFU:
utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
- do_exactf: {
+ do_exactf: {
int c1, c2;
U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
}
break;
}
+ case ANYOFL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+ }
+ /* FALLTHROUGH */
+ case ANYOFD:
case ANYOF:
if (utf8_target) {
while (hardcount < max
hardcount++;
}
} else {
- while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
+ while (scan < loceol && REGINCLASS(prog, p, (U8*)scan, 0))
scan++;
}
break;
/* FALLTHROUGH */
case POSIXL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (! utf8_target) {
while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
*scan)))
to_complement = 1;
goto utf8_posix;
}
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case NPOSIXA:
if (! utf8_target) {
else {
/* The complement of something that matches only ASCII matches all
- * UTF-8 variant code points, plus everything in ASCII that isn't
- * in the class. */
+ * non-ASCII, plus everything in ASCII that isn't in the class. */
while (hardcount < max && scan < loceol
- && (! UTF8_IS_INVARIANT(*scan)
+ && (! isASCII_utf8(scan)
|| ! _generic_isCC_A((U8) *scan, FLAGS(p))))
{
scan += UTF8SKIP(scan);
}
}
else {
- utf8_posix:
+ utf8_posix:
classnum = (_char_class_number) FLAGS(p);
if (classnum < _FIRST_NON_SWASH_CC) {
}
else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
if (! (to_complement
- ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
+ ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
*(scan + 1)),
classnum))))
{
* code is written for making the loops as tight as possible.
* It could be refactored to save space instead */
switch (classnum) {
- case _CC_ENUM_SPACE: /* XXX would require separate code
- if we revert the change of \v
- matching this */
- /* FALL THROUGH */
- case _CC_ENUM_PSXSPC:
+ case _CC_ENUM_SPACE:
while (hardcount < max
&& scan < loceol
&& (to_complement ^ cBOOL(isSPACE_utf8(scan))))
}
break;
+ case BOUNDL:
+ case NBOUNDL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ /* FALLTHROUGH */
case BOUND:
case BOUNDA:
- case BOUNDL:
case BOUNDU:
case EOS:
case GPOS:
case KEEPS:
case NBOUND:
case NBOUNDA:
- case NBOUNDL:
case NBOUNDU:
case OPFAIL:
case SBOL:
default:
Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
}
GET_RE_DEBUG_FLAGS_DECL;
DEBUG_EXECUTE_r({
SV * const prop = sv_newmortal();
- regprop(prog, prop, p, reginfo);
- PerlIO_printf(Perl_debug_log,
- "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
- REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
+ regprop(prog, prop, p, reginfo, NULL);
+ Perl_re_exec_indentf( aTHX_ "%s can match %"IVdf" times out of %"IVdf"...\n",
+ depth, SvPVX_const(prop),(IV)c,(IV)max);
});
});
*altsvp = NULL;
}
- return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL));
+ return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
}
-SV *
-Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
- const regnode* node,
- bool doinit,
- SV** listsvp,
- SV** only_utf8_locale_ptr)
-{
- /* For internal core use only.
- * Returns the swash for the input 'node' in the regex 'prog'.
- * If <doinit> is 'true', will attempt to create the swash if not already
- * done.
- * If <listsvp> is non-null, will return the printable contents of the
- * swash. This can be used to get debugging information even before the
- * swash exists, by calling this function with 'doinit' set to false, in
- * which case the components that will be used to eventually create the
- * swash are returned (in a printable form).
- * Tied intimately to how regcomp.c sets up the data structure */
-
- dVAR;
- SV *sw = NULL;
- SV *si = NULL; /* Input swash initialization string */
- SV* invlist = NULL;
-
- RXi_GET_DECL(prog,progi);
- const struct reg_data * const data = prog ? progi->data : NULL;
-
- PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
-
- assert(ANYOF_FLAGS(node)
- & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
-
- if (data && data->count) {
- const U32 n = ARG(node);
-
- if (data->what[n] == 's') {
- SV * const rv = MUTABLE_SV(data->data[n]);
- AV * const av = MUTABLE_AV(SvRV(rv));
- SV **const ary = AvARRAY(av);
- U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
-
- si = *ary; /* ary[0] = the string to initialize the swash with */
-
- /* Elements 3 and 4 are either both present or both absent. [3] is
- * any inversion list generated at compile time; [4] indicates if
- * that inversion list has any user-defined properties in it. */
- if (av_tindex(av) >= 2) {
- if (only_utf8_locale_ptr
- && ary[2]
- && ary[2] != &PL_sv_undef)
- {
- *only_utf8_locale_ptr = ary[2];
- }
- else {
- *only_utf8_locale_ptr = NULL;
- }
-
- if (av_tindex(av) >= 3) {
- invlist = ary[3];
- if (SvUV(ary[4])) {
- swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
- }
- }
- else {
- invlist = NULL;
- }
- }
-
- /* Element [1] is reserved for the set-up swash. If already there,
- * return it; if not, create it and store it there */
- if (ary[1] && SvROK(ary[1])) {
- sw = ary[1];
- }
- else if (doinit && ((si && si != &PL_sv_undef)
- || (invlist && invlist != &PL_sv_undef))) {
-
- sw = _core_swash_init("utf8", /* the utf8 package */
- "", /* nameless */
- si,
- 1, /* binary */
- 0, /* not from tr/// */
- invlist,
- &swash_init_flags);
- (void)av_store(av, 1, sw);
- }
- }
- }
-
- /* If requested, return a printable version of what this swash matches */
- if (listsvp) {
- SV* matches_string = newSVpvn("", 0);
-
- /* The swash should be used, if possible, to get the data, as it
- * contains the resolved data. But this function can be called at
- * compile-time, before everything gets resolved, in which case we
- * return the currently best available information, which is the string
- * that will eventually be used to do that resolving, 'si' */
- if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
- && (si && si != &PL_sv_undef))
- {
- sv_catsv(matches_string, si);
- }
-
- /* Add the inversion list to whatever we have. This may have come from
- * the swash, or from an input parameter */
- if (invlist) {
- sv_catsv(matches_string, _invlist_contents(invlist));
- }
- *listsvp = matches_string;
- }
-
- return sw;
-}
#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
/*
- reginclass - determine if a character falls into a character class
- n is the ANYOF regnode
+ n is the ANYOF-type regnode
p is the target string
p_end points to one byte beyond the end of the target string
utf8_target tells whether p is in UTF-8.
* UTF8_ALLOW_FFFF */
if (c_len == (STRLEN)-1)
Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
+ if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) {
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
+ }
}
/* If this character is potentially in the bitmap, check it */
- if (c < 256) {
+ if (c < NUM_ANYOF_CODE_POINTS) {
if (ANYOF_BITMAP_TEST(n, c))
match = TRUE;
- else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL
- && ! utf8_target
- && ! isASCII(c))
+ else if ((flags
+ & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
+ && OP(n) == ANYOFD
+ && ! utf8_target
+ && ! isASCII(c))
{
match = TRUE;
}
else if (flags & ANYOF_LOCALE_FLAGS) {
- if (flags & ANYOF_LOC_FOLD) {
- if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) {
- match = TRUE;
- }
+ if ((flags & ANYOFL_FOLD)
+ && c < 256
+ && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
+ {
+ match = TRUE;
}
- if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) {
+ else if (ANYOF_POSIXL_TEST_ANY_SET(n)
+ && c < 256
+ ) {
/* The data structure is arranged so bits 0, 2, 4, ... are set
* if the class includes the Posix character class given by
/* If the bitmap didn't (or couldn't) match, and something outside the
* bitmap could match, try that. */
if (!match) {
- if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) {
- match = TRUE; /* Everything above 255 matches */
+ if (c >= NUM_ANYOF_CODE_POINTS
+ && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
+ {
+ match = TRUE; /* Everything above the bitmap matches */
}
- else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
- || (utf8_target && (flags & ANYOF_UTF8))
- || ((flags & ANYOF_LOC_FOLD)
- && IN_UTF8_CTYPE_LOCALE
- && ARG(n) != ANYOF_NONBITMAP_EMPTY))
+ /* Here doesn't match everything above the bitmap. If there is
+ * some information available beyond the bitmap, we may find a
+ * match in it. If so, this is most likely because the code point
+ * is outside the bitmap range. But rarely, it could be because of
+ * some other reason. If so, various flags are set to indicate
+ * this possibility. On ANYOFD nodes, there may be matches that
+ * happen only when the target string is UTF-8; or for other node
+ * types, because runtime lookup is needed, regardless of the
+ * UTF-8ness of the target string. Finally, under /il, there may
+ * be some matches only possible if the locale is a UTF-8 one. */
+ else if ( ARG(n) != ANYOF_ONLY_HAS_BITMAP
+ && ( c >= NUM_ANYOF_CODE_POINTS
+ || ( (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
+ && ( UNLIKELY(OP(n) != ANYOFD)
+ || (utf8_target && ! isASCII_uni(c)
+# if NUM_ANYOF_CODE_POINTS > 256
+ && c < 256
+# endif
+ )))
+ || ( ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
+ && IN_UTF8_CTYPE_LOCALE)))
{
SV* only_utf8_locale = NULL;
SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
- &only_utf8_locale);
+ &only_utf8_locale, NULL);
if (sw) {
+ U8 utf8_buffer[2];
U8 * utf8_p;
if (utf8_target) {
utf8_p = (U8 *) p;
} else { /* Convert to utf8 */
- STRLEN len = 1;
- utf8_p = bytes_to_utf8(p, &len);
+ utf8_p = utf8_buffer;
+ append_utf8_from_native_byte(*p, &utf8_p);
+ utf8_p = utf8_buffer;
}
if (swash_fetch(sw, utf8_p, TRUE)) {
match = TRUE;
}
-
- /* If we allocated a string above, free it */
- if (! utf8_target) Safefree(utf8_p);
}
if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
match = _invlist_contains_cp(only_utf8_locale, c);
}
if (UNICODE_IS_SUPER(c)
- && (flags & ANYOF_WARN_SUPER)
+ && (flags
+ & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
+ && OP(n) != ANYOFD
&& ckWARN_d(WARN_NON_UNICODE))
{
Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
* 'off' >= 0, backwards if negative. But don't go outside of position
* 'lim', which better be < s if off < 0 */
- dVAR;
-
PERL_ARGS_ASSERT_REGHOP3;
if (off >= 0) {
if (UTF8_IS_CONTINUED(*s)) {
while (s > lim && UTF8_IS_CONTINUATION(*s))
s--;
+ if (! UTF8_IS_START(*s)) {
+ Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
+ }
}
/* XXX could check well-formedness here */
}
STATIC U8 *
S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
{
- dVAR;
-
PERL_ARGS_ASSERT_REGHOP4;
if (off >= 0) {
if (UTF8_IS_CONTINUED(*s)) {
while (s > llim && UTF8_IS_CONTINUATION(*s))
s--;
+ if (! UTF8_IS_START(*s)) {
+ Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
+ }
}
/* XXX could check well-formedness here */
}
* char pos */
STATIC U8 *
-S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
+S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
{
- dVAR;
-
PERL_ARGS_ASSERT_REGHOPMAYBE3;
if (off >= 0) {
if (UTF8_IS_CONTINUED(*s)) {
while (s > lim && UTF8_IS_CONTINUATION(*s))
s--;
+ if (! UTF8_IS_START(*s)) {
+ Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
+ }
}
/* XXX could check well-formedness here */
}
static void
S_cleanup_regmatch_info_aux(pTHX_ void *arg)
{
- dVAR;
regmatch_info_aux *aux = (regmatch_info_aux *) arg;
regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
regmatch_slab *s;
/* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
* on the converted value; returns FALSE if can't be converted. */
- dVAR;
int i = 1;
PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
}
/*
- * Local variables:
- * c-indentation-style: bsd
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- *
* ex: set ts=8 sts=4 sw=4 et:
*/