const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
I32 p;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGCPPUSH;
{
UV i;
U32 paren;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGCPPOP;
RXi_GET_DECL(prog,progi);
regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
regmatch_info *const reginfo = &reginfo_buf;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_RE_INTUIT_START;
PERL_UNUSED_ARG(flags);
const U8* const str = (U8*)STRING(progi->regstclass);
/* XXX this value could be pre-computed */
- const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
+ const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
? (reginfo->is_utf8_pat
- ? utf8_distance(str + STR_LEN(progi->regstclass), str)
- : STR_LEN(progi->regstclass))
+ ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
+ : (SSize_t)STR_LEN(progi->regstclass))
: 1);
char * endpos;
char *s;
case ANYOFHr:
if (utf8_target) { /* Can't possibly match a non-UTF-8 target */
REXEC_FBC_CLASS_SCAN(TRUE,
- ( inRANGE((U8) NATIVE_UTF8_TO_I8(*s),
+ ( inRANGE(NATIVE_UTF8_TO_I8(*s),
LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
&& reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
}
break;
+ case ANYOFHs:
+ if (utf8_target) { /* Can't possibly match a non-UTF-8 target */
+ REXEC_FBC_CLASS_SCAN(TRUE,
+ ( strend -s >= FLAGS(c)
+ && memEQ(s, ((struct regnode_anyofhs *) c)->string, FLAGS(c))
+ && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
+ }
+ break;
+
+ case ANYOFR:
+ if (utf8_target) {
+ REXEC_FBC_CLASS_SCAN(TRUE,
+ ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
+ && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
+ (U8 *) strend,
+ NULL),
+ ANYOFRbase(c), ANYOFRdelta(c))));
+ }
+ else {
+ REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s,
+ ANYOFRbase(c), ANYOFRdelta(c)));
+ }
+ break;
+
+ case ANYOFRb:
+ if (utf8_target) {
+
+ /* We know what the first byte of any matched string should be */
+ U8 first_byte = FLAGS(c);
+
+ REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
+ withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
+ (U8 *) strend,
+ NULL),
+ ANYOFRbase(c), ANYOFRdelta(c)));
+ }
+ else {
+ REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s,
+ ANYOFRbase(c), ANYOFRdelta(c)));
+ }
+ break;
+
case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
assert(! is_utf8_pat);
/* FALLTHROUGH */
| FOLDEQ_S2_FOLDS_SANE;
goto do_exactf_utf8;
- case EXACTFU_ONLY8:
+ case EXACTFU_REQ8:
if (! utf8_target) {
break;
}
U8 *bitmap=NULL;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
/* We can't just allocate points here. We need to wrap it in
* an SV so it gets freed properly if there is a croak while
regmatch_info *const reginfo = &reginfo_buf;
regexp_paren_pair *swap = NULL;
I32 oldsave;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGEXEC_FLAGS;
PERL_UNUSED_ARG(data);
if (!startpos ||
((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
{
- DEBUG_r(Perl_re_printf( aTHX_
+ DEBUG_GPOS_r(Perl_re_printf( aTHX_
"fail: ganch-gofs before earliest possible start\n"));
return 0;
}
minlen = prog->minlen;
if ((startpos + minlen) > strend || startpos < strbeg) {
- DEBUG_r(Perl_re_printf( aTHX_
- "Regex match can't succeed, so not even tried\n"));
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
+ "Regex match can't succeed, so not even tried\n"));
return 0;
}
U32 depth = 0; /* used by REGCP_SET */
#endif
RXi_GET_DECL(prog,progi);
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGTRY;
U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
if ( OP(text_node) == EXACT
- || OP(text_node) == EXACT_ONLY8
+ || OP(text_node) == LEXACT
+ || OP(text_node) == EXACT_REQ8
+ || OP(text_node) == LEXACT_REQ8
|| OP(text_node) == EXACTL)
{
* copy the input to the output, avoiding finding the code point of
* that character */
if (!is_utf8_pat) {
- assert(OP(text_node) != EXACT_ONLY8);
+ assert( OP(text_node) != EXACT_REQ8
+ && OP(text_node) != LEXACT_REQ8);
c2 = c1 = *pat;
}
else if (utf8_target) {
Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
utf8_has_been_setup = TRUE;
}
- else if (OP(text_node) == EXACT_ONLY8) {
+ else if ( OP(text_node) == EXACT_REQ8
+ || OP(text_node) == LEXACT_REQ8)
+ {
return FALSE; /* Can only match UTF-8 target */
}
else {
}
}
else if (c1 > 255) {
- const unsigned int * remaining_folds;
- unsigned int first_fold;
+ const U32 * remaining_folds;
+ U32 first_fold;
/* Look up what code points (besides c1) fold to c1; e.g.,
* [ 'K', KELVIN_SIGN ] both fold to 'k'. */
case EXACTFU:
c2 = PL_fold_latin1[c1];
break;
- case EXACTFU_ONLY8:
+ case EXACTFU_REQ8:
return FALSE;
NOT_REACHED; /* NOTREACHED */
S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
{
/* returns a boolean indicating if there is a Grapheme Cluster Boundary
- * between the inputs. See http://www.unicode.org/reports/tr29/. */
+ * between the inputs. See https://www.unicode.org/reports/tr29/. */
PERL_ARGS_ASSERT_ISGCB;
}
while (prev == GCB_Extend);
- return prev != GCB_XPG_XX;
+ return prev != GCB_ExtPict_XX;
}
default:
const bool utf8_target)
{
/* returns a boolean indicating if there is a Sentence Boundary Break
- * between the inputs. See http://www.unicode.org/reports/tr29/ */
+ * between the inputs. See https://www.unicode.org/reports/tr29/ */
U8 * lpos = (U8 *) curpos;
bool has_para_sep = FALSE;
#endif
#ifdef DEBUGGING
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
#endif
/* protect against undef(*^R) */
}
#undef ST
+ case LEXACT_REQ8:
+ if (! utf8_target) {
+ sayNO;
+ }
+ /* FALLTHROUGH */
+
+ case LEXACT:
{
char *s;
+ s = STRINGl(scan);
+ ln = STR_LENl(scan);
+ goto join_short_long_exact;
+
case EXACTL: /* /abc/l */
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
_CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
}
goto do_exact;
- case EXACT_ONLY8:
+ case EXACT_REQ8:
if (! utf8_target) {
sayNO;
}
do_exact:
s = STRINGs(scan);
ln = STR_LENs(scan);
+
+ join_short_long_exact:
if (utf8_target != is_utf8_pat) {
/* The target and the pattern have differing utf8ness. */
char *l = locinput;
fold_array = PL_fold_latin1;
goto do_exactf;
- case EXACTFU_ONLY8: /* /abc/iu with something in /abc/ > 255 */
+ case EXACTFU_REQ8: /* /abc/iu with something in /abc/ > 255 */
if (! utf8_target) {
sayNO;
}
if (locinput == reginfo->strbeg)
b1 = isWORDCHAR_LC('\n');
else {
- b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
- (U8*)(reginfo->strbeg)),
- (U8*)(reginfo->strend));
+ U8 *p = reghop3((U8*)locinput, -1,
+ (U8*)(reginfo->strbeg));
+ b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
}
b2 = (NEXTCHR_IS_EOS)
? isWORDCHAR_LC('\n')
case TRADITIONAL_BOUND:
{
bool b1, b2;
- b1 = (locinput == reginfo->strbeg)
- ? 0 /* isWORDCHAR_L1('\n') */
- : isWORDCHAR_utf8_safe(
- reghop3((U8*)locinput,
- -1,
- (U8*)(reginfo->strbeg)),
- (U8*) reginfo->strend);
+ if (locinput == reginfo->strbeg) {
+ b1 = 0 /* isWORDCHAR_L1('\n') */;
+ }
+ else {
+ U8 *p = reghop3((U8*)locinput, -1,
+ (U8*)(reginfo->strbeg));
+
+ b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
+ }
b2 = (NEXTCHR_IS_EOS)
? 0 /* isWORDCHAR_L1('\n') */
: isWORDCHAR_utf8_safe((U8*)locinput,
case ANYOFH:
if ( ! utf8_target
|| NEXTCHR_IS_EOS
- || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8((U8) *locinput)
+ || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
|| ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
utf8_target))
{
goto increment_locinput;
break;
+ case ANYOFHs:
+ if ( ! utf8_target
+ || NEXTCHR_IS_EOS
+ || loceol - locinput < FLAGS(scan)
+ || memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
+ || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
+ utf8_target))
+ {
+ sayNO;
+ }
+ goto increment_locinput;
+ break;
+
+ case ANYOFR:
+ if (NEXTCHR_IS_EOS) {
+ sayNO;
+ }
+
+ if (utf8_target) {
+ if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
+ || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
+ (U8 *) reginfo->strend,
+ NULL),
+ ANYOFRbase(scan), ANYOFRdelta(scan)))
+ {
+ sayNO;
+ }
+ }
+ else {
+ if (! withinCOUNT((U8) *locinput,
+ ANYOFRbase(scan), ANYOFRdelta(scan)))
+ {
+ sayNO;
+ }
+ }
+ goto increment_locinput;
+ break;
+
+ case ANYOFRb:
+ if (NEXTCHR_IS_EOS) {
+ sayNO;
+ }
+
+ if (utf8_target) {
+ if ( ANYOF_FLAGS(scan) != (U8) *locinput
+ || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
+ (U8 *) reginfo->strend,
+ NULL),
+ ANYOFRbase(scan), ANYOFRdelta(scan)))
+ {
+ sayNO;
+ }
+ }
+ else {
+ if (! withinCOUNT((U8) *locinput,
+ ANYOFRbase(scan), ANYOFRdelta(scan)))
+ {
+ sayNO;
+ }
+ }
+ goto increment_locinput;
+ break;
+
/* The argument (FLAGS) to all the POSIX node types is the class number
* */
rex->recurse_locinput[arg]= locinput;
DEBUG_r({
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
DEBUG_STACK_r({
Perl_re_exec_indentf( aTHX_
"entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
/* NOTREACHED */
case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */
- if (cur_eval && cur_eval->locinput==locinput) {
+ if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
if ( ++nochange_depth > max_nochange_depth )
Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
} else {
scan = this_eol;
break;
+ case LEXACT_REQ8:
+ if (! utf8_target) {
+ break;
+ }
+ /* FALLTHROUGH */
+
+ case LEXACT:
{
U8 * string;
Size_t str_len;
+ string = (U8 *) STRINGl(p);
+ str_len = STR_LENl(p);
+ goto join_short_long_exact;
+
case EXACTL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
}
goto do_exact;
- case EXACT_ONLY8:
+ case EXACT_REQ8:
if (! utf8_target) {
break;
}
do_exact:
string = (U8 *) STRINGs(p);
str_len = STR_LENs(p);
+
+ join_short_long_exact:
assert(str_len == reginfo->is_utf8_pat ? UTF8SKIP(string) : 1);
c = *string;
| FOLDEQ_S2_FOLDS_SANE;
goto do_exactf;
- case EXACTFU_ONLY8:
+ case EXACTFU_REQ8:
if (! utf8_target) {
break;
}
if (utf8_target) { /* ANYOFH only can match UTF-8 targets */
while ( hardcount < max
&& scan < this_eol
- && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p)
+ && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
&& reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
{
scan += UTF8SKIP(scan);
if (utf8_target) { /* ANYOFH only can match UTF-8 targets */
while ( hardcount < max
&& scan < this_eol
- && inRANGE((U8) NATIVE_UTF8_TO_I8(*scan),
+ && inRANGE(NATIVE_UTF8_TO_I8(*scan),
LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
- && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p)
+ && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
&& reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
{
scan += UTF8SKIP(scan);
}
break;
+ case ANYOFHs:
+ if (utf8_target) { /* ANYOFH only can match UTF-8 targets */
+ while ( hardcount < max
+ && scan + FLAGS(p) < this_eol
+ && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
+ && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ break;
+
+ case ANYOFR:
+ if (utf8_target) {
+ while ( hardcount < max
+ && scan < this_eol
+ && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
+ && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
+ (U8 *) this_eol,
+ NULL),
+ ANYOFRbase(p), ANYOFRdelta(p)))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ else {
+ while ( hardcount < max
+ && scan < this_eol
+ && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
+ {
+ scan++;
+ hardcount++;
+ }
+ }
+ break;
+
+ case ANYOFRb:
+ if (utf8_target) {
+ while ( hardcount < max
+ && scan < this_eol
+ && (U8) *scan == ANYOF_FLAGS(p)
+ && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
+ (U8 *) this_eol,
+ NULL),
+ ANYOFRbase(p), ANYOFRdelta(p)))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ else {
+ while ( hardcount < max
+ && scan < this_eol
+ && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
+ {
+ scan++;
+ hardcount++;
+ }
+ }
+ break;
+
/* The argument (FLAGS) to all the POSIX node types is the class number */
case NPOSIXL:
*startposp = scan;
DEBUG_r({
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
DEBUG_EXECUTE_r({
SV * const prop = sv_newmortal();
regprop(prog, prop, p, reginfo, NULL);
S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
{
dVAR;
- const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHr))
+ const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
? 0
: ANYOF_FLAGS(n);
bool match = FALSE;
#ifndef PERL_IN_XSUB_RE
bool
-Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
+Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
{
/* Temporary helper function for toke.c. Verify that the code point 'cp'
* is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in
* the larger string bounded by 'strbeg' and 'strend'.
*
- * 'cp' needs to be assigned (if not a future version of the Unicode
+ * 'cp' needs to be assigned (if not, a future version of the Unicode
* Standard could make it something that combines with adjacent characters,
* so code using it would then break), and there has to be a GCB break
* before and after the character. */
GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
const U8 * prev_cp_start;
- PERL_ARGS_ASSERT__IS_GRAPHEME;
+ PERL_ARGS_ASSERT_IS_GRAPHEME;
if ( UNLIKELY(UNICODE_IS_SUPER(cp))
|| UNLIKELY(UNICODE_IS_NONCHAR(cp)))