#include "dquote_static.c"
#include "charclass_invlists.h"
#include "inline_invlist.c"
-#include "utf8_strings.h"
+#include "unicode_constants.h"
+
+#ifdef HAS_ISBLANK
+# define hasISBLANK 1
+#else
+# define hasISBLANK 0
+#endif
#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
string can occur infinitely far to the right.
- minlenp
- A pointer to the minimum length of the pattern that the string
- was found inside. This is important as in the case of positive
+ A pointer to the minimum number of characters of the pattern that the
+ string was found inside. This is important as in the case of positive
lookahead or positive lookbehind we can have multiple patterns
involved. Consider
* these get optimized out
*
* If there are problematic code sequences, *min_subtract is set to the delta
- * that the minimum size of the node can be less than its actual size. And,
- * the node type of the result is changed to reflect that it contains these
- * sequences.
+ * number of characters that the minimum size of the node can be less than its
+ * actual size. And, the node type of the result is changed to reflect that it
+ * contains these sequences.
*
* And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
* and contains LATIN SMALL LETTER SHARP S
* U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
*
* This means that in case-insensitive matching (or "loose
- * matching", as Unicode calls it), an EXACTF of length six (the
- * UTF-8 encoded byte length of the above casefolded versions) can
- * match a target string of length two (the byte length of UTF-8
- * encoded U+0390 or U+03B0). This would rather mess up the
- * minimum length computation. (there are other code points that
- * also fold to these two sequences, but the delta is smaller)
+ * matching", as Unicode calls it), an EXACTF of length 3 chars can
+ * match a target string of length 1 char. This would rather mess
+ * up the minimum length computation.
*
* If these sequences are found, the minimum length is decreased by
- * four (six minus two).
+ * two.
*
* Similarly, 'ss' may match the single char and byte LATIN SMALL
* LETTER SHARP S. We decrease the min length by 1 for each
break;
}
greek_sequence:
- *min_subtract += 4;
+ *min_subtract += 2;
/* This requires special handling by trie's, so change
* the node type to indicate this. If EXACTFA and
/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
{
dVAR;
- I32 min = 0, pars = 0, code;
+ I32 min = 0; /* There must be at least this number of characters to match */
+ I32 pars = 0, code;
regnode *scan = *scanp, *next;
I32 delta = 0;
int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
fake_study_recurse:
while ( scan && OP(scan) != END && scan < last ){
- UV min_subtract = 0; /* How much to subtract from the minimum node
- length to get a real minimum (because the
- folded version may be shorter) */
+ UV min_subtract = 0; /* How many chars to subtract from the minimum
+ node length to get a real minimum (because
+ the folded version may be shorter) */
bool has_exactf_sharp_s = FALSE;
/* Peephole optimizer: */
DEBUG_STUDYDATA("Peep:", data,depth);
* trietype so we can turn them into a trie. If/when we
* allow NOTHING to start a trie sequence this condition will be
* required, and it isn't expensive so we leave it in for now. */
- if ( trietype != NOTHING )
+ if ( trietype && trietype != NOTHING )
make_trie( pRExC_state,
startbranch, first, cur, tail, count,
trietype, depth+1 );
"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
});
- if ( last ) {
+ if ( last && trietype ) {
if ( trietype != NOTHING ) {
/* the last branch of the sequence was part of a trie,
* so we have to construct it here outside of the loop
RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
}
min += l - min_subtract;
- if (min < 0) {
- min = 0;
- }
+ assert (min >= 0);
delta += min_subtract;
if (flags & SCF_DO_SUBSTR) {
data->pos_min += l - min_subtract;
case ALNUM:
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
+ ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
if (OP(scan) == ALNUMU) {
for (value = 0; value < 256; value++) {
if (!isWORDCHAR_L1(value)) {
}
else {
if (data->start_class->flags & ANYOF_LOCALE)
- ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
+ ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
/* Even if under locale, set the bits for non-locale
* in case it isn't a true locale-node. This will
case NALNUM:
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
+ ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
if (OP(scan) == NALNUMU) {
for (value = 0; value < 256; value++) {
if (isWORDCHAR_L1(value)) {
}
else {
if (data->start_class->flags & ANYOF_LOCALE)
- ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
+ ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
/* Even if under locale, set the bits for non-locale in
* case it isn't a true locale-node. This will create
else {
while (SvAMAGIC(msv)
&& (sv = AMG_CALLunary(msv, string_amg))
- && sv != msv)
- {
+ && sv != msv
+ && !( SvROK(msv)
+ && SvROK(sv)
+ && SvRV(msv) == SvRV(sv))
+ ) {
msv = sv;
SvGETMAGIC(msv);
}
#ifdef STUPID_PATTERN_CHECKS
if (RX_PRELEN(rx) == 0)
r->extflags |= RXf_NULL;
- if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
- /* XXX: this should happen BEFORE we compile */
- r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
- else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
+ if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
r->extflags |= RXf_WHITE;
else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
r->extflags |= RXf_START_ONLY;
#else
- if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
- /* XXX: this should happen BEFORE we compile */
- r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
- else {
+ {
regnode *first = ri->program + 1;
U8 fop = OP(first);
char *s = NULL;
I32 i = 0;
I32 s1, t1;
+ I32 n = paren;
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
- if (!rx->subbeg) {
- sv_setsv(sv,&PL_sv_undef);
- return;
- }
- else
- if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
- /* $` */
+ if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
+ || n == RX_BUFF_IDX_CARET_FULLMATCH
+ || n == RX_BUFF_IDX_CARET_POSTMATCH
+ )
+ && !(rx->extflags & RXf_PMf_KEEPCOPY)
+ )
+ goto ret_undef;
+
+ if (!rx->subbeg)
+ goto ret_undef;
+
+ if (n == RX_BUFF_IDX_CARET_FULLMATCH)
+ /* no need to distinguish between them any more */
+ n = RX_BUFF_IDX_FULLMATCH;
+
+ if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
+ && rx->offs[0].start != -1)
+ {
+ /* $`, ${^PREMATCH} */
i = rx->offs[0].start;
s = rx->subbeg;
}
else
- if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
- /* $' */
- s = rx->subbeg + rx->offs[0].end;
- i = rx->sublen - rx->offs[0].end;
+ if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
+ && rx->offs[0].end != -1)
+ {
+ /* $', ${^POSTMATCH} */
+ s = rx->subbeg - rx->suboffset + rx->offs[0].end;
+ i = rx->sublen + rx->suboffset - rx->offs[0].end;
}
else
- if ( 0 <= paren && paren <= (I32)rx->nparens &&
- (s1 = rx->offs[paren].start) != -1 &&
- (t1 = rx->offs[paren].end) != -1)
+ if ( 0 <= n && n <= (I32)rx->nparens &&
+ (s1 = rx->offs[n].start) != -1 &&
+ (t1 = rx->offs[n].end) != -1)
{
- /* $& $1 ... */
+ /* $&, ${^MATCH}, $1 ... */
i = t1 - s1;
- s = rx->subbeg + s1;
+ s = rx->subbeg + s1 - rx->suboffset;
} else {
- sv_setsv(sv,&PL_sv_undef);
- return;
+ goto ret_undef;
}
+
+ assert(s >= rx->subbeg);
assert(rx->sublen >= (s - rx->subbeg) + i );
if (i >= 0) {
const int oldtainted = PL_tainted;
SvTAINTED_off(sv);
}
} else {
+ ret_undef:
sv_setsv(sv,&PL_sv_undef);
return;
}
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
- switch (paren) {
- /* $` / ${^PREMATCH} */
- case RX_BUFF_IDX_PREMATCH:
+ switch (paren) {
+ case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
+ if (!(rx->extflags & RXf_PMf_KEEPCOPY))
+ goto warn_undef;
+ /*FALLTHROUGH*/
+
+ case RX_BUFF_IDX_PREMATCH: /* $` */
if (rx->offs[0].start != -1) {
i = rx->offs[0].start;
if (i > 0) {
}
}
return 0;
- /* $' / ${^POSTMATCH} */
- case RX_BUFF_IDX_POSTMATCH:
+
+ case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
+ if (!(rx->extflags & RXf_PMf_KEEPCOPY))
+ goto warn_undef;
+ case RX_BUFF_IDX_POSTMATCH: /* $' */
if (rx->offs[0].end != -1) {
i = rx->sublen - rx->offs[0].end;
if (i > 0) {
}
}
return 0;
+
+ case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
+ if (!(rx->extflags & RXf_PMf_KEEPCOPY))
+ goto warn_undef;
+ /*FALLTHROUGH*/
+
/* $& / ${^MATCH}, $1, $2, ... */
default:
if (paren <= (I32)rx->nparens &&
i = t1 - s1;
goto getlen;
} else {
+ warn_undef:
if (ckWARN(WARN_UNINITIALIZED))
report_uninit((const SV *)sv);
return 0;
}
getlen:
if (i > 0 && RXp_MATCH_UTF8(rx)) {
- const char * const s = rx->subbeg + s1;
+ const char * const s = rx->subbeg - rx->suboffset + s1;
const U8 *ep;
STRLEN el;
* list.)
* Taking the complement (inverting) an inversion list is quite simple, if the
* first element is 0, remove it; otherwise add a 0 element at the beginning.
- * This implementation reserves an element at the beginning of each inversion list
- * to contain 0 when the list contains 0, and contains 1 otherwise. The actual
- * beginning of the list is either that element if 0, or the next one if 1.
+ * This implementation reserves an element at the beginning of each inversion
+ * list to contain 0 when the list contains 0, and contains 1 otherwise. The
+ * actual beginning of the list is either that element if 0, or the next one if
+ * 1.
*
* More about inversion lists can be found in "Unicode Demystified"
* Chapter 13 by Richard Gillam, published by Addison-Wesley.
char *parse_start;
#endif
const char *maxpos = NULL;
+
+ /* Save the original in case we change the emitted regop to a FAIL. */
+ regnode * const orig_emit = RExC_emit;
+
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGPIECE;
vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
RExC_parse = next;
nextchar(pRExC_state);
+ if (max < min) { /* If can't match, warn and optimize to fail
+ unconditionally */
+ if (SIZE_ONLY) {
+ ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
+
+ /* We can't back off the size because we have to reserve
+ * enough space for all the things we are about to throw
+ * away, but we can shrink it by the amount we are about
+ * to re-use here */
+ RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
+ }
+ else {
+ RExC_emit = orig_emit;
+ }
+ ret = reg_node(pRExC_state, OPFAIL);
+ return ret;
+ }
do_curly:
if ((flags&SIMPLE)) {
*flagp = WORST;
if (max > 0)
*flagp |= HASWIDTH;
- if (max < min)
- vFAIL("Can't do {n,m} with n > m");
if (!SIZE_ONLY) {
ARG1_SET(ret, (U16)min);
ARG2_SET(ret, (U16)max);
switch (skip) {
case 4:
if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
- namedclass = ANYOF_ALNUM;
+ namedclass = ANYOF_WORDCHAR;
break;
case 5:
/* Names all of length 5. */
* A similar issue a little bit later when switching on
* namedclass. --jhi */
switch ((I32)value) {
- case 'w': namedclass = ANYOF_ALNUM; break;
- case 'W': namedclass = ANYOF_NALNUM; break;
+ case 'w': namedclass = ANYOF_WORDCHAR; break;
+ case 'W': namedclass = ANYOF_NWORDCHAR; break;
case 's': namedclass = ANYOF_SPACE; break;
case 'S': namedclass = ANYOF_NSPACE; break;
case 'd': namedclass = ANYOF_DIGIT; break;
runtime_posix_matches_above_Unicode);
break;
case ANYOF_ASCII:
+#ifdef HAS_ISASCII
if (LOC) {
ANYOF_CLASS_SET(ret, namedclass);
}
- else {
+ else
+#endif /* Not isascii(); just use the hard-coded definition for it */
_invlist_union(posixes, PL_ASCII, &posixes);
- }
break;
case ANYOF_NASCII:
+#ifdef HAS_ISASCII
if (LOC) {
ANYOF_CLASS_SET(ret, namedclass);
}
else {
+#endif
_invlist_union_complement_2nd(posixes,
PL_ASCII, &posixes);
if (DEPENDS_SEMANTICS) {
ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
}
+#ifdef HAS_ISASCII
}
+#endif
break;
case ANYOF_BLANK:
- DO_POSIX(ret, namedclass, posixes,
+ if (hasISBLANK || ! LOC) {
+ DO_POSIX(ret, namedclass, posixes,
PL_PosixBlank, PL_XPosixBlank);
+ }
+ else { /* There is no isblank() and we are in locale: We
+ use the ASCII range and the above-Latin1 range
+ code points */
+ SV* scratch_list = NULL;
+
+ /* Include all above-Latin1 blanks */
+ _invlist_intersection(PL_AboveLatin1,
+ PL_XPosixBlank,
+ &scratch_list);
+ /* Add it to the running total of posix classes */
+ if (! posixes) {
+ posixes = scratch_list;
+ }
+ else {
+ _invlist_union(posixes, scratch_list, &posixes);
+ SvREFCNT_dec(scratch_list);
+ }
+ /* Add the ASCII-range blanks to the running total. */
+ _invlist_union(posixes, PL_PosixBlank, &posixes);
+ }
break;
case ANYOF_NBLANK:
- DO_N_POSIX(ret, namedclass, posixes,
- PL_PosixBlank, PL_XPosixBlank);
+ if (hasISBLANK || ! LOC) {
+ DO_N_POSIX(ret, namedclass, posixes,
+ PL_PosixBlank, PL_XPosixBlank);
+ }
+ else { /* There is no isblank() and we are in locale */
+ SV* scratch_list = NULL;
+
+ /* Include all above-Latin1 non-blanks */
+ _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+
+ /* Add them to the running total of posix classes */
+ _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+ if (! posixes) {
+ posixes = scratch_list;
+ }
+ else {
+ _invlist_union(posixes, scratch_list, &posixes);
+ SvREFCNT_dec(scratch_list);
+ }
+
+ /* Get the list of all non-ASCII-blanks in Latin 1, and
+ * add them to the running total */
+ _invlist_subtract(PL_Latin1, PL_PosixBlank, &scratch_list);
+ _invlist_union(posixes, scratch_list, &posixes);
+ SvREFCNT_dec(scratch_list);
+ }
break;
case ANYOF_CNTRL:
DO_POSIX(ret, namedclass, posixes,
}
break;
}
- case ANYOF_ALNUM: /* Really is 'Word' */
+ case ANYOF_WORDCHAR:
DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
break;
- case ANYOF_NALNUM:
+ case ANYOF_NWORDCHAR:
DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
runtime_posix_matches_above_Unicode);
* modifier to the regex. We first calculate the base node
* type, and if it should be inverted */
- case ANYOF_NALNUM:
+ case ANYOF_NWORDCHAR:
invert = ! invert;
/* FALLTHROUGH */
- case ANYOF_ALNUM:
+ case ANYOF_WORDCHAR:
op = ALNUM;
goto join_charset_classes;
PL_reg_oldsaved = NULL;
PL_reg_oldsavedlen = 0;
+ PL_reg_oldsavedoffset = 0;
+ PL_reg_oldsavedcoffset = 0;
PL_reg_maxiter = 0;
PL_reg_leftiter = 0;
PL_reg_poscache = NULL;