#include "inline_invlist.c"
#include "unicode_constants.h"
+#ifdef HAS_ISBLANK /* hasISBLANK mirrors HAS_ISBLANK as a 0/1 value so it can be tested in an ordinary if () condition */
+# define hasISBLANK 1
+#else
+# define hasISBLANK 0
+#endif /* HAS_ISBLANK */
+
#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
string can occur infinitely far to the right.
- minlenp
- A pointer to the minimum length of the pattern that the string
- was found inside. This is important as in the case of positive
+ A pointer to the minimum number of characters of the pattern that the
+ string was found inside. This is important as in the case of positive
lookahead or positive lookbehind we can have multiple patterns
involved. Consider
* these get optimized out
*
* If there are problematic code sequences, *min_subtract is set to the delta
- * that the minimum size of the node can be less than its actual size. And,
- * the node type of the result is changed to reflect that it contains these
- * sequences.
+ * number of characters by which the minimum size of the node can be less
+ * than its actual size. And, the node type of the result is changed to
+ * reflect that it contains these sequences.
*
* And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
* and contains LATIN SMALL LETTER SHARP S
* U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
*
* This means that in case-insensitive matching (or "loose
- * matching", as Unicode calls it), an EXACTF of length six (the
- * UTF-8 encoded byte length of the above casefolded versions) can
- * match a target string of length two (the byte length of UTF-8
- * encoded U+0390 or U+03B0). This would rather mess up the
- * minimum length computation. (there are other code points that
- * also fold to these two sequences, but the delta is smaller)
+ * matching", as Unicode calls it), an EXACTF of length 3 chars can
+ * match a target string of length 1 char. This would rather mess
+ * up the minimum length computation.
*
* If these sequences are found, the minimum length is decreased by
- * four (six minus two).
+ * two.
*
* Similarly, 'ss' may match the single char and byte LATIN SMALL
* LETTER SHARP S. We decrease the min length by 1 for each
break;
}
greek_sequence:
- *min_subtract += 4;
+ *min_subtract += 2;
/* This requires special handling by trie's, so change
* the node type to indicate this. If EXACTFA and
/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
{
dVAR;
- I32 min = 0, pars = 0, code;
+ I32 min = 0; /* There must be at least this number of characters to match */
+ I32 pars = 0, code;
regnode *scan = *scanp, *next;
I32 delta = 0;
int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
fake_study_recurse:
while ( scan && OP(scan) != END && scan < last ){
- UV min_subtract = 0; /* How much to subtract from the minimum node
- length to get a real minimum (because the
- folded version may be shorter) */
+ UV min_subtract = 0; /* How many chars to subtract from the minimum
+ node length to get a real minimum (because
+ the folded version may be shorter) */
bool has_exactf_sharp_s = FALSE;
/* Peephole optimizer: */
DEBUG_STUDYDATA("Peep:", data,depth);
* trietype so we can turn them into a trie. If/when we
* allow NOTHING to start a trie sequence this condition will be
* required, and it isn't expensive so we leave it in for now. */
- if ( trietype != NOTHING )
+ if ( trietype && trietype != NOTHING )
make_trie( pRExC_state,
startbranch, first, cur, tail, count,
trietype, depth+1 );
"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
});
- if ( last ) {
+ if ( last && trietype ) {
if ( trietype != NOTHING ) {
/* the last branch of the sequence was part of a trie,
* so we have to construct it here outside of the loop
RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
}
min += l - min_subtract;
- if (min < 0) {
- min = 0;
- }
+ assert (min >= 0);
delta += min_subtract;
if (flags & SCF_DO_SUBSTR) {
data->pos_min += l - min_subtract;
case ALNUM:
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
+ ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
if (OP(scan) == ALNUMU) {
for (value = 0; value < 256; value++) {
if (!isWORDCHAR_L1(value)) {
}
else {
if (data->start_class->flags & ANYOF_LOCALE)
- ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
+ ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
/* Even if under locale, set the bits for non-locale
* in case it isn't a true locale-node. This will
case NALNUM:
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
+ ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
if (OP(scan) == NALNUMU) {
for (value = 0; value < 256; value++) {
if (isWORDCHAR_L1(value)) {
}
else {
if (data->start_class->flags & ANYOF_LOCALE)
- ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
+ ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
/* Even if under locale, set the bits for non-locale in
* case it isn't a true locale-node. This will create
switch (skip) {
case 4:
if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
- namedclass = ANYOF_ALNUM;
+ namedclass = ANYOF_WORDCHAR;
break;
case 5:
/* Names all of length 5. */
* A similar issue a little bit later when switching on
* namedclass. --jhi */
switch ((I32)value) {
- case 'w': namedclass = ANYOF_ALNUM; break;
- case 'W': namedclass = ANYOF_NALNUM; break;
+ case 'w': namedclass = ANYOF_WORDCHAR; break;
+ case 'W': namedclass = ANYOF_NWORDCHAR; break;
case 's': namedclass = ANYOF_SPACE; break;
case 'S': namedclass = ANYOF_NSPACE; break;
case 'd': namedclass = ANYOF_DIGIT; break;
runtime_posix_matches_above_Unicode);
break;
case ANYOF_ASCII:
+#ifdef HAS_ISASCII
if (LOC) {
ANYOF_CLASS_SET(ret, namedclass);
}
- else {
+ else
+#endif /* Not isascii(); just use the hard-coded definition for it */
_invlist_union(posixes, PL_ASCII, &posixes);
- }
break;
case ANYOF_NASCII:
+#ifdef HAS_ISASCII
if (LOC) {
ANYOF_CLASS_SET(ret, namedclass);
}
else {
+#endif
_invlist_union_complement_2nd(posixes,
PL_ASCII, &posixes);
if (DEPENDS_SEMANTICS) {
ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
}
+#ifdef HAS_ISASCII
}
+#endif
break;
case ANYOF_BLANK:
- DO_POSIX(ret, namedclass, posixes,
+ if (hasISBLANK || ! LOC) {
+ DO_POSIX(ret, namedclass, posixes,
PL_PosixBlank, PL_XPosixBlank);
+ }
+ else { /* There is no isblank() and we are in locale: We
+ use the ASCII range and the above-Latin1 range
+ code points */
+ SV* scratch_list = NULL;
+
+ /* Include all above-Latin1 blanks */
+ _invlist_intersection(PL_AboveLatin1,
+ PL_XPosixBlank,
+ &scratch_list);
+ /* Add it to the running total of posix classes */
+ if (! posixes) {
+ posixes = scratch_list;
+ }
+ else {
+ _invlist_union(posixes, scratch_list, &posixes);
+ SvREFCNT_dec(scratch_list);
+ }
+ /* Add the ASCII-range blanks to the running total. */
+ _invlist_union(posixes, PL_PosixBlank, &posixes);
+ }
break;
case ANYOF_NBLANK:
- DO_N_POSIX(ret, namedclass, posixes,
- PL_PosixBlank, PL_XPosixBlank);
+ if (hasISBLANK || ! LOC) {
+ DO_N_POSIX(ret, namedclass, posixes,
+ PL_PosixBlank, PL_XPosixBlank);
+ }
+ else { /* There is no isblank() and we are in locale */
+ SV* scratch_list = NULL;
+
+ /* Include all above-Latin1 non-blanks */
+ _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+
+ /* Add them to the running total of posix classes */
+ _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+ if (! posixes) {
+ posixes = scratch_list;
+ }
+ else {
+ _invlist_union(posixes, scratch_list, &posixes);
+ SvREFCNT_dec(scratch_list);
+ }
+
+ /* Get the list of all non-ASCII-blanks in Latin 1, and
+ * add them to the running total */
+ _invlist_subtract(PL_Latin1, PL_PosixBlank, &scratch_list);
+ _invlist_union(posixes, scratch_list, &posixes);
+ SvREFCNT_dec(scratch_list);
+ }
break;
case ANYOF_CNTRL:
DO_POSIX(ret, namedclass, posixes,
}
break;
}
- case ANYOF_ALNUM: /* Really is 'Word' */
+ case ANYOF_WORDCHAR:
DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
break;
- case ANYOF_NALNUM:
+ case ANYOF_NWORDCHAR:
DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
runtime_posix_matches_above_Unicode);
* modifier to the regex. We first calculate the base node
* type, and if it should be inverted */
- case ANYOF_NALNUM:
+ case ANYOF_NWORDCHAR:
invert = ! invert;
/* FALLTHROUGH */
- case ANYOF_ALNUM:
+ case ANYOF_WORDCHAR:
op = ALNUM;
goto join_charset_classes;