char *start; /* Start of input for compile */
char *end; /* End of input for compile */
char *parse; /* Input-scan pointer. */
- I32 whilem_seen; /* number of WHILEM in this expr */
+ SSize_t whilem_seen; /* number of WHILEM in this expr */
regnode *emit_start; /* Start of emitted-code area */
regnode *emit_bound; /* First regnode outside of the allocated space */
regnode *emit; /* Code-emit pointer; if = &emit_dummy,
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
U32 seen;
- I32 size; /* Code size. */
+ SSize_t size; /* Code size. */
I32 npar; /* Capture buffer count, (OPEN). */
I32 cpar; /* Capture buffer count, (CLOSE). */
I32 nestroot; /* root parens we are in - used by accept */
- max_offset
Only used for floating strings. This is the rightmost point that
- the string can appear at. If set to I32 max it indicates that the
+ the string can appear at. If set to SSize_t_MAX it indicates that the
string can occur infinitely far to the right.
- minlenp
typedef struct scan_data_t {
/*I32 len_min; unused */
/*I32 len_delta; unused */
- I32 pos_min;
- I32 pos_delta;
+ SSize_t pos_min;
+ SSize_t pos_delta;
SV *last_found;
- I32 last_end; /* min value, <0 unless valid. */
- I32 last_start_min;
- I32 last_start_max;
+ SSize_t last_end; /* min value, <0 unless valid. */
+ SSize_t last_start_min;
+ SSize_t last_start_max;
SV **longest; /* Either &l_fixed, or &l_float. */
SV *longest_fixed; /* longest fixed string found in pattern */
- I32 offset_fixed; /* offset where it starts */
- I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
+ SSize_t offset_fixed; /* offset where it starts */
+ SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
I32 lookbehind_fixed; /* is the position of the string modfied by LB */
SV *longest_float; /* longest floating string found in pattern */
- I32 offset_float_min; /* earliest point in string it can appear */
- I32 offset_float_max; /* latest point in string it can appear */
- I32 *minlen_float; /* pointer to the minlen relevant to the string */
- I32 lookbehind_float; /* is the position of the string modified by LB */
+ SSize_t offset_float_min; /* earliest point in string it can appear */
+ SSize_t offset_float_max; /* latest point in string it can appear */
+ SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
+ SSize_t lookbehind_float; /* is the pos of the string modified by LB */
I32 flags;
I32 whilem_c;
- I32 *last_closep;
+ SSize_t *last_closep;
struct regnode_charclass_class *start_class;
} scan_data_t;
+/* The below is perhaps overboard, but this allows us to save a test at the
+ * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
+ * and 'a' differ by a single bit; the same with the upper and lower case of
+ * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
+ * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
+ * then inverts it to form a mask, with just a single 0, in the bit position
+ * where the upper- and lowercase differ. XXX There are about 40 other
+ * instances in the Perl core where this micro-optimization could be used.
+ * Should decide if maintenance cost is worse, before changing those
+ *
+ * Returns a boolean as to whether or not 'v' is either a lowercase or
+ * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
+ * compile-time constant, the generated code is better than some optimizing
+ * compilers figure out, amounting to a mask and test. The results are
+ * meaningless if 'c' is not one of [A-Za-z] */
+#define isARG2_lower_or_UPPER_ARG1(c, v) \
+ (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
+
/*
* Forward declarations for pregcomp()'s friends.
*/
floating substrings if needed. */
STATIC void
-S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
+S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
+ SSize_t *minlenp, int is_inf)
{
const STRLEN l = CHR_SVLEN(data->last_found);
const STRLEN old_l = CHR_SVLEN(*data->longest);
data->offset_float_min = l ? data->last_start_min : data->pos_min;
data->offset_float_max = (l
? data->last_start_max
- : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta));
- if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
- data->offset_float_max = I32_MAX;
+ : (data->pos_delta == SSize_t_MAX
+ ? SSize_t_MAX
+ : data->pos_min + data->pos_delta));
+ if (is_inf
+ || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
+ data->offset_float_max = SSize_t_MAX;
if (data->flags & SF_BEFORE_EOL)
data->flags
|= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
255, which means that the union with cl should just be
what cl has in it, so can ignore this flag
ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
- is 127-255 to match them, but then invert that, so the
- union with cl should just be what cl has in it, so can
+ is (ASCII) 127-255 to match them, but then invert that, so
+ the union with cl should just be what cl has in it, so can
ignore this flag
*/
} else { /* 'or_with' is not inverted */
8: EXACT <baz>(10)
10: END(0)
- d = uvuni_to_utf8_flags(d, uv, 0);
+ d = uvchr_to_utf8_flags(d, uv, 0);
is the recommended Unicode-aware way of saying
if (UTF) { \
SV *zlopp = newSV(7); /* XXX: optimize me */ \
unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
- unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
+ unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
SvCUR_set(zlopp, kapow - flrbbbbb); \
SvPOK_on(zlopp); \
SvUTF8_on(zlopp); \
} \
} STMT_END
-#define TRIE_READ_CHAR STMT_START { \
- wordlen++; \
- if ( UTF ) { \
- /* if it is UTF then it is either already folded, or does not need folding */ \
- uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
- } \
- else if (folder == PL_fold_latin1) { \
- /* if we use this folder we have to obey unicode rules on latin-1 data */ \
- if ( foldlen > 0 ) { \
- uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
- foldlen -= len; \
- scan += len; \
- len = 0; \
- } else { \
- len = 1; \
- uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \
- skiplen = UNISKIP(uvc); \
- foldlen -= skiplen; \
- scan = foldbuf + skiplen; \
- } \
- } else { \
- /* raw data, will be folded later if needed */ \
- uvc = (U32)*uc; \
- len = 1; \
- } \
+/* This gets the next character from the input, folding it if not already
+ * folded. */
+#define TRIE_READ_CHAR STMT_START { \
+ wordlen++; \
+ if ( UTF ) { \
+ /* if it is UTF then it is either already folded, or does not need \
+ * folding */ \
+ uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
+ } \
+ else if (folder == PL_fold_latin1) { \
+ /* This folder implies Unicode rules, which in the range expressible \
+ * by not UTF is the lower case, with the two exceptions, one of \
+ * which should have been taken care of before calling this */ \
+ assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
+ uvc = toLOWER_L1(*uc); \
+ if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
+ len = 1; \
+ } else { \
+ /* raw data, will be folded later if needed */ \
+ uvc = (U32)*uc; \
+ len = 1; \
+ } \
} STMT_END
HV *widecharmap = NULL;
AV *revcharmap = newAV();
regnode *cur;
- const U32 uniflags = UTF8_ALLOW_DEFAULT;
STRLEN len = 0;
UV uvc = 0;
U16 curword = 0;
case EXACT: break;
case EXACTFA:
case EXACTFU_SS:
- case EXACTFU_TRICKYFOLD:
case EXACTFU: folder = PL_fold_latin1; break;
case EXACTF: folder = PL_fold; break;
case EXACTFL: folder = PL_fold_locale; break;
const U8 *uc = (U8*)STRING( noper );
const U8 *e = uc + STR_LEN( noper );
STRLEN foldlen = 0;
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
- STRLEN skiplen = 0;
- const U8 *scan = (U8*)NULL;
U32 wordlen = 0; /* required init */
- STRLEN chars = 0;
+ STRLEN minbytes = 0;
+ STRLEN maxbytes = 0;
bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
if (OP(noper) == NOTHING) {
regardless of encoding */
if (OP( noper ) == EXACTFU_SS) {
/* false positives are ok, so just set this */
- TRIE_BITMAP_SET(trie,0xDF);
+ TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
}
}
for ( ; uc < e ; uc += len ) {
TRIE_CHARCOUNT(trie)++;
TRIE_READ_CHAR;
- chars++;
+
+ /* Acummulate to the current values, the range in the number of
+ * bytes that this character could match. The max is presumed to
+ * be the same as the folded input (which TRIE_READ_CHAR returns),
+ * except that when this is not in UTF-8, it could be matched
+ * against a string which is UTF-8, and the variant characters
+ * could be 2 bytes instead of the 1 here. Likewise, for the
+ * minimum number of bytes when not folded. When folding, the min
+ * is assumed to be 1 byte could fold to match the single character
+ * here, or in the case of a multi-char fold, 1 byte can fold to
+ * the whole sequence. 'foldlen' is used to denote whether we are
+ * in such a sequence, skipping the min setting if so. XXX TODO
+ * Use the exact list of what folds to each character, from
+ * PL_utf8_foldclosures */
+ if (UTF) {
+ maxbytes += UTF8SKIP(uc);
+ if (! folder) {
+ /* A non-UTF-8 string could be 1 byte to match our 2 */
+ minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
+ ? 1
+ : UTF8SKIP(uc);
+ }
+ else {
+ if (foldlen) {
+ foldlen -= UTF8SKIP(uc);
+ }
+ else {
+ foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
+ minbytes++;
+ }
+ }
+ }
+ else {
+ maxbytes += (UNI_IS_INVARIANT(*uc))
+ ? 1
+ : 2;
+ if (! folder) {
+ minbytes++;
+ }
+ else {
+ if (foldlen) {
+ foldlen--;
+ }
+ else {
+ foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
+ minbytes++;
+ }
+ }
+ }
if ( uvc < 256 ) {
if ( folder ) {
U8 folded= folder[ (U8) uvc ];
if ( !UTF ) {
/* store first byte of utf8 representation of
variant codepoints */
- if (! UNI_IS_INVARIANT(uvc)) {
+ if (! NATIVE_IS_INVARIANT(uvc)) {
TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
}
}
}
}
if( cur == first ) {
- trie->minlen = chars;
- trie->maxlen = chars;
- } else if (chars < trie->minlen) {
- trie->minlen = chars;
- } else if (chars > trie->maxlen) {
- trie->maxlen = chars;
- }
- if (OP( noper ) == EXACTFU_SS) {
- /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
- if (trie->minlen > 1)
- trie->minlen= 1;
- }
- if (OP( noper ) == EXACTFU_TRICKYFOLD) {
- /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
- * - We assume that any such sequence might match a 2 byte string */
- if (trie->minlen > 2 )
- trie->minlen= 2;
+ trie->minlen = minbytes;
+ trie->maxlen = maxbytes;
+ } else if (minbytes < trie->minlen) {
+ trie->minlen = minbytes;
+ } else if (maxbytes > trie->maxlen) {
+ trie->maxlen = maxbytes;
}
-
} /* end first pass */
DEBUG_TRIE_COMPILE_r(
PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
const U8 *e = uc + STR_LEN( noper );
U32 state = 1; /* required init */
U16 charid = 0; /* sanity init */
- U8 *scan = (U8*)NULL; /* sanity init */
- STRLEN foldlen = 0; /* required init */
U32 wordlen = 0; /* required init */
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
- STRLEN skiplen = 0;
if (OP(noper) == NOTHING) {
regnode *noper_next= regnext(noper);
U16 charid = 0; /* sanity init */
U32 accept_state = 0; /* sanity init */
- U8 *scan = (U8*)NULL; /* sanity init */
- STRLEN foldlen = 0; /* required init */
U32 wordlen = 0; /* required init */
- STRLEN skiplen = 0;
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
if (OP(noper) == NOTHING) {
regnode *noper_next= regnext(noper);
* that is "sss".
*
* It turns out that there are problems with all multi-character folds, and not
- * just these three. Now the code is general, for all such cases, but the
- * three still have some special handling. The approach taken is:
+ * just these three. Now the code is general, for all such cases. The
+ * approach taken is:
* 1) This routine examines each EXACTFish node that could contain multi-
* character fold sequences. It returns in *min_subtract how much to
* subtract from the the actual length of the string to get a real minimum
* used by the caller to adjust the min length of the match, and the delta
* between min and max, so that the optimizer doesn't reject these
* possibilities based on size constraints.
- * 2) Certain of these sequences require special handling by the trie code,
- * so, if found, this code changes the joined node type to special ops:
- * EXACTFU_TRICKYFOLD and EXACTFU_SS.
- * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
+ * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
* is used for an EXACTFU node that contains at least one "ss" sequence in
* it. For non-UTF-8 patterns and strings, this is the only case where
* there is a possible fold length change. That means that a regular
* this file makes sure that in EXACTFU nodes, the sharp s gets folded to
* 'ss', even if the pattern isn't UTF-8. This avoids the issues
* described in the next item.
- * 4) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
+ * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
* pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
* UTF-8 pattern.) An assumption that the optimizer part of regexec.c
* (probably unwittingly, in Perl_regexec_flags()) makes is that a
* but in a non-UTF8 pattern, folding it to that above-Latin1 string would
* require the pattern to be forced into UTF-8, the overhead of which we
* want to avoid.)
- */
+ *
+ * Similarly, the code that generates tries doesn't currently handle
+ * not-already-folded multi-char folds, and it looks like a pain to change
+ * that. Therefore, trie generation of EXACTFA nodes with the sharp s
+ * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
+ * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
+ * using /iaa matching will be doing so almost entirely with ASCII
+ * strings, so this should rarely be encountered in practice */
#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
if (PL_regkind[OP(scan)] == EXACT) \
}
/* Nodes with 'ss' require special handling, except for EXACTFL
- * and EXACTFA for which there is no multi-char fold to this */
+ * and EXACTFA-ish for which there is no multi-char fold to
+ * this */
if (len == 2 && *s == 's' && *(s+1) == 's'
- && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
+ && OP(scan) != EXACTFL
+ && OP(scan) != EXACTFA
+ && OP(scan) != EXACTFA_NO_TRIE)
{
count = 2;
OP(scan) = EXACTFU_SS;
s += 2;
}
- else if (len == 6 /* len is the same in both ASCII and EBCDIC
- for these */
- && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
- COMBINING_DIAERESIS_UTF8
- COMBINING_ACUTE_ACCENT_UTF8,
- 6)
- || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
- COMBINING_DIAERESIS_UTF8
- COMBINING_ACUTE_ACCENT_UTF8,
- 6)))
- {
- count = 3;
-
- /* These two folds require special handling by trie's, so
- * change the node type to indicate this. If EXACTFA and
- * EXACTFL were ever to be handled by trie's, this would
- * have to be changed. If this node has already been
- * changed to EXACTFU_SS in this loop, leave it as is. (I
- * (khw) think it doesn't matter in regexec.c for UTF
- * patterns, but no need to change it */
- if (OP(scan) == EXACTFU) {
- OP(scan) = EXACTFU_TRICKYFOLD;
- }
- s += 6;
- }
else { /* Here is a generic multi-char fold. */
const U8* multi_end = s + len;
* test for them. The code that generates the
* is_MULTI_foo() macros croaks should one actually get put
* into Unicode .) */
- if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
+ if (OP(scan) != EXACTFL
+ && OP(scan) != EXACTFA
+ && OP(scan) != EXACTFA_NO_TRIE)
+ {
count = utf8_length(s, multi_end);
s = multi_end;
}
/* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
* fold to the ASCII range (and there are no existing ones in the
* upper latin1 range). But, as outlined in the comments preceding
- * this function, we need to flag any occurrences of the sharp s */
+ * this function, we need to flag any occurrences of the sharp s.
+ * This character forbids trie formation (because of added
+ * complexity) */
while (s < s_end) {
if (*s == LATIN_SMALL_LETTER_SHARP_S) {
+ OP(scan) = EXACTFA_NO_TRIE;
*has_exactf_sharp_s = TRUE;
break;
}
* have to find at least two characters for a multi-fold */
const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
- /* The below is perhaps overboard, but this allows us to save a
- * test each time through the loop at the expense of a mask. This
- * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
- * by a single bit. On ASCII they are 32 apart; on EBCDIC, they
- * are 64. This uses an exclusive 'or' to find that bit and then
- * inverts it to form a mask, with just a single 0, in the bit
- * position where 'S' and 's' differ. */
- const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
- const U8 s_masked = 's' & S_or_s_mask;
-
while (s < upper) {
int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
if (! len) { /* Not a multi-char fold. */
}
if (len == 2
- && ((*s & S_or_s_mask) == s_masked)
- && ((*(s+1) & S_or_s_mask) == s_masked))
+ && isARG2_lower_or_UPPER_ARG1('s', *s)
+ && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
{
/* EXACTF nodes need to know that the minimum length
#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
-STATIC I32
+STATIC SSize_t
S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
- I32 *minlenp, I32 *deltap,
+ SSize_t *minlenp, SSize_t *deltap,
regnode *last,
scan_data_t *data,
I32 stopparen,
/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
{
dVAR;
- I32 min = 0; /* There must be at least this number of characters to match */
+ /* There must be at least this number of characters to match */
+ SSize_t min = 0;
I32 pars = 0, code;
regnode *scan = *scanp, *next;
- I32 delta = 0;
+ SSize_t delta = 0;
int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
int is_inf_internal = 0; /* The studied chunk is infinite */
I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
scan_data_t data_fake;
SV *re_trie_maxbuff = NULL;
regnode *first_non_open = scan;
- I32 stopmin = I32_MAX;
+ SSize_t stopmin = SSize_t_MAX;
scan_frame *frame = NULL;
GET_RE_DEBUG_FLAGS_DECL;
/* NOTE - There is similar code to this block below for handling
TRIE nodes on a re-study. If you change stuff here check there
too. */
- I32 max1 = 0, min1 = I32_MAX, num = 0;
+ SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
struct regnode_charclass_class accum;
regnode * const startbranch=scan;
cl_init_zero(pRExC_state, &accum);
while (OP(scan) == code) {
- I32 deltanext, minnext, f = 0, fake;
+ SSize_t deltanext, minnext, fake;
+ I32 f = 0;
struct regnode_charclass_class this_class;
num++;
stopparen, recursed, NULL, f,depth+1);
if (min1 > minnext)
min1 = minnext;
- if (deltanext == I32_MAX) {
+ if (deltanext == SSize_t_MAX) {
is_inf = is_inf_internal = 1;
- max1 = I32_MAX;
+ max1 = SSize_t_MAX;
} else if (max1 < minnext + deltanext)
max1 = minnext + deltanext;
scan = next;
min1 = 0;
if (flags & SCF_DO_SUBSTR) {
data->pos_min += min1;
- if (data->pos_delta >= I32_MAX - (max1 - min1))
- data->pos_delta = I32_MAX;
+ if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
+ data->pos_delta = SSize_t_MAX;
else
data->pos_delta += max1 - min1;
if (max1 != min1 || is_inf)
data->longest = &(data->longest_float);
}
min += min1;
- if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0)
- delta = I32_MAX;
+ if (delta == SSize_t_MAX
+ || SSize_t_MAX - delta - (max1 - min1) < 0)
+ delta = SSize_t_MAX;
else
delta += max1 - min1;
if (flags & SCF_DO_STCLASS_OR) {
EXACT | EXACT
EXACTFU | EXACTFU
EXACTFU_SS | EXACTFU
- EXACTFU_TRICKYFOLD | EXACTFU
- EXACTFA | 0
+ EXACTFA | EXACTFA
*/
#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
( EXACT == (X) ) ? EXACT : \
- ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
+ ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
+ ( EXACTFA == (X) ) ? EXACTFA : \
0 )
/* dont use tail as the end marker for this traverse */
}
}
else if (OP(scan) == EXACT) {
- I32 l = STR_LEN(scan);
+ SSize_t l = STR_LEN(scan);
UV uc;
if (UTF) {
const U8 * const s = (U8*)STRING(scan);
if (data->last_end == -1) { /* Update the start info. */
data->last_start_min = data->pos_min;
data->last_start_max = is_inf
- ? I32_MAX : data->pos_min + data->pos_delta;
+ ? SSize_t_MAX : data->pos_min + data->pos_delta;
}
sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
if (UTF)
flags &= ~SCF_DO_STCLASS;
}
else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
- I32 l = STR_LEN(scan);
+ SSize_t l = STR_LEN(scan);
UV uc = *((U8*)STRING(scan));
/* Search for fixed substrings supports EXACT only. */
/* All other (EXACTFL handled above) folds except under
* /iaa that include s, S, and sharp_s also may include
* the others */
- if (OP(scan) != EXACTFA) {
+ if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE)
+ {
if (uc == 's' || uc == 'S') {
ANYOF_BITMAP_SET(data->start_class,
LATIN_SMALL_LETTER_SHARP_S);
/* All folds except under /iaa that include s, S,
* and sharp_s also may include the others */
- if (OP(scan) != EXACTFA) {
+ if (OP(scan) != EXACTFA
+ && OP(scan) != EXACTFA_NO_TRIE)
+ {
if (uc == 's' || uc == 'S') {
ANYOF_BITMAP_SET(data->start_class,
LATIN_SMALL_LETTER_SHARP_S);
flags &= ~SCF_DO_STCLASS;
}
else if (REGNODE_VARIES(OP(scan))) {
- I32 mincount, maxcount, minnext, deltanext, fl = 0;
- I32 f = flags, pos_before = 0;
+ SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
+ I32 fl = 0, f = flags;
regnode * const oscan = scan;
struct regnode_charclass_class this_class;
struct regnode_charclass_class *oclass = NULL;
}
min += minnext * mincount;
- is_inf_internal |= deltanext == I32_MAX
+ is_inf_internal |= deltanext == SSize_t_MAX
|| (maxcount == REG_INFTY && minnext + deltanext > 0);
is_inf |= is_inf_internal;
if (is_inf)
- delta = I32_MAX;
+ delta = SSize_t_MAX;
else
delta += (minnext + deltanext) * maxcount - minnext * mincount;
if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
#if defined(SPARC64_GCC_WORKAROUND)
- I32 b = 0;
+ SSize_t b = 0;
STRLEN l = 0;
const char *s = NULL;
- I32 old = 0;
+ SSize_t old = 0;
if (pos_before >= data->last_start_min)
b = pos_before;
old = b - data->last_start_min;
#else
- I32 b = pos_before >= data->last_start_min
+ SSize_t b = pos_before >= data->last_start_min
? pos_before : data->last_start_min;
STRLEN l;
const char * const s = SvPV_const(data->last_found, l);
- I32 old = b - data->last_start_min;
+ SSize_t old = b - data->last_start_min;
#endif
if (UTF)
} else {
/* start offset must point into the last copy */
data->last_start_min += minnext * (mincount - 1);
- data->last_start_max += is_inf ? I32_MAX
+ data->last_start_max += is_inf ? SSize_t_MAX
: (maxcount - 1) * (minnext + data->pos_delta);
}
}
/* It is counted once already... */
data->pos_min += minnext * (mincount - counted);
#if 0
-PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n",
- counted, deltanext, I32_MAX, minnext, maxcount, mincount);
-if (deltanext != I32_MAX)
-PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta);
+PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
+ " SSize_t_MAX=%"UVdf" minnext=%"UVdf
+ " maxcount=%"UVdf" mincount=%"UVdf"\n",
+ (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
+ (UV)mincount);
+if (deltanext != SSize_t_MAX)
+PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
+ (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
+ - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
#endif
- if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta)
- data->pos_delta = I32_MAX;
+ if (deltanext == SSize_t_MAX ||
+ -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
+ data->pos_delta = SSize_t_MAX;
else
data->pos_delta += - counted * deltanext +
(minnext + deltanext) * maxcount - minnext * mincount;
data->last_start_min =
data->pos_min - CHR_SVLEN(last_str);
data->last_start_max = is_inf
- ? I32_MAX
+ ? SSize_t_MAX
: data->pos_min + data->pos_delta
- CHR_SVLEN(last_str);
}
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
for (value = 0; value < loop_max; value++) {
- if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
- ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
+ if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
}
}
}
* in case it isn't a true locale-node. This will
* create false positives if it truly is locale */
for (value = 0; value < loop_max; value++) {
- if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
- ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
+ if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+ ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
}
}
}
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
for (value = 0; value < loop_max; value++) {
- if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
- ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
+ if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
}
}
}
* case it isn't a true locale-node. This will create
* false positives if it truly is locale */
for (value = 0; value < loop_max; value++) {
- if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
- ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
+ if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+ ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
}
}
if (PL_regkind[OP(scan)] == NPOSIXD) {
In this case we can't do fixed string optimisation.
*/
- I32 deltanext, minnext, fake = 0;
+ SSize_t deltanext, minnext, fake = 0;
regnode *nscan;
struct regnode_charclass_class intrnl;
int f = 0;
length of the pattern, something we won't know about
until after the recurse.
*/
- I32 deltanext, fake = 0;
+ SSize_t deltanext;
+ I32 fake = 0;
regnode *nscan;
struct regnode_charclass_class intrnl;
int f = 0;
have to worry about freeing them when we know
they wont be used, which would be a pain.
*/
- I32 *minnextp;
- Newx( minnextp, 1, I32 );
+ SSize_t *minnextp;
+ Newx( minnextp, 1, SSize_t );
SAVEFREEPV(minnextp);
if (data) {
{
if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
RExC_rx->extflags |= RXf_ANCH_GPOS;
- if (RExC_rx->gofs < (U32)min)
+ if (RExC_rx->gofs < (STRLEN)min)
RExC_rx->gofs = min;
} else {
RExC_rx->extflags |= RXf_GPOS_FLOAT;
regnode *trie_node= scan;
regnode *tail= regnext(scan);
reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
- I32 max1 = 0, min1 = I32_MAX;
+ SSize_t max1 = 0, min1 = SSize_t_MAX;
struct regnode_charclass_class accum;
if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
for ( word=1 ; word <= trie->wordcount ; word++)
{
- I32 deltanext=0, minnext=0, f = 0, fake;
+ SSize_t deltanext=0, minnext=0, f = 0, fake;
struct regnode_charclass_class this_class;
data_fake.flags = 0;
if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
nextbranch= regnext((regnode*)nextbranch);
- if (min1 > (I32)(minnext + trie->minlen))
+ if (min1 > (SSize_t)(minnext + trie->minlen))
min1 = minnext + trie->minlen;
- if (deltanext == I32_MAX) {
+ if (deltanext == SSize_t_MAX) {
is_inf = is_inf_internal = 1;
- max1 = I32_MAX;
- } else if (max1 < (I32)(minnext + deltanext + trie->maxlen))
+ max1 = SSize_t_MAX;
+ } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
max1 = minnext + deltanext + trie->maxlen;
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
DEBUG_STUDYDATA("pre-fin:",data,depth);
*scanp = scan;
- *deltap = is_inf_internal ? I32_MAX : delta;
+ *deltap = is_inf_internal ? SSize_t_MAX : delta;
if (flags & SCF_DO_SUBSTR && is_inf)
- data->pos_delta = I32_MAX - data->pos_min;
+ data->pos_delta = SSize_t_MAX - data->pos_min;
if (is_par > (I32)U8_MAX)
is_par = 0;
if (is_par && pars==1 && data) {
HV * const table = GvHV(PL_hintgv);
SV **ptr;
- if (!table)
+ if (!table || !(PL_hints & HINT_LOCALIZE_HH))
return &PL_core_reg_engine;
ptr = hv_fetchs(table, "regcomp", FALSE);
if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
Newx(dst, *plen_p * 2 + 1, U8);
while (s < *plen_p) {
- const UV uv = NATIVE_TO_ASCII(src[s]);
- if (UNI_IS_INVARIANT(uv))
- dst[d] = (U8)UTF_TO_NATIVE(uv);
+ if (NATIVE_IS_INVARIANT(src[s]))
+ dst[d] = src[s];
else {
- dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
- dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
+ dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
+ dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
}
if (n < num_code_blocks) {
if (!do_end && pRExC_state->code_blocks[n].start == s) {
STRLEN orig_patlen = 0;
bool code = 0;
SV *msv = use_delim ? delim : *svp;
+ if (!msv) msv = &PL_sv_undef;
/* if we've got a delimiter, we go round the loop twice for each
* svp slot (except the last), using the delimiter the second
* The code in this block is based on S_pushav() */
AV *const av = (AV*)msv;
- const I32 maxarg = AvFILL(av) + 1;
+ const SSize_t maxarg = AvFILL(av) + 1;
SV **array;
if (oplist) {
}
if (SvRMAGICAL(av)) {
- U32 i;
+ SSize_t i;
Newx(array, maxarg, SV*);
SAVEFREEPV(array);
- for (i=0; i < (U32)maxarg; i++) {
+ for (i=0; i < maxarg; i++) {
SV ** const svp = av_fetch(av, i, FALSE);
array[i] = svp ? *svp : &PL_sv_undef;
}
STATIC bool
-S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
+S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
+ SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
{
/* This is the common code for setting up the floating and fixed length
* string data extracted from Perl_re_op_compile() below. Returns a boolean
* as to whether succeeded or not */
- I32 t,ml;
+ I32 t;
+ SSize_t ml;
if (! (longest_length
|| (eol /* Can't have SEOL and MULTI */
follow this item. We calculate it ahead of time as once the
lookbehind offset is added in we lose the ability to correctly
calculate it.*/
- ml = minlen ? *(minlen) : (I32)longest_length;
+ ml = minlen ? *(minlen) : (SSize_t)longest_length;
*rx_end_shift = ml - offset
- longest_length + (SvTAIL(sv_longest) != 0)
+ lookbehind;
char *exp;
regnode *scan;
I32 flags;
- I32 minlen = 0;
+ SSize_t minlen = 0;
U32 rx_flags;
SV *pat;
SV *code_blocksv = NULL;
/* testing for BRANCH here tells us whether there is "must appear"
data in the pattern. If there is then we can use it for optimisations */
if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
- I32 fake;
+ SSize_t fake;
STRLEN longest_float_length, longest_fixed_length;
struct regnode_charclass_class ch_class; /* pointed to by data */
int stclass_flag;
- I32 last_close = 0; /* pointed to by data */
+ SSize_t last_close = 0; /* pointed to by data */
regnode *first= scan;
regnode *first_next= regnext(first);
/*
{
r->float_min_offset = data.offset_float_min - data.lookbehind_float;
r->float_max_offset = data.offset_float_max;
- if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
+ if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
r->float_max_offset -= data.lookbehind_float;
SvREFCNT_inc_simple_void_NN(data.longest_float);
}
}
else {
/* Several toplevels. Best we can is to set minlen. */
- I32 fake;
+ SSize_t fake;
struct regnode_charclass_class ch_class;
- I32 last_close = 0;
+ SSize_t last_close = 0;
DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
});
#ifdef RE_TRACK_PATTERN_OFFSETS
DEBUG_OFFSETS_r(if (ri->u.offsets) {
- const U32 len = ri->u.offsets[0];
- U32 i;
+ const STRLEN len = ri->u.offsets[0];
+ STRLEN i;
GET_RE_DEBUG_FLAGS_DECL;
PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
for (i = 1; i <= len; i++) {
{
SV *ret;
AV *av;
- I32 length;
+ SSize_t length;
struct regexp *const rx = ReANY(r);
PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
{
struct regexp *const rx = ReANY(r);
char *s = NULL;
- I32 i = 0;
- I32 s1, t1;
+ SSize_t i = 0;
+ SSize_t s1, t1;
I32 n = paren;
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
- if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
+ if ( n == RX_BUFF_IDX_CARET_PREMATCH
|| n == RX_BUFF_IDX_CARET_FULLMATCH
|| n == RX_BUFF_IDX_CARET_POSTMATCH
- )
- && !(rx->extflags & RXf_PMf_KEEPCOPY)
- )
- goto ret_undef;
+ )
+ {
+ bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
+ if (!keepcopy) {
+ /* on something like
+ * $r = qr/.../;
+ * /$qr/p;
+ * the KEEPCOPY is set on the PMOP rather than the regex */
+ if (PL_curpm && r == PM_GETRE(PL_curpm))
+ keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
+ }
+ if (!keepcopy)
+ goto ret_undef;
+ }
if (!rx->subbeg)
goto ret_undef;
}
assert(s >= rx->subbeg);
- assert(rx->sublen >= (s - rx->subbeg) + i );
+ assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
if (i >= 0) {
#if NO_TAINT_SUPPORT
sv_setpvn(sv, s, i);
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
+ if ( paren == RX_BUFF_IDX_CARET_PREMATCH
+ || paren == RX_BUFF_IDX_CARET_FULLMATCH
+ || paren == RX_BUFF_IDX_CARET_POSTMATCH
+ )
+ {
+ bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
+ if (!keepcopy) {
+ /* on something like
+ * $r = qr/.../;
+ * /$qr/p;
+ * the KEEPCOPY is set on the PMOP rather than the regex */
+ if (PL_curpm && r == PM_GETRE(PL_curpm))
+ keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
+ }
+ if (!keepcopy)
+ goto warn_undef;
+ }
+
/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
switch (paren) {
case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
- if (!(rx->extflags & RXf_PMf_KEEPCOPY))
- goto warn_undef;
- /*FALLTHROUGH*/
-
case RX_BUFF_IDX_PREMATCH: /* $` */
if (rx->offs[0].start != -1) {
i = rx->offs[0].start;
return 0;
case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
- if (!(rx->extflags & RXf_PMf_KEEPCOPY))
- goto warn_undef;
case RX_BUFF_IDX_POSTMATCH: /* $' */
if (rx->offs[0].end != -1) {
i = rx->sublen - rx->offs[0].end;
}
return 0;
- case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
- if (!(rx->extflags & RXf_PMf_KEEPCOPY))
- goto warn_undef;
- /*FALLTHROUGH*/
-
- /* $& / ${^MATCH}, $1, $2, ... */
- default:
+ default: /* $& / ${^MATCH}, $1, $2, ... */
if (paren <= (I32)rx->nparens &&
(s1 = rx->offs[paren].start) != -1 &&
(t1 = rx->offs[paren].end) != -1)
}
#endif
-#ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
+#ifndef PERL_IN_XSUB_RE
void
-Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
+Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
{
- /* Dumps out the ranges in an inversion list. The string 'header'
- * if present is output on a line before the first range */
+ /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
+ * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
+ * the string 'indent'. The output looks like this:
+ [0] 0x000A .. 0x000D
+ [2] 0x0085
+ [4] 0x2028 .. 0x2029
+ [6] 0x3104 .. INFINITY
+ * This means that the first range of code points matched by the list are
+ * 0xA through 0xD; the second range contains only the single code point
+ * 0x85, etc. An inversion list is an array of UVs. Two array elements
+ * are used to define each range (except if the final range extends to
+ * infinity, only a single element is needed). The array index of the
+ * first element for the corresponding range is given in brackets. */
UV start, end;
+ STRLEN count = 0;
PERL_ARGS_ASSERT__INVLIST_DUMP;
- if (header && strlen(header)) {
- PerlIO_printf(Perl_debug_log, "%s\n", header);
- }
if (invlist_is_iterating(invlist)) {
- PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
+ Perl_dump_indent(aTHX_ level, file,
+ "%sCan't dump inversion list because is in middle of iterating\n",
+ indent);
return;
}
invlist_iterinit(invlist);
while (invlist_iternext(invlist, &start, &end)) {
if (end == UV_MAX) {
- PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
+ Perl_dump_indent(aTHX_ level, file,
+ "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
+ indent, (UV)count, start);
}
else if (end != start) {
- PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
- start, end);
+ Perl_dump_indent(aTHX_ level, file,
+ "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
+ indent, (UV)count, start, end);
}
else {
- PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
+ Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
+ indent, (UV)count, start);
}
+ count += 2;
}
}
#endif
if (! len_passed_in) {
if (UTF) {
if (FOLD && (! LOC || code_point > 255)) {
- _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
+ _to_uni_fold_flags(code_point,
character,
&len,
FOLD_FLAGS_FULL | ((LOC)
defchar: {
STRLEN len = 0;
- UV ender;
+ UV ender = 0;
char *p;
char *s;
#define MAX_NODE_STRING_SIZE 127
char *s0;
U8 upper_parse = MAX_NODE_STRING_SIZE;
STRLEN foldlen;
- U8 node_type;
+ U8 node_type = compute_EXACTish(pRExC_state);
bool next_is_quantifier;
char * oldp = NULL;
+ /* We can convert EXACTF nodes to EXACTFU if they contain only
+ * characters that match identically regardless of the target
+ * string's UTF8ness. The reason to do this is that EXACTF is not
+ * trie-able, EXACTFU is. (We don't need to figure this out until
+ * pass 2) */
+ bool maybe_exactfu = node_type == EXACTF && PASS2;
+
/* If a folding node contains only code points that don't
* participate in folds, it can be changed into an EXACT node,
* which allows the optimizer more things to look for */
bool maybe_exact;
- ender = 0;
- node_type = compute_EXACTish(pRExC_state);
ret = reg_node(pRExC_state, node_type);
/* In pass1, folded, we use a temporary buffer instead of the
/* We do the EXACTFish to EXACT node only if folding, and not if in
* locale, as whether a character folds or not isn't known until
- * runtime */
- maybe_exact = FOLD && ! LOC;
+ * runtime. (And we don't need to figure this out until pass 2) */
+ maybe_exact = FOLD && ! LOC && PASS2;
/* XXX The node can hold up to 255 bytes, yet this only goes to
* 127. I (khw) do not know why. Keeping it somewhat less than
p++;
break;
case 'a':
- ender = ASCII_TO_NATIVE('\007');
+ ender = '\a';
p++;
break;
case 'o':
goto loopdone;
case '1': case '2': case '3':case '4':
case '5': case '6': case '7':
- /* When we parse backslash escapes there is ambiguity between
- * backreferences and octal escapes. Any escape from \1 - \9 is
- * a backreference, any multi-digit escape which does not start with
- * 0 and which when evaluated as decimal could refer to an already
- * parsed capture buffer is a backslash. Anything else is octal.
+ /* When we parse backslash escapes there is ambiguity
+ * between backreferences and octal escapes. Any escape
+ * from \1 - \9 is a backreference, any multi-digit
+ * escape which does not start with 0 and which when
+ * evaluated as decimal could refer to an already
+ * parsed capture buffer is a backslash. Anything else
+ * is octal.
*
- * Note this implies that \118 could be interpreted as 118 OR as
- * "\11" . "8" depending on whether there were 118 capture buffers
- * defined already in the pattern.
- */
+ * Note this implies that \118 could be interpreted as
+ * 118 OR as "\11" . "8" depending on whether there
+ * were 118 capture buffers defined already in the
+ * pattern. */
if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar )
{ /* Not to be treated as an octal constant, go
find backref */
REGC((char)ender, s++);
}
}
- else /* FOLD */
- if (! ( UTF
+ else /* FOLD */ if (! ( UTF
/* See comments for join_exact() as to why we fold this
* non-UTF at compile time */
|| (node_type == EXACTFU
&& ender == LATIN_SMALL_LETTER_SHARP_S)))
{
+ if (IS_IN_SOME_FOLD_L1(ender)) {
+ maybe_exact = FALSE;
+
+ /* See if the character's fold differs between /d and
+ * /u. This includes the multi-char fold SHARP S to
+ * 'ss' */
+ if (maybe_exactfu
+ && (PL_fold[ender] != PL_fold_latin1[ender]
+ || ender == LATIN_SMALL_LETTER_SHARP_S
+ || (len > 0
+ && isARG2_lower_or_UPPER_ARG1('s', ender)
+ && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
+ {
+ maybe_exactfu = FALSE;
+ }
+ }
*(s++) = (char) ender;
- maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
}
else { /* UTF */
* utf8. If we start to fold non-UTF patterns, be sure to
* update join_exact() */
if (LOC && ender < 256) {
- if (UNI_IS_INVARIANT(ender)) {
+ if (NATIVE_IS_INVARIANT(ender)) {
*s = (U8) ender;
foldlen = 1;
} else {
/* No Latin1 characters participate in multi-char
* folds under /l */
if (LOC
- || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
- *s, *(s+1))))
+ || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
+ *s, *(s+1))))
{
break;
}
* do any better */
if (len == 0) {
len = full_len;
+
+ /* If the node ends in an 's' we make sure it stays EXACTF,
+ * as if it turns into an EXACTFU, it could later get
+ * joined with another 's' that would then wrongly match
+ * the sharp s */
+ if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
+ {
+ maybe_exactfu = FALSE;
+ }
} else {
/* Here, the node does contain some characters that aren't
if (len == 0) {
OP(ret) = NOTHING;
}
- else{
-
- /* If 'maybe_exact' is still set here, means there are no
- * code points in the node that participate in folds */
- if (FOLD && maybe_exact) {
- OP(ret) = EXACT;
+ else {
+ if (FOLD) {
+ /* If 'maybe_exact' is still set here, means there are no
+ * code points in the node that participate in folds;
+ * similarly for 'maybe_exactfu' and code points that match
+ * differently depending on UTF8ness of the target string
+ * */
+ if (maybe_exact) {
+ OP(ret) = EXACT;
+ }
+ else if (maybe_exactfu) {
+ OP(ret) = EXACTFU;
+ }
}
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
}
regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
case we need to change the emitted regop to an EXACT. */
const char * orig_parse = RExC_parse;
- const I32 orig_size = RExC_size;
+ const SSize_t orig_size = RExC_size;
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGCLASS;
case 'f': value = '\f'; break;
case 'b': value = '\b'; break;
case 'e': value = ASCII_TO_NATIVE('\033');break;
- case 'a': value = ASCII_TO_NATIVE('\007');break;
+ case 'a': value = '\a'; break;
case 'o':
RExC_parse--; /* function expects to be pointed at the 'o' */
{
* included. literal_endpoint==2 means both ends of the range used
* a literal character, not \x{foo} */
if (literal_endpoint == 2
- && (prevvalue >= 'a' && value <= 'z')
- || (prevvalue >= 'A' && value <= 'Z'))
+ && ((prevvalue >= 'a' && value <= 'z')
+ || (prevvalue >= 'A' && value <= 'Z')))
{
+ _invlist_intersection(this_range, PL_ASCII,
+ &this_range);
_invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
&this_range);
}
/* If the folds haven't been read in, call a fold function
* to force that */
if (! PL_utf8_tofold) {
- U8 dummy[UTF8_MAXBYTES+1];
+ U8 dummy[UTF8_MAXBYTES_CASE+1];
/* This string is just a short named one above \xff */
to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
switch (OP(scan)) {
case EXACT:
case EXACTF:
+ case EXACTFA_NO_TRIE:
case EXACTFA:
case EXACTFU:
case EXACTFU_SS:
- case EXACTFU_TRICKYFOLD:
case EXACTFL:
if( exact == PSEUDO )
exact= OP(scan);
/* output information about the unicode matching */
if (flags & ANYOF_UNICODE_ALL)
sv_catpvs(sv, "{unicode_all}");
- else if (ANYOF_NONBITMAP(o))
- sv_catpvs(sv, "{unicode}");
- if (flags & ANYOF_NONBITMAP_NON_UTF8)
- sv_catpvs(sv, "{outside bitmap}");
-
- if (ANYOF_NONBITMAP(o)) {
- SV *lv; /* Set if there is something outside the bit map */
- SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
+ else if (ANYOF_NONBITMAP(o)) {
+ SV *lv; /* Set if there is something outside the bit map. */
bool byte_output = FALSE; /* If something in the bitmap has been
output */
- if (lv && lv != &PL_sv_undef) {
- if (sw) {
- int rangestart = -1;
- int i;
- U8 s[UTF8_MAXBYTES_CASE+1];
-
- for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
- uvchr_to_utf8(s, i);
-
- if (i < 256
- && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
- things already
- output as part
- of the bitmap */
- && swash_fetch(sw, s, TRUE))
- {
- if (rangestart == -1)
- rangestart = i;
- } else if (rangestart != -1) {
- byte_output = TRUE;
- if (i <= rangestart + 3)
- for (; rangestart < i; rangestart++) {
- put_byte(sv, rangestart);
- }
- else {
- put_byte(sv, rangestart);
- sv_catpvs(sv, "-");
- put_byte(sv, i-1);
- }
- rangestart = -1;
- }
- }
- }
+ if (flags & ANYOF_NONBITMAP_NON_UTF8) {
+ sv_catpvs(sv, "{outside bitmap}");
+ }
+ else {
+ sv_catpvs(sv, "{utf8}");
+ }
- {
- char *s = savesvpv(lv);
- char * const origs = s;
+ /* Get the stuff that wasn't in the bitmap */
+ (void) regclass_swash(prog, o, FALSE, &lv, NULL);
+ if (lv && lv != &PL_sv_undef) {
+ char *s = savesvpv(lv);
+ char * const origs = s;
- while (*s && *s != '\n')
- s++;
+ while (*s && *s != '\n')
+ s++;
- if (*s == '\n') {
- const char * const t = ++s;
+ if (*s == '\n') {
+ const char * const t = ++s;
- if (byte_output) {
- sv_catpvs(sv, " ");
- }
+ if (byte_output) {
+ sv_catpvs(sv, " ");
+ }
- while (*s) {
- if (*s == '\n') {
+ while (*s) {
+ if (*s == '\n') {
- /* Truncate very long output */
- if (s - origs > 256) {
- Perl_sv_catpvf(aTHX_ sv,
- "%.*s...",
- (int) (s - origs - 1),
- t);
- goto out_dump;
- }
- *s = ' ';
- }
- else if (*s == '\t') {
- *s = '-';
- }
- s++;
- }
- if (s[-1] == ' ')
- s[-1] = 0;
+ /* Truncate very long output */
+ if (s - origs > 256) {
+ Perl_sv_catpvf(aTHX_ sv,
+ "%.*s...",
+ (int) (s - origs - 1),
+ t);
+ goto out_dump;
+ }
+ *s = ' ';
+ }
+ else if (*s == '\t') {
+ *s = '-';
+ }
+ s++;
+ }
+ if (s[-1] == ' ')
+ s[-1] = 0;
- sv_catpv(sv, t);
- }
+ sv_catpv(sv, t);
+ }
- out_dump:
+ out_dump:
- Safefree(origs);
- }
+ Safefree(origs);
SvREFCNT_dec_NN(lv);
}
}