char *start; /* Start of input for compile */
char *end; /* End of input for compile */
char *parse; /* Input-scan pointer. */
- I32 whilem_seen; /* number of WHILEM in this expr */
+ SSize_t whilem_seen; /* number of WHILEM in this expr */
regnode *emit_start; /* Start of emitted-code area */
regnode *emit_bound; /* First regnode outside of the allocated space */
regnode *emit; /* Code-emit pointer; if = &emit_dummy,
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
U32 seen;
- I32 size; /* Code size. */
+ SSize_t size; /* Code size. */
I32 npar; /* Capture buffer count, (OPEN). */
I32 cpar; /* Capture buffer count, (CLOSE). */
I32 nestroot; /* root parens we are in - used by accept */
- max_offset
Only used for floating strings. This is the rightmost point that
- the string can appear at. If set to I32 max it indicates that the
+ the string can appear at. If set to SSize_t_MAX it indicates that the
string can occur infinitely far to the right.
- minlenp
typedef struct scan_data_t {
/*I32 len_min; unused */
/*I32 len_delta; unused */
- I32 pos_min;
- I32 pos_delta;
+ SSize_t pos_min;
+ SSize_t pos_delta;
SV *last_found;
- I32 last_end; /* min value, <0 unless valid. */
- I32 last_start_min;
- I32 last_start_max;
+ SSize_t last_end; /* min value, <0 unless valid. */
+ SSize_t last_start_min;
+ SSize_t last_start_max;
SV **longest; /* Either &l_fixed, or &l_float. */
SV *longest_fixed; /* longest fixed string found in pattern */
- I32 offset_fixed; /* offset where it starts */
- I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
+ SSize_t offset_fixed; /* offset where it starts */
+ SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
I32 lookbehind_fixed; /* is the position of the string modfied by LB */
SV *longest_float; /* longest floating string found in pattern */
- I32 offset_float_min; /* earliest point in string it can appear */
- I32 offset_float_max; /* latest point in string it can appear */
- I32 *minlen_float; /* pointer to the minlen relevant to the string */
- I32 lookbehind_float; /* is the position of the string modified by LB */
+ SSize_t offset_float_min; /* earliest point in string it can appear */
+ SSize_t offset_float_max; /* latest point in string it can appear */
+ SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
+ SSize_t lookbehind_float; /* is the pos of the string modified by LB */
I32 flags;
I32 whilem_c;
- I32 *last_closep;
+ SSize_t *last_closep;
struct regnode_charclass_class *start_class;
} scan_data_t;
+/* The below is perhaps overboard, but this allows us to save a test at the
+ * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
+ * and 'a' differ by a single bit; the same with the upper and lower case of
+ * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
+ * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
+ * then inverts it to form a mask, with just a single 0, in the bit position
+ * where the upper- and lowercase differ. XXX There are about 40 other
+ * instances in the Perl core where this micro-optimization could be used.
+ * Should decide if maintenance cost is worse, before changing those
+ *
+ * Returns a boolean as to whether or not 'v' is either a lowercase or
+ * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
+ * compile-time constant, the generated code is better than some optimizing
+ * compilers figure out, amounting to a mask and test. The results are
+ * meaningless if 'c' is not one of [A-Za-z] */
+#define isARG2_lower_or_UPPER_ARG1(c, v) \
+ (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
+
/*
* Forward declarations for pregcomp()'s friends.
*/
#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
#define SCF_SEEN_ACCEPT 0x8000
+#define SCF_TRIE_DOING_RESTUDY 0x10000
#define UTF cBOOL(RExC_utf8)
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
-#define ckWARN2regdep(loc,m, a1) STMT_START { \
+#define ckWARN2reg_d(loc,m, a1) STMT_START { \
const IV offset = loc - RExC_precomp; \
- Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
m REPORT_LOCATION, \
a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
floating substrings if needed. */
STATIC void
-S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
+S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
+ SSize_t *minlenp, int is_inf)
{
const STRLEN l = CHR_SVLEN(data->last_found);
const STRLEN old_l = CHR_SVLEN(*data->longest);
data->offset_float_min = l ? data->last_start_min : data->pos_min;
data->offset_float_max = (l
? data->last_start_max
- : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta));
- if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
- data->offset_float_max = I32_MAX;
+ : (data->pos_delta == SSize_t_MAX
+ ? SSize_t_MAX
+ : data->pos_min + data->pos_delta));
+ if (is_inf
+ || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
+ data->offset_float_max = SSize_t_MAX;
if (data->flags & SF_BEFORE_EOL)
data->flags
|= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
}
/* These two functions currently do the exact same thing */
-#define cl_init_zero S_cl_init
+#define cl_init_zero cl_init
/* 'AND' a given class with another one. Can create false positives. 'cl'
* should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
255, which means that the union with cl should just be
what cl has in it, so can ignore this flag
ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
- is 127-255 to match them, but then invert that, so the
- union with cl should just be what cl has in it, so can
+ is (ASCII) 127-255 to match them, but then invert that, so
+ the union with cl should just be what cl has in it, so can
ignore this flag
*/
} else { /* 'or_with' is not inverted */
8: EXACT <baz>(10)
10: END(0)
- d = uvuni_to_utf8_flags(d, uv, 0);
+ d = uvchr_to_utf8_flags(d, uv, 0);
is the recommended Unicode-aware way of saying
if (UTF) { \
SV *zlopp = newSV(7); /* XXX: optimize me */ \
unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
- unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
+ unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
SvCUR_set(zlopp, kapow - flrbbbbb); \
SvPOK_on(zlopp); \
SvUTF8_on(zlopp); \
} \
} STMT_END
-#define TRIE_READ_CHAR STMT_START { \
- wordlen++; \
- if ( UTF ) { \
- /* if it is UTF then it is either already folded, or does not need folding */ \
- uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
- } \
- else if (folder == PL_fold_latin1) { \
- /* if we use this folder we have to obey unicode rules on latin-1 data */ \
- if ( foldlen > 0 ) { \
- uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
- foldlen -= len; \
- scan += len; \
- len = 0; \
- } else { \
- len = 1; \
- uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \
- skiplen = UNISKIP(uvc); \
- foldlen -= skiplen; \
- scan = foldbuf + skiplen; \
- } \
- } else { \
- /* raw data, will be folded later if needed */ \
- uvc = (U32)*uc; \
- len = 1; \
- } \
+/* This gets the next character from the input, folding it if not already
+ * folded. */
+#define TRIE_READ_CHAR STMT_START { \
+ wordlen++; \
+ if ( UTF ) { \
+ /* if it is UTF then it is either already folded, or does not need \
+ * folding */ \
+ uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
+ } \
+ else if (folder == PL_fold_latin1) { \
+ /* This folder implies Unicode rules, which in the range expressible \
+ * by not UTF is the lower case, with the two exceptions, one of \
+ * which should have been taken care of before calling this */ \
+ assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
+ uvc = toLOWER_L1(*uc); \
+ if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
+ len = 1; \
+ } else { \
+ /* raw data, will be folded later if needed */ \
+ uvc = (U32)*uc; \
+ len = 1; \
+ } \
} STMT_END
case EXACT: break;
case EXACTFA:
case EXACTFU_SS:
- case EXACTFU_TRICKYFOLD:
case EXACTFU: folder = PL_fold_latin1; break;
case EXACTF: folder = PL_fold; break;
case EXACTFL: folder = PL_fold_locale; break;
const U8 *uc = (U8*)STRING( noper );
const U8 *e = uc + STR_LEN( noper );
STRLEN foldlen = 0;
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
- STRLEN skiplen = 0;
- const U8 *scan = (U8*)NULL;
U32 wordlen = 0; /* required init */
- STRLEN chars = 0;
+ STRLEN minbytes = 0;
+ STRLEN maxbytes = 0;
bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
if (OP(noper) == NOTHING) {
regardless of encoding */
if (OP( noper ) == EXACTFU_SS) {
/* false positives are ok, so just set this */
- TRIE_BITMAP_SET(trie,0xDF);
+ TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
}
}
for ( ; uc < e ; uc += len ) {
TRIE_CHARCOUNT(trie)++;
TRIE_READ_CHAR;
- chars++;
+
+ /* Acummulate to the current values, the range in the number of
+ * bytes that this character could match. The max is presumed to
+ * be the same as the folded input (which TRIE_READ_CHAR returns),
+ * except that when this is not in UTF-8, it could be matched
+ * against a string which is UTF-8, and the variant characters
+ * could be 2 bytes instead of the 1 here. Likewise, for the
+ * minimum number of bytes when not folded. When folding, the min
+ * is assumed to be 1 byte could fold to match the single character
+ * here, or in the case of a multi-char fold, 1 byte can fold to
+ * the whole sequence. 'foldlen' is used to denote whether we are
+ * in such a sequence, skipping the min setting if so. XXX TODO
+ * Use the exact list of what folds to each character, from
+ * PL_utf8_foldclosures */
+ if (UTF) {
+ maxbytes += UTF8SKIP(uc);
+ if (! folder) {
+ /* A non-UTF-8 string could be 1 byte to match our 2 */
+ minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
+ ? 1
+ : UTF8SKIP(uc);
+ }
+ else {
+ if (foldlen) {
+ foldlen -= UTF8SKIP(uc);
+ }
+ else {
+ foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
+ minbytes++;
+ }
+ }
+ }
+ else {
+ maxbytes += (UNI_IS_INVARIANT(*uc))
+ ? 1
+ : 2;
+ if (! folder) {
+ minbytes++;
+ }
+ else {
+ if (foldlen) {
+ foldlen--;
+ }
+ else {
+ foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
+ minbytes++;
+ }
+ }
+ }
if ( uvc < 256 ) {
if ( folder ) {
U8 folded= folder[ (U8) uvc ];
if ( !UTF ) {
/* store first byte of utf8 representation of
variant codepoints */
- if (! UNI_IS_INVARIANT(uvc)) {
+ if (! NATIVE_IS_INVARIANT(uvc)) {
TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
}
}
}
}
if( cur == first ) {
- trie->minlen = chars;
- trie->maxlen = chars;
- } else if (chars < trie->minlen) {
- trie->minlen = chars;
- } else if (chars > trie->maxlen) {
- trie->maxlen = chars;
- }
- if (OP( noper ) == EXACTFU_SS) {
- /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
- if (trie->minlen > 1)
- trie->minlen= 1;
- }
- if (OP( noper ) == EXACTFU_TRICKYFOLD) {
- /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
- * - We assume that any such sequence might match a 2 byte string */
- if (trie->minlen > 2 )
- trie->minlen= 2;
+ trie->minlen = minbytes;
+ trie->maxlen = maxbytes;
+ } else if (minbytes < trie->minlen) {
+ trie->minlen = minbytes;
+ } else if (maxbytes > trie->maxlen) {
+ trie->maxlen = maxbytes;
}
-
} /* end first pass */
DEBUG_TRIE_COMPILE_r(
PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
const U8 *e = uc + STR_LEN( noper );
U32 state = 1; /* required init */
U16 charid = 0; /* sanity init */
- U8 *scan = (U8*)NULL; /* sanity init */
- STRLEN foldlen = 0; /* required init */
U32 wordlen = 0; /* required init */
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
- STRLEN skiplen = 0;
if (OP(noper) == NOTHING) {
regnode *noper_next= regnext(noper);
U16 charid = 0; /* sanity init */
U32 accept_state = 0; /* sanity init */
- U8 *scan = (U8*)NULL; /* sanity init */
- STRLEN foldlen = 0; /* required init */
U32 wordlen = 0; /* required init */
- STRLEN skiplen = 0;
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
if (OP(noper) == NOTHING) {
regnode *noper_next= regnext(noper);
* that is "sss".
*
* It turns out that there are problems with all multi-character folds, and not
- * just these three. Now the code is general, for all such cases, but the
- * three still have some special handling. The approach taken is:
+ * just these three. Now the code is general, for all such cases. The
+ * approach taken is:
* 1) This routine examines each EXACTFish node that could contain multi-
* character fold sequences. It returns in *min_subtract how much to
* subtract from the the actual length of the string to get a real minimum
* used by the caller to adjust the min length of the match, and the delta
* between min and max, so that the optimizer doesn't reject these
* possibilities based on size constraints.
- * 2) Certain of these sequences require special handling by the trie code,
- * so, if found, this code changes the joined node type to special ops:
- * EXACTFU_TRICKYFOLD and EXACTFU_SS.
- * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
+ * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
* is used for an EXACTFU node that contains at least one "ss" sequence in
* it. For non-UTF-8 patterns and strings, this is the only case where
* there is a possible fold length change. That means that a regular
* this file makes sure that in EXACTFU nodes, the sharp s gets folded to
* 'ss', even if the pattern isn't UTF-8. This avoids the issues
* described in the next item.
- * 4) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
+ * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
* pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
* UTF-8 pattern.) An assumption that the optimizer part of regexec.c
* (probably unwittingly, in Perl_regexec_flags()) makes is that a
* but in a non-UTF8 pattern, folding it to that above-Latin1 string would
* require the pattern to be forced into UTF-8, the overhead of which we
* want to avoid.)
- */
+ *
+ * Similarly, the code that generates tries doesn't currently handle
+ * not-already-folded multi-char folds, and it looks like a pain to change
+ * that. Therefore, trie generation of EXACTFA nodes with the sharp s
+ * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
+ * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
+ * using /iaa matching will be doing so almost entirely with ASCII
+ * strings, so this should rarely be encountered in practice */
#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
if (PL_regkind[OP(scan)] == EXACT) \
}
/* Nodes with 'ss' require special handling, except for EXACTFL
- * and EXACTFA for which there is no multi-char fold to this */
+ * and EXACTFA-ish for which there is no multi-char fold to
+ * this */
if (len == 2 && *s == 's' && *(s+1) == 's'
- && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
+ && OP(scan) != EXACTFL
+ && OP(scan) != EXACTFA
+ && OP(scan) != EXACTFA_NO_TRIE)
{
count = 2;
OP(scan) = EXACTFU_SS;
s += 2;
}
- else if (len == 6 /* len is the same in both ASCII and EBCDIC
- for these */
- && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
- COMBINING_DIAERESIS_UTF8
- COMBINING_ACUTE_ACCENT_UTF8,
- 6)
- || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
- COMBINING_DIAERESIS_UTF8
- COMBINING_ACUTE_ACCENT_UTF8,
- 6)))
- {
- count = 3;
-
- /* These two folds require special handling by trie's, so
- * change the node type to indicate this. If EXACTFA and
- * EXACTFL were ever to be handled by trie's, this would
- * have to be changed. If this node has already been
- * changed to EXACTFU_SS in this loop, leave it as is. (I
- * (khw) think it doesn't matter in regexec.c for UTF
- * patterns, but no need to change it */
- if (OP(scan) == EXACTFU) {
- OP(scan) = EXACTFU_TRICKYFOLD;
- }
- s += 6;
- }
else { /* Here is a generic multi-char fold. */
const U8* multi_end = s + len;
* test for them. The code that generates the
* is_MULTI_foo() macros croaks should one actually get put
* into Unicode .) */
- if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
+ if (OP(scan) != EXACTFL
+ && OP(scan) != EXACTFA
+ && OP(scan) != EXACTFA_NO_TRIE)
+ {
count = utf8_length(s, multi_end);
s = multi_end;
}
/* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
* fold to the ASCII range (and there are no existing ones in the
* upper latin1 range). But, as outlined in the comments preceding
- * this function, we need to flag any occurrences of the sharp s */
+ * this function, we need to flag any occurrences of the sharp s.
+ * This character forbids trie formation (because of added
+ * complexity) */
while (s < s_end) {
if (*s == LATIN_SMALL_LETTER_SHARP_S) {
+ OP(scan) = EXACTFA_NO_TRIE;
*has_exactf_sharp_s = TRUE;
break;
}
* have to find at least two characters for a multi-fold */
const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
- /* The below is perhaps overboard, but this allows us to save a
- * test each time through the loop at the expense of a mask. This
- * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
- * by a single bit. On ASCII they are 32 apart; on EBCDIC, they
- * are 64. This uses an exclusive 'or' to find that bit and then
- * inverts it to form a mask, with just a single 0, in the bit
- * position where 'S' and 's' differ. */
- const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
- const U8 s_masked = 's' & S_or_s_mask;
-
while (s < upper) {
int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
if (! len) { /* Not a multi-char fold. */
}
if (len == 2
- && ((*s & S_or_s_mask) == s_masked)
- && ((*(s+1) & S_or_s_mask) == s_masked))
+ && isARG2_lower_or_UPPER_ARG1('s', *s)
+ && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
{
/* EXACTF nodes need to know that the minimum length
#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
-STATIC I32
+STATIC SSize_t
S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
- I32 *minlenp, I32 *deltap,
+ SSize_t *minlenp, SSize_t *deltap,
regnode *last,
scan_data_t *data,
I32 stopparen,
/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
{
dVAR;
- I32 min = 0; /* There must be at least this number of characters to match */
+ /* There must be at least this number of characters to match */
+ SSize_t min = 0;
I32 pars = 0, code;
regnode *scan = *scanp, *next;
- I32 delta = 0;
+ SSize_t delta = 0;
int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
int is_inf_internal = 0; /* The studied chunk is infinite */
I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
scan_data_t data_fake;
SV *re_trie_maxbuff = NULL;
regnode *first_non_open = scan;
- I32 stopmin = I32_MAX;
+ SSize_t stopmin = SSize_t_MAX;
scan_frame *frame = NULL;
GET_RE_DEBUG_FLAGS_DECL;
/* NOTE - There is similar code to this block below for handling
TRIE nodes on a re-study. If you change stuff here check there
too. */
- I32 max1 = 0, min1 = I32_MAX, num = 0;
+ SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
struct regnode_charclass_class accum;
regnode * const startbranch=scan;
cl_init_zero(pRExC_state, &accum);
while (OP(scan) == code) {
- I32 deltanext, minnext, f = 0, fake;
+ SSize_t deltanext, minnext, fake;
+ I32 f = 0;
struct regnode_charclass_class this_class;
num++;
stopparen, recursed, NULL, f,depth+1);
if (min1 > minnext)
min1 = minnext;
- if (deltanext == I32_MAX) {
+ if (deltanext == SSize_t_MAX) {
is_inf = is_inf_internal = 1;
- max1 = I32_MAX;
+ max1 = SSize_t_MAX;
} else if (max1 < minnext + deltanext)
max1 = minnext + deltanext;
scan = next;
min1 = 0;
if (flags & SCF_DO_SUBSTR) {
data->pos_min += min1;
- if (data->pos_delta >= I32_MAX - (max1 - min1))
- data->pos_delta = I32_MAX;
+ if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
+ data->pos_delta = SSize_t_MAX;
else
data->pos_delta += max1 - min1;
if (max1 != min1 || is_inf)
data->longest = &(data->longest_float);
}
min += min1;
- if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0)
- delta = I32_MAX;
+ if (delta == SSize_t_MAX
+ || SSize_t_MAX - delta - (max1 - min1) < 0)
+ delta = SSize_t_MAX;
else
delta += max1 - min1;
if (flags & SCF_DO_STCLASS_OR) {
EXACT | EXACT
EXACTFU | EXACTFU
EXACTFU_SS | EXACTFU
- EXACTFU_TRICKYFOLD | EXACTFU
- EXACTFA | 0
+ EXACTFA | EXACTFA
*/
#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
( EXACT == (X) ) ? EXACT : \
- ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
+ ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
+ ( EXACTFA == (X) ) ? EXACTFA : \
0 )
/* dont use tail as the end marker for this traverse */
}
}
else if (OP(scan) == EXACT) {
- I32 l = STR_LEN(scan);
+ SSize_t l = STR_LEN(scan);
UV uc;
if (UTF) {
const U8 * const s = (U8*)STRING(scan);
if (data->last_end == -1) { /* Update the start info. */
data->last_start_min = data->pos_min;
data->last_start_max = is_inf
- ? I32_MAX : data->pos_min + data->pos_delta;
+ ? SSize_t_MAX : data->pos_min + data->pos_delta;
}
sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
if (UTF)
flags &= ~SCF_DO_STCLASS;
}
else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
- I32 l = STR_LEN(scan);
+ SSize_t l = STR_LEN(scan);
UV uc = *((U8*)STRING(scan));
/* Search for fixed substrings supports EXACT only. */
/* All other (EXACTFL handled above) folds except under
* /iaa that include s, S, and sharp_s also may include
* the others */
- if (OP(scan) != EXACTFA) {
+ if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE)
+ {
if (uc == 's' || uc == 'S') {
ANYOF_BITMAP_SET(data->start_class,
LATIN_SMALL_LETTER_SHARP_S);
/* All folds except under /iaa that include s, S,
* and sharp_s also may include the others */
- if (OP(scan) != EXACTFA) {
+ if (OP(scan) != EXACTFA
+ && OP(scan) != EXACTFA_NO_TRIE)
+ {
if (uc == 's' || uc == 'S') {
ANYOF_BITMAP_SET(data->start_class,
LATIN_SMALL_LETTER_SHARP_S);
flags &= ~SCF_DO_STCLASS;
}
else if (REGNODE_VARIES(OP(scan))) {
- I32 mincount, maxcount, minnext, deltanext, fl = 0;
- I32 f = flags, pos_before = 0;
+ SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
+ I32 fl = 0, f = flags;
regnode * const oscan = scan;
struct regnode_charclass_class this_class;
struct regnode_charclass_class *oclass = NULL;
}
if (!scan) /* It was not CURLYX, but CURLY. */
scan = next;
- if ( /* ? quantifier ok, except for (?{ ... }) */
- (next_is_eval || !(mincount == 0 && maxcount == 1))
+ if (!(flags & SCF_TRIE_DOING_RESTUDY)
+ /* ? quantifier ok, except for (?{ ... }) */
+ && (next_is_eval || !(mincount == 0 && maxcount == 1))
&& (minnext == 0) && (deltanext == 0)
&& data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
&& maxcount <= REG_INFTY/3) /* Complement check for big count */
}
min += minnext * mincount;
- is_inf_internal |= deltanext == I32_MAX
+ is_inf_internal |= deltanext == SSize_t_MAX
|| (maxcount == REG_INFTY && minnext + deltanext > 0);
is_inf |= is_inf_internal;
if (is_inf)
- delta = I32_MAX;
+ delta = SSize_t_MAX;
else
delta += (minnext + deltanext) * maxcount - minnext * mincount;
if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
#if defined(SPARC64_GCC_WORKAROUND)
- I32 b = 0;
+ SSize_t b = 0;
STRLEN l = 0;
const char *s = NULL;
- I32 old = 0;
+ SSize_t old = 0;
if (pos_before >= data->last_start_min)
b = pos_before;
old = b - data->last_start_min;
#else
- I32 b = pos_before >= data->last_start_min
+ SSize_t b = pos_before >= data->last_start_min
? pos_before : data->last_start_min;
STRLEN l;
const char * const s = SvPV_const(data->last_found, l);
- I32 old = b - data->last_start_min;
+ SSize_t old = b - data->last_start_min;
#endif
if (UTF)
} else {
/* start offset must point into the last copy */
data->last_start_min += minnext * (mincount - 1);
- data->last_start_max += is_inf ? I32_MAX
+ data->last_start_max += is_inf ? SSize_t_MAX
: (maxcount - 1) * (minnext + data->pos_delta);
}
}
/* It is counted once already... */
data->pos_min += minnext * (mincount - counted);
#if 0
-PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n",
- counted, deltanext, I32_MAX, minnext, maxcount, mincount);
-if (deltanext != I32_MAX)
-PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta);
+PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
+ " SSize_t_MAX=%"UVdf" minnext=%"UVdf
+ " maxcount=%"UVdf" mincount=%"UVdf"\n",
+ (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
+ (UV)mincount);
+if (deltanext != SSize_t_MAX)
+PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
+ (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
+ - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
#endif
- if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta)
- data->pos_delta = I32_MAX;
+ if (deltanext == SSize_t_MAX ||
+ -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
+ data->pos_delta = SSize_t_MAX;
else
data->pos_delta += - counted * deltanext +
(minnext + deltanext) * maxcount - minnext * mincount;
data->last_start_min =
data->pos_min - CHR_SVLEN(last_str);
data->last_start_max = is_inf
- ? I32_MAX
+ ? SSize_t_MAX
: data->pos_min + data->pos_delta
- CHR_SVLEN(last_str);
}
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
for (value = 0; value < loop_max; value++) {
- if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
- ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
+ if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
}
}
}
* in case it isn't a true locale-node. This will
* create false positives if it truly is locale */
for (value = 0; value < loop_max; value++) {
- if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
- ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
+ if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+ ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
}
}
}
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
for (value = 0; value < loop_max; value++) {
- if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
- ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
+ if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
}
}
}
* case it isn't a true locale-node. This will create
* false positives if it truly is locale */
for (value = 0; value < loop_max; value++) {
- if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
- ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
+ if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+ ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
}
}
if (PL_regkind[OP(scan)] == NPOSIXD) {
In this case we can't do fixed string optimisation.
*/
- I32 deltanext, minnext, fake = 0;
+ SSize_t deltanext, minnext, fake = 0;
regnode *nscan;
struct regnode_charclass_class intrnl;
int f = 0;
length of the pattern, something we won't know about
until after the recurse.
*/
- I32 deltanext, fake = 0;
+ SSize_t deltanext;
+ I32 fake = 0;
regnode *nscan;
struct regnode_charclass_class intrnl;
int f = 0;
have to worry about freeing them when we know
they wont be used, which would be a pain.
*/
- I32 *minnextp;
- Newx( minnextp, 1, I32 );
+ SSize_t *minnextp;
+ Newx( minnextp, 1, SSize_t );
SAVEFREEPV(minnextp);
if (data) {
{
if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
RExC_rx->extflags |= RXf_ANCH_GPOS;
- if (RExC_rx->gofs < (U32)min)
+ if (RExC_rx->gofs < (STRLEN)min)
RExC_rx->gofs = min;
} else {
RExC_rx->extflags |= RXf_GPOS_FLOAT;
regnode *trie_node= scan;
regnode *tail= regnext(scan);
reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
- I32 max1 = 0, min1 = I32_MAX;
+ SSize_t max1 = 0, min1 = SSize_t_MAX;
struct regnode_charclass_class accum;
if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
for ( word=1 ; word <= trie->wordcount ; word++)
{
- I32 deltanext=0, minnext=0, f = 0, fake;
+ SSize_t deltanext=0, minnext=0, f = 0, fake;
struct regnode_charclass_class this_class;
data_fake.flags = 0;
if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
nextbranch= regnext((regnode*)nextbranch);
- if (min1 > (I32)(minnext + trie->minlen))
+ if (min1 > (SSize_t)(minnext + trie->minlen))
min1 = minnext + trie->minlen;
- if (deltanext == I32_MAX) {
+ if (deltanext == SSize_t_MAX) {
is_inf = is_inf_internal = 1;
- max1 = I32_MAX;
- } else if (max1 < (I32)(minnext + deltanext + trie->maxlen))
+ max1 = SSize_t_MAX;
+ } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
max1 = minnext + deltanext + trie->maxlen;
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
DEBUG_STUDYDATA("pre-fin:",data,depth);
*scanp = scan;
- *deltap = is_inf_internal ? I32_MAX : delta;
+ *deltap = is_inf_internal ? SSize_t_MAX : delta;
if (flags & SCF_DO_SUBSTR && is_inf)
- data->pos_delta = I32_MAX - data->pos_min;
+ data->pos_delta = SSize_t_MAX - data->pos_min;
if (is_par > (I32)U8_MAX)
is_par = 0;
if (is_par && pars==1 && data) {
Newx(dst, *plen_p * 2 + 1, U8);
while (s < *plen_p) {
- const UV uv = NATIVE_TO_ASCII(src[s]);
- if (UNI_IS_INVARIANT(uv))
- dst[d] = (U8)UTF_TO_NATIVE(uv);
+ if (NATIVE_IS_INVARIANT(src[s]))
+ dst[d] = src[s];
else {
- dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
- dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
+ dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
+ dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
}
if (n < num_code_blocks) {
if (!do_end && pRExC_state->code_blocks[n].start == s) {
STATIC bool
-S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
+S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
+ SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
{
/* This is the common code for setting up the floating and fixed length
* string data extracted from Perl_re_op_compile() below. Returns a boolean
* as to whether succeeded or not */
- I32 t,ml;
+ I32 t;
+ SSize_t ml;
if (! (longest_length
|| (eol /* Can't have SEOL and MULTI */
follow this item. We calculate it ahead of time as once the
lookbehind offset is added in we lose the ability to correctly
calculate it.*/
- ml = minlen ? *(minlen) : (I32)longest_length;
+ ml = minlen ? *(minlen) : (SSize_t)longest_length;
*rx_end_shift = ml - offset
- longest_length + (SvTAIL(sv_longest) != 0)
+ lookbehind;
char *exp;
regnode *scan;
I32 flags;
- I32 minlen = 0;
+ SSize_t minlen = 0;
U32 rx_flags;
SV *pat;
SV *code_blocksv = NULL;
I32 sawlookahead = 0;
I32 sawplus = 0;
I32 sawopen = 0;
+ I32 sawminmod = 0;
+
regex_charset initial_charset = get_regex_charset(orig_rx_flags);
bool recompile = 0;
bool runtime_code = 0;
}
reStudy:
- r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
+ r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
Zero(r->substrs, 1, struct reg_substr_data);
#ifdef TRIE_STUDY_OPT
/* testing for BRANCH here tells us whether there is "must appear"
data in the pattern. If there is then we can use it for optimisations */
if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
- I32 fake;
+ SSize_t fake;
STRLEN longest_float_length, longest_fixed_length;
struct regnode_charclass_class ch_class; /* pointed to by data */
int stclass_flag;
- I32 last_close = 0; /* pointed to by data */
+ SSize_t last_close = 0; /* pointed to by data */
regnode *first= scan;
regnode *first_next= regnext(first);
/*
* the only op that could be a regnode is PLUS, all the rest
* will be regnode_1 or regnode_2.
*
+ * (yves doesn't think this is true)
*/
if (OP(first) == PLUS)
sawplus = 1;
- else
+ else {
+ if (OP(first) == MINMOD)
+ sawminmod = 1;
first += regarglen[OP(first)];
-
+ }
first = NEXTOPER(first);
first_next= regnext(first);
}
first = NEXTOPER(first);
goto again;
}
- if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
+ if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
&& !pRExC_state->num_code_blocks) /* May examine pos and $& */
/* x+ must match at the 1st pos of run of x's */
r->intflags |= PREGf_SKIP;
minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
&data, -1, NULL, NULL,
- SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
+ SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
+ | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
+ 0);
CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
&& data.last_start_min == 0 && data.last_end > 0
&& !RExC_seen_zerolen
&& !(RExC_seen & REG_SEEN_VERBARG)
- && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
+ && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
r->extflags |= RXf_CHECK_ALL;
scan_commit(pRExC_state, &data,&minlen,0);
{
r->float_min_offset = data.offset_float_min - data.lookbehind_float;
r->float_max_offset = data.offset_float_max;
- if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
+ if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
r->float_max_offset -= data.lookbehind_float;
SvREFCNT_inc_simple_void_NN(data.longest_float);
}
r->check_offset_min = r->float_min_offset;
r->check_offset_max = r->float_max_offset;
}
- /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
- This should be changed ASAP! */
- if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
+ if ((r->check_substr || r->check_utf8) ) {
r->extflags |= RXf_USE_INTUIT;
if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
r->extflags |= RXf_INTUIT_TAIL;
}
else {
/* Several toplevels. Best we can is to set minlen. */
- I32 fake;
+ SSize_t fake;
struct regnode_charclass_class ch_class;
- I32 last_close = 0;
+ SSize_t last_close = 0;
DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
- &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
+ &data, -1, NULL, NULL,
+ SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
+ |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
+ 0);
CHECK_RESTUDY_GOTO_butfirst(NOOP);
});
#ifdef RE_TRACK_PATTERN_OFFSETS
DEBUG_OFFSETS_r(if (ri->u.offsets) {
- const U32 len = ri->u.offsets[0];
- U32 i;
+ const STRLEN len = ri->u.offsets[0];
+ STRLEN i;
GET_RE_DEBUG_FLAGS_DECL;
PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
for (i = 1; i <= len; i++) {
{
SV *ret;
AV *av;
- I32 length;
+ SSize_t length;
struct regexp *const rx = ReANY(r);
PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
{
struct regexp *const rx = ReANY(r);
char *s = NULL;
- I32 i = 0;
- I32 s1, t1;
+ SSize_t i = 0;
+ SSize_t s1, t1;
I32 n = paren;
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
- if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
+ if ( n == RX_BUFF_IDX_CARET_PREMATCH
|| n == RX_BUFF_IDX_CARET_FULLMATCH
|| n == RX_BUFF_IDX_CARET_POSTMATCH
- )
- && !(rx->extflags & RXf_PMf_KEEPCOPY)
- )
- goto ret_undef;
+ )
+ {
+ bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
+ if (!keepcopy) {
+ /* on something like
+ * $r = qr/.../;
+ * /$qr/p;
+ * the KEEPCOPY is set on the PMOP rather than the regex */
+ if (PL_curpm && r == PM_GETRE(PL_curpm))
+ keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
+ }
+ if (!keepcopy)
+ goto ret_undef;
+ }
if (!rx->subbeg)
goto ret_undef;
}
assert(s >= rx->subbeg);
- assert(rx->sublen >= (s - rx->subbeg) + i );
+ assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
if (i >= 0) {
#if NO_TAINT_SUPPORT
sv_setpvn(sv, s, i);
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
+ if ( paren == RX_BUFF_IDX_CARET_PREMATCH
+ || paren == RX_BUFF_IDX_CARET_FULLMATCH
+ || paren == RX_BUFF_IDX_CARET_POSTMATCH
+ )
+ {
+ bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
+ if (!keepcopy) {
+ /* on something like
+ * $r = qr/.../;
+ * /$qr/p;
+ * the KEEPCOPY is set on the PMOP rather than the regex */
+ if (PL_curpm && r == PM_GETRE(PL_curpm))
+ keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
+ }
+ if (!keepcopy)
+ goto warn_undef;
+ }
+
/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
switch (paren) {
case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
- if (!(rx->extflags & RXf_PMf_KEEPCOPY))
- goto warn_undef;
- /*FALLTHROUGH*/
-
case RX_BUFF_IDX_PREMATCH: /* $` */
if (rx->offs[0].start != -1) {
i = rx->offs[0].start;
return 0;
case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
- if (!(rx->extflags & RXf_PMf_KEEPCOPY))
- goto warn_undef;
case RX_BUFF_IDX_POSTMATCH: /* $' */
if (rx->offs[0].end != -1) {
i = rx->sublen - rx->offs[0].end;
}
return 0;
- case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
- if (!(rx->extflags & RXf_PMf_KEEPCOPY))
- goto warn_undef;
- /*FALLTHROUGH*/
-
- /* $& / ${^MATCH}, $1, $2, ... */
- default:
+ default: /* $& / ${^MATCH}, $1, $2, ... */
if (paren <= (I32)rx->nparens &&
(s1 = rx->offs[paren].start) != -1 &&
(t1 = rx->offs[paren].end) != -1)
/* This section of code defines the inversion list object and its methods. The
* interfaces are highly subject to change, so as much as possible is static to
* this file. An inversion list is here implemented as a malloc'd C UV array
- * with some added info that is placed as UVs at the beginning in a header
- * portion. An inversion list for Unicode is an array of code points, sorted
- * by ordinal number. The zeroth element is the first code point in the list.
- * The 1th element is the first element beyond that not in the list. In other
- * words, the first range is
+ * as an SVt_INVLIST scalar.
+ *
+ * An inversion list for Unicode is an array of code points, sorted by ordinal
+ * number. The zeroth element is the first code point in the list. The 1th
+ * element is the first element beyond that not in the list. In other words,
+ * the first range is
* invlist[0]..(invlist[1]-1)
* The other ranges follow. Thus every element whose index is divisible by two
* marks the beginning of a range that is in the list, and every element not
* Taking the complement (inverting) an inversion list is quite simple, if the
* first element is 0, remove it; otherwise add a 0 element at the beginning.
* This implementation reserves an element at the beginning of each inversion
- * list to contain 0 when the list contains 0, and contains 1 otherwise. The
- * actual beginning of the list is either that element if 0, or the next one if
- * 1.
+ * list to always contain 0; there is an additional flag in the header which
+ * indicates if the list begins at the 0, or is offset to begin at the next
+ * element.
*
* More about inversion lists can be found in "Unicode Demystified"
* Chapter 13 by Richard Gillam, published by Addison-Wesley.
* should eventually be made public */
/* The header definitions are in F<inline_invlist.c> */
-#define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
-#define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH)
-
-#define INVLIST_INITIAL_LEN 10
PERL_STATIC_INLINE UV*
S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
{
/* Returns a pointer to the first element in the inversion list's array.
* This is called upon initialization of an inversion list. Where the
- * array begins depends on whether the list has the code point U+0000
- * in it or not. The other parameter tells it whether the code that
- * follows this call is about to put a 0 in the inversion list or not.
- * The first element is either the element with 0, if 0, or the next one,
- * if 1 */
+ * array begins depends on whether the list has the code point U+0000 in it
+ * or not. The other parameter tells it whether the code that follows this
+ * call is about to put a 0 in the inversion list or not. The first
+ * element is either the element reserved for 0, if TRUE, or the element
+ * after it, if FALSE */
- UV* zero = get_invlist_zero_addr(invlist);
+ bool* offset = get_invlist_offset_addr(invlist);
+ UV* zero_addr = (UV *) SvPVX(invlist);
PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
/* Must be empty */
- assert(! *_get_invlist_len_addr(invlist));
+ assert(! _invlist_len(invlist));
+
+ *zero_addr = 0;
/* 1^1 = 0; 1^0 = 1 */
- *zero = 1 ^ will_have_0;
- return zero + *zero;
+ *offset = 1 ^ will_have_0;
+ return zero_addr + *offset;
}
PERL_STATIC_INLINE UV*
/* Must not be empty. If these fail, you probably didn't check for <len>
* being non-zero before trying to get the array */
- assert(*_get_invlist_len_addr(invlist));
- assert(*get_invlist_zero_addr(invlist) == 0
- || *get_invlist_zero_addr(invlist) == 1);
-
- /* The array begins either at the element reserved for zero if the
- * list contains 0 (that element will be set to 0), or otherwise the next
- * element (in which case the reserved element will be set to 1). */
- return (UV *) (get_invlist_zero_addr(invlist)
- + *get_invlist_zero_addr(invlist));
+ assert(_invlist_len(invlist));
+
+ /* The very first element always contains zero, The array begins either
+ * there, or if the inversion list is offset, at the element after it.
+ * The offset header field determines which; it contains 0 or 1 to indicate
+ * how much additionally to add */
+ assert(0 == *(SvPVX(invlist)));
+ return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
}
PERL_STATIC_INLINE void
-S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
+S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
{
- /* Sets the current number of elements stored in the inversion list */
+ /* Sets the current number of elements stored in the inversion list.
+ * Updates SvCUR correspondingly */
PERL_ARGS_ASSERT_INVLIST_SET_LEN;
- *_get_invlist_len_addr(invlist) = len;
-
- assert(len <= SvLEN(invlist));
-
- SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
- /* If the list contains U+0000, that element is part of the header,
- * and should not be counted as part of the array. It will contain
- * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
- * subtract:
- * SvCUR_set(invlist,
- * TO_INTERNAL_SIZE(len
- * - (*get_invlist_zero_addr(inv_list) ^ 1)));
- * But, this is only valid if len is not 0. The consequences of not doing
- * this is that the memory allocation code may think that 1 more UV is
- * being used than actually is, and so might do an unnecessary grow. That
- * seems worth not bothering to make this the precise amount.
- *
- * Note that when inverting, SvCUR shouldn't change */
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
+ SvCUR_set(invlist,
+ (len == 0)
+ ? 0
+ : TO_INTERNAL_SIZE(len + offset));
+ assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
}
PERL_STATIC_INLINE IV*
S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
{
- /* Return the address of the UV that is reserved to hold the cached index
+ /* Return the address of the IV that is reserved to hold the cached index
* */
PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
- return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
+ return &(((XINVLIST*) SvANY(invlist))->prev_index);
}
PERL_STATIC_INLINE IV
PERL_ARGS_ASSERT_INVLIST_MAX;
- return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
- ? _invlist_len(invlist)
- : FROM_INTERNAL_SIZE(SvLEN(invlist));
-}
-
-PERL_STATIC_INLINE UV*
-S_get_invlist_zero_addr(pTHX_ SV* invlist)
-{
- /* Return the address of the UV that is reserved to hold 0 if the inversion
- * list contains 0. This has to be the last element of the heading, as the
- * list proper starts with either it if 0, or the next element if not.
- * (But we force it to contain either 0 or 1) */
+ assert(SvTYPE(invlist) == SVt_INVLIST);
- PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
-
- return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
+ /* Assumes worst case, in which the 0 element is not counted in the
+ * inversion list, so subtracts 1 for that */
+ return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
+ ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
+ : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
}
#ifndef PERL_IN_XSUB_RE
SV* new_list;
if (initial_size < 0) {
- initial_size = INVLIST_INITIAL_LEN;
+ initial_size = 10;
}
/* Allocate the initial space */
- new_list = newSV(TO_INTERNAL_SIZE(initial_size));
- invlist_set_len(new_list, 0);
+ new_list = newSV_type(SVt_INVLIST);
- /* Force iterinit() to be used to get iteration to work */
- *get_invlist_iter_addr(new_list) = UV_MAX;
+ /* First 1 is in case the zero element isn't in the list; second 1 is for
+ * trailing NUL */
+ SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
+ invlist_set_len(new_list, 0, 0);
- /* This should force a segfault if a method doesn't initialize this
- * properly */
- *get_invlist_zero_addr(new_list) = UV_MAX;
+ /* Force iterinit() to be used to get iteration to work */
+ *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
*get_invlist_previous_index_addr(new_list) = 0;
- *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
-#if HEADER_LENGTH != 5
-# error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
-#endif
return new_list;
}
#endif
STATIC SV*
-S__new_invlist_C_array(pTHX_ UV* list)
+S__new_invlist_C_array(pTHX_ const UV* const list)
{
/* Return a pointer to a newly constructed inversion list, initialized to
* point to <list>, which has to be in the exact correct inversion list
* form, including internal fields. Thus this is a dangerous routine that
- * should not be used in the wrong hands */
+ * should not be used in the wrong hands. The passed in 'list' contains
+ * several header fields at the beginning that are not part of the
+ * inversion list body proper */
+
+ const STRLEN length = (STRLEN) list[0];
+ const UV version_id = list[1];
+ const bool offset = cBOOL(list[2]);
+#define HEADER_LENGTH 3
+ /* If any of the above changes in any way, you must change HEADER_LENGTH
+ * (if appropriate) and regenerate INVLIST_VERSION_ID by running
+ * perl -E 'say int(rand 2**31-1)'
+ */
+#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
+ data structure type, so that one being
+ passed in can be validated to be an
+ inversion list of the correct vintage.
+ */
- SV* invlist = newSV_type(SVt_PV);
+ SV* invlist = newSV_type(SVt_INVLIST);
PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
- SvPV_set(invlist, (char *) list);
+ if (version_id != INVLIST_VERSION_ID) {
+ Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
+ }
+
+ /* The generated array passed in includes header elements that aren't part
+ * of the list proper, so start it just after them */
+ SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
+
SvLEN_set(invlist, 0); /* Means we own the contents, and the system
shouldn't touch it */
- SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
- if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
- Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
- }
+ *(get_invlist_offset_addr(invlist)) = offset;
+
+ /* The 'length' passed to us is the physical number of elements in the
+ * inversion list. But if there is an offset the logical number is one
+ * less than that */
+ invlist_set_len(invlist, length - offset, offset);
- /* Initialize the iteration pointer.
- * XXX This could be done at compile time in charclass_invlists.h, but I
- * (khw) am not confident that the suffixes for specifying the C constant
- * UV_MAX are portable, e.g. 'ull' on a 32 bit machine that is configured
- * to use 64 bits; might need a Configure probe */
+ invlist_set_previous_index(invlist, 0);
+
+ /* Initialize the iteration pointer. */
invlist_iterfinish(invlist);
return invlist;
PERL_ARGS_ASSERT_INVLIST_EXTEND;
- SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
+ /* Add one to account for the zero element at the beginning which may not
+ * be counted by the calling parameters */
+ SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
}
PERL_STATIC_INLINE void
{
PERL_ARGS_ASSERT_INVLIST_TRIM;
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
/* Change the length of the inversion list to how many entries it currently
* has */
-
SvPV_shrink_to_cur((SV *) invlist);
}
UV* array;
UV max = invlist_max(invlist);
UV len = _invlist_len(invlist);
+ bool offset;
PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
if (len == 0) { /* Empty lists must be initialized */
- array = _invlist_array_init(invlist, start == 0);
+ offset = start != 0;
+ array = _invlist_array_init(invlist, ! offset);
}
else {
/* Here, the existing list is non-empty. The current max entry in the
* value not in the set, it is extending the set, so the new first
* value not in the set is one greater than the newly extended range.
* */
+ offset = *get_invlist_offset_addr(invlist);
if (array[final_element] == start) {
if (end != UV_MAX) {
array[final_element] = end + 1;
else {
/* But if the end is the maximum representable on the machine,
* just let the range that this would extend to have no end */
- invlist_set_len(invlist, len - 1);
+ invlist_set_len(invlist, len - 1, offset);
}
return;
}
len += 2; /* Includes an element each for the start and end of range */
- /* If overflows the existing space, extend, which may cause the array to be
- * moved */
+ /* If wll overflow the existing space, extend, which may cause the array to
+ * be moved */
if (max < len) {
invlist_extend(invlist, len);
- invlist_set_len(invlist, len); /* Have to set len here to avoid assert
- failure in invlist_array() */
+
+ /* Have to set len here to avoid assert failure in invlist_array() */
+ invlist_set_len(invlist, len, offset);
+
array = invlist_array(invlist);
}
else {
- invlist_set_len(invlist, len);
+ invlist_set_len(invlist, len, offset);
}
/* The next item on the list starts the range, the one after that is
else {
/* But if the end is the maximum representable on the machine, just let
* the range have no end */
- invlist_set_len(invlist, len - 1);
+ invlist_set_len(invlist, len - 1, offset);
}
}
}
void
-Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
+Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
{
/* Take the union of two inversion lists and point <output> to it. *output
* SHOULD BE DEFINED upon input, and if it points to one of the two lists,
* return the larger of the input lists, but then outside code might need
* to keep track of whether to free the input list or not */
- UV* array_a; /* a's array */
- UV* array_b;
+ const UV* array_a; /* a's array */
+ const UV* array_b;
UV len_a; /* length of a's array */
UV len_b;
if (complement_b) {
/* To complement, we invert: if the first element is 0, remove it. To
- * do this, we just pretend the array starts one later, and clear the
- * flag as we don't have to do anything else later */
+ * do this, we just pretend the array starts one later */
if (array_b[0] == 0) {
array_b++;
len_b--;
- complement_b = FALSE;
}
else {
- /* But if the first element is not zero, we unshift a 0 before the
- * array. The data structure reserves a space for that 0 (which
- * should be a '1' right now), so physical shifting is unneeded,
- * but temporarily change that element to 0. Before exiting the
- * routine, we must restore the element to '1' */
+ /* But if the first element is not zero, we pretend the list starts
+ * at the 0 that is always stored immediately before the array. */
array_b--;
len_b++;
- array_b[0] = 0;
}
}
/* Set result to final length, which can change the pointer to array_u, so
* re-find it */
if (len_u != _invlist_len(u)) {
- invlist_set_len(u, len_u);
+ invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
invlist_trim(u);
array_u = invlist_array(u);
}
}
}
- /* If we've changed b, restore it */
- if (complement_b) {
- array_b[0] = 1;
- }
-
/* We may be removing a reference to one of the inputs */
if (a == *output || b == *output) {
assert(! invlist_is_iterating(*output));
}
void
-Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
+Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
{
/* Take the intersection of two inversion lists and point <i> to it. *i
* SHOULD BE DEFINED upon input, and if it points to one of the two lists,
* union above
*/
- UV* array_a; /* a's array */
- UV* array_b;
+ const UV* array_a; /* a's array */
+ const UV* array_b;
UV len_a; /* length of a's array */
UV len_b;
assert(a != b);
/* Special case if either one is empty */
- len_a = _invlist_len(a);
+ len_a = (a == NULL) ? 0 : _invlist_len(a);
if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
if (len_a != 0 && complement_b) {
* must be every possible code point. Thus the intersection is
* simply 'a'. */
if (*i != a) {
- *i = invlist_clone(a);
-
if (*i == b) {
SvREFCNT_dec_NN(b);
}
+
+ *i = invlist_clone(a);
}
/* else *i is already 'a' */
return;
if (complement_b) {
/* To complement, we invert: if the first element is 0, remove it. To
- * do this, we just pretend the array starts one later, and clear the
- * flag as we don't have to do anything else later */
+ * do this, we just pretend the array starts one later */
if (array_b[0] == 0) {
array_b++;
len_b--;
- complement_b = FALSE;
}
else {
- /* But if the first element is not zero, we unshift a 0 before the
- * array. The data structure reserves a space for that 0 (which
- * should be a '1' right now), so physical shifting is unneeded,
- * but temporarily change that element to 0. Before exiting the
- * routine, we must restore the element to '1' */
+ /* But if the first element is not zero, we pretend the list starts
+ * at the 0 that is always stored immediately before the array. */
array_b--;
len_b++;
- array_b[0] = 0;
}
}
/* Set result to final length, which can change the pointer to array_r, so
* re-find it */
if (len_r != _invlist_len(r)) {
- invlist_set_len(r, len_r);
+ invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
invlist_trim(r);
array_r = invlist_array(r);
}
}
}
- /* If we've changed b, restore it */
- if (complement_b) {
- array_b[0] = 1;
- }
-
/* We may be removing a reference to one of the inputs */
if (a == *i || b == *i) {
assert(! invlist_is_iterating(*i));
* have a zero; removes it otherwise. As described above, the data
* structure is set up so that this is very efficient */
- UV* len_pos = _get_invlist_len_addr(invlist);
-
PERL_ARGS_ASSERT__INVLIST_INVERT;
assert(! invlist_is_iterating(invlist));
/* The inverse of matching nothing is matching everything */
- if (*len_pos == 0) {
+ if (_invlist_len(invlist) == 0) {
_append_range_to_invlist(invlist, 0, UV_MAX);
return;
}
- /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
- * zero element was a 0, so it is being removed, so the length decrements
- * by 1; and vice-versa. SvCUR is unaffected */
- if (*get_invlist_zero_addr(invlist) ^= 1) {
- (*len_pos)--;
- }
- else {
- (*len_pos)++;
- }
+ *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
}
void
invlist_extend(invlist, len);
array = invlist_array(invlist);
}
- invlist_set_len(invlist, len);
+ invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
array[len - 1] = PERL_UNICODE_MAX + 1;
}
else { /* Remove the 0x110000 */
- invlist_set_len(invlist, len - 1);
+ invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
}
}
/* Need to allocate extra space to accommodate Perl's addition of a
* trailing NUL to SvPV's, since it thinks they are always strings */
SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
- STRLEN length = SvCUR(invlist);
+ STRLEN physical_length = SvCUR(invlist);
+ bool offset = *(get_invlist_offset_addr(invlist));
PERL_ARGS_ASSERT_INVLIST_CLONE;
- SvCUR_set(new_invlist, length); /* This isn't done automatically */
- Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
+ *(get_invlist_offset_addr(new_invlist)) = offset;
+ invlist_set_len(new_invlist, _invlist_len(invlist), offset);
+ Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
return new_invlist;
}
-PERL_STATIC_INLINE UV*
+PERL_STATIC_INLINE STRLEN*
S_get_invlist_iter_addr(pTHX_ SV* invlist)
{
/* Return the address of the UV that contains the current iteration
PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
- return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
-}
+ assert(SvTYPE(invlist) == SVt_INVLIST);
-PERL_STATIC_INLINE UV*
-S_get_invlist_version_id_addr(pTHX_ SV* invlist)
-{
- /* Return the address of the UV that contains the version id. */
-
- PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
-
- return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
+ return &(((XINVLIST*) SvANY(invlist))->iterator);
}
PERL_STATIC_INLINE void
PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
- *get_invlist_iter_addr(invlist) = UV_MAX;
+ *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
}
STATIC bool
* <*start> and <*end> are unchanged, and the next call to this function
* will start over at the beginning of the list */
- UV* pos = get_invlist_iter_addr(invlist);
+ STRLEN* pos = get_invlist_iter_addr(invlist);
UV len = _invlist_len(invlist);
UV *array;
PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
if (*pos >= len) {
- *pos = UV_MAX; /* Force iterinit() to be required next time */
+ *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
return FALSE;
}
{
PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
- return *(get_invlist_iter_addr(invlist)) < UV_MAX;
+ return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
}
PERL_STATIC_INLINE UV
}
#endif
-#ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
+#ifndef PERL_IN_XSUB_RE
void
-Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
+Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
{
- /* Dumps out the ranges in an inversion list. The string 'header'
- * if present is output on a line before the first range */
+ /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
+ * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
+ * the string 'indent'. The output looks like this:
+ [0] 0x000A .. 0x000D
+ [2] 0x0085
+ [4] 0x2028 .. 0x2029
+ [6] 0x3104 .. INFINITY
+ * This means that the first range of code points matched by the list are
+ * 0xA through 0xD; the second range contains only the single code point
+ * 0x85, etc. An inversion list is an array of UVs. Two array elements
+ * are used to define each range (except if the final range extends to
+ * infinity, only a single element is needed). The array index of the
+ * first element for the corresponding range is given in brackets. */
UV start, end;
+ STRLEN count = 0;
PERL_ARGS_ASSERT__INVLIST_DUMP;
- if (header && strlen(header)) {
- PerlIO_printf(Perl_debug_log, "%s\n", header);
- }
if (invlist_is_iterating(invlist)) {
- PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
+ Perl_dump_indent(aTHX_ level, file,
+ "%sCan't dump inversion list because is in middle of iterating\n",
+ indent);
return;
}
invlist_iterinit(invlist);
while (invlist_iternext(invlist, &start, &end)) {
if (end == UV_MAX) {
- PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
+ Perl_dump_indent(aTHX_ level, file,
+ "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
+ indent, (UV)count, start);
}
else if (end != start) {
- PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
- start, end);
+ Perl_dump_indent(aTHX_ level, file,
+ "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
+ indent, (UV)count, start, end);
}
else {
- PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
+ Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
+ indent, (UV)count, start);
}
+ count += 2;
}
}
#endif
-#if 0
+#ifdef PERL_ARGS_ASSERT__INVLISTEQ
bool
-S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
+S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
{
/* Return a boolean as to if the two passed in inversion lists are
* identical. The final argument, if TRUE, says to take the complement of
* the second inversion list before doing the comparison */
- UV* array_a = invlist_array(a);
- UV* array_b = invlist_array(b);
+ const UV* array_a = invlist_array(a);
+ const UV* array_b = invlist_array(b);
UV len_a = _invlist_len(a);
UV len_b = _invlist_len(b);
/* Otherwise, to complement, we invert. Here, the first element is
* 0, just remove it. To do this, we just pretend the array starts
- * one later, and clear the flag as we don't have to do anything
- * else later */
+ * one later */
array_b++;
len_b--;
- complement_b = FALSE;
}
else {
- /* But if the first element is not zero, we unshift a 0 before the
- * array. The data structure reserves a space for that 0 (which
- * should be a '1' right now), so physical shifting is unneeded,
- * but temporarily change that element to 0. Before exiting the
- * routine, we must restore the element to '1' */
+ /* But if the first element is not zero, we pretend the list starts
+ * at the 0 that is always stored immediately before the array. */
array_b--;
len_b++;
- array_b[0] = 0;
}
}
}
}
- if (complement_b) {
- array_b[0] = 1;
- }
return retval;
}
#endif
#undef HEADER_LENGTH
-#undef INVLIST_INITIAL_LENGTH
#undef TO_INTERNAL_SIZE
#undef FROM_INTERNAL_SIZE
-#undef INVLIST_LEN_OFFSET
-#undef INVLIST_ZERO_OFFSET
-#undef INVLIST_ITER_OFFSET
#undef INVLIST_VERSION_ID
-#undef INVLIST_PREVIOUS_INDEX_OFFSET
/* End of inversion list object */
ret = reg_node(pRExC_state, OPFAIL);
return ret;
}
- else if (max == 0) { /* replace {0} with a nothing node */
- if (SIZE_ONLY) {
- RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
- }
- else {
- RExC_emit = orig_emit;
- }
- ret = reg_node(pRExC_state, NOTHING);
- return ret;
- }
do_curly:
if ((flags&SIMPLE)) {
reginsert(pRExC_state, MINMOD, ret, depth+1);
REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
}
-#ifndef REG_ALLOW_MINMOD_SUSPEND
else
-#endif
if (RExC_parse < RExC_end && *RExC_parse == '+') {
regnode *ender;
nextchar(pRExC_state);
ret->flags = 0;
ender = reg_node(pRExC_state, TAIL);
REGTAIL(pRExC_state, ret, ender);
- /*ret= ender;*/
}
if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
if (! len_passed_in) {
if (UTF) {
if (FOLD && (! LOC || code_point > 255)) {
- _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
+ _to_uni_fold_flags(code_point,
character,
&len,
FOLD_FLAGS_FULL | ((LOC)
if (num < 1)
vFAIL("Reference to nonexistent or unclosed group");
}
- if (!isg && num > 9 && num >= RExC_npar)
+ if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9')
/* Probably a character specified in octal, e.g. \35 */
goto defchar;
else {
defchar: {
STRLEN len = 0;
- UV ender;
+ UV ender = 0;
char *p;
char *s;
#define MAX_NODE_STRING_SIZE 127
char *s0;
U8 upper_parse = MAX_NODE_STRING_SIZE;
STRLEN foldlen;
- U8 node_type;
+ U8 node_type = compute_EXACTish(pRExC_state);
bool next_is_quantifier;
char * oldp = NULL;
+ /* We can convert EXACTF nodes to EXACTFU if they contain only
+ * characters that match identically regardless of the target
+ * string's UTF8ness. The reason to do this is that EXACTF is not
+ * trie-able, EXACTFU is. (We don't need to figure this out until
+ * pass 2) */
+ bool maybe_exactfu = node_type == EXACTF && PASS2;
+
/* If a folding node contains only code points that don't
* participate in folds, it can be changed into an EXACT node,
* which allows the optimizer more things to look for */
bool maybe_exact;
- ender = 0;
- node_type = compute_EXACTish(pRExC_state);
ret = reg_node(pRExC_state, node_type);
/* In pass1, folded, we use a temporary buffer instead of the
/* We do the EXACTFish to EXACT node only if folding, and not if in
* locale, as whether a character folds or not isn't known until
- * runtime */
- maybe_exact = FOLD && ! LOC;
+ * runtime. (And we don't need to figure this out until pass 2) */
+ maybe_exact = FOLD && ! LOC && PASS2;
/* XXX The node can hold up to 255 bytes, yet this only goes to
* 127. I (khw) do not know why. Keeping it somewhat less than
p++;
break;
case 'a':
- ender = ASCII_TO_NATIVE('\007');
+ ender = '\a';
p++;
break;
case 'o':
p++;
ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
break;
- case '0': case '1': case '2': case '3':case '4':
+ case '8': case '9': /* must be a backreference */
+ --p;
+ goto loopdone;
+ case '1': case '2': case '3':case '4':
case '5': case '6': case '7':
- if (*p == '0' ||
- (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
+ /* When we parse backslash escapes there is ambiguity
+ * between backreferences and octal escapes. Any escape
+ * from \1 - \9 is a backreference, any multi-digit
+ * escape which does not start with 0 and which when
+ * evaluated as decimal could refer to an already
+ * parsed capture buffer is a backslash. Anything else
+ * is octal.
+ *
+ * Note this implies that \118 could be interpreted as
+ * 118 OR as "\11" . "8" depending on whether there
+ * were 118 capture buffers defined already in the
+ * pattern. */
+ if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar )
+ { /* Not to be treated as an octal constant, go
+ find backref */
+ --p;
+ goto loopdone;
+ }
+ case '0':
{
I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
STRLEN numlen = 3;
form_short_octal_warning(p, numlen));
}
}
- else { /* Not to be treated as an octal constant, go
- find backref */
- --p;
- goto loopdone;
- }
if (PL_encoding && ender < 0x100)
goto recode_encoding;
break;
REGC((char)ender, s++);
}
}
- else /* FOLD */
- if (! ( UTF
+ else /* FOLD */ if (! ( UTF
/* See comments for join_exact() as to why we fold this
* non-UTF at compile time */
|| (node_type == EXACTFU
&& ender == LATIN_SMALL_LETTER_SHARP_S)))
{
+ if (IS_IN_SOME_FOLD_L1(ender)) {
+ maybe_exact = FALSE;
+
+ /* See if the character's fold differs between /d and
+ * /u. This includes the multi-char fold SHARP S to
+ * 'ss' */
+ if (maybe_exactfu
+ && (PL_fold[ender] != PL_fold_latin1[ender]
+ || ender == LATIN_SMALL_LETTER_SHARP_S
+ || (len > 0
+ && isARG2_lower_or_UPPER_ARG1('s', ender)
+ && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
+ {
+ maybe_exactfu = FALSE;
+ }
+ }
*(s++) = (char) ender;
- maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
}
else { /* UTF */
* utf8. If we start to fold non-UTF patterns, be sure to
* update join_exact() */
if (LOC && ender < 256) {
- if (UNI_IS_INVARIANT(ender)) {
+ if (NATIVE_IS_INVARIANT(ender)) {
*s = (U8) ender;
foldlen = 1;
} else {
/* No Latin1 characters participate in multi-char
* folds under /l */
if (LOC
- || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
- *s, *(s+1))))
+ || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
+ *s, *(s+1))))
{
break;
}
* do any better */
if (len == 0) {
len = full_len;
+
+ /* If the node ends in an 's' we make sure it stays EXACTF,
+ * as if it turns into an EXACTFU, it could later get
+ * joined with another 's' that would then wrongly match
+ * the sharp s */
+ if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
+ {
+ maybe_exactfu = FALSE;
+ }
} else {
/* Here, the node does contain some characters that aren't
loopdone: /* Jumped to when encounters something that shouldn't be in
the node */
- /* If 'maybe_exact' is still set here, means there are no
- * code points in the node that participate in folds */
- if (FOLD && maybe_exact) {
- OP(ret) = EXACT;
- }
-
/* I (khw) don't know if you can get here with zero length, but the
* old code handled this situation by creating a zero-length EXACT
* node. Might as well be NOTHING instead */
if (len == 0) {
OP(ret) = NOTHING;
}
- else{
+ else {
+ if (FOLD) {
+ /* If 'maybe_exact' is still set here, means there are no
+ * code points in the node that participate in folds;
+ * similarly for 'maybe_exactfu' and code points that match
+ * differently depending on UTF8ness of the target string
+ * */
+ if (maybe_exact) {
+ OP(ret) = EXACT;
+ }
+ else if (maybe_exactfu) {
+ OP(ret) = EXACTFU;
+ }
+ }
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
}
* these things, we need to realize that something preceded by a backslash
* is escaped, so we have to keep track of backslashes */
if (SIZE_ONLY) {
+ UV depth = 0; /* how many nested (?[...]) constructs */
Perl_ck_warner_d(aTHX_
packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
RExC_parse = regpatws(pRExC_state, RExC_parse,
TRUE); /* means recognize comments */
switch (*RExC_parse) {
+ case '?':
+ if (RExC_parse[1] == '[') depth++, RExC_parse++;
+ /* FALL THROUGH */
default:
break;
case '\\':
}
case ']':
+ if (depth--) break;
RExC_parse++;
if (RExC_parse < RExC_end
&& *RExC_parse == ')')
|| IS_OPERAND(lparen)
|| SvUV(lparen) != '(')
{
+ SvREFCNT_dec(current);
RExC_parse++;
vFAIL("Unexpected ')'");
}
char current_operator;
if (IS_OPERAND(top)) {
+ SvREFCNT_dec_NN(top);
+ SvREFCNT_dec_NN(current);
vFAIL("Operand with no preceding operator");
}
current_operator = (char) SvUV(top);
regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
case we need to change the emitted regop to an EXACT. */
const char * orig_parse = RExC_parse;
- const I32 orig_size = RExC_size;
+ const SSize_t orig_size = RExC_size;
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGCLASS;
case 'f': value = '\f'; break;
case 'b': value = '\b'; break;
case 'e': value = ASCII_TO_NATIVE('\033');break;
- case 'a': value = ASCII_TO_NATIVE('\007');break;
+ case 'a': value = '\a'; break;
case 'o':
RExC_parse--; /* function expects to be pointed at the 'o' */
{
* included. literal_endpoint==2 means both ends of the range used
* a literal character, not \x{foo} */
if (literal_endpoint == 2
- && (prevvalue >= 'a' && value <= 'z')
- || (prevvalue >= 'A' && value <= 'Z'))
+ && ((prevvalue >= 'a' && value <= 'z')
+ || (prevvalue >= 'A' && value <= 'Z')))
{
+ _invlist_intersection(this_range, PL_ASCII,
+ &this_range);
_invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
&this_range);
}
/* If the folds haven't been read in, call a fold function
* to force that */
if (! PL_utf8_tofold) {
- U8 dummy[UTF8_MAXBYTES+1];
+ U8 dummy[UTF8_MAXBYTES_CASE+1];
/* This string is just a short named one above \xff */
to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
default:
/* Use deprecated warning to increase the
* chances of this being output */
- ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
+ ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
break;
}
}
for (i = start; i <= (int) high; i++) {
if (! ANYOF_BITMAP_TEST(ret, i)) {
ANYOF_BITMAP_SET(ret, i);
- prevvalue = value;
- value = i;
}
}
}
switch (OP(scan)) {
case EXACT:
case EXACTF:
+ case EXACTFA_NO_TRIE:
case EXACTFA:
case EXACTFU:
case EXACTFU_SS:
- case EXACTFU_TRICKYFOLD:
case EXACTFL:
if( exact == PSEUDO )
exact= OP(scan);
- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
*/
#ifdef DEBUGGING
+
+static void
+S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
+{
+ int bit;
+ int set=0;
+
+ for (bit=0; bit<32; bit++) {
+ if (flags & (1<<bit)) {
+ if (!set++ && lead)
+ PerlIO_printf(Perl_debug_log, "%s",lead);
+ PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
+ }
+ }
+ if (lead) {
+ if (set)
+ PerlIO_printf(Perl_debug_log, "\n");
+ else
+ PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
+ }
+}
+
static void
S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
{
if (r->extflags & RXf_EVAL_SEEN)
PerlIO_printf(Perl_debug_log, "with eval ");
PerlIO_printf(Perl_debug_log, "\n");
- DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
+ DEBUG_FLAGS_r({
+ regdump_extflags("r->extflags: ",r->extflags);
+ regdump_intflags("r->intflags: ",r->intflags);
+ });
#else
PERL_ARGS_ASSERT_REGDUMP;
PERL_UNUSED_CONTEXT;
)
);
if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
- int i;
- int rangestart = -1;
- U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
sv_catpvs(sv, "[");
- for (i = 0; i <= 256; i++) {
- if (i < 256 && BITMAP_TEST(bitmap,i)) {
- if (rangestart == -1)
- rangestart = i;
- } else if (rangestart != -1) {
- if (i <= rangestart + 3)
- for (; rangestart < i; rangestart++)
- put_byte(sv, rangestart);
- else {
- put_byte(sv, rangestart);
- sv_catpvs(sv, "-");
- put_byte(sv, i - 1);
- }
- rangestart = -1;
- }
- }
+ (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
+ ? ANYOF_BITMAP(o)
+ : TRIE_BITMAP(trie));
sv_catpvs(sv, "]");
}
} else if (k == LOGICAL)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
else if (k == ANYOF) {
- int i, rangestart = -1;
const U8 flags = ANYOF_FLAGS(o);
int do_sep = 0;
sv_catpvs(sv, "^");
/* output what the standard cp 0-255 bitmap matches */
- for (i = 0; i <= 256; i++) {
- if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
- if (rangestart == -1)
- rangestart = i;
- } else if (rangestart != -1) {
- if (i <= rangestart + 3)
- for (; rangestart < i; rangestart++)
- put_byte(sv, rangestart);
- else {
- put_byte(sv, rangestart);
- sv_catpvs(sv, "-");
- put_byte(sv, i - 1);
- }
- do_sep = 1;
- rangestart = -1;
- }
- }
+ do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
/* output any special charclass tests (used entirely under use locale) */
- if (ANYOF_CLASS_TEST_ANY_SET(o))
- for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
+ if (ANYOF_CLASS_TEST_ANY_SET(o)) {
+ int i;
+ for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) {
if (ANYOF_CLASS_TEST(o,i)) {
sv_catpv(sv, anyofs[i]);
do_sep = 1;
}
+ }
+ }
EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
/* output information about the unicode matching */
if (flags & ANYOF_UNICODE_ALL)
sv_catpvs(sv, "{unicode_all}");
- else if (ANYOF_NONBITMAP(o))
- sv_catpvs(sv, "{unicode}");
- if (flags & ANYOF_NONBITMAP_NON_UTF8)
- sv_catpvs(sv, "{outside bitmap}");
-
- if (ANYOF_NONBITMAP(o)) {
- SV *lv; /* Set if there is something outside the bit map */
- SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
+ else if (ANYOF_NONBITMAP(o)) {
+ SV *lv; /* Set if there is something outside the bit map. */
bool byte_output = FALSE; /* If something in the bitmap has been
output */
- if (lv && lv != &PL_sv_undef) {
- if (sw) {
- U8 s[UTF8_MAXBYTES_CASE+1];
-
- for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
- uvchr_to_utf8(s, i);
-
- if (i < 256
- && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
- things already
- output as part
- of the bitmap */
- && swash_fetch(sw, s, TRUE))
- {
- if (rangestart == -1)
- rangestart = i;
- } else if (rangestart != -1) {
- byte_output = TRUE;
- if (i <= rangestart + 3)
- for (; rangestart < i; rangestart++) {
- put_byte(sv, rangestart);
- }
- else {
- put_byte(sv, rangestart);
- sv_catpvs(sv, "-");
- put_byte(sv, i-1);
- }
- rangestart = -1;
- }
- }
- }
+ if (flags & ANYOF_NONBITMAP_NON_UTF8) {
+ sv_catpvs(sv, "{outside bitmap}");
+ }
+ else {
+ sv_catpvs(sv, "{utf8}");
+ }
- {
- char *s = savesvpv(lv);
- char * const origs = s;
+ /* Get the stuff that wasn't in the bitmap */
+ (void) regclass_swash(prog, o, FALSE, &lv, NULL);
+ if (lv && lv != &PL_sv_undef) {
+ char *s = savesvpv(lv);
+ char * const origs = s;
- while (*s && *s != '\n')
- s++;
+ while (*s && *s != '\n')
+ s++;
- if (*s == '\n') {
- const char * const t = ++s;
+ if (*s == '\n') {
+ const char * const t = ++s;
- if (byte_output) {
- sv_catpvs(sv, " ");
- }
+ if (byte_output) {
+ sv_catpvs(sv, " ");
+ }
- while (*s) {
- if (*s == '\n') {
+ while (*s) {
+ if (*s == '\n') {
- /* Truncate very long output */
- if (s - origs > 256) {
- Perl_sv_catpvf(aTHX_ sv,
- "%.*s...",
- (int) (s - origs - 1),
- t);
- goto out_dump;
- }
- *s = ' ';
- }
- else if (*s == '\t') {
- *s = '-';
- }
- s++;
- }
- if (s[-1] == ' ')
- s[-1] = 0;
+ /* Truncate very long output */
+ if (s - origs > 256) {
+ Perl_sv_catpvf(aTHX_ sv,
+ "%.*s...",
+ (int) (s - origs - 1),
+ t);
+ goto out_dump;
+ }
+ *s = ' ';
+ }
+ else if (*s == '\t') {
+ *s = '-';
+ }
+ s++;
+ }
+ if (s[-1] == ' ')
+ s[-1] = 0;
- sv_catpv(sv, t);
- }
+ sv_catpv(sv, t);
+ }
- out_dump:
+ out_dump:
- Safefree(origs);
- }
+ Safefree(origs);
SvREFCNT_dec_NN(lv);
}
}
so we need to copy it locally. */
RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
ret->mother_re = NULL;
- ret->gofs = 0;
}
#endif /* PERL_IN_XSUB_RE */
So the old condition can be simplified to !isPRINT(c) */
if (!isPRINT(c)) {
- if (c < 256) {
- Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
- }
- else {
- Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
- }
+ switch (c) {
+ case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
+ case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
+ case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
+ case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
+ case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
+
+ default:
+ Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
+ break;
+ }
}
else {
const char string = c;
}
}
+STATIC bool
+S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
+{
+ /* Appends to 'sv' a displayable version of the innards of the bracketed
+ * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
+ * output anything */
+
+ int i;
+ int rangestart = -1;
+ bool has_output_anything = FALSE;
+
+ PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
+
+ for (i = 0; i <= 256; i++) {
+ if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
+ if (rangestart == -1)
+ rangestart = i;
+ } else if (rangestart != -1) {
+ int j = i - 1;
+ if (i <= rangestart + 3) { /* Individual chars in short ranges */
+ for (; rangestart < i; rangestart++)
+ put_byte(sv, rangestart);
+ }
+ else if ( j > 255
+ || ! isALPHANUMERIC(rangestart)
+ || ! isALPHANUMERIC(j)
+ || isDIGIT(rangestart) != isDIGIT(j)
+ || isUPPER(rangestart) != isUPPER(j)
+ || isLOWER(rangestart) != isLOWER(j)
+
+ /* This final test should get optimized out except
+ * on EBCDIC platforms, where it causes ranges that
+ * cross discontinuities like i/j to be shown as hex
+ * instead of the misleading, e.g. H-K (since that
+ * range includes more than H, I, J, K). */
+ || (j - rangestart)
+ != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
+ {
+ Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
+ rangestart,
+ (j < 256) ? j : 255);
+ }
+ else { /* Here, the ends of the range are both digits, or both
+ uppercase, or both lowercase; and there's no
+ discontinuity in the range (which could happen on EBCDIC
+ platforms) */
+ put_byte(sv, rangestart);
+ sv_catpvs(sv, "-");
+ put_byte(sv, j);
+ }
+ rangestart = -1;
+ has_output_anything = TRUE;
+ }
+ }
+
+ return has_output_anything;
+}
#define CLEAR_OPTSTART \
if (optstart) STMT_START { \