#endif
#include "dquote_static.c"
-#ifndef PERL_IN_XSUB_RE
-# include "charclass_invlists.h"
-#endif
+#include "charclass_invlists.h"
+#include "inline_invlist.c"
#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
+#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
#ifdef op
#undef op
* 'ss' or not is not knowable at compile time. It will match iff the
* target string is in UTF-8, unlike the EXACTFU nodes, where it always
* matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
- * it can't be folded to "ss" at compile time, unlike EXACTFU does as
+ * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
* described in item 3). An assumption that the optimizer part of
* regexec.c (probably unwittingly) makes is that a character in the
* pattern corresponds to at most a single character in the target string.
uc = utf8_to_uvchr_buf(s, s + l, NULL);
l = utf8_length(s, s + l);
}
- else if (has_exactf_sharp_s) {
+ if (has_exactf_sharp_s) {
RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
}
min += l - min_subtract;
&& !(data->flags & SF_HAS_EVAL)
&& !deltanext /* atom is fixed width */
&& minnext != 0 /* CURLYM can't handle zero width */
+ && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
) {
/* XXXX How to optimize if data == 0? */
/* Optimize to a simpler form. */
}
+STATIC bool
+S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
+{
+ /* This is the common code for setting up the floating and fixed length
+ * string data extracted from Perlre_op_compile() below. Returns a boolean
+ * as to whether succeeded or not */
+
+ I32 t,ml;
+
+ if (! (longest_length
+ || (eol /* Can't have SEOL and MULTI */
+ && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
+ )
+ /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
+ || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
+ {
+ return FALSE;
+ }
+
+ /* copy the information about the longest from the reg_scan_data
+ over to the program. */
+ if (SvUTF8(sv_longest)) {
+ *rx_utf8 = sv_longest;
+ *rx_substr = NULL;
+ } else {
+ *rx_substr = sv_longest;
+ *rx_utf8 = NULL;
+ }
+ /* end_shift is how many chars that must be matched that
+ follow this item. We calculate it ahead of time as once the
+ lookbehind offset is added in we lose the ability to correctly
+ calculate it.*/
+ ml = minlen ? *(minlen) : (I32)longest_length;
+ *rx_end_shift = ml - offset
+ - longest_length + (SvTAIL(sv_longest) != 0)
+ + lookbehind;
+
+ t = (eol/* Can't have SEOL and MULTI */
+ && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
+ fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
+
+ return TRUE;
+}
+
/*
* Perl_re_op_compile - the perl internal RE engine's function to compile a
* regular expression into internal code.
dVAR;
REGEXP *rx;
struct regexp *r;
- register regexp_internal *ri;
+ regexp_internal *ri;
STRLEN plen;
char * VOL exp;
char* xend;
scan_commit(pRExC_state, &data,&minlen,0);
SvREFCNT_dec(data.last_found);
- /* Note that code very similar to this but for anchored string
- follows immediately below, changes may need to be made to both.
- Be careful.
- */
longest_float_length = CHR_SVLEN(data.longest_float);
- if (longest_float_length
- || (data.flags & SF_FL_BEFORE_EOL
- && (!(data.flags & SF_FL_BEFORE_MEOL)
- || (RExC_flags & RXf_PMf_MULTILINE))))
- {
- I32 t,ml;
- /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
- if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
- || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
- && data.offset_fixed == data.offset_float_min
- && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
- goto remove_float; /* As in (a)+. */
-
- /* copy the information about the longest float from the reg_scan_data
- over to the program. */
- if (SvUTF8(data.longest_float)) {
- r->float_utf8 = data.longest_float;
- r->float_substr = NULL;
- } else {
- r->float_substr = data.longest_float;
- r->float_utf8 = NULL;
- }
- /* float_end_shift is how many chars that must be matched that
- follow this item. We calculate it ahead of time as once the
- lookbehind offset is added in we lose the ability to correctly
- calculate it.*/
- ml = data.minlen_float ? *(data.minlen_float)
- : (I32)longest_float_length;
- r->float_end_shift = ml - data.offset_float_min
- - longest_float_length + (SvTAIL(data.longest_float) != 0)
- + data.lookbehind_float;
+ if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
+ && data.offset_fixed == data.offset_float_min
+ && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
+ && S_setup_longest (aTHX_ pRExC_state,
+ data.longest_float,
+ &(r->float_utf8),
+ &(r->float_substr),
+ &(r->float_end_shift),
+ data.lookbehind_float,
+ data.offset_float_min,
+ data.minlen_float,
+ longest_float_length,
+ data.flags & SF_FL_BEFORE_EOL,
+ data.flags & SF_FL_BEFORE_MEOL))
+ {
r->float_min_offset = data.offset_float_min - data.lookbehind_float;
r->float_max_offset = data.offset_float_max;
if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
r->float_max_offset -= data.lookbehind_float;
-
- t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
- && (!(data.flags & SF_FL_BEFORE_MEOL)
- || (RExC_flags & RXf_PMf_MULTILINE)));
- fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
}
else {
- remove_float:
r->float_substr = r->float_utf8 = NULL;
SvREFCNT_dec(data.longest_float);
longest_float_length = 0;
}
- /* Note that code very similar to this but for floating string
- is immediately above, changes may need to be made to both.
- Be careful.
- */
longest_fixed_length = CHR_SVLEN(data.longest_fixed);
- /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
- if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
- && (longest_fixed_length
- || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
- && (!(data.flags & SF_FIX_BEFORE_MEOL)
- || (RExC_flags & RXf_PMf_MULTILINE)))) )
+ if (S_setup_longest (aTHX_ pRExC_state,
+ data.longest_fixed,
+ &(r->anchored_utf8),
+ &(r->anchored_substr),
+ &(r->anchored_end_shift),
+ data.lookbehind_fixed,
+ data.offset_fixed,
+ data.minlen_fixed,
+ longest_fixed_length,
+ data.flags & SF_FIX_BEFORE_EOL,
+ data.flags & SF_FIX_BEFORE_MEOL))
{
- I32 t,ml;
-
- /* copy the information about the longest fixed
- from the reg_scan_data over to the program. */
- if (SvUTF8(data.longest_fixed)) {
- r->anchored_utf8 = data.longest_fixed;
- r->anchored_substr = NULL;
- } else {
- r->anchored_substr = data.longest_fixed;
- r->anchored_utf8 = NULL;
- }
- /* fixed_end_shift is how many chars that must be matched that
- follow this item. We calculate it ahead of time as once the
- lookbehind offset is added in we lose the ability to correctly
- calculate it.*/
- ml = data.minlen_fixed ? *(data.minlen_fixed)
- : (I32)longest_fixed_length;
- r->anchored_end_shift = ml - data.offset_fixed
- - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
- + data.lookbehind_fixed;
r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
-
- t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
- && (!(data.flags & SF_FIX_BEFORE_MEOL)
- || (RExC_flags & RXf_PMf_MULTILINE)));
- fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
}
else {
r->anchored_substr = r->anchored_utf8 = NULL;
SvREFCNT_dec(data.longest_fixed);
longest_fixed_length = 0;
}
+
if (ri->regstclass
&& (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
ri->regstclass = NULL;
* Some of the methods should always be private to the implementation, and some
* should eventually be made public */
-#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
-#define INVLIST_ITER_OFFSET 1 /* Current iteration position */
-
-/* This is a combination of a version and data structure type, so that one
- * being passed in can be validated to be an inversion list of the correct
- * vintage. When the structure of the header is changed, a new random number
- * in the range 2**31-1 should be generated and the new() method changed to
- * insert that at this location. Then, if an auxiliary program doesn't change
- * correspondingly, it will be discovered immediately */
-#define INVLIST_VERSION_ID_OFFSET 2
-#define INVLIST_VERSION_ID 1064334010
+/* The header definitions are in F<inline_invlist.c> */
-/* For safety, when adding new elements, remember to #undef them at the end of
- * the inversion list code section */
-
-#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
-/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
- * contains the code point U+00000, and begins here. If 1, the inversion list
- * doesn't contain U+0000, and it begins at the next UV in the array.
- * Inverting an inversion list consists of adding or removing the 0 at the
- * beginning of it. By reserving a space for that 0, inversion can be made
- * very fast */
-
-#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
-
-/* Internally things are UVs */
#define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
#define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
/* Must be empty */
- assert(! *get_invlist_len_addr(invlist));
+ assert(! *_get_invlist_len_addr(invlist));
/* 1^1 = 0; 1^0 = 1 */
*zero = 1 ^ will_have_0;
/* Must not be empty. If these fail, you probably didn't check for <len>
* being non-zero before trying to get the array */
- assert(*get_invlist_len_addr(invlist));
+ assert(*_get_invlist_len_addr(invlist));
assert(*get_invlist_zero_addr(invlist) == 0
|| *get_invlist_zero_addr(invlist) == 1);
+ *get_invlist_zero_addr(invlist));
}
-PERL_STATIC_INLINE UV*
-S_get_invlist_len_addr(pTHX_ SV* invlist)
-{
- /* Return the address of the UV that contains the current number
- * of used elements in the inversion list */
-
- PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
-
- return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
-}
-
-PERL_STATIC_INLINE UV
-S_invlist_len(pTHX_ SV* const invlist)
-{
- /* Returns the current number of elements stored in the inversion list's
- * array */
-
- PERL_ARGS_ASSERT_INVLIST_LEN;
-
- return *get_invlist_len_addr(invlist);
-}
-
PERL_STATIC_INLINE void
S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
{
PERL_ARGS_ASSERT_INVLIST_SET_LEN;
- *get_invlist_len_addr(invlist) = len;
+ *_get_invlist_len_addr(invlist) = len;
assert(len <= SvLEN(invlist));
* Note that when inverting, SvCUR shouldn't change */
}
+PERL_STATIC_INLINE IV*
+S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
+{
+ /* Return the address of the UV that is reserved to hold the cached index
+ * */
+
+ PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
+
+ return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
+}
+
+PERL_STATIC_INLINE IV
+S_invlist_previous_index(pTHX_ SV* const invlist)
+{
+ /* Returns cached index of previous search */
+
+ PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
+
+ return *get_invlist_previous_index_addr(invlist);
+}
+
+PERL_STATIC_INLINE void
+S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
+{
+ /* Caches <index> for later retrieval */
+
+ PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
+
+ assert(index == 0 || index < (int) _invlist_len(invlist));
+
+ *get_invlist_previous_index_addr(invlist) = index;
+}
+
PERL_STATIC_INLINE UV
S_invlist_max(pTHX_ SV* const invlist)
{
* properly */
*get_invlist_zero_addr(new_list) = UV_MAX;
+ *get_invlist_previous_index_addr(new_list) = 0;
*get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
-#if HEADER_LENGTH != 4
+#if HEADER_LENGTH != 5
# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
#endif
SvPV_set(invlist, (char *) list);
SvLEN_set(invlist, 0); /* Means we own the contents, and the system
shouldn't touch it */
- SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
+ SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
SvPV_shrink_to_cur((SV *) invlist);
}
-/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
- * etc */
-#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
-#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
-
#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
STATIC void
UV* array;
UV max = invlist_max(invlist);
- UV len = invlist_len(invlist);
+ UV len = _invlist_len(invlist);
PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
}
}
-STATIC IV
-S_invlist_search(pTHX_ SV* const invlist, const UV cp)
+#ifndef PERL_IN_XSUB_RE
+
+IV
+Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
{
/* Searches the inversion list for the entry that contains the input code
* point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
* contains <cp> */
IV low = 0;
- IV high = invlist_len(invlist);
- const UV * const array = invlist_array(invlist);
+ IV mid;
+ IV high = _invlist_len(invlist);
+ const IV highest_element = high - 1;
+ const UV* array;
- PERL_ARGS_ASSERT_INVLIST_SEARCH;
+ PERL_ARGS_ASSERT__INVLIST_SEARCH;
- /* If list is empty or the code point is before the first element, return
- * failure. */
- if (high == 0 || cp < array[0]) {
+ /* If list is empty, return failure. */
+ if (high == 0) {
return -1;
}
+ /* If the code point is before the first element, return failure. (We
+ * can't combine this with the test above, because we can't get the array
+ * unless we know the list is non-empty) */
+ array = invlist_array(invlist);
+
+ mid = invlist_previous_index(invlist);
+ assert(mid >=0 && mid <= highest_element);
+
+ /* <mid> contains the cache of the result of the previous call to this
+ * function (0 the first time). See if this call is for the same result,
+ * or if it is for mid-1. This is under the theory that calls to this
+ * function will often be for related code points that are near each other.
+ * And benchmarks show that caching gives better results. We also test
+ * here if the code point is within the bounds of the list. These tests
+ * replace others that would have had to be made anyway to make sure that
+ * the array bounds were not exceeded, and give us extra information at the
+ * same time */
+ if (cp >= array[mid]) {
+ if (cp >= array[highest_element]) {
+ return highest_element;
+ }
+
+ /* Here, array[mid] <= cp < array[highest_element]. This means that
+ * the final element is not the answer, so can exclude it; it also
+ * means that <mid> is not the final element, so can refer to 'mid + 1'
+ * safely */
+ if (cp < array[mid + 1]) {
+ return mid;
+ }
+ high--;
+ low = mid + 1;
+ }
+ else { /* cp < aray[mid] */
+ if (cp < array[0]) { /* Fail if outside the array */
+ return -1;
+ }
+ high = mid;
+ if (cp >= array[mid - 1]) {
+ goto found_entry;
+ }
+ }
+
/* Binary search. What we are looking for is <i> such that
* array[i] <= cp < array[i+1]
- * The loop below converges on the i+1. */
+ * The loop below converges on the i+1. Note that there may not be an
+ * (i+1)th element in the array, and things work nonetheless */
while (low < high) {
- IV mid = (low + high) / 2;
- if (array[mid] <= cp) {
+ mid = (low + high) / 2;
+ assert(mid <= highest_element);
+ if (array[mid] <= cp) { /* cp >= array[mid] */
low = mid + 1;
/* We could do this extra test to exit the loop early.
}
}
- return high - 1;
+ found_entry:
+ high--;
+ invlist_set_previous_index(invlist, high);
+ return high;
}
-#ifndef PERL_IN_XSUB_RE
-
void
Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
{
* that <swatch> is all 0's on input */
UV current = start;
- const IV len = invlist_len(invlist);
+ const IV len = _invlist_len(invlist);
IV i;
const UV * array;
array = invlist_array(invlist);
/* Find which element it is */
- i = invlist_search(invlist, start);
+ i = _invlist_search(invlist, start);
/* We populate from <start> to <end> */
while (current < end) {
current = array[i];
if (current >= end) { /* Finished if beyond the end of what we
are populating */
- return;
+ if (LIKELY(end < UV_MAX)) {
+ return;
+ }
+
+ /* We get here when the upper bound is the maximum
+ * representable on the machine, and we are looking for just
+ * that code point. Have to special case it */
+ i = len;
+ goto join_end_of_list;
}
}
assert(current >= start);
swatch[offset >> 3] |= 1 << (offset & 7);
}
+ join_end_of_list:
+
/* Quit if at the end of the list */
if (i >= len) {
assert(a != b);
/* If either one is empty, the union is the other one */
- if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
+ if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
if (*output == a) {
if (a != NULL) {
SvREFCNT_dec(a);
} /* else *output already = b; */
return;
}
- else if ((len_b = invlist_len(b)) == 0) {
+ else if ((len_b = _invlist_len(b)) == 0) {
if (*output == b) {
SvREFCNT_dec(b);
}
/* Set result to final length, which can change the pointer to array_u, so
* re-find it */
- if (len_u != invlist_len(u)) {
+ if (len_u != _invlist_len(u)) {
invlist_set_len(u, len_u);
invlist_trim(u);
array_u = invlist_array(u);
assert(a != b);
/* Special case if either one is empty */
- len_a = invlist_len(a);
- if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
+ len_a = _invlist_len(a);
+ if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
if (len_a != 0 && complement_b) {
/* Set result to final length, which can change the pointer to array_r, so
* re-find it */
- if (len_r != invlist_len(r)) {
+ if (len_r != _invlist_len(r)) {
invlist_set_len(r, len_r);
invlist_trim(r);
array_r = invlist_array(r);
len = 0;
}
else {
- len = invlist_len(invlist);
+ len = _invlist_len(invlist);
}
/* If comes after the final entry, can just append it to the end */
if (len == 0
|| start >= invlist_array(invlist)
- [invlist_len(invlist) - 1])
+ [_invlist_len(invlist) - 1])
{
_append_range_to_invlist(invlist, start, end);
return invlist;
#endif
-STATIC bool
-S__invlist_contains_cp(pTHX_ SV* const invlist, const UV cp)
-{
- /* Does <invlist> contain code point <cp> as part of the set? */
-
- IV index = invlist_search(invlist, cp);
-
- PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP;
-
- return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index);
-}
-
PERL_STATIC_INLINE SV*
S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
return _add_range_to_invlist(invlist, cp, cp);
* have a zero; removes it otherwise. As described above, the data
* structure is set up so that this is very efficient */
- UV* len_pos = get_invlist_len_addr(invlist);
+ UV* len_pos = _get_invlist_len_addr(invlist);
PERL_ARGS_ASSERT__INVLIST_INVERT;
_invlist_invert(invlist);
- len = invlist_len(invlist);
+ len = _invlist_len(invlist);
if (len != 0) { /* If empty do nothing */
array = invlist_array(invlist);
/* Need to allocate extra space to accommodate Perl's addition of a
* trailing NUL to SvPV's, since it thinks they are always strings */
- SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
+ SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
STRLEN length = SvCUR(invlist);
PERL_ARGS_ASSERT_INVLIST_CLONE;
* will start over at the beginning of the list */
UV* pos = get_invlist_iter_addr(invlist);
- UV len = invlist_len(invlist);
+ UV len = _invlist_len(invlist);
UV *array;
PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
* 0, or if the list is empty. If this distinction matters to you, check
* for emptiness before calling this function */
- UV len = invlist_len(invlist);
+ UV len = _invlist_len(invlist);
UV *array;
PERL_ARGS_ASSERT_INVLIST_HIGHEST;
UV* array_a = invlist_array(a);
UV* array_b = invlist_array(b);
- UV len_a = invlist_len(a);
- UV len_b = invlist_len(b);
+ UV len_a = _invlist_len(a);
+ UV len_b = _invlist_len(b);
UV i = 0; /* current index into the arrays */
bool retval = TRUE; /* Assume are identical until proven otherwise */
/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
{
dVAR;
- register regnode *ret; /* Will be the head of the group. */
- register regnode *br;
- register regnode *lastbr;
- register regnode *ender = NULL;
- register I32 parno = 0;
+ regnode *ret; /* Will be the head of the group. */
+ regnode *br;
+ regnode *lastbr;
+ regnode *ender = NULL;
+ I32 parno = 0;
I32 flags;
U32 oregflags = RExC_flags;
bool have_branch = 0;
S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
{
dVAR;
- register regnode *ret;
- register regnode *chain = NULL;
- register regnode *latest;
+ regnode *ret;
+ regnode *chain = NULL;
+ regnode *latest;
I32 flags = 0, c = 0;
GET_RE_DEBUG_FLAGS_DECL;
S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
dVAR;
- register regnode *ret;
- register char op;
- register char *next;
+ regnode *ret;
+ char op;
+ char *next;
I32 flags;
const char * const origparse = RExC_parse;
I32 min;
return(ret);
}
-
-/* reg_namedseq(pRExC_state,UVp, UV depth)
+STATIC bool
+S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
+{
- This is expected to be called by a parser routine that has
- recognized '\N' and needs to handle the rest. RExC_parse is
- expected to point at the first char following the N at the time
- of the call.
+ /* This is expected to be called by a parser routine that has recognized '\N'
+ and needs to handle the rest. RExC_parse is expected to point at the first
+ char following the N at the time of the call. On successful return,
+ RExC_parse has been updated to point to just after the sequence identified
+ by this routine, and <*flagp> has been updated.
- The \N may be inside (indicated by valuep not being NULL) or outside a
+ The \N may be inside (indicated by the boolean <in_char_class>) or outside a
character class.
\N may begin either a named sequence, or if outside a character class, mean
to match a non-newline. For non single-quoted regexes, the tokenizer has
- attempted to decide which, and in the case of a named sequence converted it
+ attempted to decide which, and in the case of a named sequence, converted it
into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
where c1... are the characters in the sequence. For single-quoted regexes,
the tokenizer passes the \N sequence through unchanged; this code will not
- attempt to determine this nor expand those. The net effect is that if the
- beginning of the passed-in pattern isn't '{U+' or there is no '}', it
- signals that this \N occurrence means to match a non-newline.
-
+ attempt to determine this nor expand those, instead raising a syntax error.
+ The net effect is that if the beginning of the passed-in pattern isn't '{U+'
+ or there is no '}', it signals that this \N occurrence means to match a
+ non-newline.
+
Only the \N{U+...} form should occur in a character class, for the same
reason that '.' inside a character class means to just match a period: it
just doesn't make sense.
-
- If valuep is non-null then it is assumed that we are parsing inside
- of a charclass definition and the first codepoint in the resolved
- string is returned via *valuep and the routine will return NULL.
- In this mode if a multichar string is returned from the charnames
- handler, a warning will be issued, and only the first char in the
- sequence will be examined. If the string returned is zero length
- then the value of *valuep is undefined and NON-NULL will
- be returned to indicate failure. (This will NOT be a valid pointer
- to a regnode.)
-
- If valuep is null then it is assumed that we are parsing normal text and a
- new EXACT node is inserted into the program containing the resolved string,
- and a pointer to the new node is returned. But if the string is zero length
- a NOTHING node is emitted instead.
- On success RExC_parse is set to the char following the endbrace.
- Parsing failures will generate a fatal error via vFAIL(...)
+ The function raises an error (via vFAIL), and doesn't return for various
+ syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
+ success; it returns FALSE otherwise.
+
+ If <valuep> is non-null, it means the caller can accept an input sequence
+ consisting of a just a single code point; <*valuep> is set to that value
+ if the input is such.
+
+ If <node_p> is non-null it signifies that the caller can accept any other
+ legal sequence (i.e., one that isn't just a single code point). <*node_p>
+ is set as follows:
+ 1) \N means not-a-NL: points to a newly created REG_ANY node;
+ 2) \N{}: points to a new NOTHING node;
+ 3) otherwise: points to a new EXACT node containing the resolved
+ string.
+ Note that FALSE is returned for single code point sequences if <valuep> is
+ null.
*/
-STATIC regnode *
-S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
-{
+
char * endbrace; /* '}' following the name */
- regnode *ret = NULL;
char* p;
+ char *endchar; /* Points to '.' or '}' ending cur char in the input
+ stream */
+ bool has_multiple_chars; /* true if the input stream contains a sequence of
+ more than one character */
GET_RE_DEBUG_FLAGS_DECL;
- PERL_ARGS_ASSERT_REG_NAMEDSEQ;
+ PERL_ARGS_ASSERT_GROK_BSLASH_N;
GET_RE_DEBUG_FLAGS;
+ assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
+
/* The [^\n] meaning of \N ignores spaces and comments under the /x
* modifier. The other meaning does not */
p = (RExC_flags & RXf_PMf_EXTENDED)
? regwhite( pRExC_state, RExC_parse )
: RExC_parse;
-
+
/* Disambiguate between \N meaning a named character versus \N meaning
* [^\n]. The former is assumed when it can't be the latter. */
if (*p != '{' || regcurly(p)) {
RExC_parse = p;
- if (valuep) {
+ if (! node_p) {
/* no bare \N in a charclass */
- vFAIL("\\N in a character class must be a named character: \\N{...}");
- }
+ if (in_char_class) {
+ vFAIL("\\N in a character class must be a named character: \\N{...}");
+ }
+ return FALSE;
+ }
nextchar(pRExC_state);
- ret = reg_node(pRExC_state, REG_ANY);
+ *node_p = reg_node(pRExC_state, REG_ANY);
*flagp |= HASWIDTH|SIMPLE;
RExC_naughty++;
RExC_parse--;
- Set_Node_Length(ret, 1); /* MJD */
- return ret;
+ Set_Node_Length(*node_p, 1); /* MJD */
+ return TRUE;
}
- /* Here, we have decided it should be a named sequence */
+ /* Here, we have decided it should be a named character or sequence */
/* The test above made sure that the next real character is a '{', but
* under the /x modifier, it could be separated by space (or a comment and
}
if (endbrace == RExC_parse) { /* empty: \N{} */
- if (! valuep) {
- RExC_parse = endbrace + 1;
- return reg_node(pRExC_state,NOTHING);
- }
-
- if (SIZE_ONLY) {
- ckWARNreg(RExC_parse,
- "Ignoring zero length \\N{} in character class"
- );
- RExC_parse = endbrace + 1;
+ bool ret = TRUE;
+ if (node_p) {
+ *node_p = reg_node(pRExC_state,NOTHING);
+ }
+ else if (in_char_class) {
+ if (SIZE_ONLY && in_char_class) {
+ ckWARNreg(RExC_parse,
+ "Ignoring zero length \\N{} in character class"
+ );
+ }
+ ret = FALSE;
}
- *valuep = 0;
- return (regnode *) &RExC_parse; /* Invalid regnode pointer */
+ else {
+ return FALSE;
+ }
+ nextchar(pRExC_state);
+ return ret;
}
- REQUIRE_UTF8; /* named sequences imply Unicode semantics */
+ RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
RExC_parse += 2; /* Skip past the 'U+' */
- if (valuep) { /* In a bracketed char class */
- /* We only pay attention to the first char of
- multichar strings being returned. I kinda wonder
+ endchar = RExC_parse + strcspn(RExC_parse, ".}");
+
+ /* Code points are separated by dots. If none, there is only one code
+ * point, and is terminated by the brace */
+ has_multiple_chars = (endchar < endbrace);
+
+ if (valuep && (! has_multiple_chars || in_char_class)) {
+ /* We only pay attention to the first char of
+ multichar strings being returned in char classes. I kinda wonder
if this makes sense as it does change the behaviour
from earlier versions, OTOH that behaviour was broken
as well. XXX Solution is to recharacterize as
[rest-of-class]|multi1|multi2... */
- STRLEN length_of_hex;
- I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
+ I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
| PERL_SCAN_DISALLOW_PREFIX
| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
-
- char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
- if (endchar < endbrace) {
- ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
- }
- length_of_hex = (STRLEN)(endchar - RExC_parse);
- *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
+ *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
/* The tokenizer should have guaranteed validity, but it's possible to
* bypass it by using single quoting, so check */
? UTF8SKIP(RExC_parse)
: 1;
/* Guard against malformed utf8 */
- if (RExC_parse >= endchar) RExC_parse = endchar;
+ if (RExC_parse >= endchar) {
+ RExC_parse = endchar;
+ }
vFAIL("Invalid hexadecimal number in \\N{U+...}");
- }
+ }
- RExC_parse = endbrace + 1;
- if (endchar == endbrace) return NULL;
+ if (in_char_class && has_multiple_chars) {
+ ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
+ }
+ RExC_parse = endbrace + 1;
+ }
+ else if (! node_p || ! has_multiple_chars) {
- ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
+ /* Here, the input is legal, but not according to the caller's
+ * options. We fail without advancing the parse, so that the
+ * caller can try again */
+ RExC_parse = p;
+ return FALSE;
}
- else { /* Not a char class */
+ else {
/* What is done here is to convert this to a sub-pattern of the form
* (?:\x{char1}\x{char2}...)
SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
STRLEN len;
- char *endchar; /* Points to '.' or '}' ending cur char in the input
- stream */
char *orig_end = RExC_end;
+ I32 flags;
while (RExC_parse < endbrace) {
- /* Code points are separated by dots. If none, there is only one
- * code point, and is terminated by the brace */
- endchar = RExC_parse + strcspn(RExC_parse, ".}");
-
/* Convert to notation the rest of the code understands */
sv_catpv(substitute_parse, "\\x{");
sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
/* Point to the beginning of the next character in the sequence. */
RExC_parse = endchar + 1;
+ endchar = RExC_parse + strcspn(RExC_parse, ".}");
}
sv_catpv(substitute_parse, ")");
/* The values are Unicode, and therefore not subject to recoding */
RExC_override_recoding = 1;
- ret = reg(pRExC_state, 1, flagp, depth+1);
+ *node_p = reg(pRExC_state, 1, &flags, depth+1);
+ *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
RExC_parse = endbrace;
RExC_end = orig_end;
RExC_override_recoding = 0;
- nextchar(pRExC_state);
+ nextchar(pRExC_state);
}
- return ret;
+ return TRUE;
}
}
PERL_STATIC_INLINE void
-S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, STRLEN len, UV code_point)
+S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
{
- /* This knows the details about sizing an EXACTish node, and potentially
- * populating it with a single character. If <len> is non-zero, it assumes
- * that the node has already been populated, and just does the sizing,
- * ignoring <code_point>. Otherwise it looks at <code_point> and
- * calculates what <len> should be. In pass 1, it sizes the node
- * appropriately. In pass 2, it additionally will populate the node's
- * STRING with <code_point>, if <len> is 0.
+ /* This knows the details about sizing an EXACTish node, setting flags for
+ * it (by setting <*flagp>, and potentially populating it with a single
+ * character.
+ *
+ * If <len> is non-zero, this function assumes that the node has already
+ * been populated, and just does the sizing. In this case <code_point>
+ * should be the final code point that has already been placed into the
+ * node. This value will be ignored except that under some circumstances
+ * <*flagp> is set based on it.
+ *
+ * If <len is zero, the function assumes that the node is to contain only
+ * the single character given by <code_point> and calculates what <len>
+ * should be. In pass 1, it sizes the node appropriately. In pass 2, it
+ * additionally will populate the node's STRING with <code_point>, if <len>
+ * is 0. In both cases <*flagp> is appropriately set
*
* It knows that under FOLD, UTF characters and the Latin Sharp S must be
* folded (the latter only when the rules indicate it can match 'ss') */
Copy((char *) character, STRING(node), len, char);
}
}
+
+ *flagp |= HASWIDTH;
+ if (len == 1 && UNI_IS_INVARIANT(code_point))
+ *flagp |= SIMPLE;
}
/*
S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
dVAR;
- register regnode *ret = NULL;
+ regnode *ret = NULL;
I32 flags;
char *parse_start = RExC_parse;
U8 op;
case '[':
{
char * const oregcomp_parse = ++RExC_parse;
- ret = regclass(pRExC_state,depth+1);
+ ret = regclass(pRExC_state, flagp,depth+1);
if (*RExC_parse != ']') {
RExC_parse = oregcomp_parse;
vFAIL("Unmatched [");
}
nextchar(pRExC_state);
- *flagp |= HASWIDTH|SIMPLE;
Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
break;
}
}
RExC_parse--;
- ret = regclass(pRExC_state,depth+1);
+ ret = regclass(pRExC_state, flagp,depth+1);
RExC_end = oldregxend;
RExC_parse--;
Set_Node_Offset(ret, parse_start + 2);
Set_Node_Cur_Length(ret);
nextchar(pRExC_state);
- *flagp |= HASWIDTH|SIMPLE;
}
break;
case 'N':
- /* Handle \N and \N{NAME} here and not below because it can be
- multicharacter. join_exact() will join them up later on.
- Also this makes sure that things like /\N{BLAH}+/ and
- \N{BLAH} being multi char Just Happen. dmq*/
+ /* Handle \N and \N{NAME} with multiple code points here and not
+ * below because it can be multicharacter. join_exact() will join
+ * them up later on. Also this makes sure that things like
+ * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
+ * The options to the grok function call causes it to fail if the
+ * sequence is just a single code point. We then go treat it as
+ * just another character in the current EXACT node, and hence it
+ * gets uniform treatment with all the other characters. The
+ * special treatment for quantifiers is not needed for such single
+ * character sequences */
++RExC_parse;
- ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
+ if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
+ RExC_parse--;
+ goto defchar;
+ }
break;
case 'k': /* Handle \k<NAME> and \k'NAME' */
parse_named_seq:
RExC_parse++;
defchar: {
- register STRLEN len;
- register UV ender;
- register char *p;
+ STRLEN len = 0;
+ UV ender;
+ char *p;
char *s;
#define MAX_NODE_STRING_SIZE 127
- char foldbuf[MAX_NODE_STRING_SIZE];
+ char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
+ char *s0;
+ U8 upper_parse = MAX_NODE_STRING_SIZE;
STRLEN foldlen;
U8 node_type;
bool next_is_quantifier;
+ char * oldp = NULL;
ender = 0;
node_type = compute_EXACTish(pRExC_state);
* actual node, as the node doesn't exist yet */
s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
+ s0 = s;
+
+ reparse:
+
/* XXX The node can hold up to 255 bytes, yet this only goes to
* 127. I (khw) do not know why. Keeping it somewhat less than
* 255 allows us to not have to worry about overflow due to
* could back off to end with only a code point that isn't such a
* non-final, but it is possible for there not to be any in the
* entire node. */
- for (len = 0, p = RExC_parse - 1;
- len < MAX_NODE_STRING_SIZE && p < RExC_end;
+ for (p = RExC_parse - 1;
+ len < upper_parse && p < RExC_end;
len++)
{
- char * const oldp = p;
+ oldp = p;
if (RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
case 'g': case 'G': /* generic-backref, pos assertion */
case 'h': case 'H': /* HORIZWS */
case 'k': case 'K': /* named backref, keep marker */
- case 'N': /* named char sequence */
case 'p': case 'P': /* Unicode property */
case 'R': /* LNBREAK */
case 's': case 'S': /* space class */
ender = '\n';
p++;
break;
+ case 'N': /* Handle a single-code point named character. */
+ /* The options cause it to fail if a multiple code
+ * point sequence. Handle those in the switch() above
+ * */
+ RExC_parse = p + 1;
+ if (! grok_bslash_N(pRExC_state, NULL, &ender,
+ flagp, depth, FALSE))
+ {
+ RExC_parse = p = oldp;
+ goto loopdone;
+ }
+ p = RExC_parse;
+ if (ender > 0xff) {
+ REQUIRE_UTF8;
+ }
+ break;
case 'r':
ender = '\r';
p++;
break;
} /* End of switch on the literal */
+ /* Here, have looked at the literal character and <ender>
+ * contains its ordinal, <p> points to the character after it
+ */
+
if ( RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
len += foldlen - 1;
}
else {
- REGC((char)ender, s++);
+ *(s++) = ender;
}
}
else if (UTF) {
} /* End of loop through literal characters */
+ /* Here we have either exhausted the input or ran out of room in
+ * the node. (If we encountered a character that can't be in the
+ * node, transfer is made directly to <loopdone>, and so we
+ * wouldn't have fallen off the end of the loop.) In the latter
+ * case, we artificially have to split the node into two, because
+ * we just don't have enough space to hold everything. This
+ * creates a problem if the final character participates in a
+ * multi-character fold in the non-final position, as a match that
+ * should have occurred won't, due to the way nodes are matched,
+ * and our artificial boundary. So back off until we find a non-
+ * problematic character -- one that isn't at the beginning or
+ * middle of such a fold. (Either it doesn't participate in any
+ * folds, or appears only in the final position of all the folds it
+ * does participate in.) A better solution with far fewer false
+ * positives, and that would fill the nodes more completely, would
+ * be to actually have available all the multi-character folds to
+ * test against, and to back-off only far enough to be sure that
+ * this node isn't ending with a partial one. <upper_parse> is set
+ * further below (if we need to reparse the node) to include just
+ * up through that final non-problematic character that this code
+ * identifies, so when it is set to less than the full node, we can
+ * skip the rest of this */
+ if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
+
+ const STRLEN full_len = len;
+
+ assert(len >= MAX_NODE_STRING_SIZE);
+
+ /* Here, <s> points to the final byte of the final character.
+ * Look backwards through the string until find a non-
+ * problematic character */
+
+ if (! UTF) {
+
+ /* These two have no multi-char folds to non-UTF characters
+ */
+ if (ASCII_FOLD_RESTRICTED || LOC) {
+ goto loopdone;
+ }
+
+ while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
+ len = s - s0 + 1;
+ }
+ else {
+ if (! PL_NonL1NonFinalFold) {
+ PL_NonL1NonFinalFold = _new_invlist_C_array(
+ NonL1_Perl_Non_Final_Folds_invlist);
+ }
+
+ /* Point to the first byte of the final character */
+ s = (char *) utf8_hop((U8 *) s, -1);
+
+ while (s >= s0) { /* Search backwards until find
+ non-problematic char */
+ if (UTF8_IS_INVARIANT(*s)) {
+
+ /* There are no ascii characters that participate
+ * in multi-char folds under /aa. In EBCDIC, the
+ * non-ascii invariants are all control characters,
+ * so don't ever participate in any folds. */
+ if (ASCII_FOLD_RESTRICTED
+ || ! IS_NON_FINAL_FOLD(*s))
+ {
+ break;
+ }
+ }
+ else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+
+ /* No Latin1 characters participate in multi-char
+ * folds under /l */
+ if (LOC
+ || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
+ *s, *(s+1))))
+ {
+ break;
+ }
+ }
+ else if (! _invlist_contains_cp(
+ PL_NonL1NonFinalFold,
+ valid_utf8_to_uvchr((U8 *) s, NULL)))
+ {
+ break;
+ }
+
+ /* Here, the current character is problematic in that
+ * it does occur in the non-final position of some
+ * fold, so try the character before it, but have to
+ * special case the very first byte in the string, so
+ * we don't read outside the string */
+ s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
+ } /* End of loop backwards through the string */
+
+ /* If there were only problematic characters in the string,
+ * <s> will point to before s0, in which case the length
+ * should be 0, otherwise include the length of the
+ * non-problematic character just found */
+ len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
+ }
+
+ /* Here, have found the final character, if any, that is
+ * non-problematic as far as ending the node without splitting
+ * it across a potential multi-char fold. <len> contains the
+ * number of bytes in the node up-to and including that
+ * character, or is 0 if there is no such character, meaning
+ * the whole node contains only problematic characters. In
+ * this case, give up and just take the node as-is. We can't
+ * do any better */
+ if (len == 0) {
+ len = full_len;
+ } else {
+
+ /* Here, the node does contain some characters that aren't
+ * problematic. If one such is the final character in the
+ * node, we are done */
+ if (len == full_len) {
+ goto loopdone;
+ }
+ else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
+
+ /* If the final character is problematic, but the
+ * penultimate is not, back-off that last character to
+ * later start a new node with it */
+ p = oldp;
+ goto loopdone;
+ }
+
+ /* Here, the final non-problematic character is earlier
+ * in the input than the penultimate character. What we do
+ * is reparse from the beginning, going up only as far as
+ * this final ok one, thus guaranteeing that the node ends
+ * in an acceptable character. The reason we reparse is
+ * that we know how far in the character is, but we don't
+ * know how to correlate its position with the input parse.
+ * An alternate implementation would be to build that
+ * correlation as we go along during the original parse,
+ * but that would entail extra work for every node, whereas
+ * this code gets executed only when the string is too
+ * large for the node, and the final two characters are
+ * problematic, an infrequent occurrence. Yet another
+ * possible strategy would be to save the tail of the
+ * string, and the next time regatom is called, initialize
+ * with that. The problem with this is that unless you
+ * back off one more character, you won't be guaranteed
+ * regatom will get called again, unless regbranch,
+ * regpiece ... are also changed. If you do back off that
+ * extra character, so that there is input guaranteed to
+ * force calling regatom, you can't handle the case where
+ * just the first character in the node is acceptable. I
+ * (khw) decided to try this method which doesn't have that
+ * pitfall; if performance issues are found, we can do a
+ * combination of the current approach plus that one */
+ upper_parse = len;
+ len = 0;
+ s = s0;
+ goto reparse;
+ }
+ } /* End of verifying node ends with an appropriate char */
+
loopdone: /* Jumped to when encounters something that shouldn't be in
the node */
+
+ /* I (khw) don't know if you can get here with zero length, but the
+ * old code handled this situation by creating a zero-length EXACT
+ * node. Might as well be NOTHING instead */
+ if (len == 0) {
+ OP(ret) = NOTHING;
+ }
+ else{
+ alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
+ }
+
RExC_parse = p - 1;
Set_Node_Cur_Length(ret); /* MJD */
nextchar(pRExC_state);
if (iv < 0)
vFAIL("Internal disaster");
}
- if (len > 0)
- *flagp |= HASWIDTH;
- if (len == 1 && UNI_IS_INVARIANT(ender))
- *flagp |= SIMPLE;
- alloc_maybe_populate_EXACT(pRExC_state, ret, len, 0);
} /* End of label 'defchar:' */
break;
} /* End of giant switch on input character */
above 255, a range list is used */
STATIC regnode *
-S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
+S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
dVAR;
- register UV nextvalue;
- register UV prevvalue = OOB_UNICODE;
- register IV range = 0;
- UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
- register regnode *ret;
+ UV nextvalue;
+ UV prevvalue = OOB_UNICODE;
+ IV range = 0;
+ UV value = 0;
+ regnode *ret;
STRLEN numlen;
IV namedclass = OOB_NAMEDCLASS;
char *rangebegin = NULL;
if this makes sense as it does change the behaviour
from earlier versions, OTOH that behaviour was broken
as well. */
- UV v; /* value is register so we cant & it /grrr */
- if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
+ if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
+ TRUE /* => charclass */))
+ {
goto parseit;
}
- value= v;
}
break;
case 'p':
}
if (!SIZE_ONLY) {
cp_list = add_cp_to_invlist(cp_list, '-');
- element_count++;
}
+ element_count++;
} else
range = 1; /* yeah, it's a range! */
continue; /* but do it the next time */
if (invert) {
op += NALNUM - ALNUM;
}
+ *flagp |= HASWIDTH|SIMPLE;
break;
/* The second group doesn't depend of the charset modifiers.
case ANYOF_HORIZWS:
is_horizws:
op = (invert) ? NHORIZWS : HORIZWS;
+ *flagp |= HASWIDTH|SIMPLE;
break;
case ANYOF_NVERTWS:
/* FALLTHROUGH */
case ANYOF_VERTWS:
op = (invert) ? NVERTWS : VERTWS;
+ *flagp |= HASWIDTH|SIMPLE;
break;
case ANYOF_MAX:
if (invert) {
if (! LOC && value == '\n') {
op = REG_ANY; /* Optimize [^\n] */
+ *flagp |= HASWIDTH|SIMPLE;
+ RExC_naughty++;
}
}
else if (value < 256 || UTF) {
if (prevvalue == '0') {
if (value == '9') {
op = (invert) ? NDIGITA : DIGITA;
+ *flagp |= HASWIDTH|SIMPLE;
}
}
}
if (! SIZE_ONLY) {
FLAGS(ret) = arg;
}
+ *flagp |= HASWIDTH|SIMPLE;
}
else if (PL_regkind[op] == EXACT) {
- alloc_maybe_populate_EXACT(pRExC_state, ret, 0, value);
+ alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
}
RExC_parse = (char *) cur_parse;
if (SIZE_ONLY)
return ret;
- /****** !SIZE_ONLY AFTER HERE *********/
+ /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
/* If folding, we calculate all characters that could fold to or from the
* ones already on the list */
* rules hard-coded into Perl. (This case happens legitimately
* during compilation of Perl itself before the Unicode tables
* are generated) */
- if (invlist_len(PL_utf8_foldable) == 0) {
+ if (_invlist_len(PL_utf8_foldable) == 0) {
PL_utf8_foldclosures = newHV();
}
else {
* it doesn't match anything. (perluniprops.pod notes such
* properties) */
op = OPFAIL;
+ *flagp |= HASWIDTH|SIMPLE;
}
else if (start == end) { /* The range is a single code point */
if (! invlist_iternext(cp_list, &start, &end)
else if (start == 0) {
if (end == UV_MAX) {
op = SANY;
+ *flagp |= HASWIDTH|SIMPLE;
+ RExC_naughty++;
}
else if (end == '\n' - 1
&& invlist_iternext(cp_list, &start, &end)
&& start == '\n' + 1 && end == UV_MAX)
{
op = REG_ANY;
+ *flagp |= HASWIDTH|SIMPLE;
+ RExC_naughty++;
}
}
RExC_parse = (char *)cur_parse;
if (PL_regkind[op] == EXACT) {
- alloc_maybe_populate_EXACT(pRExC_state, ret, 0, value);
+ alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
}
SvREFCNT_dec(listsv);
}
/* If have completely emptied it, remove it completely */
- if (invlist_len(cp_list) == 0) {
+ if (_invlist_len(cp_list) == 0) {
SvREFCNT_dec(cp_list);
cp_list = NULL;
}
RExC_rxi->data->data[n] = (void*)rv;
ARG_SET(ret, n);
}
+
+ *flagp |= HASWIDTH|SIMPLE;
return ret;
}
#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
{
dVAR;
- register regnode *ptr;
+ regnode *ptr;
regnode * const ret = RExC_emit;
GET_RE_DEBUG_FLAGS_DECL;
S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
{
dVAR;
- register regnode *ptr;
+ regnode *ptr;
regnode * const ret = RExC_emit;
GET_RE_DEBUG_FLAGS_DECL;
S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
{
dVAR;
- register regnode *src;
- register regnode *dst;
- register regnode *place;
+ regnode *src;
+ regnode *dst;
+ regnode *place;
const int offset = regarglen[(U8)op];
const int size = NODE_STEP_REGNODE + offset;
GET_RE_DEBUG_FLAGS_DECL;
S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
{
dVAR;
- register regnode *scan;
+ regnode *scan;
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGTAIL;
S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
{
dVAR;
- register regnode *scan;
+ regnode *scan;
U8 exact = PSEUDO;
#ifdef EXPERIMENTAL_INPLACESCAN
I32 min = 0;
{
#ifdef DEBUGGING
dVAR;
- register int k;
+ int k;
/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
static const char * const anyofs[] = {
Perl_regnext(pTHX_ register regnode *p)
{
dVAR;
- register I32 offset;
+ I32 offset;
if (!p)
return(NULL);
SV* sv, I32 indent, U32 depth)
{
dVAR;
- register U8 op = PSEUDO; /* Arbitrary non-END op. */
- register const regnode *next;
+ U8 op = PSEUDO; /* Arbitrary non-END op. */
+ const regnode *next;
const regnode *optstart= NULL;
RXi_GET_DECL(r,ri);
if (PL_regkind[(U8)op] == BRANCHJ) {
assert(next);
{
- register const regnode *nnode = (OP(next) == LONGJMP
- ? regnext((regnode *)next)
- : next);
+ const regnode *nnode = (OP(next) == LONGJMP
+ ? regnext((regnode *)next)
+ : next);
if (last && nnode > last)
nnode = last;
DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);