#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
-#ifdef op
-#undef op
-#endif /* op */
-
-#ifdef MSDOS
-# if defined(BUGGY_MSC6)
- /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
-# pragma optimize("a",off)
- /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
-# pragma optimize("w",on )
-# endif /* BUGGY_MSC6 */
-#endif /* MSDOS */
-
#ifndef STATIC
#define STATIC static
#endif
-typedef struct RExC_state_t {
+struct RExC_state_t {
U32 flags; /* RXf_* are we folding, multilining? */
U32 pm_flags; /* PMf_* stuff from the calling PMOP */
char *precomp; /* uncompiled string. */
char *start; /* Start of input for compile */
char *end; /* End of input for compile */
char *parse; /* Input-scan pointer. */
- I32 whilem_seen; /* number of WHILEM in this expr */
+ SSize_t whilem_seen; /* number of WHILEM in this expr */
regnode *emit_start; /* Start of emitted-code area */
regnode *emit_bound; /* First regnode outside of the allocated space */
- regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
+ regnode *emit; /* Code-emit pointer; if = &emit_dummy,
+ implies compiling, so don't emit */
+ regnode_ssc emit_dummy; /* placeholder for emit to point to;
+ large enough for the largest
+ non-EXACTish node, so can use it as
+ scratch in pass1 */
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
U32 seen;
- I32 size; /* Code size. */
+ SSize_t size; /* Code size. */
I32 npar; /* Capture buffer count, (OPEN). */
I32 cpar; /* Capture buffer count, (CLOSE). */
I32 nestroot; /* root parens we are in - used by accept */
I32 recurse_count; /* Number of recurse regops */
I32 in_lookbehind;
I32 contains_locale;
+ I32 contains_i;
I32 override_recoding;
I32 in_multi_char_class;
struct reg_code_block *code_blocks; /* positions of literal (?{})
#define RExC_lastnum (pRExC_state->lastnum)
#define RExC_paren_name_list (pRExC_state->paren_name_list)
#endif
-} RExC_state_t;
+};
#define RExC_flags (pRExC_state->flags)
#define RExC_pm_flags (pRExC_state->pm_flags)
#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
#endif
#define RExC_emit (pRExC_state->emit)
+#define RExC_emit_dummy (pRExC_state->emit_dummy)
#define RExC_emit_start (pRExC_state->emit_start)
#define RExC_emit_bound (pRExC_state->emit_bound)
#define RExC_naughty (pRExC_state->naughty)
#define RExC_recurse_count (pRExC_state->recurse_count)
#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
#define RExC_contains_locale (pRExC_state->contains_locale)
+#define RExC_contains_i (pRExC_state->contains_i)
#define RExC_override_recoding (pRExC_state->override_recoding)
#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
((*s) == '{' && regcurly(s, FALSE)))
-#ifdef SPSTART
-#undef SPSTART /* dratted cpp namespace... */
-#endif
/*
* Flags to be passed up and down.
*/
#define namedclass_to_classnum(class) ((int) ((class) / 2))
#define classnum_to_namedclass(classnum) ((classnum) * 2)
+#define _invlist_union_complement_2nd(a, b, output) \
+ _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
+#define _invlist_intersection_complement_2nd(a, b, output) \
+ _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
+
/* About scan_data_t.
During optimisation we recurse through the regexp program performing
- max_offset
Only used for floating strings. This is the rightmost point that
- the string can appear at. If set to I32 max it indicates that the
+ the string can appear at. If set to SSize_t_MAX it indicates that the
string can occur infinitely far to the right.
- minlenp
typedef struct scan_data_t {
/*I32 len_min; unused */
/*I32 len_delta; unused */
- I32 pos_min;
- I32 pos_delta;
+ SSize_t pos_min;
+ SSize_t pos_delta;
SV *last_found;
- I32 last_end; /* min value, <0 unless valid. */
- I32 last_start_min;
- I32 last_start_max;
+ SSize_t last_end; /* min value, <0 unless valid. */
+ SSize_t last_start_min;
+ SSize_t last_start_max;
SV **longest; /* Either &l_fixed, or &l_float. */
SV *longest_fixed; /* longest fixed string found in pattern */
- I32 offset_fixed; /* offset where it starts */
- I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
+ SSize_t offset_fixed; /* offset where it starts */
+ SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
I32 lookbehind_fixed; /* is the position of the string modfied by LB */
SV *longest_float; /* longest floating string found in pattern */
- I32 offset_float_min; /* earliest point in string it can appear */
- I32 offset_float_max; /* latest point in string it can appear */
- I32 *minlen_float; /* pointer to the minlen relevant to the string */
- I32 lookbehind_float; /* is the position of the string modified by LB */
+ SSize_t offset_float_min; /* earliest point in string it can appear */
+ SSize_t offset_float_max; /* latest point in string it can appear */
+ SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
+ SSize_t lookbehind_float; /* is the pos of the string modified by LB */
I32 flags;
I32 whilem_c;
- I32 *last_closep;
- struct regnode_charclass_class *start_class;
+ SSize_t *last_closep;
+ regnode_ssc *start_class;
} scan_data_t;
+/* The below is perhaps overboard, but this allows us to save a test at the
+ * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
+ * and 'a' differ by a single bit; the same with the upper and lower case of
+ * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
+ * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
+ * then inverts it to form a mask, with just a single 0, in the bit position
+ * where the upper- and lowercase differ. XXX There are about 40 other
+ * instances in the Perl core where this micro-optimization could be used.
+ * Should decide if maintenance cost is worse, before changing those
+ *
+ * Returns a boolean as to whether or not 'v' is either a lowercase or
+ * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
+ * compile-time constant, the generated code is better than some optimizing
+ * compilers figure out, amounting to a mask and test. The results are
+ * meaningless if 'c' is not one of [A-Za-z] */
+#define isARG2_lower_or_UPPER_ARG1(c, v) \
+ (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
+
/*
* Forward declarations for pregcomp()'s friends.
*/
#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
-#ifdef NO_UNARY_PLUS
-# define SF_FIX_SHIFT_EOL (0+2)
-# define SF_FL_SHIFT_EOL (0+4)
-#else
-# define SF_FIX_SHIFT_EOL (+2)
-# define SF_FL_SHIFT_EOL (+4)
-#endif
+#define SF_FIX_SHIFT_EOL (+2)
+#define SF_FL_SHIFT_EOL (+4)
#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
#define SCF_SEEN_ACCEPT 0x8000
+#define SCF_TRIE_DOING_RESTUDY 0x10000
#define UTF cBOOL(RExC_utf8)
#define MARKER1 "<-- HERE" /* marker as it appears in the description */
#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
-#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
+#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
+
+#define REPORT_LOCATION_ARGS(offset) \
+ UTF8fARG(UTF, offset, RExC_precomp), \
+ UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
/*
* Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
} STMT_END
#define FAIL(msg) _FAIL( \
- Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
- msg, (int)len, RExC_precomp, ellipses))
+ Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
+ msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
#define FAIL2(msg,arg) _FAIL( \
- Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
- arg, (int)len, RExC_precomp, ellipses))
+ Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
+ arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
/*
* Simple_vFAIL -- like FAIL, but marks the current location in the scan
#define Simple_vFAIL(m) STMT_START { \
const IV offset = RExC_parse - RExC_precomp; \
Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
- m, (int)offset, RExC_precomp, RExC_precomp + offset); \
+ m, REPORT_LOCATION_ARGS(offset)); \
} STMT_END
/*
*/
#define Simple_vFAIL2(m,a1) STMT_START { \
const IV offset = RExC_parse - RExC_precomp; \
- S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
- (int)offset, RExC_precomp, RExC_precomp + offset); \
+ S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
+ REPORT_LOCATION_ARGS(offset)); \
} STMT_END
/*
*/
#define Simple_vFAIL3(m, a1, a2) STMT_START { \
const IV offset = RExC_parse - RExC_precomp; \
- S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
- (int)offset, RExC_precomp, RExC_precomp + offset); \
+ S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
+ REPORT_LOCATION_ARGS(offset)); \
} STMT_END
/*
*/
#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
const IV offset = RExC_parse - RExC_precomp; \
- S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
- (int)offset, RExC_precomp, RExC_precomp + offset); \
+ S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
+ REPORT_LOCATION_ARGS(offset)); \
} STMT_END
#define vFAIL4(m,a1,a2,a3) STMT_START { \
Simple_vFAIL4(m, a1, a2, a3); \
} STMT_END
+/* A specialized version of vFAIL2 that works with UTF8f */
+#define vFAIL2utf8f(m, a1) STMT_START { \
+ const IV offset = RExC_parse - RExC_precomp; \
+ if (!SIZE_ONLY) \
+ SAVEFREESV(RExC_rx_sv); \
+ S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
+ REPORT_LOCATION_ARGS(offset)); \
+} STMT_END
+
+
/* m is not necessarily a "literal string", in this macro */
#define reg_warn_non_literal_string(loc, m) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
- m, (int)offset, RExC_precomp, RExC_precomp + offset); \
+ m, REPORT_LOCATION_ARGS(offset)); \
} STMT_END
#define ckWARNreg(loc,m) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
- (int)offset, RExC_precomp, RExC_precomp + offset); \
+ REPORT_LOCATION_ARGS(offset)); \
} STMT_END
#define vWARN_dep(loc, m) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
- (int)offset, RExC_precomp, RExC_precomp + offset); \
+ REPORT_LOCATION_ARGS(offset)); \
} STMT_END
#define ckWARNdep(loc,m) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
m REPORT_LOCATION, \
- (int)offset, RExC_precomp, RExC_precomp + offset); \
+ REPORT_LOCATION_ARGS(offset)); \
} STMT_END
#define ckWARNregdep(loc,m) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
m REPORT_LOCATION, \
- (int)offset, RExC_precomp, RExC_precomp + offset); \
+ REPORT_LOCATION_ARGS(offset)); \
} STMT_END
-#define ckWARN2regdep(loc,m, a1) STMT_START { \
+#define ckWARN2reg_d(loc,m, a1) STMT_START { \
const IV offset = loc - RExC_precomp; \
- Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
m REPORT_LOCATION, \
- a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
+ a1, REPORT_LOCATION_ARGS(offset)); \
} STMT_END
#define ckWARN2reg(loc, m, a1) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
- a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
+ a1, REPORT_LOCATION_ARGS(offset)); \
} STMT_END
#define vWARN3(loc, m, a1, a2) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
- a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
+ a1, a2, REPORT_LOCATION_ARGS(offset)); \
} STMT_END
#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
- a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
+ a1, a2, REPORT_LOCATION_ARGS(offset)); \
} STMT_END
#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
- a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
+ a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
} STMT_END
#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
- a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
+ a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
} STMT_END
#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
- a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
+ a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
} STMT_END
#define Set_Cur_Node_Offset
#define Set_Node_Length_To_R(node,len)
#define Set_Node_Length(node,len)
-#define Set_Node_Cur_Length(node)
+#define Set_Node_Cur_Length(node,start)
#define Node_Offset(n)
#define Node_Length(n)
#define Set_Node_Offset_Length(node,offset,len)
#define Set_Node_Length(node,len) \
Set_Node_Length_To_R((node)-RExC_emit_start, len)
-#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
-#define Set_Node_Cur_Length(node) \
- Set_Node_Length(node, RExC_parse - parse_start)
+#define Set_Node_Cur_Length(node, start) \
+ Set_Node_Length(node, RExC_parse - start)
/* Get offsets and lengths */
#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
floating substrings if needed. */
STATIC void
-S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
+S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
+ SSize_t *minlenp, int is_inf)
{
const STRLEN l = CHR_SVLEN(data->last_found);
const STRLEN old_l = CHR_SVLEN(*data->longest);
data->offset_float_min = l ? data->last_start_min : data->pos_min;
data->offset_float_max = (l
? data->last_start_max
- : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta));
- if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
- data->offset_float_max = I32_MAX;
+ : (data->pos_delta == SSize_t_MAX
+ ? SSize_t_MAX
+ : data->pos_min + data->pos_delta));
+ if (is_inf
+ || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
+ data->offset_float_max = SSize_t_MAX;
if (data->flags & SF_BEFORE_EOL)
data->flags
|= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
DEBUG_STUDYDATA("commit: ",data,0);
}
-/* These macros set, clear and test whether the synthetic start class ('ssc',
- * given by the parameter) matches an empty string (EOS). This uses the
- * 'next_off' field in the node, to save a bit in the flags field. The ssc
- * stands alone, so there is never a next_off, so this field is otherwise
- * unused. The EOS information is used only for compilation, but theoretically
- * it could be passed on to the execution code. This could be used to store
- * more than one bit of information, but only this one is currently used. */
-#define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
-#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
-#define TEST_SSC_EOS(node) cBOOL((node)->next_off)
-
-/* Can match anything (initialization) */
+/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
+ * list that describes which code points it matches */
+
STATIC void
-S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
+S_ssc_anything(pTHX_ regnode_ssc *ssc)
{
- PERL_ARGS_ASSERT_CL_ANYTHING;
+ /* Set the SSC 'ssc' to match an empty string or any code point */
+
+ PERL_ARGS_ASSERT_SSC_ANYTHING;
- ANYOF_BITMAP_SETALL(cl);
- cl->flags = ANYOF_UNICODE_ALL;
- SET_SSC_EOS(cl);
+ assert(OP(ssc) == ANYOF_SYNTHETIC);
+
+ ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
+ _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
+ ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */
+}
+
+STATIC int
+S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
+{
+ /* Returns TRUE if the SSC 'ssc' can match the empty string or any code
+ * point */
+
+ UV start, end;
+ bool ret;
+
+ PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
+
+ assert(OP(ssc) == ANYOF_SYNTHETIC);
+
+ if (! ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING) {
+ return FALSE;
+ }
+
+ /* See if the list consists solely of the range 0 - Infinity */
+ invlist_iterinit(ssc->invlist);
+ ret = invlist_iternext(ssc->invlist, &start, &end)
+ && start == 0
+ && end == UV_MAX;
+
+ invlist_iterfinish(ssc->invlist);
+
+ if (ret) {
+ return TRUE;
+ }
+
+ /* If e.g., both \w and \W are set, matches everything */
+ if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) {
+ int i;
+ for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
+ if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
+ return TRUE;
+ }
+ }
+ }
+
+ return FALSE;
+}
+
+STATIC void
+S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
+{
+ /* Initializes the SSC 'ssc'. This includes setting it to match an empty
+ * string, any code point, or any posix class under locale */
+
+ PERL_ARGS_ASSERT_SSC_INIT;
+
+ Zero(ssc, 1, regnode_ssc);
+ OP(ssc) = ANYOF_SYNTHETIC;
+ ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
+ ssc_anything(ssc);
/* If any portion of the regex is to operate under locale rules,
* initialization includes it. The reason this isn't done for all regexes
* is that the optimizer was written under the assumption that locale was
* all-or-nothing. Given the complexity and lack of documentation in the
- * optimizer, and that there are inadequate test cases for locale, so many
+ * optimizer, and that there are inadequate test cases for locale, many
* parts of it may not work properly, it is safest to avoid locale unless
* necessary. */
if (RExC_contains_locale) {
- ANYOF_CLASS_SETALL(cl); /* /l uses class */
- cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
+ ANYOF_POSIXL_SETALL(ssc);
+ ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL;
+ if (RExC_contains_i) {
+ ANYOF_FLAGS(ssc) |= ANYOF_LOC_FOLD;
+ }
}
else {
- ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
+ ANYOF_POSIXL_ZERO(ssc);
}
}
-/* Can match anything (initialization) */
STATIC int
-S_cl_is_anything(const struct regnode_charclass_class *cl)
+S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
+ const regnode_ssc *ssc)
{
- int value;
+ /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
+ * to the list of code points matched, and locale posix classes; hence does
+ * not check its flags) */
- PERL_ARGS_ASSERT_CL_IS_ANYTHING;
+ UV start, end;
+ bool ret;
- for (value = 0; value < ANYOF_MAX; value += 2)
- if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
- return 1;
- if (!(cl->flags & ANYOF_UNICODE_ALL))
- return 0;
- if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
- return 0;
- return 1;
+ PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
+
+ assert(OP(ssc) == ANYOF_SYNTHETIC);
+
+ invlist_iterinit(ssc->invlist);
+ ret = invlist_iternext(ssc->invlist, &start, &end)
+ && start == 0
+ && end == UV_MAX;
+
+ invlist_iterfinish(ssc->invlist);
+
+ if (! ret) {
+ return FALSE;
+ }
+
+ if (RExC_contains_locale) {
+ if (! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
+ || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
+ || ! ANYOF_POSIXL_TEST_ALL_SET(ssc))
+ {
+ return FALSE;
+ }
+ if (RExC_contains_i && ! (ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD)) {
+ return FALSE;
+ }
+ }
+
+ return TRUE;
}
-/* Can match anything (initialization) */
-STATIC void
-S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
+STATIC SV*
+S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
+ const regnode_charclass_posixl* const node)
{
- PERL_ARGS_ASSERT_CL_INIT;
+ /* Returns a mortal inversion list defining which code points are matched
+ * by 'node', which is of type ANYOF. Handles complementing the result if
+ * appropriate. If some code points aren't knowable at this time, the
+ * returned list must, and will, contain every possible code point. */
+
+ SV* invlist = sv_2mortal(_new_invlist(0));
+ unsigned int i;
+ const U32 n = ARG(node);
+
+ PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
+
+ /* Look at the data structure created by S_set_ANYOF_arg() */
+ if (n != ANYOF_NONBITMAP_EMPTY) {
+ SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
+ AV * const av = MUTABLE_AV(SvRV(rv));
+ SV **const ary = AvARRAY(av);
+ assert(RExC_rxi->data->what[n] == 's');
+
+ if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
+ invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
+ }
+ else if (ary[0] && ary[0] != &PL_sv_undef) {
+
+ /* Here, no compile-time swash, and there are things that won't be
+ * known until runtime -- we have to assume it could be anything */
+ return _add_range_to_invlist(invlist, 0, UV_MAX);
+ }
+ else {
- Zero(cl, 1, struct regnode_charclass_class);
- cl->type = ANYOF;
- cl_anything(pRExC_state, cl);
- ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
+ /* Here no compile-time swash, and no run-time only data. Use the
+ * node's inversion list */
+ invlist = sv_2mortal(invlist_clone(ary[2]));
+ }
+ }
+
+ /* An ANYOF node contains a bitmap for the first 256 code points, and an
+ * inversion list for the others, but if there are code points that should
+ * match only conditionally on the target string being UTF-8, those are
+ * placed in the inversion list, and not the bitmap. Since there are
+ * circumstances under which they could match, they are included in the
+ * SSC. But if the ANYOF node is to be inverted, we have to exclude them
+ * here, so that when we invert below, the end result actually does include
+ * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
+ * before we add the unconditionally matched code points */
+ if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
+ _invlist_intersection_complement_2nd(invlist,
+ PL_UpperLatin1,
+ &invlist);
+ }
+
+ /* Add in the points from the bit map */
+ for (i = 0; i < 256; i++) {
+ if (ANYOF_BITMAP_TEST(node, i)) {
+ invlist = add_cp_to_invlist(invlist, i);
+ }
+ }
+
+ /* If this can match all upper Latin1 code points, have to add them
+ * as well */
+ if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_LATIN1_ALL) {
+ _invlist_union(invlist, PL_UpperLatin1, &invlist);
+ }
+
+ /* Similarly for these */
+ if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
+ invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
+ }
+
+ if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
+ _invlist_invert(invlist);
+ }
+
+ return invlist;
}
/* These two functions currently do the exact same thing */
-#define cl_init_zero S_cl_init
+#define ssc_init_zero ssc_init
+
+#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
+#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
-/* 'AND' a given class with another one. Can create false positives. 'cl'
- * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
- * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
STATIC void
-S_cl_and(struct regnode_charclass_class *cl,
- const struct regnode_charclass_class *and_with)
+S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
{
- PERL_ARGS_ASSERT_CL_AND;
+ /* Take the flags 'and_with' and accumulate them anded into the flags for
+ * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored.
+ * The flags 'and_with' should not come from another SSC (otherwise the
+ * EMPTY_STRING flag won't work) */
- assert(PL_regkind[and_with->type] == ANYOF);
+ const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS;
- /* I (khw) am not sure all these restrictions are necessary XXX */
- if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
- && !(ANYOF_CLASS_TEST_ANY_SET(cl))
- && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && !(and_with->flags & ANYOF_LOC_FOLD)
- && !(cl->flags & ANYOF_LOC_FOLD)) {
- int i;
+ PERL_ARGS_ASSERT_SSC_FLAGS_AND;
- if (and_with->flags & ANYOF_INVERT)
- for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
- cl->bitmap[i] &= ~and_with->bitmap[i];
- else
- for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
- cl->bitmap[i] &= and_with->bitmap[i];
- } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
-
- if (and_with->flags & ANYOF_INVERT) {
-
- /* Here, the and'ed node is inverted. Get the AND of the flags that
- * aren't affected by the inversion. Those that are affected are
- * handled individually below */
- U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
- cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
- cl->flags |= affected_flags;
-
- /* We currently don't know how to deal with things that aren't in the
- * bitmap, but we know that the intersection is no greater than what
- * is already in cl, so let there be false positives that get sorted
- * out after the synthetic start class succeeds, and the node is
- * matched for real. */
-
- /* The inversion of these two flags indicate that the resulting
- * intersection doesn't have them */
- if (and_with->flags & ANYOF_UNICODE_ALL) {
- cl->flags &= ~ANYOF_UNICODE_ALL;
- }
- if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
- cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
- }
- }
- else { /* and'd node is not inverted */
- U8 outside_bitmap_but_not_utf8; /* Temp variable */
-
- if (! ANYOF_NONBITMAP(and_with)) {
-
- /* Here 'and_with' doesn't match anything outside the bitmap
- * (except possibly ANYOF_UNICODE_ALL), which means the
- * intersection can't either, except for ANYOF_UNICODE_ALL, in
- * which case we don't know what the intersection is, but it's no
- * greater than what cl already has, so can just leave it alone,
- * with possible false positives */
- if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
- ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
- cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
- }
- }
- else if (! ANYOF_NONBITMAP(cl)) {
-
- /* Here, 'and_with' does match something outside the bitmap, and cl
- * doesn't have a list of things to match outside the bitmap. If
- * cl can match all code points above 255, the intersection will
- * be those above-255 code points that 'and_with' matches. If cl
- * can't match all Unicode code points, it means that it can't
- * match anything outside the bitmap (since the 'if' that got us
- * into this block tested for that), so we leave the bitmap empty.
- */
- if (cl->flags & ANYOF_UNICODE_ALL) {
- ARG_SET(cl, ARG(and_with));
+ /* Use just the SSC-related flags from 'and_with' */
+ ANYOF_FLAGS(ssc) &= (and_with & ANYOF_LOCALE_FLAGS);
+ ANYOF_FLAGS(ssc) |= ssc_only_flags;
+}
- /* and_with's ARG may match things that don't require UTF8.
- * And now cl's will too, in spite of this being an 'and'. See
- * the comments below about the kludge */
- cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
- }
- }
- else {
- /* Here, both 'and_with' and cl match something outside the
- * bitmap. Currently we do not do the intersection, so just match
- * whatever cl had at the beginning. */
- }
-
-
- /* Take the intersection of the two sets of flags. However, the
- * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
- * kludge around the fact that this flag is not treated like the others
- * which are initialized in cl_anything(). The way the optimizer works
- * is that the synthetic start class (SSC) is initialized to match
- * anything, and then the first time a real node is encountered, its
- * values are AND'd with the SSC's with the result being the values of
- * the real node. However, there are paths through the optimizer where
- * the AND never gets called, so those initialized bits are set
- * inappropriately, which is not usually a big deal, as they just cause
- * false positives in the SSC, which will just mean a probably
- * imperceptible slow down in execution. However this bit has a
- * higher false positive consequence in that it can cause utf8.pm,
- * utf8_heavy.pl ... to be loaded when not necessary, which is a much
- * bigger slowdown and also causes significant extra memory to be used.
- * In order to prevent this, the code now takes a different tack. The
- * bit isn't set unless some part of the regular expression needs it,
- * but once set it won't get cleared. This means that these extra
- * modules won't get loaded unless there was some path through the
- * pattern that would have required them anyway, and so any false
- * positives that occur by not ANDing them out when they could be
- * aren't as severe as they would be if we treated this bit like all
- * the others */
- outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
- & ANYOF_NONBITMAP_NON_UTF8;
- cl->flags &= and_with->flags;
- cl->flags |= outside_bitmap_but_not_utf8;
+/* 'AND' a given class with another one. Can create false positives. 'ssc'
+ * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
+ * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
+
+STATIC void
+S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
+ const regnode_ssc *and_with)
+{
+ /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
+ * another SSC or a regular ANYOF class. Can create false positives. */
+
+ SV* anded_cp_list;
+ U8 anded_flags;
+
+ PERL_ARGS_ASSERT_SSC_AND;
+
+ assert(OP(ssc) == ANYOF_SYNTHETIC);
+
+ /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
+ * the code point inversion list and just the relevant flags */
+ if (OP(and_with) == ANYOF_SYNTHETIC) {
+ anded_cp_list = and_with->invlist;
+ anded_flags = ANYOF_FLAGS(and_with);
+ }
+ else {
+ anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
+ (regnode_charclass_posixl*) and_with);
+ anded_flags = ANYOF_FLAGS(and_with) & ANYOF_LOCALE_FLAGS;
+ }
+
+ ANYOF_FLAGS(ssc) &= anded_flags;
+
+ /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
+ * C2 is the list of code points in 'and-with'; P2, its posix classes.
+ * 'and_with' may be inverted. When not inverted, we have the situation of
+ * computing:
+ * (C1 | P1) & (C2 | P2)
+ * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
+ * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
+ * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
+ * <= ((C1 & C2) | P1 | P2)
+ * Alternatively, the last few steps could be:
+ * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
+ * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
+ * <= (C1 | C2 | (P1 & P2))
+ * We favor the second approach if either P1 or P2 is non-empty. This is
+ * because these components are a barrier to doing optimizations, as what
+ * they match cannot be known until the moment of matching as they are
+ * dependent on the current locale, 'AND"ing them likely will reduce or
+ * eliminate them.
+ * But we can do better if we know that C1,P1 are in their initial state (a
+ * frequent occurrence), each matching everything:
+ * (<everything>) & (C2 | P2) = C2 | P2
+ * Similarly, if C2,P2 are in their initial state (again a frequent
+ * occurrence), the result is a no-op
+ * (C1 | P1) & (<everything>) = C1 | P1
+ *
+ * Inverted, we have
+ * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
+ * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
+ * <= (C1 & ~C2) | (P1 & ~P2)
+ * */
+
+ if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
+ && OP(and_with) != ANYOF_SYNTHETIC)
+ {
+ unsigned int i;
+
+ ssc_intersection(ssc,
+ anded_cp_list,
+ FALSE /* Has already been inverted */
+ );
+
+ /* If either P1 or P2 is empty, the intersection will be also; can skip
+ * the loop */
+ if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
+ ANYOF_POSIXL_ZERO(ssc);
+ }
+ else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
+
+ /* Note that the Posix class component P from 'and_with' actually
+ * looks like:
+ * P = Pa | Pb | ... | Pn
+ * where each component is one posix class, such as in [\w\s].
+ * Thus
+ * ~P = ~(Pa | Pb | ... | Pn)
+ * = ~Pa & ~Pb & ... & ~Pn
+ * <= ~Pa | ~Pb | ... | ~Pn
+ * The last is something we can easily calculate, but unfortunately
+ * is likely to have many false positives. We could do better
+ * in some (but certainly not all) instances if two classes in
+ * P have known relationships. For example
+ * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
+ * So
+ * :lower: & :print: = :lower:
+ * And similarly for classes that must be disjoint. For example,
+ * since \s and \w can have no elements in common based on rules in
+ * the POSIX standard,
+ * \w & ^\S = nothing
+ * Unfortunately, some vendor locales do not meet the Posix
+ * standard, in particular almost everything by Microsoft.
+ * The loop below just changes e.g., \w into \W and vice versa */
+
+ regnode_charclass_posixl temp;
+ int add = 1; /* To calculate the index of the complement */
+
+ ANYOF_POSIXL_ZERO(&temp);
+ for (i = 0; i < ANYOF_MAX; i++) {
+ assert(i % 2 != 0
+ || ! ANYOF_POSIXL_TEST(and_with, i)
+ || ! ANYOF_POSIXL_TEST(and_with, i + 1));
+
+ if (ANYOF_POSIXL_TEST(and_with, i)) {
+ ANYOF_POSIXL_SET(&temp, i + add);
+ }
+ add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
+ }
+ ANYOF_POSIXL_AND(&temp, ssc);
+
+ } /* else ssc already has no posixes */
+ } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
+ in its initial state */
+ else if (OP(and_with) != ANYOF_SYNTHETIC
+ || ! ssc_is_cp_posixl_init(pRExC_state, and_with))
+ {
+ /* But if 'ssc' is in its initial state, the result is just 'and_with';
+ * copy it over 'ssc' */
+ if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
+ if (OP(and_with) == ANYOF_SYNTHETIC) {
+ StructCopy(and_with, ssc, regnode_ssc);
+ }
+ else {
+ ssc->invlist = anded_cp_list;
+ ANYOF_POSIXL_ZERO(ssc);
+ if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
+ ANYOF_POSIXL_OR(and_with, ssc);
+ }
+ }
+ }
+ else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
+ || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
+ {
+ /* One or the other of P1, P2 is non-empty. */
+ ANYOF_POSIXL_AND(and_with, ssc);
+ ssc_union(ssc, anded_cp_list, FALSE);
+ }
+ else { /* P1 = P2 = empty */
+ ssc_intersection(ssc, anded_cp_list, FALSE);
+ }
}
}
-/* 'OR' a given class with another one. Can create false positives. 'cl'
- * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
- * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
STATIC void
-S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
+S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
+ const regnode_ssc *or_with)
{
- PERL_ARGS_ASSERT_CL_OR;
-
- if (or_with->flags & ANYOF_INVERT) {
-
- /* Here, the or'd node is to be inverted. This means we take the
- * complement of everything not in the bitmap, but currently we don't
- * know what that is, so give up and match anything */
- if (ANYOF_NONBITMAP(or_with)) {
- cl_anything(pRExC_state, cl);
- }
- /* We do not use
- * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
- * <= (B1 | !B2) | (CL1 | !CL2)
- * which is wasteful if CL2 is small, but we ignore CL2:
- * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
- * XXXX Can we handle case-fold? Unclear:
- * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
- * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
- */
- else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && !(or_with->flags & ANYOF_LOC_FOLD)
- && !(cl->flags & ANYOF_LOC_FOLD) ) {
- int i;
+ /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
+ * another SSC or a regular ANYOF class. Can create false positives if
+ * 'or_with' is to be inverted. */
- for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
- cl->bitmap[i] |= ~or_with->bitmap[i];
- } /* XXXX: logic is complicated otherwise */
- else {
- cl_anything(pRExC_state, cl);
- }
+ SV* ored_cp_list;
+ U8 ored_flags;
- /* And, we can just take the union of the flags that aren't affected
- * by the inversion */
- cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
+ PERL_ARGS_ASSERT_SSC_OR;
- /* For the remaining flags:
- ANYOF_UNICODE_ALL and inverted means to not match anything above
- 255, which means that the union with cl should just be
- what cl has in it, so can ignore this flag
- ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
- is 127-255 to match them, but then invert that, so the
- union with cl should just be what cl has in it, so can
- ignore this flag
- */
- } else { /* 'or_with' is not inverted */
- /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
- if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && (!(or_with->flags & ANYOF_LOC_FOLD)
- || (cl->flags & ANYOF_LOC_FOLD)) ) {
- int i;
+ assert(OP(ssc) == ANYOF_SYNTHETIC);
+
+ /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
+ * the code point inversion list and just the relevant flags */
+ if (OP(or_with) == ANYOF_SYNTHETIC) {
+ ored_cp_list = or_with->invlist;
+ ored_flags = ANYOF_FLAGS(or_with);
+ }
+ else {
+ ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
+ (regnode_charclass_posixl*) or_with);
+ ored_flags = ANYOF_FLAGS(or_with) & ANYOF_LOCALE_FLAGS;
+ }
+
+ ANYOF_FLAGS(ssc) |= ored_flags;
+
+ /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
+ * C2 is the list of code points in 'or-with'; P2, its posix classes.
+ * 'or_with' may be inverted. When not inverted, we have the simple
+ * situation of computing:
+ * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
+ * If P1|P2 yields a situation with both a class and its complement are
+ * set, like having both \w and \W, this matches all code points, and we
+ * can delete these from the P component of the ssc going forward. XXX We
+ * might be able to delete all the P components, but I (khw) am not certain
+ * about this, and it is better to be safe.
+ *
+ * Inverted, we have
+ * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
+ * <= (C1 | P1) | ~C2
+ * <= (C1 | ~C2) | P1
+ * (which results in actually simpler code than the non-inverted case)
+ * */
- /* OR char bitmap and class bitmap separately */
- for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
- cl->bitmap[i] |= or_with->bitmap[i];
- if (or_with->flags & ANYOF_CLASS) {
- ANYOF_CLASS_OR(or_with, cl);
+ if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
+ && OP(or_with) != ANYOF_SYNTHETIC)
+ {
+ /* We ignore P2, leaving P1 going forward */
+ }
+ else { /* Not inverted */
+ ANYOF_POSIXL_OR(or_with, ssc);
+ if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
+ unsigned int i;
+ for (i = 0; i < ANYOF_MAX; i += 2) {
+ if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
+ {
+ ssc_match_all_cp(ssc);
+ ANYOF_POSIXL_CLEAR(ssc, i);
+ ANYOF_POSIXL_CLEAR(ssc, i+1);
+ if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
+ ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL;
+ }
+ }
}
- }
- else { /* XXXX: logic is complicated, leave it along for a moment. */
- cl_anything(pRExC_state, cl);
- }
+ }
+ }
- if (ANYOF_NONBITMAP(or_with)) {
+ ssc_union(ssc,
+ ored_cp_list,
+ FALSE /* Already has been inverted */
+ );
+}
- /* Use the added node's outside-the-bit-map match if there isn't a
- * conflict. If there is a conflict (both nodes match something
- * outside the bitmap, but what they match outside is not the same
- * pointer, and hence not easily compared until XXX we extend
- * inversion lists this far), give up and allow the start class to
- * match everything outside the bitmap. If that stuff is all above
- * 255, can just set UNICODE_ALL, otherwise caould be anything. */
- if (! ANYOF_NONBITMAP(cl)) {
- ARG_SET(cl, ARG(or_with));
- }
- else if (ARG(cl) != ARG(or_with)) {
+PERL_STATIC_INLINE void
+S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
+{
+ PERL_ARGS_ASSERT_SSC_UNION;
- if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
- cl_anything(pRExC_state, cl);
- }
- else {
- cl->flags |= ANYOF_UNICODE_ALL;
- }
- }
- }
+ assert(OP(ssc) == ANYOF_SYNTHETIC);
- /* Take the union */
- cl->flags |= or_with->flags;
- }
+ _invlist_union_maybe_complement_2nd(ssc->invlist,
+ invlist,
+ invert2nd,
+ &ssc->invlist);
+}
+
+PERL_STATIC_INLINE void
+S_ssc_intersection(pTHX_ regnode_ssc *ssc,
+ SV* const invlist,
+ const bool invert2nd)
+{
+ PERL_ARGS_ASSERT_SSC_INTERSECTION;
+
+ assert(OP(ssc) == ANYOF_SYNTHETIC);
+
+ _invlist_intersection_maybe_complement_2nd(ssc->invlist,
+ invlist,
+ invert2nd,
+ &ssc->invlist);
+}
+
+PERL_STATIC_INLINE void
+S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
+{
+ PERL_ARGS_ASSERT_SSC_ADD_RANGE;
+
+ assert(OP(ssc) == ANYOF_SYNTHETIC);
+
+ ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
+}
+
+PERL_STATIC_INLINE void
+S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
+{
+ /* AND just the single code point 'cp' into the SSC 'ssc' */
+
+ SV* cp_list = _new_invlist(2);
+
+ PERL_ARGS_ASSERT_SSC_CP_AND;
+
+ assert(OP(ssc) == ANYOF_SYNTHETIC);
+
+ cp_list = add_cp_to_invlist(cp_list, cp);
+ ssc_intersection(ssc, cp_list,
+ FALSE /* Not inverted */
+ );
+ SvREFCNT_dec_NN(cp_list);
+}
+
+PERL_STATIC_INLINE void
+S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
+{
+ /* Set the SSC 'ssc' to not match any locale things */
+
+ PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
+
+ assert(OP(ssc) == ANYOF_SYNTHETIC);
+
+ ANYOF_POSIXL_ZERO(ssc);
+ ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
+}
+
+STATIC void
+S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
+{
+ /* The inversion list in the SSC is marked mortal; now we need a more
+ * permanent copy, which is stored the same way that is done in a regular
+ * ANYOF node, with the first 256 code points in a bit map */
+
+ SV* invlist = invlist_clone(ssc->invlist);
+
+ PERL_ARGS_ASSERT_SSC_FINALIZE;
+
+ assert(OP(ssc) == ANYOF_SYNTHETIC);
+
+ /* The code in this file assumes that all but these flags aren't relevant
+ * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
+ * time we reach here */
+ assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS));
+
+ populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
+
+ set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
+
+ assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale);
}
#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
8: EXACT <baz>(10)
10: END(0)
- d = uvuni_to_utf8_flags(d, uv, 0);
+ d = uvchr_to_utf8_flags(d, uv, 0);
is the recommended Unicode-aware way of saying
if (UTF) { \
SV *zlopp = newSV(7); /* XXX: optimize me */ \
unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
- unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
+ unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
SvCUR_set(zlopp, kapow - flrbbbbb); \
SvPOK_on(zlopp); \
SvUTF8_on(zlopp); \
} \
} STMT_END
-#define TRIE_READ_CHAR STMT_START { \
- wordlen++; \
- if ( UTF ) { \
- /* if it is UTF then it is either already folded, or does not need folding */ \
- uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
- } \
- else if (folder == PL_fold_latin1) { \
- /* if we use this folder we have to obey unicode rules on latin-1 data */ \
- if ( foldlen > 0 ) { \
- uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
- foldlen -= len; \
- scan += len; \
- len = 0; \
- } else { \
- len = 1; \
- uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
- skiplen = UNISKIP(uvc); \
- foldlen -= skiplen; \
- scan = foldbuf + skiplen; \
- } \
- } else { \
- /* raw data, will be folded later if needed */ \
- uvc = (U32)*uc; \
- len = 1; \
- } \
+/* This gets the next character from the input, folding it if not already
+ * folded. */
+#define TRIE_READ_CHAR STMT_START { \
+ wordlen++; \
+ if ( UTF ) { \
+ /* if it is UTF then it is either already folded, or does not need \
+ * folding */ \
+ uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
+ } \
+ else if (folder == PL_fold_latin1) { \
+ /* This folder implies Unicode rules, which in the range expressible \
+ * by not UTF is the lower case, with the two exceptions, one of \
+ * which should have been taken care of before calling this */ \
+ assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
+ uvc = toLOWER_L1(*uc); \
+ if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
+ len = 1; \
+ } else { \
+ /* raw data, will be folded later if needed */ \
+ uvc = (U32)*uc; \
+ len = 1; \
+ } \
} STMT_END
HV *widecharmap = NULL;
AV *revcharmap = newAV();
regnode *cur;
- const U32 uniflags = UTF8_ALLOW_DEFAULT;
STRLEN len = 0;
UV uvc = 0;
U16 curword = 0;
const U8 * folder = NULL;
#ifdef DEBUGGING
- const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
+ const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
AV *trie_words = NULL;
/* along with revcharmap, this only used during construction but both are
* useful during debugging so we store them in the struct when debugging.
*/
#else
- const U32 data_slot = add_data( pRExC_state, 2, "tu" );
+ const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
STRLEN trie_charcount=0;
#endif
SV *re_trie_maxbuff;
case EXACT: break;
case EXACTFA:
case EXACTFU_SS:
- case EXACTFU_TRICKYFOLD:
case EXACTFU: folder = PL_fold_latin1; break;
case EXACTF: folder = PL_fold; break;
case EXACTFL: folder = PL_fold_locale; break;
const U8 *uc = (U8*)STRING( noper );
const U8 *e = uc + STR_LEN( noper );
STRLEN foldlen = 0;
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
- STRLEN skiplen = 0;
- const U8 *scan = (U8*)NULL;
U32 wordlen = 0; /* required init */
- STRLEN chars = 0;
+ STRLEN minbytes = 0;
+ STRLEN maxbytes = 0;
bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
if (OP(noper) == NOTHING) {
regardless of encoding */
if (OP( noper ) == EXACTFU_SS) {
/* false positives are ok, so just set this */
- TRIE_BITMAP_SET(trie,0xDF);
+ TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
}
}
for ( ; uc < e ; uc += len ) {
TRIE_CHARCOUNT(trie)++;
TRIE_READ_CHAR;
- chars++;
+
+ /* Acummulate to the current values, the range in the number of
+ * bytes that this character could match. The max is presumed to
+ * be the same as the folded input (which TRIE_READ_CHAR returns),
+ * except that when this is not in UTF-8, it could be matched
+ * against a string which is UTF-8, and the variant characters
+ * could be 2 bytes instead of the 1 here. Likewise, for the
+ * minimum number of bytes when not folded. When folding, the min
+ * is assumed to be 1 byte could fold to match the single character
+ * here, or in the case of a multi-char fold, 1 byte can fold to
+ * the whole sequence. 'foldlen' is used to denote whether we are
+ * in such a sequence, skipping the min setting if so. XXX TODO
+ * Use the exact list of what folds to each character, from
+ * PL_utf8_foldclosures */
+ if (UTF) {
+ maxbytes += UTF8SKIP(uc);
+ if (! folder) {
+ /* A non-UTF-8 string could be 1 byte to match our 2 */
+ minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
+ ? 1
+ : UTF8SKIP(uc);
+ }
+ else {
+ if (foldlen) {
+ foldlen -= UTF8SKIP(uc);
+ }
+ else {
+ foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
+ minbytes++;
+ }
+ }
+ }
+ else {
+ maxbytes += (UNI_IS_INVARIANT(*uc))
+ ? 1
+ : 2;
+ if (! folder) {
+ minbytes++;
+ }
+ else {
+ if (foldlen) {
+ foldlen--;
+ }
+ else {
+ foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
+ minbytes++;
+ }
+ }
+ }
if ( uvc < 256 ) {
if ( folder ) {
U8 folded= folder[ (U8) uvc ];
if ( !UTF ) {
/* store first byte of utf8 representation of
variant codepoints */
- if (! UNI_IS_INVARIANT(uvc)) {
+ if (! UVCHR_IS_INVARIANT(uvc)) {
TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
}
}
}
}
if( cur == first ) {
- trie->minlen = chars;
- trie->maxlen = chars;
- } else if (chars < trie->minlen) {
- trie->minlen = chars;
- } else if (chars > trie->maxlen) {
- trie->maxlen = chars;
- }
- if (OP( noper ) == EXACTFU_SS) {
- /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
- if (trie->minlen > 1)
- trie->minlen= 1;
+ trie->minlen = minbytes;
+ trie->maxlen = maxbytes;
+ } else if (minbytes < trie->minlen) {
+ trie->minlen = minbytes;
+ } else if (maxbytes > trie->maxlen) {
+ trie->maxlen = maxbytes;
}
- if (OP( noper ) == EXACTFU_TRICKYFOLD) {
- /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
- * - We assume that any such sequence might match a 2 byte string */
- if (trie->minlen > 2 )
- trie->minlen= 2;
- }
-
} /* end first pass */
DEBUG_TRIE_COMPILE_r(
PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
const U8 *e = uc + STR_LEN( noper );
U32 state = 1; /* required init */
U16 charid = 0; /* sanity init */
- U8 *scan = (U8*)NULL; /* sanity init */
- STRLEN foldlen = 0; /* required init */
U32 wordlen = 0; /* required init */
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
- STRLEN skiplen = 0;
if (OP(noper) == NOTHING) {
regnode *noper_next= regnext(noper);
/*
Second Pass -- Flat Table Representation.
- we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
- We know that we will need Charcount+1 trans at most to store the data
- (one row per char at worst case) So we preallocate both structures
- assuming worst case.
+ we dont use the 0 slot of either trans[] or states[] so we add 1 to
+ each. We know that we will need Charcount+1 trans at most to store
+ the data (one row per char at worst case) So we preallocate both
+ structures assuming worst case.
We then construct the trie using only the .next slots of the entry
structs.
- We use the .check field of the first entry of the node temporarily to
- make compression both faster and easier by keeping track of how many non
- zero fields are in the node.
+ We use the .check field of the first entry of the node temporarily
+ to make compression both faster and easier by keeping track of how
+ many non zero fields are in the node.
Since trans are numbered from 1 any 0 pointer in the table is a FAIL
transition.
- There are two terms at use here: state as a TRIE_NODEIDX() which is a
- number representing the first entry of the node, and state as a
- TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
- TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
- are 2 entrys per node. eg:
+ There are two terms at use here: state as a TRIE_NODEIDX() which is
+ a number representing the first entry of the node, and state as a
+ TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
+ and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
+ if there are 2 entrys per node. eg:
A B A B
1. 2 4 1. 3 7
3. 0 0 5. 0 0
4. 0 0 7. 0 0
- The table is internally in the right hand, idx form. However as we also
- have to deal with the states array which is indexed by nodenum we have to
- use TRIE_NODENUM() to convert.
+ The table is internally in the right hand, idx form. However as we
+ also have to deal with the states array which is indexed by nodenum
+ we have to use TRIE_NODENUM() to convert.
*/
DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
U16 charid = 0; /* sanity init */
U32 accept_state = 0; /* sanity init */
- U8 *scan = (U8*)NULL; /* sanity init */
- STRLEN foldlen = 0; /* required init */
U32 wordlen = 0; /* required init */
- STRLEN skiplen = 0;
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
if (OP(noper) == NOTHING) {
regnode *noper_next= regnext(noper);
STATIC void
S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
{
-/* The Trie is constructed and compressed now so we can build a fail array if it's needed
+/* The Trie is constructed and compressed now so we can build a fail array if
+ * it's needed
- This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
- "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
+ This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
+ 3.32 in the
+ "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
+ Ullman 1985/88
ISBN 0-201-10088-6
- We find the fail state for each state in the trie, this state is the longest proper
- suffix of the current state's 'word' that is also a proper prefix of another word in our
- trie. State 1 represents the word '' and is thus the default fail state. This allows
- the DFA not to have to restart after its tried and failed a word at a given point, it
- simply continues as though it had been matching the other word in the first place.
+ We find the fail state for each state in the trie, this state is the longest
+ proper suffix of the current state's 'word' that is also a proper prefix of
+ another word in our trie. State 1 represents the word '' and is thus the
+ default fail state. This allows the DFA not to have to restart after its
+ tried and failed a word at a given point, it simply continues as though it
+ had been matching the other word in the first place.
Consider
'abcdgu'=~/abcdefg|cdgu/
- When we get to 'd' we are still matching the first word, we would encounter 'g' which would
- fail, which would bring us to the state representing 'd' in the second word where we would
- try 'g' and succeed, proceeding to match 'cdgu'.
+ When we get to 'd' we are still matching the first word, we would encounter
+ 'g' which would fail, which would bring us to the state representing 'd' in
+ the second word where we would try 'g' and succeed, proceeding to match
+ 'cdgu'.
*/
/* add a fail transition */
const U32 trie_offset = ARG(source);
U32 base = trie->states[ 1 ].trans.base;
U32 *fail;
reg_ac_data *aho;
- const U32 data_slot = add_data( pRExC_state, 1, "T" );
+ const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
}
-/*
- * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
- * These need to be revisited when a newer toolchain becomes available.
- */
-#if defined(__sparc64__) && defined(__GNUC__)
-# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
-# undef SPARC64_GCC_WORKAROUND
-# define SPARC64_GCC_WORKAROUND 1
-# endif
-#endif
-
#define DEBUG_PEEP(str,scan,depth) \
DEBUG_OPTIMISE_r({if (scan){ \
SV * const mysv=sv_newmortal(); \
* that is "sss".
*
* It turns out that there are problems with all multi-character folds, and not
- * just these three. Now the code is general, for all such cases, but the
- * three still have some special handling. The approach taken is:
+ * just these three. Now the code is general, for all such cases. The
+ * approach taken is:
* 1) This routine examines each EXACTFish node that could contain multi-
* character fold sequences. It returns in *min_subtract how much to
* subtract from the the actual length of the string to get a real minimum
* used by the caller to adjust the min length of the match, and the delta
* between min and max, so that the optimizer doesn't reject these
* possibilities based on size constraints.
- * 2) Certain of these sequences require special handling by the trie code,
- * so, if found, this code changes the joined node type to special ops:
- * EXACTFU_TRICKYFOLD and EXACTFU_SS.
- * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
+ * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
* is used for an EXACTFU node that contains at least one "ss" sequence in
* it. For non-UTF-8 patterns and strings, this is the only case where
* there is a possible fold length change. That means that a regular
* this file makes sure that in EXACTFU nodes, the sharp s gets folded to
* 'ss', even if the pattern isn't UTF-8. This avoids the issues
* described in the next item.
- * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
- * 'ss' or not is not knowable at compile time. It will match iff the
- * target string is in UTF-8, unlike the EXACTFU nodes, where it always
- * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
- * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
- * described in item 3). An assumption that the optimizer part of
- * regexec.c (probably unwittingly) makes is that a character in the
- * pattern corresponds to at most a single character in the target string.
- * (And I do mean character, and not byte here, unlike other parts of the
- * documentation that have never been updated to account for multibyte
- * Unicode.) This assumption is wrong only in this case, as all other
- * cases are either 1-1 folds when no UTF-8 is involved; or is true by
- * virtue of having this file pre-fold UTF-8 patterns. I'm
- * reluctant to try to change this assumption, so instead the code punts.
- * This routine examines EXACTF nodes for the sharp s, and returns a
- * boolean indicating whether or not the node is an EXACTF node that
- * contains a sharp s. When it is true, the caller sets a flag that later
- * causes the optimizer in this file to not set values for the floating
- * and fixed string lengths, and thus avoids the optimizer code in
- * regexec.c that makes the invalid assumption. Thus, there is no
- * optimization based on string lengths for EXACTF nodes that contain the
- * sharp s. This only happens for /id rules (which means the pattern
- * isn't in UTF-8).
- */
+ * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
+ * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
+ * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
+ * (probably unwittingly, in Perl_regexec_flags()) makes is that a
+ * character in the pattern corresponds to at most a single character in
+ * the target string. (And I do mean character, and not byte here, unlike
+ * other parts of the documentation that have never been updated to
+ * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
+ * two character string 'ss'; in EXACTFA nodes it can match
+ * "\x{17F}\x{17F}". These violate the assumption, and they are the only
+ * instances where it is violated. I'm reluctant to try to change the
+ * assumption, as the code involved is impenetrable to me (khw), so
+ * instead the code here punts. This routine examines (when the pattern
+ * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
+ * boolean indicating whether or not the node contains a sharp s. When it
+ * is true, the caller sets a flag that later causes the optimizer in this
+ * file to not set values for the floating and fixed string lengths, and
+ * thus avoids the optimizer code in regexec.c that makes the invalid
+ * assumption. Thus, there is no optimization based on string lengths for
+ * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
+ * (The reason the assumption is wrong only in these two cases is that all
+ * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
+ * other folds to their expanded versions. We can't prefold sharp s to
+ * 'ss' in EXACTF nodes because we don't know at compile time if it
+ * actually matches 'ss' or not. It will match iff the target string is
+ * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
+ * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
+ * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
+ * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
+ * require the pattern to be forced into UTF-8, the overhead of which we
+ * want to avoid.)
+ *
+ * Similarly, the code that generates tries doesn't currently handle
+ * not-already-folded multi-char folds, and it looks like a pain to change
+ * that. Therefore, trie generation of EXACTFA nodes with the sharp s
+ * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
+ * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
+ * using /iaa matching will be doing so almost entirely with ASCII
+ * strings, so this should rarely be encountered in practice */
#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
if (PL_regkind[OP(scan)] == EXACT) \
}
/* Nodes with 'ss' require special handling, except for EXACTFL
- * and EXACTFA for which there is no multi-char fold to this */
+ * and EXACTFA-ish for which there is no multi-char fold to
+ * this */
if (len == 2 && *s == 's' && *(s+1) == 's'
- && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
+ && OP(scan) != EXACTFL
+ && OP(scan) != EXACTFA
+ && OP(scan) != EXACTFA_NO_TRIE)
{
count = 2;
OP(scan) = EXACTFU_SS;
s += 2;
}
- else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */
- && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
- COMBINING_DIAERESIS_UTF8
- COMBINING_ACUTE_ACCENT_UTF8,
- 6)
- || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
- COMBINING_DIAERESIS_UTF8
- COMBINING_ACUTE_ACCENT_UTF8,
- 6)))
- {
- count = 3;
-
- /* These two folds require special handling by trie's, so
- * change the node type to indicate this. If EXACTFA and
- * EXACTFL were ever to be handled by trie's, this would
- * have to be changed. If this node has already been
- * changed to EXACTFU_SS in this loop, leave it as is. (I
- * (khw) think it doesn't matter in regexec.c for UTF
- * patterns, but no need to change it */
- if (OP(scan) == EXACTFU) {
- OP(scan) = EXACTFU_TRICKYFOLD;
- }
- s += 6;
- }
else { /* Here is a generic multi-char fold. */
const U8* multi_end = s + len;
* test for them. The code that generates the
* is_MULTI_foo() macros croaks should one actually get put
* into Unicode .) */
- if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
+ if (OP(scan) != EXACTFL
+ && OP(scan) != EXACTFA
+ && OP(scan) != EXACTFA_NO_TRIE)
+ {
count = utf8_length(s, multi_end);
s = multi_end;
}
next_iteration: ;
}
}
- else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
-
- /* Here, the pattern is not UTF-8. Look for the multi-char folds
- * that are all ASCII. As in the above case, EXACTFL and EXACTFA
- * nodes can't have multi-char folds to this range (and there are
- * no existing ones in the upper latin1 range). In the EXACTF
- * case we look also for the sharp s, which can be in the final
+ else if (OP(scan) == EXACTFA) {
+
+ /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
+ * fold to the ASCII range (and there are no existing ones in the
+ * upper latin1 range). But, as outlined in the comments preceding
+ * this function, we need to flag any occurrences of the sharp s.
+ * This character forbids trie formation (because of added
+ * complexity) */
+ while (s < s_end) {
+ if (*s == LATIN_SMALL_LETTER_SHARP_S) {
+ OP(scan) = EXACTFA_NO_TRIE;
+ *has_exactf_sharp_s = TRUE;
+ break;
+ }
+ s++;
+ continue;
+ }
+ }
+ else if (OP(scan) != EXACTFL) {
+
+ /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
+ * multi-char folds that are all Latin1. (This code knows that
+ * there are no current multi-char folds possible with EXACTFL,
+ * relying on fold_grind.t to catch any errors if the very unlikely
+ * event happens that some get added in future Unicode versions.)
+ * As explained in the comments preceding this function, we look
+ * also for the sharp s in EXACTF nodes; it can be in the final
* position. Otherwise we can stop looking 1 byte earlier because
* have to find at least two characters for a multi-fold */
const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
- /* The below is perhaps overboard, but this allows us to save a
- * test each time through the loop at the expense of a mask. This
- * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
- * by a single bit. On ASCII they are 32 apart; on EBCDIC, they
- * are 64. This uses an exclusive 'or' to find that bit and then
- * inverts it to form a mask, with just a single 0, in the bit
- * position where 'S' and 's' differ. */
- const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
- const U8 s_masked = 's' & S_or_s_mask;
-
while (s < upper) {
int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
if (! len) { /* Not a multi-char fold. */
}
if (len == 2
- && ((*s & S_or_s_mask) == s_masked)
- && ((*(s+1) & S_or_s_mask) == s_masked))
+ && isARG2_lower_or_UPPER_ARG1('s', *s)
+ && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
{
/* EXACTF nodes need to know that the minimum length
#define INIT_AND_WITHP \
assert(!and_withp); \
- Newx(and_withp,1,struct regnode_charclass_class); \
+ Newx(and_withp,1, regnode_ssc); \
SAVEFREEPV(and_withp)
/* this is a chain of data about sub patterns we are processing that
#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
-STATIC I32
+STATIC SSize_t
S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
- I32 *minlenp, I32 *deltap,
+ SSize_t *minlenp, SSize_t *deltap,
regnode *last,
scan_data_t *data,
I32 stopparen,
U8* recursed,
- struct regnode_charclass_class *and_withp,
+ regnode_ssc *and_withp,
U32 flags, U32 depth)
/* scanp: Start here (read-write). */
/* deltap: Write maxlen-minlen here. */
/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
{
dVAR;
- I32 min = 0; /* There must be at least this number of characters to match */
+ /* There must be at least this number of characters to match */
+ SSize_t min = 0;
I32 pars = 0, code;
regnode *scan = *scanp, *next;
- I32 delta = 0;
+ SSize_t delta = 0;
int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
int is_inf_internal = 0; /* The studied chunk is infinite */
I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
scan_data_t data_fake;
SV *re_trie_maxbuff = NULL;
regnode *first_non_open = scan;
- I32 stopmin = I32_MAX;
+ SSize_t stopmin = SSize_t_MAX;
scan_frame *frame = NULL;
GET_RE_DEBUG_FLAGS_DECL;
/* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
if (OP(next) == code || code == IFTHEN) {
- /* NOTE - There is similar code to this block below for handling
- TRIE nodes on a re-study. If you change stuff here check there
- too. */
- I32 max1 = 0, min1 = I32_MAX, num = 0;
- struct regnode_charclass_class accum;
+ /* NOTE - There is similar code to this block below for
+ * handling TRIE nodes on a re-study. If you change stuff here
+ * check there too. */
+ SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
+ regnode_ssc accum;
regnode * const startbranch=scan;
if (flags & SCF_DO_SUBSTR)
SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
if (flags & SCF_DO_STCLASS)
- cl_init_zero(pRExC_state, &accum);
+ ssc_init_zero(pRExC_state, &accum);
while (OP(scan) == code) {
- I32 deltanext, minnext, f = 0, fake;
- struct regnode_charclass_class this_class;
+ SSize_t deltanext, minnext, fake;
+ I32 f = 0;
+ regnode_ssc this_class;
num++;
data_fake.flags = 0;
if (code != BRANCH)
scan = NEXTOPER(scan);
if (flags & SCF_DO_STCLASS) {
- cl_init(pRExC_state, &this_class);
+ ssc_init(pRExC_state, &this_class);
data_fake.start_class = &this_class;
f = SCF_DO_STCLASS_AND;
}
stopparen, recursed, NULL, f,depth+1);
if (min1 > minnext)
min1 = minnext;
- if (deltanext == I32_MAX) {
+ if (deltanext == SSize_t_MAX) {
is_inf = is_inf_internal = 1;
- max1 = I32_MAX;
+ max1 = SSize_t_MAX;
} else if (max1 < minnext + deltanext)
max1 = minnext + deltanext;
scan = next;
data->whilem_c = data_fake.whilem_c;
}
if (flags & SCF_DO_STCLASS)
- cl_or(pRExC_state, &accum, &this_class);
+ ssc_or(pRExC_state, &accum, &this_class);
}
if (code == IFTHEN && num < 2) /* Empty ELSE branch */
min1 = 0;
if (flags & SCF_DO_SUBSTR) {
data->pos_min += min1;
- if (data->pos_delta >= I32_MAX - (max1 - min1))
- data->pos_delta = I32_MAX;
+ if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
+ data->pos_delta = SSize_t_MAX;
else
data->pos_delta += max1 - min1;
if (max1 != min1 || is_inf)
data->longest = &(data->longest_float);
}
min += min1;
- if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0)
- delta = I32_MAX;
+ if (delta == SSize_t_MAX
+ || SSize_t_MAX - delta - (max1 - min1) < 0)
+ delta = SSize_t_MAX;
else
delta += max1 - min1;
if (flags & SCF_DO_STCLASS_OR) {
- cl_or(pRExC_state, data->start_class, &accum);
+ ssc_or(pRExC_state, data->start_class, &accum);
if (min1) {
- cl_and(data->start_class, and_withp);
+ ssc_and(pRExC_state, data->start_class, and_withp);
flags &= ~SCF_DO_STCLASS;
}
}
else if (flags & SCF_DO_STCLASS_AND) {
if (min1) {
- cl_and(data->start_class, &accum);
+ ssc_and(pRExC_state, data->start_class, &accum);
flags &= ~SCF_DO_STCLASS;
}
else {
/* Switch to OR mode: cache the old value of
* data->start_class */
INIT_AND_WITHP;
- StructCopy(data->start_class, and_withp,
- struct regnode_charclass_class);
+ StructCopy(data->start_class, and_withp, regnode_ssc);
flags &= ~SCF_DO_STCLASS_AND;
- StructCopy(&accum, data->start_class,
- struct regnode_charclass_class);
+ StructCopy(&accum, data->start_class, regnode_ssc);
flags |= SCF_DO_STCLASS_OR;
- SET_SSC_EOS(data->start_class);
}
}
if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
/* demq.
- Assuming this was/is a branch we are dealing with: 'scan' now
- points at the item that follows the branch sequence, whatever
- it is. We now start at the beginning of the sequence and look
- for subsequences of
+ Assuming this was/is a branch we are dealing with: 'scan'
+ now points at the item that follows the branch sequence,
+ whatever it is. We now start at the beginning of the
+ sequence and look for subsequences of
BRANCH->EXACT=>x1
BRANCH->EXACT=>x2
tail
- which would be constructed from a pattern like /A|LIST|OF|WORDS/
+ which would be constructed from a pattern like
+ /A|LIST|OF|WORDS/
If we can find such a subsequence we need to turn the first
element into a trie and then add the subsequent branch exact
We have two cases
- 1. patterns where the whole set of branches can be converted.
+ 1. patterns where the whole set of branches can be
+ converted.
2. patterns where only a subset can be converted.
Step through the branches
cur represents each branch,
- noper is the first thing to be matched as part of that branch
+ noper is the first thing to be matched as part
+ of that branch
noper_next is the regnext() of that node.
- We normally handle a case like this /FOO[xyz]|BAR[pqr]/
- via a "jump trie" but we also support building with NOJUMPTRIE,
- which restricts the trie logic to structures like /FOO|BAR/.
-
- If noper is a trieable nodetype then the branch is a possible optimization
- target. If we are building under NOJUMPTRIE then we require that noper_next
- is the same as scan (our current position in the regex program).
-
- Once we have two or more consecutive such branches we can create a
- trie of the EXACT's contents and stitch it in place into the program.
-
- If the sequence represents all of the branches in the alternation we
- replace the entire thing with a single TRIE node.
-
- Otherwise when it is a subsequence we need to stitch it in place and
- replace only the relevant branches. This means the first branch has
- to remain as it is used by the alternation logic, and its next pointer,
- and needs to be repointed at the item on the branch chain following
- the last branch we have optimized away.
-
- This could be either a BRANCH, in which case the subsequence is internal,
- or it could be the item following the branch sequence in which case the
- subsequence is at the end (which does not necessarily mean the first node
- is the start of the alternation).
-
- TRIE_TYPE(X) is a define which maps the optype to a trietype.
+ We normally handle a case like this
+ /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
+ support building with NOJUMPTRIE, which restricts
+ the trie logic to structures like /FOO|BAR/.
+
+ If noper is a trieable nodetype then the branch is
+ a possible optimization target. If we are building
+ under NOJUMPTRIE then we require that noper_next is
+ the same as scan (our current position in the regex
+ program).
+
+ Once we have two or more consecutive such branches
+ we can create a trie of the EXACT's contents and
+ stitch it in place into the program.
+
+ If the sequence represents all of the branches in
+ the alternation we replace the entire thing with a
+ single TRIE node.
+
+ Otherwise when it is a subsequence we need to
+ stitch it in place and replace only the relevant
+ branches. This means the first branch has to remain
+ as it is used by the alternation logic, and its
+ next pointer, and needs to be repointed at the item
+ on the branch chain following the last branch we
+ have optimized away.
+
+ This could be either a BRANCH, in which case the
+ subsequence is internal, or it could be the item
+ following the branch sequence in which case the
+ subsequence is at the end (which does not
+ necessarily mean the first node is the start of the
+ alternation).
+
+ TRIE_TYPE(X) is a define which maps the optype to a
+ trietype.
optype | trietype
----------------+-----------
EXACT | EXACT
EXACTFU | EXACTFU
EXACTFU_SS | EXACTFU
- EXACTFU_TRICKYFOLD | EXACTFU
- EXACTFA | 0
+ EXACTFA | EXACTFA
*/
#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
( EXACT == (X) ) ? EXACT : \
- ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
+ ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
+ ( EXACTFA == (X) ) ? EXACTFA : \
0 )
/* dont use tail as the end marker for this traverse */
);
});
- /* Is noper a trieable nodetype that can be merged with the
- * current trie (if there is one)? */
+ /* Is noper a trieable nodetype that can be merged
+ * with the current trie (if there is one)? */
if ( noper_trietype
&&
(
#endif
&& count < U16_MAX)
{
- /* Handle mergable triable node
- * Either we are the first node in a new trieable sequence,
- * in which case we do some bookkeeping, otherwise we update
- * the end pointer. */
+ /* Handle mergable triable node Either we are
+ * the first node in a new trieable sequence,
+ * in which case we do some bookkeeping,
+ * otherwise we update the end pointer. */
if ( !first ) {
first = cur;
if ( noper_trietype == NOTHING ) {
if ( noper_next_trietype ) {
trietype = noper_next_trietype;
} else if (noper_next_type) {
- /* a NOTHING regop is 1 regop wide. We need at least two
- * for a trie so we can't merge this in */
+ /* a NOTHING regop is 1 regop wide.
+ * We need at least two for a trie
+ * so we can't merge this in */
first = NULL;
}
} else {
} /* end handle mergable triable node */
else {
/* handle unmergable node -
- * noper may either be a triable node which can not be tried
- * together with the current trie, or a non triable node */
+ * noper may either be a triable node which can
+ * not be tried together with the current trie,
+ * or a non triable node */
if ( last ) {
- /* If last is set and trietype is not NOTHING then we have found
- * at least two triable branch sequences in a row of a similar
- * trietype so we can turn them into a trie. If/when we
- * allow NOTHING to start a trie sequence this condition will be
- * required, and it isn't expensive so we leave it in for now. */
+ /* If last is set and trietype is not
+ * NOTHING then we have found at least two
+ * triable branch sequences in a row of a
+ * similar trietype so we can turn them
+ * into a trie. If/when we allow NOTHING to
+ * start a trie sequence this condition
+ * will be required, and it isn't expensive
+ * so we leave it in for now. */
if ( trietype && trietype != NOTHING )
make_trie( pRExC_state,
startbranch, first, cur, tail, count,
trietype, depth+1 );
- last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
+ last = NULL; /* note: we clear/update
+ first, trietype etc below,
+ so we dont do it here */
}
if ( noper_trietype
#ifdef NOJUMPTRIE
&& noper_next == tail
#endif
){
- /* noper is triable, so we can start a new trie sequence */
+ /* noper is triable, so we can start a new
+ * trie sequence */
count = 1;
first = cur;
trietype = noper_trietype;
} else if (first) {
- /* if we already saw a first but the current node is not triable then we have
+ /* if we already saw a first but the
+ * current node is not triable then we have
* to reset the first information. */
count = 0;
first = NULL;
});
if ( last && trietype ) {
if ( trietype != NOTHING ) {
- /* the last branch of the sequence was part of a trie,
- * so we have to construct it here outside of the loop
- */
+ /* the last branch of the sequence was part of
+ * a trie, so we have to construct it here
+ * outside of the loop */
made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
#ifdef TRIE_STUDY_OPT
if ( ((made == MADE_EXACT_TRIE &&
}
#endif
} else {
- /* at this point we know whatever we have is a NOTHING sequence/branch
- * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
+ /* at this point we know whatever we have is a
+ * NOTHING sequence/branch AND if 'startbranch'
+ * is 'first' then we can turn the whole thing
+ * into a NOTHING
*/
if ( startbranch == first ) {
regnode *opt;
- /* the entire thing is a NOTHING sequence, something like this:
- * (?:|) So we can turn it into a plain NOTHING op. */
+ /* the entire thing is a NOTHING sequence,
+ * something like this: (?:|) So we can
+ * turn it into a plain NOTHING op. */
DEBUG_TRIE_COMPILE_r({
regprop(RExC_rx, mysv, cur);
PerlIO_printf( Perl_debug_log,
}
is_inf = is_inf_internal = 1;
if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
- cl_anything(pRExC_state, data->start_class);
+ ssc_anything(data->start_class);
flags &= ~SCF_DO_STCLASS;
}
} else {
}
}
else if (OP(scan) == EXACT) {
- I32 l = STR_LEN(scan);
+ SSize_t l = STR_LEN(scan);
UV uc;
if (UTF) {
const U8 * const s = (U8*)STRING(scan);
if (data->last_end == -1) { /* Update the start info. */
data->last_start_min = data->pos_min;
data->last_start_max = is_inf
- ? I32_MAX : data->pos_min + data->pos_delta;
+ ? SSize_t_MAX : data->pos_min + data->pos_delta;
}
sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
if (UTF)
data->pos_min += l; /* As in the first entry. */
data->flags &= ~SF_BEFORE_EOL;
}
+
+ /* ANDing the code point leaves at most it, and not in locale, and
+ * can't match null string */
if (flags & SCF_DO_STCLASS_AND) {
- /* Check whether it is compatible with what we know already! */
- int compat = 1;
-
-
- /* If compatible, we or it in below. It is compatible if is
- * in the bitmp and either 1) its bit or its fold is set, or 2)
- * it's for a locale. Even if there isn't unicode semantics
- * here, at runtime there may be because of matching against a
- * utf8 string, so accept a possible false positive for
- * latin1-range folds */
- if (uc >= 0x100 ||
- (!(data->start_class->flags & ANYOF_LOCALE)
- && !ANYOF_BITMAP_TEST(data->start_class, uc)
- && (!(data->start_class->flags & ANYOF_LOC_FOLD)
- || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
- )
- {
- compat = 0;
- }
- ANYOF_CLASS_ZERO(data->start_class);
- ANYOF_BITMAP_ZERO(data->start_class);
- if (compat)
- ANYOF_BITMAP_SET(data->start_class, uc);
- else if (uc >= 0x100) {
- int i;
-
- /* Some Unicode code points fold to the Latin1 range; as
- * XXX temporary code, instead of figuring out if this is
- * one, just assume it is and set all the start class bits
- * that could be some such above 255 code point's fold
- * which will generate fals positives. As the code
- * elsewhere that does compute the fold settles down, it
- * can be extracted out and re-used here */
- for (i = 0; i < 256; i++){
- if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
- ANYOF_BITMAP_SET(data->start_class, i);
- }
- }
- }
- CLEAR_SSC_EOS(data->start_class);
- if (uc < 0x100)
- data->start_class->flags &= ~ANYOF_UNICODE_ALL;
+ ssc_cp_and(data->start_class, uc);
+ ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
+ ssc_clear_locale(data->start_class);
}
else if (flags & SCF_DO_STCLASS_OR) {
- /* false positive possible if the class is case-folded */
- if (uc < 0x100)
- ANYOF_BITMAP_SET(data->start_class, uc);
- else
- data->start_class->flags |= ANYOF_UNICODE_ALL;
- CLEAR_SSC_EOS(data->start_class);
- cl_and(data->start_class, and_withp);
+ ssc_add_cp(data->start_class, uc);
+ ssc_and(pRExC_state, data->start_class, and_withp);
}
flags &= ~SCF_DO_STCLASS;
}
else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
- I32 l = STR_LEN(scan);
+ SSize_t l = STR_LEN(scan);
UV uc = *((U8*)STRING(scan));
+ SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
+ separate code points */
/* Search for fixed substrings supports EXACT only. */
if (flags & SCF_DO_SUBSTR) {
data->longest = &(data->longest_float);
}
}
- if (flags & SCF_DO_STCLASS_AND) {
- /* Check whether it is compatible with what we know already! */
- int compat = 1;
- if (uc >= 0x100 ||
- (!(data->start_class->flags & ANYOF_LOCALE)
- && !ANYOF_BITMAP_TEST(data->start_class, uc)
- && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
- {
- compat = 0;
- }
- ANYOF_CLASS_ZERO(data->start_class);
- ANYOF_BITMAP_ZERO(data->start_class);
- if (compat) {
- ANYOF_BITMAP_SET(data->start_class, uc);
- CLEAR_SSC_EOS(data->start_class);
- if (OP(scan) == EXACTFL) {
- /* XXX This set is probably no longer necessary, and
- * probably wrong as LOCALE now is on in the initial
- * state */
- data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
- }
- else {
+ if (OP(scan) == EXACTFL) {
+ if (flags & SCF_DO_STCLASS_AND) {
+ ssc_flags_and(data->start_class,
+ ANYOF_LOCALE|ANYOF_LOC_FOLD);
+ }
+ else if (flags & SCF_DO_STCLASS_OR) {
+ ANYOF_FLAGS(data->start_class)
+ |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
+ }
- /* Also set the other member of the fold pair. In case
- * that unicode semantics is called for at runtime, use
- * the full latin1 fold. (Can't do this for locale,
- * because not known until runtime) */
- ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
-
- /* All other (EXACTFL handled above) folds except under
- * /iaa that include s, S, and sharp_s also may include
- * the others */
- if (OP(scan) != EXACTFA) {
- if (uc == 's' || uc == 'S') {
- ANYOF_BITMAP_SET(data->start_class,
- LATIN_SMALL_LETTER_SHARP_S);
- }
- else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
- ANYOF_BITMAP_SET(data->start_class, 's');
- ANYOF_BITMAP_SET(data->start_class, 'S');
- }
- }
- }
- }
- else if (uc >= 0x100) {
- int i;
- for (i = 0; i < 256; i++){
- if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
- ANYOF_BITMAP_SET(data->start_class, i);
- }
- }
- }
+ /* We don't know what the folds are; it could be anything. XXX
+ * Actually, we only support UTF-8 encoding for code points
+ * above Latin1, so we could know what those folds are. */
+ EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
+ 0,
+ UV_MAX);
+ }
+ else { /* Non-locale EXACTFish */
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
+ if (flags & SCF_DO_STCLASS_AND) {
+ ssc_clear_locale(data->start_class);
+ }
+ if (uc < 256) { /* We know what the Latin1 folds are ... */
+ if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we
+ know if anything folds
+ with this */
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
+ PL_fold_latin1[uc]);
+ if (OP(scan) != EXACTFA) { /* The folds below aren't
+ legal under /iaa */
+ if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
+ EXACTF_invlist
+ = add_cp_to_invlist(EXACTF_invlist,
+ LATIN_SMALL_LETTER_SHARP_S);
+ }
+ else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
+ EXACTF_invlist
+ = add_cp_to_invlist(EXACTF_invlist, 's');
+ EXACTF_invlist
+ = add_cp_to_invlist(EXACTF_invlist, 'S');
+ }
+ }
+
+ /* We also know if there are above-Latin1 code points
+ * that fold to this (none legal for ASCII and /iaa) */
+ if ((! isASCII(uc) || OP(scan) != EXACTFA)
+ && HAS_NONLATIN1_FOLD_CLOSURE(uc))
+ {
+ /* XXX We could know exactly what does fold to this
+ * if the reverse folds are loaded, as currently in
+ * S_regclass() */
+ _invlist_union(EXACTF_invlist,
+ PL_AboveLatin1,
+ &EXACTF_invlist);
+ }
+ }
+ }
+ else { /* Non-locale, above Latin1. XXX We don't currently
+ know what participates in folds with this, so have
+ to assume anything could */
+
+ /* XXX We could know exactly what does fold to this if the
+ * reverse folds are loaded, as currently in S_regclass().
+ * But we do know that under /iaa nothing in the ASCII
+ * range can participate */
+ if (OP(scan) == EXACTFA) {
+ _invlist_union_complement_2nd(EXACTF_invlist,
+ PL_Posix_ptrs[_CC_ASCII],
+ &EXACTF_invlist);
+ }
+ else {
+ EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
+ 0, UV_MAX);
+ }
+ }
+ }
+ if (flags & SCF_DO_STCLASS_AND) {
+ ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
+ ANYOF_POSIXL_ZERO(data->start_class);
+ ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
}
else if (flags & SCF_DO_STCLASS_OR) {
- if (data->start_class->flags & ANYOF_LOC_FOLD) {
- /* false positive possible if the class is case-folded.
- Assume that the locale settings are the same... */
- if (uc < 0x100) {
- ANYOF_BITMAP_SET(data->start_class, uc);
- if (OP(scan) != EXACTFL) {
-
- /* And set the other member of the fold pair, but
- * can't do that in locale because not known until
- * run-time */
- ANYOF_BITMAP_SET(data->start_class,
- PL_fold_latin1[uc]);
-
- /* All folds except under /iaa that include s, S,
- * and sharp_s also may include the others */
- if (OP(scan) != EXACTFA) {
- if (uc == 's' || uc == 'S') {
- ANYOF_BITMAP_SET(data->start_class,
- LATIN_SMALL_LETTER_SHARP_S);
- }
- else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
- ANYOF_BITMAP_SET(data->start_class, 's');
- ANYOF_BITMAP_SET(data->start_class, 'S');
- }
- }
- }
- }
- CLEAR_SSC_EOS(data->start_class);
- }
- cl_and(data->start_class, and_withp);
+ ssc_union(data->start_class, EXACTF_invlist, FALSE);
+ ssc_and(pRExC_state, data->start_class, and_withp);
}
flags &= ~SCF_DO_STCLASS;
+ SvREFCNT_dec(EXACTF_invlist);
}
else if (REGNODE_VARIES(OP(scan))) {
- I32 mincount, maxcount, minnext, deltanext, fl = 0;
- I32 f = flags, pos_before = 0;
+ SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
+ I32 fl = 0, f = flags;
regnode * const oscan = scan;
- struct regnode_charclass_class this_class;
- struct regnode_charclass_class *oclass = NULL;
+ regnode_ssc this_class;
+ regnode_ssc *oclass = NULL;
I32 next_is_eval = 0;
switch (PL_regkind[OP(scan)]) {
data->flags |= SF_IS_INF;
}
if (flags & SCF_DO_STCLASS) {
- cl_init(pRExC_state, &this_class);
+ ssc_init(pRExC_state, &this_class);
oclass = data->start_class;
data->start_class = &this_class;
f |= SCF_DO_STCLASS_AND;
data->start_class = oclass;
if (mincount == 0 || minnext == 0) {
if (flags & SCF_DO_STCLASS_OR) {
- cl_or(pRExC_state, data->start_class, &this_class);
+ ssc_or(pRExC_state, data->start_class, &this_class);
}
else if (flags & SCF_DO_STCLASS_AND) {
/* Switch to OR mode: cache the old value of
* data->start_class */
INIT_AND_WITHP;
- StructCopy(data->start_class, and_withp,
- struct regnode_charclass_class);
+ StructCopy(data->start_class, and_withp, regnode_ssc);
flags &= ~SCF_DO_STCLASS_AND;
- StructCopy(&this_class, data->start_class,
- struct regnode_charclass_class);
+ StructCopy(&this_class, data->start_class, regnode_ssc);
flags |= SCF_DO_STCLASS_OR;
- SET_SSC_EOS(data->start_class);
}
} else { /* Non-zero len */
if (flags & SCF_DO_STCLASS_OR) {
- cl_or(pRExC_state, data->start_class, &this_class);
- cl_and(data->start_class, and_withp);
+ ssc_or(pRExC_state, data->start_class, &this_class);
+ ssc_and(pRExC_state, data->start_class, and_withp);
}
else if (flags & SCF_DO_STCLASS_AND)
- cl_and(data->start_class, &this_class);
+ ssc_and(pRExC_state, data->start_class, &this_class);
flags &= ~SCF_DO_STCLASS;
}
if (!scan) /* It was not CURLYX, but CURLY. */
scan = next;
- if ( /* ? quantifier ok, except for (?{ ... }) */
- (next_is_eval || !(mincount == 0 && maxcount == 1))
+ if (!(flags & SCF_TRIE_DOING_RESTUDY)
+ /* ? quantifier ok, except for (?{ ... }) */
+ && (next_is_eval || !(mincount == 0 && maxcount == 1))
&& (minnext == 0) && (deltanext == 0)
&& data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
&& maxcount <= REG_INFTY/3) /* Complement check for big count */
}
min += minnext * mincount;
- is_inf_internal |= deltanext == I32_MAX
+ is_inf_internal |= deltanext == SSize_t_MAX
|| (maxcount == REG_INFTY && minnext + deltanext > 0);
is_inf |= is_inf_internal;
if (is_inf)
- delta = I32_MAX;
+ delta = SSize_t_MAX;
else
delta += (minnext + deltanext) * maxcount - minnext * mincount;
int counted = mincount != 0;
if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
-#if defined(SPARC64_GCC_WORKAROUND)
- I32 b = 0;
- STRLEN l = 0;
- const char *s = NULL;
- I32 old = 0;
-
- if (pos_before >= data->last_start_min)
- b = pos_before;
- else
- b = data->last_start_min;
-
- l = 0;
- s = SvPV_const(data->last_found, l);
- old = b - data->last_start_min;
-
-#else
- I32 b = pos_before >= data->last_start_min
+ SSize_t b = pos_before >= data->last_start_min
? pos_before : data->last_start_min;
STRLEN l;
const char * const s = SvPV_const(data->last_found, l);
- I32 old = b - data->last_start_min;
-#endif
+ SSize_t old = b - data->last_start_min;
if (UTF)
old = utf8_hop((U8*)s, old) - (U8*)s;
} else {
/* start offset must point into the last copy */
data->last_start_min += minnext * (mincount - 1);
- data->last_start_max += is_inf ? I32_MAX
+ data->last_start_max += is_inf ? SSize_t_MAX
: (maxcount - 1) * (minnext + data->pos_delta);
}
}
/* It is counted once already... */
data->pos_min += minnext * (mincount - counted);
#if 0
-PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n",
- counted, deltanext, I32_MAX, minnext, maxcount, mincount);
-if (deltanext != I32_MAX)
-PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta);
+PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
+ " SSize_t_MAX=%"UVdf" minnext=%"UVdf
+ " maxcount=%"UVdf" mincount=%"UVdf"\n",
+ (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
+ (UV)mincount);
+if (deltanext != SSize_t_MAX)
+PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
+ (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
+ - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
#endif
- if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta)
- data->pos_delta = I32_MAX;
+ if (deltanext == SSize_t_MAX ||
+ -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
+ data->pos_delta = SSize_t_MAX;
else
data->pos_delta += - counted * deltanext +
(minnext + deltanext) * maxcount - minnext * mincount;
data->last_start_min =
data->pos_min - CHR_SVLEN(last_str);
data->last_start_max = is_inf
- ? I32_MAX
+ ? SSize_t_MAX
: data->pos_min + data->pos_delta
- CHR_SVLEN(last_str);
}
NEXT_OFF(oscan) += NEXT_OFF(next);
}
continue;
- default: /* REF, and CLUMP only? */
+
+ default:
+#ifdef DEBUGGING
+ Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
+ OP(scan));
+#endif
+ case REF:
+ case CLUMP:
if (flags & SCF_DO_SUBSTR) {
SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
data->longest = &(data->longest_float);
}
is_inf = is_inf_internal = 1;
- if (flags & SCF_DO_STCLASS_OR)
- cl_anything(pRExC_state, data->start_class);
+ if (flags & SCF_DO_STCLASS_OR) {
+ if (OP(scan) == CLUMP) {
+ /* Actually is any start char, but very few code points
+ * aren't start characters */
+ ssc_match_all_cp(data->start_class);
+ }
+ else {
+ ssc_anything(data->start_class);
+ }
+ }
flags &= ~SCF_DO_STCLASS;
break;
}
}
else if (OP(scan) == LNBREAK) {
if (flags & SCF_DO_STCLASS) {
- int value = 0;
- CLEAR_SSC_EOS(data->start_class); /* No match on empty */
if (flags & SCF_DO_STCLASS_AND) {
- for (value = 0; value < 256; value++)
- if (!is_VERTWS_cp(value))
- ANYOF_BITMAP_CLEAR(data->start_class, value);
+ ssc_intersection(data->start_class,
+ PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
+ ssc_clear_locale(data->start_class);
+ ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
}
- else {
- for (value = 0; value < 256; value++)
- if (is_VERTWS_cp(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ else if (flags & SCF_DO_STCLASS_OR) {
+ ssc_union(data->start_class,
+ PL_XPosix_ptrs[_CC_VERTSPACE],
+ FALSE);
+ ssc_and(pRExC_state, data->start_class, and_withp);
}
- if (flags & SCF_DO_STCLASS_OR)
- cl_and(data->start_class, and_withp);
flags &= ~SCF_DO_STCLASS;
}
min++;
}
}
else if (REGNODE_SIMPLE(OP(scan))) {
- int value = 0;
if (flags & SCF_DO_SUBSTR) {
SCAN_COMMIT(pRExC_state,data,minlenp);
}
min++;
if (flags & SCF_DO_STCLASS) {
- int loop_max = 256;
- CLEAR_SSC_EOS(data->start_class); /* No match on empty */
+ bool invert = 0;
+ SV* my_invlist = sv_2mortal(_new_invlist(0));
+ U8 classnum;
+ U8 namedclass;
+
+ if (flags & SCF_DO_STCLASS_AND) {
+ ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
+ }
/* Some of the logic below assumes that switching
locale on will only add false positives. */
- switch (PL_regkind[OP(scan)]) {
- U8 classnum;
+ switch (OP(scan)) {
- case SANY:
default:
#ifdef DEBUGGING
Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
#endif
- do_default:
+ case CANY:
+ case SANY:
if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
- cl_anything(pRExC_state, data->start_class);
+ ssc_match_all_cp(data->start_class);
break;
+
case REG_ANY:
- if (OP(scan) == SANY)
- goto do_default;
- if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
- value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
- || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
- cl_anything(pRExC_state, data->start_class);
+ {
+ SV* REG_ANY_invlist = _new_invlist(2);
+ REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
+ '\n');
+ if (flags & SCF_DO_STCLASS_OR) {
+ ssc_union(data->start_class,
+ REG_ANY_invlist,
+ TRUE /* TRUE => invert, hence all but \n
+ */
+ );
+ }
+ else if (flags & SCF_DO_STCLASS_AND) {
+ ssc_intersection(data->start_class,
+ REG_ANY_invlist,
+ TRUE /* TRUE => invert */
+ );
+ ssc_clear_locale(data->start_class);
+ }
+ SvREFCNT_dec_NN(REG_ANY_invlist);
}
- if (flags & SCF_DO_STCLASS_AND || !value)
- ANYOF_BITMAP_CLEAR(data->start_class,'\n');
break;
- case ANYOF:
+
+ case ANYOF_WARN_SUPER:
+ case ANYOF:
if (flags & SCF_DO_STCLASS_AND)
- cl_and(data->start_class,
- (struct regnode_charclass_class*)scan);
+ ssc_and(pRExC_state, data->start_class,
+ (regnode_ssc*) scan);
else
- cl_or(pRExC_state, data->start_class,
- (struct regnode_charclass_class*)scan);
+ ssc_or(pRExC_state, data->start_class,
+ (regnode_ssc*)scan);
break;
- case POSIXA:
- loop_max = 128;
+
+ case NPOSIXL:
+ invert = 1;
/* FALL THROUGH */
+
case POSIXL:
- case POSIXD:
- case POSIXU:
classnum = FLAGS(scan);
- if (flags & SCF_DO_STCLASS_AND) {
- if (!(data->start_class->flags & ANYOF_LOCALE)) {
- ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
- for (value = 0; value < loop_max; value++) {
- if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
- ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
- }
- }
- }
- }
- else {
- if (data->start_class->flags & ANYOF_LOCALE) {
- ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
+ namedclass = classnum_to_namedclass(classnum) + invert;
+ if (flags & SCF_DO_STCLASS_AND) {
+ bool was_there = ANYOF_POSIXL_TEST(data->start_class,
+ namedclass);
+ ANYOF_POSIXL_ZERO(data->start_class);
+ if (was_there) { /* Do an AND */
+ ANYOF_POSIXL_SET(data->start_class, namedclass);
}
- else {
-
- /* Even if under locale, set the bits for non-locale
- * in case it isn't a true locale-node. This will
- * create false positives if it truly is locale */
- for (value = 0; value < loop_max; value++) {
- if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
- ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
+ /* No individual code points can now match */
+ data->start_class->invlist
+ = sv_2mortal(_new_invlist(0));
+ }
+ else {
+ int complement = namedclass + ((invert) ? -1 : 1);
+
+ assert(flags & SCF_DO_STCLASS_OR);
+
+ /* If the complement of this class was already there,
+ * the result is that they match all code points,
+ * (\d + \D == everything). Remove the classes from
+ * future consideration. Locale is not relevant in
+ * this case */
+ if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
+ ssc_match_all_cp(data->start_class);
+ ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
+ ANYOF_POSIXL_CLEAR(data->start_class, complement);
+ if (! ANYOF_POSIXL_TEST_ANY_SET(data->start_class))
+ {
+ ANYOF_FLAGS(data->start_class) &= ~ANYOF_POSIXL;
}
}
+ else { /* The usual case; just add this class to the
+ existing set */
+ ANYOF_POSIXL_SET(data->start_class, namedclass);
+ ANYOF_FLAGS(data->start_class)
+ |= ANYOF_LOCALE|ANYOF_POSIXL;
}
- }
- break;
- case NPOSIXA:
- loop_max = 128;
+ }
+ break;
+
+ case NPOSIXA: /* For these, we always know the exact set of
+ what's matched */
+ invert = 1;
/* FALL THROUGH */
- case NPOSIXL:
- case NPOSIXU:
+ case POSIXA:
+ classnum = FLAGS(scan);
+ my_invlist = PL_Posix_ptrs[classnum];
+ goto join_posix;
+
case NPOSIXD:
+ case NPOSIXU:
+ invert = 1;
+ /* FALL THROUGH */
+ case POSIXD:
+ case POSIXU:
classnum = FLAGS(scan);
- if (flags & SCF_DO_STCLASS_AND) {
- if (!(data->start_class->flags & ANYOF_LOCALE)) {
- ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
- for (value = 0; value < loop_max; value++) {
- if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
- ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
- }
- }
- }
- }
- else {
- if (data->start_class->flags & ANYOF_LOCALE) {
- ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
- }
- else {
- /* Even if under locale, set the bits for non-locale in
- * case it isn't a true locale-node. This will create
- * false positives if it truly is locale */
- for (value = 0; value < loop_max; value++) {
- if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
- ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
- }
- }
- if (PL_regkind[OP(scan)] == NPOSIXD) {
- data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
- }
- }
- }
- break;
+ /* If we know all the code points that match the class, use
+ * that; otherwise use the Latin1 code points, plus we have
+ * to assume that it could match anything above Latin1 */
+ if (PL_XPosix_ptrs[classnum]) {
+ my_invlist = invlist_clone(PL_XPosix_ptrs[classnum]);
+ }
+ else {
+ _invlist_union(PL_L1Posix_ptrs[classnum],
+ PL_AboveLatin1, &my_invlist);
+ }
+
+ /* NPOSIXD matches all upper Latin1 code points unless the
+ * target string being matched is UTF-8, which is
+ * unknowable until match time */
+ if (PL_regkind[OP(scan)] == NPOSIXD) {
+ _invlist_union_complement_2nd(my_invlist,
+ PL_Posix_ptrs[_CC_ASCII], &my_invlist);
+ }
+
+ join_posix:
+
+ if (flags & SCF_DO_STCLASS_AND) {
+ ssc_intersection(data->start_class, my_invlist, invert);
+ ssc_clear_locale(data->start_class);
+ }
+ else {
+ assert(flags & SCF_DO_STCLASS_OR);
+ ssc_union(data->start_class, my_invlist, invert);
+ }
}
if (flags & SCF_DO_STCLASS_OR)
- cl_and(data->start_class, and_withp);
+ ssc_and(pRExC_state, data->start_class, and_withp);
flags &= ~SCF_DO_STCLASS;
}
}
In this case we can't do fixed string optimisation.
*/
- I32 deltanext, minnext, fake = 0;
+ SSize_t deltanext, minnext, fake = 0;
regnode *nscan;
- struct regnode_charclass_class intrnl;
+ regnode_ssc intrnl;
int f = 0;
data_fake.flags = 0;
data_fake.pos_delta = delta;
if ( flags & SCF_DO_STCLASS && !scan->flags
&& OP(scan) == IFMATCH ) { /* Lookahead */
- cl_init(pRExC_state, &intrnl);
+ ssc_init(pRExC_state, &intrnl);
data_fake.start_class = &intrnl;
f |= SCF_DO_STCLASS_AND;
}
* *** HACK *** for now just treat as "no information".
* See [perl #56690].
*/
- cl_init(pRExC_state, data->start_class);
+ ssc_init(pRExC_state, data->start_class);
} else {
/* AND before and after: combine and continue */
- const int was = TEST_SSC_EOS(data->start_class);
-
- cl_and(data->start_class, &intrnl);
- if (was)
- SET_SSC_EOS(data->start_class);
+ ssc_and(pRExC_state, data->start_class, &intrnl);
}
}
}
length of the pattern, something we won't know about
until after the recurse.
*/
- I32 deltanext, fake = 0;
+ SSize_t deltanext, fake = 0;
regnode *nscan;
- struct regnode_charclass_class intrnl;
+ regnode_ssc intrnl;
int f = 0;
/* We use SAVEFREEPV so that when the full compile
is finished perl will clean up the allocated
have to worry about freeing them when we know
they wont be used, which would be a pain.
*/
- I32 *minnextp;
- Newx( minnextp, 1, I32 );
+ SSize_t *minnextp;
+ Newx( minnextp, 1, SSize_t );
SAVEFREEPV(minnextp);
if (data) {
data_fake.flags |= SF_IS_INF;
if ( flags & SCF_DO_STCLASS && !scan->flags
&& OP(scan) == IFMATCH ) { /* Lookahead */
- cl_init(pRExC_state, &intrnl);
+ ssc_init(pRExC_state, &intrnl);
data_fake.start_class = &intrnl;
f |= SCF_DO_STCLASS_AND;
}
*minnextp += min;
if (f & SCF_DO_STCLASS_AND) {
- const int was = TEST_SSC_EOS(data.start_class);
-
- cl_and(data->start_class, &intrnl);
- if (was)
- SET_SSC_EOS(data->start_class);
+ ssc_and(pRExC_state, data->start_class, &intrnl);
}
if (data) {
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
}
is_inf = is_inf_internal = 1;
if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
- cl_anything(pRExC_state, data->start_class);
+ ssc_anything(data->start_class);
flags &= ~SCF_DO_STCLASS;
}
else if (OP(scan) == GPOS) {
{
if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
RExC_rx->extflags |= RXf_ANCH_GPOS;
- if (RExC_rx->gofs < (U32)min)
+ if (RExC_rx->gofs < (STRLEN)min)
RExC_rx->gofs = min;
} else {
RExC_rx->extflags |= RXf_GPOS_FLOAT;
regnode *trie_node= scan;
regnode *tail= regnext(scan);
reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
- I32 max1 = 0, min1 = I32_MAX;
- struct regnode_charclass_class accum;
+ SSize_t max1 = 0, min1 = SSize_t_MAX;
+ regnode_ssc accum;
if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
if (flags & SCF_DO_STCLASS)
- cl_init_zero(pRExC_state, &accum);
+ ssc_init_zero(pRExC_state, &accum);
if (!trie->jump) {
min1= trie->minlen;
for ( word=1 ; word <= trie->wordcount ; word++)
{
- I32 deltanext=0, minnext=0, f = 0, fake;
- struct regnode_charclass_class this_class;
+ SSize_t deltanext=0, minnext=0, f = 0, fake;
+ regnode_ssc this_class;
data_fake.flags = 0;
if (data) {
data_fake.last_closep = &fake;
data_fake.pos_delta = delta;
if (flags & SCF_DO_STCLASS) {
- cl_init(pRExC_state, &this_class);
+ ssc_init(pRExC_state, &this_class);
data_fake.start_class = &this_class;
f = SCF_DO_STCLASS_AND;
}
if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
nextbranch= regnext((regnode*)nextbranch);
- if (min1 > (I32)(minnext + trie->minlen))
+ if (min1 > (SSize_t)(minnext + trie->minlen))
min1 = minnext + trie->minlen;
- if (deltanext == I32_MAX) {
+ if (deltanext == SSize_t_MAX) {
is_inf = is_inf_internal = 1;
- max1 = I32_MAX;
- } else if (max1 < (I32)(minnext + deltanext + trie->maxlen))
+ max1 = SSize_t_MAX;
+ } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
max1 = minnext + deltanext + trie->maxlen;
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
data->whilem_c = data_fake.whilem_c;
}
if (flags & SCF_DO_STCLASS)
- cl_or(pRExC_state, &accum, &this_class);
+ ssc_or(pRExC_state, &accum, &this_class);
}
}
if (flags & SCF_DO_SUBSTR) {
min += min1;
delta += max1 - min1;
if (flags & SCF_DO_STCLASS_OR) {
- cl_or(pRExC_state, data->start_class, &accum);
+ ssc_or(pRExC_state, data->start_class, &accum);
if (min1) {
- cl_and(data->start_class, and_withp);
+ ssc_and(pRExC_state, data->start_class, and_withp);
flags &= ~SCF_DO_STCLASS;
}
}
else if (flags & SCF_DO_STCLASS_AND) {
if (min1) {
- cl_and(data->start_class, &accum);
+ ssc_and(pRExC_state, data->start_class, &accum);
flags &= ~SCF_DO_STCLASS;
}
else {
/* Switch to OR mode: cache the old value of
* data->start_class */
INIT_AND_WITHP;
- StructCopy(data->start_class, and_withp,
- struct regnode_charclass_class);
+ StructCopy(data->start_class, and_withp, regnode_ssc);
flags &= ~SCF_DO_STCLASS_AND;
- StructCopy(&accum, data->start_class,
- struct regnode_charclass_class);
+ StructCopy(&accum, data->start_class, regnode_ssc);
flags |= SCF_DO_STCLASS_OR;
- SET_SSC_EOS(data->start_class);
}
}
scan= tail;
DEBUG_STUDYDATA("pre-fin:",data,depth);
*scanp = scan;
- *deltap = is_inf_internal ? I32_MAX : delta;
+ *deltap = is_inf_internal ? SSize_t_MAX : delta;
if (flags & SCF_DO_SUBSTR && is_inf)
- data->pos_delta = I32_MAX - data->pos_min;
+ data->pos_delta = SSize_t_MAX - data->pos_min;
if (is_par > (I32)U8_MAX)
is_par = 0;
if (is_par && pars==1 && data) {
data->flags &= ~SF_IN_PAR;
}
if (flags & SCF_DO_STCLASS_OR)
- cl_and(data->start_class, and_withp);
+ ssc_and(pRExC_state, data->start_class, and_withp);
if (flags & SCF_TRIE_RESTUDY)
data->flags |= SCF_TRIE_RESTUDY;
}
STATIC U32
-S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
+S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
{
U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
HV * const table = GvHV(PL_hintgv);
SV **ptr;
- if (!table)
+ if (!table || !(PL_hints & HINT_LOCALIZE_HH))
return &PL_core_reg_engine;
ptr = hv_fetchs(table, "regcomp", FALSE);
if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
PERL_ARGS_ASSERT_PREGCOMP;
- /* Dispatch a request to compile a regexp to correct regexp engine. */
- DEBUG_COMPILE_r({
- PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
- PTR2UV(eng));
- });
- return CALLREGCOMP_ENG(eng, pattern, flags);
+ /* Dispatch a request to compile a regexp to correct regexp engine. */
+ DEBUG_COMPILE_r({
+ PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
+ PTR2UV(eng));
+ });
+ return CALLREGCOMP_ENG(eng, pattern, flags);
+}
+#endif
+
+/* public(ish) entry point for the perl core's own regex compiling code.
+ * It's actually a wrapper for Perl_re_op_compile that only takes an SV
+ * pattern rather than a list of OPs, and uses the internal engine rather
+ * than the current one */
+
+REGEXP *
+Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
+{
+ SV *pat = pattern; /* defeat constness! */
+ PERL_ARGS_ASSERT_RE_COMPILE;
+ return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
+#ifdef PERL_IN_XSUB_RE
+ &my_reg_engine,
+#else
+ &PL_core_reg_engine,
+#endif
+ NULL, NULL, rx_flags, 0);
+}
+
+
+/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
+ * blocks, recalculate the indices. Update pat_p and plen_p in-place to
+ * point to the realloced string and length.
+ *
+ * This is essentially a copy of Perl_bytes_to_utf8() with the code index
+ * stuff added */
+
+static void
+S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
+ char **pat_p, STRLEN *plen_p, int num_code_blocks)
+{
+ U8 *const src = (U8*)*pat_p;
+ U8 *dst;
+ int n=0;
+ STRLEN s = 0, d = 0;
+ bool do_end = 0;
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
+
+ Newx(dst, *plen_p * 2 + 1, U8);
+
+ while (s < *plen_p) {
+ if (NATIVE_BYTE_IS_INVARIANT(src[s]))
+ dst[d] = src[s];
+ else {
+ dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
+ dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
+ }
+ if (n < num_code_blocks) {
+ if (!do_end && pRExC_state->code_blocks[n].start == s) {
+ pRExC_state->code_blocks[n].start = d;
+ assert(dst[d] == '(');
+ do_end = 1;
+ }
+ else if (do_end && pRExC_state->code_blocks[n].end == s) {
+ pRExC_state->code_blocks[n].end = d;
+ assert(dst[d] == ')');
+ do_end = 0;
+ n++;
+ }
+ }
+ s++;
+ d++;
+ }
+ dst[d] = '\0';
+ *plen_p = d;
+ *pat_p = (char*) dst;
+ SAVEFREEPV(*pat_p);
+ RExC_orig_utf8 = RExC_utf8 = 1;
+}
+
+
+
+/* S_concat_pat(): concatenate a list of args to the pattern string pat,
+ * while recording any code block indices, and handling overloading,
+ * nested qr// objects etc. If pat is null, it will allocate a new
+ * string, or just return the first arg, if there's only one.
+ *
+ * Returns the malloced/updated pat.
+ * patternp and pat_count is the array of SVs to be concatted;
+ * oplist is the optional list of ops that generated the SVs;
+ * recompile_p is a pointer to a boolean that will be set if
+ * the regex will need to be recompiled.
+ * delim, if non-null is an SV that will be inserted between each element
+ */
+
+static SV*
+S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
+ SV *pat, SV ** const patternp, int pat_count,
+ OP *oplist, bool *recompile_p, SV *delim)
+{
+ SV **svp;
+ int n = 0;
+ bool use_delim = FALSE;
+ bool alloced = FALSE;
+
+ /* if we know we have at least two args, create an empty string,
+ * then concatenate args to that. For no args, return an empty string */
+ if (!pat && pat_count != 1) {
+ pat = newSVpvn("", 0);
+ SAVEFREESV(pat);
+ alloced = TRUE;
+ }
+
+ for (svp = patternp; svp < patternp + pat_count; svp++) {
+ SV *sv;
+ SV *rx = NULL;
+ STRLEN orig_patlen = 0;
+ bool code = 0;
+ SV *msv = use_delim ? delim : *svp;
+ if (!msv) msv = &PL_sv_undef;
+
+ /* if we've got a delimiter, we go round the loop twice for each
+ * svp slot (except the last), using the delimiter the second
+ * time round */
+ if (use_delim) {
+ svp--;
+ use_delim = FALSE;
+ }
+ else if (delim)
+ use_delim = TRUE;
+
+ if (SvTYPE(msv) == SVt_PVAV) {
+ /* we've encountered an interpolated array within
+ * the pattern, e.g. /...@a..../. Expand the list of elements,
+ * then recursively append elements.
+ * The code in this block is based on S_pushav() */
+
+ AV *const av = (AV*)msv;
+ const SSize_t maxarg = AvFILL(av) + 1;
+ SV **array;
+
+ if (oplist) {
+ assert(oplist->op_type == OP_PADAV
+ || oplist->op_type == OP_RV2AV);
+ oplist = oplist->op_sibling;;
+ }
+
+ if (SvRMAGICAL(av)) {
+ SSize_t i;
+
+ Newx(array, maxarg, SV*);
+ SAVEFREEPV(array);
+ for (i=0; i < maxarg; i++) {
+ SV ** const svp = av_fetch(av, i, FALSE);
+ array[i] = svp ? *svp : &PL_sv_undef;
+ }
+ }
+ else
+ array = AvARRAY(av);
+
+ pat = S_concat_pat(aTHX_ pRExC_state, pat,
+ array, maxarg, NULL, recompile_p,
+ /* $" */
+ GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
+
+ continue;
+ }
+
+
+ /* we make the assumption here that each op in the list of
+ * op_siblings maps to one SV pushed onto the stack,
+ * except for code blocks, with have both an OP_NULL and
+ * and OP_CONST.
+ * This allows us to match up the list of SVs against the
+ * list of OPs to find the next code block.
+ *
+ * Note that PUSHMARK PADSV PADSV ..
+ * is optimised to
+ * PADRANGE PADSV PADSV ..
+ * so the alignment still works. */
+
+ if (oplist) {
+ if (oplist->op_type == OP_NULL
+ && (oplist->op_flags & OPf_SPECIAL))
+ {
+ assert(n < pRExC_state->num_code_blocks);
+ pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
+ pRExC_state->code_blocks[n].block = oplist;
+ pRExC_state->code_blocks[n].src_regex = NULL;
+ n++;
+ code = 1;
+ oplist = oplist->op_sibling; /* skip CONST */
+ assert(oplist);
+ }
+ oplist = oplist->op_sibling;;
+ }
+
+ /* apply magic and QR overloading to arg */
+
+ SvGETMAGIC(msv);
+ if (SvROK(msv) && SvAMAGIC(msv)) {
+ SV *sv = AMG_CALLunary(msv, regexp_amg);
+ if (sv) {
+ if (SvROK(sv))
+ sv = SvRV(sv);
+ if (SvTYPE(sv) != SVt_REGEXP)
+ Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
+ msv = sv;
+ }
+ }
+
+ /* try concatenation overload ... */
+ if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
+ (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
+ {
+ sv_setsv(pat, sv);
+ /* overloading involved: all bets are off over literal
+ * code. Pretend we haven't seen it */
+ pRExC_state->num_code_blocks -= n;
+ n = 0;
+ }
+ else {
+ /* ... or failing that, try "" overload */
+ while (SvAMAGIC(msv)
+ && (sv = AMG_CALLunary(msv, string_amg))
+ && sv != msv
+ && !( SvROK(msv)
+ && SvROK(sv)
+ && SvRV(msv) == SvRV(sv))
+ ) {
+ msv = sv;
+ SvGETMAGIC(msv);
+ }
+ if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
+ msv = SvRV(msv);
+
+ if (pat) {
+ /* this is a partially unrolled
+ * sv_catsv_nomg(pat, msv);
+ * that allows us to adjust code block indices if
+ * needed */
+ STRLEN dlen;
+ char *dst = SvPV_force_nomg(pat, dlen);
+ orig_patlen = dlen;
+ if (SvUTF8(msv) && !SvUTF8(pat)) {
+ S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
+ sv_setpvn(pat, dst, dlen);
+ SvUTF8_on(pat);
+ }
+ sv_catsv_nomg(pat, msv);
+ rx = msv;
+ }
+ else
+ pat = msv;
+
+ if (code)
+ pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
+ }
+
+ /* extract any code blocks within any embedded qr//'s */
+ if (rx && SvTYPE(rx) == SVt_REGEXP
+ && RX_ENGINE((REGEXP*)rx)->op_comp)
+ {
+
+ RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
+ if (ri->num_code_blocks) {
+ int i;
+ /* the presence of an embedded qr// with code means
+ * we should always recompile: the text of the
+ * qr// may not have changed, but it may be a
+ * different closure than last time */
+ *recompile_p = 1;
+ Renew(pRExC_state->code_blocks,
+ pRExC_state->num_code_blocks + ri->num_code_blocks,
+ struct reg_code_block);
+ pRExC_state->num_code_blocks += ri->num_code_blocks;
+
+ for (i=0; i < ri->num_code_blocks; i++) {
+ struct reg_code_block *src, *dst;
+ STRLEN offset = orig_patlen
+ + ReANY((REGEXP *)rx)->pre_prefix;
+ assert(n < pRExC_state->num_code_blocks);
+ src = &ri->code_blocks[i];
+ dst = &pRExC_state->code_blocks[n];
+ dst->start = src->start + offset;
+ dst->end = src->end + offset;
+ dst->block = src->block;
+ dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
+ src->src_regex
+ ? src->src_regex
+ : (REGEXP*)rx);
+ n++;
+ }
+ }
+ }
+ }
+ /* avoid calling magic multiple times on a single element e.g. =~ $qr */
+ if (alloced)
+ SvSETMAGIC(pat);
+
+ return pat;
}
-#endif
-/* public(ish) entry point for the perl core's own regex compiling code.
- * It's actually a wrapper for Perl_re_op_compile that only takes an SV
- * pattern rather than a list of OPs, and uses the internal engine rather
- * than the current one */
-REGEXP *
-Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
-{
- SV *pat = pattern; /* defeat constness! */
- PERL_ARGS_ASSERT_RE_COMPILE;
- return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
-#ifdef PERL_IN_XSUB_RE
- &my_reg_engine,
-#else
- &PL_core_reg_engine,
-#endif
- NULL, NULL, rx_flags, 0);
-}
/* see if there are any run-time code blocks in the pattern.
* False positives are allowed */
static bool
-S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
- U32 pm_flags, char *pat, STRLEN plen)
+S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
+ char *pat, STRLEN plen)
{
int n = 0;
STRLEN s;
- /* avoid infinitely recursing when we recompile the pattern parcelled up
- * as qr'...'. A single constant qr// string can't have have any
- * run-time component in it, and thus, no runtime code. (A non-qr
- * string, however, can, e.g. $x =~ '(?{})') */
- if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
- return 0;
-
for (s = 0; s < plen; s++) {
if (n < pRExC_state->num_code_blocks
&& s == pRExC_state->code_blocks[n].start)
SAVETMPS;
save_re_context();
PUSHSTACKi(PERLSI_REQUIRE);
- /* this causes the toker to collapse \\ into \ when parsing
- * qr''; normally only q'' does this. It also alters hints
- * handling */
- PL_reg_state.re_reparsing = TRUE;
- eval_sv(sv, G_SCALAR);
+ /* G_RE_REPARSING causes the toker to collapse \\ into \ when
+ * parsing qr''; normally only q'' does this. It also alters
+ * hints handling */
+ eval_sv(sv, G_SCALAR|G_RE_REPARSING);
SvREFCNT_dec_NN(sv);
SPAGAIN;
qr_ref = POPs;
{
Safefree(pRExC_state->code_blocks);
/* use croak_sv ? */
- Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
+ Perl_croak_nocontext("%"SVf, SVfARG(errsv));
}
}
assert(SvROK(qr_ref));
STATIC bool
-S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
+S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
+ SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
{
/* This is the common code for setting up the floating and fixed length
- * string data extracted from Perlre_op_compile() below. Returns a boolean
+ * string data extracted from Perl_re_op_compile() below. Returns a boolean
* as to whether succeeded or not */
- I32 t,ml;
+ I32 t;
+ SSize_t ml;
if (! (longest_length
|| (eol /* Can't have SEOL and MULTI */
follow this item. We calculate it ahead of time as once the
lookbehind offset is added in we lose the ability to correctly
calculate it.*/
- ml = minlen ? *(minlen) : (I32)longest_length;
+ ml = minlen ? *(minlen) : (SSize_t)longest_length;
*rx_end_shift = ml - offset
- longest_length + (SvTAIL(sv_longest) != 0)
+ lookbehind;
regexp_internal *ri;
STRLEN plen;
char *exp;
- char* xend;
regnode *scan;
I32 flags;
- I32 minlen = 0;
+ SSize_t minlen = 0;
U32 rx_flags;
- SV *pat = NULL;
+ SV *pat;
SV *code_blocksv = NULL;
+ SV** new_patternp = patternp;
/* these are all flags - maybe they should be turned
* into a single int with different bit masks */
I32 sawlookahead = 0;
I32 sawplus = 0;
I32 sawopen = 0;
+ I32 sawminmod = 0;
+
regex_charset initial_charset = get_regex_charset(orig_rx_flags);
- bool code_is_utf8 = 0;
bool recompile = 0;
bool runtime_code = 0;
scan_data_t data;
* having to test them each time otherwise */
if (! PL_AboveLatin1) {
PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
- PL_ASCII = _new_invlist_C_array(ASCII_invlist);
PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
+ PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
+
+ PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
+ PL_L1Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
+ PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
= _new_invlist_C_array(L1PosixAlnum_invlist);
if (expr && (expr->op_type == OP_LIST ||
(expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
-
- /* is the source UTF8, and how many code blocks are there? */
+ /* allocate code_blocks if needed */
OP *o;
int ncode = 0;
- for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
- if (o->op_type == OP_CONST) {
- /* skip if we have SVs as well as OPs. In this case,
- * a) we decide utf8 based on SVs not OPs;
- * b) the current pad may not match that which the ops
- * were compiled in, so, so on threaded builds,
- * cSVOPo_sv would look in the wrong pad */
- if (!pat_count && SvUTF8(cSVOPo_sv))
- code_is_utf8 = 1;
- }
- else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
- /* count of DO blocks */
- ncode++;
- }
+ for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
+ if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
+ ncode++; /* count of DO blocks */
if (ncode) {
pRExC_state->num_code_blocks = ncode;
Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
}
}
- if (pat_count) {
- /* handle a list of SVs */
-
- SV **svp;
- OP *o = NULL;
- int n = 0;
- STRLEN orig_patlen = 0;
+ if (!pat_count) {
+ /* compile-time pattern with just OP_CONSTs and DO blocks */
- /* apply magic and RE overloading to each arg */
- for (svp = patternp; svp < patternp + pat_count; svp++) {
- SV *rx = *svp;
- SvGETMAGIC(rx);
- if (SvROK(rx) && SvAMAGIC(rx)) {
- SV *sv = AMG_CALLunary(rx, regexp_amg);
- if (sv) {
- if (SvROK(sv))
- sv = SvRV(sv);
- if (SvTYPE(sv) != SVt_REGEXP)
- Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
- *svp = sv;
- }
- }
- }
+ int n;
+ OP *o;
- /* process args, concat them if there are multiple ones,
- * and find any code block indexes */
+ /* find how many CONSTs there are */
+ assert(expr);
+ n = 0;
+ if (expr->op_type == OP_CONST)
+ n = 1;
+ else
+ for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
+ if (o->op_type == OP_CONST)
+ n++;
+ }
- if (pat_count > 1) {
- if (pRExC_state->num_code_blocks) {
- o = cLISTOPx(expr)->op_first;
- assert( o->op_type == OP_PUSHMARK
- || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
- || o->op_type == OP_PADRANGE);
- o = o->op_sibling;
- }
+ /* fake up an SV array */
- pat = newSVpvn("", 0);
- SAVEFREESV(pat);
-
- /* determine if the pattern is going to be utf8 (needed
- * in advance to align code block indices correctly).
- * XXX This could fail to be detected for an arg with
- * overloading but not concat overloading; but the main effect
- * in this obscure case is to need a 'use re eval' for a
- * literal code block */
- for (svp = patternp; svp < patternp + pat_count; svp++) {
- if (SvUTF8(*svp))
- SvUTF8_on(pat);
- }
- }
+ assert(!new_patternp);
+ Newx(new_patternp, n, SV*);
+ SAVEFREEPV(new_patternp);
+ pat_count = n;
- for (svp = patternp; svp < patternp + pat_count; svp++) {
- SV *sv, *msv = *svp;
- SV *rx = NULL;
- bool code = 0;
- /* we make the assumption here that each op in the list of
- * op_siblings maps to one SV pushed onto the stack,
- * except for code blocks, with have both an OP_NULL and
- * and OP_CONST.
- * This allows us to match up the list of SVs against the
- * list of OPs to find the next code block.
- *
- * Note that PUSHMARK PADSV PADSV ..
- * is optimised to
- * PADRANGE NULL NULL ..
- * so the alignment still works. */
- if (o) {
- if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
- assert(n < pRExC_state->num_code_blocks);
- pRExC_state->code_blocks[n].start = SvCUR(pat);
- pRExC_state->code_blocks[n].block = o;
- pRExC_state->code_blocks[n].src_regex = NULL;
- n++;
- code = 1;
- o = o->op_sibling; /* skip CONST */
- assert(o);
- }
- o = o->op_sibling;;
+ n = 0;
+ if (expr->op_type == OP_CONST)
+ new_patternp[n] = cSVOPx_sv(expr);
+ else
+ for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
+ if (o->op_type == OP_CONST)
+ new_patternp[n++] = cSVOPo_sv;
}
- /* try concatenation overload ... */
- if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
- (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
- {
- sv_setsv(pat, sv);
- /* overloading involved: all bets are off over literal
- * code. Pretend we haven't seen it */
- pRExC_state->num_code_blocks -= n;
- n = 0;
- }
- else {
- /* ... or failing that, try "" overload */
- while (SvAMAGIC(msv)
- && (sv = AMG_CALLunary(msv, string_amg))
- && sv != msv
- && !( SvROK(msv)
- && SvROK(sv)
- && SvRV(msv) == SvRV(sv))
- ) {
- msv = sv;
- SvGETMAGIC(msv);
- }
- if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
- msv = SvRV(msv);
- if (pat) {
- orig_patlen = SvCUR(pat);
- sv_catsv_nomg(pat, msv);
- rx = msv;
- }
- else
- pat = msv;
- if (code)
- pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
- }
+ }
- /* extract any code blocks within any embedded qr//'s */
- if (rx && SvTYPE(rx) == SVt_REGEXP
- && RX_ENGINE((REGEXP*)rx)->op_comp)
- {
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "Assembling pattern from %d elements%s\n", pat_count,
+ orig_rx_flags & RXf_SPLIT ? " for split" : ""));
- RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
- if (ri->num_code_blocks) {
- int i;
- /* the presence of an embedded qr// with code means
- * we should always recompile: the text of the
- * qr// may not have changed, but it may be a
- * different closure than last time */
- recompile = 1;
- Renew(pRExC_state->code_blocks,
- pRExC_state->num_code_blocks + ri->num_code_blocks,
- struct reg_code_block);
- pRExC_state->num_code_blocks += ri->num_code_blocks;
- for (i=0; i < ri->num_code_blocks; i++) {
- struct reg_code_block *src, *dst;
- STRLEN offset = orig_patlen
- + ReANY((REGEXP *)rx)->pre_prefix;
- assert(n < pRExC_state->num_code_blocks);
- src = &ri->code_blocks[i];
- dst = &pRExC_state->code_blocks[n];
- dst->start = src->start + offset;
- dst->end = src->end + offset;
- dst->block = src->block;
- dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
- src->src_regex
- ? src->src_regex
- : (REGEXP*)rx);
- n++;
- }
- }
- }
- }
- if (pat_count > 1)
- SvSETMAGIC(pat);
+ /* set expr to the first arg op */
- /* handle bare (possibly after overloading) regex: foo =~ $re */
- {
- SV *re = pat;
- if (SvROK(re))
- re = SvRV(re);
- if (SvTYPE(re) == SVt_REGEXP) {
- if (is_bare_re)
- *is_bare_re = TRUE;
- SvREFCNT_inc(re);
- Safefree(pRExC_state->code_blocks);
- return (REGEXP*)re;
- }
- }
+ if (pRExC_state->num_code_blocks
+ && expr->op_type != OP_CONST)
+ {
+ expr = cLISTOPx(expr)->op_first;
+ assert( expr->op_type == OP_PUSHMARK
+ || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
+ || expr->op_type == OP_PADRANGE);
+ expr = expr->op_sibling;
}
- else {
- /* not a list of SVs, so must be a list of OPs */
- int i = -1;
- bool is_code = 0;
- OP *o;
- OP *ofirst, *olast;
-
- assert(expr);
-
- if (expr->op_type == OP_LIST) {
- ofirst = cLISTOPx(expr)->op_first;
- olast = cLISTOPx(expr)->op_last;
- pat = newSVpvn("", 0);
- SAVEFREESV(pat);
- if (code_is_utf8)
- SvUTF8_on(pat);
- }
- else {
- assert(expr->op_type == OP_CONST);
- ofirst = olast = expr;
- pat = NULL;
- }
- /* given a list of CONSTs and DO blocks in expr, append all
- * the CONSTs to pat, and record the start and end of each
- * code block in code_blocks[] (each DO{} op is followed by an
- * OP_CONST containing the corresponding literal '(?{...})
- * text)
- */
- o = ofirst;
- while (1) {
- if (o->op_type == OP_CONST) {
- if (pat) {
- sv_catsv(pat, cSVOPo_sv);
- if (is_code) {
- pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
- is_code = 0;
- }
- }
- else {
- pat = cSVOPx_sv(expr);
- }
- }
- else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
- assert(i+1 < pRExC_state->num_code_blocks);
- pRExC_state->code_blocks[++i].start = SvCUR(pat);
- pRExC_state->code_blocks[i].block = o;
- pRExC_state->code_blocks[i].src_regex = NULL;
- is_code = 1;
- }
- if (o == olast)
- break;
- o = o->op_sibling;
- }
+ pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
+ expr, &recompile, NULL);
+
+ /* handle bare (possibly after overloading) regex: foo =~ $re */
+ {
+ SV *re = pat;
+ if (SvROK(re))
+ re = SvRV(re);
+ if (SvTYPE(re) == SVt_REGEXP) {
+ if (is_bare_re)
+ *is_bare_re = TRUE;
+ SvREFCNT_inc(re);
+ Safefree(pRExC_state->code_blocks);
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "Precompiled pattern%s\n",
+ orig_rx_flags & RXf_SPLIT ? " for split" : ""));
+
+ return (REGEXP*)re;
+ }
}
exp = SvPV_nomg(pat, plen);
- xend = exp + plen;
if (!eng->op_comp) {
if ((SvUTF8(pat) && IN_BYTES)
RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
RExC_uni_semantics = 0;
RExC_contains_locale = 0;
+ RExC_contains_i = 0;
pRExC_state->runtime_code_qr = NULL;
DEBUG_COMPILE_r({
PL_colors[4],PL_colors[5],s);
});
- if (0) {
- redo_first_pass:
- {
- U8 *const src = (U8*)exp;
- U8 *dst;
- int n=0;
- STRLEN s = 0, d = 0;
- bool do_end = 0;
-
- /* It's possible to write a regexp in ascii that represents Unicode
- codepoints outside of the byte range, such as via \x{100}. If we
- detect such a sequence we have to convert the entire pattern to utf8
- and then recompile, as our sizing calculation will have been based
- on 1 byte == 1 character, but we will need to use utf8 to encode
- at least some part of the pattern, and therefore must convert the whole
- thing.
- -- dmq */
- DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
- "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
-
- /* upgrade pattern to UTF8, and if there are code blocks,
- * recalculate the indices.
- * This is essentially an unrolled Perl_bytes_to_utf8() */
+ redo_first_pass:
+ /* we jump here if we upgrade the pattern to utf8 and have to
+ * recompile */
- Newx(dst, plen * 2 + 1, U8);
-
- while (s < plen) {
- const UV uv = NATIVE_TO_ASCII(src[s]);
- if (UNI_IS_INVARIANT(uv))
- dst[d] = (U8)UTF_TO_NATIVE(uv);
- else {
- dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
- dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
- }
- if (n < pRExC_state->num_code_blocks) {
- if (!do_end && pRExC_state->code_blocks[n].start == s) {
- pRExC_state->code_blocks[n].start = d;
- assert(dst[d] == '(');
- do_end = 1;
- }
- else if (do_end && pRExC_state->code_blocks[n].end == s) {
- pRExC_state->code_blocks[n].end = d;
- assert(dst[d] == ')');
- do_end = 0;
- n++;
- }
- }
- s++;
- d++;
- }
- dst[d] = '\0';
- plen = d;
- exp = (char*) dst;
- xend = exp + plen;
- SAVEFREEPV(exp);
- RExC_orig_utf8 = RExC_utf8 = 1;
- }
- }
+ if ((pm_flags & PMf_USE_RE_EVAL)
+ /* this second condition covers the non-regex literal case,
+ * i.e. $foo =~ '(?{})'. */
+ || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
+ )
+ runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
/* return old regex if pattern hasn't changed */
/* XXX: note in the below we have to check the flags as well as the pattern.
&& ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
&& RX_PRECOMP(old_re)
&& RX_PRELEN(old_re) == plen
- && memEQ(RX_PRECOMP(old_re), exp, plen))
+ && memEQ(RX_PRECOMP(old_re), exp, plen)
+ && !runtime_code /* with runtime code, always recompile */ )
{
- /* with runtime code, always recompile */
- runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
- exp, plen);
- if (!runtime_code) {
- Safefree(pRExC_state->code_blocks);
- return old_re;
- }
+ Safefree(pRExC_state->code_blocks);
+ return old_re;
}
- else if ((pm_flags & PMf_USE_RE_EVAL)
- /* this second condition covers the non-regex literal case,
- * i.e. $foo =~ '(?{})'. */
- || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
- && (PL_hints & HINT_RE_EVAL))
- )
- runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
- exp, plen);
rx_flags = orig_rx_flags;
+ if (rx_flags & PMf_FOLD) {
+ RExC_contains_i = 1;
+ }
if (initial_charset == REGEX_LOCALE_CHARSET) {
RExC_contains_locale = 1;
}
if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
/* whoops, we have a non-utf8 pattern, whilst run-time code
* got compiled as utf8. Try again with a utf8 pattern */
+ S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
+ pRExC_state->num_code_blocks);
goto redo_first_pass;
}
}
/* First pass: determine size, legality. */
RExC_parse = exp;
RExC_start = exp;
- RExC_end = xend;
+ RExC_end = exp + plen;
RExC_naughty = 0;
RExC_npar = 1;
RExC_nestroot = 0;
RExC_size = 0L;
- RExC_emit = &PL_regdummy;
+ RExC_emit = (regnode *) &RExC_emit_dummy;
RExC_whilem_seen = 0;
RExC_open_parens = NULL;
RExC_close_parens = NULL;
SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
}
if (reg(pRExC_state, 0, &flags,1) == NULL) {
+ /* It's possible to write a regexp in ascii that represents Unicode
+ codepoints outside of the byte range, such as via \x{100}. If we
+ detect such a sequence we have to convert the entire pattern to utf8
+ and then recompile, as our sizing calculation will have been based
+ on 1 byte == 1 character, but we will need to use utf8 to encode
+ at least some part of the pattern, and therefore must convert the whole
+ thing.
+ -- dmq */
if (flags & RESTART_UTF8) {
+ S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
+ pRExC_state->num_code_blocks);
goto redo_first_pass;
}
- Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#X", flags);
+ Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
}
if (code_blocksv)
SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
RExC_flags = rx_flags; /* don't let top level (?i) bleed */
RExC_pm_flags = pm_flags;
RExC_parse = exp;
- RExC_end = xend;
+ RExC_end = exp + plen;
RExC_naughty = 0;
RExC_npar = 1;
RExC_emit_start = ri->program;
REGC((U8)REG_MAGIC, (char*) RExC_emit++);
if (reg(pRExC_state, 0, &flags,1) == NULL) {
ReREFCNT_dec(rx);
- Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#X", flags);
+ Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
}
/* XXXX To minimize changes to RE engine we always allocate
3-units-long substrs field. */
}
reStudy:
- r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
+ r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
Zero(r->substrs, 1, struct reg_substr_data);
#ifdef TRIE_STUDY_OPT
/* testing for BRANCH here tells us whether there is "must appear"
data in the pattern. If there is then we can use it for optimisations */
if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
- I32 fake;
+ SSize_t fake;
STRLEN longest_float_length, longest_fixed_length;
- struct regnode_charclass_class ch_class; /* pointed to by data */
+ regnode_ssc ch_class; /* pointed to by data */
int stclass_flag;
- I32 last_close = 0; /* pointed to by data */
+ SSize_t last_close = 0; /* pointed to by data */
regnode *first= scan;
regnode *first_next= regnext(first);
/*
* the only op that could be a regnode is PLUS, all the rest
* will be regnode_1 or regnode_2.
*
+ * (yves doesn't think this is true)
*/
if (OP(first) == PLUS)
sawplus = 1;
- else
+ else {
+ if (OP(first) == MINMOD)
+ sawminmod = 1;
first += regarglen[OP(first)];
-
+ }
first = NEXTOPER(first);
first_next= regnext(first);
}
first = NEXTOPER(first);
goto again;
}
- if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
+ if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
&& !pRExC_state->num_code_blocks) /* May examine pos and $& */
/* x+ must match at the 1st pos of run of x's */
r->intflags |= PREGf_SKIP;
SAVEFREESV(data.last_found);
first = scan;
if (!ri->regstclass) {
- cl_init(pRExC_state, &ch_class);
+ ssc_init(pRExC_state, &ch_class);
data.start_class = &ch_class;
stclass_flag = SCF_DO_STCLASS_AND;
} else /* XXXX Check for BOUND? */
minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
&data, -1, NULL, NULL,
- SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
+ SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
+ | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
+ 0);
CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
&& data.last_start_min == 0 && data.last_end > 0
&& !RExC_seen_zerolen
&& !(RExC_seen & REG_SEEN_VERBARG)
- && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
+ && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
r->extflags |= RXf_CHECK_ALL;
scan_commit(pRExC_state, &data,&minlen,0);
{
r->float_min_offset = data.offset_float_min - data.lookbehind_float;
r->float_max_offset = data.offset_float_max;
- if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
+ if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
r->float_max_offset -= data.lookbehind_float;
SvREFCNT_inc_simple_void_NN(data.longest_float);
}
if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
&& stclass_flag
- && ! TEST_SSC_EOS(data.start_class)
- && !cl_is_anything(data.start_class))
+ && ! ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING
+ && !ssc_is_anything(data.start_class))
{
- const U32 n = add_data(pRExC_state, 1, "f");
- OP(data.start_class) = ANYOF_SYNTHETIC;
+ const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
- Newx(RExC_rxi->data->data[n], 1,
- struct regnode_charclass_class);
+ ssc_finalize(pRExC_state, data.start_class);
+
+ Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
StructCopy(data.start_class,
- (struct regnode_charclass_class*)RExC_rxi->data->data[n],
- struct regnode_charclass_class);
+ (regnode_ssc*)RExC_rxi->data->data[n],
+ regnode_ssc);
ri->regstclass = (regnode*)RExC_rxi->data->data[n];
r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
PerlIO_printf(Perl_debug_log,
"synthetic stclass \"%s\".\n",
SvPVX_const(sv));});
+ data.start_class = NULL;
}
/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
r->check_offset_min = r->float_min_offset;
r->check_offset_max = r->float_max_offset;
}
- /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
- This should be changed ASAP! */
- if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
+ if ((r->check_substr || r->check_utf8) ) {
r->extflags |= RXf_USE_INTUIT;
if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
r->extflags |= RXf_INTUIT_TAIL;
}
else {
/* Several toplevels. Best we can is to set minlen. */
- I32 fake;
- struct regnode_charclass_class ch_class;
- I32 last_close = 0;
+ SSize_t fake;
+ regnode_ssc ch_class;
+ SSize_t last_close = 0;
DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
scan = ri->program + 1;
- cl_init(pRExC_state, &ch_class);
+ ssc_init(pRExC_state, &ch_class);
data.start_class = &ch_class;
data.last_closep = &last_close;
minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
- &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
+ &data, -1, NULL, NULL,
+ SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
+ |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
+ 0);
CHECK_RESTUDY_GOTO_butfirst(NOOP);
r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
= r->float_substr = r->float_utf8 = NULL;
- if (! TEST_SSC_EOS(data.start_class)
- && !cl_is_anything(data.start_class))
+ if (! ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING
+ && !ssc_is_anything(data.start_class))
{
- const U32 n = add_data(pRExC_state, 1, "f");
- OP(data.start_class) = ANYOF_SYNTHETIC;
+ const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
+
+ ssc_finalize(pRExC_state, data.start_class);
- Newx(RExC_rxi->data->data[n], 1,
- struct regnode_charclass_class);
+ Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
StructCopy(data.start_class,
- (struct regnode_charclass_class*)RExC_rxi->data->data[n],
- struct regnode_charclass_class);
+ (regnode_ssc*)RExC_rxi->data->data[n],
+ regnode_ssc);
ri->regstclass = (regnode*)RExC_rxi->data->data[n];
r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
PerlIO_printf(Perl_debug_log,
"synthetic stclass \"%s\".\n",
SvPVX_const(sv));});
+ data.start_class = NULL;
}
}
}
#ifdef DEBUGGING
if (RExC_paren_names) {
- ri->name_list_idx = add_data( pRExC_state, 1, "a" );
+ ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
} else
#endif
});
#ifdef RE_TRACK_PATTERN_OFFSETS
DEBUG_OFFSETS_r(if (ri->u.offsets) {
- const U32 len = ri->u.offsets[0];
- U32 i;
+ const STRLEN len = ri->u.offsets[0];
+ STRLEN i;
GET_RE_DEBUG_FLAGS_DECL;
PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
for (i = 1; i <= len; i++) {
{
SV *ret;
AV *av;
- I32 length;
+ SSize_t length;
struct regexp *const rx = ReANY(r);
PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
{
struct regexp *const rx = ReANY(r);
char *s = NULL;
- I32 i = 0;
- I32 s1, t1;
+ SSize_t i = 0;
+ SSize_t s1, t1;
I32 n = paren;
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
- if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
+ if ( n == RX_BUFF_IDX_CARET_PREMATCH
|| n == RX_BUFF_IDX_CARET_FULLMATCH
|| n == RX_BUFF_IDX_CARET_POSTMATCH
- )
- && !(rx->extflags & RXf_PMf_KEEPCOPY)
- )
- goto ret_undef;
+ )
+ {
+ bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
+ if (!keepcopy) {
+ /* on something like
+ * $r = qr/.../;
+ * /$qr/p;
+ * the KEEPCOPY is set on the PMOP rather than the regex */
+ if (PL_curpm && r == PM_GETRE(PL_curpm))
+ keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
+ }
+ if (!keepcopy)
+ goto ret_undef;
+ }
if (!rx->subbeg)
goto ret_undef;
}
assert(s >= rx->subbeg);
- assert(rx->sublen >= (s - rx->subbeg) + i );
+ assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
if (i >= 0) {
#if NO_TAINT_SUPPORT
sv_setpvn(sv, s, i);
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
+ if ( paren == RX_BUFF_IDX_CARET_PREMATCH
+ || paren == RX_BUFF_IDX_CARET_FULLMATCH
+ || paren == RX_BUFF_IDX_CARET_POSTMATCH
+ )
+ {
+ bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
+ if (!keepcopy) {
+ /* on something like
+ * $r = qr/.../;
+ * /$qr/p;
+ * the KEEPCOPY is set on the PMOP rather than the regex */
+ if (PL_curpm && r == PM_GETRE(PL_curpm))
+ keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
+ }
+ if (!keepcopy)
+ goto warn_undef;
+ }
+
/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
switch (paren) {
case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
- if (!(rx->extflags & RXf_PMf_KEEPCOPY))
- goto warn_undef;
- /*FALLTHROUGH*/
-
case RX_BUFF_IDX_PREMATCH: /* $` */
if (rx->offs[0].start != -1) {
i = rx->offs[0].start;
return 0;
case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
- if (!(rx->extflags & RXf_PMf_KEEPCOPY))
- goto warn_undef;
case RX_BUFF_IDX_POSTMATCH: /* $' */
if (rx->offs[0].end != -1) {
i = rx->sublen - rx->offs[0].end;
}
return 0;
- case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
- if (!(rx->extflags & RXf_PMf_KEEPCOPY))
- goto warn_undef;
- /*FALLTHROUGH*/
-
- /* $& / ${^MATCH}, $1, $2, ... */
- default:
+ default: /* $& / ${^MATCH}, $1, $2, ... */
if (paren <= (I32)rx->nparens &&
(s1 = rx->offs[paren].start) != -1 &&
(t1 = rx->offs[paren].end) != -1)
/* This section of code defines the inversion list object and its methods. The
* interfaces are highly subject to change, so as much as possible is static to
* this file. An inversion list is here implemented as a malloc'd C UV array
- * with some added info that is placed as UVs at the beginning in a header
- * portion. An inversion list for Unicode is an array of code points, sorted
- * by ordinal number. The zeroth element is the first code point in the list.
- * The 1th element is the first element beyond that not in the list. In other
- * words, the first range is
+ * as an SVt_INVLIST scalar.
+ *
+ * An inversion list for Unicode is an array of code points, sorted by ordinal
+ * number. The zeroth element is the first code point in the list. The 1th
+ * element is the first element beyond that not in the list. In other words,
+ * the first range is
* invlist[0]..(invlist[1]-1)
* The other ranges follow. Thus every element whose index is divisible by two
* marks the beginning of a range that is in the list, and every element not
* Taking the complement (inverting) an inversion list is quite simple, if the
* first element is 0, remove it; otherwise add a 0 element at the beginning.
* This implementation reserves an element at the beginning of each inversion
- * list to contain 0 when the list contains 0, and contains 1 otherwise. The
- * actual beginning of the list is either that element if 0, or the next one if
- * 1.
+ * list to always contain 0; there is an additional flag in the header which
+ * indicates if the list begins at the 0, or is offset to begin at the next
+ * element.
*
* More about inversion lists can be found in "Unicode Demystified"
* Chapter 13 by Richard Gillam, published by Addison-Wesley.
* should eventually be made public */
/* The header definitions are in F<inline_invlist.c> */
-#define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
-#define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH)
-
-#define INVLIST_INITIAL_LEN 10
PERL_STATIC_INLINE UV*
S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
{
/* Returns a pointer to the first element in the inversion list's array.
* This is called upon initialization of an inversion list. Where the
- * array begins depends on whether the list has the code point U+0000
- * in it or not. The other parameter tells it whether the code that
- * follows this call is about to put a 0 in the inversion list or not.
- * The first element is either the element with 0, if 0, or the next one,
- * if 1 */
+ * array begins depends on whether the list has the code point U+0000 in it
+ * or not. The other parameter tells it whether the code that follows this
+ * call is about to put a 0 in the inversion list or not. The first
+ * element is either the element reserved for 0, if TRUE, or the element
+ * after it, if FALSE */
- UV* zero = get_invlist_zero_addr(invlist);
+ bool* offset = get_invlist_offset_addr(invlist);
+ UV* zero_addr = (UV *) SvPVX(invlist);
PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
/* Must be empty */
- assert(! *_get_invlist_len_addr(invlist));
+ assert(! _invlist_len(invlist));
+
+ *zero_addr = 0;
/* 1^1 = 0; 1^0 = 1 */
- *zero = 1 ^ will_have_0;
- return zero + *zero;
+ *offset = 1 ^ will_have_0;
+ return zero_addr + *offset;
}
PERL_STATIC_INLINE UV*
/* Must not be empty. If these fail, you probably didn't check for <len>
* being non-zero before trying to get the array */
- assert(*_get_invlist_len_addr(invlist));
- assert(*get_invlist_zero_addr(invlist) == 0
- || *get_invlist_zero_addr(invlist) == 1);
-
- /* The array begins either at the element reserved for zero if the
- * list contains 0 (that element will be set to 0), or otherwise the next
- * element (in which case the reserved element will be set to 1). */
- return (UV *) (get_invlist_zero_addr(invlist)
- + *get_invlist_zero_addr(invlist));
+ assert(_invlist_len(invlist));
+
+ /* The very first element always contains zero, The array begins either
+ * there, or if the inversion list is offset, at the element after it.
+ * The offset header field determines which; it contains 0 or 1 to indicate
+ * how much additionally to add */
+ assert(0 == *(SvPVX(invlist)));
+ return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
}
PERL_STATIC_INLINE void
-S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
+S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
{
- /* Sets the current number of elements stored in the inversion list */
+ /* Sets the current number of elements stored in the inversion list.
+ * Updates SvCUR correspondingly */
PERL_ARGS_ASSERT_INVLIST_SET_LEN;
- *_get_invlist_len_addr(invlist) = len;
-
- assert(len <= SvLEN(invlist));
-
- SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
- /* If the list contains U+0000, that element is part of the header,
- * and should not be counted as part of the array. It will contain
- * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
- * subtract:
- * SvCUR_set(invlist,
- * TO_INTERNAL_SIZE(len
- * - (*get_invlist_zero_addr(inv_list) ^ 1)));
- * But, this is only valid if len is not 0. The consequences of not doing
- * this is that the memory allocation code may think that 1 more UV is
- * being used than actually is, and so might do an unnecessary grow. That
- * seems worth not bothering to make this the precise amount.
- *
- * Note that when inverting, SvCUR shouldn't change */
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
+ SvCUR_set(invlist,
+ (len == 0)
+ ? 0
+ : TO_INTERNAL_SIZE(len + offset));
+ assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
}
PERL_STATIC_INLINE IV*
S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
{
- /* Return the address of the UV that is reserved to hold the cached index
+ /* Return the address of the IV that is reserved to hold the cached index
* */
PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
- return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
+ return &(((XINVLIST*) SvANY(invlist))->prev_index);
}
PERL_STATIC_INLINE IV
PERL_ARGS_ASSERT_INVLIST_MAX;
- return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
- ? _invlist_len(invlist)
- : FROM_INTERNAL_SIZE(SvLEN(invlist));
-}
-
-PERL_STATIC_INLINE UV*
-S_get_invlist_zero_addr(pTHX_ SV* invlist)
-{
- /* Return the address of the UV that is reserved to hold 0 if the inversion
- * list contains 0. This has to be the last element of the heading, as the
- * list proper starts with either it if 0, or the next element if not.
- * (But we force it to contain either 0 or 1) */
+ assert(SvTYPE(invlist) == SVt_INVLIST);
- PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
-
- return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
+ /* Assumes worst case, in which the 0 element is not counted in the
+ * inversion list, so subtracts 1 for that */
+ return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
+ ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
+ : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
}
#ifndef PERL_IN_XSUB_RE
SV* new_list;
if (initial_size < 0) {
- initial_size = INVLIST_INITIAL_LEN;
+ initial_size = 10;
}
/* Allocate the initial space */
- new_list = newSV(TO_INTERNAL_SIZE(initial_size));
- invlist_set_len(new_list, 0);
+ new_list = newSV_type(SVt_INVLIST);
- /* Force iterinit() to be used to get iteration to work */
- *get_invlist_iter_addr(new_list) = UV_MAX;
+ /* First 1 is in case the zero element isn't in the list; second 1 is for
+ * trailing NUL */
+ SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
+ invlist_set_len(new_list, 0, 0);
- /* This should force a segfault if a method doesn't initialize this
- * properly */
- *get_invlist_zero_addr(new_list) = UV_MAX;
+ /* Force iterinit() to be used to get iteration to work */
+ *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
*get_invlist_previous_index_addr(new_list) = 0;
- *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
-#if HEADER_LENGTH != 5
-# error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
-#endif
return new_list;
}
#endif
STATIC SV*
-S__new_invlist_C_array(pTHX_ UV* list)
+S__new_invlist_C_array(pTHX_ const UV* const list)
{
/* Return a pointer to a newly constructed inversion list, initialized to
* point to <list>, which has to be in the exact correct inversion list
* form, including internal fields. Thus this is a dangerous routine that
- * should not be used in the wrong hands */
+ * should not be used in the wrong hands. The passed in 'list' contains
+ * several header fields at the beginning that are not part of the
+ * inversion list body proper */
+
+ const STRLEN length = (STRLEN) list[0];
+ const UV version_id = list[1];
+ const bool offset = cBOOL(list[2]);
+#define HEADER_LENGTH 3
+ /* If any of the above changes in any way, you must change HEADER_LENGTH
+ * (if appropriate) and regenerate INVLIST_VERSION_ID by running
+ * perl -E 'say int(rand 2**31-1)'
+ */
+#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
+ data structure type, so that one being
+ passed in can be validated to be an
+ inversion list of the correct vintage.
+ */
- SV* invlist = newSV_type(SVt_PV);
+ SV* invlist = newSV_type(SVt_INVLIST);
PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
- SvPV_set(invlist, (char *) list);
+ if (version_id != INVLIST_VERSION_ID) {
+ Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
+ }
+
+ /* The generated array passed in includes header elements that aren't part
+ * of the list proper, so start it just after them */
+ SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
+
SvLEN_set(invlist, 0); /* Means we own the contents, and the system
shouldn't touch it */
- SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
- if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
- Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
- }
+ *(get_invlist_offset_addr(invlist)) = offset;
+
+ /* The 'length' passed to us is the physical number of elements in the
+ * inversion list. But if there is an offset the logical number is one
+ * less than that */
+ invlist_set_len(invlist, length - offset, offset);
+
+ invlist_set_previous_index(invlist, 0);
- /* Initialize the iteration pointer.
- * XXX This could be done at compile time in charclass_invlists.h, but I
- * (khw) am not confident that the suffixes for specifying the C constant
- * UV_MAX are portable, e.g. 'ull' on a 32 bit machine that is configured
- * to use 64 bits; might need a Configure probe */
+ /* Initialize the iteration pointer. */
invlist_iterfinish(invlist);
return invlist;
PERL_ARGS_ASSERT_INVLIST_EXTEND;
- SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
+ /* Add one to account for the zero element at the beginning which may not
+ * be counted by the calling parameters */
+ SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
}
PERL_STATIC_INLINE void
{
PERL_ARGS_ASSERT_INVLIST_TRIM;
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
/* Change the length of the inversion list to how many entries it currently
* has */
-
SvPV_shrink_to_cur((SV *) invlist);
}
-#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
-
STATIC void
S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
{
UV* array;
UV max = invlist_max(invlist);
UV len = _invlist_len(invlist);
+ bool offset;
PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
if (len == 0) { /* Empty lists must be initialized */
- array = _invlist_array_init(invlist, start == 0);
+ offset = start != 0;
+ array = _invlist_array_init(invlist, ! offset);
}
else {
/* Here, the existing list is non-empty. The current max entry in the
* value not in the set, it is extending the set, so the new first
* value not in the set is one greater than the newly extended range.
* */
+ offset = *get_invlist_offset_addr(invlist);
if (array[final_element] == start) {
if (end != UV_MAX) {
array[final_element] = end + 1;
else {
/* But if the end is the maximum representable on the machine,
* just let the range that this would extend to have no end */
- invlist_set_len(invlist, len - 1);
+ invlist_set_len(invlist, len - 1, offset);
}
return;
}
len += 2; /* Includes an element each for the start and end of range */
- /* If overflows the existing space, extend, which may cause the array to be
- * moved */
+ /* If wll overflow the existing space, extend, which may cause the array to
+ * be moved */
if (max < len) {
invlist_extend(invlist, len);
- invlist_set_len(invlist, len); /* Have to set len here to avoid assert
- failure in invlist_array() */
+
+ /* Have to set len here to avoid assert failure in invlist_array() */
+ invlist_set_len(invlist, len, offset);
+
array = invlist_array(invlist);
}
else {
- invlist_set_len(invlist, len);
+ invlist_set_len(invlist, len, offset);
}
/* The next item on the list starts the range, the one after that is
else {
/* But if the end is the maximum representable on the machine, just let
* the range have no end */
- invlist_set_len(invlist, len - 1);
+ invlist_set_len(invlist, len - 1, offset);
}
}
}
void
-Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
+Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
{
/* Take the union of two inversion lists and point <output> to it. *output
* SHOULD BE DEFINED upon input, and if it points to one of the two lists,
- * the reference count to that list will be decremented. The first list,
- * <a>, may be NULL, in which case a copy of the second list is returned.
- * If <complement_b> is TRUE, the union is taken of the complement
- * (inversion) of <b> instead of b itself.
+ * the reference count to that list will be decremented if not already a
+ * temporary (mortal); otherwise *output will be made correspondingly
+ * mortal. The first list, <a>, may be NULL, in which case a copy of the
+ * second list is returned. If <complement_b> is TRUE, the union is taken
+ * of the complement (inversion) of <b> instead of b itself.
*
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
* return the larger of the input lists, but then outside code might need
* to keep track of whether to free the input list or not */
- UV* array_a; /* a's array */
- UV* array_b;
+ const UV* array_a; /* a's array */
+ const UV* array_b;
UV len_a; /* length of a's array */
UV len_b;
/* If either one is empty, the union is the other one */
if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
+ bool make_temp = FALSE; /* Should we mortalize the result? */
+
if (*output == a) {
if (a != NULL) {
- SvREFCNT_dec_NN(a);
+ if (! (make_temp = SvTEMP(a))) {
+ SvREFCNT_dec_NN(a);
+ }
}
}
if (*output != b) {
_invlist_invert(*output);
}
} /* else *output already = b; */
+
+ if (make_temp) {
+ sv_2mortal(*output);
+ }
return;
}
else if ((len_b = _invlist_len(b)) == 0) {
+ bool make_temp = FALSE;
if (*output == b) {
- SvREFCNT_dec_NN(b);
+ if (! (make_temp = SvTEMP(b))) {
+ SvREFCNT_dec_NN(b);
+ }
}
/* The complement of an empty list is a list that has everything in it,
* so the union with <a> includes everything too */
if (complement_b) {
if (a == *output) {
- SvREFCNT_dec_NN(a);
+ if (! (make_temp = SvTEMP(a))) {
+ SvREFCNT_dec_NN(a);
+ }
}
*output = _new_invlist(1);
_append_range_to_invlist(*output, 0, UV_MAX);
*output = invlist_clone(a);
}
/* else *output already = a; */
+
+ if (make_temp) {
+ sv_2mortal(*output);
+ }
return;
}
if (complement_b) {
/* To complement, we invert: if the first element is 0, remove it. To
- * do this, we just pretend the array starts one later, and clear the
- * flag as we don't have to do anything else later */
+ * do this, we just pretend the array starts one later */
if (array_b[0] == 0) {
array_b++;
len_b--;
- complement_b = FALSE;
}
else {
- /* But if the first element is not zero, we unshift a 0 before the
- * array. The data structure reserves a space for that 0 (which
- * should be a '1' right now), so physical shifting is unneeded,
- * but temporarily change that element to 0. Before exiting the
- * routine, we must restore the element to '1' */
+ /* But if the first element is not zero, we pretend the list starts
+ * at the 0 that is always stored immediately before the array. */
array_b--;
len_b++;
- array_b[0] = 0;
}
}
/* Set result to final length, which can change the pointer to array_u, so
* re-find it */
if (len_u != _invlist_len(u)) {
- invlist_set_len(u, len_u);
+ invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
invlist_trim(u);
array_u = invlist_array(u);
}
}
}
- /* If we've changed b, restore it */
- if (complement_b) {
- array_b[0] = 1;
- }
-
- /* We may be removing a reference to one of the inputs */
+ /* We may be removing a reference to one of the inputs. If so, the output
+ * is made mortal if the input was. (Mortal SVs shouldn't have their ref
+ * count decremented) */
if (a == *output || b == *output) {
assert(! invlist_is_iterating(*output));
- SvREFCNT_dec_NN(*output);
+ if ((SvTEMP(*output))) {
+ sv_2mortal(u);
+ }
+ else {
+ SvREFCNT_dec_NN(*output);
+ }
}
*output = u;
+
return;
}
void
-Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
+Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
{
/* Take the intersection of two inversion lists and point <i> to it. *i
* SHOULD BE DEFINED upon input, and if it points to one of the two lists,
- * the reference count to that list will be decremented.
- * If <complement_b> is TRUE, the result will be the intersection of <a>
- * and the complement (or inversion) of <b> instead of <b> directly.
+ * the reference count to that list will be decremented if not already a
+ * temporary (mortal); otherwise *i will be made correspondingly mortal.
+ * The first list, <a>, may be NULL, in which case an empty list is
+ * returned. If <complement_b> is TRUE, the result will be the
+ * intersection of <a> and the complement (or inversion) of <b> instead of
+ * <b> directly.
*
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
* union above
*/
- UV* array_a; /* a's array */
- UV* array_b;
+ const UV* array_a; /* a's array */
+ const UV* array_b;
UV len_a; /* length of a's array */
UV len_b;
assert(a != b);
/* Special case if either one is empty */
- len_a = _invlist_len(a);
+ len_a = (a == NULL) ? 0 : _invlist_len(a);
if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
+ bool make_temp = FALSE;
if (len_a != 0 && complement_b) {
* must be every possible code point. Thus the intersection is
* simply 'a'. */
if (*i != a) {
- *i = invlist_clone(a);
-
if (*i == b) {
- SvREFCNT_dec_NN(b);
+ if (! (make_temp = SvTEMP(b))) {
+ SvREFCNT_dec_NN(b);
+ }
}
+
+ *i = invlist_clone(a);
}
/* else *i is already 'a' */
+
+ if (make_temp) {
+ sv_2mortal(*i);
+ }
return;
}
/* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
* intersection must be empty */
if (*i == a) {
- SvREFCNT_dec_NN(a);
+ if (! (make_temp = SvTEMP(a))) {
+ SvREFCNT_dec_NN(a);
+ }
}
else if (*i == b) {
- SvREFCNT_dec_NN(b);
+ if (! (make_temp = SvTEMP(b))) {
+ SvREFCNT_dec_NN(b);
+ }
}
*i = _new_invlist(0);
+ if (make_temp) {
+ sv_2mortal(*i);
+ }
+
return;
}
if (complement_b) {
/* To complement, we invert: if the first element is 0, remove it. To
- * do this, we just pretend the array starts one later, and clear the
- * flag as we don't have to do anything else later */
+ * do this, we just pretend the array starts one later */
if (array_b[0] == 0) {
array_b++;
len_b--;
- complement_b = FALSE;
}
else {
- /* But if the first element is not zero, we unshift a 0 before the
- * array. The data structure reserves a space for that 0 (which
- * should be a '1' right now), so physical shifting is unneeded,
- * but temporarily change that element to 0. Before exiting the
- * routine, we must restore the element to '1' */
+ /* But if the first element is not zero, we pretend the list starts
+ * at the 0 that is always stored immediately before the array. */
array_b--;
len_b++;
- array_b[0] = 0;
}
}
/* Set result to final length, which can change the pointer to array_r, so
* re-find it */
if (len_r != _invlist_len(r)) {
- invlist_set_len(r, len_r);
+ invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
invlist_trim(r);
array_r = invlist_array(r);
}
}
}
- /* If we've changed b, restore it */
- if (complement_b) {
- array_b[0] = 1;
- }
-
- /* We may be removing a reference to one of the inputs */
+ /* We may be removing a reference to one of the inputs. If so, the output
+ * is made mortal if the input was. (Mortal SVs shouldn't have their ref
+ * count decremented) */
if (a == *i || b == *i) {
assert(! invlist_is_iterating(*i));
- SvREFCNT_dec_NN(*i);
+ if (SvTEMP(*i)) {
+ sv_2mortal(r);
+ }
+ else {
+ SvREFCNT_dec_NN(*i);
+ }
}
*i = r;
+
return;
}
* have a zero; removes it otherwise. As described above, the data
* structure is set up so that this is very efficient */
- UV* len_pos = _get_invlist_len_addr(invlist);
-
PERL_ARGS_ASSERT__INVLIST_INVERT;
assert(! invlist_is_iterating(invlist));
/* The inverse of matching nothing is matching everything */
- if (*len_pos == 0) {
+ if (_invlist_len(invlist) == 0) {
_append_range_to_invlist(invlist, 0, UV_MAX);
return;
}
- /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
- * zero element was a 0, so it is being removed, so the length decrements
- * by 1; and vice-versa. SvCUR is unaffected */
- if (*get_invlist_zero_addr(invlist) ^= 1) {
- (*len_pos)--;
- }
- else {
- (*len_pos)++;
- }
+ *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
}
void
invlist_extend(invlist, len);
array = invlist_array(invlist);
}
- invlist_set_len(invlist, len);
+ invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
array[len - 1] = PERL_UNICODE_MAX + 1;
}
else { /* Remove the 0x110000 */
- invlist_set_len(invlist, len - 1);
+ invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
}
}
{
/* Return a new inversion list that is a copy of the input one, which is
- * unchanged */
+ * unchanged. The new list will not be mortal even if the old one was. */
/* Need to allocate extra space to accommodate Perl's addition of a
* trailing NUL to SvPV's, since it thinks they are always strings */
SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
- STRLEN length = SvCUR(invlist);
+ STRLEN physical_length = SvCUR(invlist);
+ bool offset = *(get_invlist_offset_addr(invlist));
PERL_ARGS_ASSERT_INVLIST_CLONE;
- SvCUR_set(new_invlist, length); /* This isn't done automatically */
- Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
+ *(get_invlist_offset_addr(new_invlist)) = offset;
+ invlist_set_len(new_invlist, _invlist_len(invlist), offset);
+ Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
return new_invlist;
}
-PERL_STATIC_INLINE UV*
+PERL_STATIC_INLINE STRLEN*
S_get_invlist_iter_addr(pTHX_ SV* invlist)
{
/* Return the address of the UV that contains the current iteration
PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
- return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
-}
-
-PERL_STATIC_INLINE UV*
-S_get_invlist_version_id_addr(pTHX_ SV* invlist)
-{
- /* Return the address of the UV that contains the version id. */
-
- PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
+ assert(SvTYPE(invlist) == SVt_INVLIST);
- return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
+ return &(((XINVLIST*) SvANY(invlist))->iterator);
}
PERL_STATIC_INLINE void
PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
- *get_invlist_iter_addr(invlist) = UV_MAX;
+ *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
}
STATIC bool
* <*start> and <*end> are unchanged, and the next call to this function
* will start over at the beginning of the list */
- UV* pos = get_invlist_iter_addr(invlist);
+ STRLEN* pos = get_invlist_iter_addr(invlist);
UV len = _invlist_len(invlist);
UV *array;
PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
if (*pos >= len) {
- *pos = UV_MAX; /* Force iterinit() to be required next time */
+ *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
return FALSE;
}
{
PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
- return *(get_invlist_iter_addr(invlist)) < UV_MAX;
+ return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
}
PERL_STATIC_INLINE UV
}
#endif
-#ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
+#ifndef PERL_IN_XSUB_RE
void
-Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
+Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
{
- /* Dumps out the ranges in an inversion list. The string 'header'
- * if present is output on a line before the first range */
+ /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
+ * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
+ * the string 'indent'. The output looks like this:
+ [0] 0x000A .. 0x000D
+ [2] 0x0085
+ [4] 0x2028 .. 0x2029
+ [6] 0x3104 .. INFINITY
+ * This means that the first range of code points matched by the list are
+ * 0xA through 0xD; the second range contains only the single code point
+ * 0x85, etc. An inversion list is an array of UVs. Two array elements
+ * are used to define each range (except if the final range extends to
+ * infinity, only a single element is needed). The array index of the
+ * first element for the corresponding range is given in brackets. */
UV start, end;
+ STRLEN count = 0;
PERL_ARGS_ASSERT__INVLIST_DUMP;
- if (header && strlen(header)) {
- PerlIO_printf(Perl_debug_log, "%s\n", header);
- }
if (invlist_is_iterating(invlist)) {
- PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
+ Perl_dump_indent(aTHX_ level, file,
+ "%sCan't dump inversion list because is in middle of iterating\n",
+ indent);
return;
}
invlist_iterinit(invlist);
while (invlist_iternext(invlist, &start, &end)) {
if (end == UV_MAX) {
- PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
+ Perl_dump_indent(aTHX_ level, file,
+ "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
+ indent, (UV)count, start);
}
else if (end != start) {
- PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
- start, end);
+ Perl_dump_indent(aTHX_ level, file,
+ "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
+ indent, (UV)count, start, end);
}
else {
- PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
+ Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
+ indent, (UV)count, start);
}
+ count += 2;
}
}
#endif
-#if 0
+#ifdef PERL_ARGS_ASSERT__INVLISTEQ
bool
-S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
+S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
{
/* Return a boolean as to if the two passed in inversion lists are
* identical. The final argument, if TRUE, says to take the complement of
* the second inversion list before doing the comparison */
- UV* array_a = invlist_array(a);
- UV* array_b = invlist_array(b);
+ const UV* array_a = invlist_array(a);
+ const UV* array_b = invlist_array(b);
UV len_a = _invlist_len(a);
UV len_b = _invlist_len(b);
/* Otherwise, to complement, we invert. Here, the first element is
* 0, just remove it. To do this, we just pretend the array starts
- * one later, and clear the flag as we don't have to do anything
- * else later */
+ * one later */
array_b++;
len_b--;
- complement_b = FALSE;
}
else {
- /* But if the first element is not zero, we unshift a 0 before the
- * array. The data structure reserves a space for that 0 (which
- * should be a '1' right now), so physical shifting is unneeded,
- * but temporarily change that element to 0. Before exiting the
- * routine, we must restore the element to '1' */
+ /* But if the first element is not zero, we pretend the list starts
+ * at the 0 that is always stored immediately before the array. */
array_b--;
len_b++;
- array_b[0] = 0;
}
}
}
}
- if (complement_b) {
- array_b[0] = 1;
- }
return retval;
}
#endif
#undef HEADER_LENGTH
-#undef INVLIST_INITIAL_LENGTH
#undef TO_INTERNAL_SIZE
#undef FROM_INTERNAL_SIZE
-#undef INVLIST_LEN_OFFSET
-#undef INVLIST_ZERO_OFFSET
-#undef INVLIST_ITER_OFFSET
#undef INVLIST_VERSION_ID
-#undef INVLIST_PREVIOUS_INDEX_OFFSET
/* End of inversion list object */
STATIC void
-S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
+S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
{
/* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
* constructs, and updates RExC_flags with them. On input, RExC_parse
#define WASTED_O 0x01
#define WASTED_G 0x02
#define WASTED_C 0x04
-#define WASTED_GC (0x02|0x04)
+#define WASTED_GC (WASTED_G|WASTED_C)
I32 wastedflags = 0x00;
U32 posflags = 0, negflags = 0;
U32 *flagsp = &posflags;
const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
if (! (wastedflags & wflagbit) ) {
wastedflags |= wflagbit;
+ /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
vWARN5(
RExC_parse + 1,
"Useless (%s%c) - %suse /%c modifier",
if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
if (! (wastedflags & WASTED_C) ) {
wastedflags |= WASTED_GC;
+ /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
vWARN3(
RExC_parse + 1,
"Useless (%sc) - %suse /gc modifier",
RExC_flags |= posflags;
RExC_flags &= ~negflags;
set_regex_charset(&RExC_flags, cs);
+ if (RExC_flags & RXf_PMf_FOLD) {
+ RExC_contains_i = 1;
+ }
return;
/*NOTREACHED*/
default:
fail_modifiers:
- RExC_parse++;
- vFAIL3("Sequence (%.*s...) not recognized",
- RExC_parse-seqstart, seqstart);
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
+ UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
/*NOTREACHED*/
}
cannot happen. */
STATIC regnode *
S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
- /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
+ /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
+ * 2 is like 1, but indicates that nextchar() has been called to advance
+ * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
+ * this flag alerts us to the need to check for that */
{
dVAR;
regnode *ret; /* Will be the head of the group. */
/* Make an OPEN node, if parenthesized. */
if (paren) {
+
+ /* Under /x, space and comments can be gobbled up between the '(' and
+ * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
+ * intervening space, as the sequence is a token, and a token should be
+ * indivisible */
+ bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
+
if ( *RExC_parse == '*') { /* (*VERB:ARG) */
char *start_verb = RExC_parse;
STRLEN verb_len = 0;
unsigned char op = 0;
int argok = 1;
int internal_argval = 0; /* internal_argval is only useful if !argok */
+
+ if (has_intervening_patws && SIZE_ONLY) {
+ ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
+ }
while ( *RExC_parse && *RExC_parse != ')' ) {
if ( *RExC_parse == ':' ) {
start_arg = RExC_parse + 1;
break;
}
if ( ! op ) {
- RExC_parse++;
- vFAIL3("Unknown verb pattern '%.*s'",
- verb_len, start_verb);
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ vFAIL2utf8f(
+ "Unknown verb pattern '%"UTF8f"'",
+ UTF8fARG(UTF, verb_len, start_verb));
}
if ( argok ) {
if ( start_arg && internal_argval ) {
if ( ! internal_argval && ! SIZE_ONLY ) {
if (start_arg) {
SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
- ARG(ret) = add_data( pRExC_state, 1, "S" );
+ ARG(ret) = add_data( pRExC_state, STR_WITH_LEN("S"));
RExC_rxi->data->data[ARG(ret)]=(void*)sv;
ret->flags = 0;
} else {
}
nextchar(pRExC_state);
return ret;
- } else
- if (*RExC_parse == '?') { /* (?...) */
+ }
+ else if (*RExC_parse == '?') { /* (?...) */
bool is_logical = 0;
const char * const seqstart = RExC_parse;
+ if (has_intervening_patws && SIZE_ONLY) {
+ ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
+ }
RExC_parse++;
paren = *RExC_parse++;
vFAIL2("Sequence %.3s... not terminated",parse_start);
if (!SIZE_ONLY) {
- num = add_data( pRExC_state, 1, "S" );
+ num = add_data( pRExC_state, STR_WITH_LEN("S"));
RExC_rxi->data->data[num]=(void*)sv_dat;
SvREFCNT_inc_simple_void(sv_dat);
}
*flagp |= HASWIDTH;
Set_Node_Offset(ret, parse_start+1);
- Set_Node_Cur_Length(ret); /* MJD */
+ Set_Node_Cur_Length(ret, parse_start);
nextchar(pRExC_state);
return ret;
case '@': /* (?@...) */
vFAIL2("Sequence (?%c...) not implemented", (int)paren);
break;
+ case '#': /* (?#...) */
+ /* XXX As soon as we disallow separating the '?' and '*' (by
+ * spaces or (?#...) comment), it is believed that this case
+ * will be unreachable and can be removed. See
+ * [perl #117327] */
+ while (*RExC_parse && *RExC_parse != ')')
+ RExC_parse++;
+ if (*RExC_parse != ')')
+ FAIL("Sequence (?#... not terminated");
+ nextchar(pRExC_state);
+ *flagp = TRYAGAIN;
+ return NULL;
case '0' : /* (?0) */
case 'R' : /* (?R) */
if (*RExC_parse != ')')
is_logical = 1;
if (*RExC_parse != '{') {
RExC_parse++;
- vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
+ vFAIL2utf8f(
+ "Sequence (%"UTF8f"...) not recognized",
+ UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
/*NOTREACHED*/
}
*flagp |= POSTPONED;
if (!SIZE_ONLY) {
OP *o = cb->block;
if (cb->src_regex) {
- n = add_data(pRExC_state, 2, "rl");
+ n = add_data(pRExC_state, STR_WITH_LEN("rl"));
RExC_rxi->data->data[n] =
(void*)SvREFCNT_inc((SV*)cb->src_regex);
RExC_rxi->data->data[n+1] = (void*)o;
}
else {
- n = add_data(pRExC_state, 1,
- (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
+ n = add_data(pRExC_state,
+ (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
RExC_rxi->data->data[n] = (void*)o;
}
}
(ch == '>' ? '<' : ch));
RExC_parse++;
if (!SIZE_ONLY) {
- num = add_data( pRExC_state, 1, "S" );
+ num = add_data( pRExC_state, STR_WITH_LEN("S"));
RExC_rxi->data->data[num]=(void*)sv_dat;
SvREFCNT_inc_simple_void(sv_dat);
}
else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
/* (?(1)...) */
char c;
+ char *tmp;
parno = atoi(RExC_parse++);
while (isDIGIT(*RExC_parse))
ret = reganode(pRExC_state, GROUPP, parno);
insert_if_check_paren:
- if ((c = *nextchar(pRExC_state)) != ')')
+ if (*(tmp = nextchar(pRExC_state)) != ')') {
+ if ( UTF ) {
+ /* Like the name implies, nextchar deals in chars,
+ * not characters, so if under UTF, undo its work
+ * and skip over the the next character.
+ */
+ RExC_parse = tmp;
+ RExC_parse += UTF8SKIP(RExC_parse);
+ }
vFAIL("Switch condition not recognized");
+ }
insert_if:
REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
br = regbranch(pRExC_state, &flags, 1,depth+1);
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: regbranch returned NULL, flags=%#X",
- flags);
+ FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
+ (UV) flags);
} else
REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
c = *nextchar(pRExC_state);
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: regbranch returned NULL, flags=%#X",
- flags);
+ FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
+ (UV) flags);
}
REGTAIL(pRExC_state, ret, lastbr);
if (flags&HASWIDTH)
return ret;
}
else {
- vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ vFAIL("Unknown switch condition (?(...))");
}
}
case '[': /* (?[ ... ]) */
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
+ FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
}
if (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
+ FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
}
REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
lastbr = br;
case ':':
ender = reg_node(pRExC_state, TAIL);
break;
- case 1:
+ case 1: case 2:
ender = reganode(pRExC_state, CLOSE, parno);
if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
if (paren == '>')
node = SUSPEND, flag = 0;
reginsert(pRExC_state, node,ret, depth+1);
- Set_Node_Cur_Length(ret);
+ Set_Node_Cur_Length(ret, parse_start);
Set_Node_Offset(ret, parse_start + 1);
ret->flags = flag;
REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
/* Check for proper termination. */
if (paren) {
- RExC_flags = oregflags;
+ /* restore original flags, but keep (?p) */
+ RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
RExC_parse = oregcomp_parse;
vFAIL("Unmatched (");
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: regpiece returned NULL, flags=%#X", flags);
+ FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
}
else if (ret == NULL)
ret = latest;
if (flags & (TRYAGAIN|RESTART_UTF8))
*flagp |= flags & (TRYAGAIN|RESTART_UTF8);
else
- FAIL2("panic: regatom returned NULL, flags=%#X", flags);
+ FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
return(NULL);
}
ret = reg_node(pRExC_state, OPFAIL);
return ret;
}
- else if (max == 0) { /* replace {0} with a nothing node */
- if (SIZE_ONLY) {
- RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
- }
- else {
- RExC_emit = orig_emit;
- }
- ret = reg_node(pRExC_state, NOTHING);
- return ret;
- }
do_curly:
if ((flags&SIMPLE)) {
RExC_naughty += 2 + RExC_naughty / 2;
reginsert(pRExC_state, CURLY, ret, depth+1);
Set_Node_Offset(ret, parse_start+1); /* MJD */
- Set_Node_Cur_Length(ret);
+ Set_Node_Cur_Length(ret, parse_start);
}
else {
regnode * const w = reg_node(pRExC_state, WHILEM);
nest_check:
if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
- ckWARN3reg(RExC_parse,
- "%.*s matches null string many times",
- (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
- origparse);
+ ckWARN2reg(RExC_parse,
+ "%"UTF8f" matches null string many times",
+ UTF8fARG(UTF, (RExC_parse >= origparse ? RExC_parse - origparse : 0),
+ origparse));
(void)ReREFCNT_inc(RExC_rx_sv);
}
reginsert(pRExC_state, MINMOD, ret, depth+1);
REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
}
-#ifndef REG_ALLOW_MINMOD_SUSPEND
else
-#endif
if (RExC_parse < RExC_end && *RExC_parse == '+') {
regnode *ender;
nextchar(pRExC_state);
ret->flags = 0;
ender = reg_node(pRExC_state, TAIL);
REGTAIL(pRExC_state, ret, ender);
- /*ret= ender;*/
}
if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
*flagp = RESTART_UTF8;
return FALSE;
}
- FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#X",
- flags);
+ FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
+ (UV) flags);
}
*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
* additionally will populate the node's STRING with <code_point>, if <len>
* is 0. In both cases <*flagp> is appropriately set
*
- * It knows that under FOLD, UTF characters and the Latin Sharp S must be
- * folded (the latter only when the rules indicate it can match 'ss') */
+ * It knows that under FOLD, the Latin Sharp S and UTF characters above
+ * 255, must be folded (the former only when the rules indicate it can
+ * match 'ss') */
bool len_passed_in = cBOOL(len != 0);
U8 character[UTF8_MAXBYTES_CASE+1];
if (! len_passed_in) {
if (UTF) {
- if (FOLD) {
- to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
+ if (FOLD && (! LOC || code_point > 255)) {
+ _to_uni_fold_flags(code_point,
+ character,
+ &len,
+ FOLD_FLAGS_FULL | ((LOC)
+ ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
}
else {
uvchr_to_utf8( character, code_point);
if (ret == NULL) {
if (*flagp & RESTART_UTF8)
return NULL;
- FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
- *flagp);
+ FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
+ (UV) *flagp);
}
nextchar(pRExC_state);
Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
}
case '(':
nextchar(pRExC_state);
- ret = reg(pRExC_state, 1, &flags,depth+1);
+ ret = reg(pRExC_state, 2, &flags,depth+1);
if (ret == NULL) {
if (flags & TRYAGAIN) {
if (RExC_parse == RExC_end) {
*flagp = RESTART_UTF8;
return NULL;
}
- FAIL2("panic: reg returned NULL to regatom, flags=%#X", flags);
+ FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
}
*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
break;
/* regclass() can only return RESTART_UTF8 if multi-char folds
are allowed. */
if (!ret)
- FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
- *flagp);
+ FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
+ (UV) *flagp);
RExC_parse--;
Set_Node_Offset(ret, parse_start + 2);
- Set_Node_Cur_Length(ret);
+ Set_Node_Cur_Length(ret, parse_start);
nextchar(pRExC_state);
}
break;
vFAIL2("Sequence %.3s... not terminated",parse_start);
if (!SIZE_ONLY) {
- num = add_data( pRExC_state, 1, "S" );
+ num = add_data( pRExC_state, STR_WITH_LEN("S"));
RExC_rxi->data->data[num]=(void*)sv_dat;
SvREFCNT_inc_simple_void(sv_dat);
}
/* override incorrect value set in reganode MJD */
Set_Node_Offset(ret, parse_start+1);
- Set_Node_Cur_Length(ret); /* MJD */
+ Set_Node_Cur_Length(ret, parse_start);
nextchar(pRExC_state);
}
goto parse_named_seq;
} }
num = atoi(RExC_parse);
- if (isg && num == 0)
- vFAIL("Reference to invalid group 0");
+ if (isg && num == 0) {
+ if (*RExC_parse == '0') {
+ vFAIL("Reference to invalid group 0");
+ }
+ else {
+ vFAIL("Unterminated \\g... pattern");
+ }
+ }
if (isrel) {
num = RExC_npar - num;
if (num < 1)
vFAIL("Reference to nonexistent or unclosed group");
}
- if (!isg && num > 9 && num >= RExC_npar)
+ if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9')
/* Probably a character specified in octal, e.g. \35 */
goto defchar;
else {
+#ifdef RE_TRACK_PATTERN_OFFSETS
char * const parse_start = RExC_parse - 1; /* MJD */
+#endif
while (isDIGIT(*RExC_parse))
RExC_parse++;
- if (parse_start == RExC_parse - 1)
- vFAIL("Unterminated \\g... pattern");
if (hasbrace) {
if (*RExC_parse != '}')
vFAIL("Unterminated \\g{...} pattern");
/* override incorrect value set in reganode MJD */
Set_Node_Offset(ret, parse_start+1);
- Set_Node_Cur_Length(ret); /* MJD */
+ Set_Node_Cur_Length(ret, parse_start);
RExC_parse--;
nextchar(pRExC_state);
}
defchar: {
STRLEN len = 0;
- UV ender;
+ UV ender = 0;
char *p;
char *s;
#define MAX_NODE_STRING_SIZE 127
char *s0;
U8 upper_parse = MAX_NODE_STRING_SIZE;
STRLEN foldlen;
- U8 node_type;
+ U8 node_type = compute_EXACTish(pRExC_state);
bool next_is_quantifier;
char * oldp = NULL;
+ /* We can convert EXACTF nodes to EXACTFU if they contain only
+ * characters that match identically regardless of the target
+ * string's UTF8ness. The reason to do this is that EXACTF is not
+ * trie-able, EXACTFU is. (We don't need to figure this out until
+ * pass 2) */
+ bool maybe_exactfu = node_type == EXACTF && PASS2;
+
/* If a folding node contains only code points that don't
* participate in folds, it can be changed into an EXACT node,
* which allows the optimizer more things to look for */
bool maybe_exact;
- ender = 0;
- node_type = compute_EXACTish(pRExC_state);
ret = reg_node(pRExC_state, node_type);
/* In pass1, folded, we use a temporary buffer instead of the
/* We do the EXACTFish to EXACT node only if folding, and not if in
* locale, as whether a character folds or not isn't known until
- * runtime */
- maybe_exact = FOLD && ! LOC;
+ * runtime. (And we don't need to figure this out until pass 2) */
+ maybe_exact = FOLD && ! LOC && PASS2;
/* XXX The node can hold up to 255 bytes, yet this only goes to
* 127. I (khw) do not know why. Keeping it somewhat less than
p++;
break;
case 'a':
- ender = ASCII_TO_NATIVE('\007');
+ ender = '\a';
p++;
break;
case 'o':
p++;
ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
break;
- case '0': case '1': case '2': case '3':case '4':
+ case '8': case '9': /* must be a backreference */
+ --p;
+ goto loopdone;
+ case '1': case '2': case '3':case '4':
case '5': case '6': case '7':
- if (*p == '0' ||
- (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
+ /* When we parse backslash escapes there is ambiguity
+ * between backreferences and octal escapes. Any escape
+ * from \1 - \9 is a backreference, any multi-digit
+ * escape which does not start with 0 and which when
+ * evaluated as decimal could refer to an already
+ * parsed capture buffer is a backslash. Anything else
+ * is octal.
+ *
+ * Note this implies that \118 could be interpreted as
+ * 118 OR as "\11" . "8" depending on whether there
+ * were 118 capture buffers defined already in the
+ * pattern. */
+ if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar )
+ { /* Not to be treated as an octal constant, go
+ find backref */
+ --p;
+ goto loopdone;
+ }
+ case '0':
{
I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
STRLEN numlen = 3;
form_short_octal_warning(p, numlen));
}
}
- else { /* Not to be treated as an octal constant, go
- find backref */
- --p;
- goto loopdone;
- }
if (PL_encoding && ender < 0x100)
goto recode_encoding;
break;
if (! SIZE_ONLY
&& RExC_flags & RXf_PMf_EXTENDED
- && ckWARN(WARN_DEPRECATED)
+ && ckWARN_d(WARN_DEPRECATED)
&& is_PATWS_non_low(p, UTF))
{
vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
goto loopdone;
}
- if (FOLD) {
- if (UTF
- /* See comments for join_exact() as to why we fold
- * this non-UTF at compile time */
- || (node_type == EXACTFU
- && ender == LATIN_SMALL_LETTER_SHARP_S))
- {
-
-
- /* Prime the casefolded buffer. Locale rules, which
- * apply only to code points < 256, aren't known until
- * execution, so for them, just output the original
- * character using utf8. If we start to fold non-UTF
- * patterns, be sure to update join_exact() */
- if (LOC && ender < 256) {
- if (UNI_IS_INVARIANT(ender)) {
- *s = (U8) ender;
- foldlen = 1;
- } else {
- *s = UTF8_TWO_BYTE_HI(ender);
- *(s + 1) = UTF8_TWO_BYTE_LO(ender);
- foldlen = 2;
- }
+ if (! FOLD) {
+ if (UTF) {
+ const STRLEN unilen = reguni(pRExC_state, ender, s);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
}
- else {
- UV folded = _to_uni_fold_flags(
- ender,
- (U8 *) s,
- &foldlen,
- FOLD_FLAGS_FULL
- | ((LOC) ? FOLD_FLAGS_LOCALE
- : (ASCII_FOLD_RESTRICTED)
- ? FOLD_FLAGS_NOMIX_ASCII
- : 0)
- );
- /* If this node only contains non-folding code
- * points so far, see if this new one is also
- * non-folding */
- if (maybe_exact) {
- if (folded != ender) {
- maybe_exact = FALSE;
+ /* The loop increments <len> each time, as all but this
+ * path (and one other) through it add a single byte to
+ * the EXACTish node. But this one has changed len to
+ * be the correct final value, so subtract one to
+ * cancel out the increment that follows */
+ len--;
+ }
+ else {
+ REGC((char)ender, s++);
+ }
+ }
+ else /* FOLD */ if (! ( UTF
+ /* See comments for join_exact() as to why we fold this
+ * non-UTF at compile time */
+ || (node_type == EXACTFU
+ && ender == LATIN_SMALL_LETTER_SHARP_S)))
+ {
+ if (IS_IN_SOME_FOLD_L1(ender)) {
+ maybe_exact = FALSE;
+
+ /* See if the character's fold differs between /d and
+ * /u. This includes the multi-char fold SHARP S to
+ * 'ss' */
+ if (maybe_exactfu
+ && (PL_fold[ender] != PL_fold_latin1[ender]
+ || ender == LATIN_SMALL_LETTER_SHARP_S
+ || (len > 0
+ && isARG2_lower_or_UPPER_ARG1('s', ender)
+ && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
+ {
+ maybe_exactfu = FALSE;
+ }
+ }
+ *(s++) = (char) ender;
+ }
+ else { /* UTF */
+
+ /* Prime the casefolded buffer. Locale rules, which apply
+ * only to code points < 256, aren't known until execution,
+ * so for them, just output the original character using
+ * utf8. If we start to fold non-UTF patterns, be sure to
+ * update join_exact() */
+ if (LOC && ender < 256) {
+ if (UVCHR_IS_INVARIANT(ender)) {
+ *s = (U8) ender;
+ foldlen = 1;
+ } else {
+ *s = UTF8_TWO_BYTE_HI(ender);
+ *(s + 1) = UTF8_TWO_BYTE_LO(ender);
+ foldlen = 2;
+ }
+ }
+ else {
+ UV folded = _to_uni_fold_flags(
+ ender,
+ (U8 *) s,
+ &foldlen,
+ FOLD_FLAGS_FULL
+ | ((LOC) ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0)
+ );
+
+ /* If this node only contains non-folding code points
+ * so far, see if this new one is also non-folding */
+ if (maybe_exact) {
+ if (folded != ender) {
+ maybe_exact = FALSE;
+ }
+ else {
+ /* Here the fold is the original; we have
+ * to check further to see if anything
+ * folds to it */
+ if (! PL_utf8_foldable) {
+ SV* swash = swash_init("utf8",
+ "_Perl_Any_Folds",
+ &PL_sv_undef, 1, 0);
+ PL_utf8_foldable =
+ _get_swash_invlist(swash);
+ SvREFCNT_dec_NN(swash);
}
- else {
- /* Here the fold is the original; we have
- * to check further to see if anything
- * folds to it */
- if (! PL_utf8_foldable) {
- SV* swash = swash_init("utf8",
- "_Perl_Any_Folds",
- &PL_sv_undef, 1, 0);
- PL_utf8_foldable =
- _get_swash_invlist(swash);
- SvREFCNT_dec_NN(swash);
- }
- if (_invlist_contains_cp(PL_utf8_foldable,
- ender))
- {
- maybe_exact = FALSE;
- }
+ if (_invlist_contains_cp(PL_utf8_foldable,
+ ender))
+ {
+ maybe_exact = FALSE;
}
}
- ender = folded;
}
- s += foldlen;
-
- /* The loop increments <len> each time, as all but this
- * path (and the one just below for UTF) through it add
- * a single byte to the EXACTish node. But this one
- * has changed len to be the correct final value, so
- * subtract one to cancel out the increment that
- * follows */
- len += foldlen - 1;
- }
- else {
- *(s++) = (char) ender;
- maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
- }
- }
- else if (UTF) {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- if (unilen > 0) {
- s += unilen;
- len += unilen;
+ ender = folded;
}
-
- /* See comment just above for - 1 */
- len--;
+ s += foldlen;
+
+ /* The loop increments <len> each time, as all but this
+ * path (and one other) through it add a single byte to the
+ * EXACTish node. But this one has changed len to be the
+ * correct final value, so subtract one to cancel out the
+ * increment that follows */
+ len += foldlen - 1;
}
- else {
- REGC((char)ender, s++);
- }
if (next_is_quantifier) {
/* No Latin1 characters participate in multi-char
* folds under /l */
if (LOC
- || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
- *s, *(s+1))))
+ || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
+ *s, *(s+1))))
{
break;
}
* do any better */
if (len == 0) {
len = full_len;
+
+ /* If the node ends in an 's' we make sure it stays EXACTF,
+ * as if it turns into an EXACTFU, it could later get
+ * joined with another 's' that would then wrongly match
+ * the sharp s */
+ if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
+ {
+ maybe_exactfu = FALSE;
+ }
} else {
/* Here, the node does contain some characters that aren't
loopdone: /* Jumped to when encounters something that shouldn't be in
the node */
- /* If 'maybe_exact' is still set here, means there are no
- * code points in the node that participate in folds */
- if (FOLD && maybe_exact) {
- OP(ret) = EXACT;
- }
-
/* I (khw) don't know if you can get here with zero length, but the
* old code handled this situation by creating a zero-length EXACT
* node. Might as well be NOTHING instead */
if (len == 0) {
OP(ret) = NOTHING;
}
- else{
+ else {
+ if (FOLD) {
+ /* If 'maybe_exact' is still set here, means there are no
+ * code points in the node that participate in folds;
+ * similarly for 'maybe_exactfu' and code points that match
+ * differently depending on UTF8ness of the target string
+ * */
+ if (maybe_exact) {
+ OP(ret) = EXACT;
+ }
+ else if (maybe_exactfu) {
+ OP(ret) = EXACTFU;
+ }
+ }
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
}
RExC_parse = p - 1;
- Set_Node_Cur_Length(ret); /* MJD */
+ Set_Node_Cur_Length(ret, parse_start);
nextchar(pRExC_state);
{
/* len is STRLEN which is unsigned, need to copy to signed */
return p;
}
+STATIC void
+S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
+{
+ /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
+ * sets up the bitmap and any flags, removing those code points from the
+ * inversion list, setting it to NULL should it become completely empty */
+
+ PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
+ assert(PL_regkind[OP(node)] == ANYOF);
+
+ ANYOF_BITMAP_ZERO(node);
+ if (*invlist_ptr) {
+
+ /* This gets set if we actually need to modify things */
+ bool change_invlist = FALSE;
+
+ UV start, end;
+
+ /* Start looking through *invlist_ptr */
+ invlist_iterinit(*invlist_ptr);
+ while (invlist_iternext(*invlist_ptr, &start, &end)) {
+ UV high;
+ int i;
+
+ if (end == UV_MAX && start <= 256) {
+ ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
+ }
+
+ /* Quit if are above what we should change */
+ if (start > 255) {
+ break;
+ }
+
+ change_invlist = TRUE;
+
+ /* Set all the bits in the range, up to the max that we are doing */
+ high = (end < 255) ? end : 255;
+ for (i = start; i <= (int) high; i++) {
+ if (! ANYOF_BITMAP_TEST(node, i)) {
+ ANYOF_BITMAP_SET(node, i);
+ }
+ }
+ }
+ invlist_iterfinish(*invlist_ptr);
+
+ /* Done with loop; remove any code points that are in the bitmap from
+ * *invlist_ptr; similarly for code points above latin1 if we have a flag
+ * to match all of them anyways */
+ if (change_invlist) {
+ _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
+ }
+ if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
+ _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
+ }
+
+ /* If have completely emptied it, remove it completely */
+ if (_invlist_len(*invlist_ptr) == 0) {
+ SvREFCNT_dec_NN(*invlist_ptr);
+ *invlist_ptr = NULL;
+ }
+ }
+}
+
/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
Character classes ([:foo:]) can also be negated ([:^foo:]).
Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
}
if (namedclass == OOB_NAMEDCLASS)
- Simple_vFAIL3("POSIX class [:%.*s:] unknown",
- t - s - 1, s + 1);
+ vFAIL2utf8f(
+ "POSIX class [:%"UTF8f":] unknown",
+ UTF8fARG(UTF, t - s - 1, s + 1));
/* The #defines are structured so each complement is +1 to
* the normal one */
* these things, we need to realize that something preceded by a backslash
* is escaped, so we have to keep track of backslashes */
if (SIZE_ONLY) {
+ UV depth = 0; /* how many nested (?[...]) constructs */
Perl_ck_warner_d(aTHX_
packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
"The regex_sets feature is experimental" REPORT_LOCATION,
- (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
+ UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
+ UTF8fARG(UTF, RExC_end - RExC_start - (RExC_parse - RExC_precomp), RExC_precomp + (RExC_parse - RExC_precomp)));
while (RExC_parse < RExC_end) {
SV* current = NULL;
RExC_parse = regpatws(pRExC_state, RExC_parse,
TRUE); /* means recognize comments */
switch (*RExC_parse) {
+ case '?':
+ if (RExC_parse[1] == '[') depth++, RExC_parse++;
+ /* FALL THROUGH */
default:
break;
case '\\':
FALSE, /* don't allow multi-char folds */
TRUE, /* silence non-portable warnings. */
¤t))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
- *flagp);
+ FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
+ (UV) *flagp);
/* function call leaves parse pointing to the ']', except
* if we faked it */
}
case ']':
+ if (depth--) break;
RExC_parse++;
if (RExC_parse < RExC_end
&& *RExC_parse == ')')
* been parsed and evaluated to a single operand (or else is a syntax
* error), and is handled as a regular operand */
- stack = newAV();
+ sv_2mortal((SV *)(stack = newAV()));
while (RExC_parse < RExC_end) {
I32 top_index = av_tindex(stack);
FALSE, /* don't allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
¤t))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
- *flagp);
+ FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
+ (UV) *flagp);
/* regclass() will return with parsing just the \ sequence,
* leaving the parse pointer at the next thing to parse */
RExC_parse--;
FALSE, /* don't allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
¤t))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
- *flagp);
+ FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
+ (UV) *flagp);
/* function call leaves parse pointing to the ']', except if we
* faked it */
if (is_posix_class) {
|| IS_OPERAND(lparen)
|| SvUV(lparen) != '(')
{
+ SvREFCNT_dec(current);
RExC_parse++;
vFAIL("Unexpected ')'");
}
}
else {
SV* top = av_pop(stack);
+ SV *prev = NULL;
char current_operator;
if (IS_OPERAND(top)) {
+ SvREFCNT_dec_NN(top);
+ SvREFCNT_dec_NN(current);
vFAIL("Operand with no preceding operator");
}
current_operator = (char) SvUV(top);
goto handle_operand;
case '&':
- _invlist_intersection(av_pop(stack),
+ prev = av_pop(stack);
+ _invlist_intersection(prev,
current,
¤t);
av_push(stack, current);
case '|':
case '+':
- _invlist_union(av_pop(stack), current, ¤t);
+ prev = av_pop(stack);
+ _invlist_union(prev, current, ¤t);
av_push(stack, current);
break;
case '-':
- _invlist_subtract(av_pop(stack), current, ¤t);
+ prev = av_pop(stack);;
+ _invlist_subtract(prev, current, ¤t);
av_push(stack, current);
break;
SV* u = NULL;
SV* element;
- element = av_pop(stack);
- _invlist_union(element, current, &u);
- _invlist_intersection(element, current, &i);
+ prev = av_pop(stack);
+ _invlist_union(prev, current, &u);
+ _invlist_intersection(prev, current, &i);
+ /* _invlist_subtract will overwrite current
+ without freeing what it already contains */
+ element = current;
_invlist_subtract(u, i, ¤t);
av_push(stack, current);
SvREFCNT_dec_NN(i);
Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
}
SvREFCNT_dec_NN(top);
+ SvREFCNT_dec(prev);
}
}
RExC_end = save_end;
SvREFCNT_dec_NN(final);
SvREFCNT_dec_NN(result_string);
- SvREFCNT_dec_NN(stack);
nextchar(pRExC_state);
Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
case we need to change the emitted regop to an EXACT. */
const char * orig_parse = RExC_parse;
- const I32 orig_size = RExC_size;
+ const SSize_t orig_size = RExC_size;
+ bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGCLASS;
}
if (!SIZE_ONLY) {
SV* invlist;
+ char* formatted;
char* name;
if (UCHARAT(RExC_parse) == '^') {
* will have its name be <__NAME_i>. The design is
* discussed in commit
* 2f833f5208e26b208886e51e09e2c072b5eabb46 */
- Newx(name, n + sizeof("_i__\n"), char);
-
- sprintf(name, "%s%.*s%s\n",
- (FOLD) ? "__" : "",
- (int)n,
- RExC_parse,
- (FOLD) ? "_i" : ""
- );
+ formatted = Perl_form(aTHX_
+ "%s%.*s%s\n",
+ (FOLD) ? "__" : "",
+ (int)n,
+ RExC_parse,
+ (FOLD) ? "_i" : ""
+ );
+ name = savepvn(formatted, strlen(formatted));
/* Look up the property name, and get its swash and
* inversion list, if the property is found */
* otherwise add it to the list for run-time look up */
if (ret_invlist) {
RExC_parse = e + 1;
- vFAIL3("Property '%.*s' is unknown", (int) n, name);
+ vFAIL2utf8f(
+ "Property '%"UTF8f"' is unknown",
+ UTF8fARG(UTF, n, name));
}
- Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
+ Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
(value == 'p' ? '+' : '!'),
- name);
+ UTF8fARG(UTF, n, name));
has_user_defined_property = TRUE;
/* We don't know yet, so have to assume that the
case 'f': value = '\f'; break;
case 'b': value = '\b'; break;
case 'e': value = ASCII_TO_NATIVE('\033');break;
- case 'a': value = ASCII_TO_NATIVE('\007');break;
+ case 'a': value = '\a'; break;
case 'o':
RExC_parse--; /* function expects to be pointed at the 'o' */
{
/* What matches in a locale is not known until runtime. This includes
* what the Posix classes (like \w, [:space:]) match. Room must be
- * reserved (one time per class) to store such classes, either if Perl
- * is compiled so that locale nodes always should have this space, or
- * if there is such class info to be stored. The space will contain a
- * bit for each named class that is to be matched against. This isn't
- * needed for \p{} and pseudo-classes, as they are not affected by
- * locale, and hence are dealt with separately */
+ * reserved (one time per outer bracketed class) to store such classes,
+ * either if Perl is compiled so that locale nodes always should have
+ * this space, or if there is such posix class info to be stored. The
+ * space will contain a bit for each named class that is to be matched
+ * against. This isn't needed for \p{} and pseudo-classes, as they are
+ * not affected by locale, and hence are dealt with separately */
if (LOC
&& ! need_class
- && (ANYOF_LOCALE == ANYOF_CLASS
- || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
+ && (ANYOF_LOCALE == ANYOF_POSIXL
+ || (namedclass > OOB_NAMEDCLASS
+ && namedclass < ANYOF_POSIXL_MAX)))
{
need_class = 1;
if (SIZE_ONLY) {
- RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
+ RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
}
else {
- RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
- ANYOF_CLASS_ZERO(ret);
+ RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
}
- ANYOF_FLAGS(ret) |= ANYOF_CLASS;
+ ANYOF_POSIXL_ZERO(ret);
+ ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
}
if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
+ U8 classnum;
/* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
* literal, as is the character that began the false range, i.e.
? RExC_parse - rangebegin
: 0;
if (strict) {
- vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
+ vFAIL2utf8f(
+ "False [] range \"%"UTF8f"\"",
+ UTF8fARG(UTF, w, rangebegin));
}
else {
SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
- ckWARN4reg(RExC_parse,
- "False [] range \"%*.*s\"",
- w, w, rangebegin);
+ ckWARN2reg(RExC_parse,
+ "False [] range \"%"UTF8f"\"",
+ UTF8fARG(UTF, w, rangebegin));
(void)ReREFCNT_inc(RExC_rx_sv);
cp_list = add_cp_to_invlist(cp_list, '-');
cp_list = add_cp_to_invlist(cp_list, prevvalue);
element_count += 2; /* So counts for three values */
}
- if (! SIZE_ONLY) {
- U8 classnum = namedclass_to_classnum(namedclass);
- if (namedclass >= ANYOF_MAX) { /* If a special class */
+ classnum = namedclass_to_classnum(namedclass);
+
+ if (LOC && namedclass < ANYOF_POSIXL_MAX
+#ifndef HAS_ISASCII
+ && classnum != _CC_ASCII
+#endif
+#ifndef HAS_ISBLANK
+ && classnum != _CC_BLANK
+#endif
+ ) {
+ if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
+ && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
+ ? -1
+ : 1)))
+ {
+ posixl_matches_all = TRUE;
+ break;
+ }
+ ANYOF_POSIXL_SET(ret, namedclass);
+ }
+ /* XXX After have made all the posix classes known at compile time
+ * we can move the LOC handling below to above */
+
+ if (! SIZE_ONLY) {
+ if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
/* Here, should be \h, \H, \v, or \V. Neither /d nor
else if (classnum == _CC_ASCII) {
#ifdef HAS_ISASCII
if (LOC) {
- ANYOF_CLASS_SET(ret, namedclass);
+ ANYOF_POSIXL_SET(ret, namedclass);
}
else
#endif /* Not isascii(); just use the hard-coded definition for it */
_invlist_union_maybe_complement_2nd(
posixes,
- PL_ASCII,
+ PL_Posix_ptrs[_CC_ASCII],
cBOOL(namedclass % 2), /* Complement if odd
(NASCII) */
&posixes);
/* This code is structured into two major clauses. The
* first is for classes whose complete definitions may not
- * already be known. It not, the Latin1 definition
+ * already be known. If not, the Latin1 definition
* (guaranteed to already known) is used plus code is
* generated to load the rest at run-time (only if needed).
* If the complete definition is known, it drops down to
}
if (LOC) { /* Under locale, set run-time
lookup */
- ANYOF_CLASS_SET(ret, namedclass);
+ ANYOF_POSIXL_SET(ret, namedclass);
}
else {
/* Add the current class's code points to
Xname);
runtime_posix_matches_above_Unicode = TRUE;
if (LOC) {
- ANYOF_CLASS_SET(ret, namedclass);
+ ANYOF_POSIXL_SET(ret, namedclass);
}
else {
#endif
/* Set this class in the node for runtime
* matching */
- ANYOF_CLASS_SET(ret, namedclass);
+ ANYOF_POSIXL_SET(ret, namedclass);
#ifndef HAS_ISBLANK
}
else {
#ifndef HAS_ISBLANK
if (namedclass != ANYOF_NBLANK) {
#endif
- ANYOF_CLASS_SET(ret, namedclass);
+ ANYOF_POSIXL_SET(ret, namedclass);
#ifndef HAS_ISBLANK
}
else {
if (range) {
if (prevvalue > value) /* b-a */ {
const int w = RExC_parse - rangebegin;
- Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
+ vFAIL2utf8f(
+ "Invalid [] range \"%"UTF8f"\"",
+ UTF8fARG(UTF, w, rangebegin));
range = 0; /* not a valid range */
}
}
/* <multi_char_matches> is actually an array of arrays.
* There will be one or two top-level elements: [2],
* and/or [3]. The [2] element is an array, each
- * element thereof is a character which folds to two
- * characters; likewise for [3]. (Unicode guarantees a
- * maximum of 3 characters in any fold.) When we
- * rewrite the character class below, we will do so
- * such that the longest folds are written first, so
- * that it prefers the longest matching strings first.
- * This is done even if it turns out that any
- * quantifier is non-greedy, out of programmer
- * laziness. Tom Christiansen has agreed that this is
- * ok. This makes the test for the ligature 'ffi' come
- * before the test for 'ff' */
+ * element thereof is a character which folds to TWO
+ * characters; [3] is for folds to THREE characters.
+ * (Unicode guarantees a maximum of 3 characters in any
+ * fold.) When we rewrite the character class below,
+ * we will do so such that the longest folds are
+ * written first, so that it prefers the longest
+ * matching strings first. This is done even if it
+ * turns out that any quantifier is non-greedy, out of
+ * programmer laziness. Tom Christiansen has agreed
+ * that this is ok. This makes the test for the
+ * ligature 'ffi' come before the test for 'ff' */
if (av_exists(multi_char_matches, cp_count)) {
this_array_ptr = (AV**) av_fetch(multi_char_matches,
cp_count, FALSE);
* included. literal_endpoint==2 means both ends of the range used
* a literal character, not \x{foo} */
if (literal_endpoint == 2
- && (prevvalue >= 'a' && value <= 'z')
- || (prevvalue >= 'A' && value <= 'Z'))
+ && ((prevvalue >= 'a' && value <= 'z')
+ || (prevvalue >= 'A' && value <= 'Z')))
{
+ _invlist_intersection(this_range, PL_ASCII,
+ &this_range);
_invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
&this_range);
}
/* If the character class contains only a single element, it may be
* optimizable into another node type which is smaller and runs faster.
* Check if this is the case for this class */
- if (element_count == 1 && ! ret_invlist) {
+ if ((element_count == 1 && ! ret_invlist)
+ || UNLIKELY(posixl_matches_all))
+ {
U8 op = END;
U8 arg = 0;
- if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
- [:digit:] or \p{foo} */
+ if (UNLIKELY(posixl_matches_all)) {
+ op = SANY;
+ }
+ else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
+ \w or [:digit:] or \p{foo}
+ */
/* All named classes are mapped into POSIXish nodes, with its FLAG
* argument giving which class it is */
/* To get locale nodes to not use the full ANYOF size would
* require moving the code above that writes the portions
* of it that aren't in other nodes to after this point.
- * e.g. ANYOF_CLASS_SET */
+ * e.g. ANYOF_POSIXL_SET */
RExC_size = orig_size;
}
}
/* If the folds haven't been read in, call a fold function
* to force that */
if (! PL_utf8_tofold) {
- U8 dummy[UTF8_MAXBYTES+1];
+ U8 dummy[UTF8_MAXBYTES_CASE+1];
/* This string is just a short named one above \xff */
to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
default:
/* Use deprecated warning to increase the
* chances of this being output */
- ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
+ ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
break;
}
}
* doesn't allow them between above and below 256 */
if ((ASCII_FOLD_RESTRICTED
&& (isASCII(c) != isASCII(j)))
- || (LOC && ((c < 256) != (j < 256))))
- {
+ || (LOC && c < 256)) {
continue;
}
/* Under /d, we put into a separate list the Latin1 things that
* match only when the target string is utf8 */
SV* nonascii_but_latin1_properties = NULL;
- _invlist_intersection(posixes, PL_Latin1,
+ _invlist_intersection(posixes, PL_UpperLatin1,
&nonascii_but_latin1_properties);
- _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
- &nonascii_but_latin1_properties);
_invlist_subtract(posixes, nonascii_but_latin1_properties,
&posixes);
if (cp_list) {
* invert if there are things such as \w, which aren't known until runtime
* */
if (invert
- && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
+ && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_POSIXL)))
&& ! depends_list
&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
{
if (ret_invlist) {
*ret_invlist = cp_list;
+ SvREFCNT_dec(swash);
/* Discard the generated node */
if (SIZE_ONLY) {
if (cp_list
&& ! invert
&& ! depends_list
- && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
+ && ! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
{
UV start, end;
* for things that belong in the bitmap, put them there, and delete from
* <cp_list>. While we are at it, see if everything above 255 is in the
* list, and if so, set a flag to speed up execution */
- ANYOF_BITMAP_ZERO(ret);
- if (cp_list) {
-
- /* This gets set if we actually need to modify things */
- bool change_invlist = FALSE;
-
- UV start, end;
-
- /* Start looking through <cp_list> */
- invlist_iterinit(cp_list);
- while (invlist_iternext(cp_list, &start, &end)) {
- UV high;
- int i;
-
- if (end == UV_MAX && start <= 256) {
- ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
- }
-
- /* Quit if are above what we should change */
- if (start > 255) {
- break;
- }
-
- change_invlist = TRUE;
-
- /* Set all the bits in the range, up to the max that we are doing */
- high = (end < 255) ? end : 255;
- for (i = start; i <= (int) high; i++) {
- if (! ANYOF_BITMAP_TEST(ret, i)) {
- ANYOF_BITMAP_SET(ret, i);
- prevvalue = value;
- value = i;
- }
- }
- }
- invlist_iterfinish(cp_list);
-
- /* Done with loop; remove any code points that are in the bitmap from
- * <cp_list> */
- if (change_invlist) {
- _invlist_subtract(cp_list, PL_Latin1, &cp_list);
- }
- /* If have completely emptied it, remove it completely */
- if (_invlist_len(cp_list) == 0) {
- SvREFCNT_dec_NN(cp_list);
- cp_list = NULL;
- }
- }
+ populate_ANYOF_from_invlist(ret, &cp_list);
if (invert) {
ANYOF_FLAGS(ret) |= ANYOF_INVERT;
swash = NULL;
}
- if (! cp_list
- && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
- {
- ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
+ set_ANYOF_arg(pRExC_state, ret, cp_list,
+ (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
+ ? listsv : NULL,
+ swash, has_user_defined_property);
+
+ *flagp |= HASWIDTH|SIMPLE;
+ return ret;
+}
+
+#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
+
+STATIC void
+S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
+ regnode* const node,
+ SV* const cp_list,
+ SV* const runtime_defns,
+ SV* const swash,
+ const bool has_user_defined_property)
+{
+ /* Sets the arg field of an ANYOF-type node 'node', using information about
+ * the node passed-in. If there is nothing outside the node's bitmap, the
+ * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to
+ * the count returned by add_data(), having allocated and stored an array,
+ * av, that that count references, as follows:
+ * av[0] stores the character class description in its textual form.
+ * This is used later (regexec.c:Perl_regclass_swash()) to
+ * initialize the appropriate swash, and is also useful for dumping
+ * the regnode. This is set to &PL_sv_undef if the textual
+ * description is not needed at run-time (as happens if the other
+ * elements completely define the class)
+ * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
+ * computed from av[0]. But if no further computation need be done,
+ * the swash is stored here now (and av[0] is &PL_sv_undef).
+ * av[2] stores the cp_list inversion list for use in addition or instead
+ * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
+ * (Otherwise everything needed is already in av[0] and av[1])
+ * av[3] is set if any component of the class is from a user-defined
+ * property; used only if av[2] exists */
+
+ UV n;
+
+ PERL_ARGS_ASSERT_SET_ANYOF_ARG;
+
+ if (! cp_list && ! runtime_defns) {
+ ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
}
else {
- /* av[0] stores the character class description in its textual form:
- * used later (regexec.c:Perl_regclass_swash()) to initialize the
- * appropriate swash, and is also useful for dumping the regnode.
- * av[1] if NULL, is a placeholder to later contain the swash computed
- * from av[0]. But if no further computation need be done, the
- * swash is stored there now.
- * av[2] stores the cp_list inversion list for use in addition or
- * instead of av[0]; used only if av[1] is NULL
- * av[3] is set if any component of the class is from a user-defined
- * property; used only if av[1] is NULL */
AV * const av = newAV();
SV *rv;
- av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
- ? SvREFCNT_inc(listsv) : &PL_sv_undef);
+ av_store(av, 0, (runtime_defns)
+ ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
if (swash) {
av_store(av, 1, swash);
SvREFCNT_dec_NN(cp_list);
}
else {
- av_store(av, 1, NULL);
+ av_store(av, 1, &PL_sv_undef);
if (cp_list) {
av_store(av, 2, cp_list);
av_store(av, 3, newSVuv(has_user_defined_property));
}
rv = newRV_noinc(MUTABLE_SV(av));
- n = add_data(pRExC_state, 1, "s");
+ n = add_data(pRExC_state, STR_WITH_LEN("s"));
RExC_rxi->data->data[n] = (void*)rv;
- ARG_SET(ret, n);
+ ARG_SET(node, n);
}
-
- *flagp |= HASWIDTH|SIMPLE;
- return ret;
}
-#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
/* reg_skipcomment()
switch (OP(scan)) {
case EXACT:
case EXACTF:
+ case EXACTFA_NO_TRIE:
case EXACTFA:
case EXACTFU:
case EXACTFU_SS:
- case EXACTFU_TRICKYFOLD:
case EXACTFL:
if( exact == PSEUDO )
exact= OP(scan);
- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
*/
#ifdef DEBUGGING
+
+static void
+S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
+{
+ int bit;
+ int set=0;
+
+ for (bit=0; bit<32; bit++) {
+ if (flags & (1<<bit)) {
+ if (!set++ && lead)
+ PerlIO_printf(Perl_debug_log, "%s",lead);
+ PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
+ }
+ }
+ if (lead) {
+ if (set)
+ PerlIO_printf(Perl_debug_log, "\n");
+ else
+ PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
+ }
+}
+
static void
S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
{
if (r->extflags & RXf_EVAL_SEEN)
PerlIO_printf(Perl_debug_log, "with eval ");
PerlIO_printf(Perl_debug_log, "\n");
- DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
+ DEBUG_FLAGS_r({
+ regdump_extflags("r->extflags: ",r->extflags);
+ regdump_intflags("r->intflags: ",r->intflags);
+ });
#else
PERL_ARGS_ASSERT_REGDUMP;
PERL_UNUSED_CONTEXT;
/*
- regprop - printable representation of opcode
*/
-#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
-STMT_START { \
- if (do_sep) { \
- Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
- if (flags & ANYOF_INVERT) \
- /*make sure the invert info is in each */ \
- sv_catpvs(sv, "^"); \
- do_sep = 0; \
- } \
-} STMT_END
void
Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
|| _CC_VERTSPACE != 16
#error Need to adjust order of anyofs[]
#endif
- "[\\w]",
- "[\\W]",
- "[\\d]",
- "[\\D]",
+ "\\w",
+ "\\W",
+ "\\d",
+ "\\D",
"[:alpha:]",
"[:^alpha:]",
"[:lower:]",
"[:^graph:]",
"[:cased:]",
"[:^cased:]",
- "[\\s]",
- "[\\S]",
+ "\\s",
+ "\\S",
"[:blank:]",
"[:^blank:]",
"[:xdigit:]",
"[:^cntrl:]",
"[:ascii:]",
"[:^ascii:]",
- "[\\v]",
- "[\\V]"
+ "\\v",
+ "\\V"
};
RXi_GET_DECL(prog,progi);
GET_RE_DEBUG_FLAGS_DECL;
)
);
if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
- int i;
- int rangestart = -1;
- U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
sv_catpvs(sv, "[");
- for (i = 0; i <= 256; i++) {
- if (i < 256 && BITMAP_TEST(bitmap,i)) {
- if (rangestart == -1)
- rangestart = i;
- } else if (rangestart != -1) {
- if (i <= rangestart + 3)
- for (; rangestart < i; rangestart++)
- put_byte(sv, rangestart);
- else {
- put_byte(sv, rangestart);
- sv_catpvs(sv, "-");
- put_byte(sv, i - 1);
- }
- rangestart = -1;
- }
- }
+ (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
+ ? ANYOF_BITMAP(o)
+ : TRIE_BITMAP(trie));
sv_catpvs(sv, "]");
}
} else if (k == LOGICAL)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
else if (k == ANYOF) {
- int i, rangestart = -1;
const U8 flags = ANYOF_FLAGS(o);
int do_sep = 0;
sv_catpvs(sv, "^");
/* output what the standard cp 0-255 bitmap matches */
- for (i = 0; i <= 256; i++) {
- if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
- if (rangestart == -1)
- rangestart = i;
- } else if (rangestart != -1) {
- if (i <= rangestart + 3)
- for (; rangestart < i; rangestart++)
- put_byte(sv, rangestart);
- else {
- put_byte(sv, rangestart);
- sv_catpvs(sv, "-");
- put_byte(sv, i - 1);
- }
- do_sep = 1;
- rangestart = -1;
- }
- }
+ do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
- EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
- /* output any special charclass tests (used entirely under use locale) */
- if (ANYOF_CLASS_TEST_ANY_SET(o))
- for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
- if (ANYOF_CLASS_TEST(o,i)) {
+ /* output any special charclass tests (used entirely under use
+ * locale) * */
+ if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
+ int i;
+ for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
+ if (ANYOF_POSIXL_TEST(o,i)) {
sv_catpv(sv, anyofs[i]);
do_sep = 1;
}
+ }
+ }
- EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
+ if (flags & (ANYOF_ABOVE_LATIN1_ALL|ANYOF_ABOVE_LATIN1_ALL)
+ || ANYOF_NONBITMAP(o))
+ {
+ if (do_sep) {
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
+ if (flags & ANYOF_INVERT)
+ /*make sure the invert info is in each */
+ sv_catpvs(sv, "^");
+ }
if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
sv_catpvs(sv, "{non-utf8-latin1-all}");
}
/* output information about the unicode matching */
- if (flags & ANYOF_UNICODE_ALL)
+ if (flags & ANYOF_ABOVE_LATIN1_ALL)
sv_catpvs(sv, "{unicode_all}");
- else if (ANYOF_NONBITMAP(o))
- sv_catpvs(sv, "{unicode}");
- if (flags & ANYOF_NONBITMAP_NON_UTF8)
- sv_catpvs(sv, "{outside bitmap}");
-
- if (ANYOF_NONBITMAP(o)) {
- SV *lv; /* Set if there is something outside the bit map */
- SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
+ else if (ANYOF_NONBITMAP(o)) {
+ SV *lv; /* Set if there is something outside the bit map. */
bool byte_output = FALSE; /* If something in the bitmap has been
output */
- if (lv && lv != &PL_sv_undef) {
- if (sw) {
- U8 s[UTF8_MAXBYTES_CASE+1];
-
- for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
- uvchr_to_utf8(s, i);
-
- if (i < 256
- && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
- things already
- output as part
- of the bitmap */
- && swash_fetch(sw, s, TRUE))
- {
- if (rangestart == -1)
- rangestart = i;
- } else if (rangestart != -1) {
- byte_output = TRUE;
- if (i <= rangestart + 3)
- for (; rangestart < i; rangestart++) {
- put_byte(sv, rangestart);
- }
- else {
- put_byte(sv, rangestart);
- sv_catpvs(sv, "-");
- put_byte(sv, i-1);
- }
- rangestart = -1;
- }
- }
- }
+ if (flags & ANYOF_NONBITMAP_NON_UTF8) {
+ sv_catpvs(sv, "{outside bitmap}");
+ }
+ else {
+ sv_catpvs(sv, "{utf8}");
+ }
- {
- char *s = savesvpv(lv);
- char * const origs = s;
+ /* Get the stuff that wasn't in the bitmap */
+ (void) regclass_swash(prog, o, FALSE, &lv, NULL);
+ if (lv && lv != &PL_sv_undef) {
+ char *s = savesvpv(lv);
+ char * const origs = s;
- while (*s && *s != '\n')
- s++;
+ while (*s && *s != '\n')
+ s++;
- if (*s == '\n') {
- const char * const t = ++s;
+ if (*s == '\n') {
+ const char * const t = ++s;
- if (byte_output) {
- sv_catpvs(sv, " ");
- }
+ if (byte_output) {
+ sv_catpvs(sv, " ");
+ }
- while (*s) {
- if (*s == '\n') {
+ while (*s) {
+ if (*s == '\n') {
- /* Truncate very long output */
- if (s - origs > 256) {
- Perl_sv_catpvf(aTHX_ sv,
- "%.*s...",
- (int) (s - origs - 1),
- t);
- goto out_dump;
- }
- *s = ' ';
- }
- else if (*s == '\t') {
- *s = '-';
- }
- s++;
- }
- if (s[-1] == ' ')
- s[-1] = 0;
+ /* Truncate very long output */
+ if (s - origs > 256) {
+ Perl_sv_catpvf(aTHX_ sv,
+ "%.*s...",
+ (int) (s - origs - 1),
+ t);
+ goto out_dump;
+ }
+ *s = ' ';
+ }
+ else if (*s == '\t') {
+ *s = '-';
+ }
+ s++;
+ }
+ if (s[-1] == ' ')
+ s[-1] = 0;
- sv_catpv(sv, t);
- }
+ sv_catpv(sv, t);
+ }
- out_dump:
+ out_dump:
- Safefree(origs);
- }
+ Safefree(origs);
SvREFCNT_dec_NN(lv);
}
}
+ }
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
}
Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
}
else {
+ if (*anyofs[index] != '[') {
+ sv_catpv(sv, "[");
+ }
sv_catpv(sv, anyofs[index]);
+ if (*anyofs[index] != '[') {
+ sv_catpv(sv, "]");
+ }
}
}
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
so we need to copy it locally. */
RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
ret->mother_re = NULL;
- ret->gofs = 0;
}
#endif /* PERL_IN_XSUB_RE */
break;
case 'f':
/* This is cheating. */
- Newx(d->data[i], 1, struct regnode_charclass_class);
- StructCopy(ri->data->data[i], d->data[i],
- struct regnode_charclass_class);
+ Newx(d->data[i], 1, regnode_ssc);
+ StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
reti->regstclass = (regnode*)d->data[i];
break;
case 'T':
#endif
STATIC void
-S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
+S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
{
va_list args;
STRLEN l1 = strlen(pat1);
Copy(pat2, buf + l1, l2 , char);
buf[l1 + l2] = '\n';
buf[l1 + l2 + 1] = '\0';
-#ifdef I_STDARG
- /* ANSI variant takes additional second argument */
va_start(args, pat2);
-#else
- va_start(args);
-#endif
msv = vmess(buf, &args);
va_end(args);
message = SvPV_const(msv,l1);
if (l1 > 512)
l1 = 512;
Copy(message, buf, l1 , char);
- buf[l1-1] = '\0'; /* Overwrite \n */
- Perl_croak(aTHX_ "%s", buf);
+ /* l1-1 to avoid \n */
+ Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
}
/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
{
dVAR;
- struct re_save_state *state;
-
- SAVEVPTR(PL_curcop);
- SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
-
- state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
- PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
- SSPUSHUV(SAVEt_RE_STATE);
-
- Copy(&PL_reg_state, state, 1, struct re_save_state);
-
- PL_reg_oldsaved = NULL;
- PL_reg_oldsavedlen = 0;
- PL_reg_oldsavedoffset = 0;
- PL_reg_oldsavedcoffset = 0;
- PL_reg_maxiter = 0;
- PL_reg_leftiter = 0;
- PL_reg_poscache = NULL;
- PL_reg_poscache_size = 0;
-#ifdef PERL_ANY_COW
- PL_nrs = NULL;
-#endif
-
/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
if (PL_curpm) {
const REGEXP * const rx = PM_GETRE(PL_curpm);
So the old condition can be simplified to !isPRINT(c) */
if (!isPRINT(c)) {
- if (c < 256) {
- Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
- }
- else {
- Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
- }
+ switch (c) {
+ case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
+ case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
+ case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
+ case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
+ case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
+
+ default:
+ Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
+ break;
+ }
}
else {
const char string = c;
}
}
+STATIC bool
+S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
+{
+ /* Appends to 'sv' a displayable version of the innards of the bracketed
+ * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
+ * output anything */
+
+ int i;
+ int rangestart = -1;
+ bool has_output_anything = FALSE;
+
+ PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
+
+ for (i = 0; i <= 256; i++) {
+ if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
+ if (rangestart == -1)
+ rangestart = i;
+ } else if (rangestart != -1) {
+ int j = i - 1;
+ if (i <= rangestart + 3) { /* Individual chars in short ranges */
+ for (; rangestart < i; rangestart++)
+ put_byte(sv, rangestart);
+ }
+ else if ( j > 255
+ || ! isALPHANUMERIC(rangestart)
+ || ! isALPHANUMERIC(j)
+ || isDIGIT(rangestart) != isDIGIT(j)
+ || isUPPER(rangestart) != isUPPER(j)
+ || isLOWER(rangestart) != isLOWER(j)
+
+ /* This final test should get optimized out except
+ * on EBCDIC platforms, where it causes ranges that
+ * cross discontinuities like i/j to be shown as hex
+ * instead of the misleading, e.g. H-K (since that
+ * range includes more than H, I, J, K). */
+ || (j - rangestart)
+ != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
+ {
+ Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
+ rangestart,
+ (j < 256) ? j : 255);
+ }
+ else { /* Here, the ends of the range are both digits, or both
+ uppercase, or both lowercase; and there's no
+ discontinuity in the range (which could happen on EBCDIC
+ platforms) */
+ put_byte(sv, rangestart);
+ sv_catpvs(sv, "-");
+ put_byte(sv, j);
+ }
+ rangestart = -1;
+ has_output_anything = TRUE;
+ }
+ }
+
+ return has_output_anything;
+}
#define CLEAR_OPTSTART \
if (optstart) STMT_START { \
}
else if (PL_regkind[(U8)op] == ANYOF) {
/* arglen 1 + class block */
- node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
- ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
+ node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
+ ? ANYOF_POSIXL_SKIP : ANYOF_SKIP);
node = NEXTOPER(node);
}
else if (PL_regkind[(U8)op] == EXACT) {