|= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
else
data->flags &= ~SF_FIX_BEFORE_EOL;
- data->minlen_fixed=minlenp;
+ data->minlen_fixed=minlenp;
data->lookbehind_fixed=0;
}
else { /* *data->longest == data->longest_float */
DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
"%*sCompiling trie using list compiler\n",
(int)depth * 2 + 2, ""));
-
+
trie->states = (reg_trie_state *)
PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
sizeof(reg_trie_state) );
int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
int noff;
regnode *n = scan;
-
+
/* Skip NOTHING and LONGJMP. */
while ((n = regnext(n))
&& ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
next = regnext(scan);
code = OP(scan);
/* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
-
+
if (OP(next) == code || code == IFTHEN) {
/* NOTE - There is similar code to this block below for handling
TRIE nodes on a re-study. If you change stuff here check there
I32 max1 = 0, min1 = I32_MAX, num = 0;
struct regnode_charclass_class accum;
regnode * const startbranch=scan;
-
+
if (flags & SCF_DO_SUBSTR)
SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
if (flags & SCF_DO_STCLASS)
a nested if into a case structure of sorts.
*/
-
+
int made=0;
if (!re_trie_maxbuff) {
re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
if ( last && TRIE_TYPE_IS_SAFE ) {
made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
-#ifdef TRIE_STUDY_OPT
+#ifdef TRIE_STUDY_OPT
if ( ((made == MADE_EXACT_TRIE &&
startbranch == first)
|| ( first_non_open == first )) &&
break;
CASE_SYNST_FNC(VERTWS);
CASE_SYNST_FNC(HORIZWS);
-
+
}
if (flags & SCF_DO_STCLASS_OR)
cl_and(data->start_class, and_withp);
flags &= ~SCF_DO_SUBSTR;
}
#endif /* old or new */
-#endif /* TRIE_STUDY_OPT */
+#endif /* TRIE_STUDY_OPT */
/* Else: zero-length, ignore. */
scan = regnext(scan);
sawplus = 1;
else
first += regarglen[OP(first)];
-
+
first = NEXTOPER(first);
first_next= regnext(first);
}
else
ri->regstclass = first;
}
-#ifdef TRIE_STCLASS
+#ifdef TRIE_STCLASS
else if (PL_regkind[OP(first)] == TRIE &&
((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
{
make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
ri->regstclass = trie_op;
}
-#endif
+#endif
else if (REGNODE_SIMPLE(OP(first)))
ri->regstclass = first;
else if (PL_regkind[OP(first)] == BOUND ||
* it happens that c_offset_min has been invalidated, since the
* earlier string may buy us something the later one won't.]
*/
-
+
data.longest_fixed = newSVpvs("");
data.longest_float = newSVpvs("");
data.last_found = newSVpvs("");
&data, -1, NULL, NULL,
SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
-
+
CHECK_RESTUDY_GOTO;
I32 fake;
struct regnode_charclass_class ch_class;
I32 last_close = 0;
-
+
DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
scan = ri->program + 1;
RExC_parse++;
if (*RExC_parse!=')')
vFAIL("Expecting close bracket");
-
+
gen_recurse_regop:
if ( paren == '-' ) {
/*
RExC_parse++;
}
if (*RExC_parse != ')') {
- RExC_parse = s;
+ RExC_parse = s;
vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
}
if (!SIZE_ONLY) {
|| RExC_parse[1] == '<'
|| RExC_parse[1] == '{') { /* Lookahead or eval. */
I32 flag;
-
+
ret = reg_node(pRExC_state, LOGICAL);
if (!SIZE_ONLY)
ret->flags = 1;
Set_Node_Length(ret, 1);
}
}
-
+
if (!first && SIZE_ONLY)
RExC_extralen += 1; /* BRANCHJ */
break;
case 'p':
case 'P':
- {
+ {
char* const oldregxend = RExC_end;
#ifdef DEBUGGING
char* parse_start = RExC_parse - 2;
case 'x':
if (*++p == '{') {
char* const e = strchr(p, '}');
-
+
if (!e) {
RExC_parse = p + 1;
vFAIL("Missing right brace on \\x{}");
*flagp |= HASWIDTH;
if (len == 1 && UNI_IS_INVARIANT(ender))
*flagp |= SIMPLE;
-
+
if (SIZE_ONLY)
RExC_size += STR_SZ(len);
else {
POSIXCC(UCHARAT(RExC_parse))) {
const char c = UCHARAT(RExC_parse);
char* const s = RExC_parse++;
-
+
while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
RExC_parse++;
if (RExC_parse == RExC_end)
/* Look up the property name, and get its swash and
* inversion list, if the property is found */
- if (! (ANYOF_FLAGS(ret) & ANYOF_INVERT)) {
if (swash) {
SvREFCNT_dec(swash);
}
undefined properties */
NULL, FALSE /* No inversion list */
);
- }
-
- if ( ANYOF_FLAGS(ret) & ANYOF_INVERT
- || ! swash
+ if ( ! swash
|| ! SvROK(swash)
|| ! SvTYPE(SvRV(swash)) == SVt_PVHV
|| ! (invlistsvp =
Safefree(name);
}
RExC_parse = e + 1;
-
- /* The \p could match something in the Latin1 range, hence
- * something that isn't utf8 */
- ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
namedclass = ANYOF_MAX; /* no official name, but it's named */
/* \p means they want Unicode semantics */
range = 0; /* this was not a true range */
}
-
-
if (!SIZE_ONLY) {
const char *what = NULL;
char yesno = 0;
if (! PL_utf8_foldable) {
SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
PL_utf8_foldable = _swash_to_invlist(swash);
+ SvREFCNT_dec(swash);
}
/* This is a hash that for a particular fold gives all characters
}
}
+ /* Here, <nonbitmap> contains all the code points we can determine at
+ * compile time that we haven't put into the bitmap. Go through it, and
+ * for things that belong in the bitmap, put them there, and delete from
+ * <nonbitmap> */
+ if (nonbitmap) {
+
+ /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
+ * possibly only should match when the target string is UTF-8 */
+ UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
+
+ /* This gets set if we actually need to modify things */
+ bool change_invlist = FALSE;
+
+ UV start, end;
+
+ /* Start looking through <nonbitmap> */
+ invlist_iterinit(nonbitmap);
+ while (invlist_iternext(nonbitmap, &start, &end)) {
+ UV high;
+ int i;
+
+ /* Quit if we are above what we should change */
+ if (start > max_cp_to_set) {
+ break;
+ }
+
+ change_invlist = TRUE;
+
+ /* Set all the bits in the range, up to the max that we are doing */
+ high = (end < max_cp_to_set) ? end : max_cp_to_set;
+ for (i = start; i <= (int) high; i++) {
+ if (! ANYOF_BITMAP_TEST(ret, i)) {
+ ANYOF_BITMAP_SET(ret, i);
+ stored++;
+ prevvalue = value;
+ value = i;
+ }
+ }
+ }
+
+ /* Done with loop; set <nonbitmap> to not include any code points that
+ * are in the bitmap */
+ if (change_invlist) {
+ SV* keep_list = _new_invlist(2);
+ _append_range_to_invlist(keep_list, max_cp_to_set + 1, UV_MAX);
+ _invlist_intersection(nonbitmap, keep_list, &nonbitmap);
+ SvREFCNT_dec(keep_list);
+ }
+
+ /* If we have completely emptied it, remove it completely */
+ if (invlist_len(nonbitmap) == 0) {
+ SvREFCNT_dec(nonbitmap);
+ nonbitmap = NULL;
+ }
+ }
/* Here, we have calculated what code points should be in the character
- * class.
+ * class. <nonbitmap> does not overlap the bitmap except possibly in the
+ * case of DEPENDS rules.
*
* Now we can see about various optimizations. Fold calculation (which we
* did above) needs to take place before inversion. Otherwise /[^k]/i
|| (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
&& SvCUR(listsv) == initial_listsv_len)
{
+ int i;
if (! nonbitmap) {
- for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
- ANYOF_BITMAP(ret)[value] ^= 0xFF;
+ for (i = 0; i < 256; ++i) {
+ if (ANYOF_BITMAP_TEST(ret, i)) {
+ ANYOF_BITMAP_CLEAR(ret, i);
+ }
+ else {
+ ANYOF_BITMAP_SET(ret, i);
+ prevvalue = value;
+ value = i;
+ }
+ }
/* The inversion means that everything above 255 is matched */
ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
}
else {
- /* Here, also has things outside the bitmap. Go through each bit
- * individually and add it to the list to get rid of from those
- * things not in the bitmap */
- SV *remove_list = _new_invlist(2);
+ /* Here, also has things outside the bitmap that may overlap with
+ * the bitmap. We have to sync them up, so that they get inverted
+ * in both places. Earlier, we removed all overlaps except in the
+ * case of /d rules, so no syncing is needed except for this case
+ */
+ SV *remove_list = NULL;
+
+ if (DEPENDS_SEMANTICS) {
+ UV start, end;
+
+ /* Set the bits that correspond to the ones that aren't in the
+ * bitmap. Otherwise, when we invert, we'll miss these.
+ * Earlier, we removed from the nonbitmap all code points
+ * < 128, so there is no extra work here */
+ invlist_iterinit(nonbitmap);
+ while (invlist_iternext(nonbitmap, &start, &end)) {
+ if (start > 255) { /* The bit map goes to 255 */
+ break;
+ }
+ if (end > 255) {
+ end = 255;
+ }
+ for (i = start; i <= (int) end; ++i) {
+ ANYOF_BITMAP_SET(ret, i);
+ prevvalue = value;
+ value = i;
+ }
+ }
+ }
+
+ /* Now invert both the bitmap and the nonbitmap. Anything in the
+ * bitmap has to also be removed from the non-bitmap, but again,
+ * there should not be overlap except under /d rules. */
_invlist_invert(nonbitmap);
- for (value = 0; value < 256; ++value) {
- if (ANYOF_BITMAP_TEST(ret, value)) {
- ANYOF_BITMAP_CLEAR(ret, value);
- remove_list = add_cp_to_invlist(remove_list, value);
+
+ for (i = 0; i < 256; ++i) {
+ if (ANYOF_BITMAP_TEST(ret, i)) {
+ ANYOF_BITMAP_CLEAR(ret, i);
+ if (DEPENDS_SEMANTICS) {
+ if (! remove_list) {
+ remove_list = _new_invlist(2);
+ }
+ remove_list = add_cp_to_invlist(remove_list, i);
+ }
}
else {
- ANYOF_BITMAP_SET(ret, value);
+ ANYOF_BITMAP_SET(ret, i);
+ prevvalue = value;
+ value = i;
}
}
/* And do the removal */
- _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
- SvREFCNT_dec(remove_list);
+ if (DEPENDS_SEMANTICS) {
+ if (remove_list) {
+ _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
+ SvREFCNT_dec(remove_list);
+ }
+ }
+ else {
+ /* There is no overlap for non-/d, so just delete anything
+ * below 256 */
+ SV* keep_list = _new_invlist(2);
+ _append_range_to_invlist(keep_list, 256, UV_MAX);
+ _invlist_intersection(nonbitmap, keep_list, &nonbitmap);
+ SvREFCNT_dec(keep_list);
+ }
}
stored = 256 - stored;
/* Folding in the bitmap is taken care of above, but not for locale (for
* which we have to wait to see what folding is in effect at runtime), and
- * for things not in the bitmap. Set run-time fold flag for these */
- if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
+ * for some things not in the bitmap (only the upper latin folds in this
+ * case, as all other single-char folding has been set above). Set
+ * run-time fold flag for these */
+ if (FOLD && (LOC
+ || (DEPENDS_SEMANTICS
+ && nonbitmap
+ && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
+ || unicode_alternate))
+ {
ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
}
* is just the lower case of the current one (which may resolve to
* itself, or to the other one */
value = toLOWER_LATIN1(value);
- if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
- /* To join adjacent nodes, they must be the exact EXACTish
- * type. Try to use the most likely type, by using EXACTFU if
- * the regex calls for them, or is required because the
- * character is non-ASCII */
+ /* To join adjacent nodes, they must be the exact EXACTish type.
+ * Try to use the most likely type, by using EXACTFA if possible,
+ * then EXACTFU if the regex calls for it, or is required because
+ * the character is non-ASCII. (If <value> is ASCII, its fold is
+ * also ASCII for the cases where we get here.) */
+ if (MORE_ASCII_RESTRICTED && isASCII(value)) {
+ op = EXACTFA;
+ }
+ else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
op = EXACTFU;
}
else { /* Otherwise, more likely to be EXACTF type */
SvREFCNT_dec(unicode_alternate);
}
else {
-
+ /* av[0] stores the character class description in its textual form:
+ * used later (regexec.c:Perl_regclass_swash()) to initialize the
+ * appropriate swash, and is also useful for dumping the regnode.
+ * av[1] if NULL, is a placeholder to later contain the swash computed
+ * from av[0]. But if no further computation need be done, the
+ * swash is stored there now.
+ * av[2] stores the multicharacter foldings, used later in
+ * regexec.c:S_reginclass().
+ * av[3] stores the nonbitmap inversion list for use in addition to or
+ * instead of av[0]; not used if av[1] isn't NULL
+ * av[4] is set if any component of the class is from a user-defined
+ * property; not used if av[1] isn't NULL */
AV * const av = newAV();
SV *rv;
- /* The 0th element stores the character class description
- * in its textual form: used later (regexec.c:Perl_regclass_swash())
- * to initialize the appropriate swash (which gets stored in
- * element [1]), and also useful for dumping the regnode.
- * Element [2] stores the multicharacter foldings,
- * used later (regexec.c:S_reginclass()).
- * Element [3] stores the nonbitmap inversion list for use in addition
- * or instead of element [0].
- * Element [4] is set if any component of the class is from a
- * user-defined property */
+
av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
? &PL_sv_undef
: listsv);
We can't do this:
assert(2==regarglen[op]+1);
-
+
Anything larger than this has to allocate the extra amount.
If we changed this to be:
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
if (flags & ANYOF_INVERT)
sv_catpvs(sv, "^");
-
+
/* output what the standard cp 0-255 bitmap matches */
for (i = 0; i <= 256; i++) {
if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
if (ANYOF_NONBITMAP(o)) {
SV *lv; /* Set if there is something outside the bit map */
SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
-
+ bool byte_output = FALSE; /* If something in the bitmap has been
+ output */
+
if (lv && lv != &PL_sv_undef) {
if (sw) {
U8 s[UTF8_MAXBYTES_CASE+1];
for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
uvchr_to_utf8(s, i);
-
- if (i < 256 && swash_fetch(sw, s, TRUE)) {
+
+ if (i < 256
+ && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
+ things already
+ output as part
+ of the bitmap */
+ && swash_fetch(sw, s, TRUE))
+ {
if (rangestart == -1)
rangestart = i;
} else if (rangestart != -1) {
+ byte_output = TRUE;
if (i <= rangestart + 3)
for (; rangestart < i; rangestart++) {
- const U8 * const e = uvchr_to_utf8(s,rangestart);
- U8 *p;
- for(p = s; p < e; p++)
- put_byte(sv, *p);
+ put_byte(sv, rangestart);
}
else {
- const U8 *e = uvchr_to_utf8(s,rangestart);
- U8 *p;
- for (p = s; p < e; p++)
- put_byte(sv, *p);
+ put_byte(sv, rangestart);
sv_catpvs(sv, "-");
- e = uvchr_to_utf8(s, i-1);
- for (p = s; p < e; p++)
- put_byte(sv, *p);
+ put_byte(sv, i-1);
}
rangestart = -1;
}
}
-
- sv_catpvs(sv, "..."); /* et cetera */
}
{
char *s = savesvpv(lv);
char * const origs = s;
-
+
while (*s && *s != '\n')
s++;
-
+
if (*s == '\n') {
const char * const t = ++s;
-
+
+ if (byte_output) {
+ sv_catpvs(sv, " ");
+ }
+
while (*s) {
- if (*s == '\n')
+ if (*s == '\n') {
+
+ /* Truncate very long output */
+ if (s - origs > 256) {
+ Perl_sv_catpvf(aTHX_ sv,
+ "%.*s...",
+ (int) (s - origs - 1),
+ t);
+ goto out_dump;
+ }
*s = ' ';
+ }
+ else if (*s == '\t') {
+ *s = '-';
+ }
s++;
}
if (s[-1] == ' ')
s[-1] = 0;
-
+
sv_catpv(sv, t);
}
-
+
+ out_dump:
+
Safefree(origs);
}
SvREFCNT_dec(lv);
}
#endif
-STATIC void
+STATIC void
S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
{
va_list args;
goto after_print;
} else
CLEAR_OPTSTART;
-
+
regprop(r, sv, node);
PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
(int)(2*indent + 1), "", SvPVX_const(sv));
sv_setpvs(sv, "");
for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
-
+
PerlIO_printf(Perl_debug_log, "%*s%s ",
(int)(2*(indent+3)), "",
elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,