#define STATIC static
#endif
-#ifndef MIN
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
-#endif
-
/* this is a chain of data about sub patterns we are processing that
need to be handled separately/specially in study_chunk. Its so
we can simulate recursion without losing state. */
I32 seen_zerolen;
regnode **open_parens; /* pointers to open parens */
regnode **close_parens; /* pointers to close parens */
- regnode *opend; /* END node in program */
+ regnode *end_op; /* END node in program */
I32 utf8; /* whether the pattern is utf8 or not */
I32 orig_utf8; /* whether the pattern was originally in utf8 */
/* XXX use this for future optimisation of case
HV *paren_names; /* Paren names */
regnode **recurse; /* Recurse regops */
- I32 recurse_count; /* Number of recurse regops */
+ I32 recurse_count; /* Number of recurse regops we have generated */
U8 *study_chunk_recursed; /* bitmap of which subs we have moved
through */
U32 study_chunk_recursed_bytes; /* bytes in bitmap */
scan_frame *frame_head;
scan_frame *frame_last;
U32 frame_count;
+ AV *warn_text;
#ifdef ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
#endif
bool seen_unfolded_sharp_s;
bool strict;
+ bool study_started;
};
#define RExC_flags (pRExC_state->flags)
#define RExC_orig_utf8 (pRExC_state->orig_utf8)
#define RExC_open_parens (pRExC_state->open_parens)
#define RExC_close_parens (pRExC_state->close_parens)
-#define RExC_opend (pRExC_state->opend)
+#define RExC_end_op (pRExC_state->end_op)
#define RExC_paren_names (pRExC_state->paren_names)
#define RExC_recurse (pRExC_state->recurse)
#define RExC_recurse_count (pRExC_state->recurse_count)
#define RExC_frame_last (pRExC_state->frame_last)
#define RExC_frame_count (pRExC_state->frame_count)
#define RExC_strict (pRExC_state->strict)
+#define RExC_study_started (pRExC_state->study_started)
+#define RExC_warn_text (pRExC_state->warn_text)
/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
* a flag to disable back-off on the fixed/floating substrings - if it's
#define SF_HAS_PAR 0x0080
#define SF_IN_PAR 0x0100
#define SF_HAS_EVAL 0x0200
+
+
+/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
+ * longest substring in the pattern. When it is not set the optimiser keeps
+ * track of position, but does not keep track of the actual strings seen,
+ *
+ * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
+ * /foo/i will not.
+ *
+ * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
+ * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
+ * turned off because of the alternation (BRANCH). */
#define SCF_DO_SUBSTR 0x0400
+
#define SCF_DO_STCLASS_AND 0x0800
#define SCF_DO_STCLASS_OR 0x1000
#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
#define EXPERIMENTAL_INPLACESCAN
#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
-#define DEBUG_RExC_seen() \
+#ifdef DEBUGGING
+int
+Perl_re_printf(pTHX_ const char *fmt, ...)
+{
+ va_list ap;
+ int result;
+ PerlIO *f= Perl_debug_log;
+ PERL_ARGS_ASSERT_RE_PRINTF;
+ va_start(ap, fmt);
+ result = PerlIO_vprintf(f, fmt, ap);
+ va_end(ap);
+ return result;
+}
+
+int
+Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
+{
+ va_list ap;
+ int result;
+ PerlIO *f= Perl_debug_log;
+ PERL_ARGS_ASSERT_RE_INDENTF;
+ va_start(ap, depth);
+ PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
+ result = PerlIO_vprintf(f, fmt, ap);
+ va_end(ap);
+ return result;
+}
+#endif /* DEBUGGING */
+
+#define DEBUG_RExC_seen() \
DEBUG_OPTIMISE_MORE_r({ \
- PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
+ Perl_re_printf( aTHX_ "RExC_seen: "); \
\
if (RExC_seen & REG_ZERO_LEN_SEEN) \
- PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
\
if (RExC_seen & REG_LOOKBEHIND_SEEN) \
- PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
\
if (RExC_seen & REG_GPOS_SEEN) \
- PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
\
if (RExC_seen & REG_RECURSE_SEEN) \
- PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
\
- if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
- PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
+ if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
+ Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
\
if (RExC_seen & REG_VERBARG_SEEN) \
- PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
\
if (RExC_seen & REG_CUTGROUP_SEEN) \
- PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
\
if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
- PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
\
if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
- PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
- \
- if (RExC_seen & REG_GOSTART_SEEN) \
- PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
\
- if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
- PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
+ if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
+ Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
\
- PerlIO_printf(Perl_debug_log,"\n"); \
+ Perl_re_printf( aTHX_ "\n"); \
});
#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
- if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
+ if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
#define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
if ( ( flags ) ) { \
- PerlIO_printf(Perl_debug_log, "%s", open_str); \
+ Perl_re_printf( aTHX_ "%s", open_str); \
DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
- PerlIO_printf(Perl_debug_log, "%s", close_str); \
+ Perl_re_printf( aTHX_ "%s", close_str); \
}
#define DEBUG_STUDYDATA(str,data,depth) \
DEBUG_OPTIMISE_MORE_r(if(data){ \
- PerlIO_printf(Perl_debug_log, \
- "%*s" str "Pos:%"IVdf"/%"IVdf \
+ Perl_re_indentf( aTHX_ "" str "Pos:%"IVdf"/%"IVdf \
" Flags: 0x%"UVXf, \
- (int)(depth)*2, "", \
+ depth, \
(IV)((data)->pos_min), \
(IV)((data)->pos_delta), \
(UV)((data)->flags) \
); \
DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
- PerlIO_printf(Perl_debug_log, \
+ Perl_re_printf( aTHX_ \
" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
(IV)((data)->whilem_c), \
(IV)((data)->last_closep ? *((data)->last_closep) : -1), \
is_inf ? "INF " : "" \
); \
if ((data)->last_found) \
- PerlIO_printf(Perl_debug_log, \
+ Perl_re_printf( aTHX_ \
"Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
" %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
SvPVX_const((data)->last_found), \
(IV)((data)->offset_float_min), \
(IV)((data)->offset_float_max) \
); \
- PerlIO_printf(Perl_debug_log,"\n"); \
+ Perl_re_printf( aTHX_ "\n"); \
});
+
/* =========================================================
* BEGIN edit_distance stuff.
*
assert(is_ANYOF_SYNTHETIC(ssc));
- ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
- _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
+ /* mortalize so won't leak */
+ ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
}
* returned list must, and will, contain every code point that is a
* possibility. */
- SV* invlist = sv_2mortal(_new_invlist(0));
+ SV* invlist = NULL;
SV* only_utf8_locale_invlist = NULL;
unsigned int i;
const U32 n = ARG(node);
/* Here, no compile-time swash, and there are things that won't be
* known until runtime -- we have to assume it could be anything */
+ invlist = sv_2mortal(_new_invlist(1));
return _add_range_to_invlist(invlist, 0, UV_MAX);
}
else if (ary[3] && ary[3] != &PL_sv_undef) {
}
}
+ if (! invlist) {
+ invlist = sv_2mortal(_new_invlist(0));
+ }
+
/* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
* code points, and an inversion list for the others, but if there are code
* points that should match only conditionally on the target string being
/* Add in the points from the bit map */
for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
if (ANYOF_BITMAP_TEST(node, i)) {
- invlist = add_cp_to_invlist(invlist, i);
+ unsigned int start = i++;
+
+ for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
+ /* empty */
+ }
+ invlist = _add_range_to_invlist(invlist, start, i-1);
new_node_has_latin1 = TRUE;
}
}
PERL_ARGS_ASSERT_DUMP_TRIE;
- PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
- (int)depth * 2 + 2,"",
- "Match","Base","Ofs" );
+ Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
+ depth+1, "Match","Base","Ofs" );
for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
SV ** const tmp = av_fetch( revcharmap, state, 0);
if ( tmp ) {
- PerlIO_printf( Perl_debug_log, "%*s",
+ Perl_re_printf( aTHX_ "%*s",
colwidth,
pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
PL_colors[0], PL_colors[1],
);
}
}
- PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
- (int)depth * 2 + 2,"");
+ Perl_re_printf( aTHX_ "\n");
+ Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
for( state = 0 ; state < trie->uniquecharcount ; state++ )
- PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
- PerlIO_printf( Perl_debug_log, "\n");
+ Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
+ Perl_re_printf( aTHX_ "\n");
for( state = 1 ; state < trie->statecount ; state++ ) {
const U32 base = trie->states[ state ].trans.base;
- PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
- (int)depth * 2 + 2,"", (UV)state);
+ Perl_re_indentf( aTHX_ "#%4"UVXf"|", depth+1, (UV)state);
if ( trie->states[ state ].wordnum ) {
- PerlIO_printf( Perl_debug_log, " W%4X",
- trie->states[ state ].wordnum );
+ Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
} else {
- PerlIO_printf( Perl_debug_log, "%6s", "" );
+ Perl_re_printf( aTHX_ "%6s", "" );
}
- PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
+ Perl_re_printf( aTHX_ " @%4"UVXf" ", (UV)base );
if ( base ) {
U32 ofs = 0;
!= state))
ofs++;
- PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
+ Perl_re_printf( aTHX_ "+%2"UVXf"[ ", (UV)ofs);
for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
if ( ( base + ofs >= trie->uniquecharcount )
&& trie->trans[ base + ofs
- trie->uniquecharcount ].check == state )
{
- PerlIO_printf( Perl_debug_log, "%*"UVXf,
- colwidth,
- (UV)trie->trans[ base + ofs
- - trie->uniquecharcount ].next );
+ Perl_re_printf( aTHX_ "%*"UVXf, colwidth,
+ (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
+ );
} else {
- PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
+ Perl_re_printf( aTHX_ "%*s",colwidth," ." );
}
}
- PerlIO_printf( Perl_debug_log, "]");
+ Perl_re_printf( aTHX_ "]");
}
- PerlIO_printf( Perl_debug_log, "\n" );
+ Perl_re_printf( aTHX_ "\n" );
}
- PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
- (int)depth*2, "");
+ Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
+ depth);
for (word=1; word <= trie->wordcount; word++) {
- PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
+ Perl_re_printf( aTHX_ " %d:(%d,%d)",
(int)word, (int)(trie->wordinfo[word].prev),
(int)(trie->wordinfo[word].len));
}
- PerlIO_printf(Perl_debug_log, "\n" );
+ Perl_re_printf( aTHX_ "\n" );
}
/*
Dumps a fully constructed but uncompressed trie in list form.
PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
/* print out the table precompression. */
- PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
- (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
- "------:-----+-----------------\n" );
+ Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
+ depth+1 );
+ Perl_re_indentf( aTHX_ "%s",
+ depth+1, "------:-----+-----------------\n" );
for( state=1 ; state < next_alloc ; state ++ ) {
U16 charid;
- PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
- (int)depth * 2 + 2,"", (UV)state );
+ Perl_re_indentf( aTHX_ " %4"UVXf" :",
+ depth+1, (UV)state );
if ( ! trie->states[ state ].wordnum ) {
- PerlIO_printf( Perl_debug_log, "%5s| ","");
+ Perl_re_printf( aTHX_ "%5s| ","");
} else {
- PerlIO_printf( Perl_debug_log, "W%4x| ",
+ Perl_re_printf( aTHX_ "W%4x| ",
trie->states[ state ].wordnum
);
}
SV ** const tmp = av_fetch( revcharmap,
TRIE_LIST_ITEM(state,charid).forid, 0);
if ( tmp ) {
- PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
+ Perl_re_printf( aTHX_ "%*s:%3X=%4"UVXf" | ",
colwidth,
pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
colwidth,
(UV)TRIE_LIST_ITEM(state,charid).newstate
);
if (!(charid % 10))
- PerlIO_printf(Perl_debug_log, "\n%*s| ",
+ Perl_re_printf( aTHX_ "\n%*s| ",
(int)((depth * 2) + 14), "");
}
}
- PerlIO_printf( Perl_debug_log, "\n");
+ Perl_re_printf( aTHX_ "\n");
}
}
that they are identical.
*/
- PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
+ Perl_re_indentf( aTHX_ "Char : ", depth+1 );
for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
SV ** const tmp = av_fetch( revcharmap, charid, 0);
if ( tmp ) {
- PerlIO_printf( Perl_debug_log, "%*s",
+ Perl_re_printf( aTHX_ "%*s",
colwidth,
pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
PL_colors[0], PL_colors[1],
}
}
- PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
+ Perl_re_printf( aTHX_ "\n");
+ Perl_re_indentf( aTHX_ "State+-", depth+1 );
for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
- PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
+ Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
}
- PerlIO_printf( Perl_debug_log, "\n" );
+ Perl_re_printf( aTHX_ "\n" );
for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
- PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
- (int)depth * 2 + 2,"",
+ Perl_re_indentf( aTHX_ "%4"UVXf" : ",
+ depth+1,
(UV)TRIE_NODENUM( state ) );
for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
if (v)
- PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
+ Perl_re_printf( aTHX_ "%*"UVXf, colwidth, v );
else
- PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
+ Perl_re_printf( aTHX_ "%*s", colwidth, "." );
}
if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
- PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
+ Perl_re_printf( aTHX_ " (%4"UVXf")\n",
(UV)trie->trans[ state ].check );
} else {
- PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
+ Perl_re_printf( aTHX_ " (%4"UVXf") W%4X\n",
(UV)trie->trans[ state ].check,
trie->states[ TRIE_NODENUM( state ) ].wordnum );
}
sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
}
DEBUG_TRIE_COMPILE_r({
- PerlIO_printf( Perl_debug_log,
- "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
- (int)depth * 2 + 2, "",
+ Perl_re_indentf( aTHX_
+ "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
+ depth+1,
REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
});
for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
regnode *noper = NEXTOPER( cur );
- const U8 *uc = (U8*)STRING( noper );
- const U8 *e = uc + STR_LEN( noper );
+ const U8 *uc;
+ const U8 *e;
int foldlen = 0;
U32 wordlen = 0; /* required init */
STRLEN minchars = 0;
if (OP(noper) == NOTHING) {
regnode *noper_next= regnext(noper);
- if (noper_next != tail && OP(noper_next) == flags) {
- noper = noper_next;
- uc= (U8*)STRING(noper);
- e= uc + STR_LEN(noper);
- trie->minlen= STR_LEN(noper);
- } else {
- trie->minlen= 0;
- continue;
- }
+ if (noper_next < tail)
+ noper= noper_next;
}
+ if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
+ uc= (U8*)STRING(noper);
+ e= uc + STR_LEN(noper);
+ } else {
+ trie->minlen= 0;
+ continue;
+ }
+
+
if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
regardless of encoding */
}
} /* end first pass */
DEBUG_TRIE_COMPILE_r(
- PerlIO_printf( Perl_debug_log,
- "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
- (int)depth * 2 + 2,"",
+ Perl_re_indentf( aTHX_
+ "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
+ depth+1,
( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
(int)trie->minlen, (int)trie->maxlen )
STRLEN transcount = 1;
- DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
- "%*sCompiling trie using list compiler\n",
- (int)depth * 2 + 2, ""));
+ DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
+ depth+1));
trie->states = (reg_trie_state *)
PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
regnode *noper = NEXTOPER( cur );
- U8 *uc = (U8*)STRING( noper );
- const U8 *e = uc + STR_LEN( noper );
U32 state = 1; /* required init */
U16 charid = 0; /* sanity init */
U32 wordlen = 0; /* required init */
if (OP(noper) == NOTHING) {
regnode *noper_next= regnext(noper);
- if (noper_next != tail && OP(noper_next) == flags) {
- noper = noper_next;
- uc= (U8*)STRING(noper);
- e= uc + STR_LEN(noper);
- }
+ if (noper_next < tail)
+ noper= noper_next;
}
- if (OP(noper) != NOTHING) {
+ if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
+ const U8 *uc= (U8*)STRING(noper);
+ const U8 *e= uc + STR_LEN(noper);
+
for ( ; uc < e ; uc += len ) {
TRIE_READ_CHAR;
/*
DEBUG_TRIE_COMPILE_MORE_r(
- PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
+ Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
);
*/
}
/*
DEBUG_TRIE_COMPILE_MORE_r(
- PerlIO_printf( Perl_debug_log, " base: %d\n",base);
+ Perl_re_printf( aTHX_ " base: %d\n",base);
);
*/
trie->states[ state ].trans.base=base;
we have to use TRIE_NODENUM() to convert.
*/
- DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
- "%*sCompiling trie using table compiler\n",
- (int)depth * 2 + 2, ""));
+ DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
+ depth+1));
trie->trans = (reg_trie_trans *)
PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
regnode *noper = NEXTOPER( cur );
- const U8 *uc = (U8*)STRING( noper );
- const U8 *e = uc + STR_LEN( noper );
U32 state = 1; /* required init */
if (OP(noper) == NOTHING) {
regnode *noper_next= regnext(noper);
- if (noper_next != tail && OP(noper_next) == flags) {
- noper = noper_next;
- uc= (U8*)STRING(noper);
- e= uc + STR_LEN(noper);
- }
+ if (noper_next < tail)
+ noper= noper_next;
}
- if ( OP(noper) != NOTHING ) {
+ if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
+ const U8 *uc= (U8*)STRING(noper);
+ const U8 *e= uc + STR_LEN(noper);
+
for ( ; uc < e ; uc += len ) {
TRIE_READ_CHAR;
PerlMemShared_realloc( trie->states, laststate
* sizeof(reg_trie_state) );
DEBUG_TRIE_COMPILE_MORE_r(
- PerlIO_printf( Perl_debug_log,
- "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
- (int)depth * 2 + 2,"",
+ Perl_re_indentf( aTHX_ "Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
+ depth+1,
(int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
+ 1 ),
(IV)next_alloc,
} /* end table compress */
}
DEBUG_TRIE_COMPILE_MORE_r(
- PerlIO_printf(Perl_debug_log,
- "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
- (int)depth * 2 + 2, "",
+ Perl_re_indentf( aTHX_ "Statecount:%"UVxf" Lasttrans:%"UVxf"\n",
+ depth+1,
(UV)trie->statecount,
(UV)trie->lasttrans)
);
});
}
DEBUG_OPTIMISE_r(
- PerlIO_printf(Perl_debug_log,
- "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
- (int)depth * 2 + 2, "",
+ Perl_re_indentf( aTHX_ "MJD offset:%"UVuf" MJD length:%"UVuf"\n",
+ depth+1,
(UV)mjd_offset, (UV)mjd_nodelen)
);
#endif
if ( count == 2 ) {
Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
DEBUG_OPTIMISE_r(
- PerlIO_printf(Perl_debug_log,
- "%*sNew Start State=%"UVuf" Class: [",
- (int)depth * 2 + 2, "",
+ Perl_re_indentf( aTHX_ "New Start State=%"UVuf" Class: [",
+ depth+1,
(UV)state));
if (idx >= 0) {
SV ** const tmp = av_fetch( revcharmap, idx, 0);
if ( folder )
TRIE_BITMAP_SET(trie, folder[ *ch ]);
DEBUG_OPTIMISE_r(
- PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
+ Perl_re_printf( aTHX_ "%s", (char*)ch)
);
}
}
TRIE_BITMAP_SET(trie,*ch);
if ( folder )
TRIE_BITMAP_SET(trie,folder[ *ch ]);
- DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
+ DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
}
idx = ofs;
}
char *ch = SvPV( *tmp, len );
DEBUG_OPTIMISE_r({
SV *sv=sv_newmortal();
- PerlIO_printf( Perl_debug_log,
- "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
- (int)depth * 2 + 2, "",
+ Perl_re_indentf( aTHX_ "Prefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
+ depth+1,
(UV)state, (UV)idx,
pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
PL_colors[0], PL_colors[1],
} else {
#ifdef DEBUGGING
if (state>1)
- DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
+ DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
#endif
break;
}
*/
fail[ 0 ] = fail[ 1 ] = 0;
DEBUG_TRIE_COMPILE_r({
- PerlIO_printf(Perl_debug_log,
- "%*sStclass Failtable (%"UVuf" states): 0",
- (int)(depth * 2), "", (UV)numstates
+ Perl_re_indentf( aTHX_ "Stclass Failtable (%"UVuf" states): 0",
+ depth, (UV)numstates
);
for( q_read=1; q_read<numstates; q_read++ ) {
- PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
+ Perl_re_printf( aTHX_ ", %"UVuf, (UV)fail[q_read]);
}
- PerlIO_printf(Perl_debug_log, "\n");
+ Perl_re_printf( aTHX_ "\n");
});
Safefree(q);
/*RExC_seen |= REG_TRIEDFA_SEEN;*/
}
-#define DEBUG_PEEP(str,scan,depth) \
- DEBUG_OPTIMISE_r({if (scan){ \
- regnode *Next = regnext(scan); \
- regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
- PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
- (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
- Next ? (REG_NODE_NUM(Next)) : 0 ); \
+#define DEBUG_PEEP(str,scan,depth) \
+ DEBUG_OPTIMISE_r({if (scan){ \
+ regnode *Next = regnext(scan); \
+ regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);\
+ Perl_re_indentf( aTHX_ "" str ">%3d: %s (%d)", \
+ depth, REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
+ Next ? (REG_NODE_NUM(Next)) : 0 );\
DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
- PerlIO_printf(Perl_debug_log, "\n"); \
+ Perl_re_printf( aTHX_ "\n"); \
}});
/* The below joins as many adjacent EXACTish nodes as possible into a single
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_STUDY_CHUNK;
+ RExC_study_started= 1;
if ( depth == 0 ) {
);
DEBUG_OPTIMISE_MORE_r(
{
- PerlIO_printf(Perl_debug_log,
- "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
- (int)(depth*2), "", (long)stopparen,
+ Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
+ depth, (long)stopparen,
(unsigned long)RExC_study_chunk_recursed_count,
(unsigned long)depth, (unsigned long)recursed_depth,
scan,
(( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
)
) {
- PerlIO_printf(Perl_debug_log," %d",(int)i);
+ Perl_re_printf( aTHX_ " %d",(int)i);
break;
}
}
if ( j + 1 < recursed_depth ) {
- PerlIO_printf(Perl_debug_log, ",");
+ Perl_re_printf( aTHX_ ",");
}
}
}
- PerlIO_printf(Perl_debug_log,"\n");
+ Perl_re_printf( aTHX_ "\n");
}
);
while ( scan && OP(scan) != END && scan < last ){
DEBUG_PEEP("Peep", scan, depth);
- /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
- * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
- * by a different invocation of reg() -- Yves
+ /* The reason we do this here is that we need to deal with things like
+ * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
+ * parsing code, as each (?:..) is handled by a different invocation of
+ * reg() -- Yves
*/
JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
DEBUG_TRIE_COMPILE_r({
regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
- PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
- (int)depth * 2 + 2, "",
- "Looking for TRIE'able sequences. Tail node is: ",
+ Perl_re_indentf( aTHX_ "%s %"UVuf":%s\n",
+ depth+1,
+ "Looking for TRIE'able sequences. Tail node is ",
+ (UV)(tail - RExC_emit_start),
SvPV_nolen_const( RExC_mysv )
);
});
U8 noper_trietype = TRIE_TYPE( noper_type );
#if defined(DEBUGGING) || defined(NOJUMPTRIE)
regnode * const noper_next = regnext( noper );
- U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
- U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
+ U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
+ U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
#endif
DEBUG_TRIE_COMPILE_r({
regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
- PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
- (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
+ Perl_re_indentf( aTHX_ "- %d:%s (%d)",
+ depth+1,
+ REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
- PerlIO_printf( Perl_debug_log, " -> %s",
- SvPV_nolen_const(RExC_mysv));
+ Perl_re_printf( aTHX_ " -> %d:%s",
+ REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
if ( noper_next ) {
regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
- PerlIO_printf( Perl_debug_log,"\t=> %s\t",
- SvPV_nolen_const(RExC_mysv));
+ Perl_re_printf( aTHX_ "\t=> %d:%s\t",
+ REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
}
- PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
+ Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
);
if ( noper_trietype
&&
(
- ( noper_trietype == NOTHING)
+ ( noper_trietype == NOTHING )
|| ( trietype == NOTHING )
|| ( trietype == noper_trietype )
)
#ifdef NOJUMPTRIE
- && noper_next == tail
+ && noper_next >= tail
#endif
&& count < U16_MAX)
{
if ( noper_trietype == NOTHING ) {
#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
regnode * const noper_next = regnext( noper );
- U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
+ U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
#endif
}
if ( noper_trietype
#ifdef NOJUMPTRIE
- && noper_next == tail
+ && noper_next >= tail
#endif
){
/* noper is triable, so we can start a new
} /* loop over branches */
DEBUG_TRIE_COMPILE_r({
regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
- PerlIO_printf( Perl_debug_log,
- "%*s- %s (%d) <SCAN FINISHED>\n",
- (int)depth * 2 + 2,
- "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
+ Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
+ depth+1, SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
+ Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
+ REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
+ PL_reg_name[trietype]
+ );
});
if ( last && trietype ) {
depth==0 ) {
flags |= SCF_TRIE_RESTUDY;
if ( startbranch == first
- && scan == tail )
+ && scan >= tail )
{
RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
}
* turn it into a plain NOTHING op. */
DEBUG_TRIE_COMPILE_r({
regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
- PerlIO_printf( Perl_debug_log,
- "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
- "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
+ Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
+ depth+1,
+ SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
});
OP(startbranch)= NOTHING;
} else /* single branch is optimized. */
scan = NEXTOPER(scan);
continue;
- } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
+ } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
I32 paren = 0;
regnode *start = NULL;
regnode *end = NULL;
U32 my_recursed_depth= recursed_depth;
-
- if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
+ if (OP(scan) != SUSPEND) { /* GOSUB */
/* Do setup, note this code has side effects beyond
* the rest of this block. Specifically setting
* RExC_recurse[] must happen at least once during
* study_chunk(). */
- if (OP(scan) == GOSUB) {
- paren = ARG(scan);
- RExC_recurse[ARG2L(scan)] = scan;
- start = RExC_open_parens[paren-1];
- end = RExC_close_parens[paren-1];
- } else {
- start = RExC_rxi->program + 1;
- end = RExC_opend;
- }
+ paren = ARG(scan);
+ RExC_recurse[ARG2L(scan)] = scan;
+ start = RExC_open_parens[paren];
+ end = RExC_close_parens[paren];
+
/* NOTE we MUST always execute the above code, even
- * if we do nothing with a GOSUB/GOSTART */
+ * if we do nothing with a GOSUB */
if (
( flags & SCF_IN_DEFINE )
||
RExC_study_chunk_recursed_bytes, U8);
}
/* we havent recursed into this paren yet, so recurse into it */
- DEBUG_STUDYDATA("set:", data,depth);
+ DEBUG_STUDYDATA("gosub-set:", data,depth);
PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
my_recursed_depth= recursed_depth + 1;
} else {
- DEBUG_STUDYDATA("inf:", data,depth);
+ DEBUG_STUDYDATA("gosub-inf:", data,depth);
/* some form of infinite recursion, assume infinite length
* */
if (flags & SCF_DO_SUBSTR) {
if (OP(nxt) != CLOSE)
goto nogo;
if (RExC_open_parens) {
- RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
- RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
+ RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/
+ RExC_close_parens[ARG(nxt1)]=nxt+2; /*close->while*/
}
/* Now we know that nxt2 is the only contents: */
oscan->flags = (U8)ARG(nxt);
oscan->flags = (U8)ARG(nxt);
if (RExC_open_parens) {
- RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
- RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
+ RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/
+ RExC_close_parens[ARG(nxt1)]=nxt2+1; /*close->NOTHING*/
}
OP(nxt1) = OPTIMIZED; /* was OPEN. */
OP(nxt) = OPTIMIZED; /* was CLOSE. */
/* It is counted once already... */
data->pos_min += minnext * (mincount - counted);
#if 0
-PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
+Perl_re_printf( aTHX_ "counted=%"UVuf" deltanext=%"UVuf
" SSize_t_MAX=%"UVuf" minnext=%"UVuf
" maxcount=%"UVuf" mincount=%"UVuf"\n",
(UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
(UV)mincount);
if (deltanext != SSize_t_MAX)
-PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
+Perl_re_printf( aTHX_ "LHS=%"UVuf" RHS=%"UVuf"\n",
(UV)(-counted * deltanext + (minnext + deltanext) * maxcount
- minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
#endif
/* Else: zero-length, ignore. */
scan = regnext(scan);
}
- /* If we are exiting a recursion we can unset its recursed bit
- * and allow ourselves to enter it again - no danger of an
- * infinite loop there.
- if (stopparen > -1 && recursed) {
- DEBUG_STUDYDATA("unset:", data,depth);
- PAREN_UNSET( recursed, stopparen);
- }
- */
+
+ finish:
if (frame) {
+ /* we need to unwind recursion. */
depth = depth - 1;
DEBUG_STUDYDATA("frame-end:",data,depth);
goto fake_study_recurse;
}
- finish:
assert(!frame);
DEBUG_STUDYDATA("pre-fin:",data,depth);
/* Dispatch a request to compile a regexp to correct regexp engine. */
DEBUG_COMPILE_r({
- PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
+ Perl_re_printf( aTHX_ "Using engine %"UVxf"\n",
PTR2UV(eng));
});
return CALLREGCOMP_ENG(eng, pattern, flags);
bool do_end = 0;
GET_RE_DEBUG_FLAGS_DECL;
- DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_PARSE_r(Perl_re_printf( aTHX_
"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
Newx(dst, *plen_p * 2 + 1, U8);
sv_catsv_nomg(pat, msv);
rx = msv;
}
- else
- pat = msv;
+ else {
+ /* We have only one SV to process, but we need to verify
+ * it is properly null terminated or we will fail asserts
+ * later. In theory we probably shouldn't get such SV's,
+ * but if we do we should handle it gracefully. */
+ if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) ) {
+ /* not a string, or a string with a trailing null */
+ pat = msv;
+ } else {
+ /* a string with no trailing null, we need to copy it
+ * so it we have a trailing null */
+ pat = newSVsv(msv);
+ }
+ }
if (code)
pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
*p++ = 'x';
*p++ = '\0';
DEBUG_COMPILE_r({
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"%sre-parsing pattern for runtime code:%s %s\n",
PL_colors[4],PL_colors[5],newpat);
});
#ifdef DEBUGGING
dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
if ( ! dump_len_string
- || ! grok_atoUV(dump_len_string, &PL_dump_re_max_len, NULL))
+ || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
{
PL_dump_re_max_len = 0;
}
#endif
}
+ pRExC_state->warn_text = NULL;
pRExC_state->code_blocks = NULL;
pRExC_state->num_code_blocks = 0;
}
- DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_PARSE_r(Perl_re_printf( aTHX_
"Assembling pattern from %d elements%s\n", pat_count,
orig_rx_flags & RXf_SPLIT ? " for split" : ""));
*is_bare_re = TRUE;
SvREFCNT_inc(re);
Safefree(pRExC_state->code_blocks);
- DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_PARSE_r(Perl_re_printf( aTHX_
"Precompiled pattern%s\n",
orig_rx_flags & RXf_SPLIT ? " for split" : ""));
RExC_contains_locale = 0;
RExC_contains_i = 0;
RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
+ RExC_study_started = 0;
pRExC_state->runtime_code_qr = NULL;
RExC_frame_head= NULL;
RExC_frame_last= NULL;
DEBUG_COMPILE_r({
SV *dsv= sv_newmortal();
RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
- PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
+ Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
PL_colors[4],PL_colors[5],s);
});
RExC_whilem_seen = 0;
RExC_open_parens = NULL;
RExC_close_parens = NULL;
- RExC_opend = NULL;
+ RExC_end_op = NULL;
RExC_paren_names = NULL;
#ifdef DEBUGGING
RExC_paren_name_list = NULL;
assert(*RExC_end == '\0');
DEBUG_PARSE_r(
- PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
+ Perl_re_printf( aTHX_ "Starting first pass (sizing)\n");
RExC_lastnum=0;
RExC_lastparse=NULL;
);
pRExC_state->num_code_blocks);
}
else {
- DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_PARSE_r(Perl_re_printf( aTHX_
"Need to redo pass 1\n"));
}
SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
DEBUG_PARSE_r({
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"Required size %"IVdf" nodes\n"
"Starting second pass (creation)\n",
(IV)RExC_size);
r->intflags = 0;
r->nparens = RExC_npar - 1; /* set early to validate backrefs */
- /* setup various meta data about recursion, this all requires
- * RExC_npar to be correctly set, and a bit later on we clear it */
- if (RExC_seen & REG_RECURSE_SEEN) {
- Newxz(RExC_open_parens, RExC_npar,regnode *);
- SAVEFREEPV(RExC_open_parens);
- Newxz(RExC_close_parens,RExC_npar,regnode *);
- SAVEFREEPV(RExC_close_parens);
- }
- if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
- /* Note, RExC_npar is 1 + the number of parens in a pattern.
- * So its 1 if there are no parens. */
- RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
- ((RExC_npar & 0x07) != 0);
- Newx(RExC_study_chunk_recursed,
- RExC_study_chunk_recursed_bytes * RExC_npar, U8);
- SAVEFREEPV(RExC_study_chunk_recursed);
- }
-
/* Useful during FAIL. */
#ifdef RE_TRACK_PATTERN_OFFSETS
Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
- DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
"%s %"UVuf" bytes for offset annotations.\n",
ri->u.offsets ? "Got" : "Couldn't get",
(UV)((2*RExC_size+1) * sizeof(U32))));
RExC_parse = exp;
RExC_end = exp + plen;
RExC_naughty = 0;
- RExC_npar = 1;
RExC_emit_start = ri->program;
RExC_emit = ri->program;
RExC_emit_bound = ri->program + RExC_size + 1;
pRExC_state->code_index = 0;
*((char*) RExC_emit++) = (char) REG_MAGIC;
+ /* setup various meta data about recursion, this all requires
+ * RExC_npar to be correctly set, and a bit later on we clear it */
+ if (RExC_seen & REG_RECURSE_SEEN) {
+ DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
+ "%*s%*s Setting up open/close parens\n",
+ 22, "| |", (int)(0 * 2 + 1), ""));
+
+ /* setup RExC_open_parens, which holds the address of each
+ * OPEN tag, and to make things simpler for the 0 index
+ * the start of the program - this is used later for offsets */
+ Newxz(RExC_open_parens, RExC_npar,regnode *);
+ SAVEFREEPV(RExC_open_parens);
+ RExC_open_parens[0] = RExC_emit;
+
+ /* setup RExC_close_parens, which holds the address of each
+ * CLOSE tag, and to make things simpler for the 0 index
+ * the end of the program - this is used later for offsets */
+ Newxz(RExC_close_parens, RExC_npar,regnode *);
+ SAVEFREEPV(RExC_close_parens);
+ /* we dont know where end op starts yet, so we dont
+ * need to set RExC_close_parens[0] like we do RExC_open_parens[0] above */
+
+ /* Note, RExC_npar is 1 + the number of parens in a pattern.
+ * So its 1 if there are no parens. */
+ RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
+ ((RExC_npar & 0x07) != 0);
+ Newx(RExC_study_chunk_recursed,
+ RExC_study_chunk_recursed_bytes * RExC_npar, U8);
+ SAVEFREEPV(RExC_study_chunk_recursed);
+ }
+ RExC_npar = 1;
if (reg(pRExC_state, 0, &flags,1) == NULL) {
ReREFCNT_dec(rx);
Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
}
+ DEBUG_OPTIMISE_r(
+ Perl_re_printf( aTHX_ "Starting post parse optimization\n");
+ );
+
/* XXXX To minimize changes to RE engine we always allocate
3-units-long substrs field. */
Newx(r->substrs, 1, struct reg_substr_data);
copyRExC_state = RExC_state;
} else {
U32 seen=RExC_seen;
- DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
+ DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
RExC_state = copyRExC_state;
if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
#ifdef TRIE_STUDY_OPT
DEBUG_PARSE_r(
if (!restudied)
- PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
+ Perl_re_printf( aTHX_ "first at %"IVdf"\n",
(IV)(first - scan + 1))
);
#else
DEBUG_PARSE_r(
- PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
+ Perl_re_printf( aTHX_ "first at %"IVdf"\n",
(IV)(first - scan + 1))
);
#endif
r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"synthetic stclass \"%s\".\n",
SvPVX_const(sv));});
data.start_class = NULL;
regnode_ssc ch_class;
SSize_t last_close = 0;
- DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
+ DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
scan = ri->program + 1;
ssc_init(pRExC_state, &ch_class);
r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"synthetic stclass \"%s\".\n",
SvPVX_const(sv));});
data.start_class = NULL;
/* Guard against an embedded (?=) or (?<=) with a longer minlen than
the "real" pattern. */
DEBUG_OPTIMISE_r({
- PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
+ Perl_re_printf( aTHX_ "minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
(IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
});
r->minlenret = minlen;
if (r->minlen < minlen)
r->minlen = minlen;
+ if (RExC_seen & REG_RECURSE_SEEN ) {
+ r->intflags |= PREGf_RECURSE_SEEN;
+ Newxz(r->recurse_locinput, r->nparens + 1, char *);
+ }
if (RExC_seen & REG_GPOS_SEEN)
r->intflags |= PREGf_GPOS_SEEN;
if (RExC_seen & REG_LOOKBEHIND_SEEN)
= (void*)SvREFCNT_inc(RExC_paren_name_list);
} else
#endif
- ri->name_list_idx = 0;
+ ri->name_list_idx = 0;
- if (RExC_recurse_count) {
- for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
- const regnode *scan = RExC_recurse[RExC_recurse_count-1];
- ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
- }
+ while ( RExC_recurse_count > 0 ) {
+ const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
+ ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - scan );
}
+
Newxz(r->offs, RExC_npar, regexp_paren_pair);
/* assume we don't need to swap parens around before we match */
DEBUG_TEST_r({
- PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
+ Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
(unsigned long)RExC_study_chunk_recursed_count);
});
DEBUG_DUMP_r({
DEBUG_RExC_seen();
- PerlIO_printf(Perl_debug_log,"Final program:\n");
+ Perl_re_printf( aTHX_ "Final program:\n");
regdump(r);
});
#ifdef RE_TRACK_PATTERN_OFFSETS
const STRLEN len = ri->u.offsets[0];
STRLEN i;
GET_RE_DEBUG_FLAGS_DECL;
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
for (i = 1; i <= len; i++) {
if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
- PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
+ Perl_re_printf( aTHX_ "%"UVuf":%"UVuf"[%"UVuf"] ",
(UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
}
- PerlIO_printf(Perl_debug_log, "\n");
+ Perl_re_printf( aTHX_ "\n");
});
#endif
#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
int num; \
if (RExC_lastparse!=RExC_parse) { \
- PerlIO_printf(Perl_debug_log, "%s", \
+ Perl_re_printf( aTHX_ "%s", \
Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
RExC_end - RExC_parse, 16, \
"", "", \
) \
); \
} else \
- PerlIO_printf(Perl_debug_log,"%16s",""); \
+ Perl_re_printf( aTHX_ "%16s",""); \
\
if (SIZE_ONLY) \
num = RExC_size + 1; \
else \
num=REG_NODE_NUM(RExC_emit); \
if (RExC_lastnum!=num) \
- PerlIO_printf(Perl_debug_log,"|%4d",num); \
+ Perl_re_printf( aTHX_ "|%4d",num); \
else \
- PerlIO_printf(Perl_debug_log,"|%4s",""); \
- PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
+ Perl_re_printf( aTHX_ "|%4s",""); \
+ Perl_re_printf( aTHX_ "|%*s%-4s", \
(int)((depth*2)), "", \
(funcname) \
); \
#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
DEBUG_PARSE_MSG((funcname)); \
- PerlIO_printf(Perl_debug_log,"%4s","\n"); \
+ Perl_re_printf( aTHX_ "%4s","\n"); \
})
-#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
+#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
DEBUG_PARSE_MSG((funcname)); \
- PerlIO_printf(Perl_debug_log,fmt "\n",args); \
+ Perl_re_printf( aTHX_ fmt "\n",args); \
})
/* This section of code defines the inversion list object and its methods. The
* as an SVt_INVLIST scalar.
*
* An inversion list for Unicode is an array of code points, sorted by ordinal
- * number. The zeroth element is the first code point in the list. The 1th
- * element is the first element beyond that not in the list. In other words,
- * the first range is
- * invlist[0]..(invlist[1]-1)
- * The other ranges follow. Thus every element whose index is divisible by two
- * marks the beginning of a range that is in the list, and every element not
- * divisible by two marks the beginning of a range not in the list. A single
- * element inversion list that contains the single code point N generally
- * consists of two elements
- * invlist[0] == N
- * invlist[1] == N+1
- * (The exception is when N is the highest representable value on the
- * machine, in which case the list containing just it would be a single
- * element, itself. By extension, if the last range in the list extends to
- * infinity, then the first element of that range will be in the inversion list
- * at a position that is divisible by two, and is the final element in the
- * list.)
+ * number. Each element gives the code point that begins a range that extends
+ * up-to but not including the code point given by the next element. The final
+ * element gives the first code point of a range that extends to the platform's
+ * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
+ * ...) give ranges whose code points are all in the inversion list. We say
+ * that those ranges are in the set. The odd-numbered elements give ranges
+ * whose code points are not in the inversion list, and hence not in the set.
+ * Thus, element [0] is the first code point in the list. Element [1]
+ * is the first code point beyond that not in the list; and element [2] is the
+ * first code point beyond that that is in the list. In other words, the first
+ * range is invlist[0]..(invlist[1]-1), and all code points in that range are
+ * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
+ * all code points in that range are not in the inversion list. The third
+ * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
+ * list, and so forth. Thus every element whose index is divisible by two
+ * gives the beginning of a range that is in the list, and every element whose
+ * index is not divisible by two gives the beginning of a range not in the
+ * list. If the final element's index is divisible by two, the inversion list
+ * extends to the platform's infinity; otherwise the highest code point in the
+ * inversion list is the contents of that element minus 1.
+ *
+ * A range that contains just a single code point N will look like
+ * invlist[i] == N
+ * invlist[i+1] == N+1
+ *
+ * If N is UV_MAX (the highest representable code point on the machine), N+1 is
+ * impossible to represent, so element [i+1] is omitted. The single element
+ * inversion list
+ * invlist[0] == UV_MAX
+ * contains just UV_MAX, but is interpreted as matching to infinity.
+ *
* Taking the complement (inverting) an inversion list is quite simple, if the
* first element is 0, remove it; otherwise add a 0 element at the beginning.
* This implementation reserves an element at the beginning of each inversion
* list to always contain 0; there is an additional flag in the header which
* indicates if the list begins at the 0, or is offset to begin at the next
- * element.
+ * element. This means that the inversion list can be inverted without any
+ * copying; just flip the flag.
*
* More about inversion lists can be found in "Unicode Demystified"
* Chapter 13 by Richard Gillam, published by Addison-Wesley.
- * More will be coming when functionality is added later.
*
* The inversion list data structure is currently implemented as an SV pointing
* to an array of UVs that the SV thinks are bytes. This allows us to have an
/* The header definitions are in F<invlist_inline.h> */
+#ifndef PERL_IN_XSUB_RE
+
PERL_STATIC_INLINE UV*
S__invlist_array_init(SV* const invlist, const bool will_have_0)
{
return zero_addr + *offset;
}
+#endif
+
PERL_STATIC_INLINE void
S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
{
#ifndef PERL_IN_XSUB_RE
+STATIC void
+S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
+{
+ /* Replaces the inversion list in 'src' with the one in 'dest'. It steals
+ * the list from 'src', so 'src' is made to have a NULL list. This is
+ * similar to what SvSetMagicSV() would do, if it were implemented on
+ * inversion lists, though this routine avoids a copy */
+
+ const UV src_len = _invlist_len(src);
+ const bool src_offset = *get_invlist_offset_addr(src);
+ const STRLEN src_byte_len = SvLEN(src);
+ char * array = SvPVX(src);
+
+ const int oldtainted = TAINT_get;
+
+ PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
+
+ assert(SvTYPE(src) == SVt_INVLIST);
+ assert(SvTYPE(dest) == SVt_INVLIST);
+ assert(! invlist_is_iterating(src));
+ assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
+
+ /* Make sure it ends in the right place with a NUL, as our inversion list
+ * manipulations aren't careful to keep this true, but sv_usepvn_flags()
+ * asserts it */
+ array[src_byte_len - 1] = '\0';
+
+ TAINT_NOT; /* Otherwise it breaks */
+ sv_usepvn_flags(dest,
+ (char *) array,
+ src_byte_len - 1,
+
+ /* This flag is documented to cause a copy to be avoided */
+ SV_HAS_TRAILING_NUL);
+ TAINT_set(oldtainted);
+ SvPV_set(src, 0);
+ SvLEN_set(src, 0);
+ SvCUR_set(src, 0);
+
+ /* Finish up copying over the other fields in an inversion list */
+ *get_invlist_offset_addr(dest) = src_offset;
+ invlist_set_len(dest, src_len, src_offset);
+ *get_invlist_previous_index_addr(dest) = 0;
+ invlist_iterfinish(dest);
+}
+
PERL_STATIC_INLINE IV*
S_get_invlist_previous_index_addr(SV* invlist)
{
}
PERL_STATIC_INLINE void
-S_invlist_trim(SV* const invlist)
+S_invlist_trim(SV* invlist)
{
+ /* Free the not currently-being-used space in an inversion list */
+
+ /* But don't free up the space needed for the 0 UV that is always at the
+ * beginning of the list, nor the trailing NUL */
+ const UV min_size = TO_INTERNAL_SIZE(1) + 1;
+
PERL_ARGS_ASSERT_INVLIST_TRIM;
assert(SvTYPE(invlist) == SVt_INVLIST);
- /* Change the length of the inversion list to how many entries it currently
- * has */
- SvPV_shrink_to_cur((SV *) invlist);
+ SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
+}
+
+PERL_STATIC_INLINE void
+S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
+{
+ PERL_ARGS_ASSERT_INVLIST_CLEAR;
+
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
+ invlist_set_len(invlist, 0, 0);
+ invlist_trim(invlist);
}
#endif /* ifndef PERL_IN_XSUB_RE */
return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
}
+#ifndef PERL_IN_XSUB_RE
+
PERL_STATIC_INLINE UV
S_invlist_max(SV* const invlist)
{
? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
: FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
}
-
-#ifndef PERL_IN_XSUB_RE
SV*
Perl__new_invlist(pTHX_ IV initial_size)
{
return invlist;
}
-#endif /* ifndef PERL_IN_XSUB_RE */
STATIC void
S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
UV final_element = len - 1;
array = invlist_array(invlist);
- if (array[final_element] > start
+ if ( array[final_element] > start
|| ELEMENT_RANGE_MATCHES_INVLIST(final_element))
{
Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
}
- /* Here, it is a legal append. If the new range begins with the first
- * value not in the set, it is extending the set, so the new first
- * value not in the set is one greater than the newly extended range.
- * */
+ /* Here, it is a legal append. If the new range begins 1 above the end
+ * of the range below it, it is extending the range below it, so the
+ * new first value not in the set is one greater than the newly
+ * extended range. */
offset = *get_invlist_offset_addr(invlist);
if (array[final_element] == start) {
if (end != UV_MAX) {
}
else {
/* But if the end is the maximum representable on the machine,
- * just let the range that this would extend to have no end */
+ * assume that infinity was actually what was meant. Just let
+ * the range that this would extend to have no end */
invlist_set_len(invlist, len - 1, offset);
}
return;
}
}
-#ifndef PERL_IN_XSUB_RE
-
-IV
+SSize_t
Perl__invlist_search(SV* const invlist, const UV cp)
{
/* Searches the inversion list for the entry that contains the input code
/* Take the union of two inversion lists and point <output> to it. *output
* SHOULD BE DEFINED upon input, and if it points to one of the two lists,
* the reference count to that list will be decremented if not already a
- * temporary (mortal); otherwise *output will be made correspondingly
- * mortal. The first list, <a>, may be NULL, in which case a copy of the
- * second list is returned. If <complement_b> is TRUE, the union is taken
- * of the complement (inversion) of <b> instead of b itself.
+ * temporary (mortal); otherwise just its contents will be modified to be
+ * the union. The first list, <a>, may be NULL, in which case a copy of
+ * the second list is returned. If <complement_b> is TRUE, the union is
+ * taken of the complement (inversion) of <b> instead of b itself.
*
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
* length there. The preface says to incorporate its examples into your
* code at your own risk.
*
- * The algorithm is like a merge sort.
- *
- * XXX A potential performance improvement is to keep track as we go along
- * if only one of the inputs contributes to the result, meaning the other
- * is a subset of that one. In that case, we can skip the final copy and
- * return the larger of the input lists, but then outside code might need
- * to keep track of whether to free the input list or not */
+ * The algorithm is like a merge sort. */
const UV* array_a; /* a's array */
const UV* array_b;
SV* u; /* the resulting union */
UV* array_u;
- UV len_u;
+ UV len_u = 0;
UV i_a = 0; /* current index into a's array */
UV i_b = 0;
/* running count, as explained in the algorithm source book; items are
* stopped accumulating and are output when the count changes to/from 0.
- * The count is incremented when we start a range that's in the set, and
- * decremented when we start a range that's not in the set. So its range
- * is 0 to 2. Only when the count is zero is something not in the set.
- */
+ * The count is incremented when we start a range that's in an input's set,
+ * and decremented when we start a range that's not in a set. So this
+ * variable can be 0, 1, or 2. When it is 0 neither input is in their set,
+ * and hence nothing goes into the union; 1, just one of the inputs is in
+ * its set (and its current range gets added to the union); and 2 when both
+ * inputs are in their sets. */
UV count = 0;
PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
assert(a != b);
- /* If either one is empty, the union is the other one */
- if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
- bool make_temp = FALSE; /* Should we mortalize the result? */
+ len_b = _invlist_len(b);
+ if (len_b == 0) {
- if (*output == a) {
- if (a != NULL) {
- if (! (make_temp = cBOOL(SvTEMP(a)))) {
- SvREFCNT_dec_NN(a);
- }
+ /* Here, 'b' is empty. If the output is the complement of 'b', the
+ * union is all possible code points, and we need not even look at 'a'.
+ * It's easiest to create a new inversion list that matches everything.
+ * */
+ if (complement_b) {
+ SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
+
+ /* If the output didn't exist, just point it at the new list */
+ if (*output == NULL) {
+ *output = everything;
+ return;
}
- }
- if (*output != b) {
- *output = invlist_clone(b);
- if (complement_b) {
- _invlist_invert(*output);
+
+ /* Otherwise, replace its contents with the new list */
+ invlist_replace_list_destroys_src(*output, everything);
+ SvREFCNT_dec_NN(everything);
+ return;
+ }
+
+ /* Here, we don't want the complement of 'b', and since it is empty,
+ * the union will come entirely from 'a'. If 'a' is NULL or empty, the
+ * output will be empty */
+
+ if (a == NULL) {
+ *output = _new_invlist(0);
+ return;
+ }
+
+ if (_invlist_len(a) == 0) {
+ invlist_clear(*output);
+ return;
+ }
+
+ /* Here, 'a' is not empty, and entirely determines the union. If the
+ * output is not to overwrite 'b', we can just return 'a'. */
+ if (*output != b) {
+
+ /* If the output is to overwrite 'a', we have a no-op, as it's
+ * already in 'a' */
+ if (*output == a) {
+ return;
}
- } /* else *output already = b; */
- if (make_temp) {
- sv_2mortal(*output);
+ /* But otherwise we have to copy 'a' to the output */
+ *output = invlist_clone(a);
+ return;
}
- return;
+
+ /* Here, 'b' is to be overwritten by the output, which will be 'a' */
+ u = invlist_clone(a);
+ invlist_replace_list_destroys_src(*output, u);
+ SvREFCNT_dec_NN(u);
+
+ return;
}
- else if ((len_b = _invlist_len(b)) == 0) {
- bool make_temp = FALSE;
- if (*output == b) {
- if (! (make_temp = cBOOL(SvTEMP(b)))) {
- SvREFCNT_dec_NN(b);
+
+ if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
+
+ /* Here, 'a' is empty (and b is not). That means the union will come
+ * entirely from 'b'. If the output is not to overwrite 'a', we can
+ * just return what's in 'b'. */
+ if (*output != a) {
+
+ /* If the output is to overwrite 'b', it's already in 'b', but
+ * otherwise we have to copy 'b' to the output */
+ if (*output != b) {
+ *output = invlist_clone(b);
}
- }
- /* The complement of an empty list is a list that has everything in it,
- * so the union with <a> includes everything too */
- if (complement_b) {
- if (a == *output) {
- if (! (make_temp = cBOOL(SvTEMP(a)))) {
- SvREFCNT_dec_NN(a);
- }
+ /* And if the output is to be the inversion of 'b', do that */
+ if (complement_b) {
+ _invlist_invert(*output);
}
- *output = _new_invlist(1);
- _append_range_to_invlist(*output, 0, UV_MAX);
+
+ return;
}
- else if (*output != a) {
- *output = invlist_clone(a);
+
+ /* Here, 'a', which is empty or even NULL, is to be overwritten by the
+ * output, which will either be 'b' or the complement of 'b' */
+
+ if (a == NULL) {
+ *output = invlist_clone(b);
}
- /* else *output already = a; */
+ else {
+ u = invlist_clone(b);
+ invlist_replace_list_destroys_src(*output, u);
+ SvREFCNT_dec_NN(u);
+ }
- if (make_temp) {
- sv_2mortal(*output);
+ if (complement_b) {
+ _invlist_invert(*output);
}
+
return;
}
u = _new_invlist(len_a + len_b);
/* Will contain U+0000 if either component does */
- array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
- || (len_b > 0 && array_b[0] == 0));
+ array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0)
+ || (len_b > 0 && array_b[0] == 0));
- /* Go through each list item by item, stopping when exhausted one of
+ /* Go through each input list item by item, stopping when exhausted one of
* them */
while (i_a < len_a && i_b < len_b) {
UV cp; /* The element to potentially add to the union's array */
/* We need to take one or the other of the two inputs for the union.
* Since we are merging two sorted lists, we take the smaller of the
- * next items. In case of a tie, we take the one that is in its set
- * first. If we took one not in the set first, it would decrement the
- * count, possibly to 0 which would cause it to be output as ending the
- * range, and the next time through we would take the same number, and
- * output it again as beginning the next range. By doing it the
- * opposite way, there is no possibility that the count will be
- * momentarily decremented to 0, and thus the two adjoining ranges will
- * be seamlessly merged. (In a tie and both are in the set or both not
- * in the set, it doesn't matter which we take first.) */
- if (array_a[i_a] < array_b[i_b]
- || (array_a[i_a] == array_b[i_b]
+ * next items. In case of a tie, we take first the one that is in its
+ * set. If we first took the one not in its set, it would decrement
+ * the count, possibly to 0 which would cause it to be output as ending
+ * the range, and the next time through we would take the same number,
+ * and output it again as beginning the next range. By doing it the
+ * opposite way, there is no possibility that the count will be
+ * momentarily decremented to 0, and thus the two adjoining ranges will
+ * be seamlessly merged. (In a tie and both are in the set or both not
+ * in the set, it doesn't matter which we take first.) */
+ if ( array_a[i_a] < array_b[i_b]
+ || ( array_a[i_a] == array_b[i_b]
&& ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
{
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
- cp= array_a[i_a++];
+ cp = array_a[i_a++];
}
else {
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
/* Here, have chosen which of the two inputs to look at. Only output
* if the running count changes to/from 0, which marks the
- * beginning/end of a range in that's in the set */
+ * beginning/end of a range that's in the set */
if (cp_in_set) {
if (count == 0) {
array_u[i_u++] = cp;
}
}
- /* Here, we are finished going through at least one of the lists, which
- * means there is something remaining in at most one. We check if the list
- * that hasn't been exhausted is positioned such that we are in the middle
- * of a range in its set or not. (i_a and i_b point to the element beyond
- * the one we care about.) If in the set, we decrement 'count'; if 0, there
- * is potentially more to output.
- * There are four cases:
- * 1) Both weren't in their sets, count is 0, and remains 0. What's left
- * in the union is entirely from the non-exhausted set.
- * 2) Both were in their sets, count is 2. Nothing further should
- * be output, as everything that remains will be in the exhausted
- * list's set, hence in the union; decrementing to 1 but not 0 insures
- * that
- * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
- * Nothing further should be output because the union includes
- * everything from the exhausted set. Not decrementing ensures that.
- * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
- * decrementing to 0 insures that we look at the remainder of the
- * non-exhausted set */
- if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
+
+ /* The loop above increments the index into exactly one of the input lists
+ * each iteration, and ends when either index gets to its list end. That
+ * means the other index is lower than its end, and so something is
+ * remaining in that one. We decrement 'count', as explained below, if
+ * that list is in its set. (i_a and i_b each currently index the element
+ * beyond the one we care about.) */
+ if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
|| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
{
count--;
}
- /* The final length is what we've output so far, plus what else is about to
- * be output. (If 'count' is non-zero, then the input list we exhausted
- * has everything remaining up to the machine's limit in its set, and hence
- * in the union, so there will be no further output. */
- len_u = i_u;
- if (count == 0) {
- /* At most one of the subexpressions will be non-zero */
- len_u += (len_a - i_a) + (len_b - i_b);
+ /* Above we decremented 'count' if the list that had unexamined elements in
+ * it was in its set. This has made it so that 'count' being non-zero
+ * means there isn't anything left to output; and 'count' equal to 0 means
+ * that what is left to output is precisely that which is left in the
+ * non-exhausted input list.
+ *
+ * To see why, note first that the exhausted input obviously has nothing
+ * left to add to the union. If it was in its set at its end, that means
+ * the set extends from here to the platform's infinity, and hence so does
+ * the union and the non-exhausted set is irrelevant. The exhausted set
+ * also contributed 1 to 'count'. If 'count' was 2, it got decremented to
+ * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
+ * 'count' remains at 1. This is consistent with the decremented 'count'
+ * != 0 meaning there's nothing left to add to the union.
+ *
+ * But if the exhausted input wasn't in its set, it contributed 0 to
+ * 'count', and the rest of the union will be whatever the other input is.
+ * If 'count' was 0, neither list was in its set, and 'count' remains 0;
+ * otherwise it gets decremented to 0. This is consistent with 'count'
+ * == 0 meaning the remainder of the union is whatever is left in the
+ * non-exhausted list. */
+ if (count != 0) {
+ len_u = i_u;
+ }
+ else {
+ IV copy_count = len_a - i_a;
+ if (copy_count > 0) { /* The non-exhausted input is 'a' */
+ Copy(array_a + i_a, array_u + i_u, copy_count, UV);
+ }
+ else { /* The non-exhausted input is b */
+ copy_count = len_b - i_b;
+ Copy(array_b + i_b, array_u + i_u, copy_count, UV);
+ }
+ len_u = i_u + copy_count;
}
- /* Set result to final length, which can change the pointer to array_u, so
- * re-find it */
+ /* Set the result to the final length, which can change the pointer to
+ * array_u, so re-find it. (Note that it is unlikely that this will
+ * change, as we are shrinking the space, not enlarging it) */
if (len_u != _invlist_len(u)) {
invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
invlist_trim(u);
array_u = invlist_array(u);
}
- /* When 'count' is 0, the list that was exhausted (if one was shorter than
- * the other) ended with everything above it not in its set. That means
- * that the remaining part of the union is precisely the same as the
- * non-exhausted list, so can just copy it unchanged. (If both list were
- * exhausted at the same time, then the operations below will be both 0.)
- */
- if (count == 0) {
- IV copy_count; /* At most one will have a non-zero copy count */
- if ((copy_count = len_a - i_a) > 0) {
- Copy(array_a + i_a, array_u + i_u, copy_count, UV);
- }
- else if ((copy_count = len_b - i_b) > 0) {
- Copy(array_b + i_b, array_u + i_u, copy_count, UV);
- }
+ /* If the output is not to overwrite either of the inputs, just return the
+ * calculated union */
+ if (a != *output && b != *output) {
+ *output = u;
}
+ else {
+ /* Here, the output is to be the same as one of the input scalars,
+ * hence replacing it. The simple thing to do is to free the input
+ * scalar, making it instead be the output one. But experience has
+ * shown [perl #127392] that if the input is a mortal, we can get a
+ * huge build-up of these during regex compilation before they get
+ * freed. So for that case, replace just the input's interior with
+ * the union's, and then free the union */
- /* We may be removing a reference to one of the inputs. If so, the output
- * is made mortal if the input was. (Mortal SVs shouldn't have their ref
- * count decremented) */
- if (a == *output || b == *output) {
assert(! invlist_is_iterating(*output));
- if ((SvTEMP(*output))) {
- sv_2mortal(u);
+
+ if (! SvTEMP(*output)) {
+ SvREFCNT_dec_NN(*output);
+ *output = u;
}
else {
- SvREFCNT_dec_NN(*output);
+ invlist_replace_list_destroys_src(*output, u);
+ SvREFCNT_dec_NN(u);
}
}
- *output = u;
-
return;
}
/* Take the intersection of two inversion lists and point <i> to it. *i
* SHOULD BE DEFINED upon input, and if it points to one of the two lists,
* the reference count to that list will be decremented if not already a
- * temporary (mortal); otherwise *i will be made correspondingly mortal.
- * The first list, <a>, may be NULL, in which case an empty list is
- * returned. If <complement_b> is TRUE, the result will be the
- * intersection of <a> and the complement (or inversion) of <b> instead of
- * <b> directly.
+ * temporary (mortal); otherwise just its contents will be modified to be
+ * the intersection. The first list, <a>, may be NULL, in which case an
+ * empty list is returned. If <complement_b> is TRUE, the result will be
+ * the intersection of <a> and the complement (or inversion) of <b> instead
+ * of <b> directly.
*
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
SV* r; /* the resulting intersection */
UV* array_r;
- UV len_r;
+ UV len_r = 0;
UV i_a = 0; /* current index into a's array */
UV i_b = 0;
UV i_r = 0;
- /* running count, as explained in the algorithm source book; items are
- * stopped accumulating and are output when the count changes to/from 2.
- * The count is incremented when we start a range that's in the set, and
- * decremented when we start a range that's not in the set. So its range
- * is 0 to 2. Only when the count is 2 is something in the intersection.
- */
+ /* running count of how many of the two inputs are postitioned at ranges
+ * that are in their sets. As explained in the algorithm source book,
+ * items are stopped accumulating and are output when the count changes
+ * to/from 2. The count is incremented when we start a range that's in an
+ * input's set, and decremented when we start a range that's not in a set.
+ * Only when it is 2 are we in the intersection. */
UV count = 0;
PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
/* Special case if either one is empty */
len_a = (a == NULL) ? 0 : _invlist_len(a);
if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
- bool make_temp = FALSE;
-
if (len_a != 0 && complement_b) {
- /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
- * be empty. Here, also we are using 'b's complement, which hence
- * must be every possible code point. Thus the intersection is
- * simply 'a'. */
- if (*i != a) {
- if (*i == b) {
- if (! (make_temp = cBOOL(SvTEMP(b)))) {
- SvREFCNT_dec_NN(b);
- }
- }
+ /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
+ * must be empty. Here, also we are using 'b's complement, which
+ * hence must be every possible code point. Thus the intersection
+ * is simply 'a'. */
- *i = invlist_clone(a);
+ if (*i == a) { /* No-op */
+ return;
}
- /* else *i is already 'a' */
- if (make_temp) {
- sv_2mortal(*i);
+ /* If not overwriting either input, just make a copy of 'a' */
+ if (*i != b) {
+ *i = invlist_clone(a);
+ return;
}
+
+ /* Here we are overwriting 'b' with 'a's contents */
+ r = invlist_clone(a);
+ invlist_replace_list_destroys_src(*i, r);
+ SvREFCNT_dec_NN(r);
return;
}
/* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
* intersection must be empty */
- if (*i == a) {
- if (a != NULL) {
- if (! (make_temp = cBOOL(SvTEMP(a)))) {
- SvREFCNT_dec_NN(a);
- }
- }
- }
- else if (*i == b) {
- if (! (make_temp = cBOOL(SvTEMP(b)))) {
- SvREFCNT_dec_NN(b);
- }
- }
- *i = _new_invlist(0);
- if (make_temp) {
- sv_2mortal(*i);
+ if (*i == NULL) {
+ *i = _new_invlist(0);
+ return;
}
+ invlist_clear(*i);
return;
}
r= _new_invlist(len_a + len_b);
/* Will contain U+0000 iff both components do */
- array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
- && len_b > 0 && array_b[0] == 0);
+ array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
+ && len_b > 0 && array_b[0] == 0);
/* Go through each list item by item, stopping when exhausted one of
* them */
/* We need to take one or the other of the two inputs for the
* intersection. Since we are merging two sorted lists, we take the
- * smaller of the next items. In case of a tie, we take the one that
- * is not in its set first (a difference from the union algorithm). If
- * we took one in the set first, it would increment the count, possibly
- * to 2 which would cause it to be output as starting a range in the
- * intersection, and the next time through we would take that same
- * number, and output it again as ending the set. By doing it the
- * opposite of this, there is no possibility that the count will be
- * momentarily incremented to 2. (In a tie and both are in the set or
- * both not in the set, it doesn't matter which we take first.) */
- if (array_a[i_a] < array_b[i_b]
- || (array_a[i_a] == array_b[i_b]
+ * smaller of the next items. In case of a tie, we take first the one
+ * that is not in its set (a difference from the union algorithm). If
+ * we first took the one in its set, it would increment the count,
+ * possibly to 2 which would cause it to be output as starting a range
+ * in the intersection, and the next time through we would take that
+ * same number, and output it again as ending the set. By doing the
+ * opposite of this, there is no possibility that the count will be
+ * momentarily incremented to 2. (In a tie and both are in the set or
+ * both not in the set, it doesn't matter which we take first.) */
+ if ( array_a[i_a] < array_b[i_b]
+ || ( array_a[i_a] == array_b[i_b]
&& ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
{
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
- cp= array_a[i_a++];
+ cp = array_a[i_a++];
}
else {
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
}
count--;
}
+
}
- /* Here, we are finished going through at least one of the lists, which
- * means there is something remaining in at most one. We check if the list
- * that has been exhausted is positioned such that we are in the middle
- * of a range in its set or not. (i_a and i_b point to elements 1 beyond
- * the ones we care about.) There are four cases:
- * 1) Both weren't in their sets, count is 0, and remains 0. There's
- * nothing left in the intersection.
- * 2) Both were in their sets, count is 2 and perhaps is incremented to
- * above 2. What should be output is exactly that which is in the
- * non-exhausted set, as everything it has is also in the intersection
- * set, and everything it doesn't have can't be in the intersection
- * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
- * gets incremented to 2. Like the previous case, the intersection is
- * everything that remains in the non-exhausted set.
- * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
- * remains 1. And the intersection has nothing more. */
- if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
- || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
+ /* The loop above increments the index into exactly one of the input lists
+ * each iteration, and ends when either index gets to its list end. That
+ * means the other index is lower than its end, and so something is
+ * remaining in that one. We increment 'count', as explained below, if the
+ * exhausted list was in its set. (i_a and i_b each currently index the
+ * element beyond the one we care about.) */
+ if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
+ || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
{
count++;
}
- /* The final length is what we've output so far plus what else is in the
- * intersection. At most one of the subexpressions below will be non-zero
- * */
- len_r = i_r;
- if (count >= 2) {
- len_r += (len_a - i_a) + (len_b - i_b);
+ /* Above we incremented 'count' if the exhausted list was in its set. This
+ * has made it so that 'count' being below 2 means there is nothing left to
+ * output; otheriwse what's left to add to the intersection is precisely
+ * that which is left in the non-exhausted input list.
+ *
+ * To see why, note first that the exhausted input obviously has nothing
+ * left to affect the intersection. If it was in its set at its end, that
+ * means the set extends from here to the platform's infinity, and hence
+ * anything in the non-exhausted's list will be in the intersection, and
+ * anything not in it won't be. Hence, the rest of the intersection is
+ * precisely what's in the non-exhausted list The exhausted set also
+ * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
+ * it means 'count' is now at least 2. This is consistent with the
+ * incremented 'count' being >= 2 means to add the non-exhausted list to
+ * the intersection.
+ *
+ * But if the exhausted input wasn't in its set, it contributed 0 to
+ * 'count', and the intersection can't include anything further; the
+ * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
+ * incremented. This is consistent with 'count' being < 2 meaning nothing
+ * further to add to the intersection. */
+ if (count < 2) { /* Nothing left to put in the intersection. */
+ len_r = i_r;
+ }
+ else { /* copy the non-exhausted list, unchanged. */
+ IV copy_count = len_a - i_a;
+ if (copy_count > 0) { /* a is the one with stuff left */
+ Copy(array_a + i_a, array_r + i_r, copy_count, UV);
+ }
+ else { /* b is the one with stuff left */
+ copy_count = len_b - i_b;
+ Copy(array_b + i_b, array_r + i_r, copy_count, UV);
+ }
+ len_r = i_r + copy_count;
}
- /* Set result to final length, which can change the pointer to array_r, so
- * re-find it */
+ /* Set the result to the final length, which can change the pointer to
+ * array_r, so re-find it. (Note that it is unlikely that this will
+ * change, as we are shrinking the space, not enlarging it) */
if (len_r != _invlist_len(r)) {
invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
invlist_trim(r);
}
}
- /* We may be removing a reference to one of the inputs. If so, the output
- * is made mortal if the input was. (Mortal SVs shouldn't have their ref
- * count decremented) */
- if (a == *i || b == *i) {
+ /* If the output is not to overwrite either of the inputs, just return the
+ * calculated intersection */
+ if (a != *i && b != *i) {
+ *i = r;
+ }
+ else {
+ /* Here, the output is to be the same as one of the input scalars,
+ * hence replacing it. The simple thing to do is to free the input
+ * scalar, making it instead be the output one. But experience has
+ * shown [perl #127392] that if the input is a mortal, we can get a
+ * huge build-up of these during regex compilation before they get
+ * freed. So for that case, replace just the input's interior with
+ * the output's, and then free the output. A short-cut in this case
+ * is if the output is empty, we can just set the input to be empty */
+
assert(! invlist_is_iterating(*i));
- if (SvTEMP(*i)) {
- sv_2mortal(r);
+
+ if (! SvTEMP(*i)) {
+ SvREFCNT_dec_NN(*i);
+ *i = r;
}
else {
- SvREFCNT_dec_NN(*i);
+ if (len_r) {
+ invlist_replace_list_destroys_src(*i, r);
+ }
+ else {
+ invlist_clear(*i);
+ }
+ SvREFCNT_dec_NN(r);
}
}
- *i = r;
-
return;
}
SV*
-Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
+Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
{
/* Add the range from 'start' to 'end' inclusive to the inversion list's
* set. A pointer to the inversion list is returned. This may actually be
* a new list, in which case the passed in one has been destroyed. The
* passed-in inversion list can be NULL, in which case a new one is created
- * with just the one range in it */
-
- SV* range_invlist;
- UV len;
-
+ * with just the one range in it. The new list is not necessarily
+ * NUL-terminated. Space is not freed if the inversion list shrinks as a
+ * result of this function. The gain would not be large, and in many
+ * cases, this is called multiple times on a single inversion list, so
+ * anything freed may almost immediately be needed again.
+ *
+ * This used to mostly call the 'union' routine, but that is much more
+ * heavyweight than really needed for a single range addition */
+
+ UV* array; /* The array implementing the inversion list */
+ UV len; /* How many elements in 'array' */
+ SSize_t i_s; /* index into the invlist array where 'start'
+ should go */
+ SSize_t i_e = 0; /* And the index where 'end' should go */
+ UV cur_highest; /* The highest code point in the inversion list
+ upon entry to this function */
+
+ /* This range becomes the whole inversion list if none already existed */
if (invlist == NULL) {
invlist = _new_invlist(2);
- len = 0;
+ _append_range_to_invlist(invlist, start, end);
+ return invlist;
}
- else {
- len = _invlist_len(invlist);
+
+ /* Likewise, if the inversion list is currently empty */
+ len = _invlist_len(invlist);
+ if (len == 0) {
+ _append_range_to_invlist(invlist, start, end);
+ return invlist;
}
- /* If comes after the final entry actually in the list, can just append it
- * to the end, */
- if (len == 0
- || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
- && start >= invlist_array(invlist)[len - 1]))
- {
- _append_range_to_invlist(invlist, start, end);
- return invlist;
+ /* Starting here, we have to know the internals of the list */
+ array = invlist_array(invlist);
+
+ /* If the new range ends higher than the current highest ... */
+ cur_highest = invlist_highest(invlist);
+ if (end > cur_highest) {
+
+ /* If the whole range is higher, we can just append it */
+ if (start > cur_highest) {
+ _append_range_to_invlist(invlist, start, end);
+ return invlist;
+ }
+
+ /* Otherwise, add the portion that is higher ... */
+ _append_range_to_invlist(invlist, cur_highest + 1, end);
+
+ /* ... and continue on below to handle the rest. As a result of the
+ * above append, we know that the index of the end of the range is the
+ * final even numbered one of the array. Recall that the final element
+ * always starts a range that extends to infinity. If that range is in
+ * the set (meaning the set goes from here to infinity), it will be an
+ * even index, but if it isn't in the set, it's odd, and the final
+ * range in the set is one less, which is even. */
+ if (end == UV_MAX) {
+ i_e = len;
+ }
+ else {
+ i_e = len - 2;
+ }
}
- /* Here, can't just append things, create and return a new inversion list
- * which is the union of this range and the existing inversion list. (If
- * the new range is well-behaved wrt to the old one, we could just insert
- * it, doing a Move() down on the tail of the old one (potentially growing
- * it first). But to determine that means we would have the extra
- * (possibly throw-away) work of first finding where the new one goes and
- * whether it disrupts (splits) an existing range, so it doesn't appear to
- * me (khw) that it's worth it) */
- range_invlist = _new_invlist(2);
- _append_range_to_invlist(range_invlist, start, end);
+ /* We have dealt with appending, now see about prepending. If the new
+ * range starts lower than the current lowest ... */
+ if (start < array[0]) {
+
+ /* Adding something which has 0 in it is somewhat tricky, and uncommon.
+ * Let the union code handle it, rather than having to know the
+ * trickiness in two code places. */
+ if (UNLIKELY(start == 0)) {
+ SV* range_invlist;
+
+ range_invlist = _new_invlist(2);
+ _append_range_to_invlist(range_invlist, start, end);
+
+ _invlist_union(invlist, range_invlist, &invlist);
+
+ SvREFCNT_dec_NN(range_invlist);
+
+ return invlist;
+ }
+
+ /* If the whole new range comes before the first entry, and doesn't
+ * extend it, we have to insert it as an additional range */
+ if (end < array[0] - 1) {
+ i_s = i_e = -1;
+ goto splice_in_new_range;
+ }
+
+ /* Here the new range adjoins the existing first range, extending it
+ * downwards. */
+ array[0] = start;
+
+ /* And continue on below to handle the rest. We know that the index of
+ * the beginning of the range is the first one of the array */
+ i_s = 0;
+ }
+ else { /* Not prepending any part of the new range to the existing list.
+ * Find where in the list it should go. This finds i_s, such that:
+ * invlist[i_s] <= start < array[i_s+1]
+ */
+ i_s = _invlist_search(invlist, start);
+ }
+
+ /* At this point, any extending before the beginning of the inversion list
+ * and/or after the end has been done. This has made it so that, in the
+ * code below, each endpoint of the new range is either in a range that is
+ * in the set, or is in a gap between two ranges that are. This means we
+ * don't have to worry about exceeding the array bounds.
+ *
+ * Find where in the list the new range ends (but we can skip this if we
+ * have already determined what it is, or if it will be the same as i_s,
+ * which we already have computed) */
+ if (i_e == 0) {
+ i_e = (start == end)
+ ? i_s
+ : _invlist_search(invlist, end);
+ }
+
+ /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
+ * is a range that goes to infinity there is no element at invlist[i_e+1],
+ * so only the first relation holds. */
+
+ if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
+
+ /* Here, the ranges on either side of the beginning of the new range
+ * are in the set, and this range starts in the gap between them.
+ *
+ * The new range extends the range above it downwards if the new range
+ * ends at or above that range's start */
+ const bool extends_the_range_above = ( end == UV_MAX
+ || end + 1 >= array[i_s+1]);
+
+ /* The new range extends the range below it upwards if it begins just
+ * after where that range ends */
+ if (start == array[i_s]) {
+
+ /* If the new range fills the entire gap between the other ranges,
+ * they will get merged together. Other ranges may also get
+ * merged, depending on how many of them the new range spans. In
+ * the general case, we do the merge later, just once, after we
+ * figure out how many to merge. But in the case where the new
+ * range exactly spans just this one gap (possibly extending into
+ * the one above), we do the merge here, and an early exit. This
+ * is done here to avoid having to special case later. */
+ if (i_e - i_s <= 1) {
+
+ /* If i_e - i_s == 1, it means that the new range terminates
+ * within the range above, and hence 'extends_the_range_above'
+ * must be true. (If the range above it extends to infinity,
+ * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
+ * will be 0, so no harm done.) */
+ if (extends_the_range_above) {
+ Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
+ invlist_set_len(invlist,
+ len - 2,
+ *(get_invlist_offset_addr(invlist)));
+ return invlist;
+ }
+
+ /* Here, i_e must == i_s. We keep them in sync, as they apply
+ * to the same range, and below we are about to decrement i_s
+ * */
+ i_e--;
+ }
+
+ /* Here, the new range is adjacent to the one below. (It may also
+ * span beyond the range above, but that will get resolved later.)
+ * Extend the range below to include this one. */
+ array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
+ i_s--;
+ start = array[i_s];
+ }
+ else if (extends_the_range_above) {
+
+ /* Here the new range only extends the range above it, but not the
+ * one below. It merges with the one above. Again, we keep i_e
+ * and i_s in sync if they point to the same range */
+ if (i_e == i_s) {
+ i_e++;
+ }
+ i_s++;
+ array[i_s] = start;
+ }
+ }
+
+ /* Here, we've dealt with the new range start extending any adjoining
+ * existing ranges.
+ *
+ * If the new range extends to infinity, it is now the final one,
+ * regardless of what was there before */
+ if (UNLIKELY(end == UV_MAX)) {
+ invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
+ return invlist;
+ }
+
+ /* If i_e started as == i_s, it has also been dealt with,
+ * and been updated to the new i_s, which will fail the following if */
+ if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
+
+ /* Here, the ranges on either side of the end of the new range are in
+ * the set, and this range ends in the gap between them.
+ *
+ * If this range is adjacent to (hence extends) the range above it, it
+ * becomes part of that range; likewise if it extends the range below,
+ * it becomes part of that range */
+ if (end + 1 == array[i_e+1]) {
+ i_e++;
+ array[i_e] = start;
+ }
+ else if (start <= array[i_e]) {
+ array[i_e] = end + 1;
+ i_e--;
+ }
+ }
+
+ if (i_s == i_e) {
+
+ /* If the range fits entirely in an existing range (as possibly already
+ * extended above), it doesn't add anything new */
+ if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
+ return invlist;
+ }
+
+ /* Here, no part of the range is in the list. Must add it. It will
+ * occupy 2 more slots */
+ splice_in_new_range:
+
+ invlist_extend(invlist, len + 2);
+ array = invlist_array(invlist);
+ /* Move the rest of the array down two slots. Don't include any
+ * trailing NUL */
+ Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
- _invlist_union(invlist, range_invlist, &invlist);
+ /* Do the actual splice */
+ array[i_e+1] = start;
+ array[i_e+2] = end + 1;
+ invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
+ return invlist;
+ }
- /* The temporary can be freed */
- SvREFCNT_dec_NN(range_invlist);
+ /* Here the new range crossed the boundaries of a pre-existing range. The
+ * code above has adjusted things so that both ends are in ranges that are
+ * in the set. This means everything in between must also be in the set.
+ * Just squash things together */
+ Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
+ invlist_set_len(invlist,
+ len - i_e + i_s,
+ *(get_invlist_offset_addr(invlist)));
return invlist;
}
PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
- _append_range_to_invlist(invlist, element0, element0);
+ invlist = add_cp_to_invlist(invlist, element0);
offset = *get_invlist_offset_addr(invlist);
invlist_set_len(invlist, size, offset);
: array[len - 1] - 1;
}
-SV *
+STATIC SV *
S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
{
/* Get the contents of an inversion list into a string SV so that they can
{
AV* list = (AV*) *listp;
IV k;
- for (k = 0; k <= av_tindex(list); k++) {
+ for (k = 0; k <= av_tindex_nomg(list); k++) {
SV** c_p = av_fetch(list, k, FALSE);
UV c;
assert(c_p);
if (RExC_flags & RXf_PMf_FOLD) {
RExC_contains_i = 1;
}
- if (PASS2) {
- STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
+
+ if (UNLIKELY((x_mod_count) > 1)) {
+ vFAIL("Only one /x regex modifier is allowed");
}
return;
/*NOTREACHED*/
* indivisible */
bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
- assert(RExC_parse < RExC_end);
+ if (RExC_parse >= RExC_end) {
+ vFAIL("Unmatched (");
+ }
if ( *RExC_parse == '*') { /* (*VERB:ARG) */
char *start_verb = RExC_parse + 1;
RExC_seen |= REG_LOOKBEHIND_SEEN;
RExC_in_lookbehind++;
RExC_parse++;
- assert(RExC_parse < RExC_end);
+ if (RExC_parse >= RExC_end) {
+ vFAIL("Sequence (?... not terminated");
+ }
+
/* FALLTHROUGH */
case '=': /* (?=...) */
RExC_seen_zerolen++;
break;
case '0' : /* (?0) */
case 'R' : /* (?R) */
- if (*RExC_parse != ')')
+ if (RExC_parse == RExC_end || *RExC_parse != ')')
FAIL("Sequence (?R) not terminated");
- ret = reg_node(pRExC_state, GOSTART);
- RExC_seen |= REG_GOSTART_SEEN;
+ num = 0;
+ RExC_seen |= REG_RECURSE_SEEN;
*flagp |= POSTPONED;
- nextchar(pRExC_state);
- return ret;
+ goto gen_recurse_regop;
/*notreached*/
/* named and numeric backreferences */
case '&': /* (?&NAME) */
} else if ( paren == '+' ) {
num = RExC_npar + num - 1;
}
+ /* We keep track how many GOSUB items we have produced.
+ To start off the ARG2L() of the GOSUB holds its "id",
+ which is used later in conjunction with RExC_recurse
+ to calculate the offset we need to jump for the GOSUB,
+ which it will store in the final representation.
+ We have to defer the actual calculation until much later
+ as the regop may move.
+ */
ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
if (!SIZE_ONLY) {
vFAIL("Reference to nonexistent group");
}
RExC_recurse_count++;
- DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
"%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
22, "| |", (int)(depth * 2 + 1), "",
(UV)ARG(ret), (IV)ARG2L(ret)));
}
RExC_seen |= REG_RECURSE_SEEN;
+
Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
Set_Node_Offset(ret, parse_start); /* MJD */
*flagp |= POSTPONED;
+ assert(*RExC_parse == ')');
nextchar(pRExC_state);
return ret;
}
else if (RExC_parse[0] == 'R') {
RExC_parse++;
+ /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
+ * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
+ * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
+ */
parno = 0;
- if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
+ if (RExC_parse[0] == '0') {
+ parno = 1;
+ RExC_parse++;
+ }
+ else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
UV uv;
if (grok_atoUV(RExC_parse, &uv, &endptr)
&& uv <= I32_MAX
) {
- parno = (I32)uv;
+ parno = (I32)uv + 1;
RExC_parse = (char*)endptr;
}
/* else "Switch condition not recognized" below */
SIZE_ONLY
? REG_RSN_RETURN_NULL
: REG_RSN_RETURN_DATA);
- parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
+
+ /* we should only have a false sv_dat when
+ * SIZE_ONLY is true, and we always have false
+ * sv_dat when SIZE_ONLY is true.
+ * reg_scan_name() will VFAIL() if the name is
+ * unknown when SIZE_ONLY is false, and otherwise
+ * will return something, and when SIZE_ONLY is
+ * true, reg_scan_name() just parses the string,
+ * and doesnt return anything. (in theory) */
+ assert(SIZE_ONLY ? !sv_dat : !!sv_dat);
+
+ if (sv_dat)
+ parno = 1 + *((I32 *)SvPVX(sv_dat));
}
ret = reganode(pRExC_state,INSUBP,parno);
goto insert_if_check_paren;
if (!SIZE_ONLY ){
if (!RExC_nestroot)
RExC_nestroot = parno;
- if (RExC_seen & REG_RECURSE_SEEN
- && !RExC_open_parens[parno-1])
+ if (RExC_open_parens && !RExC_open_parens[parno])
{
- DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
"%*s%*s Setting open paren #%"IVdf" to %d\n",
22, "| |", (int)(depth * 2 + 1), "",
(IV)parno, REG_NODE_NUM(ret)));
- RExC_open_parens[parno-1]= ret;
+ RExC_open_parens[parno]= ret;
}
}
Set_Node_Length(ret, 1); /* MJD */
break;
case 1: case 2:
ender = reganode(pRExC_state, CLOSE, parno);
- if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
- DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
+ if ( RExC_close_parens ) {
+ DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
"%*s%*s Setting close paren #%"IVdf" to %d\n",
22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
- RExC_close_parens[parno-1]= ender;
+ RExC_close_parens[parno]= ender;
if (RExC_nestroot == parno)
RExC_nestroot = 0;
}
case 0:
ender = reg_node(pRExC_state, END);
if (!SIZE_ONLY) {
- assert(!RExC_opend); /* there can only be one! */
- RExC_opend = ender;
+ assert(!RExC_end_op); /* there can only be one! */
+ RExC_end_op = ender;
+ if (RExC_close_parens) {
+ DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
+ "%*s%*s Setting close paren #0 (END) to %d\n",
+ 22, "| |", (int)(depth * 2 + 1), "", REG_NODE_NUM(ender)));
+
+ RExC_close_parens[0]= ender;
+ }
}
break;
}
DEBUG_PARSE_MSG("lsbr");
regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
- PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
+ Perl_re_printf( aTHX_ "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
SvPV_nolen_const(RExC_mysv1),
(IV)REG_NODE_NUM(lastbr),
SvPV_nolen_const(RExC_mysv2),
DEBUG_PARSE_MSG("NADA");
regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
- PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
+ Perl_re_printf( aTHX_ "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
SvPV_nolen_const(RExC_mysv1),
(IV)REG_NODE_NUM(ret),
SvPV_nolen_const(RExC_mysv2),
FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
}
else if (ret == NULL)
- ret = latest;
+ ret = latest;
*flagp |= flags&(HASWIDTH|POSTPONED);
if (chain == NULL) /* First piece. */
*flagp |= flags&SPSTART;
}
-/*
- * reg_recode
- *
- * It returns the code point in utf8 for the value in *encp.
- * value: a code value in the source encoding
- * encp: a pointer to an Encode object
- *
- * If the result from Encode is not a single character,
- * it returns U+FFFD (Replacement character) and sets *encp to NULL.
- */
-STATIC UV
-S_reg_recode(pTHX_ const U8 value, SV **encp)
-{
- STRLEN numlen = 1;
- SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP);
- const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
- const STRLEN newlen = SvCUR(sv);
- UV uv = UNICODE_REPLACEMENT;
-
- PERL_ARGS_ASSERT_REG_RECODE;
-
- if (newlen)
- uv = SvUTF8(sv)
- ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
- : *(U8*)s;
-
- if (!newlen || numlen != newlen) {
- uv = UNICODE_REPLACEMENT;
- *encp = NULL;
- }
- return uv;
-}
-
PERL_STATIC_INLINE U8
S_compute_EXACTish(RExC_state_t *pRExC_state)
{
/*
- regatom - the lowest level
- Try to identify anything special at the start of the pattern. If there
- is, then handle it as required. This may involve generating a single regop,
- such as for an assertion; or it may involve recursing, such as to
- handle a () structure.
+ Try to identify anything special at the start of the current parse position.
+ If there is, then handle it as required. This may involve generating a
+ single regop, such as for an assertion; or it may involve recursing, such as
+ to handle a () structure.
If the string doesn't start with something special then we gobble up
- as much literal text as we can.
+ as much literal text as we can. If we encounter a quantifier, we have to
+ back off the final literal character, as that quantifier applies to just it
+ and not to the whole string of literals.
Once we have been able to handle whatever type of thing started the
sequence, we return.
|| UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
|| UTF8_IS_START(UCHARAT(RExC_parse)));
+ /* Here, we have a literal character. Find the maximal string of
+ * them in the input that we can fit into a single EXACTish node.
+ * We quit at the first non-literal or when the node gets full */
for (p = RExC_parse;
len < upper_parse && p < RExC_end;
len++)
vFAIL(error_msg);
}
ender = result;
- if (IN_ENCODING && ender < 0x100) {
- goto recode_encoding;
- }
if (ender > 0xff) {
REQUIRE_UTF8(flagp);
}
if (RExC_recode_x_to_native) {
ender = LATIN1_TO_NATIVE(ender);
}
- else
#endif
- if (IN_ENCODING) {
- goto recode_encoding;
- }
}
else {
REQUIRE_UTF8(flagp);
form_short_octal_warning(p, numlen));
}
}
- if (IN_ENCODING && ender < 0x100)
- goto recode_encoding;
- break;
- recode_encoding:
- if (! RExC_override_recoding) {
- SV* enc = _get_encoding();
- ender = reg_recode((U8)ender, &enc);
- if (!enc && PASS2)
- ckWARNreg(p, "Invalid escape in the specified encoding");
- REQUIRE_UTF8(flagp);
- }
break;
case '\0':
if (p >= RExC_end)
} /* End of switch on '\' */
break;
case '{':
- /* Currently we don't warn when the lbrace is at the start
+ /* Currently we don't care if the lbrace is at the start
* of a construct. This catches it in the middle of a
* literal string, or when it's the first thing after
* something like "\b" */
- if (! SIZE_ONLY
- && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
- {
- ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
+ if (len || (p > RExC_start && isALPHA_A(*(p -1)))) {
+ RExC_parse = p + 1;
+ vFAIL("Unescaped left brace in regex is illegal here");
}
/*FALLTHROUGH*/
default: /* A literal character */
* the node, close the node with just them, and set up to do
* this character again next time through, when it will be the
* only thing in its new node */
+
if ((next_is_quantifier = ( LIKELY(p < RExC_end)
&& UNLIKELY(ISMULT2(p))))
&& LIKELY(len))
RExC_parse = p - 1;
Set_Node_Cur_Length(ret, parse_start);
RExC_parse = p;
- skip_to_be_ignored_text(pRExC_state, &RExC_parse,
- FALSE /* Don't force to /x */ );
{
/* len is STRLEN which is unsigned, need to copy to signed */
IV iv = len;
break;
} /* End of giant switch on input character */
+ /* Position parse to next real character */
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ FALSE /* Don't force to /x */ );
+ if (PASS2 && *RExC_parse == '{' && OP(ret) != SBOL && ! regcurly(RExC_parse)) {
+ ckWARNregdep(RExC_parse + 1, "Unescaped left brace in regex is deprecated here, passed through");
+ }
+
return(ret);
}
/* 'posix_warnings' and 'warn_text' are names of variables in the following
* routine. q.v. */
#define ADD_POSIX_WARNING(p, text) STMT_START { \
- if (posix_warnings && ( posix_warnings != (AV **) -1 \
- || (PASS2 && ckWARN(WARN_REGEXP)))) \
- { \
- if (! warn_text) warn_text = newAV(); \
- av_push(warn_text, Perl_newSVpvf(aTHX_ \
+ if (posix_warnings) { \
+ if (! RExC_warn_text ) RExC_warn_text = (AV *) sv_2mortal((SV *) newAV()); \
+ av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \
WARNING_PREFIX \
text \
REPORT_LOCATION, \
besides RExC_parse. */
char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
NULL */
- AV ** posix_warnings /* Where to place any generated warnings, or -1
- if to output them, or NULL */
+ AV ** posix_warnings, /* Where to place any generated warnings, or
+ NULL */
+ const bool check_only /* Don't die if error */
)
{
/* This parses what the caller thinks may be one of the three POSIX
* 'updated_parse_ptr' is not changed. No warnings nor errors are
* raised.
*
- * In b) there may be warnings and even errors generated. What to do about
- * these is determined by the 'posix_warnings' parameter. If it is NULL,
- * this call is treated as a check-only, scouting-out-the-territory call,
- * and no warnings nor errors are generated at all. Otherwise, any errors
- * are raised if found. If 'posix_warnings' is -1 (appropriately cast),
- * warnings are generated and displayed (in pass 2), just as they would be
- * for any other message of the same type from this file. If it isn't NULL
- * and not -1, warnings aren't displayed, but instead an AV is generated
- * with all the warning messages (that aren't to be ignored) stored into
- * it, so that the caller can output them if it wants. This is done in all
+ * In b) there may be errors or warnings generated. If 'check_only' is
+ * TRUE, then any errors are discarded. Warnings are returned to the
+ * caller via an AV* created into '*posix_warnings' if it is not NULL. If
+ * instead it is NULL, warnings are suppressed. This is done in all
* passes. The reason for this is that the rest of the parsing is heavily
* dependent on whether this routine found a valid posix class or not. If
- * it did, the closing ']' is absorbed as part of the class. If no class
+ * it did, the closing ']' is absorbed as part of the class. If no class,
* or an invalid one is found, any ']' will be considered the terminator of
* the outer bracketed character class, leading to very different results.
* In particular, a '(?[ ])' construct will likely have a syntax error if
bool has_opening_colon = FALSE;
int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
valid class */
- AV* warn_text = NULL; /* any warning messages */
const char * possible_end = NULL; /* used for a 2nd parse pass */
const char* name_start; /* ptr to class name first char */
PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
+ if (posix_warnings && RExC_warn_text)
+ av_clear(RExC_warn_text);
+
if (p >= e) {
return NOT_MEANT_TO_BE_A_POSIX_CLASS;
}
/* For [. .] and [= =]. These are quite different internally from [: :],
* so they are handled separately. */
- if (POSIXCC_NOTYET(*p)) {
+ if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
+ and 1 for at least one char in it
+ */
+ {
const char open_char = *p;
const char * temp_ptr = p + 1;
- unsigned int len = 0;
/* These two constructs are not handled by perl, and if we find a
- * syntactically valid one, we croak. It looks like just about any
- * byte can be in them, but they are likely very short, like [.ch.] to
- * denote a ligature 'ch' single character. If we find something that
- * started out to look like one of these constructs, but isn't, we
- * break so that it can be checked for being a class name with a typo
- * of '.' or '=' instead of a colon */
- while (temp_ptr < e) {
- len++;
-
- /* qr/[[.].]]/, for example, is valid. But otherwise we quit on an
- * unexpected ']'. It is possible, it appears, for such a ']' to
- * be not in the final position, but that's so unlikely that that
- * case is not handled. */
- if (*temp_ptr == ']' && temp_ptr[1] != open_char) {
- break;
- }
-
- /* XXX this could be cut down, but this value is certainly large
- * enough */
- if (len > 10) {
- break;
- }
+ * syntactically valid one, we croak. khw, who wrote this code, finds
+ * this explanation of them very unclear:
+ * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
+ * And searching the rest of the internet wasn't very helpful either.
+ * It looks like just about any byte can be in these constructs,
+ * depending on the locale. But unless the pattern is being compiled
+ * under /l, which is very rare, Perl runs under the C or POSIX locale.
+ * In that case, it looks like [= =] isn't allowed at all, and that
+ * [. .] could be any single code point, but for longer strings the
+ * constituent characters would have to be the ASCII alphabetics plus
+ * the minus-hyphen. Any sensible locale definition would limit itself
+ * to these. And any portable one definitely should. Trying to parse
+ * the general case is a nightmare (see [perl #127604]). So, this code
+ * looks only for interiors of these constructs that match:
+ * qr/.|[-\w]{2,}/
+ * Using \w relaxes the apparent rules a little, without adding much
+ * danger of mistaking something else for one of these constructs.
+ *
+ * [. .] in some implementations described on the internet is usable to
+ * escape a character that otherwise is special in bracketed character
+ * classes. For example [.].] means a literal right bracket instead of
+ * the ending of the class
+ *
+ * [= =] can legitimately contain a [. .] construct, but we don't
+ * handle this case, as that [. .] construct will later get parsed
+ * itself and croak then. And [= =] is checked for even when not under
+ * /l, as Perl has long done so.
+ *
+ * The code below relies on there being a trailing NUL, so it doesn't
+ * have to keep checking if the parse ptr < e.
+ */
+ if (temp_ptr[1] == open_char) {
+ temp_ptr++;
+ }
+ else while ( temp_ptr < e
+ && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
+ {
+ temp_ptr++;
+ }
- if (*temp_ptr == open_char) {
+ if (*temp_ptr == open_char) {
+ temp_ptr++;
+ if (*temp_ptr == ']') {
temp_ptr++;
- if (*temp_ptr == ']') {
- temp_ptr++;
- if (! found_problem && posix_warnings) {
- RExC_parse = (char *) temp_ptr;
- vFAIL3("POSIX syntax [%c %c] is reserved for future "
- "extensions", open_char, open_char);
- }
-
- /* Here, the syntax wasn't completely valid, or else the
- * call is to check-only */
- if (updated_parse_ptr) {
- *updated_parse_ptr = (char *) temp_ptr;
- }
-
- return OOB_NAMEDCLASS;
+ if (! found_problem && ! check_only) {
+ RExC_parse = (char *) temp_ptr;
+ vFAIL3("POSIX syntax [%c %c] is reserved for future "
+ "extensions", open_char, open_char);
}
- }
- else if (*temp_ptr == '\\') {
-
- /* A backslash is treate as like any other character, unless it
- * precedes a comment starter. XXX multiple backslashes in a
- * row are not handled specially here, nor would they ever
- * likely to be handled specially in one of these constructs */
- if (temp_ptr[1] == '#' && (RExC_flags & RXf_PMf_EXTENDED)) {
- temp_ptr++;
+
+ /* Here, the syntax wasn't completely valid, or else the call
+ * is to check-only */
+ if (updated_parse_ptr) {
+ *updated_parse_ptr = (char *) temp_ptr;
}
- temp_ptr++;
- }
- else if (*temp_ptr == '#' && (RExC_flags & RXf_PMf_EXTENDED)) {
- break; /* Under no circumstances can we look at the interior
- of a comment */
- }
- else if (*temp_ptr == '\n') { /* And we don't allow newlines
- either as it's extremely
- unlikely that one could be in an
- intended class */
- break;
- }
- else if (UTF && ! UTF8_IS_INVARIANT(*temp_ptr)) {
- /* XXX Since perl will never handle multi-byte locales, except
- * for UTF-8, we could break if we found a byte above latin1,
- * but perhaps the person intended to use one. */
- temp_ptr += UTF8SKIP(temp_ptr);
- }
- else {
- temp_ptr++;
+
+ return OOB_NAMEDCLASS;
}
}
+
+ /* If we find something that started out to look like one of these
+ * constructs, but isn't, we continue below so that it can be checked
+ * for being a class name with a typo of '.' or '=' instead of a colon.
+ * */
}
/* Here, we think there is a possibility that a [: :] class was meant, and
ADD_POSIX_WARNING(p, "there is no terminating ']'");
}
- if (warn_text) {
- if (posix_warnings != (AV **) -1) {
- *posix_warnings = warn_text;
- }
- else {
- SV * msg;
- while ((msg = av_shift(warn_text)) != &PL_sv_undef) {
- Perl_warner(aTHX_ packWARN(WARN_REGEXP),
- "%s", SvPVX(msg));
- SvREFCNT_dec_NN(msg);
- }
- SvREFCNT_dec_NN(warn_text);
- }
+ if (posix_warnings && RExC_warn_text && av_top_index(RExC_warn_text) > -1) {
+ *posix_warnings = RExC_warn_text;
}
}
else if (class_number != OOB_NAMEDCLASS) {
* one */
return class_number + complement;
}
- else if (posix_warnings) {
+ else if (! check_only) {
/* Here, it is an unrecognized class. This is an error (unless the
* call is to check only, which we've already handled above) */
'stack' of where the undealt-with left
parens would be if they were actually
put there */
- IV fence = 0; /* Position of where most recent undealt-
+ /* The 'VOL' (expanding to 'volatile') is a workaround for an optimiser bug
+ * in Solaris Studio 12.3. See RT #127455 */
+ VOL IV fence = 0; /* Position of where most recent undealt-
with left paren in stack is; -1 if none.
*/
STRLEN len; /* Temporary */
{
/* See if this is a [:posix:] class. */
bool is_posix_class = (OOB_NAMEDCLASS
- < handle_possible_posix(pRExC_state,
- RExC_parse + 1,
- NULL,
- NULL));
+ < handle_possible_posix(pRExC_state,
+ RExC_parse + 1,
+ NULL,
+ NULL,
+ TRUE /* checking only */));
/* If it is a posix class, leave the parse pointer at the
* '[' to fool regclass() into thinking it is part of a
* '[[:posix:]]'. */
no_close:
/* We output the messages even if warnings are off, because we'll fail
* the very next thing, and these give a likely diagnosis for that */
- if (posix_warnings) {
- SV * msg;
- while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
- Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
- SvREFCNT_dec_NN(msg);
- }
- SvREFCNT_dec_NN(posix_warnings);
+ if (posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
+ output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
}
FAIL("Syntax error in (?[...])");
redo_curchar:
- top_index = av_tindex(stack);
+ top_index = av_tindex_nomg(stack);
switch (curchar) {
SV** stacked_ptr; /* Ptr to something already on 'stack' */
}
/* Stack the position of this undealt-with left paren */
- fence = top_index + 1;
av_push(fence_stack, newSViv(fence));
+ fence = top_index + 1;
break;
case '\\':
{
/* See if this is a [:posix:] class. */
bool is_posix_class = (OOB_NAMEDCLASS
- < handle_possible_posix(pRExC_state,
- RExC_parse + 1,
- NULL,
- NULL));
+ < handle_possible_posix(pRExC_state,
+ RExC_parse + 1,
+ NULL,
+ NULL,
+ TRUE /* checking only */));
/* If it is a posix class, leave the parse pointer at the '['
* to fool regclass() into thinking it is part of a
* '[[:posix:]]'. */
goto done;
case ')':
- if (av_tindex(fence_stack) < 0) {
+ if (av_tindex_nomg(fence_stack) < 0) {
RExC_parse++;
vFAIL("Unexpected ')'");
}
- /* If at least two thing on the stack, treat this as an
+ /* If nothing after the fence, is missing an operand */
+ if (top_index - fence < 0) {
+ RExC_parse++;
+ goto bad_syntax;
+ }
+ /* If at least two things on the stack, treat this as an
* operator */
if (top_index - fence >= 1) {
goto join_operators;
handle_operand:
/* Here 'current' is the operand. If something is already on the
- * stack, we have to check if it is a !. */
- top_index = av_tindex(stack); /* Code above may have altered the
- * stack in the time since we
- * earlier set 'top_index'. */
+ * stack, we have to check if it is a !. But first, the code above
+ * may have altered the stack in the time since we earlier set
+ * 'top_index'. */
+
+ top_index = av_tindex_nomg(stack);
if (top_index - fence >= 0) {
/* If the top entry on the stack is an operator, it had better
* be a '!', otherwise the entry below the top operand should
only_to_avoid_leaks = av_pop(stack);
SvREFCNT_dec(only_to_avoid_leaks);
- top_index = av_tindex(stack);
/* And we redo with the inverted operand. This allows
* handling multiple ! in a row */
} /* End of loop parsing through the construct */
done:
- if (av_tindex(fence_stack) >= 0) {
+ if (av_tindex_nomg(fence_stack) >= 0) {
vFAIL("Unmatched (");
}
- if (av_tindex(stack) < 0 /* Was empty */
+ if (av_tindex_nomg(stack) < 0 /* Was empty */
|| ((final = av_pop(stack)) == NULL)
|| ! IS_OPERAND(final)
|| SvTYPE(final) != SVt_INVLIST
- || av_tindex(stack) >= 0) /* More left on stack */
+ || av_tindex_nomg(stack) >= 0) /* More left on stack */
{
bad_syntax:
SvREFCNT_dec(final);
}
}
+STATIC void
+S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings)
+{
+ /* If the final parameter is NULL, output the elements of the array given
+ * by '*posix_warnings' as REGEXP warnings. Otherwise, the elements are
+ * pushed onto it, (creating if necessary) */
+
+ SV * msg;
+ const bool first_is_fatal = ! return_posix_warnings
+ && ckDEAD(packWARN(WARN_REGEXP));
+
+ PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS;
+
+ while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
+ if (return_posix_warnings) {
+ if (! *return_posix_warnings) { /* mortalize to not leak if
+ warnings are fatal */
+ *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV());
+ }
+ av_push(*return_posix_warnings, msg);
+ }
+ else {
+ if (first_is_fatal) { /* Avoid leaking this */
+ av_undef(posix_warnings); /* This isn't necessary if the
+ array is mortal, but is a
+ fail-safe */
+ (void) sv_2mortal(msg);
+ if (PASS2) {
+ SAVEFREESV(RExC_rx_sv);
+ }
+ }
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
+ SvREFCNT_dec_NN(msg);
+ }
+ }
+}
+
STATIC AV *
S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
{
bool optimizable, /* ? Allow a non-ANYOF return
node */
SV** ret_invlist, /* Return an inversion list, not a node */
- AV** posix_warnings
+ AV** return_posix_warnings
)
{
/* parse a bracketed class specification. Most of these will produce an
const SSize_t orig_size = RExC_size;
bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
- /* This variable is used to mark where in the input something that looks
- * like a POSIX construct ends. During the parse, when something looks
- * like it could be such a construct is encountered, it is checked for
- * being one, but not if we've already checked this area of the input.
- * Only after this position is reached do we check again */
- char *dont_check_for_posix_end = RExC_parse - 1;
+ /* This variable is used to mark where the end in the input is of something
+ * that looks like a POSIX construct but isn't. During the parse, when
+ * something looks like it could be such a construct is encountered, it is
+ * checked for being one, but not if we've already checked this area of the
+ * input. Only after this position is reached do we check again */
+ char *not_posix_region_end = RExC_parse - 1;
+
+ AV* posix_warnings = NULL;
+ const bool do_posix_warnings = return_posix_warnings
+ || (PASS2 && ckWARN(WARN_REGEXP));
GET_RE_DEBUG_FLAGS_DECL;
allow_multi_folds = FALSE;
#endif
- if (posix_warnings == NULL) {
- posix_warnings = (AV **) -1;
- }
-
/* Assume we are going to generate an ANYOF node. */
ret = reganode(pRExC_state,
(LOC)
/* Check that they didn't say [:posix:] instead of [[:posix:]] */
if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
- char *class_end;
- int maybe_class = handle_possible_posix(pRExC_state, RExC_parse,
- &class_end, NULL);
- if (maybe_class >= OOB_NAMEDCLASS) {
- dont_check_for_posix_end = class_end;
- if (PASS2 && posix_warnings == (AV **) -1) {
- SAVEFREESV(RExC_rx_sv);
- ckWARN4reg(class_end,
- "POSIX syntax [%c %c] belongs inside character classes%s",
- *RExC_parse, *RExC_parse,
- (maybe_class == OOB_NAMEDCLASS)
- ? ((POSIXCC_NOTYET(*RExC_parse))
- ? " (but this one isn't implemented)"
- : " (but this one isn't fully valid)")
- : ""
- );
- (void)ReREFCNT_inc(RExC_rx_sv);
- }
- }
+ int maybe_class = handle_possible_posix(pRExC_state,
+ RExC_parse,
+ ¬_posix_region_end,
+ NULL,
+ TRUE /* checking only */);
+ if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
+ SAVEFREESV(RExC_rx_sv);
+ ckWARN4reg(not_posix_region_end,
+ "POSIX syntax [%c %c] belongs inside character classes%s",
+ *RExC_parse, *RExC_parse,
+ (maybe_class == OOB_NAMEDCLASS)
+ ? ((POSIXCC_NOTYET(*RExC_parse))
+ ? " (but this one isn't implemented)"
+ : " (but this one isn't fully valid)")
+ : ""
+ );
+ (void)ReREFCNT_inc(RExC_rx_sv);
+ }
}
/* If the caller wants us to just parse a single element, accomplish this
goto charclassloop;
while (1) {
+
+ if ( posix_warnings
+ && av_tindex_nomg(posix_warnings) >= 0
+ && RExC_parse > not_posix_region_end)
+ {
+ /* Warnings about posix class issues are considered tentative until
+ * we are far enough along in the parse that we can no longer
+ * change our mind, at which point we either output them or add
+ * them, if it has so specified, to what gets returned to the
+ * caller. This is done each time through the loop so that a later
+ * class won't zap them before they have been dealt with. */
+ output_or_return_posix_warnings(pRExC_state, posix_warnings,
+ return_posix_warnings);
+ }
+
if (RExC_parse >= stop_ptr) {
break;
}
value = UCHARAT(RExC_parse++);
if (value == '[') {
- namedclass = handle_possible_posix(pRExC_state, RExC_parse, &dont_check_for_posix_end, posix_warnings);
+ char * posix_class_end;
+ namedclass = handle_possible_posix(pRExC_state,
+ RExC_parse,
+ &posix_class_end,
+ do_posix_warnings ? &posix_warnings : NULL,
+ FALSE /* die if error */);
if (namedclass > OOB_NAMEDCLASS) {
- RExC_parse = dont_check_for_posix_end;
+
+ /* If there was an earlier attempt to parse this particular
+ * posix class, and it failed, it was a false alarm, as this
+ * successful one proves */
+ if ( posix_warnings
+ && av_tindex_nomg(posix_warnings) >= 0
+ && not_posix_region_end >= RExC_parse
+ && not_posix_region_end <= posix_class_end)
+ {
+ av_undef(posix_warnings);
+ }
+
+ RExC_parse = posix_class_end;
+ }
+ else if (namedclass == OOB_NAMEDCLASS) {
+ not_posix_region_end = posix_class_end;
}
else {
namedclass = OOB_NAMEDCLASS;
}
}
- else if ( RExC_parse - 1 > dont_check_for_posix_end
+ else if ( RExC_parse - 1 > not_posix_region_end
&& MAYBE_POSIXCC(value))
{
- (void) handle_possible_posix(pRExC_state, RExC_parse - 1, /* -1 because parse has already been advanced */
- &dont_check_for_posix_end, posix_warnings);
+ (void) handle_possible_posix(
+ pRExC_state,
+ RExC_parse - 1, /* -1 because parse has already been
+ advanced */
+ ¬_posix_region_end,
+ do_posix_warnings ? &posix_warnings : NULL,
+ TRUE /* checking only */);
}
else if (value == '\\') {
/* Is a backslash; get the code point of the char after it */
+
+ if (RExC_parse >= RExC_end) {
+ vFAIL("Unmatched [");
+ }
+
if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
SV* invlist;
char* name;
char* base_name; /* name after any packages are stripped */
+ char* lookup_name = NULL;
const char * const colon_colon = "::";
/* Try to get the definition of the property into
* will have its name be <__NAME_i>. The design is
* discussed in commit
* 2f833f5208e26b208886e51e09e2c072b5eabb46 */
- name = savepv(Perl_form(aTHX_
- "%s%.*s%s\n",
- (FOLD) ? "__" : "",
- (int)n,
- RExC_parse,
- (FOLD) ? "_i" : ""
- ));
+ name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
+ SAVEFREEPV(name);
+ if (FOLD) {
+ lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
+
+ /* The function call just below that uses this can fail
+ * to return, leaking memory if we don't do this */
+ SAVEFREEPV(lookup_name);
+ }
/* Look up the property name, and get its swash and
* inversion list, if the property is found */
SvREFCNT_dec(swash); /* Free any left-overs */
- swash = _core_swash_init("utf8", name, &PL_sv_undef,
+ swash = _core_swash_init("utf8",
+ (lookup_name)
+ ? lookup_name
+ : name,
+ &PL_sv_undef,
1, /* binary */
0, /* not tr/// */
NULL, /* No inversion list */
pkgname,
name);
n = strlen(full_name);
- Safefree(name);
name = savepvn(full_name, n);
+ SAVEFREEPV(name);
}
}
- Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
+ Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%"UTF8f"%s\n",
(value == 'p' ? '+' : '!'),
- UTF8fARG(UTF, n, name));
+ (FOLD) ? "__" : "",
+ UTF8fARG(UTF, n, name),
+ (FOLD) ? "_i" : "");
has_user_defined_property = TRUE;
optimizable = FALSE; /* Will have to leave this an
ANYOF node */
_invlist_union(properties, invlist, &properties);
}
}
- Safefree(name);
}
RExC_parse = e + 1;
namedclass = ANYOF_UNIPROP; /* no official name, but it's
}
}
non_portable_endpoint++;
- if (IN_ENCODING && value < 0x100) {
- goto recode_encoding;
- }
break;
case 'x':
RExC_parse--; /* function expects to be pointed at the 'x' */
}
}
non_portable_endpoint++;
- if (IN_ENCODING && value < 0x100)
- goto recode_encoding;
break;
case 'c':
value = grok_bslash_c(*RExC_parse++, PASS2);
}
}
non_portable_endpoint++;
- if (IN_ENCODING && value < 0x100)
- goto recode_encoding;
- break;
- }
- recode_encoding:
- if (! RExC_override_recoding) {
- SV* enc = _get_encoding();
- value = reg_recode((U8)value, &enc);
- if (!enc) {
- if (strict) {
- vFAIL("Invalid escape in the specified encoding");
- }
- else if (PASS2) {
- ckWARNreg(RExC_parse,
- "Invalid escape in the specified encoding");
- }
- }
break;
}
default:
else if (! SIZE_ONLY) {
/* Here, not in pass1 (in that pass we skip calculating the
- * contents of this class), and is /l, or is a POSIX class for
- * which /l doesn't matter (or is a Unicode property, which is
- * skipped here). */
+ * contents of this class), and is not /l, or is a POSIX class
+ * for which /l doesn't matter (or is a Unicode property, which
+ * is skipped here). */
if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
&cp_list);
}
}
- else if (UNI_SEMANTICS
+ else if ( UNI_SEMANTICS
|| classnum == _CC_ASCII
- || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
+ || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT
|| classnum == _CC_XDIGIT)))
{
/* We usually have to worry about /d and /a affecting what
range = 0; /* this range (if it was one) is done now */
} /* End of loop through all the text within the brackets */
+
+ if ( posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
+ output_or_return_posix_warnings(pRExC_state, posix_warnings,
+ return_posix_warnings);
+ }
+
/* If anything in the class expands to more than one character, we have to
* deal with them by building up a substitute parse string, and recursively
* calling reg() on it, instead of proceeding */
#endif
/* Look at the longest folds first */
- for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
+ for (cp_count = av_tindex_nomg(multi_char_matches);
+ cp_count > 0;
+ cp_count--)
+ {
if (av_exists(multi_char_matches, cp_count)) {
AV** this_array_ptr;
{
AV* list = (AV*) *listp;
IV k;
- for (k = 0; k <= av_tindex(list); k++) {
+ for (k = 0; k <= av_tindex_nomg(list); k++) {
SV** c_p = av_fetch(list, k, FALSE);
UV c;
assert(c_p);
SvREFCNT_dec_NN(cp_foldable_list);
}
- /* And combine the result (if any) with any inversion list from posix
+ /* And combine the result (if any) with any inversion lists from posix
* classes. The lists are kept separate up to now because we don't want to
* fold the classes (folding of those is automatically handled by the swash
* fetching code) */
- if (simple_posixes) {
- _invlist_union(cp_list, simple_posixes, &cp_list);
- SvREFCNT_dec_NN(simple_posixes);
+ if (simple_posixes) { /* These are the classes known to be unaffected by
+ /a, /aa, and /d */
+ if (cp_list) {
+ _invlist_union(cp_list, simple_posixes, &cp_list);
+ SvREFCNT_dec_NN(simple_posixes);
+ }
+ else {
+ cp_list = simple_posixes;
+ }
}
if (posixes || nposixes) {
- if (posixes && AT_LEAST_ASCII_RESTRICTED) {
+
+ /* We have to adjust /a and /aa */
+ if (AT_LEAST_ASCII_RESTRICTED) {
+
/* Under /a and /aa, nothing above ASCII matches these */
- _invlist_intersection(posixes,
- PL_XPosix_ptrs[_CC_ASCII],
- &posixes);
- }
- if (nposixes) {
- if (DEPENDS_SEMANTICS) {
- /* Under /d, everything in the upper half of the Latin1 range
- * matches these complements */
- ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+ if (posixes) {
+ _invlist_intersection(posixes,
+ PL_XPosix_ptrs[_CC_ASCII],
+ &posixes);
}
- else if (AT_LEAST_ASCII_RESTRICTED) {
- /* Under /a and /aa, everything above ASCII matches these
- * complements */
+
+ /* Under /a and /aa, everything above ASCII matches these
+ * complements */
+ if (nposixes) {
_invlist_union_complement_2nd(nposixes,
PL_XPosix_ptrs[_CC_ASCII],
&nposixes);
}
- if (posixes) {
- _invlist_union(posixes, nposixes, &posixes);
- SvREFCNT_dec_NN(nposixes);
- }
- else {
- posixes = nposixes;
- }
}
+
if (! DEPENDS_SEMANTICS) {
- if (cp_list) {
- _invlist_union(cp_list, posixes, &cp_list);
- SvREFCNT_dec_NN(posixes);
+
+ /* For everything but /d, we can just add the current 'posixes' and
+ * 'nposixes' to the main list */
+ if (posixes) {
+ if (cp_list) {
+ _invlist_union(cp_list, posixes, &cp_list);
+ SvREFCNT_dec_NN(posixes);
+ }
+ else {
+ cp_list = posixes;
+ }
}
- else {
- cp_list = posixes;
+ if (nposixes) {
+ if (cp_list) {
+ _invlist_union(cp_list, nposixes, &cp_list);
+ SvREFCNT_dec_NN(nposixes);
+ }
+ else {
+ cp_list = nposixes;
+ }
}
}
else {
- /* Under /d, we put into a separate list the Latin1 things that
- * match only when the target string is utf8 */
- SV* nonascii_but_latin1_properties = NULL;
- _invlist_intersection(posixes, PL_UpperLatin1,
- &nonascii_but_latin1_properties);
- _invlist_subtract(posixes, nonascii_but_latin1_properties,
- &posixes);
- if (cp_list) {
- _invlist_union(cp_list, posixes, &cp_list);
- SvREFCNT_dec_NN(posixes);
+ /* Under /d, things like \w match upper Latin1 characters only if
+ * the target string is in UTF-8. But things like \W match all the
+ * upper Latin1 characters if the target string is not in UTF-8.
+ *
+ * Handle the case where there something like \W separately */
+ if (nposixes) {
+ SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
+
+ /* A complemented posix class matches all upper Latin1
+ * characters if not in UTF-8. And it matches just certain
+ * ones when in UTF-8. That means those certain ones are
+ * matched regardless, so can just be added to the
+ * unconditional list */
+ if (cp_list) {
+ _invlist_union(cp_list, nposixes, &cp_list);
+ SvREFCNT_dec_NN(nposixes);
+ nposixes = NULL;
+ }
+ else {
+ cp_list = nposixes;
+ }
+
+ /* Likewise for 'posixes' */
+ _invlist_union(posixes, cp_list, &cp_list);
+
+ /* Likewise for anything else in the range that matched only
+ * under UTF-8 */
+ if (has_upper_latin1_only_utf8_matches) {
+ _invlist_union(cp_list,
+ has_upper_latin1_only_utf8_matches,
+ &cp_list);
+ SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
+ has_upper_latin1_only_utf8_matches = NULL;
+ }
+
+ /* If we don't match all the upper Latin1 characters regardless
+ * of UTF-8ness, we have to set a flag to match the rest when
+ * not in UTF-8 */
+ _invlist_subtract(only_non_utf8_list, cp_list,
+ &only_non_utf8_list);
+ if (_invlist_len(only_non_utf8_list) != 0) {
+ ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+ }
}
else {
- cp_list = posixes;
- }
-
- if (has_upper_latin1_only_utf8_matches) {
+ /* Here there were no complemented posix classes. That means
+ * the upper Latin1 characters in 'posixes' match only when the
+ * target string is in UTF-8. So we have to add them to the
+ * list of those types of code points, while adding the
+ * remainder to the unconditional list.
+ *
+ * First calculate what they are */
+ SV* nonascii_but_latin1_properties = NULL;
+ _invlist_intersection(posixes, PL_UpperLatin1,
+ &nonascii_but_latin1_properties);
+
+ /* And add them to the final list of such characters. */
_invlist_union(has_upper_latin1_only_utf8_matches,
nonascii_but_latin1_properties,
&has_upper_latin1_only_utf8_matches);
- SvREFCNT_dec_NN(nonascii_but_latin1_properties);
- }
- else {
- has_upper_latin1_only_utf8_matches
- = nonascii_but_latin1_properties;
+
+ /* Remove them from what now becomes the unconditional list */
+ _invlist_subtract(posixes, nonascii_but_latin1_properties,
+ &posixes);
+
+ /* And add those unconditional ones to the final list */
+ if (cp_list) {
+ _invlist_union(cp_list, posixes, &cp_list);
+ SvREFCNT_dec_NN(posixes);
+ posixes = NULL;
+ }
+ else {
+ cp_list = posixes;
+ }
+
+ SvREFCNT_dec(nonascii_but_latin1_properties);
+
+ /* Get rid of any characters that we now know are matched
+ * unconditionally from the conditional list, which may make
+ * that list empty */
+ _invlist_subtract(has_upper_latin1_only_utf8_matches,
+ cp_list,
+ &has_upper_latin1_only_utf8_matches);
+ if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
+ SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
+ has_upper_latin1_only_utf8_matches = NULL;
+ }
}
}
}
invlist_iterfinish(cp_list);
}
}
-
-#define MATCHES_ALL_NON_UTF8_NON_ASCII(ret) \
- ( DEPENDS_SEMANTICS \
- && (ANYOF_FLAGS(ret) \
- & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
-
- /* See if we can simplify things under /d */
- if ( has_upper_latin1_only_utf8_matches
- || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
+ else if ( DEPENDS_SEMANTICS
+ && ( has_upper_latin1_only_utf8_matches
+ || (ANYOF_FLAGS(ret) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
{
- /* But not if we are inverting, as that screws it up */
- if (! invert) {
- if (has_upper_latin1_only_utf8_matches) {
- if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
-
- /* Here, we have both the flag and inversion list. Any
- * character in 'has_upper_latin1_only_utf8_matches'
- * matches when UTF-8 is in effect, but it also matches
- * when UTF-8 is not in effect because of
- * MATCHES_ALL_NON_UTF8_NON_ASCII. Therefore it matches
- * unconditionally, so can be added to the regular list,
- * and 'has_upper_latin1_only_utf8_matches' cleared */
- _invlist_union(cp_list,
- has_upper_latin1_only_utf8_matches,
- &cp_list);
- SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
- has_upper_latin1_only_utf8_matches = NULL;
- }
- else if (cp_list) {
-
- /* Here, 'cp_list' gives chars that always match, and
- * 'has_upper_latin1_only_utf8_matches' gives chars that
- * were specified to match only if the target string is in
- * UTF-8. It may be that these overlap, so we can subtract
- * the unconditionally matching from the conditional ones,
- * to make the conditional list as small as possible,
- * perhaps even clearing it, in which case more
- * optimizations are possible later */
- _invlist_subtract(has_upper_latin1_only_utf8_matches,
- cp_list,
- &has_upper_latin1_only_utf8_matches);
- if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
- SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
- has_upper_latin1_only_utf8_matches = NULL;
- }
- }
- }
-
- /* Similarly, if the unconditional matches include every upper
- * latin1 character, we can clear that flag to permit later
- * optimizations */
- if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
- SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
- _invlist_subtract(only_non_utf8_list, cp_list,
- &only_non_utf8_list);
- if (_invlist_len(only_non_utf8_list) == 0) {
- ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
- }
- SvREFCNT_dec_NN(only_non_utf8_list);
- only_non_utf8_list = NULL;;
- }
- }
-
- /* If we haven't gotten rid of all conditional matching, we change the
- * regnode type to indicate that */
- if ( has_upper_latin1_only_utf8_matches
- || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
- {
- OP(ret) = ANYOFD;
- optimizable = FALSE;
- }
+ OP(ret) = ANYOFD;
+ optimizable = FALSE;
}
-#undef MATCHES_ALL_NON_UTF8_NON_ASCII
+
/* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
* at compile time. Besides not inverting folded locale now, we can't
si = *ary; /* ary[0] = the string to initialize the swash with */
- if (av_tindex(av) >= 2) {
+ if (av_tindex_nomg(av) >= 2) {
if (only_utf8_locale_ptr
&& ary[2]
&& ary[2] != &PL_sv_undef)
* is any inversion list generated at compile time; [4]
* indicates if that inversion list has any user-defined
* properties in it. */
- if (av_tindex(av) >= 3) {
+ if (av_tindex_nomg(av) >= 3) {
invlist = ary[3];
if (SvUV(ary[4])) {
swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
sv_catpvs(matches_string, " ");
} /* end of loop through the text */
+ assert(matches_string);
if (SvCUR(matches_string)) { /* Get rid of trailing blank */
SvCUR_set(matches_string, SvCUR(matches_string) - 1);
}
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
skip_to_be_ignored_text(pRExC_state, &RExC_parse,
- FALSE /* Don't assume /x */ );
+ FALSE /* Don't force /x */ );
}
}
RExC_size += size;
return;
}
-
+ assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
+ studying. If this is wrong then we need to adjust RExC_recurse
+ below like we do with RExC_open_parens/RExC_close_parens. */
src = RExC_emit;
RExC_emit += size;
dst = RExC_emit;
if (RExC_open_parens) {
int paren;
/*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
+ /* remember that RExC_npar is rex->nparens + 1,
+ * iow it is 1 more than the number of parens seen in
+ * the pattern so far. */
for ( paren=0 ; paren < RExC_npar ; paren++ ) {
- if ( RExC_open_parens[paren] >= opnd ) {
+ /* note, RExC_open_parens[0] is the start of the
+ * regex, it can't move. RExC_close_parens[0] is the end
+ * of the regex, it *can* move. */
+ if ( paren && RExC_open_parens[paren] >= opnd ) {
/*DEBUG_PARSE_FMT("open"," - %d",size);*/
RExC_open_parens[paren] += size;
} else {
}
}
}
+ if (RExC_end_op)
+ RExC_end_op += size;
while (src > opnd) {
StructCopy(--src, --dst, regnode);
DEBUG_PARSE_r({
DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
- PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
+ Perl_re_printf( aTHX_ "~ %s (%d) %s %s\n",
SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
(temp == NULL ? "->" : ""),
(temp == NULL ? PL_reg_name[OP(val)] : "")
DEBUG_PARSE_r({
DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
- PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
+ Perl_re_printf( aTHX_ "~ %s (%d) -> %s\n",
SvPV_nolen_const(RExC_mysv),
REG_NODE_NUM(scan),
PL_reg_name[exact]);
DEBUG_PARSE_r({
DEBUG_PARSE_MSG("");
regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"~ attach to %s (%"IVdf") offset to %"IVdf"\n",
SvPV_nolen_const(RExC_mysv),
(IV)REG_NODE_NUM(val),
for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
if (flags & (1<<bit)) {
if (!set++ && lead)
- PerlIO_printf(Perl_debug_log, "%s",lead);
- PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
+ Perl_re_printf( aTHX_ "%s",lead);
+ Perl_re_printf( aTHX_ "%s ",PL_reg_intflags_name[bit]);
}
}
if (lead) {
if (set)
- PerlIO_printf(Perl_debug_log, "\n");
+ Perl_re_printf( aTHX_ "\n");
else
- PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
+ Perl_re_printf( aTHX_ "%s[none-set]\n",lead);
}
}
continue;
}
if (!set++ && lead)
- PerlIO_printf(Perl_debug_log, "%s",lead);
- PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
+ Perl_re_printf( aTHX_ "%s",lead);
+ Perl_re_printf( aTHX_ "%s ",PL_reg_extflags_name[bit]);
}
}
if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
if (!set++ && lead) {
- PerlIO_printf(Perl_debug_log, "%s",lead);
+ Perl_re_printf( aTHX_ "%s",lead);
}
switch (cs) {
case REGEX_UNICODE_CHARSET:
- PerlIO_printf(Perl_debug_log, "UNICODE");
+ Perl_re_printf( aTHX_ "UNICODE");
break;
case REGEX_LOCALE_CHARSET:
- PerlIO_printf(Perl_debug_log, "LOCALE");
+ Perl_re_printf( aTHX_ "LOCALE");
break;
case REGEX_ASCII_RESTRICTED_CHARSET:
- PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
+ Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
break;
case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
- PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
+ Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
break;
default:
- PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
+ Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
break;
}
}
if (lead) {
if (set)
- PerlIO_printf(Perl_debug_log, "\n");
+ Perl_re_printf( aTHX_ "\n");
else
- PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
+ Perl_re_printf( aTHX_ "%s[none-set]\n",lead);
}
}
#endif
if (r->anchored_substr) {
RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
RE_SV_DUMPLEN(r->anchored_substr), 30);
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"anchored %s%s at %"IVdf" ",
s, RE_SV_TAIL(r->anchored_substr),
(IV)r->anchored_offset);
} else if (r->anchored_utf8) {
RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
RE_SV_DUMPLEN(r->anchored_utf8), 30);
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"anchored utf8 %s%s at %"IVdf" ",
s, RE_SV_TAIL(r->anchored_utf8),
(IV)r->anchored_offset);
if (r->float_substr) {
RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
RE_SV_DUMPLEN(r->float_substr), 30);
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"floating %s%s at %"IVdf"..%"UVuf" ",
s, RE_SV_TAIL(r->float_substr),
(IV)r->float_min_offset, (UV)r->float_max_offset);
} else if (r->float_utf8) {
RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
RE_SV_DUMPLEN(r->float_utf8), 30);
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"floating utf8 %s%s at %"IVdf"..%"UVuf" ",
s, RE_SV_TAIL(r->float_utf8),
(IV)r->float_min_offset, (UV)r->float_max_offset);
}
if (r->check_substr || r->check_utf8)
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
(const char *)
(r->check_substr == r->float_substr
&& r->check_utf8 == r->float_utf8
? "(checking floating" : "(checking anchored"));
if (r->intflags & PREGf_NOSCAN)
- PerlIO_printf(Perl_debug_log, " noscan");
+ Perl_re_printf( aTHX_ " noscan");
if (r->extflags & RXf_CHECK_ALL)
- PerlIO_printf(Perl_debug_log, " isall");
+ Perl_re_printf( aTHX_ " isall");
if (r->check_substr || r->check_utf8)
- PerlIO_printf(Perl_debug_log, ") ");
+ Perl_re_printf( aTHX_ ") ");
if (ri->regstclass) {
regprop(r, sv, ri->regstclass, NULL, NULL);
- PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
+ Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
}
if (r->intflags & PREGf_ANCH) {
- PerlIO_printf(Perl_debug_log, "anchored");
+ Perl_re_printf( aTHX_ "anchored");
if (r->intflags & PREGf_ANCH_MBOL)
- PerlIO_printf(Perl_debug_log, "(MBOL)");
+ Perl_re_printf( aTHX_ "(MBOL)");
if (r->intflags & PREGf_ANCH_SBOL)
- PerlIO_printf(Perl_debug_log, "(SBOL)");
+ Perl_re_printf( aTHX_ "(SBOL)");
if (r->intflags & PREGf_ANCH_GPOS)
- PerlIO_printf(Perl_debug_log, "(GPOS)");
- (void)PerlIO_putc(Perl_debug_log, ' ');
+ Perl_re_printf( aTHX_ "(GPOS)");
+ Perl_re_printf( aTHX_ " ");
}
if (r->intflags & PREGf_GPOS_SEEN)
- PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
+ Perl_re_printf( aTHX_ "GPOS:%"UVuf" ", (UV)r->gofs);
if (r->intflags & PREGf_SKIP)
- PerlIO_printf(Perl_debug_log, "plus ");
+ Perl_re_printf( aTHX_ "plus ");
if (r->intflags & PREGf_IMPLICIT)
- PerlIO_printf(Perl_debug_log, "implicit ");
- PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
+ Perl_re_printf( aTHX_ "implicit ");
+ Perl_re_printf( aTHX_ "minlen %"IVdf" ", (IV)r->minlen);
if (r->extflags & RXf_EVAL_SEEN)
- PerlIO_printf(Perl_debug_log, "with eval ");
- PerlIO_printf(Perl_debug_log, "\n");
+ Perl_re_printf( aTHX_ "with eval ");
+ Perl_re_printf( aTHX_ "\n");
DEBUG_FLAGS_r({
regdump_extflags("r->extflags: ",r->extflags);
regdump_intflags("r->intflags: ",r->intflags);
PERL_ARGS_ASSERT_REGPROP;
- sv_setpvn(sv, "", 0);
+ sv_setpvs(sv, "");
if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
/* It would be nice to FAIL() here, but this may be called from
: TRIE_BITMAP(trie)),
NULL,
NULL,
- NULL
+ NULL,
+ FALSE
);
sv_catpvs(sv, "]");
}
}
/* Paren and offset */
- Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
+ Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
+ (int)((o + (int)ARG2L(o)) - progi->program) );
if (name_list) {
SV **name= av_fetch(name_list, ARG(o), 0 );
if (name)
/* And things that aren't in the bitmap, but are small enough to be */
SV* bitmap_range_not_in_bitmap = NULL;
+ const bool inverted = flags & ANYOF_INVERT;
+
if (OP(o) == ANYOFL) {
if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
sv_catpvs(sv, "{utf8-locale-reqd}");
ANYOF_BITMAP(o),
bitmap_range_not_in_bitmap,
only_utf8_locale_invlist,
- o);
+ o,
+
+ /* Can't try inverting for a
+ * better display if there are
+ * things that haven't been
+ * resolved */
+ unresolved != NULL);
SvREFCNT_dec(bitmap_range_not_in_bitmap);
/* If there are user-defined properties which haven't been defined yet,
- * output them, in a separate [] from the bitmap range stuff */
+ * output them. If the result is not to be inverted, it is clearest to
+ * output them in a separate [] from the bitmap range stuff. If the
+ * result is to be complemented, we have to show everything in one [],
+ * as the inversion applies to the whole thing. Use {braces} to
+ * separate them from anything in the bitmap and anything above the
+ * bitmap. */
if (unresolved) {
- if (do_sep) {
- Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
+ if (inverted) {
+ if (! do_sep) { /* If didn't output anything in the bitmap */
+ sv_catpvs(sv, "^");
+ }
+ sv_catpvs(sv, "{");
}
- if (flags & ANYOF_INVERT) {
- sv_catpvs(sv, "^");
+ else if (do_sep) {
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
}
sv_catsv(sv, unresolved);
- do_sep = TRUE;
- SvREFCNT_dec_NN(unresolved);
+ if (inverted) {
+ sv_catpvs(sv, "}");
+ }
+ do_sep = ! inverted;
}
/* And, finally, add the above-the-bitmap stuff */
- if (nonbitmap_invlist) {
+ if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
SV* contents;
/* See if truncation size is overridden */
Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
}
- /* And, for easy of understanding, it is always output not-shown as
- * complemented */
- if (flags & ANYOF_INVERT) {
+ /* And, for easy of understanding, it is shown in the
+ * uncomplemented form if possible. The one exception being if
+ * there are unresolved items, where the inversion has to be
+ * delayed until runtime */
+ if (inverted && ! unresolved) {
_invlist_invert(nonbitmap_invlist);
_invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
}
/* And finally the matching, closing ']' */
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
+
+ SvREFCNT_dec(unresolved);
}
else if (k == POSIXD || k == NPOSIXD) {
U8 index = FLAGS(o) * 2;
? prog->check_utf8 : prog->check_substr);
if (!PL_colorset) reginitcolors();
- PerlIO_printf(Perl_debug_log,
+ Perl_re_printf( aTHX_
"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
PL_colors[4],
RX_UTF8(r) ? "utf8 " : "",
#endif
Safefree(r->offs);
SvREFCNT_dec(r->qr_anoncv);
+ if (r->recurse_locinput)
+ Safefree(r->recurse_locinput);
rx->sv_u.svu_rx = 0;
}
#endif
ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
SvREFCNT_inc_void(ret->qr_anoncv);
+ if (r->recurse_locinput)
+ Newxz(ret->recurse_locinput,r->nparens + 1,char *);
return ret_x;
}
SV *dsv= sv_newmortal();
RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
- PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
+ Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
PL_colors[4],PL_colors[5],s);
}
});
#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
/*
- re_dup - duplicate a regexp.
+ re_dup_guts - duplicate a regexp.
This routine is expected to clone a given regexp structure. It is only
compiled under USE_ITHREADS.
RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
+ if (r->recurse_locinput)
+ Newxz(ret->recurse_locinput,r->nparens + 1,char *);
if (ret->pprivate)
RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
char, regexp_internal);
Copy(ri->program, reti->program, len+1, regnode);
+
reti->num_code_blocks = ri->num_code_blocks;
if (ri->code_blocks) {
int n;
d->data[i] = ri->data->data[i];
break;
default:
- Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
+ Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
ri->data->what[i]);
}
}
* mnemonic names. Split off any of those at the beginning and end of
* the range to print mnemonically. It isn't possible for many of
* these to be in a row, so this won't overwhelm with output */
- while (isMNEMONIC_CNTRL(start) && start <= end) {
- put_code_point(sv, start);
- start++;
- }
- if (start < end && isMNEMONIC_CNTRL(end)) {
-
- /* Here, the final character in the range has a mnemonic name.
- * Work backwards from the end to find the final non-mnemonic */
- UV temp_end = end - 1;
- while (isMNEMONIC_CNTRL(temp_end)) {
- temp_end--;
+ if ( start <= end
+ && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
+ {
+ while (isMNEMONIC_CNTRL(start) && start <= end) {
+ put_code_point(sv, start);
+ start++;
}
- /* And separately output the interior range that doesn't start or
- * end with mnemonics */
- put_range(sv, start, temp_end, FALSE);
+ /* If this didn't take care of the whole range ... */
+ if (start <= end) {
- /* Then output the mnemonic trailing controls */
- start = temp_end + 1;
- while (start <= end) {
- put_code_point(sv, start);
- start++;
+ /* Look backwards from the end to find the final non-mnemonic
+ * */
+ UV temp_end = end;
+ while (isMNEMONIC_CNTRL(temp_end)) {
+ temp_end--;
+ }
+
+ /* And separately output the interior range that doesn't start
+ * or end with mnemonics */
+ put_range(sv, start, temp_end, FALSE);
+
+ /* Then output the mnemonic trailing controls */
+ start = temp_end + 1;
+ while (start <= end) {
+ put_code_point(sv, start);
+ start++;
+ }
+ break;
}
- break;
}
/* As a final resort, output the range or subrange as hex. */
)
{
/* Create and return an SV containing a displayable version of the bitmap
- * and associated information determined by the input parameters. */
+ * and associated information determined by the input parameters. If the
+ * output would have been only the inversion indicator '^', NULL is instead
+ * returned. */
SV * output;
}
}
- /* If the only thing we output is the '^', clear it */
if (invert && SvCUR(output) == 1) {
- SvCUR_set(output, 0);
+ return NULL;
}
return output;
char *bitmap,
SV *nonbitmap_invlist,
SV *only_utf8_locale_invlist,
- const regnode * const node)
+ const regnode * const node,
+ const bool force_as_is_display)
{
/* Appends to 'sv' a displayable version of the innards of the bracketed
* character class defined by the other arguments:
* 'node' is the regex pattern node. It is needed only when the above two
* parameters are not null, and is passed so that this routine can
* tease apart the various reasons for them.
+ * 'force_as_is_display' is TRUE if this routine should definitely NOT try
+ * to invert things to see if that leads to a cleaner display. If
+ * FALSE, this routine is free to use its judgment about doing this.
*
* It returns TRUE if there was actually something output. (It may be that
* the bitmap, etc is empty.)
*
* When called for outputting the bitmap of a non-ANYOF node, just pass the
- * bitmap, with the succeeding parameters set to NULL.
- *
+ * bitmap, with the succeeding parameters set to NULL, and the final one to
+ * FALSE.
*/
/* In general, it tries to display the 'cleanest' representation of the
* whether the class itself is to be inverted. However, there are some
* cases where it can't try inverting, as what actually matches isn't known
* until runtime, and hence the inversion isn't either. */
- bool inverting_allowed = TRUE;
+ bool inverting_allowed = ! force_as_is_display;
int i;
STRLEN orig_sv_cur = SvCUR(sv);
is UTF-8 */
SV* as_is_display; /* The output string when we take the inputs
- literally */
+ literally */
SV* inverted_display; /* The output string when we invert the inputs */
U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
/* And this flag for matching all non-ASCII 0xFF and below */
if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
{
- if (invert) {
- not_utf8 = _new_invlist(0);
- }
- else {
- not_utf8 = invlist_clone(PL_UpperLatin1);
- }
- inverting_allowed = FALSE; /* XXX needs more work to be able
- to allow this */
+ not_utf8 = invlist_clone(PL_UpperLatin1);
}
}
else if (OP(node) == ANYOFL) {
/* Accumulate the bit map into the unconditional match list */
for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
if (BITMAP_TEST(bitmap, i)) {
- invlist = add_cp_to_invlist(invlist, i);
+ int start = i++;
+ for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) {
+ /* empty */
+ }
+ invlist = _add_range_to_invlist(invlist, start, i-1);
}
}
/* If have to take the output as-is, just do that */
if (! inverting_allowed) {
- sv_catsv(sv, as_is_display);
+ if (as_is_display) {
+ sv_catsv(sv, as_is_display);
+ SvREFCNT_dec_NN(as_is_display);
+ }
}
else { /* But otherwise, create the output again on the inverted input, and
use whichever version is shorter */
* conditional code points, so that when inverted, they will be gone
* from it */
_invlist_union(only_utf8, invlist, &invlist);
+ _invlist_union(not_utf8, invlist, &invlist);
_invlist_union(only_utf8_locale, invlist, &invlist);
_invlist_invert(invlist);
_invlist_intersection(invlist, PL_InBitmap, &invlist);
_invlist_invert(only_utf8);
_invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
}
+ else if (not_utf8) {
- if (not_utf8) {
- _invlist_invert(not_utf8);
- _invlist_intersection(not_utf8, PL_UpperLatin1, ¬_utf8);
+ /* If a code point matches iff the target string is not in UTF-8,
+ * then complementing the result has it not match iff not in UTF-8,
+ * which is the same thing as matching iff it is UTF-8. */
+ only_utf8 = not_utf8;
+ not_utf8 = NULL;
}
if (only_utf8_locale) {
/* Use the shortest representation, taking into account our bias
* against showing it inverted */
- if (SvCUR(inverted_display) + inverted_bias
- < SvCUR(as_is_display) + as_is_bias)
+ if ( inverted_display
+ && ( ! as_is_display
+ || ( SvCUR(inverted_display) + inverted_bias
+ < SvCUR(as_is_display) + as_is_bias)))
{
sv_catsv(sv, inverted_display);
}
- else {
+ else if (as_is_display) {
sv_catsv(sv, as_is_display);
}
- SvREFCNT_dec_NN(as_is_display);
- SvREFCNT_dec_NN(inverted_display);
+ SvREFCNT_dec(as_is_display);
+ SvREFCNT_dec(inverted_display);
}
SvREFCNT_dec_NN(invlist);
return SvCUR(sv) > orig_sv_cur;
}
-#define CLEAR_OPTSTART \
+#define CLEAR_OPTSTART \
if (optstart) STMT_START { \
- DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
+ DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \
" (%"IVdf" nodes)\n", (IV)(node - optstart))); \
- optstart=NULL; \
+ optstart=NULL; \
} STMT_END
#define DUMPUNTIL(b,e) \
PERL_ARGS_ASSERT_DUMPUNTIL;
#ifdef DEBUG_DUMPUNTIL
- PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
+ Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n",indent,node-start,
last ? last-start : 0,plast ? plast-start : 0);
#endif
CLEAR_OPTSTART;
regprop(r, sv, node, NULL, NULL);
- PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
+ Perl_re_printf( aTHX_ "%4"IVdf":%*s%s", (IV)(node - start),
(int)(2*indent + 1), "", SvPVX_const(sv));
if (OP(node) != OPTIMIZED) {
if (next == NULL) /* Next ptr. */
- PerlIO_printf(Perl_debug_log, " (0)");
+ Perl_re_printf( aTHX_ " (0)");
else if (PL_regkind[(U8)op] == BRANCH
&& PL_regkind[OP(next)] != BRANCH )
- PerlIO_printf(Perl_debug_log, " (FAIL)");
+ Perl_re_printf( aTHX_ " (FAIL)");
else
- PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
- (void)PerlIO_putc(Perl_debug_log, '\n');
+ Perl_re_printf( aTHX_ " (%"IVdf")", (IV)(next - start));
+ Perl_re_printf( aTHX_ "\n");
}
after_print:
for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
- PerlIO_printf(Perl_debug_log, "%*s%s ",
- (int)(2*(indent+3)), "",
+ Perl_re_indentf( aTHX_ "%s ",
+ indent+3,
elem_ptr
? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
SvCUR(*elem_ptr), 60,
);
if (trie->jump) {
U16 dist= trie->jump[word_idx+1];
- PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
+ Perl_re_printf( aTHX_ "(%"UVuf")\n",
(UV)((dist ? this_trie + dist : next) - start));
if (dist) {
if (!nextbranch)
if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
nextbranch= regnext((regnode *)nextbranch);
} else {
- PerlIO_printf(Perl_debug_log, "\n");
+ Perl_re_printf( aTHX_ "\n");
}
}
if (last && next > last)
}
CLEAR_OPTSTART;
#ifdef DEBUG_DUMPUNTIL
- PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
+ Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
#endif
return node;
}