/* Certain characters are output as a sequence with the first being a
* backslash. */
-#define isBACKSLASHED_PUNCT(c) \
- ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
+#define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
struct RExC_state_t {
U32 study_chunk_recursed_bytes; /* bytes in bitmap */
I32 in_lookbehind;
I32 contains_locale;
- I32 contains_i;
I32 override_recoding;
#ifdef EBCDIC
I32 recode_x_to_native;
#endif
I32 in_multi_char_class;
- struct reg_code_block *code_blocks; /* positions of literal (?{})
+ struct reg_code_blocks *code_blocks;/* positions of literal (?{})
within pattern */
- int num_code_blocks; /* size of code_blocks[] */
int code_index; /* next code_blocks[] slot */
SSize_t maxlen; /* mininum possible number of chars in string to match */
scan_frame *frame_head;
(pRExC_state->study_chunk_recursed_bytes)
#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
#define RExC_contains_locale (pRExC_state->contains_locale)
-#define RExC_contains_i (pRExC_state->contains_i)
-#define RExC_override_recoding (pRExC_state->override_recoding)
#ifdef EBCDIC
# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
#endif
For each string some basic information is maintained:
- - offset or min_offset
+ - min_offset
This is the position the string must appear at, or not before.
It also implicitly (when combined with minlenp) tells us how many
characters must match before the string we are searching for.
Only used for floating strings. This is the rightmost point that
the string can appear at. If set to SSize_t_MAX it indicates that the
string can occur infinitely far to the right.
+ For fixed strings, it is equal to min_offset.
- minlenp
A pointer to the minimum number of characters of the pattern that the
*/
+struct scan_data_substrs {
+ SV *str; /* longest substring found in pattern */
+ SSize_t min_offset; /* earliest point in string it can appear */
+ SSize_t max_offset; /* latest point in string it can appear */
+ SSize_t *minlenp; /* pointer to the minlen relevant to the string */
+ SSize_t lookbehind; /* is the pos of the string modified by LB */
+ I32 flags; /* per substring SF_* and SCF_* flags */
+};
+
typedef struct scan_data_t {
/*I32 len_min; unused */
/*I32 len_delta; unused */
SSize_t last_end; /* min value, <0 unless valid. */
SSize_t last_start_min;
SSize_t last_start_max;
- SV **longest; /* Either &l_fixed, or &l_float. */
- SV *longest_fixed; /* longest fixed string found in pattern */
- SSize_t offset_fixed; /* offset where it starts */
- SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
- I32 lookbehind_fixed; /* is the position of the string modfied by LB */
- SV *longest_float; /* longest floating string found in pattern */
- SSize_t offset_float_min; /* earliest point in string it can appear */
- SSize_t offset_float_max; /* latest point in string it can appear */
- SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
- SSize_t lookbehind_float; /* is the pos of the string modified by LB */
- I32 flags;
+ U8 cur_is_floating; /* whether the last_* values should be set as
+ * the next fixed (0) or floating (1)
+ * substring */
+
+ /* [0] is longest fixed substring so far, [1] is longest float so far */
+ struct scan_data_substrs substrs[2];
+
+ I32 flags; /* common SF_* and SCF_* flags */
I32 whilem_c;
SSize_t *last_closep;
regnode_ssc *start_class;
* Forward declarations for pregcomp()'s friends.
*/
-static const scan_data_t zero_scan_data =
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
+static const scan_data_t zero_scan_data = {
+ 0, 0, NULL, 0, 0, 0, 0,
+ {
+ { NULL, 0, 0, 0, 0, 0 },
+ { NULL, 0, 0, 0, 0, 0 },
+ },
+ 0, 0, NULL, NULL
+};
+
+/* study flags */
-#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
#define SF_BEFORE_SEOL 0x0001
#define SF_BEFORE_MEOL 0x0002
-#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
-#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
-
-#define SF_FIX_SHIFT_EOL (+2)
-#define SF_FL_SHIFT_EOL (+4)
-
-#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
-#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
+#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
-#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
-#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
#define SF_IS_INF 0x0040
#define SF_HAS_PAR 0x0080
#define SF_IN_PAR 0x0100
#define OOB_UNICODE 0xDEADBEEF
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
-#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
/* length of regex to show in messages that don't mark a position within */
#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
#define REPORT_LOCATION " in regex; marked by " MARKER1 \
- " in m/%"UTF8f MARKER2 "%"UTF8f"/"
+ " in m/%" UTF8f MARKER2 "%" UTF8f "/"
/* The code in this file in places uses one level of recursion with parsing
* rebased to an alternate string constructed by us in memory. This can take
} STMT_END
#define FAIL(msg) _FAIL( \
- Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
+ Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
#define FAIL2(msg,arg) _FAIL( \
- Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
+ Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
/*
#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
-#define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
- if ( ( flags ) ) { \
- Perl_re_printf( aTHX_ "%s", open_str); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
- Perl_re_printf( aTHX_ "%s", close_str); \
- }
-
-
-#define DEBUG_STUDYDATA(str,data,depth) \
-DEBUG_OPTIMISE_MORE_r(if(data){ \
- Perl_re_indentf( aTHX_ "" str "Pos:%"IVdf"/%"IVdf \
- " Flags: 0x%"UVXf, \
- depth, \
- (IV)((data)->pos_min), \
- (IV)((data)->pos_delta), \
- (UV)((data)->flags) \
- ); \
- DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
- Perl_re_printf( aTHX_ \
- " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
- (IV)((data)->whilem_c), \
- (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
- is_inf ? "INF " : "" \
- ); \
- if ((data)->last_found) \
- Perl_re_printf( aTHX_ \
- "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
- " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
- SvPVX_const((data)->last_found), \
- (IV)((data)->last_end), \
- (IV)((data)->last_start_min), \
- (IV)((data)->last_start_max), \
- ((data)->longest && \
- (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
- SvPVX_const((data)->longest_fixed), \
- (IV)((data)->offset_fixed), \
- ((data)->longest && \
- (data)->longest==&((data)->longest_float)) ? "*" : "", \
- SvPVX_const((data)->longest_float), \
- (IV)((data)->offset_float_min), \
- (IV)((data)->offset_float_max) \
- ); \
- Perl_re_printf( aTHX_ "\n"); \
-});
+
+#ifdef DEBUGGING
+static void
+S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
+ const char *close_str)
+{
+ if (!flags)
+ return;
+
+ Perl_re_printf( aTHX_ "%s", open_str);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
+ Perl_re_printf( aTHX_ "%s", close_str);
+}
+
+
+static void
+S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
+ U32 depth, int is_inf)
+{
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ DEBUG_OPTIMISE_MORE_r({
+ if (!data)
+ return;
+ Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
+ depth,
+ where,
+ (IV)data->pos_min,
+ (IV)data->pos_delta,
+ (UV)data->flags
+ );
+
+ S_debug_show_study_flags(aTHX_ data->flags," [","]");
+
+ Perl_re_printf( aTHX_
+ " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
+ (IV)data->whilem_c,
+ (IV)(data->last_closep ? *((data)->last_closep) : -1),
+ is_inf ? "INF " : ""
+ );
+
+ if (data->last_found) {
+ int i;
+ Perl_re_printf(aTHX_
+ "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
+ SvPVX_const(data->last_found),
+ (IV)data->last_end,
+ (IV)data->last_start_min,
+ (IV)data->last_start_max
+ );
+
+ for (i = 0; i < 2; i++) {
+ Perl_re_printf(aTHX_
+ " %s%s: '%s' @ %" IVdf "/%" IVdf,
+ data->cur_is_floating == i ? "*" : "",
+ i ? "Float" : "Fixed",
+ SvPVX_const(data->substrs[i].str),
+ (IV)data->substrs[i].min_offset,
+ (IV)data->substrs[i].max_offset
+ );
+ S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
+ }
+ }
+
+ Perl_re_printf( aTHX_ "\n");
+ });
+}
+
+
+static void
+S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
+ regnode *scan, U32 depth, U32 flags)
+{
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ DEBUG_OPTIMISE_r({
+ regnode *Next;
+
+ if (!scan)
+ return;
+ Next = regnext(scan);
+ regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
+ Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
+ depth,
+ str,
+ REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
+ Next ? (REG_NODE_NUM(Next)) : 0 );
+ S_debug_show_study_flags(aTHX_ flags," [ ","]");
+ Perl_re_printf( aTHX_ "\n");
+ });
+}
+
+
+# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
+ S_debug_studydata(aTHX_ where, data, depth, is_inf)
+
+# define DEBUG_PEEP(str, scan, depth, flags) \
+ S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
+
+#else
+# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
+# define DEBUG_PEEP(str, scan, depth, flags) NOOP
+#endif
/* =========================================================
}
/* Mark that we cannot extend a found fixed substring at this point.
- Update the longest found anchored substring and the longest found
+ Update the longest found anchored substring or the longest found
floating substrings if needed. */
STATIC void
SSize_t *minlenp, int is_inf)
{
const STRLEN l = CHR_SVLEN(data->last_found);
- const STRLEN old_l = CHR_SVLEN(*data->longest);
+ SV * const longest_sv = data->substrs[data->cur_is_floating].str;
+ const STRLEN old_l = CHR_SVLEN(longest_sv);
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_SCAN_COMMIT;
if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
- SvSetMagicSV(*data->longest, data->last_found);
- if (*data->longest == data->longest_fixed) {
- data->offset_fixed = l ? data->last_start_min : data->pos_min;
- if (data->flags & SF_BEFORE_EOL)
- data->flags
- |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
- else
- data->flags &= ~SF_FIX_BEFORE_EOL;
- data->minlen_fixed=minlenp;
- data->lookbehind_fixed=0;
- }
- else { /* *data->longest == data->longest_float */
- data->offset_float_min = l ? data->last_start_min : data->pos_min;
- data->offset_float_max = (l
+ const U8 i = data->cur_is_floating;
+ SvSetMagicSV(longest_sv, data->last_found);
+ data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
+
+ if (!i) /* fixed */
+ data->substrs[0].max_offset = data->substrs[0].min_offset;
+ else { /* float */
+ data->substrs[1].max_offset = (l
? data->last_start_max
: (data->pos_delta > SSize_t_MAX - data->pos_min
? SSize_t_MAX
: data->pos_min + data->pos_delta));
if (is_inf
- || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
- data->offset_float_max = SSize_t_MAX;
- if (data->flags & SF_BEFORE_EOL)
- data->flags
- |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
- else
- data->flags &= ~SF_FL_BEFORE_EOL;
- data->minlen_float=minlenp;
- data->lookbehind_float=0;
- }
+ || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
+ data->substrs[1].max_offset = SSize_t_MAX;
+ }
+
+ if (data->flags & SF_BEFORE_EOL)
+ data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
+ else
+ data->substrs[i].flags &= ~SF_BEFORE_EOL;
+ data->substrs[i].minlenp = minlenp;
+ data->substrs[i].lookbehind = 0;
}
+
SvCUR_set(data->last_found, 0);
{
SV * const sv = data->last_found;
}
data->last_end = -1;
data->flags &= ~SF_BEFORE_EOL;
- DEBUG_STUDYDATA("commit: ",data,0);
+ DEBUG_STUDYDATA("commit", data, 0, is_inf);
}
/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
for( state = 1 ; state < trie->statecount ; state++ ) {
const U32 base = trie->states[ state ].trans.base;
- Perl_re_indentf( aTHX_ "#%4"UVXf"|", depth+1, (UV)state);
+ Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
if ( trie->states[ state ].wordnum ) {
Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
Perl_re_printf( aTHX_ "%6s", "" );
}
- Perl_re_printf( aTHX_ " @%4"UVXf" ", (UV)base );
+ Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
if ( base ) {
U32 ofs = 0;
!= state))
ofs++;
- Perl_re_printf( aTHX_ "+%2"UVXf"[ ", (UV)ofs);
+ Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
if ( ( base + ofs >= trie->uniquecharcount )
&& trie->trans[ base + ofs
- trie->uniquecharcount ].check == state )
{
- Perl_re_printf( aTHX_ "%*"UVXf, colwidth,
+ Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
(UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
);
} else {
for( state=1 ; state < next_alloc ; state ++ ) {
U16 charid;
- Perl_re_indentf( aTHX_ " %4"UVXf" :",
+ Perl_re_indentf( aTHX_ " %4" UVXf " :",
depth+1, (UV)state );
if ( ! trie->states[ state ].wordnum ) {
Perl_re_printf( aTHX_ "%5s| ","");
SV ** const tmp = av_fetch( revcharmap,
TRIE_LIST_ITEM(state,charid).forid, 0);
if ( tmp ) {
- Perl_re_printf( aTHX_ "%*s:%3X=%4"UVXf" | ",
+ Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
colwidth,
pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
colwidth,
for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
- Perl_re_indentf( aTHX_ "%4"UVXf" : ",
+ Perl_re_indentf( aTHX_ "%4" UVXf " : ",
depth+1,
(UV)TRIE_NODENUM( state ) );
for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
if (v)
- Perl_re_printf( aTHX_ "%*"UVXf, colwidth, v );
+ Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
else
Perl_re_printf( aTHX_ "%*s", colwidth, "." );
}
if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
- Perl_re_printf( aTHX_ " (%4"UVXf")\n",
+ Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
(UV)trie->trans[ state ].check );
} else {
- Perl_re_printf( aTHX_ " (%4"UVXf") W%4X\n",
+ Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
(UV)trie->trans[ state ].check,
trie->states[ TRIE_NODENUM( state ) ].wordnum );
}
#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
- U32 ging = TRIE_LIST_LEN( state ) *= 2; \
+ U32 ging = TRIE_LIST_LEN( state ) * 2; \
Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
+ TRIE_LIST_LEN( state ) = ging; \
} \
TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
: ( state==1 ? special : 0 ) \
)
+#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
+STMT_START { \
+ TRIE_BITMAP_SET(trie, uvc); \
+ /* store the folded codepoint */ \
+ if ( folder ) \
+ TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
+ \
+ if ( !UTF ) { \
+ /* store first byte of utf8 representation of */ \
+ /* variant codepoints */ \
+ if (! UVCHR_IS_INVARIANT(uvc)) { \
+ TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
+ } \
+ } \
+} STMT_END
#define MADE_TRIE 1
#define MADE_JUMP_TRIE 2
#define MADE_EXACT_TRIE 4
/* we just use folder as a flag in utf8 */
const U8 * folder = NULL;
+ /* in the below add_data call we are storing either 'tu' or 'tuaa'
+ * which stands for one trie structure, one hash, optionally followed
+ * by two arrays */
#ifdef DEBUGGING
- const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
+ const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
AV *trie_words = NULL;
/* along with revcharmap, this only used during construction but both are
* useful during debugging so we store them in the struct when debugging.
bitmap?*/
if (OP(noper) == NOTHING) {
+ /* skip past a NOTHING at the start of an alternation
+ * eg, /(?:)a|(?:b)/ should be the same as /a|b/
+ */
regnode *noper_next= regnext(noper);
if (noper_next < tail)
noper= noper_next;
}
- if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
+ if ( noper < tail &&
+ (
+ OP(noper) == flags ||
+ (
+ flags == EXACTFU &&
+ OP(noper) == EXACTFU_SS
+ )
+ )
+ ) {
uc= (U8*)STRING(noper);
e= uc + STR_LEN(noper);
} else {
TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
}
}
+
for ( ; uc < e ; uc += len ) { /* Look at each char in the current
branch */
TRIE_CHARCOUNT(trie)++;
if ( set_bit ) {
/* store the codepoint in the bitmap, and its folded
* equivalent. */
- TRIE_BITMAP_SET(trie, uvc);
-
- /* store the folded codepoint */
- if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
-
- if ( !UTF ) {
- /* store first byte of utf8 representation of
- variant codepoints */
- if (! UVCHR_IS_INVARIANT(uvc)) {
- TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
- }
- }
+ TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
set_bit = 0; /* We've done our bit :-) */
}
} else {
svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
if ( !svpp )
- Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
+ Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
if ( !SvTRUE( *svpp ) ) {
sv_setiv( *svpp, ++trie->uniquecharcount );
}
state = newstate;
} else {
- Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
+ Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
}
}
}
}
state = trie->trans[ state + charid ].next;
} else {
- Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
+ Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
}
/* charid is now 0 if we dont know the char read, or
* nonzero if we do */
PerlMemShared_realloc( trie->states, laststate
* sizeof(reg_trie_state) );
DEBUG_TRIE_COMPILE_MORE_r(
- Perl_re_indentf( aTHX_ "Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
+ Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
depth+1,
(int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
+ 1 ),
} /* end table compress */
}
DEBUG_TRIE_COMPILE_MORE_r(
- Perl_re_indentf( aTHX_ "Statecount:%"UVxf" Lasttrans:%"UVxf"\n",
+ Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
depth+1,
(UV)trie->statecount,
(UV)trie->lasttrans)
});
}
DEBUG_OPTIMISE_r(
- Perl_re_indentf( aTHX_ "MJD offset:%"UVuf" MJD length:%"UVuf"\n",
+ Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
depth+1,
(UV)mjd_offset, (UV)mjd_nodelen)
);
split out as an EXACT and put in front of the TRIE node. */
trie->startstate= 1;
if ( trie->bitmap && !widecharmap && !trie->jump ) {
+ /* we want to find the first state that has more than
+ * one transition, if that state is not the first state
+ * then we have a common prefix which we can remove.
+ */
U32 state;
for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
U32 ofs = 0;
- I32 idx = -1;
+ I32 first_ofs = -1; /* keeps track of the ofs of the first
+ transition, -1 means none */
U32 count = 0;
const U32 base = trie->states[ state ].trans.base;
+ /* does this state terminate an alternation? */
if ( trie->states[state].wordnum )
count = 1;
trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
{
if ( ++count > 1 ) {
- SV **tmp = av_fetch( revcharmap, ofs, 0);
- const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
+ /* we have more than one transition */
+ SV **tmp;
+ U8 *ch;
+ /* if this is the first state there is no common prefix
+ * to extract, so we can exit */
if ( state == 1 ) break;
+ tmp = av_fetch( revcharmap, ofs, 0);
+ ch = (U8*)SvPV_nolen_const( *tmp );
+
+ /* if we are on count 2 then we need to initialize the
+ * bitmap, and store the previous char if there was one
+ * in it*/
if ( count == 2 ) {
+ /* clear the bitmap */
Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
DEBUG_OPTIMISE_r(
- Perl_re_indentf( aTHX_ "New Start State=%"UVuf" Class: [",
+ Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
depth+1,
(UV)state));
- if (idx >= 0) {
- SV ** const tmp = av_fetch( revcharmap, idx, 0);
+ if (first_ofs >= 0) {
+ SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
- TRIE_BITMAP_SET(trie,*ch);
- if ( folder )
- TRIE_BITMAP_SET(trie, folder[ *ch ]);
- if ( !UTF ) {
- /* store first byte of utf8 representation of
- variant codepoints */
- if (! UVCHR_IS_INVARIANT(*ch)) {
- TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(*ch));
- }
- }
+ TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
DEBUG_OPTIMISE_r(
Perl_re_printf( aTHX_ "%s", (char*)ch)
);
}
}
- TRIE_BITMAP_SET(trie,*ch);
- if ( folder )
- TRIE_BITMAP_SET(trie,folder[ *ch ]);
- if ( !UTF ) {
- /* store first byte of utf8 representation of
- variant codepoints */
- if (! UVCHR_IS_INVARIANT(*ch)) {
- TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(*ch));
- }
- }
+ /* store the current firstchar in the bitmap */
+ TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
}
- idx = ofs;
+ first_ofs = ofs;
}
}
if ( count == 1 ) {
- SV **tmp = av_fetch( revcharmap, idx, 0);
+ /* This state has only one transition, its transition is part
+ * of a common prefix - we need to concatenate the char it
+ * represents to what we have so far. */
+ SV **tmp = av_fetch( revcharmap, first_ofs, 0);
STRLEN len;
char *ch = SvPV( *tmp, len );
DEBUG_OPTIMISE_r({
SV *sv=sv_newmortal();
- Perl_re_indentf( aTHX_ "Prefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
+ Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
depth+1,
- (UV)state, (UV)idx,
+ (UV)state, (UV)first_ofs,
pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
PL_colors[0], PL_colors[1],
(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
*/
fail[ 0 ] = fail[ 1 ] = 0;
DEBUG_TRIE_COMPILE_r({
- Perl_re_indentf( aTHX_ "Stclass Failtable (%"UVuf" states): 0",
+ Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0",
depth, (UV)numstates
);
for( q_read=1; q_read<numstates; q_read++ ) {
- Perl_re_printf( aTHX_ ", %"UVuf, (UV)fail[q_read]);
+ Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]);
}
Perl_re_printf( aTHX_ "\n");
});
}
-#define DEBUG_PEEP(str,scan,depth) \
- DEBUG_OPTIMISE_r({if (scan){ \
- regnode *Next = regnext(scan); \
- regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);\
- Perl_re_indentf( aTHX_ "" str ">%3d: %s (%d)", \
- depth, REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
- Next ? (REG_NODE_NUM(Next)) : 0 );\
- DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
- Perl_re_printf( aTHX_ "\n"); \
- }});
-
/* The below joins as many adjacent EXACTish nodes as possible into a single
* one. The regop may be changed if the node(s) contain certain sequences that
* require special handling. The joining is only done if:
PERL_UNUSED_ARG(flags);
PERL_UNUSED_ARG(val);
#endif
- DEBUG_PEEP("join",scan,depth);
+ DEBUG_PEEP("join", scan, depth, 0);
/* Look through the subsequent nodes in the chain. Skip NOTHING, merge
* EXACT ones that are mergeable to the current one. */
if (OP(n) == TAIL || n > next)
stringok = 0;
if (PL_regkind[OP(n)] == NOTHING) {
- DEBUG_PEEP("skip:",n,depth);
+ DEBUG_PEEP("skip:", n, depth, 0);
NEXT_OFF(scan) += NEXT_OFF(n);
next = n + NODE_STEP_REGNODE;
#ifdef DEBUGGING
if (oldl + STR_LEN(n) > U8_MAX)
break;
- DEBUG_PEEP("merg",n,depth);
+ DEBUG_PEEP("merg", n, depth, 0);
merged++;
NEXT_OFF(scan) += NEXT_OFF(n);
#ifdef EXPERIMENTAL_INPLACESCAN
if (flags && !NEXT_OFF(n)) {
- DEBUG_PEEP("atch", val, depth);
+ DEBUG_PEEP("atch", val, depth, 0);
if (reg_off_by_arg[OP(n)]) {
ARG_SET(n, val - n);
}
}
else {
STRLEN len;
- _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
+ _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
d += len;
}
s += s_len;
n++;
}
#endif
- DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
+ DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
return stopnow;
}
the folded version may be shorter) */
bool unfolded_multi_char = FALSE;
/* Peephole optimizer: */
- DEBUG_STUDYDATA("Peep:", data, depth);
- DEBUG_PEEP("Peep", scan, depth);
+ DEBUG_STUDYDATA("Peep", data, depth, is_inf);
+ DEBUG_PEEP("Peep", scan, depth, flags);
/* The reason we do this here is that we need to deal with things like
StructCopy(&zero_scan_data, &data_fake, scan_data_t);
scan = regnext(scan);
assert( OP(scan) == IFTHEN );
- DEBUG_PEEP("expect IFTHEN", scan, depth);
+ DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
data_fake.last_closep= &fake_last_close;
minlen = *minlenp;
next = regnext(scan);
scan = NEXTOPER(NEXTOPER(scan));
- DEBUG_PEEP("scan", scan, depth);
- DEBUG_PEEP("next", next, depth);
+ DEBUG_PEEP("scan", scan, depth, flags);
+ DEBUG_PEEP("next", next, depth, flags);
/* we suppose the run is continuous, last=next...
* NOTE we dont use the return here! */
I32 f = 0;
regnode_ssc this_class;
- DEBUG_PEEP("Branch", scan, depth);
+ DEBUG_PEEP("Branch", scan, depth, flags);
num++;
StructCopy(&zero_scan_data, &data_fake, scan_data_t);
else
data->pos_delta += max1 - min1;
if (max1 != min1 || is_inf)
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1;
}
min += min1;
if (delta == SSize_t_MAX
DEBUG_TRIE_COMPILE_r({
regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
- Perl_re_indentf( aTHX_ "%s %"UVuf":%s\n",
+ Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
depth+1,
"Looking for TRIE'able sequences. Tail node is ",
(UV)(tail - RExC_emit_start),
RExC_study_chunk_recursed_bytes, U8);
}
/* we havent recursed into this paren yet, so recurse into it */
- DEBUG_STUDYDATA("gosub-set:", data,depth);
+ DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
my_recursed_depth= recursed_depth + 1;
} else {
- DEBUG_STUDYDATA("gosub-inf:", data,depth);
+ DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
/* some form of infinite recursion, assume infinite length
* */
if (flags & SCF_DO_SUBSTR) {
scan_commit(pRExC_state, data, minlenp, is_inf);
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1;
}
is_inf = is_inf_internal = 1;
if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
newframe->prev_recursed_depth = recursed_depth;
newframe->this_prev_frame= frame;
- DEBUG_STUDYDATA("frame-new:",data,depth);
- DEBUG_PEEP("fnew", scan, depth);
+ DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
+ DEBUG_PEEP("fnew", scan, depth, flags);
frame = newframe;
scan = start;
}
data->pos_delta += min_subtract;
if (min_subtract) {
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
}
if (flags & SCF_DO_SUBSTR) {
scan_commit(pRExC_state, data, minlenp, is_inf);
/* Cannot extend fixed substrings */
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
is_inf = is_inf_internal = 1;
scan = regnext(scan);
}
if (!scan) /* It was not CURLYX, but CURLY. */
scan = next;
- if (!(flags & SCF_TRIE_DOING_RESTUDY)
+ if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
/* ? quantifier ok, except for (?{ ... }) */
&& (next_is_eval || !(mincount == 0 && maxcount == 1))
&& (minnext == 0) && (deltanext == 0)
SAVEFREESV(RExC_rx_sv);
Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
"Quantifier unexpected on zero-length expression "
- "in regex m/%"UTF8f"/",
+ "in regex m/%" UTF8f "/",
UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
RExC_precomp));
(void)ReREFCNT_inc(RExC_rx_sv);
However, this time it's not a subexpression
we care about, but the expression itself. */
&& (maxcount == REG_INFTY)
- && data && ++data->whilem_c < 16) {
+ && data) {
/* This stays as CURLYX, we can put the count/of pair. */
/* Find WHILEM (as in regexec.c) */
regnode *nxt = oscan + NEXT_OFF(oscan);
if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
nxt += ARG(nxt);
- PREVOPER(nxt)->flags = (U8)(data->whilem_c
- | (RExC_whilem_seen << 4)); /* On WHILEM */
+ nxt = PREVOPER(nxt);
+ if (nxt->flags & 0xf) {
+ /* we've already set whilem count on this node */
+ } else if (++data->whilem_c < 16) {
+ assert(data->whilem_c <= RExC_whilem_seen);
+ nxt->flags = (U8)(data->whilem_c
+ | (RExC_whilem_seen << 4)); /* On WHILEM */
+ }
}
if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
pars++;
/* It is counted once already... */
data->pos_min += minnext * (mincount - counted);
#if 0
-Perl_re_printf( aTHX_ "counted=%"UVuf" deltanext=%"UVuf
- " SSize_t_MAX=%"UVuf" minnext=%"UVuf
- " maxcount=%"UVuf" mincount=%"UVuf"\n",
+Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
+ " SSize_t_MAX=%" UVuf " minnext=%" UVuf
+ " maxcount=%" UVuf " mincount=%" UVuf "\n",
(UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
(UV)mincount);
if (deltanext != SSize_t_MAX)
-Perl_re_printf( aTHX_ "LHS=%"UVuf" RHS=%"UVuf"\n",
+Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
(UV)(-counted * deltanext + (minnext + deltanext) * maxcount
- minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
#endif
? SSize_t_MAX
: data->pos_min + data->pos_delta - last_chrs;
}
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
SvREFCNT_dec(last_str);
}
if (flags & SCF_DO_SUBSTR) {
/* Cannot expect anything... */
scan_commit(pRExC_state, data, minlenp, is_inf);
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
is_inf = is_inf_internal = 1;
if (flags & SCF_DO_STCLASS_OR) {
scan_commit(pRExC_state, data, minlenp, is_inf);
data->pos_min += 1;
data->pos_delta += 1;
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
}
else if (REGNODE_SIMPLE(OP(scan))) {
FAIL("Variable length lookbehind not implemented");
}
else if (minnext > (I32)U8_MAX) {
- FAIL2("Lookbehind longer than %"UVuf" not implemented",
+ FAIL2("Lookbehind longer than %" UVuf " not implemented",
(UV)U8_MAX);
}
scan->flags = (U8)minnext;
else
data_fake.last_closep = &fake;
data_fake.flags = 0;
+ data_fake.substrs[0].flags = 0;
+ data_fake.substrs[1].flags = 0;
data_fake.pos_delta = delta;
if (is_inf)
data_fake.flags |= SF_IS_INF;
FAIL("Variable length lookbehind not implemented");
}
else if (*minnextp > (I32)U8_MAX) {
- FAIL2("Lookbehind longer than %"UVuf" not implemented",
+ FAIL2("Lookbehind longer than %" UVuf " not implemented",
(UV)U8_MAX);
}
scan->flags = (U8)*minnextp;
data->flags |= SF_HAS_EVAL;
data->whilem_c = data_fake.whilem_c;
if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
+ int i;
if (RExC_rx->minlen<*minnextp)
RExC_rx->minlen=*minnextp;
scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
SvREFCNT_dec_NN(data_fake.last_found);
- if ( data_fake.minlen_fixed != minlenp )
- {
- data->offset_fixed= data_fake.offset_fixed;
- data->minlen_fixed= data_fake.minlen_fixed;
- data->lookbehind_fixed+= scan->flags;
- }
- if ( data_fake.minlen_float != minlenp )
- {
- data->minlen_float= data_fake.minlen_float;
- data->offset_float_min=data_fake.offset_float_min;
- data->offset_float_max=data_fake.offset_float_max;
- data->lookbehind_float+= scan->flags;
+ for (i = 0; i < 2; i++) {
+ if (data_fake.substrs[i].minlenp != minlenp) {
+ data->substrs[i].min_offset =
+ data_fake.substrs[i].min_offset;
+ data->substrs[i].max_offset =
+ data_fake.substrs[i].max_offset;
+ data->substrs[i].minlenp =
+ data_fake.substrs[i].minlenp;
+ data->substrs[i].lookbehind += scan->flags;
+ }
}
}
}
}
#endif
}
+
else if (OP(scan) == OPEN) {
if (stopparen != (I32)ARG(scan))
pars++;
{
if (flags & SCF_DO_SUBSTR) {
scan_commit(pRExC_state, data, minlenp, is_inf);
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
is_inf = is_inf_internal = 1;
if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
data->pos_min += min1;
data->pos_delta += max1 - min1;
if (max1 != min1 || is_inf)
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
min += min1;
if (delta != SSize_t_MAX)
data->pos_min += trie->minlen;
data->pos_delta += (trie->maxlen - trie->minlen);
if (trie->maxlen != trie->minlen)
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
if (trie->jump) /* no more substrings -- for now /grr*/
flags &= ~SCF_DO_SUBSTR;
/* we need to unwind recursion. */
depth = depth - 1;
- DEBUG_STUDYDATA("frame-end:",data,depth);
- DEBUG_PEEP("fend", scan, depth);
+ DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
+ DEBUG_PEEP("fend", scan, depth, flags);
/* restore previous context */
last = frame->last_regnode;
}
assert(!frame);
- DEBUG_STUDYDATA("pre-fin:",data,depth);
+ DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
*scanp = scan;
*deltap = is_inf_internal ? SSize_t_MAX : delta;
if (flags & SCF_TRIE_RESTUDY)
data->flags |= SCF_TRIE_RESTUDY;
- DEBUG_STUDYDATA("post-fin:",data,depth);
+ DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
{
SSize_t final_minlen= min < stopmin ? min : stopmin;
/* Dispatch a request to compile a regexp to correct regexp engine. */
DEBUG_COMPILE_r({
- Perl_re_printf( aTHX_ "Using engine %"UVxf"\n",
+ Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
PTR2UV(eng));
});
return CALLREGCOMP_ENG(eng, pattern, flags);
}
+static void
+S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
+{
+ int n;
+
+ if (--cbs->refcnt > 0)
+ return;
+ for (n = 0; n < cbs->count; n++) {
+ REGEXP *rx = cbs->cb[n].src_regex;
+ cbs->cb[n].src_regex = NULL;
+ SvREFCNT_dec(rx);
+ }
+ Safefree(cbs->cb);
+ Safefree(cbs);
+}
+
+
+static struct reg_code_blocks *
+S_alloc_code_blocks(pTHX_ int ncode)
+{
+ struct reg_code_blocks *cbs;
+ Newx(cbs, 1, struct reg_code_blocks);
+ cbs->count = ncode;
+ cbs->refcnt = 1;
+ SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
+ if (ncode)
+ Newx(cbs->cb, ncode, struct reg_code_block);
+ else
+ cbs->cb = NULL;
+ return cbs;
+}
+
+
/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
* blocks, recalculate the indices. Update pat_p and plen_p in-place to
* point to the realloced string and length.
while (s < *plen_p) {
append_utf8_from_native_byte(src[s], &d);
+
if (n < num_code_blocks) {
- if (!do_end && pRExC_state->code_blocks[n].start == s) {
- pRExC_state->code_blocks[n].start = d - dst - 1;
+ assert(pRExC_state->code_blocks);
+ if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
+ pRExC_state->code_blocks->cb[n].start = d - dst - 1;
assert(*(d - 1) == '(');
do_end = 1;
}
- else if (do_end && pRExC_state->code_blocks[n].end == s) {
- pRExC_state->code_blocks[n].end = d - dst - 1;
+ else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
+ pRExC_state->code_blocks->cb[n].end = d - dst - 1;
assert(*(d - 1) == ')');
do_end = 0;
n++;
if (oplist->op_type == OP_NULL
&& (oplist->op_flags & OPf_SPECIAL))
{
- assert(n < pRExC_state->num_code_blocks);
- pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
- pRExC_state->code_blocks[n].block = oplist;
- pRExC_state->code_blocks[n].src_regex = NULL;
+ assert(n < pRExC_state->code_blocks->count);
+ pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
+ pRExC_state->code_blocks->cb[n].block = oplist;
+ pRExC_state->code_blocks->cb[n].src_regex = NULL;
n++;
code = 1;
oplist = OpSIBLING(oplist); /* skip CONST */
sv_setsv(pat, sv);
/* overloading involved: all bets are off over literal
* code. Pretend we haven't seen it */
- pRExC_state->num_code_blocks -= n;
+ if (n)
+ pRExC_state->code_blocks->count -= n;
n = 0;
}
else {
}
if (code)
- pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
+ pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
}
/* extract any code blocks within any embedded qr//'s */
{
RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
- if (ri->num_code_blocks) {
+ if (ri->code_blocks && ri->code_blocks->count) {
int i;
/* the presence of an embedded qr// with code means
* we should always recompile: the text of the
* qr// may not have changed, but it may be a
* different closure than last time */
*recompile_p = 1;
- Renew(pRExC_state->code_blocks,
- pRExC_state->num_code_blocks + ri->num_code_blocks,
- struct reg_code_block);
- pRExC_state->num_code_blocks += ri->num_code_blocks;
+ if (pRExC_state->code_blocks) {
+ int new_count = pRExC_state->code_blocks->count
+ + ri->code_blocks->count;
+ Renew(pRExC_state->code_blocks->cb,
+ new_count, struct reg_code_block);
+ pRExC_state->code_blocks->count = new_count;
+ }
+ else
+ pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
+ ri->code_blocks->count);
- for (i=0; i < ri->num_code_blocks; i++) {
+ for (i=0; i < ri->code_blocks->count; i++) {
struct reg_code_block *src, *dst;
STRLEN offset = orig_patlen
+ ReANY((REGEXP *)rx)->pre_prefix;
- assert(n < pRExC_state->num_code_blocks);
- src = &ri->code_blocks[i];
- dst = &pRExC_state->code_blocks[n];
+ assert(n < pRExC_state->code_blocks->count);
+ src = &ri->code_blocks->cb[i];
+ dst = &pRExC_state->code_blocks->cb[n];
dst->start = src->start + offset;
dst->end = src->end + offset;
dst->block = src->block;
PERL_UNUSED_CONTEXT;
for (s = 0; s < plen; s++) {
- if (n < pRExC_state->num_code_blocks
- && s == pRExC_state->code_blocks[n].start)
+ if ( pRExC_state->code_blocks
+ && n < pRExC_state->code_blocks->count
+ && s == pRExC_state->code_blocks->cb[n].start)
{
- s = pRExC_state->code_blocks[n].end;
+ s = pRExC_state->code_blocks->cb[n].end;
n++;
continue;
}
int n = 0;
STRLEN s;
char *p, *newpat;
- int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
+ int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
SV *sv, *qr_ref;
dSP;
*p++ = 'q'; *p++ = 'r'; *p++ = '\'';
for (s = 0; s < plen; s++) {
- if (n < pRExC_state->num_code_blocks
- && s == pRExC_state->code_blocks[n].start)
+ if ( pRExC_state->code_blocks
+ && n < pRExC_state->code_blocks->count
+ && s == pRExC_state->code_blocks->cb[n].start)
{
/* blank out literal code block */
assert(pat[s] == '(');
- while (s <= pRExC_state->code_blocks[n].end) {
+ while (s <= pRExC_state->code_blocks->cb[n].end) {
*p++ = '_';
s++;
}
*p++ = pat[s];
}
*p++ = '\'';
- if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
+ if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
*p++ = 'x';
+ if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
+ *p++ = 'x';
+ }
+ }
*p++ = '\0';
DEBUG_COMPILE_r({
Perl_re_printf( aTHX_
{
SV * const errsv = ERRSV;
if (SvTRUE_NN(errsv))
- {
- Safefree(pRExC_state->code_blocks);
/* use croak_sv ? */
- Perl_croak_nocontext("%"SVf, SVfARG(errsv));
- }
+ Perl_croak_nocontext("%" SVf, SVfARG(errsv));
}
assert(SvROK(qr_ref));
qr = SvRV(qr_ref);
struct reg_code_block *new_block, *dst;
RExC_state_t * const r1 = pRExC_state; /* convenient alias */
int i1 = 0, i2 = 0;
+ int r1c, r2c;
- if (!r2->num_code_blocks) /* we guessed wrong */
+ if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
{
SvREFCNT_dec_NN(qr);
return 1;
}
- Newx(new_block,
- r1->num_code_blocks + r2->num_code_blocks,
- struct reg_code_block);
+ if (!r1->code_blocks)
+ r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
+
+ r1c = r1->code_blocks->count;
+ r2c = r2->code_blocks->count;
+
+ Newx(new_block, r1c + r2c, struct reg_code_block);
+
dst = new_block;
- while ( i1 < r1->num_code_blocks
- || i2 < r2->num_code_blocks)
- {
+ while (i1 < r1c || i2 < r2c) {
struct reg_code_block *src;
bool is_qr = 0;
- if (i1 == r1->num_code_blocks) {
- src = &r2->code_blocks[i2++];
+ if (i1 == r1c) {
+ src = &r2->code_blocks->cb[i2++];
is_qr = 1;
}
- else if (i2 == r2->num_code_blocks)
- src = &r1->code_blocks[i1++];
- else if ( r1->code_blocks[i1].start
- < r2->code_blocks[i2].start)
+ else if (i2 == r2c)
+ src = &r1->code_blocks->cb[i1++];
+ else if ( r1->code_blocks->cb[i1].start
+ < r2->code_blocks->cb[i2].start)
{
- src = &r1->code_blocks[i1++];
- assert(src->end < r2->code_blocks[i2].start);
+ src = &r1->code_blocks->cb[i1++];
+ assert(src->end < r2->code_blocks->cb[i2].start);
}
else {
- assert( r1->code_blocks[i1].start
- > r2->code_blocks[i2].start);
- src = &r2->code_blocks[i2++];
+ assert( r1->code_blocks->cb[i1].start
+ > r2->code_blocks->cb[i2].start);
+ src = &r2->code_blocks->cb[i2++];
is_qr = 1;
- assert(src->end < r1->code_blocks[i1].start);
+ assert(src->end < r1->code_blocks->cb[i1].start);
}
assert(pat[src->start] == '(');
: src->src_regex;
dst++;
}
- r1->num_code_blocks += r2->num_code_blocks;
- Safefree(r1->code_blocks);
- r1->code_blocks = new_block;
+ r1->code_blocks->count += r2c;
+ Safefree(r1->code_blocks->cb);
+ r1->code_blocks->cb = new_block;
}
SvREFCNT_dec_NN(qr);
STATIC bool
-S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
- SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
- SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
- STRLEN longest_length, bool eol, bool meol)
+S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
+ struct reg_substr_datum *rsd,
+ struct scan_data_substrs *sub,
+ STRLEN longest_length)
{
/* This is the common code for setting up the floating and fixed length
* string data extracted from Perl_re_op_compile() below. Returns a boolean
I32 t;
SSize_t ml;
+ bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
+ bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
if (! (longest_length
|| (eol /* Can't have SEOL and MULTI */
/* copy the information about the longest from the reg_scan_data
over to the program. */
- if (SvUTF8(sv_longest)) {
- *rx_utf8 = sv_longest;
- *rx_substr = NULL;
+ if (SvUTF8(sub->str)) {
+ rsd->substr = NULL;
+ rsd->utf8_substr = sub->str;
} else {
- *rx_substr = sv_longest;
- *rx_utf8 = NULL;
+ rsd->substr = sub->str;
+ rsd->utf8_substr = NULL;
}
/* end_shift is how many chars that must be matched that
follow this item. We calculate it ahead of time as once the
lookbehind offset is added in we lose the ability to correctly
calculate it.*/
- ml = minlen ? *(minlen) : (SSize_t)longest_length;
- *rx_end_shift = ml - offset
- - longest_length + (SvTAIL(sv_longest) != 0)
- + lookbehind;
+ ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
+ rsd->end_shift = ml - sub->min_offset
+ - longest_length
+ /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
+ * intead? - DAPM
+ + (SvTAIL(sub->str) != 0)
+ */
+ + sub->lookbehind;
t = (eol/* Can't have SEOL and MULTI */
&& (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
- fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
+ fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
return TRUE;
}
SSize_t minlen = 0;
U32 rx_flags;
SV *pat;
- SV *code_blocksv = NULL;
SV** new_patternp = patternp;
/* these are all flags - maybe they should be turned
pRExC_state->warn_text = NULL;
pRExC_state->code_blocks = NULL;
- pRExC_state->num_code_blocks = 0;
if (is_bare_re)
*is_bare_re = FALSE;
for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
ncode++; /* count of DO blocks */
- if (ncode) {
- pRExC_state->num_code_blocks = ncode;
- Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
- }
+
+ if (ncode)
+ pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
}
if (!pat_count) {
/* set expr to the first arg op */
- if (pRExC_state->num_code_blocks
+ if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
&& expr->op_type != OP_CONST)
{
expr = cLISTOPx(expr)->op_first;
if (is_bare_re)
*is_bare_re = TRUE;
SvREFCNT_inc(re);
- Safefree(pRExC_state->code_blocks);
DEBUG_PARSE_r(Perl_re_printf( aTHX_
"Precompiled pattern%s\n",
orig_rx_flags & RXf_SPLIT ? " for split" : ""));
pat = newSVpvn_flags(exp, plen, SVs_TEMP |
(IN_BYTES ? 0 : SvUTF8(pat)));
}
- Safefree(pRExC_state->code_blocks);
return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
}
RExC_uni_semantics = 0;
RExC_seen_unfolded_sharp_s = 0;
RExC_contains_locale = 0;
- RExC_contains_i = 0;
RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
RExC_study_started = 0;
pRExC_state->runtime_code_qr = NULL;
&& memEQ(RX_PRECOMP(old_re), exp, plen)
&& !runtime_code /* with runtime code, always recompile */ )
{
- Safefree(pRExC_state->code_blocks);
return old_re;
}
rx_flags = orig_rx_flags;
- if (rx_flags & PMf_FOLD) {
- RExC_contains_i = 1;
- }
if ( initial_charset == REGEX_DEPENDS_CHARSET
&& (RExC_utf8 ||RExC_uni_semantics))
{
/* whoops, we have a non-utf8 pattern, whilst run-time code
* got compiled as utf8. Try again with a utf8 pattern */
S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
- pRExC_state->num_code_blocks);
+ pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
goto redo_first_pass;
}
}
RExC_in_lookbehind = 0;
RExC_seen_zerolen = *exp == '^' ? -1 : 0;
RExC_extralen = 0;
- RExC_override_recoding = 0;
#ifdef EBCDIC
RExC_recode_x_to_native = 0;
#endif
RExC_lastnum=0;
RExC_lastparse=NULL;
);
- /* reg may croak on us, not giving us a chance to free
- pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
- need it to survive as long as the regexp (qr/(?{})/).
- We must check that code_blocksv is not already set, because we may
- have jumped back to restart the sizing pass. */
- if (pRExC_state->code_blocks && !code_blocksv) {
- code_blocksv = newSV_type(SVt_PV);
- SAVEFREESV(code_blocksv);
- SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
- SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
- }
+
if (reg(pRExC_state, 0, &flags,1) == NULL) {
/* It's possible to write a regexp in ascii that represents Unicode
codepoints outside of the byte range, such as via \x{100}. If we
if (flags & RESTART_PASS1) {
if (flags & NEED_UTF8) {
S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
- pRExC_state->num_code_blocks);
+ pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
}
else {
DEBUG_PARSE_r(Perl_re_printf( aTHX_
goto redo_first_pass;
}
- Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
+ Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#" UVxf, (UV) flags);
}
- if (code_blocksv)
- SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
DEBUG_PARSE_r({
Perl_re_printf( aTHX_
- "Required size %"IVdf" nodes\n"
+ "Required size %" IVdf " nodes\n"
"Starting second pass (creation)\n",
(IV)RExC_size);
RExC_lastnum=0;
if (pm_flags & PMf_IS_QR) {
ri->code_blocks = pRExC_state->code_blocks;
- ri->num_code_blocks = pRExC_state->num_code_blocks;
- }
- else
- {
- int n;
- for (n = 0; n < pRExC_state->num_code_blocks; n++)
- if (pRExC_state->code_blocks[n].src_regex)
- SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
- if(pRExC_state->code_blocks)
- SAVEFREEPV(pRExC_state->code_blocks); /* often null */
+ if (ri->code_blocks)
+ ri->code_blocks->refcnt++;
}
{
== REG_RUN_ON_COMMENT_SEEN);
U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
>> RXf_PMf_STD_PMMOD_SHIFT);
- const char *fptr = STD_PAT_MODS; /*"msixn"*/
+ const char *fptr = STD_PAT_MODS; /*"msixxn"*/
char *p;
/* We output all the necessary flags; we never output a minus, as all
/* make sure PL_bitcount bounds not exceeded */
assert(sizeof(STD_PAT_MODS) <= 8);
- Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
- r->xpv_len_u.xpvlenu_pv = p;
+ p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
+ SvPOK_on(rx);
if (RExC_utf8)
SvFLAGS(rx) |= SVf_UTF8;
*p++='('; *p++='?';
#ifdef RE_TRACK_PATTERN_OFFSETS
Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
- "%s %"UVuf" bytes for offset annotations.\n",
+ "%s %" UVuf " bytes for offset annotations.\n",
ri->u.offsets ? "Got" : "Couldn't get",
(UV)((2*RExC_size+1) * sizeof(U32))));
#endif
RExC_npar = 1;
if (reg(pRExC_state, 0, &flags,1) == NULL) {
ReREFCNT_dec(rx);
- Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
+ Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#" UVxf, (UV) flags);
}
DEBUG_OPTIMISE_r(
Perl_re_printf( aTHX_ "Starting post parse optimization\n");
if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
*/
SSize_t fake;
- STRLEN longest_float_length, longest_fixed_length;
+ STRLEN longest_length[2];
regnode_ssc ch_class; /* pointed to by data */
int stclass_flag;
SSize_t last_close = 0; /* pointed to by data */
regnode *first= scan;
regnode *first_next= regnext(first);
+ int i;
+
/*
* Skip introductions and multiplicators >= 1
* so that we can extract the 'meat' of the pattern that must
/* Starting-point info. */
again:
- DEBUG_PEEP("first:",first,0);
+ DEBUG_PEEP("first:", first, 0, 0);
/* Ignore EXACT as we deal with it later. */
if (PL_regkind[OP(first)] == EXACT) {
if (OP(first) == EXACT || OP(first) == EXACTL)
!sawlookahead &&
(OP(first) == STAR &&
PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
- !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
+ !(r->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
{
/* turn .* into ^.* with an implied $*=1 */
const int type =
}
if (sawplus && !sawminmod && !sawlookahead
&& (!sawopen || !RExC_sawback)
- && !pRExC_state->num_code_blocks) /* May examine pos and $& */
+ && !pRExC_state->code_blocks) /* May examine pos and $& */
/* x+ must match at the 1st pos of run of x's */
r->intflags |= PREGf_SKIP;
#ifdef TRIE_STUDY_OPT
DEBUG_PARSE_r(
if (!restudied)
- Perl_re_printf( aTHX_ "first at %"IVdf"\n",
+ Perl_re_printf( aTHX_ "first at %" IVdf "\n",
(IV)(first - scan + 1))
);
#else
DEBUG_PARSE_r(
- Perl_re_printf( aTHX_ "first at %"IVdf"\n",
+ Perl_re_printf( aTHX_ "first at %" IVdf "\n",
(IV)(first - scan + 1))
);
#endif
* earlier string may buy us something the later one won't.]
*/
- data.longest_fixed = newSVpvs("");
- data.longest_float = newSVpvs("");
+ data.substrs[0].str = newSVpvs("");
+ data.substrs[1].str = newSVpvs("");
data.last_found = newSVpvs("");
- data.longest = &(data.longest_fixed);
+ data.cur_is_floating = 0; /* initially any found substring is fixed */
ENTER_with_name("study_chunk");
- SAVEFREESV(data.longest_fixed);
- SAVEFREESV(data.longest_float);
+ SAVEFREESV(data.substrs[0].str);
+ SAVEFREESV(data.substrs[1].str);
SAVEFREESV(data.last_found);
first = scan;
if (!ri->regstclass) {
CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
- if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
+ if ( RExC_npar == 1 && !data.cur_is_floating
&& data.last_start_min == 0 && data.last_end > 0
&& !RExC_seen_zerolen
&& !(RExC_seen & REG_VERBARG_SEEN)
}
scan_commit(pRExC_state, &data,&minlen,0);
- longest_float_length = CHR_SVLEN(data.longest_float);
-
- if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
- && data.offset_fixed == data.offset_float_min
- && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
- && S_setup_longest (aTHX_ pRExC_state,
- data.longest_float,
- &(r->float_utf8),
- &(r->float_substr),
- &(r->float_end_shift),
- data.lookbehind_float,
- data.offset_float_min,
- data.minlen_float,
- longest_float_length,
- cBOOL(data.flags & SF_FL_BEFORE_EOL),
- cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
- {
- r->float_min_offset = data.offset_float_min - data.lookbehind_float;
- r->float_max_offset = data.offset_float_max;
- if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
- r->float_max_offset -= data.lookbehind_float;
- SvREFCNT_inc_simple_void_NN(data.longest_float);
- }
- else {
- r->float_substr = r->float_utf8 = NULL;
- longest_float_length = 0;
- }
- longest_fixed_length = CHR_SVLEN(data.longest_fixed);
-
- if (S_setup_longest (aTHX_ pRExC_state,
- data.longest_fixed,
- &(r->anchored_utf8),
- &(r->anchored_substr),
- &(r->anchored_end_shift),
- data.lookbehind_fixed,
- data.offset_fixed,
- data.minlen_fixed,
- longest_fixed_length,
- cBOOL(data.flags & SF_FIX_BEFORE_EOL),
- cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
- {
- r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
- SvREFCNT_inc_simple_void_NN(data.longest_fixed);
- }
- else {
- r->anchored_substr = r->anchored_utf8 = NULL;
- longest_fixed_length = 0;
- }
+ /* XXX this is done in reverse order because that's the way the
+ * code was before it was parameterised. Don't know whether it
+ * actually needs doing in reverse order. DAPM */
+ for (i = 1; i >= 0; i--) {
+ longest_length[i] = CHR_SVLEN(data.substrs[i].str);
+
+ if ( !( i
+ && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
+ && data.substrs[0].min_offset
+ == data.substrs[1].min_offset
+ && SvCUR(data.substrs[0].str)
+ == SvCUR(data.substrs[1].str)
+ )
+ && S_setup_longest (aTHX_ pRExC_state,
+ &(r->substrs->data[i]),
+ &(data.substrs[i]),
+ longest_length[i]))
+ {
+ r->substrs->data[i].min_offset =
+ data.substrs[i].min_offset - data.substrs[i].lookbehind;
+
+ r->substrs->data[i].max_offset = data.substrs[i].max_offset;
+ /* Don't offset infinity */
+ if (data.substrs[i].max_offset < SSize_t_MAX)
+ r->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
+ SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
+ }
+ else {
+ r->substrs->data[i].substr = NULL;
+ r->substrs->data[i].utf8_substr = NULL;
+ longest_length[i] = 0;
+ }
+ }
+
LEAVE_with_name("study_chunk");
if (ri->regstclass
&& (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
ri->regstclass = NULL;
- if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
+ if ((!(r->substrs->data[0].substr || r->substrs->data[0].utf8_substr)
+ || r->substrs->data[0].min_offset)
&& stclass_flag
&& ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
&& is_ssc_worth_it(pRExC_state, data.start_class))
data.start_class = NULL;
}
- /* A temporary algorithm prefers floated substr to fixed one to dig
- * more info. */
- if (longest_fixed_length > longest_float_length) {
- r->substrs->check_ix = 0;
- r->check_end_shift = r->anchored_end_shift;
- r->check_substr = r->anchored_substr;
- r->check_utf8 = r->anchored_utf8;
- r->check_offset_min = r->check_offset_max = r->anchored_offset;
- if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
- r->intflags |= PREGf_NOSCAN;
- }
- else {
- r->substrs->check_ix = 1;
- r->check_end_shift = r->float_end_shift;
- r->check_substr = r->float_substr;
- r->check_utf8 = r->float_utf8;
- r->check_offset_min = r->float_min_offset;
- r->check_offset_max = r->float_max_offset;
- }
+ /* A temporary algorithm prefers floated substr to fixed one of
+ * same length to dig more info. */
+ i = (longest_length[0] <= longest_length[1]);
+ r->substrs->check_ix = i;
+ r->check_end_shift = r->substrs->data[i].end_shift;
+ r->check_substr = r->substrs->data[i].substr;
+ r->check_utf8 = r->substrs->data[i].utf8_substr;
+ r->check_offset_min = r->substrs->data[i].min_offset;
+ r->check_offset_max = r->substrs->data[i].max_offset;
+ if (!i && (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
+ r->intflags |= PREGf_NOSCAN;
+
if ((r->check_substr || r->check_utf8) ) {
r->extflags |= RXf_USE_INTUIT;
if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
r->extflags |= RXf_INTUIT_TAIL;
}
- r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
- if ( (STRLEN)minlen < longest_float_length )
- minlen= longest_float_length;
- if ( (STRLEN)minlen < longest_fixed_length )
- minlen= longest_fixed_length;
+ if ( (STRLEN)minlen < longest_length[1] )
+ minlen= longest_length[1];
+ if ( (STRLEN)minlen < longest_length[0] )
+ minlen= longest_length[0];
*/
}
else {
CHECK_RESTUDY_GOTO_butfirst(NOOP);
- r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
- = r->float_substr = r->float_utf8 = NULL;
+ r->check_substr = NULL;
+ r->check_utf8 = NULL;
+ r->substrs->data[0].substr = NULL;
+ r->substrs->data[0].utf8_substr = NULL;
+ r->substrs->data[1].substr = NULL;
+ r->substrs->data[1].utf8_substr = NULL;
if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
&& is_ssc_worth_it(pRExC_state, data.start_class))
/* Guard against an embedded (?=) or (?<=) with a longer minlen than
the "real" pattern. */
DEBUG_OPTIMISE_r({
- Perl_re_printf( aTHX_ "minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
+ Perl_re_printf( aTHX_ "minlen: %" IVdf " r->minlen:%" IVdf " maxlen:%" IVdf "\n",
(IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
});
r->minlenret = minlen;
if (RExC_seen & REG_LOOKBEHIND_SEEN)
r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
lookbehind */
- if (pRExC_state->num_code_blocks)
+ if (pRExC_state->code_blocks)
r->extflags |= RXf_EVAL_SEEN;
if (RExC_seen & REG_VERBARG_SEEN)
{
while ( RExC_recurse_count > 0 ) {
const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
+ /*
+ * This data structure is set up in study_chunk() and is used
+ * to calculate the distance between a GOSUB regopcode and
+ * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
+ * it refers to.
+ *
+ * If for some reason someone writes code that optimises
+ * away a GOSUB opcode then the assert should be changed to
+ * an if(scan) to guard the ARG2L_SET() - Yves
+ *
+ */
+ assert(scan && OP(scan) == GOSUB);
ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - scan );
}
STRLEN i;
GET_RE_DEBUG_FLAGS_DECL;
Perl_re_printf( aTHX_
- "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
+ "Offsets: [%" UVuf "]\n\t", (UV)ri->u.offsets[0]);
for (i = 1; i <= len; i++) {
if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
- Perl_re_printf( aTHX_ "%"UVuf":%"UVuf"[%"UVuf"] ",
+ Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ",
(UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
}
Perl_re_printf( aTHX_ "\n");
Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
const U32 flags)
{
- AV *retarray = NULL;
SV *ret;
struct regexp *const rx = ReANY(r);
PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
- if (flags & RXapif_ALL)
- retarray=newAV();
-
if (rx && RXp_PAREN_NAMES(rx)) {
HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
if (he_str) {
IV i;
SV* sv_dat=HeVAL(he_str);
I32 *nums=(I32*)SvPVX(sv_dat);
+ AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
for ( i=0; i<SvIVX(sv_dat); i++ ) {
if ((I32)(rx->nparens) >= nums[i]
&& rx->offs[nums[i]].start != -1
}
} else {
ret_undef:
- sv_setsv(sv,&PL_sv_undef);
+ sv_set_undef(sv);
return;
}
}
assert (RExC_parse <= RExC_end);
if (RExC_parse == RExC_end) NOOP;
- else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
+ else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
/* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
* using do...while */
if (UTF)
do {
RExC_parse += UTF8SKIP(RExC_parse);
- } while (isWORDCHAR_utf8((U8*)RExC_parse));
+ } while ( RExC_parse < RExC_end
+ && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
else
do {
RExC_parse++;
- } while (isWORDCHAR(*RExC_parse));
+ } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
} else {
RExC_parse++; /* so the <- from the vFAIL is after the offending
character */
if ( array[final_element] > start
|| ELEMENT_RANGE_MATCHES_INVLIST(final_element))
{
- Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
+ Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
array[final_element], start,
ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
}
invlist_iterinit(invlist);
while (invlist_iternext(invlist, &start, &end)) {
if (end == UV_MAX) {
- Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%cINFINITY%c",
+ Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFINITY%c",
start, intra_range_delimiter,
inter_range_delimiter);
}
else if (end != start) {
- Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c%04"UVXf"%c",
+ Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
start,
intra_range_delimiter,
end, inter_range_delimiter);
}
else {
- Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c",
+ Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
start, inter_range_delimiter);
}
}
while (invlist_iternext(invlist, &start, &end)) {
if (end == UV_MAX) {
Perl_dump_indent(aTHX_ level, file,
- "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
+ "%s[%" UVuf "] 0x%04" UVXf " .. INFINITY\n",
indent, (UV)count, start);
}
else if (end != start) {
Perl_dump_indent(aTHX_ level, file,
- "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
+ "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
indent, (UV)count, start, end);
}
else {
- Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
+ Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
indent, (UV)count, start);
}
count += 2;
* to force that */
if (! PL_utf8_tofold) {
U8 dummy[UTF8_MAXBYTES_CASE+1];
+ const U8 hyphen[] = HYPHEN_UTF8;
/* This string is just a short named one above \xff */
- to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
+ toFOLD_utf8_safe(hyphen, hyphen + sizeof(hyphen) - 1, dummy, NULL);
assert(PL_utf8_tofold); /* Verify that worked */
}
PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
UV len_a = _invlist_len(a);
UV len_b = _invlist_len(b);
- UV i = 0; /* current index into the arrays */
- bool retval = TRUE; /* Assume are identical until proven otherwise */
-
PERL_ARGS_ASSERT__INVLISTEQ;
/* If are to compare 'a' with the complement of b, set it
}
}
- /* Make sure that the lengths are the same, as well as the final element
- * before looping through the remainder. (Thus we test the length, final,
- * and first elements right off the bat) */
- if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
- retval = FALSE;
- }
- else for (i = 0; i < len_a - 1; i++) {
- if (array_a[i] != array_b[i]) {
- retval = FALSE;
- break;
- }
- }
+ return len_a == len_b
+ && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
- return retval;
}
#endif
}
else {
STRLEN len;
- to_utf8_fold(s, d, &len);
+ toFOLD_utf8_safe(s, e, d, &len);
d += len;
s += UTF8SKIP(s);
}
{
AV* list = (AV*) *listp;
IV k;
- for (k = 0; k <= av_tindex_nomg(list); k++) {
+ for (k = 0; k <= av_tindex_skip_len_mg(list); k++) {
SV** c_p = av_fetch(list, k, FALSE);
UV c;
assert(c_p);
}
flagsp = &negflags;
wastedflags = 0; /* reset so (?g-c) warns twice */
+ x_mod_count = 0;
break;
case ':':
case ')':
+
+ if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
+ negflags |= RXf_PMf_EXTENDED_MORE;
+ }
RExC_flags |= posflags;
+
+ if (negflags & RXf_PMf_EXTENDED) {
+ negflags |= RXf_PMf_EXTENDED_MORE;
+ }
RExC_flags &= ~negflags;
set_regex_charset(&RExC_flags, cs);
- if (RExC_flags & RXf_PMf_FOLD) {
- RExC_contains_i = 1;
- }
- if (UNLIKELY((x_mod_count) > 1)) {
- vFAIL("Only one /x regex modifier is allowed");
- }
return;
- /*NOTREACHED*/
default:
fail_modifiers:
RExC_parse += SKIP_IF_CHAR(RExC_parse);
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
- vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
+ vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
NOT_REACHED; /*NOTREACHED*/
}
if ( ! op ) {
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
vFAIL2utf8f(
- "Unknown verb pattern '%"UTF8f"'",
+ "Unknown verb pattern '%" UTF8f "'",
UTF8fARG(UTF, verb_len, start_verb));
}
if ( arg_required && !start_arg ) {
}
RExC_recurse_count++;
DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
- "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
+ "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
22, "| |", (int)(depth * 2 + 1), "",
(UV)ARG(ret), (IV)ARG2L(ret)));
}
RExC_parse += SKIP_IF_CHAR(RExC_parse);
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL2utf8f(
- "Sequence (%"UTF8f"...) not recognized",
+ "Sequence (%" UTF8f "...) not recognized",
UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
NOT_REACHED; /*NOTREACHED*/
}
RExC_seen_zerolen++;
- if ( !pRExC_state->num_code_blocks
- || pRExC_state->code_index >= pRExC_state->num_code_blocks
- || pRExC_state->code_blocks[pRExC_state->code_index].start
+ if ( !pRExC_state->code_blocks
+ || pRExC_state->code_index
+ >= pRExC_state->code_blocks->count
+ || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
!= (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
- RExC_start)
) {
FAIL("Eval-group not allowed at runtime, use re 'eval'");
}
/* this is a pre-compiled code block (?{...}) */
- cb = &pRExC_state->code_blocks[pRExC_state->code_index];
+ cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
RExC_parse = RExC_start + cb->end;
if (!SIZE_ONLY) {
OP *o = cb->block;
*flagp = flags & (RESTART_PASS1|NEED_UTF8);
return NULL;
}
- FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
+ FAIL2("panic: regbranch returned NULL, flags=%#" UVxf,
(UV) flags);
} else
REGTAIL(pRExC_state, br, reganode(pRExC_state,
*flagp = flags & (RESTART_PASS1|NEED_UTF8);
return NULL;
}
- FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
+ FAIL2("panic: regbranch returned NULL, flags=%#" UVxf,
(UV) flags);
}
REGTAIL(pRExC_state, ret, lastbr);
if (RExC_open_parens && !RExC_open_parens[parno])
{
DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
- "%*s%*s Setting open paren #%"IVdf" to %d\n",
+ "%*s%*s Setting open paren #%" IVdf " to %d\n",
22, "| |", (int)(depth * 2 + 1), "",
(IV)parno, REG_NODE_NUM(ret)));
RExC_open_parens[parno]= ret;
*flagp = flags & (RESTART_PASS1|NEED_UTF8);
return NULL;
}
- FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
+ FAIL2("panic: regbranch returned NULL, flags=%#" UVxf, (UV) flags);
}
if (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
*flagp = flags & (RESTART_PASS1|NEED_UTF8);
return NULL;
}
- FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
+ FAIL2("panic: regbranch returned NULL, flags=%#" UVxf, (UV) flags);
}
REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
lastbr = br;
ender = reganode(pRExC_state, CLOSE, parno);
if ( RExC_close_parens ) {
DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
- "%*s%*s Setting close paren #%"IVdf" to %d\n",
+ "%*s%*s Setting close paren #%" IVdf " to %d\n",
22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
RExC_close_parens[parno]= ender;
if (RExC_nestroot == parno)
DEBUG_PARSE_MSG("lsbr");
regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
- Perl_re_printf( aTHX_ "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
+ Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
SvPV_nolen_const(RExC_mysv1),
(IV)REG_NODE_NUM(lastbr),
SvPV_nolen_const(RExC_mysv2),
DEBUG_PARSE_MSG("NADA");
regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
- Perl_re_printf( aTHX_ "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
+ Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
SvPV_nolen_const(RExC_mysv1),
(IV)REG_NODE_NUM(ret),
SvPV_nolen_const(RExC_mysv2),
*flagp = flags & (RESTART_PASS1|NEED_UTF8);
return NULL;
}
- FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
+ FAIL2("panic: regpiece returned NULL, flags=%#" UVxf, (UV) flags);
}
else if (ret == NULL)
ret = latest;
}
/*
- - regpiece - something followed by possible [*+?]
+ - regpiece - something followed by possible quantifier * + ? {n,m}
*
* Note that the branching code sequences used for ? and the general cases
* of * and + are somewhat optimized: they use the same NOTHING node as
if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8))
*flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8);
else
- FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
+ FAIL2("panic: regatom returned NULL, flags=%#" UVxf, (UV) flags);
return(NULL);
}
nextchar(pRExC_state);
if (max < min) { /* If can't match, warn and optimize to fail
unconditionally */
- if (SIZE_ONLY) {
-
- /* We can't back off the size because we have to reserve
- * enough space for all the things we are about to throw
- * away, but we can shrink it by the amount we are about
- * to re-use here */
- RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
- }
- else {
+ reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
+ if (PASS2) {
ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
- RExC_emit = orig_emit;
+ NEXT_OFF(orig_emit)= regarglen[OPFAIL] + NODE_STEP_REGNODE;
}
- ret = reganode(pRExC_state, OPFAIL, 0);
return ret;
}
else if (min == max && *RExC_parse == '?')
if ((flags&SIMPLE)) {
if (min == 0 && max == REG_INFTY) {
reginsert(pRExC_state, STAR, ret, depth+1);
- ret->flags = 0;
MARK_NAUGHTY(4);
RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
goto nest_check;
}
if (min == 1 && max == REG_INFTY) {
reginsert(pRExC_state, PLUS, ret, depth+1);
- ret->flags = 0;
MARK_NAUGHTY(3);
RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
goto nest_check;
if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
ckWARN2reg(RExC_parse,
- "%"UTF8f" matches null string many times",
+ "%" UTF8f " matches null string many times",
UTF8fARG(UTF, (RExC_parse >= origparse
? RExC_parse - origparse
: 0),
ender = reg_node(pRExC_state, SUCCEED);
REGTAIL(pRExC_state, ret, ender);
reginsert(pRExC_state, SUSPEND, ret, depth+1);
- ret->flags = 0;
ender = reg_node(pRExC_state, TAIL);
REGTAIL(pRExC_state, ret, ender);
}
RExC_parse++; /* Skip past the '{' */
- if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
- || ! (endbrace == RExC_parse /* nothing between the {} */
+ endbrace = strchr(RExC_parse, '}');
+ if (! endbrace) { /* no trailing brace */
+ vFAIL2("Missing right brace on \\%c{}", 'N');
+ }
+ else if(!(endbrace == RExC_parse /* nothing between the {} */
|| (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */
&& strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
error msg) */
{
- if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
+ RExC_parse = endbrace; /* position msg's '<--HERE' */
vFAIL("\\N{NAME} must be resolved by the lexer");
}
}
sv_catpv(substitute_parse, ")");
- RExC_parse = RExC_start = RExC_adjusted_start = SvPV(substitute_parse,
- len);
+ len = SvCUR(substitute_parse);
/* Don't allow empty number */
if (len < (STRLEN) 8) {
RExC_parse = endbrace;
vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
+
+ RExC_parse = RExC_start = RExC_adjusted_start
+ = SvPV_nolen(substitute_parse);
RExC_end = RExC_parse + len;
/* The values are Unicode, and therefore not subject to recoding, but
* have to be converted to native on a non-Unicode (meaning non-ASCII)
* platform. */
- RExC_override_recoding = 1;
#ifdef EBCDIC
RExC_recode_x_to_native = 1;
#endif
*flagp = flags & (RESTART_PASS1|NEED_UTF8);
return FALSE;
}
- FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
+ FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#" UVxf,
(UV) flags);
}
*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
RExC_start = RExC_adjusted_start = save_start;
RExC_parse = endbrace;
RExC_end = orig_end;
- RExC_override_recoding = 0;
#ifdef EBCDIC
RExC_recode_x_to_native = 0;
#endif
}
}
+STATIC bool
+S_new_regcurly(const char *s, const char *e)
+{
+ /* This is a temporary function designed to match the most lenient form of
+ * a {m,n} quantifier we ever envision, with either number omitted, and
+ * spaces anywhere between/before/after them.
+ *
+ * If this function fails, then the string it matches is very unlikely to
+ * ever be considered a valid quantifier, so we can allow the '{' that
+ * begins it to be considered as a literal */
+
+ bool has_min = FALSE;
+ bool has_max = FALSE;
+
+ PERL_ARGS_ASSERT_NEW_REGCURLY;
+
+ if (s >= e || *s++ != '{')
+ return FALSE;
+
+ while (s < e && isSPACE(*s)) {
+ s++;
+ }
+ while (s < e && isDIGIT(*s)) {
+ has_min = TRUE;
+ s++;
+ }
+ while (s < e && isSPACE(*s)) {
+ s++;
+ }
+
+ if (*s == ',') {
+ s++;
+ while (s < e && isSPACE(*s)) {
+ s++;
+ }
+ while (s < e && isDIGIT(*s)) {
+ has_max = TRUE;
+ s++;
+ }
+ while (s < e && isSPACE(*s)) {
+ s++;
+ }
+ }
+
+ return s < e && *s == '}' && (has_min || has_max);
+}
/* Parse backref decimal value, unless it's too big to sensibly be a backref,
* in which case return I32_MAX (rather than possibly 32-bit wrapping) */
if (ret == NULL) {
if (*flagp & (RESTART_PASS1|NEED_UTF8))
return NULL;
- FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
+ FAIL2("panic: regclass returned NULL to regatom, flags=%#" UVxf,
(UV) *flagp);
}
if (*RExC_parse != ']') {
*flagp = flags & (RESTART_PASS1|NEED_UTF8);
return NULL;
}
- FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
+ FAIL2("panic: reg returned NULL to regatom, flags=%#" UVxf,
(UV) flags);
}
*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
bad_bound_type:
RExC_parse = endbrace;
vFAIL2utf8f(
- "'%"UTF8f"' is an unknown bound type",
+ "'%" UTF8f "' is an unknown bound type",
UTF8fARG(UTF, length, endbrace - length));
NOT_REACHED; /*NOTREACHED*/
}
/* FALLTHROUGH */
finish_meta_pat:
+ if ( UCHARAT(RExC_parse + 1) == '{'
+ && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
+ {
+ RExC_parse += 2;
+ vFAIL("Unescaped left brace in regex is illegal here");
+ }
nextchar(pRExC_state);
Set_Node_Length(ret, 2); /* MJD */
break;
/* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
* multi-char folds are allowed. */
if (!ret)
- FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
+ FAIL2("panic: regclass returned NULL to regatom, flags=%#" UVxf,
(UV) *flagp);
RExC_parse--;
goto loopdone;
}
p = RExC_parse;
+ RExC_parse = parse_start;
if (ender > 0xff) {
REQUIRE_UTF8(flagp);
}
} /* End of switch on '\' */
break;
case '{':
- /* Currently we don't care if the lbrace is at the start
- * of a construct. This catches it in the middle of a
- * literal string, or when it's the first thing after
- * something like "\b" */
- if (len || (p > RExC_start && isALPHA_A(*(p -1)))) {
- RExC_parse = p + 1;
- vFAIL("Unescaped left brace in regex is illegal here");
+ /* Currently we allow an lbrace at the start of a construct
+ * without raising a warning. This is because we think we
+ * will never want such a brace to be meant to be other
+ * than taken literally. */
+ if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
+
+ /* But, we raise a fatal warning otherwise, as the
+ * deprecation cycle has come and gone. Except that it
+ * turns out that some heavily-relied on upstream
+ * software, notably GNU Autoconf, have failed to fix
+ * their uses. For these, don't make it fatal unless
+ * we anticipate using the '{' for something else.
+ * This happens after any alpha, and for a looser {m,n}
+ * quantifier specification */
+ if ( RExC_strict
+ || ( p > parse_start + 1
+ && isALPHA_A(*(p - 1))
+ && *(p - 2) == '\\')
+ || new_regcurly(p, RExC_end))
+ {
+ RExC_parse = p + 1;
+ vFAIL("Unescaped left brace in regex is "
+ "illegal here");
+ }
+ if (PASS2) {
+ ckWARNregdep(p + 1,
+ "Unescaped left brace in regex is "
+ "deprecated here (and will be fatal "
+ "in Perl 5.30), passed through");
+ }
}
+ goto normal_default;
+ case '}':
+ case ']':
+ if (PASS2 && p > RExC_parse && RExC_strict) {
+ ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
+ }
/*FALLTHROUGH*/
default: /* A literal character */
normal_default:
* this character again next time through, when it will be the
* only thing in its new node */
- if ((next_is_quantifier = ( LIKELY(p < RExC_end)
- && UNLIKELY(ISMULT2(p))))
- && LIKELY(len))
- {
+ next_is_quantifier = LIKELY(p < RExC_end)
+ && UNLIKELY(ISMULT2(p));
+
+ if (next_is_quantifier && LIKELY(len)) {
p = oldp;
goto loopdone;
}
skip_to_be_ignored_text(pRExC_state, &RExC_parse,
FALSE /* Don't force to /x */ );
if (PASS2 && *RExC_parse == '{' && OP(ret) != SBOL && ! regcurly(RExC_parse)) {
- ckWARNregdep(RExC_parse + 1, "Unescaped left brace in regex is deprecated here, passed through");
+ ckWARNregdep(RExC_parse + 1, "Unescaped left brace in regex is deprecated here (and will be fatal in Perl 5.30), passed through");
}
return(ret);
REPORT_LOCATION_ARGS(p))); \
} \
} STMT_END
+#define CLEAR_POSIX_WARNINGS() \
+ STMT_START { \
+ if (posix_warnings && RExC_warn_text) \
+ av_clear(RExC_warn_text); \
+ } STMT_END
+
+#define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
+ STMT_START { \
+ CLEAR_POSIX_WARNINGS(); \
+ return ret; \
+ } STMT_END
STATIC int
S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
*
* The syntax for a legal posix class is:
*
- * qr/(?xa: \[ : \^? [:lower:]{4,6} : \] )/
+ * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
*
* What this routine considers syntactically to be an intended posix class
* is this (the comments indicate some restrictions that the pattern
* # for it to be considered to be
* # an intended posix class.
* \h*
- * [:punct:]? # The closing class character,
+ * [[:punct:]]? # The closing class character,
* # possibly omitted. If not a colon
* # nor semi colon, the class name
* # must be even closer to a valid
* decide that no posix class was intended. Should be at least
* sizeof("alphanumeric") */
UV input_text[15];
+ STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
- if (posix_warnings && RExC_warn_text)
- av_clear(RExC_warn_text);
+ CLEAR_POSIX_WARNINGS();
if (p >= e) {
return NOT_MEANT_TO_BE_A_POSIX_CLASS;
*updated_parse_ptr = (char *) temp_ptr;
}
- return OOB_NAMEDCLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
}
}
/* We consider something like [^:^alnum:]] to not have been intended to
* be a posix class, but XXX maybe we should */
if (complement) {
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
complement = 1;
* this leaves this construct looking like [:] or [:^], which almost
* certainly weren't intended to be posix classes */
if (has_opening_bracket) {
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
/* But this function can be called when we parse the colon for
/* XXX We are currently very restrictive here, so this code doesn't
* consider the possibility that, say, /[alpha.]]/ was intended to
* be a posix class. */
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
/* Here we have something like 'foo:]'. There was no initial colon,
}
/* Otherwise, it can't have meant to have been a class */
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
/* If we ran off the end, and the final character was a punctuation
* class name. (We can do this on the first pass, as any second pass
* will yield an even shorter name) */
if (name_len < 3) {
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
/* Find which class it is. Initially switch on the length of the name.
}
/* Here neither pass found a close-enough class name */
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
probably_meant_to_be:
/* If it is a known class, return the class. The class number
* #defines are structured so each complement is +1 to the normal
* one */
- return class_number + complement;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
}
else if (! check_only) {
? "^"
: "";
RExC_parse = (char *) p;
- vFAIL3utf8f("POSIX class [:%s%"UTF8f":] unknown",
+ vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
complement_string,
UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
}
&posix_warnings
))
FAIL2("panic: regclass returned NULL to handle_sets, "
- "flags=%#"UVxf"", (UV) *flagp);
+ "flags=%#" UVxf, (UV) *flagp);
/* function call leaves parse pointing to the ']', except
* if we faked it */
no_close:
/* We output the messages even if warnings are off, because we'll fail
* the very next thing, and these give a likely diagnosis for that */
- if (posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
+ if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
}
stack, fence, fence_stack));
#endif
- top_index = av_tindex_nomg(stack);
+ top_index = av_tindex_skip_len_mg(stack);
switch (curchar) {
SV** stacked_ptr; /* Ptr to something already on 'stack' */
NULL))
{
FAIL2("panic: regclass returned NULL to handle_sets, "
- "flags=%#"UVxf"", (UV) *flagp);
+ "flags=%#" UVxf, (UV) *flagp);
}
/* regclass() will return with parsing just the \ sequence,
))
{
FAIL2("panic: regclass returned NULL to handle_sets, "
- "flags=%#"UVxf"", (UV) *flagp);
+ "flags=%#" UVxf, (UV) *flagp);
}
/* function call leaves parse pointing to the ']', except if we
goto done;
case ')':
- if (av_tindex_nomg(fence_stack) < 0) {
+ if (av_tindex_skip_len_mg(fence_stack) < 0) {
RExC_parse++;
vFAIL("Unexpected ')'");
}
* may have altered the stack in the time since we earlier set
* 'top_index'. */
- top_index = av_tindex_nomg(stack);
+ top_index = av_tindex_skip_len_mg(stack);
if (top_index - fence >= 0) {
/* If the top entry on the stack is an operator, it had better
* be a '!', otherwise the entry below the top operand should
} /* End of loop parsing through the construct */
done:
- if (av_tindex_nomg(fence_stack) >= 0) {
+ if (av_tindex_skip_len_mg(fence_stack) >= 0) {
vFAIL("Unmatched (");
}
- if (av_tindex_nomg(stack) < 0 /* Was empty */
+ if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
|| ((final = av_pop(stack)) == NULL)
|| ! IS_OPERAND(final)
|| SvTYPE(final) != SVt_INVLIST
- || av_tindex_nomg(stack) >= 0) /* More left on stack */
+ || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
{
bad_syntax:
SvREFCNT_dec(final);
result_string = newSVpvs("");
while (invlist_iternext(final, &start, &end)) {
if (start == end) {
- Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
+ Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
}
else {
- Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
+ Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}",
start, end);
}
}
NULL
);
if (!node)
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
+ FAIL2("panic: regclass returned NULL to handle_sets, flags=%#" UVxf,
PTR2UV(flagp));
/* Fix up the node type if we are in locale. (We have pretended we are
AV * stack, const IV fence, AV * fence_stack)
{ /* Dumps the stacks in handle_regex_sets() */
- const SSize_t stack_top = av_tindex_nomg(stack);
- const SSize_t fence_stack_top = av_tindex_nomg(fence_stack);
+ const SSize_t stack_top = av_tindex_skip_len_mg(stack);
+ const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
SSize_t i;
PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
character; used under /i */
UV n;
char * stop_ptr = RExC_end; /* where to stop parsing */
- const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
- space? */
+
+ /* ignore unescaped whitespace? */
+ const bool skip_white = cBOOL( ret_invlist
+ || (RExC_flags & RXf_PMf_EXTENDED_MORE));
/* Unicode properties are stored in a swash; this holds the current one
* being parsed. If this swash is the only above-latin1 component of the
while (1) {
if ( posix_warnings
- && av_tindex_nomg(posix_warnings) >= 0
+ && av_tindex_skip_len_mg(posix_warnings) >= 0
&& RExC_parse > not_posix_region_end)
{
/* Warnings about posix class issues are considered tentative until
* posix class, and it failed, it was a false alarm, as this
* successful one proves */
if ( posix_warnings
- && av_tindex_nomg(posix_warnings) >= 0
+ && av_tindex_skip_len_mg(posix_warnings) >= 0
&& not_posix_region_end >= RExC_parse
&& not_posix_region_end <= posix_class_end)
{
RExC_parse = e + 1;
/* diag_listed_as: Can't find Unicode property definition "%s" */
- vFAIL3utf8f("%s \"%"UTF8f"\"",
+ vFAIL3utf8f("%s \"%" UTF8f "\"",
msg, UTF8fARG(UTF, n, name));
}
SAVEFREEPV(name);
}
}
- Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%"UTF8f"%s\n",
+ Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%" UTF8f "%s\n",
(value == 'p' ? '+' : '!'),
(FOLD) ? "__" : "",
UTF8fARG(UTF, n, name),
: 0;
if (strict) {
vFAIL2utf8f(
- "False [] range \"%"UTF8f"\"",
+ "False [] range \"%" UTF8f "\"",
UTF8fARG(UTF, w, rangebegin));
}
else {
SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
ckWARN2reg(RExC_parse,
- "False [] range \"%"UTF8f"\"",
+ "False [] range \"%" UTF8f "\"",
UTF8fARG(UTF, w, rangebegin));
(void)ReREFCNT_inc(RExC_rx_sv);
cp_list = add_cp_to_invlist(cp_list, '-');
#endif
w = RExC_parse - rangebegin;
vFAIL2utf8f(
- "Invalid [] range \"%"UTF8f"\"",
+ "Invalid [] range \"%" UTF8f "\"",
UTF8fARG(UTF, w, rangebegin));
NOT_REACHED; /* NOTREACHED */
}
foldbuf + foldlen);
SV* multi_fold = sv_2mortal(newSVpvs(""));
- Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
+ Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
multi_char_matches
= add_multi_match(multi_char_matches,
* must be be all digits or all letters of the same case.
* Otherwise, the range is non-portable and unclear as to
* what it contains */
- if ((isPRINT_A(prevvalue) || isPRINT_A(value))
- && (non_portable_endpoint
- || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
- || (isLOWER_A(prevvalue) && isLOWER_A(value))
- || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
- {
- vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
+ if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
+ && ( non_portable_endpoint
+ || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
+ || (isLOWER_A(prevvalue) && isLOWER_A(value))
+ || (isUPPER_A(prevvalue) && isUPPER_A(value))
+ ))) {
+ vWARN(RExC_parse, "Ranges of ASCII printables should"
+ " be some subset of \"0-9\","
+ " \"A-Z\", or \"a-z\"");
}
else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
+ SSize_t index_start;
+ SSize_t index_final;
/* But the nature of Unicode and languages mean we
* can't do the same checks for above-ASCII ranges,
* contain only digits from the same group of 10. The
* ASCII case is handled just above. 0x660 is the
* first digit character beyond ASCII. Hence here, the
- * range could be a range of digits. Find out. */
- IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
- prevvalue);
- IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
- value);
-
- /* If the range start and final points are in the same
- * inversion list element, it means that either both
- * are not digits, or both are digits in a consecutive
- * sequence of digits. (So far, Unicode has kept all
- * such sequences as distinct groups of 10, but assert
- * to make sure). If the end points are not in the
- * same element, neither should be a digit. */
- if (index_start == index_final) {
- assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
- || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
- - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
- == 10)
- /* But actually Unicode did have one group of 11
- * 'digits' in 5.2, so in case we are operating
- * on that version, let that pass */
- || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
- - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
- == 11
- && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
- == 0x19D0)
- );
+ * range could be a range of digits. First some
+ * unlikely special cases. Grandfather in that a range
+ * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
+ * if its starting value is one of the 10 digits prior
+ * to it. This is because it is an alternate way of
+ * writing 19D1, and some people may expect it to be in
+ * that group. But it is bad, because it won't give
+ * the expected results. In Unicode 5.2 it was
+ * considered to be in that group (of 11, hence), but
+ * this was fixed in the next version */
+
+ if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
+ goto warn_bad_digit_range;
}
- else if ((index_start >= 0
- && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
- || (index_final >= 0
- && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
+ else if (UNLIKELY( prevvalue >= 0x1D7CE
+ && value <= 0x1D7FF))
{
- vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
+ /* This is the only other case currently in Unicode
+ * where the algorithm below fails. The code
+ * points just above are the end points of a single
+ * range containing only decimal digits. It is 5
+ * different series of 0-9. All other ranges of
+ * digits currently in Unicode are just a single
+ * series. (And mktables will notify us if a later
+ * Unicode version breaks this.)
+ *
+ * If the range being checked is at most 9 long,
+ * and the digit values represented are in
+ * numerical order, they are from the same series.
+ * */
+ if ( value - prevvalue > 9
+ || ((( value - 0x1D7CE) % 10)
+ <= (prevvalue - 0x1D7CE) % 10))
+ {
+ goto warn_bad_digit_range;
+ }
+ }
+ else {
+
+ /* For all other ranges of digits in Unicode, the
+ * algorithm is just to check if both end points
+ * are in the same series, which is the same range.
+ * */
+ index_start = _invlist_search(
+ PL_XPosix_ptrs[_CC_DIGIT],
+ prevvalue);
+
+ /* Warn if the range starts and ends with a digit,
+ * and they are not in the same group of 10. */
+ if ( index_start >= 0
+ && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
+ && (index_final =
+ _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
+ value)) != index_start
+ && index_final >= 0
+ && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
+ {
+ warn_bad_digit_range:
+ vWARN(RExC_parse, "Ranges of digits should be"
+ " from the same group of"
+ " 10");
+ }
}
}
}
} /* End of loop through all the text within the brackets */
- if ( posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
+ if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
output_or_return_posix_warnings(pRExC_state, posix_warnings,
return_posix_warnings);
}
#endif
/* Look at the longest folds first */
- for (cp_count = av_tindex_nomg(multi_char_matches);
+ for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
cp_count > 0;
cp_count--)
{
RExC_adjusted_start = RExC_start + prefix_end;
RExC_end = RExC_parse + len;
RExC_in_multi_char_class = 1;
- RExC_override_recoding = 1;
RExC_emit = (regnode *)orig_emit;
ret = reg(pRExC_state, 1, ®_flags, depth+1);
RExC_precomp_adj = 0;
RExC_end = save_end;
RExC_in_multi_char_class = 0;
- RExC_override_recoding = 0;
SvREFCNT_dec_NN(multi_char_matches);
return ret;
}
{
AV* list = (AV*) *listp;
IV k;
- for (k = 0; k <= av_tindex_nomg(list); k++) {
+ for (k = 0; k <= av_tindex_skip_len_mg(list); k++) {
SV** c_p = av_fetch(list, k, FALSE);
UV c;
assert(c_p);
si = *ary; /* ary[0] = the string to initialize the swash with */
- if (av_tindex_nomg(av) >= 2) {
+ if (av_tindex_skip_len_mg(av) >= 2) {
if (only_utf8_locale_ptr
&& ary[2]
&& ary[2] != &PL_sv_undef)
* is any inversion list generated at compile time; [4]
* indicates if that inversion list has any user-defined
* properties in it. */
- if (av_tindex_nomg(av) >= 3) {
+ if (av_tindex_skip_len_mg(av) >= 3) {
invlist = ary[3];
if (SvUV(ary[4])) {
swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
#else
if (RExC_offsets) { /* MJD */
MJD_OFFSET_DEBUG(
- ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
+ ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
name, __LINE__,
PL_reg_name[op],
(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
- reginsert - insert an operator in front of already-emitted operand
*
* Means relocating the operand.
+*
+* IMPORTANT NOTE - it is the *callers* responsibility to correctly
+* set up NEXT_OFF() of the inserted node if needed. Something like this:
+*
+* reginsert(pRExC, OPFAIL, orig_emit, depth+1);
+* if (PASS2)
+* NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
+*
+* ALSO NOTE - operand->flags will be set to 0 as well.
*/
STATIC void
-S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
+S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *operand, U32 depth)
{
regnode *src;
regnode *dst;
dst = RExC_emit;
if (RExC_open_parens) {
int paren;
- /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
+ /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
/* remember that RExC_npar is rex->nparens + 1,
* iow it is 1 more than the number of parens seen in
* the pattern so far. */
/* note, RExC_open_parens[0] is the start of the
* regex, it can't move. RExC_close_parens[0] is the end
* of the regex, it *can* move. */
- if ( paren && RExC_open_parens[paren] >= opnd ) {
+ if ( paren && RExC_open_parens[paren] >= operand ) {
/*DEBUG_PARSE_FMT("open"," - %d",size);*/
RExC_open_parens[paren] += size;
} else {
/*DEBUG_PARSE_FMT("open"," - %s","ok");*/
}
- if ( RExC_close_parens[paren] >= opnd ) {
+ if ( RExC_close_parens[paren] >= operand ) {
/*DEBUG_PARSE_FMT("close"," - %d",size);*/
RExC_close_parens[paren] += size;
} else {
if (RExC_end_op)
RExC_end_op += size;
- while (src > opnd) {
+ while (src > operand) {
StructCopy(--src, --dst, regnode);
#ifdef RE_TRACK_PATTERN_OFFSETS
if (RExC_offsets) { /* MJD 20010112 */
MJD_OFFSET_DEBUG(
- ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
+ ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
"reg_insert",
__LINE__,
PL_reg_name[op],
#endif
}
-
- place = opnd; /* Op node, where operand used to be. */
+ place = operand; /* Op node, where operand used to be. */
#ifdef RE_TRACK_PATTERN_OFFSETS
if (RExC_offsets) { /* MJD */
MJD_OFFSET_DEBUG(
- ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
+ ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
"reginsert",
__LINE__,
PL_reg_name[op],
}
#endif
src = NEXTOPER(place);
+ place->flags = 0;
FILL_ADVANCE_NODE(place, op);
Zero(src, offset, regnode);
}
DEBUG_PARSE_MSG("");
regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
Perl_re_printf( aTHX_
- "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
+ "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
SvPV_nolen_const(RExC_mysv),
(IV)REG_NODE_NUM(val),
(IV)(val - scan)
Perl_regdump(pTHX_ const regexp *r)
{
#ifdef DEBUGGING
+ int i;
SV * const sv = sv_newmortal();
SV *dsv= sv_newmortal();
RXi_GET_DECL(r,ri);
(void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
/* Header fields of interest. */
- if (r->anchored_substr) {
- RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
- RE_SV_DUMPLEN(r->anchored_substr), 30);
- Perl_re_printf( aTHX_
- "anchored %s%s at %"IVdf" ",
- s, RE_SV_TAIL(r->anchored_substr),
- (IV)r->anchored_offset);
- } else if (r->anchored_utf8) {
- RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
- RE_SV_DUMPLEN(r->anchored_utf8), 30);
- Perl_re_printf( aTHX_
- "anchored utf8 %s%s at %"IVdf" ",
- s, RE_SV_TAIL(r->anchored_utf8),
- (IV)r->anchored_offset);
- }
- if (r->float_substr) {
- RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
- RE_SV_DUMPLEN(r->float_substr), 30);
- Perl_re_printf( aTHX_
- "floating %s%s at %"IVdf"..%"UVuf" ",
- s, RE_SV_TAIL(r->float_substr),
- (IV)r->float_min_offset, (UV)r->float_max_offset);
- } else if (r->float_utf8) {
- RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
- RE_SV_DUMPLEN(r->float_utf8), 30);
- Perl_re_printf( aTHX_
- "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
- s, RE_SV_TAIL(r->float_utf8),
- (IV)r->float_min_offset, (UV)r->float_max_offset);
+ for (i = 0; i < 2; i++) {
+ if (r->substrs->data[i].substr) {
+ RE_PV_QUOTED_DECL(s, 0, dsv,
+ SvPVX_const(r->substrs->data[i].substr),
+ RE_SV_DUMPLEN(r->substrs->data[i].substr),
+ 30);
+ Perl_re_printf( aTHX_
+ "%s %s%s at %" IVdf "..%" UVuf " ",
+ i ? "floating" : "anchored",
+ s,
+ RE_SV_TAIL(r->substrs->data[i].substr),
+ (IV)r->substrs->data[i].min_offset,
+ (UV)r->substrs->data[i].max_offset);
+ }
+ else if (r->substrs->data[i].utf8_substr) {
+ RE_PV_QUOTED_DECL(s, 1, dsv,
+ SvPVX_const(r->substrs->data[i].utf8_substr),
+ RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
+ 30);
+ Perl_re_printf( aTHX_
+ "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
+ i ? "floating" : "anchored",
+ s,
+ RE_SV_TAIL(r->substrs->data[i].utf8_substr),
+ (IV)r->substrs->data[i].min_offset,
+ (UV)r->substrs->data[i].max_offset);
+ }
}
+
if (r->check_substr || r->check_utf8)
Perl_re_printf( aTHX_
(const char *)
- (r->check_substr == r->float_substr
- && r->check_utf8 == r->float_utf8
+ ( r->check_substr == r->substrs->data[1].substr
+ && r->check_utf8 == r->substrs->data[1].utf8_substr
? "(checking floating" : "(checking anchored"));
if (r->intflags & PREGf_NOSCAN)
Perl_re_printf( aTHX_ " noscan");
Perl_re_printf( aTHX_ " ");
}
if (r->intflags & PREGf_GPOS_SEEN)
- Perl_re_printf( aTHX_ "GPOS:%"UVuf" ", (UV)r->gofs);
+ Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
if (r->intflags & PREGf_SKIP)
Perl_re_printf( aTHX_ "plus ");
if (r->intflags & PREGf_IMPLICIT)
Perl_re_printf( aTHX_ "implicit ");
- Perl_re_printf( aTHX_ "minlen %"IVdf" ", (IV)r->minlen);
+ Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
if (r->extflags & RXf_EVAL_SEEN)
Perl_re_printf( aTHX_ "with eval ");
Perl_re_printf( aTHX_ "\n");
if (trie->jump)
sv_catpvs(sv, "(JUMP)");
Perl_sv_catpvf(aTHX_ sv,
- "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
+ "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
(UV)trie->startstate,
(IV)trie->statecount-1, /* -1 because of the unused 0 element */
(UV)trie->wordcount,
{
AV *name_list= NULL;
U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
- Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno); /* Parenth number */
+ Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */
if ( RXp_PAREN_NAMES(prog) ) {
name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
} else if ( pRExC_state ) {
if ( k != REF || (OP(o) < NREF)) {
SV **name= av_fetch(name_list, parno, 0 );
if (name)
- Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
+ Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
}
else {
SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
I32 n;
if (name) {
for ( n=0; n<SvIVX(sv_dat); n++ ) {
- Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
+ Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
(n ? "," : ""), (IV)nums[n]);
}
- Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
+ Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
}
}
}
if (name_list) {
SV **name= av_fetch(name_list, ARG(o), 0 );
if (name)
- Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
+ Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
}
}
else if (k == LOGICAL)
/* add on the verb argument if there is one */
if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
- Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
+ if ( ARG(o) )
+ Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
+ else
+ sv_catpvs(sv, ":NULL");
}
#else
PERL_UNUSED_CONTEXT;
} else {
CALLREGFREE_PVT(rx); /* free the private data */
SvREFCNT_dec(RXp_PAREN_NAMES(r));
- Safefree(r->xpv_len_u.xpvlenu_pv);
}
if (r->substrs) {
- SvREFCNT_dec(r->anchored_substr);
- SvREFCNT_dec(r->anchored_utf8);
- SvREFCNT_dec(r->float_substr);
- SvREFCNT_dec(r->float_utf8);
+ int i;
+ for (i = 0; i < 2; i++) {
+ SvREFCNT_dec(r->substrs->data[i].substr);
+ SvREFCNT_dec(r->substrs->data[i].utf8_substr);
+ }
Safefree(r->substrs);
}
RX_MATCH_COPY_FREE(rx);
SvREFCNT_dec(r->qr_anoncv);
if (r->recurse_locinput)
Safefree(r->recurse_locinput);
- rx->sv_u.svu_rx = 0;
}
+
/* reg_temp_copy()
- This is a hacky workaround to the structural issue of match results
+ Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
+ except that dsv will be created if NULL.
+
+ This function is used in two main ways. First to implement
+ $r = qr/....; $s = $$r;
+
+ Secondly, it is used as a hacky workaround to the structural issue of
+ match results
being stored in the regexp structure which is in turn stored in
PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
could be PL_curpm in multiple contexts, and could require multiple
REGEXP *
-Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
+Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
{
- struct regexp *ret;
- struct regexp *const r = ReANY(rx);
- const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
+ struct regexp *drx;
+ struct regexp *const srx = ReANY(ssv);
+ const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
PERL_ARGS_ASSERT_REG_TEMP_COPY;
- if (!ret_x)
- ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
+ if (!dsv)
+ dsv = (REGEXP*) newSV_type(SVt_REGEXP);
else {
- SvOK_off((SV *)ret_x);
+ SvOK_off((SV *)dsv);
if (islv) {
- /* For PVLVs, SvANY points to the xpvlv body while sv_u points
- to the regexp. (For SVt_REGEXPs, sv_upgrade has already
- made both spots point to the same regexp body.) */
+ /* For PVLVs, the head (sv_any) points to an XPVLV, while
+ * the LV's xpvlenu_rx will point to a regexp body, which
+ * we allocate here */
REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
- assert(!SvPVX(ret_x));
- ret_x->sv_u.svu_rx = temp->sv_any;
+ assert(!SvPVX(dsv));
+ ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
temp->sv_any = NULL;
SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
SvREFCNT_dec_NN(temp);
/* SvCUR still resides in the xpvlv struct, so the regexp copy-
ing below will not set it. */
- SvCUR_set(ret_x, SvCUR(rx));
+ SvCUR_set(dsv, SvCUR(ssv));
}
}
/* This ensures that SvTHINKFIRST(sv) is true, and hence that
sv_force_normal(sv) is called. */
- SvFAKE_on(ret_x);
- ret = ReANY(ret_x);
+ SvFAKE_on(dsv);
+ drx = ReANY(dsv);
- SvFLAGS(ret_x) |= SvUTF8(rx);
+ SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
+ SvPV_set(dsv, RX_WRAPPED(ssv));
/* We share the same string buffer as the original regexp, on which we
hold a reference count, incremented when mother_re is set below.
The string pointer is copied here, being part of the regexp struct.
*/
- memcpy(&(ret->xpv_cur), &(r->xpv_cur),
+ memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
- if (r->offs) {
- const I32 npar = r->nparens+1;
- Newx(ret->offs, npar, regexp_paren_pair);
- Copy(r->offs, ret->offs, npar, regexp_paren_pair);
- }
- if (r->substrs) {
- Newx(ret->substrs, 1, struct reg_substr_data);
- StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
+ if (!islv)
+ SvLEN_set(dsv, 0);
+ if (srx->offs) {
+ const I32 npar = srx->nparens+1;
+ Newx(drx->offs, npar, regexp_paren_pair);
+ Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
+ }
+ if (srx->substrs) {
+ int i;
+ Newx(drx->substrs, 1, struct reg_substr_data);
+ StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
- SvREFCNT_inc_void(ret->anchored_substr);
- SvREFCNT_inc_void(ret->anchored_utf8);
- SvREFCNT_inc_void(ret->float_substr);
- SvREFCNT_inc_void(ret->float_utf8);
+ for (i = 0; i < 2; i++) {
+ SvREFCNT_inc_void(drx->substrs->data[i].substr);
+ SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
+ }
/* check_substr and check_utf8, if non-NULL, point to either their
anchored or float namesakes, and don't hold a second reference. */
}
- RX_MATCH_COPIED_off(ret_x);
+ RX_MATCH_COPIED_off(dsv);
#ifdef PERL_ANY_COW
- ret->saved_copy = NULL;
+ drx->saved_copy = NULL;
#endif
- ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
- SvREFCNT_inc_void(ret->qr_anoncv);
- if (r->recurse_locinput)
- Newxz(ret->recurse_locinput,r->nparens + 1,char *);
+ drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
+ SvREFCNT_inc_void(drx->qr_anoncv);
+ if (srx->recurse_locinput)
+ Newxz(drx->recurse_locinput,srx->nparens + 1,char *);
- return ret_x;
+ return dsv;
}
#endif
+
/* regfree_internal()
Free the private data in a regexp. This is overloadable by
if (ri->u.offsets)
Safefree(ri->u.offsets); /* 20010421 MJD */
#endif
- if (ri->code_blocks) {
- int n;
- for (n = 0; n < ri->num_code_blocks; n++)
- SvREFCNT_dec(ri->code_blocks[n].src_regex);
- Safefree(ri->code_blocks);
- }
+ if (ri->code_blocks)
+ S_free_codeblocks(aTHX_ ri->code_blocks);
if (ri->data) {
int n = ri->data->count;
/* Do it this way to avoid reading from *r after the StructCopy().
That way, if any of the sv_dup_inc()s dislodge *r from the L1
cache, it doesn't matter. */
+ int i;
const bool anchored = r->check_substr
- ? r->check_substr == r->anchored_substr
- : r->check_utf8 == r->anchored_utf8;
+ ? r->check_substr == r->substrs->data[0].substr
+ : r->check_utf8 == r->substrs->data[0].utf8_substr;
Newx(ret->substrs, 1, struct reg_substr_data);
StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
- ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
- ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
- ret->float_substr = sv_dup_inc(ret->float_substr, param);
- ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
+ for (i = 0; i < 2; i++) {
+ ret->substrs->data[i].substr =
+ sv_dup_inc(ret->substrs->data[i].substr, param);
+ ret->substrs->data[i].utf8_substr =
+ sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
+ }
/* check_substr and check_utf8, if non-NULL, point to either their
anchored or float namesakes, and don't hold a second reference. */
if (ret->check_substr) {
if (anchored) {
- assert(r->check_utf8 == r->anchored_utf8);
- ret->check_substr = ret->anchored_substr;
- ret->check_utf8 = ret->anchored_utf8;
+ assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
+
+ ret->check_substr = ret->substrs->data[0].substr;
+ ret->check_utf8 = ret->substrs->data[0].utf8_substr;
} else {
- assert(r->check_substr == r->float_substr);
- assert(r->check_utf8 == r->float_utf8);
- ret->check_substr = ret->float_substr;
- ret->check_utf8 = ret->float_utf8;
+ assert(r->check_substr == r->substrs->data[1].substr);
+ assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
+
+ ret->check_substr = ret->substrs->data[1].substr;
+ ret->check_utf8 = ret->substrs->data[1].utf8_substr;
}
} else if (ret->check_utf8) {
if (anchored) {
- ret->check_utf8 = ret->anchored_utf8;
+ ret->check_utf8 = ret->substrs->data[0].utf8_substr;
} else {
- ret->check_utf8 = ret->float_utf8;
+ ret->check_utf8 = ret->substrs->data[1].utf8_substr;
}
}
}
1: a buffer in a different thread
2: something we no longer hold a reference on
so we need to copy it locally. */
- RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
+ RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
ret->mother_re = NULL;
}
#endif /* PERL_IN_XSUB_RE */
Copy(ri->program, reti->program, len+1, regnode);
- reti->num_code_blocks = ri->num_code_blocks;
if (ri->code_blocks) {
int n;
- Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
- struct reg_code_block);
- Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
- struct reg_code_block);
- for (n = 0; n < ri->num_code_blocks; n++)
- reti->code_blocks[n].src_regex = (REGEXP*)
- sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
+ Newx(reti->code_blocks, 1, struct reg_code_blocks);
+ Newx(reti->code_blocks->cb, ri->code_blocks->count,
+ struct reg_code_block);
+ Copy(ri->code_blocks->cb, reti->code_blocks->cb,
+ ri->code_blocks->count, struct reg_code_block);
+ for (n = 0; n < ri->code_blocks->count; n++)
+ reti->code_blocks->cb[n].src_regex = (REGEXP*)
+ sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
+ reti->code_blocks->count = ri->code_blocks->count;
+ reti->code_blocks->refcnt = 1;
}
else
reti->code_blocks = NULL;
d->what[i] = ri->data->what[i];
switch (d->what[i]) {
/* see also regcomp.h and regfree_internal() */
- case 'a': /* actually an AV, but the dup function is identical. */
- case 'r':
- case 's':
- case 'S':
- case 'u': /* actually an HV, but the dup function is identical. */
+ case 'a': /* actually an AV, but the dup function is identical.
+ values seem to be "plain sv's" generally. */
+ case 'r': /* a compiled regex (but still just another SV) */
+ case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
+ this use case should go away, the code could have used
+ 'a' instead - see S_set_ANYOF_arg() for array contents. */
+ case 'S': /* actually an SV, but the dup function is identical. */
+ case 'u': /* actually an HV, but the dup function is identical.
+ values are "plain sv's" */
d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
break;
case 'f':
+ /* Synthetic Start Class - "Fake" charclass we generate to optimize
+ * patterns which could start with several different things. Pre-TRIE
+ * this was more important than it is now, however this still helps
+ * in some places, for instance /x?a+/ might produce a SSC equivalent
+ * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
+ * in regexec.c
+ */
/* This is cheating. */
Newx(d->data[i], 1, regnode_ssc);
StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
reti->regstclass = (regnode*)d->data[i];
break;
case 'T':
- /* Trie stclasses are readonly and can thus be shared
+ /* AHO-CORASICK fail table */
+ /* Trie stclasses are readonly and can thus be shared
* without duplication. We free the stclass in pregfree
* when the corresponding reg_ac_data struct is freed.
*/
reti->regstclass= ri->regstclass;
/* FALLTHROUGH */
case 't':
+ /* TRIE transition table */
OP_REFCNT_LOCK;
((reg_trie_data*)ri->data->data[i])->refcount++;
OP_REFCNT_UNLOCK;
/* FALLTHROUGH */
- case 'l':
- case 'L':
+ case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
+ case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
+ is not from another regexp */
d->data[i] = ri->data->data[i];
break;
default:
l1 = 512;
Copy(message, buf, l1 , char);
/* l1-1 to avoid \n */
- Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
+ Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf));
}
/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
PERL_ARGS_ASSERT_PUT_CODE_POINT;
if (c > 255) {
- Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
+ Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
}
else if (isPRINT(c)) {
const char string = (char) c;
: NUM_ANYOF_CODE_POINTS - 1;
#if NUM_ANYOF_CODE_POINTS > 256
format = (this_end < 256)
- ? "\\x%02"UVXf"-\\x%02"UVXf""
- : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
+ ? "\\x%02" UVXf "-\\x%02" UVXf
+ : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
#else
- format = "\\x%02"UVXf"-\\x%02"UVXf"";
+ format = "\\x%02" UVXf "-\\x%02" UVXf;
#endif
GCC_DIAG_IGNORE(-Wformat-nonliteral);
Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
#define CLEAR_OPTSTART \
if (optstart) STMT_START { \
DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \
- " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
+ " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
optstart=NULL; \
} STMT_END
CLEAR_OPTSTART;
regprop(r, sv, node, NULL, NULL);
- Perl_re_printf( aTHX_ "%4"IVdf":%*s%s", (IV)(node - start),
+ Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
(int)(2*indent + 1), "", SvPVX_const(sv));
if (OP(node) != OPTIMIZED) {
&& PL_regkind[OP(next)] != BRANCH )
Perl_re_printf( aTHX_ " (FAIL)");
else
- Perl_re_printf( aTHX_ " (%"IVdf")", (IV)(next - start));
+ Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
Perl_re_printf( aTHX_ "\n");
}
);
if (trie->jump) {
U16 dist= trie->jump[word_idx+1];
- Perl_re_printf( aTHX_ "(%"UVuf")\n",
+ Perl_re_printf( aTHX_ "(%" UVuf ")\n",
(UV)((dist ? this_trie + dist : next) - start));
if (dist) {
if (!nextbranch)