U32 state;
SV *sv=sv_newmortal();
int colwidth= widecharmap ? 6 : 4;
+ U16 word;
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_DUMP_TRIE;
}
PerlIO_printf( Perl_debug_log, "\n" );
}
+ PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
+ for (word=1; word <= trie->wordcount; word++) {
+ PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
+ (int)word, (int)(trie->wordinfo[word].prev),
+ (int)(trie->wordinfo[word].len));
+ }
+ PerlIO_printf(Perl_debug_log, "\n" );
}
/*
Dumps a fully constructed but uncompressed trie in list form.
#endif
+
/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
startbranch: the first branch in the whole branch sequence
first : start branch of sequence of branch-exact nodes.
U16 dupe= trie->states[ state ].wordnum; \
regnode * const noper_next = regnext( noper ); \
\
- if (trie->wordlen) \
- trie->wordlen[ curword ] = wordlen; \
DEBUG_r({ \
/* store the word for dumping */ \
SV* tmp; \
}); \
\
curword++; \
+ trie->wordinfo[curword].prev = 0; \
+ trie->wordinfo[curword].len = wordlen; \
+ trie->wordinfo[curword].accept = state; \
\
if ( noper_next < tail ) { \
if (!trie->jump) \
} \
\
if ( dupe ) { \
- /* So it's a dupe. This means we need to maintain a */\
- /* linked-list from the first to the next. */\
- /* we only allocate the nextword buffer when there */\
- /* a dupe, so first time we have to do the allocation */\
- if (!trie->nextword) \
- trie->nextword = (U16 *) \
- PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
- while ( trie->nextword[dupe] ) \
- dupe= trie->nextword[dupe]; \
- trie->nextword[dupe]= curword; \
+ /* It's a dupe. Pre-insert into the wordinfo[].prev */\
+ /* chain, so that when the bits of chain are later */\
+ /* linked together, the dups appear in the chain */\
+ trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
+ trie->wordinfo[dupe].prev = curword; \
} else { \
/* we haven't inserted this word yet. */ \
trie->states[ state ].wordnum = curword; \
regnode *jumper = NULL;
regnode *nextbranch = NULL;
regnode *convert = NULL;
+ U32 *prev_states; /* temp array mapping each state to previous one */
/* we just use folder as a flag in utf8 */
const U8 * const folder = ( flags == EXACTF
? PL_fold
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
if (!(UTF && folder))
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
+ trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
+ trie->wordcount+1, sizeof(reg_trie_wordinfo));
+
DEBUG_r({
trie_words = newAV();
});
(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
(int)trie->minlen, (int)trie->maxlen )
);
- trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
/*
We now know what we are dealing with in terms of unique chars and
*/
+ Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
+ prev_states[1] = 0;
+
if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
/*
Second Pass -- Array Of Lists Representation
}
if ( ! newstate ) {
newstate = next_alloc++;
+ prev_states[newstate] = state;
TRIE_LIST_PUSH( state, charid, newstate );
transcount++;
}
if ( !trie->trans[ state + charid ].next ) {
trie->trans[ state + charid ].next = next_alloc;
trie->trans[ state ].check++;
+ prev_states[TRIE_NODENUM(next_alloc)]
+ = TRIE_NODENUM(state);
next_alloc += trie->uniquecharcount;
}
state = trie->trans[ state + charid ].next;
PerlMemShared_realloc( trie->trans, trie->lasttrans
* sizeof(reg_trie_trans) );
- /* and now dump out the compressed format */
- DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
-
{ /* Modify the program and insert the new TRIE node*/
U8 nodetype =(U8)(flags & 0xFF);
char *str=NULL;
break;
}
}
+ trie->prefixlen = (state-1);
if (str) {
regnode *n = convert+NODE_SZ_STR(convert);
NEXT_OFF(convert) = NODE_SZ_STR(convert);
Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
});
} /* end node insert */
+
+ /* Finish populating the prev field of the wordinfo array. Walk back
+ * from each accept state until we find another accept state, and if
+ * so, point the first word's .prev field at the second word. If the
+ * second already has a .prev field set, stop now. This will be the
+ * case either if we've already processed that word's accept state,
+ * or that that state had multiple words, and the overspill words
+ * were already linked up earlier.
+ */
+ {
+ U16 word;
+ U32 state;
+ U16 prev;
+
+ for (word=1; word <= trie->wordcount; word++) {
+ prev = 0;
+ if (trie->wordinfo[word].prev)
+ continue;
+ state = trie->wordinfo[word].accept;
+ while (state) {
+ state = prev_states[state];
+ if (!state)
+ break;
+ prev = trie->states[state].wordnum;
+ if (prev)
+ break;
+ }
+ trie->wordinfo[word].prev = prev;
+ }
+ Safefree(prev_states);
+ }
+
+
+ /* and now dump out the compressed format */
+ DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
+
RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
#ifdef DEBUGGING
RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
}
flags &= ~SCF_DO_STCLASS;
}
- else if (strchr((const char*)PL_varies,OP(scan))) {
+ else if (REGNODE_VARIES(OP(scan))) {
I32 mincount, maxcount, minnext, deltanext, fl = 0;
I32 f = flags, pos_before = 0;
regnode * const oscan = scan;
/* Skip open. */
nxt = regnext(nxt);
- if (!strchr((const char*)PL_simple,OP(nxt))
+ if (!REGNODE_SIMPLE(OP(nxt))
&& !(PL_regkind[OP(nxt)] == EXACT
&& STR_LEN(nxt) == 1))
goto nogo;
data->longest = &(data->longest_float);
}
}
- else if (strchr((const char*)PL_simple,OP(scan))) {
+ else if (REGNODE_SIMPLE(OP(scan))) {
int value = 0;
if (flags & SCF_DO_SUBSTR) {
ri->regstclass = trie_op;
}
#endif
- else if (strchr((const char*)PL_simple,OP(first)))
+ else if (REGNODE_SIMPLE(OP(first)))
ri->regstclass = first;
else if (PL_regkind[OP(first)] == BOUND ||
PL_regkind[OP(first)] == NBOUND)
#endif
#ifdef DEBUGGING
if (RExC_paren_names) {
- ri->name_list_idx = add_data( pRExC_state, 1, "p" );
+ ri->name_list_idx = add_data( pRExC_state, 1, "a" );
ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
} else
#endif
STATIC regnode *
S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
{
- char * endbrace; /* endbrace following the name */
+ char * endbrace; /* '}' following the name */
regnode *ret = NULL;
#ifdef DEBUGGING
char* parse_start = RExC_parse - 2; /* points to the '\N' */
#endif
+ char* p;
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REG_NAMEDSEQ;
GET_RE_DEBUG_FLAGS;
+
+ /* The [^\n] meaning of \N ignores spaces and comments under the /x
+ * modifier. The other meaning does not */
+ p = (RExC_flags & RXf_PMf_EXTENDED)
+ ? regwhite( pRExC_state, RExC_parse )
+ : RExC_parse;
/* Disambiguate between \N meaning a named character versus \N meaning
- * don't match a newline. */
- if (*RExC_parse != '{'
- || (! (endbrace = strchr(RExC_parse, '}'))) /* no trailing brace */
- || ! (endbrace == RExC_parse + 1 /* nothing between the {} */
- || (endbrace - RExC_parse > 3 /* U+ and at least one hex */
- && strnEQ(RExC_parse + 1, "U+", 2))))
- {
+ * [^\n]. The former is assumed when it can't be the latter. */
+ if (*p != '{' || regcurly(p)) {
+ RExC_parse = p;
if (valuep) {
/* no bare \N in a charclass */
vFAIL("\\N in a character class must be a named character: \\N{...}");
return ret;
}
- /* Here, we have decided it is a named sequence */
+ /* Here, we have decided it should be a named sequence */
+
+ /* The test above made sure that the next real character is a '{', but
+ * under the /x modifier, it could be separated by space (or a comment and
+ * \n) and this is not allowed (for consistency with \x{...} and the
+ * tokenizer handling of \N{NAME}). */
+ if (*RExC_parse != '{') {
+ vFAIL("Missing braces on \\N{}");
+ }
+
RExC_parse++; /* Skip past the '{' */
+
+ if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
+ || ! (endbrace == RExC_parse /* nothing between the {} */
+ || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
+ && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
+ {
+ if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
+ vFAIL("\\N{NAME} must be resolved by the lexer");
+ }
+
if (endbrace == RExC_parse) { /* empty: \N{} */
if (! valuep) {
RExC_parse = endbrace + 1;
| PERL_SCAN_DISALLOW_PREFIX
| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
- char * endchar = strchr(RExC_parse, '.');
- if (endchar) {
+ char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
+ if (endchar < endbrace) {
ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
}
- else endchar = endbrace;
length_of_hex = (STRLEN)(endchar - RExC_parse);
*valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
/* The tokenizer should have guaranteed validity, but it's possible to
* bypass it by using single quoting, so check */
- if ( length_of_hex != (STRLEN)(endchar - RExC_parse) ) {
- *valuep = UNICODE_REPLACEMENT;
+ if (length_of_hex == 0
+ || length_of_hex != (STRLEN)(endchar - RExC_parse) )
+ {
+ RExC_parse += length_of_hex; /* Includes all the valid */
+ RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
+ ? UTF8SKIP(RExC_parse)
+ : 1;
+ /* Guard against malformed utf8 */
+ if (RExC_parse >= endchar) RExC_parse = endchar;
+ vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
RExC_parse = endbrace + 1;
* is primarily a named character, and not intended to be a huge long
* string, so 255 bytes should be good enough */
while (1) {
- STRLEN this_char_length;
+ STRLEN length_of_hex;
I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
| PERL_SCAN_DISALLOW_PREFIX
| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
/* Code points are separated by dots. If none, there is only one
* code point, and is terminated by the brace */
- endchar = strchr(RExC_parse, '.');
- if (! endchar) endchar = endbrace;
+ endchar = RExC_parse + strcspn(RExC_parse, ".}");
/* The values are Unicode even on EBCDIC machines */
- this_char_length = (STRLEN)(endchar - RExC_parse);
- cp = grok_hex(RExC_parse, &this_char_length, &grok_flags, NULL);
- if ( this_char_length == 0
- || this_char_length != (STRLEN)(endchar - RExC_parse) )
+ length_of_hex = (STRLEN)(endchar - RExC_parse);
+ cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
+ if ( length_of_hex == 0
+ || length_of_hex != (STRLEN)(endchar - RExC_parse) )
{
- cp = UNICODE_REPLACEMENT; /* Substitute a valid character */
+ RExC_parse += length_of_hex; /* Includes all the valid */
+ RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
+ ? UTF8SKIP(RExC_parse)
+ : 1;
+ /* Guard against malformed utf8 */
+ if (RExC_parse >= endchar) RExC_parse = endchar;
+ vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
if (! FOLD) { /* Not folding, just append to the string */
break;
case 'c':
p++;
- ender = UCHARAT(p++);
- ender = toCTRL(ender);
+ ender = grok_bslash_c(*p++, SIZE_ONLY);
break;
case '0': case '1': case '2': case '3':case '4':
case '5': case '6': case '7': case '8':case '9':
goto recode_encoding;
break;
case 'c':
- value = UCHARAT(RExC_parse++);
- value = toCTRL(value);
+ value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
/*
- regcurly - a little FSA that accepts {\d+,?\d*}
*/
+#ifndef PERL_IN_XSUB_RE
I32
Perl_regcurly(register const char *s)
{
return FALSE;
return TRUE;
}
-
+#endif
/*
- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
while (--n >= 0) {
/* If you add a ->what type here, update the comment in regcomp.h */
switch (ri->data->what[n]) {
+ case 'a':
case 's':
case 'S':
case 'u':
PerlMemShared_free(trie->trans);
if (trie->bitmap)
PerlMemShared_free(trie->bitmap);
- if (trie->wordlen)
- PerlMemShared_free(trie->wordlen);
if (trie->jump)
PerlMemShared_free(trie->jump);
- if (trie->nextword)
- PerlMemShared_free(trie->nextword);
+ PerlMemShared_free(trie->wordinfo);
/* do this last!!!! */
PerlMemShared_free(ri->data->data[n]);
}
Safefree(ri);
}
-#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
-#define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
-#define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
+#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
+#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
/*
for (i = 0; i < count; i++) {
d->what[i] = ri->data->what[i];
switch (d->what[i]) {
- /* legal options are one of: sSfpontTu
+ /* legal options are one of: sSfpontTua
see also regcomp.h and pregfree() */
+ case 'a': /* actually an AV, but the dup function is identical. */
case 's':
case 'S':
case 'p': /* actually an AV, but the dup function is identical. */
if (!p)
return(NULL);
+ if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
+ Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
+ }
+
offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
if (offset == 0)
return(NULL);
state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
- SSPUSHINT(SAVEt_RE_STATE);
+ SSPUSHUV(SAVEt_RE_STATE);
Copy(&PL_reg_state, state, 1, struct re_save_state);