*/
/*
- * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
+ * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
+ *
+ * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
*/
/* This file contains functions for compiling a regular expression. See
**** Alterations to Henry's code are...
****
**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
- **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
+ **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ **** by Larry Wall and others
****
**** You may distribute under the terms of either the GNU General Public
**** License or the Artistic License, as specified in the README file.
# include "regcomp.h"
#endif
+#include "dquote_static.c"
+
#ifdef op
#undef op
#endif /* op */
I32 orig_utf8; /* whether the pattern was originally in utf8 */
/* XXX use this for future optimisation of case
* where pattern must be upgraded to utf8. */
- HV *charnames; /* cache of named sequences */
HV *paren_names; /* Paren names */
regnode **recurse; /* Recurse regops */
#define RExC_seen_evals (pRExC_state->seen_evals)
#define RExC_utf8 (pRExC_state->utf8)
#define RExC_orig_utf8 (pRExC_state->orig_utf8)
-#define RExC_charnames (pRExC_state->charnames)
#define RExC_open_parens (pRExC_state->open_parens)
#define RExC_close_parens (pRExC_state->close_parens)
#define RExC_opend (pRExC_state->opend)
*/
#define WORST 0 /* Worst case. */
#define HASWIDTH 0x01 /* Known to match non-null strings. */
-#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
+
+/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
+ * character, and if utf8, must be invariant. */
+#define SIMPLE 0x02
#define SPSTART 0x04 /* Starts with * or +. */
#define TRYAGAIN 0x08 /* Weeded out a declaration. */
#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
+/* If not already in utf8, do a longjmp back to the beginning */
+#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
+#define REQUIRE_UTF8 STMT_START { \
+ if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
+ } STMT_END
/* About scan_data_t.
#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
#define SCF_SEEN_ACCEPT 0x8000
-#define UTF (RExC_utf8 != 0)
-#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
-#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
+#define UTF cBOOL(RExC_utf8)
+#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
+#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
+#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
#define OOB_UNICODE 12345678
#define OOB_NAMEDCLASS -1
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
-#define vWARN(loc,m) STMT_START { \
+#define ckWARNreg(loc,m) STMT_START { \
const IV offset = loc - RExC_precomp; \
- Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
- m, (int)offset, RExC_precomp, RExC_precomp + offset); \
+ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
+ (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
-#define vWARNdep(loc,m) STMT_START { \
+#define ckWARNregdep(loc,m) STMT_START { \
const IV offset = loc - RExC_precomp; \
- Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
- "%s" REPORT_LOCATION, \
- m, (int)offset, RExC_precomp, RExC_precomp + offset); \
+ Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
+ m REPORT_LOCATION, \
+ (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
-
-#define vWARN2(loc, m, a1) STMT_START { \
+#define ckWARN2reg(loc, m, a1) STMT_START { \
const IV offset = loc - RExC_precomp; \
- Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
+ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
+#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
+ const IV offset = loc - RExC_precomp; \
+ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
+ a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
+} STMT_END
+
#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
+#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
+ const IV offset = loc - RExC_precomp; \
+ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
+ a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
+} STMT_END
+
#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
const STRLEN old_l = CHR_SVLEN(*data->longest);
GET_RE_DEBUG_FLAGS_DECL;
+ PERL_ARGS_ASSERT_SCAN_COMMIT;
+
if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
SvSetMagicSV(*data->longest, data->last_found);
if (*data->longest == data->longest_fixed) {
STATIC void
S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
{
+ PERL_ARGS_ASSERT_CL_ANYTHING;
+
ANYOF_CLASS_ZERO(cl);
ANYOF_BITMAP_SETALL(cl);
cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
{
int value;
+ PERL_ARGS_ASSERT_CL_IS_ANYTHING;
+
for (value = 0; value <= ANYOF_MAX; value += 2)
if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
return 1;
STATIC void
S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
{
+ PERL_ARGS_ASSERT_CL_INIT;
+
Zero(cl, 1, struct regnode_charclass_class);
cl->type = ANYOF;
cl_anything(pRExC_state, cl);
STATIC void
S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
{
+ PERL_ARGS_ASSERT_CL_INIT_ZERO;
+
Zero(cl, 1, struct regnode_charclass_class);
cl->type = ANYOF;
cl_anything(pRExC_state, cl);
S_cl_and(struct regnode_charclass_class *cl,
const struct regnode_charclass_class *and_with)
{
+ PERL_ARGS_ASSERT_CL_AND;
assert(and_with->type == ANYOF);
if (!(and_with->flags & ANYOF_CLASS)
STATIC void
S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
{
+ PERL_ARGS_ASSERT_CL_OR;
+
if (or_with->flags & ANYOF_INVERT) {
/* We do not use
* (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
Dumps the final compressed table form of the trie to Perl_debug_log.
Used for debugging make_trie().
*/
-
+
STATIC void
S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
AV *revcharmap, U32 depth)
U32 state;
SV *sv=sv_newmortal();
int colwidth= widecharmap ? 6 : 4;
+ U16 word;
GET_RE_DEBUG_FLAGS_DECL;
+ PERL_ARGS_ASSERT_DUMP_TRIE;
PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
(int)depth * 2 + 2,"",
}
PerlIO_printf( Perl_debug_log, "\n" );
}
+ PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
+ for (word=1; word <= trie->wordcount; word++) {
+ PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
+ (int)word, (int)(trie->wordinfo[word].prev),
+ (int)(trie->wordinfo[word].len));
+ }
+ PerlIO_printf(Perl_debug_log, "\n" );
}
/*
Dumps a fully constructed but uncompressed trie in list form.
SV *sv=sv_newmortal();
int colwidth= widecharmap ? 6 : 4;
GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
+
/* print out the table precompression. */
PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
(int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
SV *sv=sv_newmortal();
int colwidth= widecharmap ? 6 : 4;
GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
/*
print out the table precompression so that we can do a visual check
#endif
+
/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
startbranch: the first branch in the whole branch sequence
first : start branch of sequence of branch-exact nodes.
U16 dupe= trie->states[ state ].wordnum; \
regnode * const noper_next = regnext( noper ); \
\
- if (trie->wordlen) \
- trie->wordlen[ curword ] = wordlen; \
DEBUG_r({ \
/* store the word for dumping */ \
SV* tmp; \
}); \
\
curword++; \
+ trie->wordinfo[curword].prev = 0; \
+ trie->wordinfo[curword].len = wordlen; \
+ trie->wordinfo[curword].accept = state; \
\
if ( noper_next < tail ) { \
if (!trie->jump) \
} \
\
if ( dupe ) { \
- /* So it's a dupe. This means we need to maintain a */\
- /* linked-list from the first to the next. */\
- /* we only allocate the nextword buffer when there */\
- /* a dupe, so first time we have to do the allocation */\
- if (!trie->nextword) \
- trie->nextword = (U16 *) \
- PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
- while ( trie->nextword[dupe] ) \
- dupe= trie->nextword[dupe]; \
- trie->nextword[dupe]= curword; \
+ /* It's a dupe. Pre-insert into the wordinfo[].prev */\
+ /* chain, so that when the bits of chain are later */\
+ /* linked together, the dups appear in the chain */\
+ trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
+ trie->wordinfo[dupe].prev = curword; \
} else { \
/* we haven't inserted this word yet. */ \
trie->states[ state ].wordnum = curword; \
regnode *jumper = NULL;
regnode *nextbranch = NULL;
regnode *convert = NULL;
+ U32 *prev_states; /* temp array mapping each state to previous one */
/* we just use folder as a flag in utf8 */
const U8 * const folder = ( flags == EXACTF
? PL_fold
#endif
SV *re_trie_maxbuff;
GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_MAKE_TRIE;
#ifndef DEBUGGING
PERL_UNUSED_ARG(depth);
#endif
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
if (!(UTF && folder))
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
+ trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
+ trie->wordcount+1, sizeof(reg_trie_wordinfo));
+
DEBUG_r({
trie_words = newAV();
});
(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
(int)trie->minlen, (int)trie->maxlen )
);
- trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
/*
We now know what we are dealing with in terms of unique chars and
*/
+ Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
+ prev_states[1] = 0;
+
if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
/*
Second Pass -- Array Of Lists Representation
}
if ( ! newstate ) {
newstate = next_alloc++;
+ prev_states[newstate] = state;
TRIE_LIST_PUSH( state, charid, newstate );
transcount++;
}
if ( !trie->trans[ state + charid ].next ) {
trie->trans[ state + charid ].next = next_alloc;
trie->trans[ state ].check++;
+ prev_states[TRIE_NODENUM(next_alloc)]
+ = TRIE_NODENUM(state);
next_alloc += trie->uniquecharcount;
}
state = trie->trans[ state + charid ].next;
PerlMemShared_realloc( trie->trans, trie->lasttrans
* sizeof(reg_trie_trans) );
- /* and now dump out the compressed format */
- DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
-
{ /* Modify the program and insert the new TRIE node*/
U8 nodetype =(U8)(flags & 0xFF);
char *str=NULL;
if ( folder )
TRIE_BITMAP_SET(trie, folder[ *ch ]);
DEBUG_OPTIMISE_r(
- PerlIO_printf(Perl_debug_log, (char*)ch)
+ PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
);
}
}
break;
}
}
+ trie->prefixlen = (state-1);
if (str) {
regnode *n = convert+NODE_SZ_STR(convert);
NEXT_OFF(convert) = NODE_SZ_STR(convert);
Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
});
} /* end node insert */
+
+ /* Finish populating the prev field of the wordinfo array. Walk back
+ * from each accept state until we find another accept state, and if
+ * so, point the first word's .prev field at the second word. If the
+ * second already has a .prev field set, stop now. This will be the
+ * case either if we've already processed that word's accept state,
+ * or that that state had multiple words, and the overspill words
+ * were already linked up earlier.
+ */
+ {
+ U16 word;
+ U32 state;
+ U16 prev;
+
+ for (word=1; word <= trie->wordcount; word++) {
+ prev = 0;
+ if (trie->wordinfo[word].prev)
+ continue;
+ state = trie->wordinfo[word].accept;
+ while (state) {
+ state = prev_states[state];
+ if (!state)
+ break;
+ prev = trie->states[state].wordnum;
+ if (prev)
+ break;
+ }
+ trie->wordinfo[word].prev = prev;
+ }
+ Safefree(prev_states);
+ }
+
+
+ /* and now dump out the compressed format */
+ DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
+
RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
#ifdef DEBUGGING
RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
reg_ac_data *aho;
const U32 data_slot = add_data( pRExC_state, 1, "T" );
GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
#ifndef DEBUGGING
PERL_UNUSED_ARG(depth);
#endif
#else
PERL_UNUSED_ARG(depth);
#endif
+
+ PERL_ARGS_ASSERT_JOIN_EXACT;
#ifndef EXPERIMENTAL_INPLACESCAN
PERL_UNUSED_ARG(flags);
PERL_UNUSED_ARG(val);
regnode *first_non_open = scan;
I32 stopmin = I32_MAX;
scan_frame *frame = NULL;
-
GET_RE_DEBUG_FLAGS_DECL;
+ PERL_ARGS_ASSERT_STUDY_CHUNK;
+
#ifdef DEBUGGING
StructCopy(&zero_scan_data, &data_fake, scan_data_t);
#endif
}
} else {
/*
- Currently we assume that the trie can handle unicode and ascii
- matches fold cased matches. If this proves true then the following
- define will prevent tries in this situation.
-
- #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
-*/
+ Currently we do not believe that the trie logic can
+ handle case insensitive matching properly when the
+ pattern is not unicode (thus forcing unicode semantics).
+
+ If/when this is fixed the following define can be swapped
+ in below to fully enable trie logic.
+
#define TRIE_TYPE_IS_SAFE 1
+
+*/
+#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
+
if ( last && TRIE_TYPE_IS_SAFE ) {
make_trie( pRExC_state,
startbranch, first, cur, tail, count,
}
flags &= ~SCF_DO_STCLASS;
}
- else if (strchr((const char*)PL_varies,OP(scan))) {
+ else if (REGNODE_VARIES(OP(scan))) {
I32 mincount, maxcount, minnext, deltanext, fl = 0;
I32 f = flags, pos_before = 0;
regnode * const oscan = scan;
/* These are the cases when once a subexpression
fails at a particular position, it cannot succeed
even after backtracking at the enclosing scope.
-
+
XXXX what if minimal match and we are at the
initial run of {n,m}? */
if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
(next_is_eval || !(mincount == 0 && maxcount == 1))
&& (minnext == 0) && (deltanext == 0)
&& data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
- && maxcount <= REG_INFTY/3 /* Complement check for big count */
- && ckWARN(WARN_REGEXP))
+ && maxcount <= REG_INFTY/3) /* Complement check for big count */
{
- vWARN(RExC_parse,
- "Quantifier unexpected on zero-length expression");
+ ckWARNreg(RExC_parse,
+ "Quantifier unexpected on zero-length expression");
}
min += minnext * mincount;
/* Skip open. */
nxt = regnext(nxt);
- if (!strchr((const char*)PL_simple,OP(nxt))
+ if (!REGNODE_SIMPLE(OP(nxt))
&& !(PL_regkind[OP(nxt)] == EXACT
&& STR_LEN(nxt) == 1))
goto nogo;
#ifdef DEBUGGING
OP(nxt1 + 1) = OPTIMIZED; /* was count. */
- NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
- NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
+ NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
+ NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
OP(nxt) = OPTIMIZED; /* was CLOSE. */
OP(nxt + 1) = OPTIMIZED; /* was count. */
- NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
+ NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
#endif
}
nogo:
nxt = nxt2;
OP(nxt2) = SUCCEED; /* Whas WHILEM */
/* Need to optimize away parenths. */
- if (data->flags & SF_IN_PAR) {
+ if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
/* Set the parenth number. */
regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
- if (OP(nxt) != CLOSE)
- FAIL("Panic opt close");
oscan->flags = (U8)ARG(nxt);
if (RExC_open_parens) {
RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
#if 0
while ( nxt1 && (OP(nxt1) != WHILEM)) {
regnode *nnxt = regnext(nxt1);
-
if (nnxt == nxt) {
if (reg_off_by_arg[OP(nxt1)])
ARG_SET(nxt1, nxt2 - nxt1);
if (UTF)
old = utf8_hop((U8*)s, old) - (U8*)s;
-
l -= old;
/* Get the added string: */
last_str = newSVpvn_utf8(s + old, l, UTF);
if (flags & SCF_DO_STCLASS_AND) {
for (value = 0; value < 256; value++)
if (!is_VERTWS_cp(value))
- ANYOF_BITMAP_CLEAR(data->start_class, value);
- }
- else {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ else {
for (value = 0; value < 256; value++)
if (is_VERTWS_cp(value))
- ANYOF_BITMAP_SET(data->start_class, value);
- }
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
if (flags & SCF_DO_STCLASS_OR)
cl_and(data->start_class, and_withp);
flags &= ~SCF_DO_STCLASS;
data->pos_delta += 1;
data->longest = &(data->longest_float);
}
-
}
else if (OP(scan) == FOLDCHAR) {
int d = ARG(scan)==0xDF ? 1 : 2;
data->longest = &(data->longest_float);
}
}
- else if (strchr((const char*)PL_simple,OP(scan))) {
+ else if (REGNODE_SIMPLE(OP(scan))) {
int value = 0;
if (flags & SCF_DO_SUBSTR) {
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
- for (value = 0; value < 256; value++)
- if (!isALNUM(value))
- ANYOF_BITMAP_CLEAR(data->start_class, value);
+ if (FLAGS(scan) & USE_UNI) {
+ for (value = 0; value < 256; value++) {
+ if (!isWORDCHAR_L1(value)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ }
+ } else {
+ for (value = 0; value < 256; value++) {
+ if (!isALNUM(value)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ }
+ }
}
}
else {
if (data->start_class->flags & ANYOF_LOCALE)
ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
- else {
- for (value = 0; value < 256; value++)
- if (isALNUM(value))
- ANYOF_BITMAP_SET(data->start_class, value);
- }
+ else if (FLAGS(scan) & USE_UNI) {
+ for (value = 0; value < 256; value++) {
+ if (isWORDCHAR_L1(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
+ } else {
+ for (value = 0; value < 256; value++) {
+ if (isALNUM(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
+ }
}
break;
case ALNUML:
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
- for (value = 0; value < 256; value++)
- if (isALNUM(value))
- ANYOF_BITMAP_CLEAR(data->start_class, value);
+ if (FLAGS(scan) & USE_UNI) {
+ for (value = 0; value < 256; value++) {
+ if (isWORDCHAR_L1(value)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ }
+ } else {
+ for (value = 0; value < 256; value++) {
+ if (isALNUM(value)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ }
+ }
}
}
else {
else {
for (value = 0; value < 256; value++)
if (!isALNUM(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
- for (value = 0; value < 256; value++)
- if (!isSPACE(value))
- ANYOF_BITMAP_CLEAR(data->start_class, value);
+ if (FLAGS(scan) & USE_UNI) {
+ for (value = 0; value < 256; value++) {
+ if (!isSPACE_L1(value)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ }
+ } else {
+ for (value = 0; value < 256; value++) {
+ if (!isSPACE(value)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ }
+ }
}
}
else {
- if (data->start_class->flags & ANYOF_LOCALE)
+ if (data->start_class->flags & ANYOF_LOCALE) {
ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
- else {
- for (value = 0; value < 256; value++)
- if (isSPACE(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ else if (FLAGS(scan) & USE_UNI) {
+ for (value = 0; value < 256; value++) {
+ if (isSPACE_L1(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
+ } else {
+ for (value = 0; value < 256; value++) {
+ if (isSPACE(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
}
}
break;
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
- for (value = 0; value < 256; value++)
- if (isSPACE(value))
- ANYOF_BITMAP_CLEAR(data->start_class, value);
+ if (FLAGS(scan) & USE_UNI) {
+ for (value = 0; value < 256; value++) {
+ if (isSPACE_L1(value)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ }
+ } else {
+ for (value = 0; value < 256; value++) {
+ if (isSPACE(value)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ }
+ }
}
}
else {
if (data->start_class->flags & ANYOF_LOCALE)
ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
- else {
- for (value = 0; value < 256; value++)
- if (!isSPACE(value))
- ANYOF_BITMAP_SET(data->start_class, value);
- }
+ else if (FLAGS(scan) & USE_UNI) {
+ for (value = 0; value < 256; value++) {
+ if (!isSPACE_L1(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
+ }
+ else {
+ for (value = 0; value < 256; value++) {
+ if (!isSPACE(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
+ }
}
break;
case NSPACEL:
else {
for (value = 0; value < 256; value++)
if (isDIGIT(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
else {
for (value = 0; value < 256; value++)
if (!isDIGIT(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
data->whilem_c = data_fake.whilem_c;
}
if (f & SCF_DO_STCLASS_AND) {
- const int was = (data->start_class->flags & ANYOF_EOS);
-
- cl_and(data->start_class, &intrnl);
- if (was)
- data->start_class->flags |= ANYOF_EOS;
+ if (flags & SCF_DO_STCLASS_OR) {
+ /* OR before, AND after: ideally we would recurse with
+ * data_fake to get the AND applied by study of the
+ * remainder of the pattern, and then derecurse;
+ * *** HACK *** for now just treat as "no information".
+ * See [perl #56690].
+ */
+ cl_init(pRExC_state, data->start_class);
+ } else {
+ /* AND before and after: combine and continue */
+ const int was = (data->start_class->flags & ANYOF_EOS);
+
+ cl_and(data->start_class, &intrnl);
+ if (was)
+ data->start_class->flags |= ANYOF_EOS;
+ }
}
}
#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
{
U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
+ PERL_ARGS_ASSERT_ADD_DATA;
+
Renewc(RExC_rxi->data,
sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
char, struct reg_data);
#ifndef PERL_IN_XSUB_RE
REGEXP *
-Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags)
+Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
{
dVAR;
HV * const table = GvHV(PL_hintgv);
+
+ PERL_ARGS_ASSERT_PREGCOMP;
+
/* Dispatch a request to compile a regexp to correct
regexp engine. */
if (table) {
#endif
REGEXP *
-Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags)
+Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
{
dVAR;
REGEXP *rx;
struct regexp *r;
register regexp_internal *ri;
STRLEN plen;
- char* exp = SvPV((SV*)pattern, plen);
- char* xend = exp + plen;
+ char *exp;
+ char* xend;
regnode *scan;
I32 flags;
I32 minlen = 0;
I32 sawplus = 0;
I32 sawopen = 0;
+ U8 jump_ret = 0;
+ dJMPENV;
scan_data_t data;
RExC_state_t RExC_state;
RExC_state_t * const pRExC_state = &RExC_state;
#ifdef TRIE_STUDY_OPT
- int restudied= 0;
+ int restudied;
RExC_state_t copyRExC_state;
#endif
GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_RE_COMPILE;
+
DEBUG_r(if (!PL_colorset) reginitcolors());
- RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8;
+ RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
- DEBUG_COMPILE_r({
- SV *dsv= sv_newmortal();
- RE_PV_QUOTED_DECL(s, RExC_utf8,
- dsv, exp, plen, 60);
- PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
- PL_colors[4],PL_colors[5],s);
- });
-redo_first_pass:
+ /* Longjmp back to here if have to switch in midstream to utf8 */
+ if (! RExC_orig_utf8) {
+ JMPENV_PUSH(jump_ret);
+ }
+
+ if (jump_ret == 0) { /* First time through */
+ exp = SvPV(pattern, plen);
+ xend = exp + plen;
+
+ DEBUG_COMPILE_r({
+ SV *dsv= sv_newmortal();
+ RE_PV_QUOTED_DECL(s, RExC_utf8,
+ dsv, exp, plen, 60);
+ PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
+ PL_colors[4],PL_colors[5],s);
+ });
+ }
+ else { /* longjumped back */
+ STRLEN len = plen;
+
+ /* If the cause for the longjmp was other than changing to utf8, pop
+ * our own setjmp, and longjmp to the correct handler */
+ if (jump_ret != UTF8_LONGJMP) {
+ JMPENV_POP;
+ JMPENV_JUMP(jump_ret);
+ }
+
+ GET_RE_DEBUG_FLAGS;
+
+ /* It's possible to write a regexp in ascii that represents Unicode
+ codepoints outside of the byte range, such as via \x{100}. If we
+ detect such a sequence we have to convert the entire pattern to utf8
+ and then recompile, as our sizing calculation will have been based
+ on 1 byte == 1 character, but we will need to use utf8 to encode
+ at least some part of the pattern, and therefore must convert the whole
+ thing.
+ -- dmq */
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
+ exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
+ xend = exp + len;
+ RExC_orig_utf8 = RExC_utf8 = 1;
+ SAVEFREEPV(exp);
+ }
+
+#ifdef TRIE_STUDY_OPT
+ restudied = 0;
+#endif
+
RExC_precomp = exp;
RExC_flags = pm_flags;
RExC_sawback = 0;
RExC_size = 0L;
RExC_emit = &PL_regdummy;
RExC_whilem_seen = 0;
- RExC_charnames = NULL;
RExC_open_parens = NULL;
RExC_close_parens = NULL;
RExC_opend = NULL;
RExC_precomp = NULL;
return(NULL);
}
- if (RExC_utf8 && !RExC_orig_utf8) {
- /* It's possible to write a regexp in ascii that represents Unicode
- codepoints outside of the byte range, such as via \x{100}. If we
- detect such a sequence we have to convert the entire pattern to utf8
- and then recompile, as our sizing calculation will have been based
- on 1 byte == 1 character, but we will need to use utf8 to encode
- at least some part of the pattern, and therefore must convert the whole
- thing.
- XXX: somehow figure out how to make this less expensive...
- -- dmq */
- STRLEN len = plen;
- DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
- "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
- exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
- xend = exp + len;
- RExC_orig_utf8 = RExC_utf8;
- SAVEFREEPV(exp);
- goto redo_first_pass;
+
+ /* Here, finished first pass. Get rid of our setjmp, which we added for
+ * efficiency only if the passed-in string wasn't in utf8, as shown by
+ * RExC_orig_utf8. But if the first pass was redone, that variable will be
+ * 1 here even though the original string wasn't utf8, but in this case
+ * there will have been a long jump */
+ if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
+ JMPENV_POP;
}
DEBUG_PARSE_r({
PerlIO_printf(Perl_debug_log,
/* Allocate space and zero-initialize. Note, the two step process
of zeroing when in debug mode, thus anything assigned has to
happen after that */
- rx = newSV_type(SVt_REGEXP);
+ rx = (REGEXP*) newSV_type(SVt_REGEXP);
r = (struct regexp*)SvANY(rx);
Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
char, regexp_internal);
r->extflags = pm_flags;
{
bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
- bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
+ bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
+
+ /* The caret is output if there are any defaults: if not all the STD
+ * flags are set, or if no character set specifier is needed */
+ bool has_default =
+ (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
+ || ! has_charset);
bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
>> RXf_PMf_STD_PMMOD_SHIFT);
const char *fptr = STD_PAT_MODS; /*"msix"*/
char *p;
- const STRLEN wraplen = plen + has_minus + has_p + has_runon
+ /* Allocate for the worst case, which is all the std flags are turned
+ * on. If more precision is desired, we could do a population count of
+ * the flags set. This could be done with a small lookup table, or by
+ * shifting, masking and adding, or even, when available, assembly
+ * language for a machine-language population count.
+ * We never output a minus, as all those are defaults, so are
+ * covered by the caret */
+ const STRLEN wraplen = plen + has_p + has_runon
+ + has_default /* If needs a caret */
+ + has_charset /* If needs a character set specifier */
+ (sizeof(STD_PAT_MODS) - 1)
+ (sizeof("(?:)") - 1);
- p = sv_grow(rx, wraplen + 1);
- SvCUR_set(rx, wraplen);
+ p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
SvPOK_on(rx);
+ SvFLAGS(rx) |= SvUTF8(pattern);
*p++='('; *p++='?';
+
+ /* If a default, cover it using the caret */
+ if (has_default) {
+ *p++= DEFAULT_PAT_MOD;
+ }
+ if (has_charset) {
+ if (r->extflags & RXf_PMf_LOCALE) {
+ *p++ = LOCALE_PAT_MOD;
+ } else {
+ *p++ = UNICODE_PAT_MOD;
+ }
+ }
if (has_p)
*p++ = KEEPCOPY_PAT_MOD; /*'p'*/
{
- char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
- char *colon = r + 1;
char ch;
-
while((ch = *fptr++)) {
if(reganch & 1)
*p++ = ch;
- else
- *r-- = ch;
reganch >>= 1;
}
- if(has_minus) {
- *r = '-';
- p = colon;
- }
}
*p++ = ':';
*p++ = '\n';
*p++ = ')';
*p = 0;
+ SvCUR_set(rx, p - SvPVX_const(rx));
}
r->intflags = 0;
Zero(r->substrs, 1, struct reg_substr_data);
#ifdef TRIE_STUDY_OPT
- if ( restudied ) {
+ if (!restudied) {
+ StructCopy(&zero_scan_data, &data, scan_data_t);
+ copyRExC_state = RExC_state;
+ } else {
U32 seen=RExC_seen;
DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
SvREFCNT_dec(data.last_found);
}
StructCopy(&zero_scan_data, &data, scan_data_t);
- } else {
- StructCopy(&zero_scan_data, &data, scan_data_t);
- copyRExC_state = RExC_state;
}
#else
StructCopy(&zero_scan_data, &data, scan_data_t);
/*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
if (UTF)
- r->extflags |= RXf_UTF8; /* Unicode in it? */
+ SvUTF8_on(rx); /* Unicode in it? */
ri->regstclass = NULL;
if (RExC_naughty >= 10) /* Probably an expensive pattern. */
r->intflags |= PREGf_NAUGHTY;
regnode *first= scan;
regnode *first_next= regnext(first);
- /* Skip introductions and multiplicators >= 1. */
+ /*
+ * Skip introductions and multiplicators >= 1
+ * so that we can extract the 'meat' of the pattern that must
+ * match in the large if() sequence following.
+ * NOTE that EXACT is NOT covered here, as it is normally
+ * picked up by the optimiser separately.
+ *
+ * This is unfortunate as the optimiser isnt handling lookahead
+ * properly currently.
+ *
+ */
while ((OP(first) == OPEN && (sawopen = 1)) ||
/* An OR of *one* alternative - should not happen now. */
(OP(first) == BRANCH && OP(first_next) != BRANCH) ||
(PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
(OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
{
-
+ /*
+ * the only op that could be a regnode is PLUS, all the rest
+ * will be regnode_1 or regnode_2.
+ *
+ */
if (OP(first) == PLUS)
sawplus = 1;
else
first += regarglen[OP(first)];
- if (OP(first) == IFMATCH) {
- first = NEXTOPER(first);
- first += EXTRA_STEP_2ARGS;
- } else /* XXX possible optimisation for /(?=)/ */
- first = NEXTOPER(first);
+
+ first = NEXTOPER(first);
first_next= regnext(first);
}
ri->regstclass = trie_op;
}
#endif
- else if (strchr((const char*)PL_simple,OP(first)))
+ else if (REGNODE_SIMPLE(OP(first)))
ri->regstclass = first;
else if (PL_regkind[OP(first)] == BOUND ||
PL_regkind[OP(first)] == NBOUND)
if (RExC_seen & REG_SEEN_CUTGROUP)
r->intflags |= PREGf_CUTGROUP_SEEN;
if (RExC_paren_names)
- r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
+ RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
else
- r->paren_names = NULL;
+ RXp_PAREN_NAMES(r) = NULL;
#ifdef STUPID_PATTERN_CHECKS
if (RX_PRELEN(rx) == 0)
#endif
#ifdef DEBUGGING
if (RExC_paren_names) {
- ri->name_list_idx = add_data( pRExC_state, 1, "p" );
+ ri->name_list_idx = add_data( pRExC_state, 1, "a" );
ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
} else
#endif
Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
const U32 flags)
{
+ PERL_ARGS_ASSERT_REG_NAMED_BUFF;
+
PERL_UNUSED_ARG(value);
if (flags & RXapif_FETCH) {
return reg_named_buff_fetch(rx, key, flags);
} else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
- Perl_croak(aTHX_ PL_no_modify);
+ Perl_croak_no_modify(aTHX);
return NULL;
} else if (flags & RXapif_EXISTS) {
return reg_named_buff_exists(rx, key, flags)
Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
const U32 flags)
{
+ PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
PERL_UNUSED_ARG(lastkey);
if (flags & RXapif_FIRSTKEY)
AV *retarray = NULL;
SV *ret;
struct regexp *const rx = (struct regexp *)SvANY(r);
+
+ PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
+
if (flags & RXapif_ALL)
retarray=newAV();
- if (rx && rx->paren_names) {
- HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
+ if (rx && RXp_PAREN_NAMES(rx)) {
+ HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
if (he_str) {
IV i;
SV* sv_dat=HeVAL(he_str);
} else {
ret = newSVsv(&PL_sv_undef);
}
- if (retarray) {
- SvREFCNT_inc_simple_void(ret);
+ if (retarray)
av_push(retarray, ret);
- }
}
if (retarray)
- return newRV((SV*)retarray);
+ return newRV_noinc(MUTABLE_SV(retarray));
}
}
return NULL;
const U32 flags)
{
struct regexp *const rx = (struct regexp *)SvANY(r);
- if (rx && rx->paren_names) {
+
+ PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
+
+ if (rx && RXp_PAREN_NAMES(rx)) {
if (flags & RXapif_ALL) {
- return hv_exists_ent(rx->paren_names, key, 0);
+ return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
} else {
SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
if (sv) {
Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
{
struct regexp *const rx = (struct regexp *)SvANY(r);
- if ( rx && rx->paren_names ) {
- (void)hv_iterinit(rx->paren_names);
+
+ PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
+
+ if ( rx && RXp_PAREN_NAMES(rx) ) {
+ (void)hv_iterinit(RXp_PAREN_NAMES(rx));
return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
} else {
Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
{
struct regexp *const rx = (struct regexp *)SvANY(r);
- if (rx && rx->paren_names) {
- HV *hv = rx->paren_names;
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
+
+ if (rx && RXp_PAREN_NAMES(rx)) {
+ HV *hv = RXp_PAREN_NAMES(rx);
HE *temphe;
while ( (temphe = hv_iternext_flags(hv,0)) ) {
IV i;
SV* sv_dat = HeVAL(temphe);
I32 *nums = (I32*)SvPVX(sv_dat);
for ( i = 0; i < SvIVX(sv_dat); i++ ) {
- if ((I32)(rx->lastcloseparen) >= nums[i] &&
+ if ((I32)(rx->lastparen) >= nums[i] &&
rx->offs[nums[i]].start != -1 &&
rx->offs[nums[i]].end != -1)
{
I32 length;
struct regexp *const rx = (struct regexp *)SvANY(r);
- if (rx && rx->paren_names) {
+ PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
+
+ if (rx && RXp_PAREN_NAMES(rx)) {
if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
- return newSViv(HvTOTALKEYS(rx->paren_names));
+ return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
} else if (flags & RXapif_ONE) {
ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
- av = (AV*)SvRV(ret);
+ av = MUTABLE_AV(SvRV(ret));
length = av_len(av);
+ SvREFCNT_dec(ret);
return newSViv(length + 1);
} else {
Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
struct regexp *const rx = (struct regexp *)SvANY(r);
AV *av = newAV();
- if (rx && rx->paren_names) {
- HV *hv= rx->paren_names;
+ PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
+
+ if (rx && RXp_PAREN_NAMES(rx)) {
+ HV *hv= RXp_PAREN_NAMES(rx);
HE *temphe;
(void)hv_iterinit(hv);
while ( (temphe = hv_iternext_flags(hv,0)) ) {
SV* sv_dat = HeVAL(temphe);
I32 *nums = (I32*)SvPVX(sv_dat);
for ( i = 0; i < SvIVX(sv_dat); i++ ) {
- if ((I32)(rx->lastcloseparen) >= nums[i] &&
+ if ((I32)(rx->lastparen) >= nums[i] &&
rx->offs[nums[i]].start != -1 &&
rx->offs[nums[i]].end != -1)
{
}
}
- return newRV((SV*)av);
+ return newRV_noinc(MUTABLE_SV(av));
}
void
char *s = NULL;
I32 i = 0;
I32 s1, t1;
+
+ PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
if (!rx->subbeg) {
sv_setsv(sv,&PL_sv_undef);
Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
SV const * const value)
{
+ PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
+
PERL_UNUSED_ARG(rx);
PERL_UNUSED_ARG(paren);
PERL_UNUSED_ARG(value);
if (!PL_localizing)
- Perl_croak(aTHX_ PL_no_modify);
+ Perl_croak_no_modify(aTHX);
}
I32
I32 i;
I32 s1, t1;
+ PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
+
/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
switch (paren) {
/* $` / ${^PREMATCH} */
goto getlen;
} else {
if (ckWARN(WARN_UNINITIALIZED))
- report_uninit((SV*)sv);
+ report_uninit((const SV *)sv);
return 0;
}
}
SV*
Perl_reg_qr_package(pTHX_ REGEXP * const rx)
{
+ PERL_ARGS_ASSERT_REG_QR_PACKAGE;
PERL_UNUSED_ARG(rx);
- return NULL;
+ if (0)
+ return NULL;
+ else
+ return newSVpvs("Regexp");
}
/* Scans the name of a named buffer from the pattern.
#define REG_RSN_RETURN_DATA 2
STATIC SV*
-S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
+S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
+{
char *name_start = RExC_parse;
+ PERL_ARGS_ASSERT_REG_SCAN_NAME;
+
if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
/* skip IDFIRST by using do...while */
if (UTF)
char * const oregcomp_parse = RExC_parse;
GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_REG;
DEBUG_PARSE("reg ");
*flagp = 0; /* Tentatively. */
if (*RExC_parse == '?') { /* (?...) */
bool is_logical = 0;
const char * const seqstart = RExC_parse;
+ bool has_use_defaults = FALSE;
RExC_parse++;
paren = *RExC_parse++;
"panic: reg_scan_name returned NULL");
if (!RExC_paren_names) {
RExC_paren_names= newHV();
- sv_2mortal((SV*)RExC_paren_names);
+ sv_2mortal(MUTABLE_SV(RExC_paren_names));
#ifdef DEBUGGING
RExC_paren_name_list= newAV();
- sv_2mortal((SV*)RExC_paren_name_list);
+ sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
#endif
}
he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
pv[count] = RExC_npar;
- SvIVX(sv_dat)++;
+ SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
}
} else {
(void)SvUPGRADE(sv_dat,SVt_PVNV);
sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
SvIOK_on(sv_dat);
- SvIVX(sv_dat)= 1;
+ SvIV_set(sv_dat, 1);
}
#ifdef DEBUGGING
if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
RExC_seen |= REG_SEEN_LOOKBEHIND;
RExC_parse++;
case '=': /* (?=...) */
+ RExC_seen_zerolen++;
+ break;
case '!': /* (?!...) */
RExC_seen_zerolen++;
if (*RExC_parse == ')') {
RExC_parse--; /* for vFAIL to print correctly */
vFAIL("Sequence (? incomplete");
break;
+ case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
+ that follow */
+ has_use_defaults = TRUE;
+ STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
+ RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+ goto parse_flags;
default:
--RExC_parse;
parse_flags: /* (?i) */
{
U32 posflags = 0, negflags = 0;
U32 *flagsp = &posflags;
+ bool has_charset_modifier = 0;
while (*RExC_parse) {
/* && strchr("iogcmsx", *RExC_parse) */
and must be globally applied -- japhy */
switch (*RExC_parse) {
CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
+ case LOCALE_PAT_MOD:
+ if (has_charset_modifier || flagsp == &negflags) {
+ goto fail_modifiers;
+ }
+ posflags |= RXf_PMf_LOCALE;
+ negflags |= RXf_PMf_UNICODE;
+ has_charset_modifier = 1;
+ break;
+ case UNICODE_PAT_MOD:
+ if (has_charset_modifier || flagsp == &negflags) {
+ goto fail_modifiers;
+ }
+ posflags |= RXf_PMf_UNICODE;
+ negflags |= RXf_PMf_LOCALE;
+ has_charset_modifier = 1;
+ break;
+ case DUAL_PAT_MOD:
+ if (has_use_defaults
+ || has_charset_modifier
+ || flagsp == &negflags)
+ {
+ goto fail_modifiers;
+ }
+ negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+ has_charset_modifier = 1;
+ break;
case ONCE_PAT_MOD: /* 'o' */
case GLOBAL_PAT_MOD: /* 'g' */
if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
break;
case KEEPCOPY_PAT_MOD: /* 'p' */
if (flagsp == &negflags) {
- if (SIZE_ONLY && ckWARN(WARN_REGEXP))
- vWARN(RExC_parse + 1,"Useless use of (?-p)");
+ if (SIZE_ONLY)
+ ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
} else {
*flagsp |= RXf_PMf_KEEPCOPY;
}
break;
case '-':
- if (flagsp == &negflags) {
+ /* A flag is a default iff it is following a minus, so
+ * if there is a minus, it means will be trying to
+ * re-specify a default which is an error */
+ if (has_use_defaults || flagsp == &negflags) {
+ fail_modifiers:
RExC_parse++;
vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
/*NOTREACHED*/
/* Pick up the branches, linking them together. */
parse_start = RExC_parse; /* MJD */
br = regbranch(pRExC_state, &flags, 1,depth+1);
+
+ if (freeze_paren) {
+ if (RExC_npar > after_freeze)
+ after_freeze = RExC_npar;
+ RExC_npar = freeze_paren;
+ }
+
/* branch_len = (paren != 0); */
if (br == NULL)
register regnode *latest;
I32 flags = 0, c = 0;
GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_REGBRANCH;
+
DEBUG_PARSE("brnc");
if (first)
char *parse_start;
const char *maxpos = NULL;
GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_REGPIECE;
+
DEBUG_PARSE("piec");
ret = regatom(pRExC_state, &flags,depth+1);
*flagp = WORST;
if (max > 0)
*flagp |= HASWIDTH;
- if (max && max < min)
+ if (max < min)
vFAIL("Can't do {n,m} with n > m");
if (!SIZE_ONLY) {
ARG1_SET(ret, (U16)min);
goto do_curly;
}
nest_check:
- if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
- vWARN3(RExC_parse,
- "%.*s matches null string many times",
- (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
- origparse);
+ if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
+ ckWARN3reg(RExC_parse,
+ "%.*s matches null string many times",
+ (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
+ origparse);
}
if (RExC_parse < RExC_end && *RExC_parse == '?') {
/* reg_namedseq(pRExC_state,UVp)
This is expected to be called by a parser routine that has
- recognized'\N' and needs to handle the rest. RExC_parse is
+ recognized '\N' and needs to handle the rest. RExC_parse is
expected to point at the first char following the N at the time
of the call.
+
+ The \N may be inside (indicated by valuep not being NULL) or outside a
+ character class.
+
+ \N may begin either a named sequence, or if outside a character class, mean
+ to match a non-newline. For non single-quoted regexes, the tokenizer has
+ attempted to decide which, and in the case of a named sequence converted it
+ into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
+ where c1... are the characters in the sequence. For single-quoted regexes,
+ the tokenizer passes the \N sequence through unchanged; this code will not
+ attempt to determine this nor expand those. The net effect is that if the
+ beginning of the passed-in pattern isn't '{U+' or there is no '}', it
+ signals that this \N occurrence means to match a non-newline.
+
+ Only the \N{U+...} form should occur in a character class, for the same
+ reason that '.' inside a character class means to just match a period: it
+ just doesn't make sense.
If valuep is non-null then it is assumed that we are parsing inside
of a charclass definition and the first codepoint in the resolved
string is returned via *valuep and the routine will return NULL.
In this mode if a multichar string is returned from the charnames
- handler a warning will be issued, and only the first char in the
+ handler, a warning will be issued, and only the first char in the
sequence will be examined. If the string returned is zero length
then the value of *valuep is undefined and NON-NULL will
be returned to indicate failure. (This will NOT be a valid pointer
to a regnode.)
- If value is null then it is assumed that we are parsing normal text
- and inserts a new EXACT node into the program containing the resolved
- string and returns a pointer to the new node. If the string is
- zerolength a NOTHING node is emitted.
-
+ If valuep is null then it is assumed that we are parsing normal text and a
+ new EXACT node is inserted into the program containing the resolved string,
+ and a pointer to the new node is returned. But if the string is zero length
+ a NOTHING node is emitted instead.
+
On success RExC_parse is set to the char following the endbrace.
- Parsing failures will generate a fatal errorvia vFAIL(...)
-
- NOTE: We cache all results from the charnames handler locally in
- the RExC_charnames hash (created on first use) to prevent a charnames
- handler from playing silly-buggers and returning a short string and
- then a long string for a given pattern. Since the regexp program
- size is calculated during an initial parse this would result
- in a buffer overrun so we cache to prevent the charname result from
- changing during the course of the parse.
-
+ Parsing failures will generate a fatal error via vFAIL(...)
*/
STATIC regnode *
-S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
+S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
{
- char * name; /* start of the content of the name */
- char * endbrace; /* endbrace following the name */
- SV *sv_str = NULL;
- SV *sv_name = NULL;
- STRLEN len; /* this has various purposes throughout the code */
- bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
+ char * endbrace; /* '}' following the name */
regnode *ret = NULL;
-
+#ifdef DEBUGGING
+ char* parse_start = RExC_parse - 2; /* points to the '\N' */
+#endif
+ char* p;
+
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_REG_NAMEDSEQ;
+
+ GET_RE_DEBUG_FLAGS;
+
+ /* The [^\n] meaning of \N ignores spaces and comments under the /x
+ * modifier. The other meaning does not */
+ p = (RExC_flags & RXf_PMf_EXTENDED)
+ ? regwhite( pRExC_state, RExC_parse )
+ : RExC_parse;
+
+ /* Disambiguate between \N meaning a named character versus \N meaning
+ * [^\n]. The former is assumed when it can't be the latter. */
+ if (*p != '{' || regcurly(p)) {
+ RExC_parse = p;
+ if (valuep) {
+ /* no bare \N in a charclass */
+ vFAIL("\\N in a character class must be a named character: \\N{...}");
+ }
+ nextchar(pRExC_state);
+ ret = reg_node(pRExC_state, REG_ANY);
+ *flagp |= HASWIDTH|SIMPLE;
+ RExC_naughty++;
+ RExC_parse--;
+ Set_Node_Length(ret, 1); /* MJD */
+ return ret;
+ }
+
+ /* Here, we have decided it should be a named sequence */
+
+ /* The test above made sure that the next real character is a '{', but
+ * under the /x modifier, it could be separated by space (or a comment and
+ * \n) and this is not allowed (for consistency with \x{...} and the
+ * tokenizer handling of \N{NAME}). */
if (*RExC_parse != '{') {
- vFAIL("Missing braces on \\N{}");
+ vFAIL("Missing braces on \\N{}");
}
- name = RExC_parse+1;
- endbrace = strchr(RExC_parse, '}');
- if ( ! endbrace ) {
- RExC_parse++;
- vFAIL("Missing right brace on \\N{}");
- }
- RExC_parse = endbrace + 1;
-
-
- /* RExC_parse points at the beginning brace,
- endbrace points at the last */
- if ( name[0]=='U' && name[1]=='+' ) {
- /* its a "Unicode hex" notation {U+89AB} */
- I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
- | PERL_SCAN_DISALLOW_PREFIX
- | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
- UV cp;
- char string;
- len = (STRLEN)(endbrace - name - 2);
- cp = grok_hex(name + 2, &len, &fl, NULL);
- if ( len != (STRLEN)(endbrace - name - 2) ) {
- cp = 0xFFFD;
- }
- if (cp > 0xff)
- RExC_utf8 = 1;
- if ( valuep ) {
- *valuep = cp;
- return NULL;
- }
- string = (char)cp;
- sv_str= newSVpvn(&string, 1);
- } else {
- /* fetch the charnames handler for this scope */
- HV * const table = GvHV(PL_hintgv);
- SV **cvp= table ?
- hv_fetchs(table, "charnames", FALSE) :
- NULL;
- SV *cv= cvp ? *cvp : NULL;
- HE *he_str;
- int count;
- /* create an SV with the name as argument */
- sv_name = newSVpvn(name, endbrace - name);
-
- if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
- vFAIL2("Constant(\\N{%s}) unknown: "
- "(possibly a missing \"use charnames ...\")",
- SvPVX(sv_name));
- }
- if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
- vFAIL2("Constant(\\N{%s}): "
- "$^H{charnames} is not defined",SvPVX(sv_name));
- }
-
-
-
- if (!RExC_charnames) {
- /* make sure our cache is allocated */
- RExC_charnames = newHV();
- sv_2mortal((SV*)RExC_charnames);
- }
- /* see if we have looked this one up before */
- he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
- if ( he_str ) {
- sv_str = HeVAL(he_str);
- cached = 1;
- } else {
- dSP ;
- ENTER ;
- SAVETMPS ;
- PUSHMARK(SP) ;
-
- XPUSHs(sv_name);
-
- PUTBACK ;
-
- count= call_sv(cv, G_SCALAR);
-
- if (count == 1) { /* XXXX is this right? dmq */
- sv_str = POPs;
- SvREFCNT_inc_simple_void(sv_str);
- }
-
- SPAGAIN ;
- PUTBACK ;
- FREETMPS ;
- LEAVE ;
-
- if ( !sv_str || !SvOK(sv_str) ) {
- vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
- "did not return a defined value",SvPVX(sv_name));
- }
- if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
- cached = 1;
- }
+ RExC_parse++; /* Skip past the '{' */
+
+ if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
+ || ! (endbrace == RExC_parse /* nothing between the {} */
+ || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
+ && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
+ {
+ if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
+ vFAIL("\\N{NAME} must be resolved by the lexer");
}
- if (valuep) {
- char *p = SvPV(sv_str, len);
- if (len) {
- STRLEN numlen = 1;
- if ( SvUTF8(sv_str) ) {
- *valuep = utf8_to_uvchr((U8*)p, &numlen);
- if (*valuep > 0x7F)
- RExC_utf8 = 1;
- /* XXXX
- We have to turn on utf8 for high bit chars otherwise
- we get failures with
-
- "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
- "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
-
- This is different from what \x{} would do with the same
- codepoint, where the condition is > 0xFF.
- - dmq
- */
-
-
- } else {
- *valuep = (UV)*p;
- /* warn if we havent used the whole string? */
- }
- if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
- vWARN2(RExC_parse,
- "Ignoring excess chars from \\N{%s} in character class",
- SvPVX(sv_name)
- );
- }
- } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
- vWARN2(RExC_parse,
- "Ignoring zero length \\N{%s} in character class",
- SvPVX(sv_name)
- );
- }
- if (sv_name)
- SvREFCNT_dec(sv_name);
- if (!cached)
- SvREFCNT_dec(sv_str);
- return len ? NULL : (regnode *)&len;
- } else if(SvCUR(sv_str)) {
-
- char *s;
- char *p, *pend;
- STRLEN charlen = 1;
-#ifdef DEBUGGING
- char * parse_start = name-3; /* needed for the offsets */
-#endif
- GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
-
- ret = reg_node(pRExC_state,
- (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
- s= STRING(ret);
-
- if ( RExC_utf8 && !SvUTF8(sv_str) ) {
- sv_utf8_upgrade(sv_str);
- } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
- RExC_utf8= 1;
- }
-
- p = SvPV(sv_str, len);
- pend = p + len;
- /* len is the length written, charlen is the size the char read */
- for ( len = 0; p < pend; p += charlen ) {
- if (UTF) {
- UV uvc = utf8_to_uvchr((U8*)p, &charlen);
- if (FOLD) {
- STRLEN foldlen,numlen;
- U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
- uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
- /* Emit all the Unicode characters. */
-
- for (foldbuf = tmpbuf;
- foldlen;
- foldlen -= numlen)
- {
- uvc = utf8_to_uvchr(foldbuf, &numlen);
- if (numlen > 0) {
- const STRLEN unilen = reguni(pRExC_state, uvc, s);
- s += unilen;
- len += unilen;
- /* In EBCDIC the numlen
- * and unilen can differ. */
- foldbuf += numlen;
- if (numlen >= foldlen)
- break;
- }
- else
- break; /* "Can't happen." */
- }
- } else {
- const STRLEN unilen = reguni(pRExC_state, uvc, s);
- if (unilen > 0) {
- s += unilen;
- len += unilen;
- }
- }
- } else {
- len++;
- REGC(*p, s++);
- }
- }
- if (SIZE_ONLY) {
- RExC_size += STR_SZ(len);
- } else {
- STR_LEN(ret) = len;
- RExC_emit += STR_SZ(len);
- }
- Set_Node_Cur_Length(ret); /* MJD */
- RExC_parse--;
- nextchar(pRExC_state);
- } else {
- ret = reg_node(pRExC_state,NOTHING);
+
+ if (endbrace == RExC_parse) { /* empty: \N{} */
+ if (! valuep) {
+ RExC_parse = endbrace + 1;
+ return reg_node(pRExC_state,NOTHING);
+ }
+
+ if (SIZE_ONLY) {
+ ckWARNreg(RExC_parse,
+ "Ignoring zero length \\N{} in character class"
+ );
+ RExC_parse = endbrace + 1;
+ }
+ *valuep = 0;
+ return (regnode *) &RExC_parse; /* Invalid regnode pointer */
}
- if (!cached) {
- SvREFCNT_dec(sv_str);
+
+ REQUIRE_UTF8; /* named sequences imply Unicode semantics */
+ RExC_parse += 2; /* Skip past the 'U+' */
+
+ if (valuep) { /* In a bracketed char class */
+ /* We only pay attention to the first char of
+ multichar strings being returned. I kinda wonder
+ if this makes sense as it does change the behaviour
+ from earlier versions, OTOH that behaviour was broken
+ as well. XXX Solution is to recharacterize as
+ [rest-of-class]|multi1|multi2... */
+
+ STRLEN length_of_hex;
+ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX
+ | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
+
+ char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
+ if (endchar < endbrace) {
+ ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
+ }
+
+ length_of_hex = (STRLEN)(endchar - RExC_parse);
+ *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
+
+ /* The tokenizer should have guaranteed validity, but it's possible to
+ * bypass it by using single quoting, so check */
+ if (length_of_hex == 0
+ || length_of_hex != (STRLEN)(endchar - RExC_parse) )
+ {
+ RExC_parse += length_of_hex; /* Includes all the valid */
+ RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
+ ? UTF8SKIP(RExC_parse)
+ : 1;
+ /* Guard against malformed utf8 */
+ if (RExC_parse >= endchar) RExC_parse = endchar;
+ vFAIL("Invalid hexadecimal number in \\N{U+...}");
+ }
+
+ RExC_parse = endbrace + 1;
+ if (endchar == endbrace) return NULL;
+
+ ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
}
- if (sv_name) {
- SvREFCNT_dec(sv_name);
+ else { /* Not a char class */
+ char *s; /* String to put in generated EXACT node */
+ STRLEN len = 0; /* Its current byte length */
+ char *endchar; /* Points to '.' or '}' ending cur char in the input
+ stream */
+
+ ret = reg_node(pRExC_state,
+ (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
+ s= STRING(ret);
+
+ /* Exact nodes can hold only a U8 length's of text = 255. Loop through
+ * the input which is of the form now 'c1.c2.c3...}' until find the
+ * ending brace or exceed length 255. The characters that exceed this
+ * limit are dropped. The limit could be relaxed should it become
+ * desirable by reparsing this as (?:\N{NAME}), so could generate
+ * multiple EXACT nodes, as is done for just regular input. But this
+ * is primarily a named character, and not intended to be a huge long
+ * string, so 255 bytes should be good enough */
+ while (1) {
+ STRLEN length_of_hex;
+ I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX
+ | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
+ UV cp; /* Ord of current character */
+
+ /* Code points are separated by dots. If none, there is only one
+ * code point, and is terminated by the brace */
+ endchar = RExC_parse + strcspn(RExC_parse, ".}");
+
+ /* The values are Unicode even on EBCDIC machines */
+ length_of_hex = (STRLEN)(endchar - RExC_parse);
+ cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
+ if ( length_of_hex == 0
+ || length_of_hex != (STRLEN)(endchar - RExC_parse) )
+ {
+ RExC_parse += length_of_hex; /* Includes all the valid */
+ RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
+ ? UTF8SKIP(RExC_parse)
+ : 1;
+ /* Guard against malformed utf8 */
+ if (RExC_parse >= endchar) RExC_parse = endchar;
+ vFAIL("Invalid hexadecimal number in \\N{U+...}");
+ }
+
+ if (! FOLD) { /* Not folding, just append to the string */
+ STRLEN unilen;
+
+ /* Quit before adding this character if would exceed limit */
+ if (len + UNISKIP(cp) > U8_MAX) break;
+
+ unilen = reguni(pRExC_state, cp, s);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
+ }
+ } else { /* Folding, output the folded equivalent */
+ STRLEN foldlen,numlen;
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
+ cp = toFOLD_uni(cp, tmpbuf, &foldlen);
+
+ /* Quit before exceeding size limit */
+ if (len + foldlen > U8_MAX) break;
+
+ for (foldbuf = tmpbuf;
+ foldlen;
+ foldlen -= numlen)
+ {
+ cp = utf8_to_uvchr(foldbuf, &numlen);
+ if (numlen > 0) {
+ const STRLEN unilen = reguni(pRExC_state, cp, s);
+ s += unilen;
+ len += unilen;
+ /* In EBCDIC the numlen and unilen can differ. */
+ foldbuf += numlen;
+ if (numlen >= foldlen)
+ break;
+ }
+ else
+ break; /* "Can't happen." */
+ }
+ }
+
+ /* Point to the beginning of the next character in the sequence. */
+ RExC_parse = endchar + 1;
+
+ /* Quit if no more characters */
+ if (RExC_parse >= endbrace) break;
+ }
+
+
+ if (SIZE_ONLY) {
+ if (RExC_parse < endbrace) {
+ ckWARNreg(RExC_parse - 1,
+ "Using just the first characters returned by \\N{}");
+ }
+
+ RExC_size += STR_SZ(len);
+ } else {
+ STR_LEN(ret) = len;
+ RExC_emit += STR_SZ(len);
+ }
+
+ RExC_parse = endbrace + 1;
+
+ *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
+ with malformed in t/re/pat_advanced.t */
+ RExC_parse --;
+ Set_Node_Cur_Length(ret); /* MJD */
+ nextchar(pRExC_state);
}
- return ret;
+ return ret;
}
const STRLEN newlen = SvCUR(sv);
UV uv = UNICODE_REPLACEMENT;
+ PERL_ARGS_ASSERT_REG_RECODE;
+
if (newlen)
uv = SvUTF8(sv)
? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
DEBUG_PARSE("atom");
*flagp = WORST; /* Tentatively. */
+ PERL_ARGS_ASSERT_REGATOM;
tryagain:
switch ((U8)*RExC_parse) {
*flagp |= HASWIDTH;
goto finish_meta_pat;
case 'w':
- ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
+ if (LOC) {
+ ret = reg_node(pRExC_state, (U8)(ALNUML));
+ } else {
+ ret = reg_node(pRExC_state, (U8)(ALNUM));
+ FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ }
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'W':
- ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
+ if (LOC) {
+ ret = reg_node(pRExC_state, (U8)(NALNUML));
+ } else {
+ ret = reg_node(pRExC_state, (U8)(NALNUM));
+ FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ }
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'b':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
+ if (LOC) {
+ ret = reg_node(pRExC_state, (U8)(BOUNDL));
+ } else {
+ ret = reg_node(pRExC_state, (U8)(BOUND));
+ FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ }
*flagp |= SIMPLE;
goto finish_meta_pat;
case 'B':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
+ if (LOC) {
+ ret = reg_node(pRExC_state, (U8)(NBOUNDL));
+ } else {
+ ret = reg_node(pRExC_state, (U8)(NBOUND));
+ FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ }
*flagp |= SIMPLE;
goto finish_meta_pat;
case 's':
- ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
+ if (LOC) {
+ ret = reg_node(pRExC_state, (U8)(SPACEL));
+ } else {
+ ret = reg_node(pRExC_state, (U8)(SPACE));
+ FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ }
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'S':
- ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
+ if (LOC) {
+ ret = reg_node(pRExC_state, (U8)(NSPACEL));
+ } else {
+ ret = reg_node(pRExC_state, (U8)(NSPACE));
+ FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ }
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'd':
}
break;
case 'N':
- /* Handle \N{NAME} here and not below because it can be
+ /* Handle \N and \N{NAME} here and not below because it can be
multicharacter. join_exact() will join them up later on.
Also this makes sure that things like /\N{BLAH}+/ and
\N{BLAH} being multi char Just Happen. dmq*/
++RExC_parse;
- ret= reg_namedseq(pRExC_state, NULL);
+ ret= reg_namedseq(pRExC_state, NULL, flagp);
break;
case 'k': /* Handle \k<NAME> and \k'NAME' */
parse_named_seq:
ender = ASCII_TO_NATIVE('\007');
p++;
break;
+ case 'o':
+ {
+ STRLEN brace_len = len;
+ UV result;
+ const char* error_msg;
+
+ bool valid = grok_bslash_o(p,
+ &result,
+ &brace_len,
+ &error_msg,
+ 1);
+ p += brace_len;
+ if (! valid) {
+ RExC_parse = p; /* going to die anyway; point
+ to exact spot of failure */
+ vFAIL(error_msg);
+ }
+ else
+ {
+ ender = result;
+ }
+ if (PL_encoding && ender < 0x100) {
+ goto recode_encoding;
+ }
+ if (ender > 0xff) {
+ REQUIRE_UTF8;
+ }
+ break;
+ }
case 'x':
if (*++p == '{') {
char* const e = strchr(p, '}');
STRLEN numlen = e - p - 1;
ender = grok_hex(p + 1, &numlen, &flags, NULL);
if (ender > 0xff)
- RExC_utf8 = 1;
+ REQUIRE_UTF8;
p = e + 1;
}
}
break;
case 'c':
p++;
- ender = UCHARAT(p++);
- ender = toCTRL(ender);
+ ender = grok_bslash_c(*p++, SIZE_ONLY);
break;
case '0': case '1': case '2': case '3':case '4':
case '5': case '6': case '7': case '8':case '9':
if (*p == '0' ||
- (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
- I32 flags = 0;
+ (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
+ {
+ I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
STRLEN numlen = 3;
ender = grok_oct(p, &numlen, &flags, NULL);
+ if (ender > 0xff) {
+ REQUIRE_UTF8;
+ }
p += numlen;
}
else {
{
SV* enc = PL_encoding;
ender = reg_recode((const char)(U8)ender, &enc);
- if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
- vWARN(p, "Invalid escape in the specified encoding");
- RExC_utf8 = 1;
+ if (!enc && SIZE_ONLY)
+ ckWARNreg(p, "Invalid escape in the specified encoding");
+ REQUIRE_UTF8;
}
break;
case '\0':
FAIL("Trailing \\");
/* FALL THROUGH */
default:
- if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
- vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
+ if (!SIZE_ONLY&& isALPHA(*p))
+ ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
goto normal_default;
}
break;
S_regwhite( RExC_state_t *pRExC_state, char *p )
{
const char *e = RExC_end;
+
+ PERL_ARGS_ASSERT_REGWHITE;
+
while (p < e) {
if (isSPACE(*p))
++p;
dVAR;
I32 namedclass = OOB_NAMEDCLASS;
+ PERL_ARGS_ASSERT_REGPPOSIXCC;
+
if (value == '[' && RExC_parse + 1 < RExC_end &&
/* I smell either [: or [= or [. -- POSIX has been here, right? */
POSIXCC(UCHARAT(RExC_parse))) {
S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
{
dVAR;
+
+ PERL_ARGS_ASSERT_CHECKPOSIXCC;
+
if (POSIXCC(UCHARAT(RExC_parse))) {
const char *s = RExC_parse;
const char c = *s++;
while (isALNUM(*s))
s++;
if (*s && c == *s && s[1] == ']') {
- if (ckWARN(WARN_REGEXP))
- vWARN3(s+2,
- "POSIX syntax [%c %c] belongs inside character classes",
- c, c);
+ ckWARN3reg(s+2,
+ "POSIX syntax [%c %c] belongs inside character classes",
+ c, c);
/* [[=foo=]] and [[.foo.]] are still future. */
if (POSIXCC_NOTYET(c)) {
what = WORD; \
break
+/* Like above, but no locale test */
#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
ANYOF_##NAME: \
for (value = 0; value < 256; value++) \
what = WORD; \
break
+/* Like the above, but there are differences if we are in uni-8-bit or not, so
+ * there are two tests passed in, to use depending on that. There aren't any
+ * cases where the label is different from the name, so no need for that
+ * parameter */
+#define _C_C_T_UNI_8_BIT(NAME,TEST_8,TEST_7,WORD) \
+ANYOF_##NAME: \
+ if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
+ else if (UNI_SEMANTICS) { \
+ for (value = 0; value < 256; value++) { \
+ if (TEST_8) ANYOF_BITMAP_SET(ret, value); \
+ } \
+ } \
+ else { \
+ for (value = 0; value < 256; value++) { \
+ if (TEST_7) ANYOF_BITMAP_SET(ret, value); \
+ } \
+ } \
+ yesno = '+'; \
+ what = WORD; \
+ break; \
+case ANYOF_N##NAME: \
+ if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
+ else if (UNI_SEMANTICS) { \
+ for (value = 0; value < 256; value++) { \
+ if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
+ } \
+ } \
+ else { \
+ for (value = 0; value < 256; value++) { \
+ if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
+ } \
+ } \
+ yesno = '!'; \
+ what = WORD; \
+ break
+
+/*
+ We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
+ so that it is possible to override the option here without having to
+ rebuild the entire core. as we are required to do if we change regcomp.h
+ which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
+*/
+#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
+#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#endif
+
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
+#else
+#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
+#endif
+
/*
parse a class specification and produce either an ANYOF node that
matches the pattern or if the pattern matches a single char only and
case we need to change the emitted regop to an EXACT. */
const char * orig_parse = RExC_parse;
GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_REGCLASS;
#ifndef DEBUGGING
PERL_UNUSED_ARG(depth);
#endif
from earlier versions, OTOH that behaviour was broken
as well. */
UV v; /* value is register so we cant & it /grrr */
- if (reg_namedseq(pRExC_state, &v)) {
+ if (reg_namedseq(pRExC_state, &v, NULL)) {
goto parseit;
}
value= v;
case 'b': value = '\b'; break;
case 'e': value = ASCII_TO_NATIVE('\033');break;
case 'a': value = ASCII_TO_NATIVE('\007');break;
+ case 'o':
+ RExC_parse--; /* function expects to be pointed at the 'o' */
+ {
+ const char* error_msg;
+ bool valid = grok_bslash_o(RExC_parse,
+ &value,
+ &numlen,
+ &error_msg,
+ SIZE_ONLY);
+ RExC_parse += numlen;
+ if (! valid) {
+ vFAIL(error_msg);
+ }
+ }
+ if (PL_encoding && value < 0x100) {
+ goto recode_encoding;
+ }
+ break;
case 'x':
if (*RExC_parse == '{') {
I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
goto recode_encoding;
break;
case 'c':
- value = UCHARAT(RExC_parse++);
- value = toCTRL(value);
+ value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
break;
case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
+ case '5': case '6': case '7':
{
- I32 flags = 0;
+ /* Take 1-3 octal digits */
+ I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
numlen = 3;
value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
RExC_parse += numlen;
{
SV* enc = PL_encoding;
value = reg_recode((const char)(U8)value, &enc);
- if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
- vWARN(RExC_parse,
- "Invalid escape in the specified encoding");
+ if (!enc && SIZE_ONLY)
+ ckWARNreg(RExC_parse,
+ "Invalid escape in the specified encoding");
break;
}
default:
- if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
- vWARN2(RExC_parse,
- "Unrecognized escape \\%c in character class passed through",
- (int)value);
+ /* Allow \_ to not give an error */
+ if (!SIZE_ONLY && isALNUM(value) && value != '_') {
+ ckWARN2reg(RExC_parse,
+ "Unrecognized escape \\%c in character class passed through",
+ (int)value);
+ }
break;
}
} /* end of \blah */
/* a bad range like a-\d, a-[:digit:] ? */
if (range) {
if (!SIZE_ONLY) {
- if (ckWARN(WARN_REGEXP)) {
- const int w =
- RExC_parse >= rangebegin ?
- RExC_parse - rangebegin : 0;
- vWARN4(RExC_parse,
+ const int w =
+ RExC_parse >= rangebegin ?
+ RExC_parse - rangebegin : 0;
+ ckWARN4reg(RExC_parse,
"False [] range \"%*.*s\"",
w, w, rangebegin);
- }
+
if (prevvalue < 256) {
ANYOF_BITMAP_SET(ret, prevvalue);
ANYOF_BITMAP_SET(ret, '-');
* A similar issue a little earlier when switching on value.
* --jhi */
switch ((I32)namedclass) {
- case _C_C_T_(ALNUM, isALNUM(value), "Word");
- case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
- case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
- case _C_C_T_(BLANK, isBLANK(value), "Blank");
- case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
- case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
- case _C_C_T_(LOWER, isLOWER(value), "Lower");
- case _C_C_T_(PRINT, isPRINT(value), "Print");
- case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
- case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
- case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
- case _C_C_T_(UPPER, isUPPER(value), "Upper");
+
+ case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
+ case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
+ case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
+ case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
+ case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
+ case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
+ case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
+ case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
+ case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
+ case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
+ /* \s, \w match all unicode if utf8. */
+ case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
+ case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
+#else
+ /* \s, \w match ascii and locale only */
+ case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
+ case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
+#endif
case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
ANYOF_BITMAP_SET(ret, value);
}
yesno = '+';
- what = "Digit";
+ what = POSIX_CC_UNI_NAME("Digit");
break;
case ANYOF_NDIGIT:
if (LOC)
ANYOF_BITMAP_SET(ret, value);
}
yesno = '!';
- what = "Digit";
+ what = POSIX_CC_UNI_NAME("Digit");
break;
case ANYOF_MAX:
/* this is to handle \p and \P */
*STRING(ret)= (char)value;
STR_LEN(ret)= 1;
RExC_emit += STR_SZ(1);
+ SvREFCNT_dec(listsv);
return ret;
}
/* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
* used later (regexec.c:S_reginclass()). */
av_store(av, 0, listsv);
av_store(av, 1, NULL);
- av_store(av, 2, (SV*)unicode_alternate);
- rv = newRV_noinc((SV*)av);
+ av_store(av, 2, MUTABLE_SV(unicode_alternate));
+ rv = newRV_noinc(MUTABLE_SV(av));
n = add_data(pRExC_state, 1, "s");
RExC_rxi->data->data[n] = (void*)rv;
ARG_SET(ret, n);
S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
{
bool ended = 0;
+
+ PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
+
while (RExC_parse < RExC_end)
if (*RExC_parse++ == '\n') {
ended = 1;
/* nextchar()
- Advance that parse position, and optionall absorbs
+ Advance that parse position, and optionally absorbs
"whitespace" from the inputstream.
Without /x "whitespace" means (?#...) style comments only,
{
char* const retval = RExC_parse++;
+ PERL_ARGS_ASSERT_NEXTCHAR;
+
for (;;) {
if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
RExC_parse[2] == '#') {
regnode * const ret = RExC_emit;
GET_RE_DEBUG_FLAGS_DECL;
+ PERL_ARGS_ASSERT_REG_NODE;
+
if (SIZE_ONLY) {
SIZE_ALIGN(RExC_size);
RExC_size += 1;
regnode * const ret = RExC_emit;
GET_RE_DEBUG_FLAGS_DECL;
+ PERL_ARGS_ASSERT_REGANODE;
+
if (SIZE_ONLY) {
SIZE_ALIGN(RExC_size);
RExC_size += 2;
S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
{
dVAR;
+
+ PERL_ARGS_ASSERT_REGUNI;
+
return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
}
const int offset = regarglen[(U8)op];
const int size = NODE_STEP_REGNODE + offset;
GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_REGINSERT;
PERL_UNUSED_ARG(depth);
/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
dVAR;
register regnode *scan;
GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_REGTAIL;
#ifndef DEBUGGING
PERL_UNUSED_ARG(depth);
#endif
#ifdef EXPERIMENTAL_INPLACESCAN
I32 min = 0;
#endif
-
GET_RE_DEBUG_FLAGS_DECL;
+ PERL_ARGS_ASSERT_REGTAIL_STUDY;
+
if (SIZE_ONLY)
return exact;
#endif
/*
- - regcurly - a little FSA that accepts {\d+,?\d*}
- */
-STATIC I32
-S_regcurly(register const char *s)
-{
- if (*s++ != '{')
- return FALSE;
- if (!isDIGIT(*s))
- return FALSE;
- while (isDIGIT(*s))
- s++;
- if (*s == ',')
- s++;
- while (isDIGIT(*s))
- s++;
- if (*s != '}')
- return FALSE;
- return TRUE;
-}
-
-
-/*
- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
*/
#ifdef DEBUGGING
-void
-S_regdump_extflags(pTHX_ const char *lead, const U32 flags) {
+static void
+S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
+{
int bit;
int set=0;
+
for (bit=0; bit<32; bit++) {
if (flags & (1<<bit)) {
if (!set++ && lead)
RXi_GET_DECL(r,ri);
GET_RE_DEBUG_FLAGS_DECL;
+ PERL_ARGS_ASSERT_REGDUMP;
+
(void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
/* Header fields of interest. */
PerlIO_printf(Perl_debug_log, "\n");
DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
#else
+ PERL_ARGS_ASSERT_REGDUMP;
PERL_UNUSED_CONTEXT;
PERL_UNUSED_ARG(r);
#endif /* DEBUGGING */
/*
- regprop - printable representation of opcode
*/
+#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
+STMT_START { \
+ if (do_sep) { \
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
+ if (flags & ANYOF_INVERT) \
+ /*make sure the invert info is in each */ \
+ sv_catpvs(sv, "^"); \
+ do_sep = 0; \
+ } \
+} STMT_END
+
void
Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
{
RXi_GET_DECL(prog,progi);
GET_RE_DEBUG_FLAGS_DECL;
+ PERL_ARGS_ASSERT_REGPROP;
- sv_setpvn(sv, "", 0);
+ sv_setpvs(sv, "");
if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
/* It would be nice to FAIL() here, but this may be called from
Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
- if ( prog->paren_names ) {
+ if ( RXp_PAREN_NAMES(prog) ) {
if ( k != REF || OP(o) < NREF) {
- AV *list= (AV *)progi->data->data[progi->name_list_idx];
+ AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
SV **name= av_fetch(list, ARG(o), 0 );
if (name)
Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
}
else {
- AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
- SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
+ AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
+ SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
I32 *nums=(I32*)SvPVX(sv_dat);
SV **name= av_fetch(list, nums[0], 0 );
I32 n;
else if (k == VERB) {
if (!o->flags)
Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
- SVfARG((SV*)progi->data->data[ ARG( o ) ]));
+ SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
} else if (k == LOGICAL)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
else if (k == FOLDCHAR)
else if (k == ANYOF) {
int i, rangestart = -1;
const U8 flags = ANYOF_FLAGS(o);
+ int do_sep = 0;
/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
static const char * const anyofs[] = {
"[:^alpha:]",
"[:ascii:]",
"[:^ascii:]",
- "[:ctrl:]",
- "[:^ctrl:]",
+ "[:cntrl:]",
+ "[:^cntrl:]",
"[:graph:]",
"[:^graph:]",
"[:lower:]",
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
if (flags & ANYOF_INVERT)
sv_catpvs(sv, "^");
+
+ /* output what the standard cp 0-255 bitmap matches */
for (i = 0; i <= 256; i++) {
if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
if (rangestart == -1)
sv_catpvs(sv, "-");
put_byte(sv, i - 1);
}
+ do_sep = 1;
rangestart = -1;
}
}
-
+
+ EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
+ /* output any special charclass tests (used mostly under use locale) */
if (o->flags & ANYOF_CLASS)
for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
- if (ANYOF_CLASS_TEST(o,i))
+ if (ANYOF_CLASS_TEST(o,i)) {
sv_catpv(sv, anyofs[i]);
-
+ do_sep = 1;
+ }
+
+ EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
+
+ /* output information about the unicode matching */
if (flags & ANYOF_UNICODE)
sv_catpvs(sv, "{unicode}");
else if (flags & ANYOF_UNICODE_ALL)
if (lv) {
if (sw) {
U8 s[UTF8_MAXBYTES_CASE+1];
-
+
for (i = 0; i <= 256; i++) { /* just the first 256 */
uvchr_to_utf8(s, i);
dVAR;
struct regexp *const prog = (struct regexp *)SvANY(r);
GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_RE_INTUIT_STRING;
PERL_UNUSED_CONTEXT;
DEBUG_COMPILE_r(
struct regexp *const r = (struct regexp *)SvANY(rx);
GET_RE_DEBUG_FLAGS_DECL;
+ PERL_ARGS_ASSERT_PREGFREE2;
+
if (r->mother_re) {
ReREFCNT_dec(r->mother_re);
} else {
CALLREGFREE_PVT(rx); /* free the private data */
- if (r->paren_names)
- SvREFCNT_dec(r->paren_names);
+ SvREFCNT_dec(RXp_PAREN_NAMES(r));
}
if (r->substrs) {
- if (r->anchored_substr)
- SvREFCNT_dec(r->anchored_substr);
- if (r->anchored_utf8)
- SvREFCNT_dec(r->anchored_utf8);
- if (r->float_substr)
- SvREFCNT_dec(r->float_substr);
- if (r->float_utf8)
- SvREFCNT_dec(r->float_utf8);
+ SvREFCNT_dec(r->anchored_substr);
+ SvREFCNT_dec(r->anchored_utf8);
+ SvREFCNT_dec(r->float_substr);
+ SvREFCNT_dec(r->float_utf8);
Safefree(r->substrs);
}
RX_MATCH_COPY_FREE(rx);
#ifdef PERL_OLD_COPY_ON_WRITE
- if (r->saved_copy)
- SvREFCNT_dec(r->saved_copy);
+ SvREFCNT_dec(r->saved_copy);
#endif
- Safefree(r->swap);
Safefree(r->offs);
}
REGEXP *
-Perl_reg_temp_copy (pTHX_ REGEXP *rx) {
- REGEXP *ret_x = newSV_type(SVt_REGEXP);
- struct regexp *ret = (struct regexp *)SvANY(ret_x);
+Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
+{
+ struct regexp *ret;
struct regexp *const r = (struct regexp *)SvANY(rx);
register const I32 npar = r->nparens+1;
+
+ PERL_ARGS_ASSERT_REG_TEMP_COPY;
+
+ if (!ret_x)
+ ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
+ ret = (struct regexp *)SvANY(ret_x);
+
(void)ReREFCNT_inc(rx);
- /* FIXME ORANGE (once we start actually using the regular SV fields.) */
/* We can take advantage of the existing "copied buffer" mechanism in SVs
by pointing directly at the buffer, but flagging that the allocated
space in the copy is zero. As we've just done a struct copy, it's now
a case of zero-ing that, rather than copying the current length. */
SvPV_set(ret_x, RX_WRAPPED(rx));
- StructCopy(r, ret, regexp);
+ SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
+ memcpy(&(ret->xpv_cur), &(r->xpv_cur),
+ sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
SvLEN_set(ret_x, 0);
+ SvSTASH_set(ret_x, NULL);
+ SvMAGIC_set(ret_x, NULL);
Newx(ret->offs, npar, regexp_paren_pair);
Copy(r->offs, ret->offs, npar, regexp_paren_pair);
if (r->substrs) {
ret->saved_copy = NULL;
#endif
ret->mother_re = rx;
- ret->swap = NULL;
return ret_x;
}
struct regexp *const r = (struct regexp *)SvANY(rx);
RXi_GET_DECL(r,ri);
GET_RE_DEBUG_FLAGS_DECL;
-
+
+ PERL_ARGS_ASSERT_REGFREE_INTERNAL;
+
DEBUG_COMPILE_r({
if (!PL_colorset)
reginitcolors();
while (--n >= 0) {
/* If you add a ->what type here, update the comment in regcomp.h */
switch (ri->data->what[n]) {
+ case 'a':
case 's':
case 'S':
case 'u':
- SvREFCNT_dec((SV*)ri->data->data[n]);
+ SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
break;
case 'f':
Safefree(ri->data->data[n]);
break;
case 'p':
- new_comppad = (AV*)ri->data->data[n];
+ new_comppad = MUTABLE_AV(ri->data->data[n]);
break;
case 'o':
if (new_comppad == NULL)
op_free((OP_4tree*)ri->data->data[n]);
PAD_RESTORE_LOCAL(old_comppad);
- SvREFCNT_dec((SV*)new_comppad);
+ SvREFCNT_dec(MUTABLE_SV(new_comppad));
new_comppad = NULL;
break;
case 'n':
PerlMemShared_free(trie->trans);
if (trie->bitmap)
PerlMemShared_free(trie->bitmap);
- if (trie->wordlen)
- PerlMemShared_free(trie->wordlen);
if (trie->jump)
PerlMemShared_free(trie->jump);
- if (trie->nextword)
- PerlMemShared_free(trie->nextword);
+ PerlMemShared_free(trie->wordinfo);
/* do this last!!!! */
PerlMemShared_free(ri->data->data[n]);
}
Safefree(ri);
}
-#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
-#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
-#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
+#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
+#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
/*
re_dup - duplicate a regexp.
- This routine is expected to clone a given regexp structure. It is not
- compiler under USE_ITHREADS.
+ This routine is expected to clone a given regexp structure. It is only
+ compiled under USE_ITHREADS.
After all of the core data stored in struct regexp is duplicated
the regexp_engine.dupe method is used to copy any private data
const struct regexp *r = (const struct regexp *)SvANY(sstr);
struct regexp *ret = (struct regexp *)SvANY(dstr);
+ PERL_ARGS_ASSERT_RE_DUP_GUTS;
+
npar = r->nparens+1;
Newx(ret->offs, npar, regexp_paren_pair);
Copy(r->offs, ret->offs, npar, regexp_paren_pair);
/* Do it this way to avoid reading from *r after the StructCopy().
That way, if any of the sv_dup_inc()s dislodge *r from the L1
cache, it doesn't matter. */
- const bool anchored = r->check_substr == r->anchored_substr;
+ const bool anchored = r->check_substr
+ ? r->check_substr == r->anchored_substr
+ : r->check_utf8 == r->anchored_utf8;
Newx(ret->substrs, 1, struct reg_substr_data);
StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
ret->check_substr = ret->float_substr;
ret->check_utf8 = ret->float_utf8;
}
+ } else if (ret->check_utf8) {
+ if (anchored) {
+ ret->check_utf8 = ret->anchored_utf8;
+ } else {
+ ret->check_utf8 = ret->float_utf8;
+ }
}
}
- ret->paren_names = hv_dup_inc(ret->paren_names, param);
+ RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
if (ret->pprivate)
RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
ret->saved_copy = NULL;
#endif
- ret->mother_re = NULL;
+ if (ret->mother_re) {
+ if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
+ /* Our storage points directly to our mother regexp, but that's
+ 1: a buffer in a different thread
+ 2: something we no longer hold a reference on
+ so we need to copy it locally. */
+ /* Note we need to sue SvCUR() on our mother_re, because it, in
+ turn, may well be pointing to its own mother_re. */
+ SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
+ SvCUR(ret->mother_re)+1));
+ SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
+ }
+ ret->mother_re = NULL;
+ }
ret->gofs = 0;
- ret->seen_evals = 0;
}
#endif /* PERL_IN_XSUB_RE */
regexp_internal *reti;
int len, npar;
RXi_GET_DECL(r,ri);
+
+ PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
npar = r->nparens+1;
len = ProgLen(ri);
- Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
+ Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
Copy(ri->program, reti->program, len+1, regnode);
for (i = 0; i < count; i++) {
d->what[i] = ri->data->what[i];
switch (d->what[i]) {
- /* legal options are one of: sSfpontTu
+ /* legal options are one of: sSfpontTua
see also regcomp.h and pregfree() */
+ case 'a': /* actually an AV, but the dup function is identical. */
case 's':
case 'S':
case 'p': /* actually an AV, but the dup function is identical. */
case 'u': /* actually an HV, but the dup function is identical. */
- d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
+ d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
break;
case 'f':
/* This is cheating. */
#endif /* USE_ITHREADS */
-/*
- reg_stringify()
-
- converts a regexp embedded in a MAGIC struct to its stringified form,
- caching the converted form in the struct and returns the cached
- string.
-
- If lp is nonnull then it is used to return the length of the
- resulting string
-
- If flags is nonnull and the returned string contains UTF8 then
- (*flags & 1) will be true.
-
- If haseval is nonnull then it is used to return whether the pattern
- contains evals.
-
- Normally called via macro:
-
- CALLREG_STRINGIFY(mg,&len,&utf8);
-
- And internally with
-
- CALLREG_AS_STR(mg,&lp,&flags,&haseval)
-
- See sv_2pv_flags() in sv.c for an example of internal usage.
-
- */
#ifndef PERL_IN_XSUB_RE
-char *
-Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
- dVAR;
- const REGEXP * const re = (REGEXP *)mg->mg_obj;
- if (haseval)
- *haseval = RX_SEEN_EVALS(re);
- if (flags)
- *flags = RX_UTF8(re) ? 1 : 0;
- if (lp)
- *lp = RX_WRAPLEN(re);
- return RX_WRAPPED(re);
-}
-
/*
- regnext - dig the "next" pointer out of a node
*/
if (!p)
return(NULL);
+ if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
+ Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
+ }
+
offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
if (offset == 0)
return(NULL);
SV *msv;
const char *message;
+ PERL_ARGS_ASSERT_RE_CROAK2;
+
if (l1 > 510)
l1 = 510;
if (l1 + l2 > 510)
state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
- SSPUSHINT(SAVEt_RE_STATE);
+ SSPUSHUV(SAVEt_RE_STATE);
Copy(&PL_reg_state, state, 1, struct re_save_state);
STATIC void
S_put_byte(pTHX_ SV *sv, int c)
{
+ PERL_ARGS_ASSERT_PUT_BYTE;
+
/* Our definition of isPRINT() ignores locales, so only bytes that are
not part of UTF-8 are considered printable. I assume that the same
holds for UTF-EBCDIC.
RXi_GET_DECL(r,ri);
GET_RE_DEBUG_FLAGS_DECL;
-
+
+ PERL_ARGS_ASSERT_DUMPUNTIL;
+
#ifdef DEBUG_DUMPUNTIL
PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
last ? last-start : 0,plast ? plast-start : 0);
const reg_trie_data * const trie =
(reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
#ifdef DEBUGGING
- AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
+ AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
#endif
const regnode *nextbranch= NULL;
I32 word_idx;
- sv_setpvn(sv, "", 0);
+ sv_setpvs(sv, "");
for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);