*/
/* The names of the functions have been changed from regcomp and
- * regexec to pregcomp and pregexec in order to avoid conflicts
+ * regexec to pregcomp and pregexec in order to avoid conflicts
* with the POSIX routines of the same names.
*/
# include "regcomp.h"
#endif
+#include "dquote_static.c"
+
#ifdef op
#undef op
#endif /* op */
I32 orig_utf8; /* whether the pattern was originally in utf8 */
/* XXX use this for future optimisation of case
* where pattern must be upgraded to utf8. */
- HV *charnames; /* cache of named sequences */
HV *paren_names; /* Paren names */
regnode **recurse; /* Recurse regops */
#define RExC_seen_evals (pRExC_state->seen_evals)
#define RExC_utf8 (pRExC_state->utf8)
#define RExC_orig_utf8 (pRExC_state->orig_utf8)
-#define RExC_charnames (pRExC_state->charnames)
#define RExC_open_parens (pRExC_state->open_parens)
#define RExC_close_parens (pRExC_state->close_parens)
#define RExC_opend (pRExC_state->opend)
*/
#define WORST 0 /* Worst case. */
#define HASWIDTH 0x01 /* Known to match non-null strings. */
-#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
+
+/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
+ * character, and if utf8, must be invariant. */
+#define SIMPLE 0x02
#define SPSTART 0x04 /* Starts with * or +. */
#define TRYAGAIN 0x08 /* Weeded out a declaration. */
#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
+/* If not already in utf8, do a longjmp back to the beginning */
+#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
+#define REQUIRE_UTF8 STMT_START { \
+ if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
+ } STMT_END
/* About scan_data_t.
various inplace (keyhole style) optimisations. In addition study_chunk
and scan_commit populate this data structure with information about
what strings MUST appear in the pattern. We look for the longest
- string that must appear for at a fixed location, and we look for the
+ string that must appear at a fixed location, and we look for the
longest string that may appear at a floating location. So for instance
in the pattern:
- offset or min_offset
This is the position the string must appear at, or not before.
It also implicitly (when combined with minlenp) tells us how many
- character must match before the string we are searching.
- Likewise when combined with minlenp and the length of the string
+ characters must match before the string we are searching for.
+ Likewise when combined with minlenp and the length of the string it
tells us how many characters must appear after the string we have
found.
- max_offset
Only used for floating strings. This is the rightmost point that
- the string can appear at. Ifset to I32 max it indicates that the
+ the string can appear at. If set to I32 max it indicates that the
string can occur infinitely far to the right.
- minlenp
#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
#define SCF_SEEN_ACCEPT 0x8000
-#define UTF (RExC_utf8 != 0)
-#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
-#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
+#define UTF cBOOL(RExC_utf8)
+#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
+#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
+#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
#define OOB_UNICODE 12345678
#define OOB_NAMEDCLASS -1
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
-#define vWARN(loc,m) STMT_START { \
+#define ckWARNreg(loc,m) STMT_START { \
const IV offset = loc - RExC_precomp; \
- Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
- m, (int)offset, RExC_precomp, RExC_precomp + offset); \
+ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
+ (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
-#define vWARNdep(loc,m) STMT_START { \
+#define ckWARNregdep(loc,m) STMT_START { \
const IV offset = loc - RExC_precomp; \
- Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
- "%s" REPORT_LOCATION, \
- m, (int)offset, RExC_precomp, RExC_precomp + offset); \
+ Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
+ m REPORT_LOCATION, \
+ (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
-
-#define vWARN2(loc, m, a1) STMT_START { \
+#define ckWARN2reg(loc, m, a1) STMT_START { \
const IV offset = loc - RExC_precomp; \
- Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
+ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
+#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
+ const IV offset = loc - RExC_precomp; \
+ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
+ a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
+} STMT_END
+
#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
+#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
+ const IV offset = loc - RExC_precomp; \
+ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
+ a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
+} STMT_END
+
#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
#define EXPERIMENTAL_INPLACESCAN
-#endif /*RE_TRACK_PATTERN_OFFSETS*/
+#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
#define DEBUG_STUDYDATA(str,data,depth) \
DEBUG_OPTIMISE_MORE_r(if(data){ \
cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
if (LOC)
cl->flags |= ANYOF_LOCALE;
+ cl->flags |= ANYOF_FOLD;
}
/* Can match anything (initialization) */
if (!(and_with->flags & ANYOF_EOS))
cl->flags &= ~ANYOF_EOS;
- if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
+ if (!(and_with->flags & ANYOF_FOLD))
+ cl->flags &= ~ANYOF_FOLD;
+
+ if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
!(and_with->flags & ANYOF_INVERT)) {
cl->flags &= ~ANYOF_UNICODE_ALL;
- cl->flags |= ANYOF_UNICODE;
+ cl->flags |= ANYOF_NONBITMAP;
ARG_SET(cl, ARG(and_with));
}
if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
!(and_with->flags & ANYOF_INVERT))
cl->flags &= ~ANYOF_UNICODE_ALL;
- if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
+ if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
!(and_with->flags & ANYOF_INVERT))
- cl->flags &= ~ANYOF_UNICODE;
+ cl->flags &= ~ANYOF_NONBITMAP;
}
/* 'OR' a given class with another one. Can create false positives */
if (or_with->flags & ANYOF_EOS)
cl->flags |= ANYOF_EOS;
- if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
+ if (or_with->flags & ANYOF_FOLD)
+ cl->flags |= ANYOF_FOLD;
+
+ /* If both nodes match something outside the bitmap, but what they match
+ * outside is not the same pointer, and hence not easily compared, give up
+ * and allow the start class to match everything outside the bitmap */
+ if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
ARG(cl) != ARG(or_with)) {
cl->flags |= ANYOF_UNICODE_ALL;
- cl->flags &= ~ANYOF_UNICODE;
+ cl->flags &= ~ANYOF_NONBITMAP;
}
+
if (or_with->flags & ANYOF_UNICODE_ALL) {
cl->flags |= ANYOF_UNICODE_ALL;
- cl->flags &= ~ANYOF_UNICODE;
+ cl->flags &= ~ANYOF_NONBITMAP;
}
}
Dumps the final compressed table form of the trie to Perl_debug_log.
Used for debugging make_trie().
*/
-
+
STATIC void
S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
AV *revcharmap, U32 depth)
U32 state;
SV *sv=sv_newmortal();
int colwidth= widecharmap ? 6 : 4;
+ U16 word;
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_DUMP_TRIE;
}
PerlIO_printf( Perl_debug_log, "\n" );
}
+ PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
+ for (word=1; word <= trie->wordcount; word++) {
+ PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
+ (int)word, (int)(trie->wordinfo[word].prev),
+ (int)(trie->wordinfo[word].len));
+ }
+ PerlIO_printf(Perl_debug_log, "\n" );
}
/*
Dumps a fully constructed but uncompressed trie in list form.
#endif
+
/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
startbranch: the first branch in the whole branch sequence
first : start branch of sequence of branch-exact nodes.
U16 dupe= trie->states[ state ].wordnum; \
regnode * const noper_next = regnext( noper ); \
\
- if (trie->wordlen) \
- trie->wordlen[ curword ] = wordlen; \
DEBUG_r({ \
/* store the word for dumping */ \
SV* tmp; \
}); \
\
curword++; \
+ trie->wordinfo[curword].prev = 0; \
+ trie->wordinfo[curword].len = wordlen; \
+ trie->wordinfo[curword].accept = state; \
\
if ( noper_next < tail ) { \
if (!trie->jump) \
} \
\
if ( dupe ) { \
- /* So it's a dupe. This means we need to maintain a */\
- /* linked-list from the first to the next. */\
- /* we only allocate the nextword buffer when there */\
- /* a dupe, so first time we have to do the allocation */\
- if (!trie->nextword) \
- trie->nextword = (U16 *) \
- PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
- while ( trie->nextword[dupe] ) \
- dupe= trie->nextword[dupe]; \
- trie->nextword[dupe]= curword; \
+ /* It's a dupe. Pre-insert into the wordinfo[].prev */\
+ /* chain, so that when the bits of chain are later */\
+ /* linked together, the dups appear in the chain */\
+ trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
+ trie->wordinfo[dupe].prev = curword; \
} else { \
/* we haven't inserted this word yet. */ \
trie->states[ state ].wordnum = curword; \
regnode *jumper = NULL;
regnode *nextbranch = NULL;
regnode *convert = NULL;
+ U32 *prev_states; /* temp array mapping each state to previous one */
/* we just use folder as a flag in utf8 */
const U8 * const folder = ( flags == EXACTF
? PL_fold
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
if (!(UTF && folder))
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
+ trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
+ trie->wordcount+1, sizeof(reg_trie_wordinfo));
+
DEBUG_r({
trie_words = newAV();
});
*TODO* If we keep track of how many times each character is used we can
remap the columns so that the table compression later on is more
- efficient in terms of memory by ensuring most common value is in the
+ efficient in terms of memory by ensuring the most common value is in the
middle and the least common are on the outside. IMO this would be better
than a most to least common mapping as theres a decent chance the most
common letter will share a node with the least common, meaning the node
(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
(int)trie->minlen, (int)trie->maxlen )
);
- trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
/*
We now know what we are dealing with in terms of unique chars and
*/
+ Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
+ prev_states[1] = 0;
+
if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
/*
Second Pass -- Array Of Lists Representation
}
if ( ! newstate ) {
newstate = next_alloc++;
+ prev_states[newstate] = state;
TRIE_LIST_PUSH( state, charid, newstate );
transcount++;
}
We then construct the trie using only the .next slots of the entry
structs.
- We use the .check field of the first entry of the node temporarily to
+ We use the .check field of the first entry of the node temporarily to
make compression both faster and easier by keeping track of how many non
zero fields are in the node.
if ( !trie->trans[ state + charid ].next ) {
trie->trans[ state + charid ].next = next_alloc;
trie->trans[ state ].check++;
+ prev_states[TRIE_NODENUM(next_alloc)]
+ = TRIE_NODENUM(state);
next_alloc += trie->uniquecharcount;
}
state = trie->trans[ state + charid ].next;
- Each states[] entry contains a .base field which indicates the
index in the state[] array wheres its transition data is stored.
- - If .base is 0 there are no valid transitions from that node.
+ - If .base is 0 there are no valid transitions from that node.
- If .base is nonzero then charid is added to it to find an entry in
the trans array.
XXX - wrong maybe?
The following process inplace converts the table to the compressed
- table: We first do not compress the root node 1,and mark its all its
+ table: We first do not compress the root node 1,and mark all its
.check pointers as 1 and set its .base pointer as 1 as well. This
- allows to do a DFA construction from the compressed table later, and
- ensures that any .base pointers we calculate later are greater than
- 0.
+ allows us to do a DFA construction from the compressed table later,
+ and ensures that any .base pointers we calculate later are greater
+ than 0.
- We set 'pos' to indicate the first entry of the second node.
PerlMemShared_realloc( trie->trans, trie->lasttrans
* sizeof(reg_trie_trans) );
- /* and now dump out the compressed format */
- DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
-
- { /* Modify the program and insert the new TRIE node*/
+ { /* Modify the program and insert the new TRIE node */
U8 nodetype =(U8)(flags & 0xFF);
char *str=NULL;
depending on whether the thing following (in 'last') is a branch
or not and whther first is the startbranch (ie is it a sub part of
the alternation or is it the whole thing.)
- Assuming its a sub part we conver the EXACT otherwise we convert
+ Assuming its a sub part we convert the EXACT otherwise we convert
the whole branch sequence, including the first.
*/
/* Find the node we are going to overwrite */
break;
}
}
+ trie->prefixlen = (state-1);
if (str) {
regnode *n = convert+NODE_SZ_STR(convert);
NEXT_OFF(convert) = NODE_SZ_STR(convert);
if (trie->jump)
trie->jump[0] = (U16)(nextbranch - convert);
- /* XXXX */
- if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
- ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
+ /* If the start state is not accepting (meaning there is no empty string/NOTHING)
+ * and there is a bitmap
+ * and the first "jump target" node we found leaves enough room
+ * then convert the TRIE node into a TRIEC node, with the bitmap
+ * embedded inline in the opcode - this is hypothetically faster.
+ */
+ if ( !trie->states[trie->startstate].wordnum
+ && trie->bitmap
+ && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
{
OP( convert ) = TRIEC;
Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
});
} /* end node insert */
+
+ /* Finish populating the prev field of the wordinfo array. Walk back
+ * from each accept state until we find another accept state, and if
+ * so, point the first word's .prev field at the second word. If the
+ * second already has a .prev field set, stop now. This will be the
+ * case either if we've already processed that word's accept state,
+ * or that state had multiple words, and the overspill words were
+ * already linked up earlier.
+ */
+ {
+ U16 word;
+ U32 state;
+ U16 prev;
+
+ for (word=1; word <= trie->wordcount; word++) {
+ prev = 0;
+ if (trie->wordinfo[word].prev)
+ continue;
+ state = trie->wordinfo[word].accept;
+ while (state) {
+ state = prev_states[state];
+ if (!state)
+ break;
+ prev = trie->states[state].wordnum;
+ if (prev)
+ break;
+ }
+ trie->wordinfo[word].prev = prev;
+ }
+ Safefree(prev_states);
+ }
+
+
+ /* and now dump out the compressed format */
+ DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
+
RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
#ifdef DEBUGGING
RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
STATIC void
S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
{
-/* The Trie is constructed and compressed now so we can build a fail array now if its needed
+/* The Trie is constructed and compressed now so we can build a fail array if it's needed
This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
"Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
ISBN 0-201-10088-6
We find the fail state for each state in the trie, this state is the longest proper
- suffix of the current states 'word' that is also a proper prefix of another word in our
- trie. State 1 represents the word '' and is the thus the default fail state. This allows
+ suffix of the current state's 'word' that is also a proper prefix of another word in our
+ trie. State 1 represents the word '' and is thus the default fail state. This allows
the DFA not to have to restart after its tried and failed a word at a given point, it
simply continues as though it had been matching the other word in the first place.
Consider
'abcdgu'=~/abcdefg|cdgu/
When we get to 'd' we are still matching the first word, we would encounter 'g' which would
- fail, which would bring use to the state representing 'd' in the second word where we would
- try 'g' and succeed, prodceding to match 'cdgu'.
+ fail, which would bring us to the state representing 'd' in the second word where we would
+ try 'g' and succeed, proceeding to match 'cdgu'.
*/
/* add a fail transition */
const U32 trie_offset = ARG(source);
We have two cases
- 1. patterns where the whole set of branch can be converted.
+ 1. patterns where the whole set of branches can be converted.
2. patterns where only a subset can be converted.
In case 1 we can replace the whole set with a single regop
for the trie. In case 2 we need to keep the start and end
- branchs so
+ branches so
'BRANCH EXACT; BRANCH EXACT; BRANCH X'
becomes BRANCH TRIE; BRANCH X;
}
} else {
/*
- Currently we assume that the trie can handle unicode and ascii
- matches fold cased matches. If this proves true then the following
- define will prevent tries in this situation.
-
- #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
-*/
+ Currently we do not believe that the trie logic can
+ handle case insensitive matching properly when the
+ pattern is not unicode (thus forcing unicode semantics).
+
+ If/when this is fixed the following define can be swapped
+ in below to fully enable trie logic.
+
#define TRIE_TYPE_IS_SAFE 1
+
+*/
+#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
+
if ( last && TRIE_TYPE_IS_SAFE ) {
make_trie( pRExC_state,
startbranch, first, cur, tail, count,
}
flags &= ~SCF_DO_STCLASS;
}
- else if (strchr((const char*)PL_varies,OP(scan))) {
+ else if (REGNODE_VARIES(OP(scan))) {
I32 mincount, maxcount, minnext, deltanext, fl = 0;
I32 f = flags, pos_before = 0;
regnode * const oscan = scan;
/* These are the cases when once a subexpression
fails at a particular position, it cannot succeed
even after backtracking at the enclosing scope.
-
+
XXXX what if minimal match and we are at the
initial run of {n,m}? */
if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
(next_is_eval || !(mincount == 0 && maxcount == 1))
&& (minnext == 0) && (deltanext == 0)
&& data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
- && maxcount <= REG_INFTY/3 /* Complement check for big count */
- && ckWARN(WARN_REGEXP))
+ && maxcount <= REG_INFTY/3) /* Complement check for big count */
{
- vWARN(RExC_parse,
- "Quantifier unexpected on zero-length expression");
+ ckWARNreg(RExC_parse,
+ "Quantifier unexpected on zero-length expression");
}
min += minnext * mincount;
/* Skip open. */
nxt = regnext(nxt);
- if (!strchr((const char*)PL_simple,OP(nxt))
+ if (!REGNODE_SIMPLE(OP(nxt))
&& !(PL_regkind[OP(nxt)] == EXACT
&& STR_LEN(nxt) == 1))
goto nogo;
#ifdef DEBUGGING
OP(nxt1 + 1) = OPTIMIZED; /* was count. */
- NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
- NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
+ NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
+ NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
OP(nxt) = OPTIMIZED; /* was CLOSE. */
OP(nxt + 1) = OPTIMIZED; /* was count. */
- NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
+ NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
#endif
}
nogo:
nxt = nxt2;
OP(nxt2) = SUCCEED; /* Whas WHILEM */
/* Need to optimize away parenths. */
- if (data->flags & SF_IN_PAR) {
+ if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
/* Set the parenth number. */
regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
- if (OP(nxt) != CLOSE)
- FAIL("Panic opt close");
oscan->flags = (U8)ARG(nxt);
if (RExC_open_parens) {
RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
#if 0
while ( nxt1 && (OP(nxt1) != WHILEM)) {
regnode *nnxt = regnext(nxt1);
-
if (nnxt == nxt) {
if (reg_off_by_arg[OP(nxt1)])
ARG_SET(nxt1, nxt2 - nxt1);
else if ((OP(oscan) == CURLYX)
&& (flags & SCF_WHILEM_VISITED_POS)
/* See the comment on a similar expression above.
- However, this time it not a subexpression
+ However, this time it's not a subexpression
we care about, but the expression itself. */
&& (maxcount == REG_INFTY)
&& data && ++data->whilem_c < 16) {
if (UTF)
old = utf8_hop((U8*)s, old) - (U8*)s;
-
l -= old;
/* Get the added string: */
last_str = newSVpvn_utf8(s + old, l, UTF);
if (flags & SCF_DO_STCLASS_AND) {
for (value = 0; value < 256; value++)
if (!is_VERTWS_cp(value))
- ANYOF_BITMAP_CLEAR(data->start_class, value);
- }
- else {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ else {
for (value = 0; value < 256; value++)
if (is_VERTWS_cp(value))
- ANYOF_BITMAP_SET(data->start_class, value);
- }
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
if (flags & SCF_DO_STCLASS_OR)
cl_and(data->start_class, and_withp);
flags &= ~SCF_DO_STCLASS;
data->pos_delta += 1;
data->longest = &(data->longest_float);
}
-
}
else if (OP(scan) == FOLDCHAR) {
int d = ARG(scan)==0xDF ? 1 : 2;
data->longest = &(data->longest_float);
}
}
- else if (strchr((const char*)PL_simple,OP(scan))) {
+ else if (REGNODE_SIMPLE(OP(scan))) {
int value = 0;
if (flags & SCF_DO_SUBSTR) {
goto do_default;
if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
- || (data->start_class->flags & ANYOF_CLASS));
+ || ((data->start_class->flags & ANYOF_CLASS)
+ && ANYOF_CLASS_TEST_ANY_SET(data->start_class)));
cl_anything(pRExC_state, data->start_class);
}
if (flags & SCF_DO_STCLASS_AND || !value)
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
- for (value = 0; value < 256; value++)
- if (!isALNUM(value))
- ANYOF_BITMAP_CLEAR(data->start_class, value);
+ if (FLAGS(scan) & USE_UNI) {
+ for (value = 0; value < 256; value++) {
+ if (!isWORDCHAR_L1(value)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ }
+ } else {
+ for (value = 0; value < 256; value++) {
+ if (!isALNUM(value)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ }
+ }
}
}
else {
if (data->start_class->flags & ANYOF_LOCALE)
ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
- else {
- for (value = 0; value < 256; value++)
- if (isALNUM(value))
- ANYOF_BITMAP_SET(data->start_class, value);
- }
+ else if (FLAGS(scan) & USE_UNI) {
+ for (value = 0; value < 256; value++) {
+ if (isWORDCHAR_L1(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
+ } else {
+ for (value = 0; value < 256; value++) {
+ if (isALNUM(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
+ }
}
break;
case ALNUML:
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
- for (value = 0; value < 256; value++)
- if (isALNUM(value))
- ANYOF_BITMAP_CLEAR(data->start_class, value);
+ if (FLAGS(scan) & USE_UNI) {
+ for (value = 0; value < 256; value++) {
+ if (isWORDCHAR_L1(value)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ }
+ } else {
+ for (value = 0; value < 256; value++) {
+ if (isALNUM(value)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ }
+ }
}
}
else {
else {
for (value = 0; value < 256; value++)
if (!isALNUM(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
- for (value = 0; value < 256; value++)
- if (!isSPACE(value))
- ANYOF_BITMAP_CLEAR(data->start_class, value);
+ if (FLAGS(scan) & USE_UNI) {
+ for (value = 0; value < 256; value++) {
+ if (!isSPACE_L1(value)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ }
+ } else {
+ for (value = 0; value < 256; value++) {
+ if (!isSPACE(value)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ }
+ }
}
}
else {
- if (data->start_class->flags & ANYOF_LOCALE)
+ if (data->start_class->flags & ANYOF_LOCALE) {
ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
- else {
- for (value = 0; value < 256; value++)
- if (isSPACE(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ else if (FLAGS(scan) & USE_UNI) {
+ for (value = 0; value < 256; value++) {
+ if (isSPACE_L1(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
+ } else {
+ for (value = 0; value < 256; value++) {
+ if (isSPACE(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
}
}
break;
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
- for (value = 0; value < 256; value++)
- if (isSPACE(value))
- ANYOF_BITMAP_CLEAR(data->start_class, value);
+ if (FLAGS(scan) & USE_UNI) {
+ for (value = 0; value < 256; value++) {
+ if (isSPACE_L1(value)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ }
+ } else {
+ for (value = 0; value < 256; value++) {
+ if (isSPACE(value)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ }
+ }
}
}
else {
if (data->start_class->flags & ANYOF_LOCALE)
ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
- else {
- for (value = 0; value < 256; value++)
- if (!isSPACE(value))
- ANYOF_BITMAP_SET(data->start_class, value);
- }
+ else if (FLAGS(scan) & USE_UNI) {
+ for (value = 0; value < 256; value++) {
+ if (!isSPACE_L1(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
+ }
+ else {
+ for (value = 0; value < 256; value++) {
+ if (!isSPACE(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
+ }
}
break;
case NSPACEL:
else {
for (value = 0; value < 256; value++)
if (isDIGIT(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
else {
for (value = 0; value < 256; value++)
if (!isDIGIT(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
data->whilem_c = data_fake.whilem_c;
}
if (f & SCF_DO_STCLASS_AND) {
- const int was = (data->start_class->flags & ANYOF_EOS);
-
- cl_and(data->start_class, &intrnl);
- if (was)
- data->start_class->flags |= ANYOF_EOS;
+ if (flags & SCF_DO_STCLASS_OR) {
+ /* OR before, AND after: ideally we would recurse with
+ * data_fake to get the AND applied by study of the
+ * remainder of the pattern, and then derecurse;
+ * *** HACK *** for now just treat as "no information".
+ * See [perl #56690].
+ */
+ cl_init(pRExC_state, data->start_class);
+ } else {
+ /* AND before and after: combine and continue */
+ const int was = (data->start_class->flags & ANYOF_EOS);
+
+ cl_and(data->start_class, &intrnl);
+ if (was)
+ data->start_class->flags |= ANYOF_EOS;
+ }
}
}
#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
int f = 0;
/* We use SAVEFREEPV so that when the full compile
is finished perl will clean up the allocated
- minlens when its all done. This was we don't
+ minlens when it's all done. This way we don't
have to worry about freeing them when we know
they wont be used, which would be a pain.
*/
struct regexp *r;
register regexp_internal *ri;
STRLEN plen;
- char *exp = SvPV(pattern, plen);
- char* xend = exp + plen;
+ char *exp;
+ char* xend;
regnode *scan;
I32 flags;
I32 minlen = 0;
+
+ /* these are all flags - maybe they should be turned
+ * into a single int with different bit masks */
+ I32 sawlookahead = 0;
I32 sawplus = 0;
I32 sawopen = 0;
+
+ U8 jump_ret = 0;
+ dJMPENV;
scan_data_t data;
RExC_state_t RExC_state;
RExC_state_t * const pRExC_state = &RExC_state;
#ifdef TRIE_STUDY_OPT
- int restudied= 0;
+ int restudied;
RExC_state_t copyRExC_state;
#endif
GET_RE_DEBUG_FLAGS_DECL;
RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
- DEBUG_COMPILE_r({
- SV *dsv= sv_newmortal();
- RE_PV_QUOTED_DECL(s, RExC_utf8,
- dsv, exp, plen, 60);
- PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
- PL_colors[4],PL_colors[5],s);
- });
-redo_first_pass:
+ /* Longjmp back to here if have to switch in midstream to utf8 */
+ if (! RExC_orig_utf8) {
+ JMPENV_PUSH(jump_ret);
+ }
+
+ if (jump_ret == 0) { /* First time through */
+ exp = SvPV(pattern, plen);
+ xend = exp + plen;
+
+ DEBUG_COMPILE_r({
+ SV *dsv= sv_newmortal();
+ RE_PV_QUOTED_DECL(s, RExC_utf8,
+ dsv, exp, plen, 60);
+ PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
+ PL_colors[4],PL_colors[5],s);
+ });
+ }
+ else { /* longjumped back */
+ STRLEN len = plen;
+
+ /* If the cause for the longjmp was other than changing to utf8, pop
+ * our own setjmp, and longjmp to the correct handler */
+ if (jump_ret != UTF8_LONGJMP) {
+ JMPENV_POP;
+ JMPENV_JUMP(jump_ret);
+ }
+
+ GET_RE_DEBUG_FLAGS;
+
+ /* It's possible to write a regexp in ascii that represents Unicode
+ codepoints outside of the byte range, such as via \x{100}. If we
+ detect such a sequence we have to convert the entire pattern to utf8
+ and then recompile, as our sizing calculation will have been based
+ on 1 byte == 1 character, but we will need to use utf8 to encode
+ at least some part of the pattern, and therefore must convert the whole
+ thing.
+ -- dmq */
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
+ exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
+ xend = exp + len;
+ RExC_orig_utf8 = RExC_utf8 = 1;
+ SAVEFREEPV(exp);
+ }
+
+#ifdef TRIE_STUDY_OPT
+ restudied = 0;
+#endif
+
RExC_precomp = exp;
RExC_flags = pm_flags;
RExC_sawback = 0;
RExC_size = 0L;
RExC_emit = &PL_regdummy;
RExC_whilem_seen = 0;
- RExC_charnames = NULL;
RExC_open_parens = NULL;
RExC_close_parens = NULL;
RExC_opend = NULL;
RExC_precomp = NULL;
return(NULL);
}
- if (RExC_utf8 && !RExC_orig_utf8) {
- /* It's possible to write a regexp in ascii that represents Unicode
- codepoints outside of the byte range, such as via \x{100}. If we
- detect such a sequence we have to convert the entire pattern to utf8
- and then recompile, as our sizing calculation will have been based
- on 1 byte == 1 character, but we will need to use utf8 to encode
- at least some part of the pattern, and therefore must convert the whole
- thing.
- XXX: somehow figure out how to make this less expensive...
- -- dmq */
- STRLEN len = plen;
- DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
- "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
- exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
- xend = exp + len;
- RExC_orig_utf8 = RExC_utf8;
- SAVEFREEPV(exp);
- goto redo_first_pass;
+
+ /* Here, finished first pass. Get rid of our setjmp, which we added for
+ * efficiency only if the passed-in string wasn't in utf8, as shown by
+ * RExC_orig_utf8. But if the first pass was redone, that variable will be
+ * 1 here even though the original string wasn't utf8, but in this case
+ * there will have been a long jump */
+ if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
+ JMPENV_POP;
}
DEBUG_PARSE_r({
PerlIO_printf(Perl_debug_log,
r->extflags = pm_flags;
{
bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
- bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
+ bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
+
+ /* The caret is output if there are any defaults: if not all the STD
+ * flags are set, or if no character set specifier is needed */
+ bool has_default =
+ (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
+ || ! has_charset);
bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
>> RXf_PMf_STD_PMMOD_SHIFT);
const char *fptr = STD_PAT_MODS; /*"msix"*/
char *p;
- const STRLEN wraplen = plen + has_minus + has_p + has_runon
+ /* Allocate for the worst case, which is all the std flags are turned
+ * on. If more precision is desired, we could do a population count of
+ * the flags set. This could be done with a small lookup table, or by
+ * shifting, masking and adding, or even, when available, assembly
+ * language for a machine-language population count.
+ * We never output a minus, as all those are defaults, so are
+ * covered by the caret */
+ const STRLEN wraplen = plen + has_p + has_runon
+ + has_default /* If needs a caret */
+ + has_charset /* If needs a character set specifier */
+ (sizeof(STD_PAT_MODS) - 1)
+ (sizeof("(?:)") - 1);
- p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
- SvCUR_set(rx, wraplen);
+ p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
SvPOK_on(rx);
SvFLAGS(rx) |= SvUTF8(pattern);
*p++='('; *p++='?';
+
+ /* If a default, cover it using the caret */
+ if (has_default) {
+ *p++= DEFAULT_PAT_MOD;
+ }
+ if (has_charset) {
+ if (r->extflags & RXf_PMf_LOCALE) {
+ *p++ = LOCALE_PAT_MOD;
+ } else {
+ *p++ = UNICODE_PAT_MOD;
+ }
+ }
if (has_p)
*p++ = KEEPCOPY_PAT_MOD; /*'p'*/
{
- char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
- char *colon = r + 1;
char ch;
-
while((ch = *fptr++)) {
if(reganch & 1)
*p++ = ch;
- else
- *r-- = ch;
reganch >>= 1;
}
- if(has_minus) {
- *r = '-';
- p = colon;
- }
}
*p++ = ':';
*p++ = '\n';
*p++ = ')';
*p = 0;
+ SvCUR_set(rx, p - SvPVX_const(rx));
}
r->intflags = 0;
}
reStudy:
- r->minlen = minlen = sawplus = sawopen = 0;
+ r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
Zero(r->substrs, 1, struct reg_substr_data);
#ifdef TRIE_STUDY_OPT
I32 last_close = 0; /* pointed to by data */
regnode *first= scan;
regnode *first_next= regnext(first);
-
/*
* Skip introductions and multiplicators >= 1
* so that we can extract the 'meat' of the pattern that must
/* An OR of *one* alternative - should not happen now. */
(OP(first) == BRANCH && OP(first_next) != BRANCH) ||
/* for now we can't handle lookbehind IFMATCH*/
- (OP(first) == IFMATCH && !first->flags) ||
+ (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
(OP(first) == PLUS) ||
(OP(first) == MINMOD) ||
/* An {n,m} with n>0 */
ri->regstclass = trie_op;
}
#endif
- else if (strchr((const char*)PL_simple,OP(first)))
+ else if (REGNODE_SIMPLE(OP(first)))
ri->regstclass = first;
else if (PL_regkind[OP(first)] == BOUND ||
PL_regkind[OP(first)] == NBOUND)
first = NEXTOPER(first);
goto again;
}
- if (sawplus && (!sawopen || !RExC_sawback)
+ if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
&& !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
/* x+ must match at the 1st pos of run of x's */
r->intflags |= PREGf_SKIP;
#endif
#ifdef DEBUGGING
if (RExC_paren_names) {
- ri->name_list_idx = add_data( pRExC_state, 1, "p" );
+ ri->name_list_idx = add_data( pRExC_state, 1, "a" );
ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
} else
#endif
if (flags & RXapif_FETCH) {
return reg_named_buff_fetch(rx, key, flags);
} else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
- Perl_croak(aTHX_ "%s", PL_no_modify);
+ Perl_croak_no_modify(aTHX);
return NULL;
} else if (flags & RXapif_EXISTS) {
return reg_named_buff_exists(rx, key, flags)
PERL_UNUSED_ARG(value);
if (!PL_localizing)
- Perl_croak(aTHX_ "%s", PL_no_modify);
+ Perl_croak_no_modify(aTHX);
}
I32
if (*RExC_parse == '?') { /* (?...) */
bool is_logical = 0;
const char * const seqstart = RExC_parse;
+ bool has_use_defaults = FALSE;
RExC_parse++;
paren = *RExC_parse++;
RExC_parse++;
case '=': /* (?=...) */
RExC_seen_zerolen++;
- break;
+ break;
case '!': /* (?!...) */
RExC_seen_zerolen++;
if (*RExC_parse == ')') {
/*
Diagram of capture buffer numbering.
Top line is the normal capture buffer numbers
- Botton line is the negative indexing as from
+ Bottom line is the negative indexing as from
the X (the (?-2))
+ 1 2 3 4 5 X 6 7
RExC_parse--; /* for vFAIL to print correctly */
vFAIL("Sequence (? incomplete");
break;
+ case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
+ that follow */
+ has_use_defaults = TRUE;
+ STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
+ RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+ goto parse_flags;
default:
--RExC_parse;
parse_flags: /* (?i) */
{
U32 posflags = 0, negflags = 0;
U32 *flagsp = &posflags;
+ bool has_charset_modifier = 0;
while (*RExC_parse) {
/* && strchr("iogcmsx", *RExC_parse) */
and must be globally applied -- japhy */
switch (*RExC_parse) {
CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
+ case LOCALE_PAT_MOD:
+ if (has_charset_modifier || flagsp == &negflags) {
+ goto fail_modifiers;
+ }
+ posflags |= RXf_PMf_LOCALE;
+ negflags |= RXf_PMf_UNICODE;
+ has_charset_modifier = 1;
+ break;
+ case UNICODE_PAT_MOD:
+ if (has_charset_modifier || flagsp == &negflags) {
+ goto fail_modifiers;
+ }
+ posflags |= RXf_PMf_UNICODE;
+ negflags |= RXf_PMf_LOCALE;
+ has_charset_modifier = 1;
+ break;
+ case DUAL_PAT_MOD:
+ if (has_use_defaults
+ || has_charset_modifier
+ || flagsp == &negflags)
+ {
+ goto fail_modifiers;
+ }
+ negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+ has_charset_modifier = 1;
+ break;
case ONCE_PAT_MOD: /* 'o' */
case GLOBAL_PAT_MOD: /* 'g' */
if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
break;
case KEEPCOPY_PAT_MOD: /* 'p' */
if (flagsp == &negflags) {
- if (SIZE_ONLY && ckWARN(WARN_REGEXP))
- vWARN(RExC_parse + 1,"Useless use of (?-p)");
+ if (SIZE_ONLY)
+ ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
} else {
*flagsp |= RXf_PMf_KEEPCOPY;
}
break;
case '-':
- if (flagsp == &negflags) {
+ /* A flag is a default iff it is following a minus, so
+ * if there is a minus, it means will be trying to
+ * re-specify a default which is an error */
+ if (has_use_defaults || flagsp == &negflags) {
+ fail_modifiers:
RExC_parse++;
vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
/*NOTREACHED*/
/* Pick up the branches, linking them together. */
parse_start = RExC_parse; /* MJD */
br = regbranch(pRExC_state, &flags, 1,depth+1);
+
+ if (freeze_paren) {
+ if (RExC_npar > after_freeze)
+ after_freeze = RExC_npar;
+ RExC_npar = freeze_paren;
+ }
+
/* branch_len = (paren != 0); */
if (br == NULL)
goto do_curly;
}
nest_check:
- if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
- vWARN3(RExC_parse,
- "%.*s matches null string many times",
- (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
- origparse);
+ if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
+ ckWARN3reg(RExC_parse,
+ "%.*s matches null string many times",
+ (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
+ origparse);
}
if (RExC_parse < RExC_end && *RExC_parse == '?') {
recognized '\N' and needs to handle the rest. RExC_parse is
expected to point at the first char following the N at the time
of the call.
+
+ The \N may be inside (indicated by valuep not being NULL) or outside a
+ character class.
+
+ \N may begin either a named sequence, or if outside a character class, mean
+ to match a non-newline. For non single-quoted regexes, the tokenizer has
+ attempted to decide which, and in the case of a named sequence converted it
+ into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
+ where c1... are the characters in the sequence. For single-quoted regexes,
+ the tokenizer passes the \N sequence through unchanged; this code will not
+ attempt to determine this nor expand those. The net effect is that if the
+ beginning of the passed-in pattern isn't '{U+' or there is no '}', it
+ signals that this \N occurrence means to match a non-newline.
+
+ Only the \N{U+...} form should occur in a character class, for the same
+ reason that '.' inside a character class means to just match a period: it
+ just doesn't make sense.
If valuep is non-null then it is assumed that we are parsing inside
of a charclass definition and the first codepoint in the resolved
string is returned via *valuep and the routine will return NULL.
In this mode if a multichar string is returned from the charnames
- handler a warning will be issued, and only the first char in the
+ handler, a warning will be issued, and only the first char in the
sequence will be examined. If the string returned is zero length
then the value of *valuep is undefined and NON-NULL will
be returned to indicate failure. (This will NOT be a valid pointer
to a regnode.)
- If valuep is null then it is assumed that we are parsing normal text
- and inserts a new EXACT node into the program containing the resolved
- string and returns a pointer to the new node. If the string is
- zerolength a NOTHING node is emitted.
+ If valuep is null then it is assumed that we are parsing normal text and a
+ new EXACT node is inserted into the program containing the resolved string,
+ and a pointer to the new node is returned. But if the string is zero length
+ a NOTHING node is emitted instead.
On success RExC_parse is set to the char following the endbrace.
- Parsing failures will generate a fatal errorvia vFAIL(...)
-
- NOTE: We cache all results from the charnames handler locally in
- the RExC_charnames hash (created on first use) to prevent a charnames
- handler from playing silly-buggers and returning a short string and
- then a long string for a given pattern. Since the regexp program
- size is calculated during an initial parse this would result
- in a buffer overrun so we cache to prevent the charname result from
- changing during the course of the parse.
-
+ Parsing failures will generate a fatal error via vFAIL(...)
*/
STATIC regnode *
S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
{
- char * name; /* start of the content of the name */
- char * endbrace; /* endbrace following the name */
- SV *sv_str = NULL;
- SV *sv_name = NULL;
- STRLEN len; /* this has various purposes throughout the code */
- bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
+ char * endbrace; /* '}' following the name */
regnode *ret = NULL;
+#ifdef DEBUGGING
+ char* parse_start = RExC_parse - 2; /* points to the '\N' */
+#endif
+ char* p;
+
+ GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REG_NAMEDSEQ;
+
+ GET_RE_DEBUG_FLAGS;
+
+ /* The [^\n] meaning of \N ignores spaces and comments under the /x
+ * modifier. The other meaning does not */
+ p = (RExC_flags & RXf_PMf_EXTENDED)
+ ? regwhite( pRExC_state, RExC_parse )
+ : RExC_parse;
- if (*RExC_parse != '{' ||
- (*RExC_parse == '{' && RExC_parse[1]
- && strchr("0123456789", RExC_parse[1])))
- {
- GET_RE_DEBUG_FLAGS_DECL;
- if (valuep)
+ /* Disambiguate between \N meaning a named character versus \N meaning
+ * [^\n]. The former is assumed when it can't be the latter. */
+ if (*p != '{' || regcurly(p)) {
+ RExC_parse = p;
+ if (valuep) {
/* no bare \N in a charclass */
- vFAIL("Missing braces on \\N{}");
- GET_RE_DEBUG_FLAGS;
+ vFAIL("\\N in a character class must be a named character: \\N{...}");
+ }
nextchar(pRExC_state);
ret = reg_node(pRExC_state, REG_ANY);
*flagp |= HASWIDTH|SIMPLE;
Set_Node_Length(ret, 1); /* MJD */
return ret;
}
- name = RExC_parse+1;
- endbrace = strchr(RExC_parse, '}');
- if ( ! endbrace ) {
- RExC_parse++;
- vFAIL("Missing right brace on \\N{}");
- }
- RExC_parse = endbrace + 1;
-
-
- /* RExC_parse points at the beginning brace,
- endbrace points at the last */
- if ( name[0]=='U' && name[1]=='+' ) {
- /* its a "Unicode hex" notation {U+89AB} */
- I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
- | PERL_SCAN_DISALLOW_PREFIX
- | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
- UV cp;
- len = (STRLEN)(endbrace - name - 2);
- cp = grok_hex(name + 2, &len, &fl, NULL);
- if ( len != (STRLEN)(endbrace - name - 2) ) {
- cp = 0xFFFD;
- }
- if ( valuep ) {
- if (cp > 0xff) RExC_utf8 = 1;
- *valuep = cp;
- return NULL;
- }
- /* Need to convert to utf8 if either: won't fit into a byte, or the re
- * is going to be in utf8 and the representation changes under utf8. */
- if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
- U8 string[UTF8_MAXBYTES+1];
- U8 *tmps;
- RExC_utf8 = 1;
- tmps = uvuni_to_utf8(string, cp);
- sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
- } else { /* Otherwise, no need for utf8, can skip that step */
- char string;
- string = (char)cp;
- sv_str= newSVpvn(&string, 1);
- }
- } else {
- /* fetch the charnames handler for this scope */
- HV * const table = GvHV(PL_hintgv);
- SV **cvp= table ?
- hv_fetchs(table, "charnames", FALSE) :
- NULL;
- SV *cv= cvp ? *cvp : NULL;
- HE *he_str;
- int count;
- /* create an SV with the name as argument */
- sv_name = newSVpvn(name, endbrace - name);
-
- if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
- vFAIL2("Constant(\\N{%s}) unknown: "
- "(possibly a missing \"use charnames ...\")",
- SvPVX(sv_name));
- }
- if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
- vFAIL2("Constant(\\N{%s}): "
- "$^H{charnames} is not defined",SvPVX(sv_name));
- }
-
-
-
- if (!RExC_charnames) {
- /* make sure our cache is allocated */
- RExC_charnames = newHV();
- sv_2mortal(MUTABLE_SV(RExC_charnames));
- }
- /* see if we have looked this one up before */
- he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
- if ( he_str ) {
- sv_str = HeVAL(he_str);
- cached = 1;
- } else {
- dSP ;
+ /* Here, we have decided it should be a named sequence */
- ENTER ;
- SAVETMPS ;
- PUSHMARK(SP) ;
-
- XPUSHs(sv_name);
-
- PUTBACK ;
-
- count= call_sv(cv, G_SCALAR);
-
- if (count == 1) { /* XXXX is this right? dmq */
- sv_str = POPs;
- SvREFCNT_inc_simple_void(sv_str);
- }
-
- SPAGAIN ;
- PUTBACK ;
- FREETMPS ;
- LEAVE ;
-
- if ( !sv_str || !SvOK(sv_str) ) {
- vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
- "did not return a defined value",SvPVX(sv_name));
- }
- if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
- cached = 1;
- }
+ /* The test above made sure that the next real character is a '{', but
+ * under the /x modifier, it could be separated by space (or a comment and
+ * \n) and this is not allowed (for consistency with \x{...} and the
+ * tokenizer handling of \N{NAME}). */
+ if (*RExC_parse != '{') {
+ vFAIL("Missing braces on \\N{}");
}
- if (valuep) {
- char *p = SvPV(sv_str, len);
- if (len) {
- STRLEN numlen = 1;
- if ( SvUTF8(sv_str) ) {
- *valuep = utf8_to_uvchr((U8*)p, &numlen);
- if (*valuep > 0x7F)
- RExC_utf8 = 1;
- /* XXXX
- We have to turn on utf8 for high bit chars otherwise
- we get failures with
-
- "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
- "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
-
- This is different from what \x{} would do with the same
- codepoint, where the condition is > 0xFF.
- - dmq
- */
-
-
- } else {
- *valuep = (UV)*p;
- /* warn if we havent used the whole string? */
- }
- if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
- vWARN2(RExC_parse,
- "Ignoring excess chars from \\N{%s} in character class",
- SvPVX(sv_name)
- );
- }
- } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
- vWARN2(RExC_parse,
- "Ignoring zero length \\N{%s} in character class",
- SvPVX(sv_name)
- );
- }
- if (sv_name)
- SvREFCNT_dec(sv_name);
- if (!cached)
- SvREFCNT_dec(sv_str);
- return len ? NULL : (regnode *)&len;
- } else if(SvCUR(sv_str)) {
-
- char *s;
- char *p, *pend;
- STRLEN charlen = 1;
-#ifdef DEBUGGING
- char * parse_start = name-3; /* needed for the offsets */
-#endif
- GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
-
- ret = reg_node(pRExC_state,
- (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
- s= STRING(ret);
-
- if ( RExC_utf8 && !SvUTF8(sv_str) ) {
- sv_utf8_upgrade(sv_str);
- } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
- RExC_utf8= 1;
- }
-
- p = SvPV(sv_str, len);
- pend = p + len;
- /* len is the length written, charlen is the size the char read */
- for ( len = 0; p < pend; p += charlen ) {
- if (UTF) {
- UV uvc = utf8_to_uvchr((U8*)p, &charlen);
- if (FOLD) {
- STRLEN foldlen,numlen;
- U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
- uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
- /* Emit all the Unicode characters. */
-
- for (foldbuf = tmpbuf;
- foldlen;
- foldlen -= numlen)
- {
- uvc = utf8_to_uvchr(foldbuf, &numlen);
- if (numlen > 0) {
- const STRLEN unilen = reguni(pRExC_state, uvc, s);
- s += unilen;
- len += unilen;
- /* In EBCDIC the numlen
- * and unilen can differ. */
- foldbuf += numlen;
- if (numlen >= foldlen)
- break;
- }
- else
- break; /* "Can't happen." */
- }
- } else {
- const STRLEN unilen = reguni(pRExC_state, uvc, s);
- if (unilen > 0) {
- s += unilen;
- len += unilen;
- }
- }
- } else {
- len++;
- REGC(*p, s++);
- }
- }
- if (SIZE_ONLY) {
- RExC_size += STR_SZ(len);
- } else {
- STR_LEN(ret) = len;
- RExC_emit += STR_SZ(len);
- }
- Set_Node_Cur_Length(ret); /* MJD */
- RExC_parse--;
- nextchar(pRExC_state);
- } else { /* zero length */
- ret = reg_node(pRExC_state,NOTHING);
+
+ RExC_parse++; /* Skip past the '{' */
+
+ if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
+ || ! (endbrace == RExC_parse /* nothing between the {} */
+ || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
+ && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
+ {
+ if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
+ vFAIL("\\N{NAME} must be resolved by the lexer");
}
- if (!cached) {
- SvREFCNT_dec(sv_str);
+
+ if (endbrace == RExC_parse) { /* empty: \N{} */
+ if (! valuep) {
+ RExC_parse = endbrace + 1;
+ return reg_node(pRExC_state,NOTHING);
+ }
+
+ if (SIZE_ONLY) {
+ ckWARNreg(RExC_parse,
+ "Ignoring zero length \\N{} in character class"
+ );
+ RExC_parse = endbrace + 1;
+ }
+ *valuep = 0;
+ return (regnode *) &RExC_parse; /* Invalid regnode pointer */
}
- if (sv_name) {
- SvREFCNT_dec(sv_name);
+
+ REQUIRE_UTF8; /* named sequences imply Unicode semantics */
+ RExC_parse += 2; /* Skip past the 'U+' */
+
+ if (valuep) { /* In a bracketed char class */
+ /* We only pay attention to the first char of
+ multichar strings being returned. I kinda wonder
+ if this makes sense as it does change the behaviour
+ from earlier versions, OTOH that behaviour was broken
+ as well. XXX Solution is to recharacterize as
+ [rest-of-class]|multi1|multi2... */
+
+ STRLEN length_of_hex;
+ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX
+ | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
+
+ char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
+ if (endchar < endbrace) {
+ ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
+ }
+
+ length_of_hex = (STRLEN)(endchar - RExC_parse);
+ *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
+
+ /* The tokenizer should have guaranteed validity, but it's possible to
+ * bypass it by using single quoting, so check */
+ if (length_of_hex == 0
+ || length_of_hex != (STRLEN)(endchar - RExC_parse) )
+ {
+ RExC_parse += length_of_hex; /* Includes all the valid */
+ RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
+ ? UTF8SKIP(RExC_parse)
+ : 1;
+ /* Guard against malformed utf8 */
+ if (RExC_parse >= endchar) RExC_parse = endchar;
+ vFAIL("Invalid hexadecimal number in \\N{U+...}");
+ }
+
+ RExC_parse = endbrace + 1;
+ if (endchar == endbrace) return NULL;
+
+ ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
}
- return ret;
+ else { /* Not a char class */
+ char *s; /* String to put in generated EXACT node */
+ STRLEN len = 0; /* Its current byte length */
+ char *endchar; /* Points to '.' or '}' ending cur char in the input
+ stream */
+
+ ret = reg_node(pRExC_state,
+ (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
+ s= STRING(ret);
+
+ /* Exact nodes can hold only a U8 length's of text = 255. Loop through
+ * the input which is of the form now 'c1.c2.c3...}' until find the
+ * ending brace or exceed length 255. The characters that exceed this
+ * limit are dropped. The limit could be relaxed should it become
+ * desirable by reparsing this as (?:\N{NAME}), so could generate
+ * multiple EXACT nodes, as is done for just regular input. But this
+ * is primarily a named character, and not intended to be a huge long
+ * string, so 255 bytes should be good enough */
+ while (1) {
+ STRLEN length_of_hex;
+ I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX
+ | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
+ UV cp; /* Ord of current character */
+
+ /* Code points are separated by dots. If none, there is only one
+ * code point, and is terminated by the brace */
+ endchar = RExC_parse + strcspn(RExC_parse, ".}");
+
+ /* The values are Unicode even on EBCDIC machines */
+ length_of_hex = (STRLEN)(endchar - RExC_parse);
+ cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
+ if ( length_of_hex == 0
+ || length_of_hex != (STRLEN)(endchar - RExC_parse) )
+ {
+ RExC_parse += length_of_hex; /* Includes all the valid */
+ RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
+ ? UTF8SKIP(RExC_parse)
+ : 1;
+ /* Guard against malformed utf8 */
+ if (RExC_parse >= endchar) RExC_parse = endchar;
+ vFAIL("Invalid hexadecimal number in \\N{U+...}");
+ }
+
+ if (! FOLD) { /* Not folding, just append to the string */
+ STRLEN unilen;
+
+ /* Quit before adding this character if would exceed limit */
+ if (len + UNISKIP(cp) > U8_MAX) break;
+
+ unilen = reguni(pRExC_state, cp, s);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
+ }
+ } else { /* Folding, output the folded equivalent */
+ STRLEN foldlen,numlen;
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
+ cp = toFOLD_uni(cp, tmpbuf, &foldlen);
+ /* Quit before exceeding size limit */
+ if (len + foldlen > U8_MAX) break;
+
+ for (foldbuf = tmpbuf;
+ foldlen;
+ foldlen -= numlen)
+ {
+ cp = utf8_to_uvchr(foldbuf, &numlen);
+ if (numlen > 0) {
+ const STRLEN unilen = reguni(pRExC_state, cp, s);
+ s += unilen;
+ len += unilen;
+ /* In EBCDIC the numlen and unilen can differ. */
+ foldbuf += numlen;
+ if (numlen >= foldlen)
+ break;
+ }
+ else
+ break; /* "Can't happen." */
+ }
+ }
+
+ /* Point to the beginning of the next character in the sequence. */
+ RExC_parse = endchar + 1;
+
+ /* Quit if no more characters */
+ if (RExC_parse >= endbrace) break;
+ }
+
+
+ if (SIZE_ONLY) {
+ if (RExC_parse < endbrace) {
+ ckWARNreg(RExC_parse - 1,
+ "Using just the first characters returned by \\N{}");
+ }
+
+ RExC_size += STR_SZ(len);
+ } else {
+ STR_LEN(ret) = len;
+ RExC_emit += STR_SZ(len);
+ }
+
+ RExC_parse = endbrace + 1;
+
+ *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
+ with malformed in t/re/pat_advanced.t */
+ RExC_parse --;
+ Set_Node_Cur_Length(ret); /* MJD */
+ nextchar(pRExC_state);
+ }
+
+ return ret;
}
*flagp |= HASWIDTH;
goto finish_meta_pat;
case 'w':
- ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
+ if (LOC) {
+ ret = reg_node(pRExC_state, (U8)(ALNUML));
+ } else {
+ ret = reg_node(pRExC_state, (U8)(ALNUM));
+ FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ }
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'W':
- ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
+ if (LOC) {
+ ret = reg_node(pRExC_state, (U8)(NALNUML));
+ } else {
+ ret = reg_node(pRExC_state, (U8)(NALNUM));
+ FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ }
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'b':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
+ if (LOC) {
+ ret = reg_node(pRExC_state, (U8)(BOUNDL));
+ } else {
+ ret = reg_node(pRExC_state, (U8)(BOUND));
+ FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ }
*flagp |= SIMPLE;
goto finish_meta_pat;
case 'B':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
+ if (LOC) {
+ ret = reg_node(pRExC_state, (U8)(NBOUNDL));
+ } else {
+ ret = reg_node(pRExC_state, (U8)(NBOUND));
+ FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ }
*flagp |= SIMPLE;
goto finish_meta_pat;
case 's':
- ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
+ if (LOC) {
+ ret = reg_node(pRExC_state, (U8)(SPACEL));
+ } else {
+ ret = reg_node(pRExC_state, (U8)(SPACE));
+ FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ }
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'S':
- ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
+ if (LOC) {
+ ret = reg_node(pRExC_state, (U8)(NSPACEL));
+ } else {
+ ret = reg_node(pRExC_state, (U8)(NSPACE));
+ FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+ }
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'd':
ender = ASCII_TO_NATIVE('\007');
p++;
break;
+ case 'o':
+ {
+ STRLEN brace_len = len;
+ UV result;
+ const char* error_msg;
+
+ bool valid = grok_bslash_o(p,
+ &result,
+ &brace_len,
+ &error_msg,
+ 1);
+ p += brace_len;
+ if (! valid) {
+ RExC_parse = p; /* going to die anyway; point
+ to exact spot of failure */
+ vFAIL(error_msg);
+ }
+ else
+ {
+ ender = result;
+ }
+ if (PL_encoding && ender < 0x100) {
+ goto recode_encoding;
+ }
+ if (ender > 0xff) {
+ REQUIRE_UTF8;
+ }
+ break;
+ }
case 'x':
if (*++p == '{') {
char* const e = strchr(p, '}');
STRLEN numlen = e - p - 1;
ender = grok_hex(p + 1, &numlen, &flags, NULL);
if (ender > 0xff)
- RExC_utf8 = 1;
+ REQUIRE_UTF8;
p = e + 1;
}
}
break;
case 'c':
p++;
- ender = UCHARAT(p++);
- ender = toCTRL(ender);
+ ender = grok_bslash_c(*p++, SIZE_ONLY);
break;
case '0': case '1': case '2': case '3':case '4':
case '5': case '6': case '7': case '8':case '9':
if (*p == '0' ||
- (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
- I32 flags = 0;
+ (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
+ {
+ I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
STRLEN numlen = 3;
ender = grok_oct(p, &numlen, &flags, NULL);
-
- /* An octal above 0xff is interpreted differently
- * depending on if the re is in utf8 or not. If it
- * is in utf8, the value will be itself, otherwise
- * it is interpreted as modulo 0x100. It has been
- * decided to discourage the use of octal above the
- * single-byte range. For now, warn only when
- * it ends up modulo */
- if (SIZE_ONLY && ender >= 0x100
- && ! UTF && ! PL_encoding
- && ckWARN2(WARN_DEPRECATED, WARN_REGEXP)) {
- vWARNdep(p, "Use of octal value above 377 is deprecated");
+ if (ender > 0xff) {
+ REQUIRE_UTF8;
}
p += numlen;
}
{
SV* enc = PL_encoding;
ender = reg_recode((const char)(U8)ender, &enc);
- if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
- vWARN(p, "Invalid escape in the specified encoding");
- RExC_utf8 = 1;
+ if (!enc && SIZE_ONLY)
+ ckWARNreg(p, "Invalid escape in the specified encoding");
+ REQUIRE_UTF8;
}
break;
case '\0':
FAIL("Trailing \\");
/* FALL THROUGH */
default:
- if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
- vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
+ if (!SIZE_ONLY&& isALPHA(*p))
+ ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
goto normal_default;
}
break;
while (isALNUM(*s))
s++;
if (*s && c == *s && s[1] == ']') {
- if (ckWARN(WARN_REGEXP))
- vWARN3(s+2,
- "POSIX syntax [%c %c] belongs inside character classes",
- c, c);
+ ckWARN3reg(s+2,
+ "POSIX syntax [%c %c] belongs inside character classes",
+ c, c);
/* [[=foo=]] and [[.foo.]] are still future. */
if (POSIXCC_NOTYET(c)) {
}
}
-
-#define _C_C_T_(NAME,TEST,WORD) \
+/* No locale test */
+#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
ANYOF_##NAME: \
- if (LOC) \
- ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
- else { \
for (value = 0; value < 256; value++) \
if (TEST) \
ANYOF_BITMAP_SET(ret, value); \
- } \
yesno = '+'; \
what = WORD; \
break; \
case ANYOF_N##NAME: \
- if (LOC) \
- ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
- else { \
for (value = 0; value < 256; value++) \
if (!TEST) \
ANYOF_BITMAP_SET(ret, value); \
- } \
yesno = '!'; \
what = WORD; \
break
-#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
+/* Like the above, but there are differences if we are in uni-8-bit or not, so
+ * there are two tests passed in, to use depending on that. There aren't any
+ * cases where the label is different from the name, so no need for that
+ * parameter */
+#define _C_C_T_(NAME,TEST_8,TEST_7,WORD) \
ANYOF_##NAME: \
- for (value = 0; value < 256; value++) \
- if (TEST) \
- ANYOF_BITMAP_SET(ret, value); \
+ if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
+ else if (UNI_SEMANTICS) { \
+ for (value = 0; value < 256; value++) { \
+ if (TEST_8) ANYOF_BITMAP_SET(ret, value); \
+ } \
+ } \
+ else { \
+ for (value = 0; value < 256; value++) { \
+ if (TEST_7) ANYOF_BITMAP_SET(ret, value); \
+ } \
+ } \
yesno = '+'; \
what = WORD; \
break; \
case ANYOF_N##NAME: \
- for (value = 0; value < 256; value++) \
- if (!TEST) \
- ANYOF_BITMAP_SET(ret, value); \
+ if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
+ else if (UNI_SEMANTICS) { \
+ for (value = 0; value < 256; value++) { \
+ if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
+ } \
+ } \
+ else { \
+ for (value = 0; value < 256; value++) { \
+ if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
+ } \
+ } \
yesno = '!'; \
what = WORD; \
break
#ifdef EBCDIC
UV literal_endpoint = 0;
#endif
- UV stored = 0; /* number of chars stored in the class */
+ UV stored = 0; /* 0, 1, or more than 1 chars stored in the class */
regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
case we need to change the emitted regop to an EXACT. */
(value=='p' ? '+' : '!'), (int)n, RExC_parse);
}
RExC_parse = e + 1;
- ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+ ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
namedclass = ANYOF_MAX; /* no official name, but it's named */
}
break;
case 'b': value = '\b'; break;
case 'e': value = ASCII_TO_NATIVE('\033');break;
case 'a': value = ASCII_TO_NATIVE('\007');break;
+ case 'o':
+ RExC_parse--; /* function expects to be pointed at the 'o' */
+ {
+ const char* error_msg;
+ bool valid = grok_bslash_o(RExC_parse,
+ &value,
+ &numlen,
+ &error_msg,
+ SIZE_ONLY);
+ RExC_parse += numlen;
+ if (! valid) {
+ vFAIL(error_msg);
+ }
+ }
+ if (PL_encoding && value < 0x100) {
+ goto recode_encoding;
+ }
+ break;
case 'x':
if (*RExC_parse == '{') {
I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
goto recode_encoding;
break;
case 'c':
- value = UCHARAT(RExC_parse++);
- value = toCTRL(value);
+ value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
break;
case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
+ case '5': case '6': case '7':
{
- I32 flags = 0;
+ /* Take 1-3 octal digits */
+ I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
numlen = 3;
value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
RExC_parse += numlen;
{
SV* enc = PL_encoding;
value = reg_recode((const char)(U8)value, &enc);
- if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
- vWARN(RExC_parse,
- "Invalid escape in the specified encoding");
+ if (!enc && SIZE_ONLY)
+ ckWARNreg(RExC_parse,
+ "Invalid escape in the specified encoding");
break;
}
default:
- if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
- vWARN2(RExC_parse,
- "Unrecognized escape \\%c in character class passed through",
- (int)value);
+ /* Allow \_ to not give an error */
+ if (!SIZE_ONLY && isALNUM(value) && value != '_') {
+ ckWARN2reg(RExC_parse,
+ "Unrecognized escape \\%c in character class passed through",
+ (int)value);
+ }
break;
}
} /* end of \blah */
if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
- if (!SIZE_ONLY && !need_class)
- ANYOF_CLASS_ZERO(ret);
-
- need_class = 1;
+ /* What matches in a locale is not known until runtime, so need to
+ * (one time per class) allocate extra space to pass to regexec.
+ * The space will contain a bit for each named class that is to be
+ * matched against. This isn't needed for \p{} and pseudo-classes,
+ * as they are not affected by locale, and hence are dealt with
+ * separately */
+ if (LOC && namedclass < ANYOF_MAX && ! need_class) {
+ need_class = 1;
+ if (SIZE_ONLY) {
+ RExC_size += ANYOF_CLASS_ADD_SKIP;
+ }
+ else {
+ RExC_emit += ANYOF_CLASS_ADD_SKIP;
+ ANYOF_CLASS_ZERO(ret);
+ }
+ ANYOF_FLAGS(ret) |= ANYOF_CLASS;
+ }
/* a bad range like a-\d, a-[:digit:] ? */
if (range) {
if (!SIZE_ONLY) {
- if (ckWARN(WARN_REGEXP)) {
- const int w =
- RExC_parse >= rangebegin ?
- RExC_parse - rangebegin : 0;
- vWARN4(RExC_parse,
+ const int w =
+ RExC_parse >= rangebegin ?
+ RExC_parse - rangebegin : 0;
+ ckWARN4reg(RExC_parse,
"False [] range \"%*.*s\"",
w, w, rangebegin);
- }
+
if (prevvalue < 256) {
ANYOF_BITMAP_SET(ret, prevvalue);
ANYOF_BITMAP_SET(ret, '-');
}
else {
- ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+ ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
Perl_sv_catpvf(aTHX_ listsv,
"%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
}
* --jhi */
switch ((I32)namedclass) {
- case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
- case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
- case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
- case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
- case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
- case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
- case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
- case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
- case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
- case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
+ case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
+ case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
+ case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
+ case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
+ case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
+ case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
+ case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
+ case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
+ case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
+ case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
- case _C_C_T_(ALNUM, isALNUM(value), "Word");
- case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
+ /* \s, \w match all unicode if utf8. */
+ case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
+ case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
#else
- case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
- case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
+ /* \s, \w match ascii and locale only */
+ case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
+ case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
#endif
- case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
+ case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
case ANYOF_ASCII:
/* Strings such as "+utf8::isWord\n" */
Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
}
- if (LOC)
- ANYOF_FLAGS(ret) |= ANYOF_CLASS;
+ stored+=2; /* can't optimize this class */
continue;
}
} /* end of namedclass \blah */
const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
const UV natvalue = NATIVE_TO_UNI(value);
stored+=2; /* can't optimize this class */
- ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+ ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
if (prevnatvalue < natvalue) { /* what about > ? */
Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
prevnatvalue, natvalue);
range = 0; /* this range (if it was one) is done now */
}
- if (need_class) {
- ANYOF_FLAGS(ret) |= ANYOF_LARGE;
- if (SIZE_ONLY)
- RExC_size += ANYOF_CLASS_ADD_SKIP;
- else
- RExC_emit += ANYOF_CLASS_ADD_SKIP;
- }
if (SIZE_ONLY)
*STRING(ret)= (char)value;
STR_LEN(ret)= 1;
RExC_emit += STR_SZ(1);
- if (listsv) {
- SvREFCNT_dec(listsv);
- }
+ SvREFCNT_dec(listsv);
return ret;
}
/* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
/* nextchar()
- Advance that parse position, and optionall absorbs
+ Advances the parse position, and optionally absorbs
"whitespace" from the inputstream.
Without /x "whitespace" means (?#...) style comments only,
#endif
/*
- - regcurly - a little FSA that accepts {\d+,?\d*}
- */
-STATIC I32
-S_regcurly(register const char *s)
-{
- PERL_ARGS_ASSERT_REGCURLY;
-
- if (*s++ != '{')
- return FALSE;
- if (!isDIGIT(*s))
- return FALSE;
- while (isDIGIT(*s))
- s++;
- if (*s == ',')
- s++;
- while (isDIGIT(*s))
- s++;
- if (*s != '}')
- return FALSE;
- return TRUE;
-}
-
-
-/*
- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
*/
#ifdef DEBUGGING
EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
/* output any special charclass tests (used mostly under use locale) */
- if (o->flags & ANYOF_CLASS)
+ if (o->flags & ANYOF_CLASS && ANYOF_CLASS_TEST_ANY_SET(o))
for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
if (ANYOF_CLASS_TEST(o,i)) {
sv_catpv(sv, anyofs[i]);
EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
/* output information about the unicode matching */
- if (flags & ANYOF_UNICODE)
+ if (flags & ANYOF_NONBITMAP)
sv_catpvs(sv, "{unicode}");
else if (flags & ANYOF_UNICODE_ALL)
sv_catpvs(sv, "{unicode_all}");
handles refcounting and freeing the perl core regexp structure. When
it is necessary to actually free the structure the first thing it
- does is call the 'free' method of the regexp_engine associated to to
+ does is call the 'free' method of the regexp_engine associated to
the regexp, allowing the handling of the void *pprivate; member
first. (This routine is not overridable by extensions, which is why
the extensions free is called first.)
ReREFCNT_dec(r->mother_re);
} else {
CALLREGFREE_PVT(rx); /* free the private data */
- if (RXp_PAREN_NAMES(r))
- SvREFCNT_dec(RXp_PAREN_NAMES(r));
+ SvREFCNT_dec(RXp_PAREN_NAMES(r));
}
if (r->substrs) {
- if (r->anchored_substr)
- SvREFCNT_dec(r->anchored_substr);
- if (r->anchored_utf8)
- SvREFCNT_dec(r->anchored_utf8);
- if (r->float_substr)
- SvREFCNT_dec(r->float_substr);
- if (r->float_utf8)
- SvREFCNT_dec(r->float_utf8);
+ SvREFCNT_dec(r->anchored_substr);
+ SvREFCNT_dec(r->anchored_utf8);
+ SvREFCNT_dec(r->float_substr);
+ SvREFCNT_dec(r->float_utf8);
Safefree(r->substrs);
}
RX_MATCH_COPY_FREE(rx);
#ifdef PERL_OLD_COPY_ON_WRITE
- if (r->saved_copy)
- SvREFCNT_dec(r->saved_copy);
+ SvREFCNT_dec(r->saved_copy);
#endif
- Safefree(r->swap);
Safefree(r->offs);
}
REGEXP *
-Perl_reg_temp_copy (pTHX_ REGEXP *rx)
+Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
{
- REGEXP *ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
- struct regexp *ret = (struct regexp *)SvANY(ret_x);
+ struct regexp *ret;
struct regexp *const r = (struct regexp *)SvANY(rx);
register const I32 npar = r->nparens+1;
PERL_ARGS_ASSERT_REG_TEMP_COPY;
+ if (!ret_x)
+ ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
+ ret = (struct regexp *)SvANY(ret_x);
+
(void)ReREFCNT_inc(rx);
/* We can take advantage of the existing "copied buffer" mechanism in SVs
by pointing directly at the buffer, but flagging that the allocated
a case of zero-ing that, rather than copying the current length. */
SvPV_set(ret_x, RX_WRAPPED(rx));
SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
- StructCopy(&(r->xpv_cur), &(ret->xpv_cur), struct regexp_allocated);
+ memcpy(&(ret->xpv_cur), &(r->xpv_cur),
+ sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
SvLEN_set(ret_x, 0);
+ SvSTASH_set(ret_x, NULL);
+ SvMAGIC_set(ret_x, NULL);
Newx(ret->offs, npar, regexp_paren_pair);
Copy(r->offs, ret->offs, npar, regexp_paren_pair);
if (r->substrs) {
ret->saved_copy = NULL;
#endif
ret->mother_re = rx;
- ret->swap = NULL;
return ret_x;
}
Free the private data in a regexp. This is overloadable by
extensions. Perl takes care of the regexp structure in pregfree(),
- this covers the *pprivate pointer which technically perldoesnt
+ this covers the *pprivate pointer which technically perl doesn't
know about, however of course we have to handle the
regexp_internal structure when no extension is in use.
while (--n >= 0) {
/* If you add a ->what type here, update the comment in regcomp.h */
switch (ri->data->what[n]) {
+ case 'a':
case 's':
case 'S':
case 'u':
PerlMemShared_free(trie->trans);
if (trie->bitmap)
PerlMemShared_free(trie->bitmap);
- if (trie->wordlen)
- PerlMemShared_free(trie->wordlen);
if (trie->jump)
PerlMemShared_free(trie->jump);
- if (trie->nextword)
- PerlMemShared_free(trie->nextword);
+ PerlMemShared_free(trie->wordinfo);
/* do this last!!!! */
PerlMemShared_free(ri->data->data[n]);
}
Safefree(ri);
}
-#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
-#define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
-#define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
+#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
+#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
/*
ret->saved_copy = NULL;
#endif
- ret->mother_re = NULL;
+ if (ret->mother_re) {
+ if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
+ /* Our storage points directly to our mother regexp, but that's
+ 1: a buffer in a different thread
+ 2: something we no longer hold a reference on
+ so we need to copy it locally. */
+ /* Note we need to sue SvCUR() on our mother_re, because it, in
+ turn, may well be pointing to its own mother_re. */
+ SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
+ SvCUR(ret->mother_re)+1));
+ SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
+ }
+ ret->mother_re = NULL;
+ }
ret->gofs = 0;
}
#endif /* PERL_IN_XSUB_RE */
for (i = 0; i < count; i++) {
d->what[i] = ri->data->what[i];
switch (d->what[i]) {
- /* legal options are one of: sSfpontTu
+ /* legal options are one of: sSfpontTua
see also regcomp.h and pregfree() */
+ case 'a': /* actually an AV, but the dup function is identical. */
case 's':
case 'S':
case 'p': /* actually an AV, but the dup function is identical. */
if (!p)
return(NULL);
+ if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
+ Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
+ }
+
offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
if (offset == 0)
return(NULL);
state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
- SSPUSHINT(SAVEt_RE_STATE);
+ SSPUSHUV(SAVEt_RE_STATE);
Copy(&PL_reg_state, state, 1, struct re_save_state);
}
else if (op == ANYOF) {
/* arglen 1 + class block */
- node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
+ node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
? ANYOF_CLASS_SKIP : ANYOF_SKIP);
node = NEXTOPER(node);
}