char *start; /* Start of input for compile */
char *end; /* End of input for compile */
char *parse; /* Input-scan pointer. */
- char *adjusted_start; /* 'start', adjusted. See code use */
- STRLEN precomp_adj; /* an offset beyond precomp. See code use */
+ char *copy_start; /* start of copy of input within
+ constructed parse string */
+ char *copy_start_in_input; /* Position in input string
+ corresponding to copy_start */
SSize_t whilem_seen; /* number of WHILEM in this expr */
regnode *emit_start; /* Start of emitted-code area */
regnode *emit_bound; /* First regnode outside of the
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
U32 seen;
- SSize_t size; /* Code size. */
+ SSize_t size; /* Number of regnode equivalents in
+ pattern */
I32 npar; /* Capture buffer count, (OPEN) plus
one. ("par" 0 is the whole
pattern)*/
HV *paren_names; /* Paren names */
regnode **recurse; /* Recurse regops */
- I32 recurse_count; /* Number of recurse regops we have generated */
+ I32 recurse_count; /* Number of recurse regops we have generated */
U8 *study_chunk_recursed; /* bitmap of which subs we have moved
through */
U32 study_chunk_recursed_bytes; /* bytes in bitmap */
U32 study_chunk_recursed_count;
SV *mysv1;
SV *mysv2;
+
#define RExC_lastparse (pRExC_state->lastparse)
#define RExC_lastnum (pRExC_state->lastnum)
#define RExC_paren_name_list (pRExC_state->paren_name_list)
#define RExC_flags (pRExC_state->flags)
#define RExC_pm_flags (pRExC_state->pm_flags)
#define RExC_precomp (pRExC_state->precomp)
-#define RExC_precomp_adj (pRExC_state->precomp_adj)
-#define RExC_adjusted_start (pRExC_state->adjusted_start)
+#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
+#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
#define RExC_precomp_end (pRExC_state->precomp_end)
#define RExC_rx_sv (pRExC_state->rx_sv)
#define RExC_rx (pRExC_state->rx)
#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
#ifdef RE_TRACK_PATTERN_OFFSETS
-#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
+# define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
others */
#endif
#define RExC_emit (pRExC_state->emit)
#define TRYAGAIN 0x10 /* Weeded out a declaration. */
#define RESTART_PASS1 0x20 /* Need to restart sizing pass */
#define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to
- calcuate sizes as UTF-8 */
+ calculate sizes as UTF-8 */
#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
} \
} STMT_END
-#define RETURN_NULL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
+#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
RETURN_X_ON_RESTART_OR_FLAGS(NULL,flags,flagp,extra)
#define RETURN_X_ON_RESTART(X, flags,flagp) \
RETURN_X_ON_RESTART_OR_FLAGS( X, flags, flagp, 0)
-#define RETURN_NULL_ON_RESTART_FLAGP_OR_FLAGS(flagp,extra) \
+#define RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp,extra) \
if (*(flagp) & (RESTART_PASS1|(extra))) return NULL
#define MUST_RESTART(flags) ((flags) & (RESTART_PASS1))
-#define RETURN_NULL_ON_RESTART(flags,flagp) \
+#define RETURN_FAIL_ON_RESTART(flags,flagp) \
RETURN_X_ON_RESTART(NULL, flags,flagp)
-#define RETURN_NULL_ON_RESTART_FLAGP(flagp) \
- RETURN_NULL_ON_RESTART_FLAGP_OR_FLAGS(flagp,0)
+#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
+ RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp, 0)
/* This converts the named class defined in regcomp.h to its equivalent class
* number defined in handy.h. */
* the form of something that is completely different from the input, or
* something that uses the input as part of the alternate. In the first case,
* there should be no possibility of an error, as we are in complete control of
- * the alternate string. But in the second case we don't control the input
- * portion, so there may be errors in that. Here's an example:
+ * the alternate string. But in the second case we don't completely control
+ * the input portion, so there may be errors in that. Here's an example:
* /[abc\x{DF}def]/ui
* is handled specially because \x{df} folds to a sequence of more than one
- * character, 'ss'. What is done is to create and parse an alternate string,
+ * character: 'ss'. What is done is to create and parse an alternate string,
* which looks like this:
* /(?:\x{DF}|[abc\x{DF}def])/ui
* where it uses the input unchanged in the middle of something it constructs,
* class while in this substitute parse.) 'abc' and 'def' may have errors that
* need to be reported. The general situation looks like this:
*
+ * |<------- identical ------>|
* sI tI xI eI
- * Input: ----------------------------------------------------
+ * Input: ---------------------------------------------------------------
* Constructed: ---------------------------------------------------
* sC tC xC eC EC
+ * |<------- identical ------>|
*
- * The input string sI..eI is the input pattern. The string sC..EC is the
- * constructed substitute parse string. The portions sC..tC and eC..EC are
- * constructed by us. The portion tC..eC is an exact duplicate of the input
- * pattern tI..eI. In the diagram, these are vertically aligned. Suppose that
- * while parsing, we find an error at xC. We want to display a message showing
- * the real input string. Thus we need to find the point xI in it which
- * corresponds to xC. xC >= tC, since the portion of the string sC..tC has
- * been constructed by us, and so shouldn't have errors. We get:
- *
- * xI = sI + (tI - sI) + (xC - tC)
+ * sI..eI is the portion of the input pattern we are concerned with here.
+ * sC..EC is the constructed substitute parse string.
+ * sC..tC is constructed by us
+ * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
+ * In the diagram, these are vertically aligned.
+ * eC..EC is also constructed by us.
+ * xC is the position in the substitute parse string where we found a
+ * problem.
+ * xI is the position in the original pattern corresponding to xC.
*
- * and, the offset into sI is:
+ * We want to display a message showing the real input string. Thus we need to
+ * translate from xC to xI. We know that xC >= tC, since the portion of the
+ * string sC..tC has been constructed by us, and so shouldn't have errors. We
+ * get:
+ * xI = tI + (xC - tC)
*
- * (xI - sI) = (tI - sI) + (xC - tC)
+ * When the substitute parse is constructed, the code needs to set:
+ * RExC_start (sC)
+ * RExC_end (eC)
+ * RExC_copy_start_in_input (tI)
+ * RExC_copy_start_in_constructed (tC)
+ * and restore them when done.
*
- * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
- * and we save tC as RExC_adjusted_start.
- *
- * During normal processing of the input pattern, everything points to that,
- * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
+ * During normal processing of the input pattern, both
+ * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
+ * sI, so that xC equals xI.
*/
-#define tI_sI RExC_precomp_adj
-#define tC RExC_adjusted_start
-#define sC RExC_precomp
-#define xI_offset(xC) ((IV) (tI_sI + (xC - tC)))
-#define xI(xC) (sC + xI_offset(xC))
-#define eC RExC_precomp_end
+#define sI RExC_precomp
+#define eI RExC_precomp_end
+#define sC RExC_start
+#define eC RExC_end
+#define tI RExC_copy_start_in_input
+#define tC RExC_copy_start_in_constructed
+#define xI(xC) (tI + (xC - tC))
+#define xI_offset(xC) (xI(xC) - sI)
+/* Expands to the two UTF8fARG() argument groups used to print the location
+ * of a problem: the real input pattern from sI up to xI(xC), and then the
+ * rest of it from xI(xC) to eI.  Both groups index into the real input
+ * (sI..eI), so the clamp applied when xI(xC) runs past eI must be expressed
+ * in terms of sI/eI as well.  sC/eC are RExC_start/RExC_end, which bound the
+ * string currently being parsed and need not match the input during a
+ * substitute parse; they are used only to show that string in the panic
+ * message. */
#define REPORT_LOCATION_ARGS(xC) \
UTF8fARG(UTF, \
- (xI(xC) > eC) /* Don't run off end */ \
+ (xI(xC) > eI) /* Don't run off end */ \
- ? eC - sC /* Length before the <--HERE */ \
+ ? eI - sI /* Length before the <--HERE */ \
- : ( __ASSERT_(xI_offset(xC) >= 0) xI_offset(xC) ), \
- sC), /* The input pattern printed up to the <--HERE */ \
+ : ((xI_offset(xC) >= 0) \
+ ? xI_offset(xC) \
+ : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
+ IVdf " trying to output message for " \
+ " pattern %.*s", \
+ __FILE__, __LINE__, (IV) xI_offset(xC), \
+ ((int) (eC - sC)), sC), 0)), \
+ sI), /* The input pattern printed up to the <--HERE */ \
UTF8fARG(UTF, \
- (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \
- (xI(xC) > eC) ? eC : xI(xC)) /* pattern after <--HERE */
+ (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
+ (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
/* Used to point after bad bytes for an error message, but avoid skipping
* past a nul byte. */
*/
#define _FAIL(code) STMT_START { \
const char *ellipses = ""; \
- IV len = RExC_precomp_end - RExC_precomp; \
+ IV len = RExC_precomp_end - RExC_precomp; \
\
if (!SIZE_ONLY) \
SAVEFREESV(RExC_rx_sv); \
REPORT_LOCATION_ARGS(loc)); \
} STMT_END
+/* Convert between a pointer to a node and its offset from the beginning of the
+ * program */
+#define REGNODE_p(offset) (RExC_emit_start + (offset))
+#define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
+
/* Macros for recording node offsets. 20001227 mjd@plover.com
* Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
- * element 2*n-1 of the array. Element #2n holds the byte length node #n.
+ * element 2*n-1 of the array. Element #2n holds the byte length of node #n.
* Position is 1 indexed.
*/
#ifndef RE_TRACK_PATTERN_OFFSETS
-#define Set_Node_Offset_To_R(node,byte)
+#define Set_Node_Offset_To_R(offset,byte)
#define Set_Node_Offset(node,byte)
#define Set_Cur_Node_Offset
#define Set_Node_Length_To_R(node,len)
#else
#define ProgLen(ri) ri->u.offsets[0]
#define SetProgLen(ri,x) ri->u.offsets[0] = x
-#define Set_Node_Offset_To_R(node,byte) STMT_START { \
+#define Set_Node_Offset_To_R(offset,byte) STMT_START { \
if (! SIZE_ONLY) { \
MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
- __LINE__, (int)(node), (int)(byte))); \
- if((node) < 0) { \
+ __LINE__, (int)(offset), (int)(byte))); \
+ if((offset) < 0) { \
Perl_croak(aTHX_ "value of node is %d in Offset macro", \
- (int)(node)); \
+ (int)(offset)); \
} else { \
- RExC_offsets[2*(node)-1] = (byte); \
+ RExC_offsets[2*(offset)-1] = (byte); \
} \
} \
} STMT_END
-#define Set_Node_Offset(node,byte) \
- Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
+#define Set_Node_Offset(node,byte) \
+ Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
#define Set_Node_Length_To_R(node,len) STMT_START { \
} STMT_END
#define Set_Node_Length(node,len) \
- Set_Node_Length_To_R((node)-RExC_emit_start, len)
+ Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
#define Set_Node_Cur_Length(node, start) \
Set_Node_Length(node, RExC_parse - start)
/* Get offsets and lengths */
-#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
-#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
+#define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
+#define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
- Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
- Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
+ Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
+ Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
} STMT_END
#endif
#define DEBUG_RExC_seen() \
DEBUG_OPTIMISE_MORE_r({ \
- Perl_re_printf( aTHX_ "RExC_seen: "); \
+ Perl_re_printf( aTHX_ "RExC_seen: "); \
\
if (RExC_seen & REG_ZERO_LEN_SEEN) \
- Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
\
if (RExC_seen & REG_LOOKBEHIND_SEEN) \
- Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
\
if (RExC_seen & REG_GPOS_SEEN) \
- Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
\
if (RExC_seen & REG_RECURSE_SEEN) \
- Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
\
if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
- Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
\
if (RExC_seen & REG_VERBARG_SEEN) \
- Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
\
if (RExC_seen & REG_CUTGROUP_SEEN) \
- Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
\
if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
- Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
\
if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
- Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
\
if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
- Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
+ Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
\
- Perl_re_printf( aTHX_ "\n"); \
+ Perl_re_printf( aTHX_ "\n"); \
});
#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
PERL_STATIC_INLINE item*
-push(UV key,item* curr)
+push(UV key, item* curr)
{
item* head;
Newx(head, 1, item);
}
PERL_STATIC_INLINE item*
-uniquePush(item* head,UV key)
+uniquePush(item* head, UV key)
{
item* iterator = head;
iterator = iterator->next;
}
- return push(key,head);
+ return push(key, head);
}
PERL_STATIC_INLINE void
)
{
item *head = NULL;
- UV swapCount,swapScore,targetCharCount,i,j;
+ UV swapCount, swapScore, targetCharCount, i, j;
UV *scores;
UV score_ceil = x + y;
scores[1 * (y + 2) + 0] = score_ceil;
scores[0 * (y + 2) + 1] = score_ceil;
scores[1 * (y + 2) + 1] = 0;
- head = uniquePush(uniquePush(head,src[0]),tgt[0]);
+ head = uniquePush(uniquePush(head, src[0]), tgt[0]);
/* work loops */
/* i = src index */
/* j = tgt index */
for (i=1;i<=x;i++) {
if (i < x)
- head = uniquePush(head,src[i]);
+ head = uniquePush(head, src[i]);
scores[(i+1) * (y + 2) + 1] = i;
scores[(i+1) * (y + 2) + 0] = score_ceil;
swapCount = 0;
for (j=1;j<=y;j++) {
if (i == 1) {
if(j < y)
- head = uniquePush(head,tgt[j]);
+ head = uniquePush(head, tgt[j]);
scores[1 * (y + 2) + (j + 1)] = j;
scores[0 * (y + 2) + (j + 1)] = score_ceil;
}
- targetCharCount = find(head,tgt[j-1])->value;
+ targetCharCount = find(head, tgt[j-1])->value;
swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
if (src[i-1] != tgt[j-1]){
}
}
- find(head,src[i-1])->value = i;
+ find(head, src[i-1])->value = i;
}
{
assert(RExC_rxi->data->what[n] == 's');
if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
- invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
+ invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]), NULL));
}
else if (ary[0] && ary[0] != &PL_sv_undef) {
/* Here no compile-time swash, and no run-time only data. Use the
* node's inversion list */
- invlist = sv_2mortal(invlist_clone(ary[3]));
+ invlist = sv_2mortal(invlist_clone(ary[3], NULL));
}
/* Get the code points valid only under UTF-8 locales */
* ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
* map */
- SV* invlist = invlist_clone(ssc->invlist);
+ SV* invlist = invlist_clone(ssc->invlist, NULL);
PERL_ARGS_ASSERT_SSC_FINALIZE;
if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
+ OP(ssc) = ANYOFPOSIXL;
}
-
- if (RExC_contains_locale) {
+ else if (RExC_contains_locale) {
OP(ssc) = ANYOFL;
}
(UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
);
} else {
- Perl_re_printf( aTHX_ "%*s",colwidth," ." );
+ Perl_re_printf( aTHX_ "%*s", colwidth," ." );
}
}
}
for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
SV ** const tmp = av_fetch( revcharmap,
- TRIE_LIST_ITEM(state,charid).forid, 0);
+ TRIE_LIST_ITEM(state, charid).forid, 0);
if ( tmp ) {
Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
colwidth,
(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
| PERL_PV_ESCAPE_FIRSTCHAR
) ,
- TRIE_LIST_ITEM(state,charid).forid,
- (UV)TRIE_LIST_ITEM(state,charid).newstate
+ TRIE_LIST_ITEM(state, charid).forid,
+ (UV)TRIE_LIST_ITEM(state, charid).newstate
);
if (!(charid % 10))
Perl_re_printf( aTHX_ "\n%*s| ",
Perl_re_indentf( aTHX_
"make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
depth+1,
- REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
+ REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
});
SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
- TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
+ TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
DEBUG_OPTIMISE_r(
Perl_re_printf( aTHX_ "%s", (char*)ch)
);
}
}
/* store the current firstchar in the bitmap */
- TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
+ TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
}
first_ofs = ofs;
) {
regnode *fix = convert;
U32 word = trie->wordcount;
+#ifdef RE_TRACK_PATTERN_OFFSETS
mjd_nodelen++;
+#endif
Set_Node_Offset_Length(convert, mjd_offset, state - 1);
while( ++fix < n ) {
Set_Node_Offset_Length(fix, 0, 0);
regnode *opt = convert;
while ( ++opt < optimize) {
- Set_Node_Offset_Length(opt,0,0);
+ Set_Node_Offset_Length(opt, 0, 0);
}
/*
Try to clean up some of the debris left after the
optimisation.
*/
while( optimize < jumper ) {
+#ifdef RE_TRACK_PATTERN_OFFSETS
mjd_nodelen += Node_Length((optimize));
+#endif
OP( optimize ) = OPTIMIZED;
- Set_Node_Offset_Length(optimize,0,0);
+ Set_Node_Offset_Length(optimize, 0, 0);
optimize++;
}
- Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
+ Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
});
} /* end node insert */
if ( OP(source) == TRIE ) {
struct regnode_1 *op = (struct regnode_1 *)
PerlMemShared_calloc(1, sizeof(struct regnode_1));
- StructCopy(source,op,struct regnode_1);
+ StructCopy(source, op, struct regnode_1);
stclass = (regnode *)op;
} else {
struct regnode_charclass *op = (struct regnode_charclass *)
PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
- StructCopy(source,op,struct regnode_charclass);
+ StructCopy(source, op, struct regnode_charclass);
stclass = (regnode *)op;
}
OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
if (PL_regkind[OP(scan)] == EXACT) \
- join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
+ join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
STATIC U32
S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
UV *min_subtract, bool *unfolded_multi_char,
- U32 flags,regnode *val, U32 depth)
+ U32 flags, regnode *val, U32 depth)
{
/* Merge several consecutive EXACTish nodes into one. */
regnode *n = regnext(scan);
#define INIT_AND_WITHP \
assert(!and_withp); \
- Newx(and_withp,1, regnode_ssc); \
+ Newx(and_withp, 1, regnode_ssc); \
SAVEFREEPV(and_withp)
/* recurse study_chunk() for each BRANCH in an alternation */
minnext = study_chunk(pRExC_state, &scan, minlenp,
&deltanext, next, &data_fake, stopparen,
- recursed_depth, NULL, f,depth+1);
+ recursed_depth, NULL, f, depth+1);
if (min1 > minnext)
min1 = minnext;
Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
depth+1,
"Looking for TRIE'able sequences. Tail node is ",
- (UV)(tail - RExC_emit_start),
+ (UV) REGNODE_OFFSET(tail),
SvPV_nolen_const( RExC_mysv )
);
});
DEBUG_TRIE_COMPILE_r({
regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
- depth+1, SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
+ depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
PL_reg_name[trietype]
regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
depth+1,
- SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
+ SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
});
OP(startbranch)= NOTHING;
RExC_frame_head= newframe;
RExC_frame_count++;
} else if (!RExC_frame_last->next_frame) {
- Newxz(newframe,1,scan_frame);
+ Newxz(newframe, 1, scan_frame);
RExC_frame_last->next_frame= newframe;
newframe->prev_frame= RExC_frame_last;
RExC_frame_count++;
/* Optimize again: */
/* recurse study_chunk() on optimised CURLYX => CURLYM */
study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
- NULL, stopparen, recursed_depth, NULL, 0,depth+1);
+ NULL, stopparen, recursed_depth, NULL, 0,
+ depth+1);
}
else
oscan->flags = 0;
/* Cannot expect anything... */
scan_commit(pRExC_state, data, minlenp, is_inf);
data->pos_min += 1;
- data->pos_delta += 1;
+ if (data->pos_delta != SSize_t_MAX) {
+ data->pos_delta += 1;
+ }
data->cur_is_floating = 1; /* float */
}
}
case ANYOFD:
case ANYOFL:
+ case ANYOFPOSIXL:
case ANYOF:
if (flags & SCF_DO_STCLASS_AND)
ssc_and(pRExC_state, data->start_class,
invert = 1;
/* FALLTHROUGH */
case ASCII:
- my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
+ my_invlist = invlist_clone(PL_Posix_ptrs[_CC_ASCII], NULL);
/* This can be handled as a Posix class */
goto join_posix_and_ascii;
/* FALLTHROUGH */
case POSIXA:
assert(FLAGS(scan) != _CC_ASCII);
- _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
- PL_XPosix_ptrs[_CC_ASCII],
- &my_invlist);
+ my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
goto join_posix_and_ascii;
case NPOSIXD:
/* FALLTHROUGH */
case POSIXD:
case POSIXU:
- my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
+ my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
/* NPOSIXD matches all upper Latin1 code points unless the
* target string being matched is UTF-8, which is
*minnextp = study_chunk(pRExC_state, &nscan, minnextp,
&deltanext, last, &data_fake,
stopparen, recursed_depth, NULL,
- f,depth+1);
+ f, depth+1);
if (scan->flags) {
if (deltanext) {
FAIL("Variable length lookbehind not implemented");
/* optimise study_chunk() for TRIE */
minnext = study_chunk(pRExC_state, &scan, minlenp,
&deltanext, (regnode *)nextbranch, &data_fake,
- stopparen, recursed_depth, NULL, f,depth+1);
+ stopparen, recursed_depth, NULL, f, depth+1);
}
if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
nextbranch= regnext((regnode*)nextbranch);
ptr = hv_fetchs(table, "regcomp", FALSE);
if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
return &PL_core_reg_engine;
- return INT2PTR(regexp_engine*,SvIV(*ptr));
+ return INT2PTR(regexp_engine*, SvIV(*ptr));
}
else {
SV *ptr;
ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
return &PL_core_reg_engine;
- return INT2PTR(regexp_engine*,SvIV(ptr));
+ return INT2PTR(regexp_engine*, SvIV(ptr));
}
}
* it is properly null terminated or we will fail asserts
* later. In theory we probably shouldn't get such SV's,
* but if we do we should handle it gracefully. */
- if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) ) {
+ if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
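- /* not a string, or a string with a trailing null */
+ /* not a string, a string with a trailing null, or a string for
+ * which SvIsCOW_shared_hash() is true: use it as is */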
pat = msv;
} else {
/* a string with no trailing null, we need to copy it
- * so it we have a trailing null */
- pat = newSVsv(msv);
+ * so it has a trailing null */
+ pat = sv_2mortal(newSVsv(msv));
}
}
{
int n = 0;
STRLEN s;
-
+
PERL_UNUSED_CONTEXT;
for (s = 0; s < plen; s++) {
DEBUG_COMPILE_r({
Perl_re_printf( aTHX_
"%sre-parsing pattern for runtime code:%s %s\n",
- PL_colors[4],PL_colors[5],newpat);
+ PL_colors[4], PL_colors[5], newpat);
});
sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
REGEXP *
Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
OP *expr, const regexp_engine* eng, REGEXP *old_re,
- bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
+ bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
{
- REGEXP *rx;
+ REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
struct regexp *r;
regexp_internal *ri;
STRLEN plen;
/* Initialize these here instead of as-needed, as is quick and avoids
* having to test them each time otherwise */
- if (! PL_AboveLatin1) {
+ if (! PL_InBitmap) {
#ifdef DEBUGGING
char * dump_len_string;
#endif
- PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
- PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
- PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
- PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
- PL_HasMultiCharFold =
- _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
-
/* This is calculated here, because the Perl program that generates the
* static global ones doesn't currently have access to
* NUM_ANYOF_CODE_POINTS */
RExC_mysv1= sv_newmortal();
RExC_mysv2= sv_newmortal();
});
+
DEBUG_COMPILE_r({
SV *dsv= sv_newmortal();
RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
- PL_colors[4],PL_colors[5],s);
+ PL_colors[4], PL_colors[5], s);
});
redo_first_pass:
set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
}
- RExC_precomp = exp;
- RExC_precomp_adj = 0;
+ RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
RExC_flags = rx_flags;
RExC_pm_flags = pm_flags;
/* First pass: determine size, legality. */
RExC_parse = exp;
- RExC_start = RExC_adjusted_start = exp;
+ RExC_start = RExC_copy_start_in_constructed = exp;
RExC_end = exp + plen;
RExC_precomp_end = RExC_end;
RExC_naughty = 0;
RExC_lastparse=NULL;
);
- if (reg(pRExC_state, 0, &flags,1) == NULL) {
+ if (reg(pRExC_state, 0, &flags, 1) == NULL) {
/* It's possible to write a regexp in ascii that represents Unicode
codepoints outside of the byte range, such as via \x{100}. If we
detect such a sequence we have to convert the entire pattern to utf8
goto redo_first_pass;
}
- Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#" UVxf, (UV) flags);
+ Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile for sizing pass, flags=%#" UVxf, (UV) flags);
}
DEBUG_PARSE_r({
/* Allocate space and zero-initialize. Note, the two step process
of zeroing when in debug mode, thus anything assigned has to
happen after that */
- rx = (REGEXP*) newSV_type(SVt_REGEXP);
- r = ReANY(rx);
+ Rx = (REGEXP*) newSV_type(SVt_REGEXP);
+ r = ReANY(Rx);
Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
char, regexp_internal);
if ( r == NULL || ri == NULL )
/* make sure PL_bitcount bounds not exceeded */
assert(sizeof(STD_PAT_MODS) <= 8);
- p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
- SvPOK_on(rx);
+ p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
+ SvPOK_on(Rx);
if (RExC_utf8)
- SvFLAGS(rx) |= SVf_UTF8;
+ SvFLAGS(Rx) |= SVf_UTF8;
*p++='('; *p++='?';
/* If a default, cover it using the caret */
*p++ = ':';
Copy(RExC_precomp, p, plen, char);
- assert ((RX_WRAPPED(rx) - p) < 16);
- r->pre_prefix = p - RX_WRAPPED(rx);
+ assert ((RX_WRAPPED(Rx) - p) < 16);
+ r->pre_prefix = p - RX_WRAPPED(Rx);
p += plen;
+
+ /* Adding a trailing \n causes this to compile properly:
+ my $R = qr / A B C # D E/x; /($R)/
+ Otherwise the closing paren appended just below (and the one in the
+ enclosing pattern) would be taken as part of the '#' comment */
if (has_runon)
*p++ = '\n';
*p++ = ')';
*p = 0;
- SvCUR_set(rx, p - RX_WRAPPED(rx));
+ SvCUR_set(Rx, p - RX_WRAPPED(Rx));
}
r->intflags = 0;
(UV)((2*RExC_size+1) * sizeof(U32))));
#endif
SetProgLen(ri,RExC_size);
- RExC_rx_sv = rx;
+ RExC_rx_sv = Rx;
RExC_rx = r;
RExC_rxi = ri;
/* setup RExC_open_parens, which holds the address of each
* OPEN tag, and to make things simpler for the 0 index
* the start of the program - this is used later for offsets */
- Newxz(RExC_open_parens, RExC_npar,regnode *);
+ Newxz(RExC_open_parens, RExC_npar, regnode *);
SAVEFREEPV(RExC_open_parens);
RExC_open_parens[0] = RExC_emit;
/* setup RExC_close_parens, which holds the address of each
* CLOSE tag, and to make things simpler for the 0 index
* the end of the program - this is used later for offsets */
- Newxz(RExC_close_parens, RExC_npar,regnode *);
+ Newxz(RExC_close_parens, RExC_npar, regnode *);
SAVEFREEPV(RExC_close_parens);
/* we dont know where end op starts yet, so we dont
* need to set RExC_close_parens[0] like we do RExC_open_parens[0] above */
}
RExC_npar = 1;
if (reg(pRExC_state, 0, &flags,1) == NULL) {
- ReREFCNT_dec(rx);
- Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#" UVxf, (UV) flags);
+ ReREFCNT_dec(Rx);
+ Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile for generation pass, flags=%#" UVxf, (UV) flags);
}
DEBUG_OPTIMISE_r(
Perl_re_printf( aTHX_ "Starting post parse optimization\n");
3-units-long substrs field. */
Newx(r->substrs, 1, struct reg_substr_data);
if (RExC_recurse_count) {
- Newx(RExC_recurse,RExC_recurse_count,regnode *);
+ Newx(RExC_recurse, RExC_recurse_count, regnode *);
SAVEFREEPV(RExC_recurse);
}
/*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
if (UTF)
- SvUTF8_on(rx); /* Unicode in it? */
+ SvUTF8_on(Rx); /* Unicode in it? */
ri->regstclass = NULL;
if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
r->intflags |= PREGf_NAUGHTY;
){
r->extflags |= RXf_CHECK_ALL;
}
- scan_commit(pRExC_state, &data,&minlen,0);
+ scan_commit(pRExC_state, &data,&minlen, 0);
/* XXX this is done in reverse order because that's the way the
* by setting the regexp SV to readonly-only instead. If the
* pattern's been recompiled, the USEDness should remain. */
if (old_re && SvREADONLY(old_re))
- SvREADONLY_on(rx);
+ SvREADONLY_on(Rx);
#endif
- return rx;
+ return Rx;
}
&& rx->offs[nums[i]].end != -1)
{
ret = newSVpvs("");
- CALLREG_NUMBUF_FETCH(r,nums[i],ret);
+ CALLREG_NUMBUF_FETCH(r, nums[i], ret);
if (!retarray)
return ret;
} else {
if (rx && RXp_PAREN_NAMES(rx)) {
HV *hv = RXp_PAREN_NAMES(rx);
HE *temphe;
- while ( (temphe = hv_iternext_flags(hv,0)) ) {
+ while ( (temphe = hv_iternext_flags(hv, 0)) ) {
IV i;
IV parno = 0;
SV* sv_dat = HeVAL(temphe);
HV *hv= RXp_PAREN_NAMES(rx);
HE *temphe;
(void)hv_iterinit(hv);
- while ( (temphe = hv_iternext_flags(hv,0)) ) {
+ while ( (temphe = hv_iternext_flags(hv, 0)) ) {
IV i;
IV parno = 0;
SV* sv_dat = HeVAL(temphe);
he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
if ( he_str )
sv_dat = HeVAL(he_str);
- if ( ! sv_dat )
+ if ( ! sv_dat ) /* Didn't find group */
vFAIL("Reference to nonexistent named group");
return sv_dat;
}
#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
int num; \
if (RExC_lastparse!=RExC_parse) { \
- Perl_re_printf( aTHX_ "%s", \
+ Perl_re_printf( aTHX_ "%s", \
Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
RExC_end - RExC_parse, 16, \
"", "", \
) \
); \
} else \
- Perl_re_printf( aTHX_ "%16s",""); \
+ Perl_re_printf( aTHX_ "%16s",""); \
\
if (SIZE_ONLY) \
num = RExC_size + 1; \
else \
num=REG_NODE_NUM(RExC_emit); \
if (RExC_lastnum!=num) \
- Perl_re_printf( aTHX_ "|%4d",num); \
+ Perl_re_printf( aTHX_ "|%4d", num); \
else \
- Perl_re_printf( aTHX_ "|%4s",""); \
- Perl_re_printf( aTHX_ "|%*s%-4s", \
+ Perl_re_printf( aTHX_ "|%4s",""); \
+ Perl_re_printf( aTHX_ "|%*s%-4s", \
(int)((depth*2)), "", \
(funcname) \
); \
return zero_addr + *offset;
}
-#endif
-
PERL_STATIC_INLINE void
S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
{
PERL_UNUSED_CONTEXT;
PERL_ARGS_ASSERT_INVLIST_SET_LEN;
- assert(SvTYPE(invlist) == SVt_INVLIST);
+ assert(is_invlist(invlist));
SvCUR_set(invlist,
(len == 0)
assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
}
-#ifndef PERL_IN_XSUB_RE
-
STATIC void
S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
{
PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
- assert(SvTYPE(src) == SVt_INVLIST);
- assert(SvTYPE(dest) == SVt_INVLIST);
+ assert(is_invlist(src));
+ assert(is_invlist(dest));
assert(! invlist_is_iterating(src));
assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
* */
PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
- assert(SvTYPE(invlist) == SVt_INVLIST);
+ assert(is_invlist(invlist));
return &(((XINVLIST*) SvANY(invlist))->prev_index);
}
PERL_ARGS_ASSERT_INVLIST_TRIM;
- assert(SvTYPE(invlist) == SVt_INVLIST);
+ assert(is_invlist(invlist));
SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
}
{
PERL_ARGS_ASSERT_INVLIST_CLEAR;
- assert(SvTYPE(invlist) == SVt_INVLIST);
+ assert(is_invlist(invlist));
invlist_set_len(invlist, 0, 0);
invlist_trim(invlist);
PERL_ARGS_ASSERT_INVLIST_MAX;
- assert(SvTYPE(invlist) == SVt_INVLIST);
+ assert(is_invlist(invlist));
/* Assumes worst case, in which the 0 element is not counted in the
* inversion list, so subtracts 1 for that */
? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
: FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
}
+
+STATIC void
+S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
+{
+ PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
+ /* Put 'invlist' into the state of an empty inversion list with room for
+ * 'initial_size' elements: grow its buffer, set its length and offset to
+ * 0, call invlist_iterfinish() on it, and zero its previous-index field.
+ * This is the common set-up used by both Perl__new_invlist() and
+ * Perl_invlist_clone(). */
+ /* First 1 is in case the zero element isn't in the list; second 1 is for
+ * trailing NUL */
+ SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
+ invlist_set_len(invlist, 0, 0);
+
+ /* Force iterinit() to be used to get iteration to work */
+ invlist_iterfinish(invlist);
+ /* Start with the stored previous index reset to 0 */
+ *get_invlist_previous_index_addr(invlist) = 0;
+}
+
SV*
Perl__new_invlist(pTHX_ IV initial_size)
{
/* Allocate the initial space */
new_list = newSV_type(SVt_INVLIST);
- /* First 1 is in case the zero element isn't in the list; second 1 is for
- * trailing NUL */
- SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
- invlist_set_len(new_list, 0, 0);
-
- /* Force iterinit() to be used to get iteration to work */
- *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
-
- *get_invlist_previous_index_addr(new_list) = 0;
+ initialize_invlist_guts(new_list, initial_size);
return new_list;
}
PERL_ARGS_ASSERT_INVLIST_EXTEND;
- assert(SvTYPE(invlist) == SVt_INVLIST);
+ assert(is_invlist(invlist));
/* Add one to account for the zero element at the beginning which may not
* be counted by the calling parameters */
PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
assert(a != b);
- assert(*output == NULL || SvTYPE(*output) == SVt_INVLIST);
+ assert(*output == NULL || is_invlist(*output));
len_b = _invlist_len(b);
if (len_b == 0) {
* union. We can just return a copy of 'a' if '*output' doesn't point
* to an existing list */
if (*output == NULL) {
- *output = invlist_clone(a);
+ *output = invlist_clone(a, NULL);
return;
}
}
/* Here, '*output' is to be overwritten by 'a' */
- u = invlist_clone(a);
+ u = invlist_clone(a, NULL);
invlist_replace_list_destroys_src(*output, u);
SvREFCNT_dec_NN(u);
* the clone */
SV ** dest = (*output == NULL) ? output : &u;
- *dest = invlist_clone(b);
+ *dest = invlist_clone(b, NULL);
if (complement_b) {
_invlist_invert(*dest);
}
PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
assert(a != b);
- assert(*i == NULL || SvTYPE(*i) == SVt_INVLIST);
+ assert(*i == NULL || is_invlist(*i));
/* Special case if either one is empty */
len_a = (a == NULL) ? 0 : _invlist_len(a);
}
if (*i == NULL) {
- *i = invlist_clone(a);
+ *i = invlist_clone(a, NULL);
return;
}
- r = invlist_clone(a);
+ r = invlist_clone(a, NULL);
invlist_replace_list_destroys_src(*i, r);
SvREFCNT_dec_NN(r);
return;
*get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
}
-#endif
-
-PERL_STATIC_INLINE SV*
-S_invlist_clone(pTHX_ SV* const invlist)
+SV*
+Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
{
/* Return a new inversion list that is a copy of the input one, which is
* unchanged. The new list will not be mortal even if the old one was. */
- /* Need to allocate extra space to accommodate Perl's addition of a
- * trailing NUL to SvPV's, since it thinks they are always strings */
- SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
- STRLEN physical_length = SvCUR(invlist);
- bool offset = *(get_invlist_offset_addr(invlist));
+ const STRLEN nominal_length = _invlist_len(invlist); /* Why not +1 XXX */
+ const STRLEN physical_length = SvCUR(invlist);
+ const bool offset = *(get_invlist_offset_addr(invlist));
PERL_ARGS_ASSERT_INVLIST_CLONE;
+ /* Need to allocate extra space to accommodate Perl's addition of a
+ * trailing NUL to SvPV's, since it thinks they are always strings.  That
+ * extra byte is added by initialize_invlist_guts(), which is reached from
+ * both branches below, so only the nominal length is passed here.
+ *
+ * If the caller passes a non-NULL 'new_invlist', that SV is upgraded to
+ * SVt_INVLIST and filled in, instead of a fresh list being allocated;
+ * either way, the list that was populated is what gets returned. */
+ if (new_invlist == NULL) {
+ new_invlist = _new_invlist(nominal_length);
+ }
+ else {
+ sv_upgrade(new_invlist, SVt_INVLIST);
+ initialize_invlist_guts(new_invlist, nominal_length);
+ }
+ /* Duplicate the offset flag, the length, and then the raw element bytes */
*(get_invlist_offset_addr(new_invlist)) = offset;
- invlist_set_len(new_invlist, _invlist_len(invlist), offset);
+ invlist_set_len(new_invlist, nominal_length, offset);
Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
return new_invlist;
}
+#endif
+
PERL_STATIC_INLINE STRLEN*
S_get_invlist_iter_addr(SV* invlist)
{
PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
- assert(SvTYPE(invlist) == SVt_INVLIST);
+ assert(is_invlist(invlist));
return &(((XINVLIST*) SvANY(invlist))->iterator);
}
}
}
-void
-Perl__load_PL_utf8_foldclosures (pTHX)
-{
- assert(! PL_utf8_foldclosures);
-
- /* If the folds haven't been read in, call a fold function
- * to force that */
- if (! PL_utf8_tofold) {
- U8 dummy[UTF8_MAXBYTES_CASE+1];
- const U8 hyphen[] = HYPHEN_UTF8;
-
- /* This string is just a short named one above \xff */
- toFOLD_utf8_safe(hyphen, hyphen + sizeof(hyphen) - 1, dummy, NULL);
- assert(PL_utf8_tofold); /* Verify that worked */
- }
- PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
-}
#endif
#if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
}
else { /* Pattern is UTF-8 */
U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
- STRLEN foldlen = UTF8SKIP(s);
const U8* e = s + bytelen;
- SV** listp;
+ IV fc;
- uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
+ fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
/* The only code points that aren't folded in a UTF EXACTFish
* node are the problematic ones in EXACTFL nodes */
U8 *d = folded;
int i;
+ fc = -1;
for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
if (isASCII(*s)) {
*(d++) = (U8) toFOLD(*s);
+ if (fc < 0) { /* Save the first fold */
+ fc = *(d-1);
+ }
s++;
}
else {
STRLEN len;
- toFOLD_utf8_safe(s, e, d, &len);
+ UV fold = toFOLD_utf8_safe(s, e, d, &len);
+ if (fc < 0) { /* Save the first fold */
+ fc = fold;
+ }
d += len;
s += UTF8SKIP(s);
}
/* And set up so the code below that looks in this folded
* buffer instead of the node's string */
e = d;
- foldlen = UTF8SKIP(folded);
s = folded;
}
/* When we reach here 's' points to the fold of the first
* character(s) of the node; and 'e' points to far enough along
* the folded string to be just past any possible multi-char
- * fold. 'foldlen' is the length in bytes of the first
- * character in 's'
+ * fold.
*
* Unlike the non-UTF-8 case, the macro for determining if a
* string is a multi-char fold requires all the characters to
invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
}
else { /* Single char fold */
-
- /* It matches all the things that fold to it, which are
- * found in PL_utf8_foldclosures (including itself) */
- invlist = add_cp_to_invlist(invlist, uc);
- if (! PL_utf8_foldclosures)
- _load_PL_utf8_foldclosures();
- if ((listp = hv_fetch(PL_utf8_foldclosures,
- (char *) s, foldlen, FALSE)))
- {
- AV* list = (AV*) *listp;
- IV k;
- for (k = 0; k <= av_tindex_skip_len_mg(list); k++) {
- SV** c_p = av_fetch(list, k, FALSE);
- UV c;
- assert(c_p);
-
- c = SvUV(*c_p);
-
- /* /aa doesn't allow folds between ASCII and non- */
- if ((OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
- && isASCII(c) != isASCII(uc))
- {
- continue;
- }
-
- invlist = add_cp_to_invlist(invlist, c);
+ unsigned int k;
+ unsigned int first_folds_to;
+ const unsigned int * remaining_folds_to_list;
+ Size_t folds_to_count;
+
+ /* It matches itself */
+ invlist = add_cp_to_invlist(invlist, fc);
+
+ /* ... plus all the things that fold to it, which are returned by
+ * _inverse_folds() */
+ folds_to_count = _inverse_folds(fc, &first_folds_to,
+ &remaining_folds_to_list);
+ for (k = 0; k < folds_to_count; k++) {
+ UV c = (k == 0) ? first_folds_to : remaining_folds_to_list[k-1];
+
+ /* /aa doesn't allow folds between ASCII and non- */
+ if ( (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
+ && isASCII(c) != isASCII(fc))
+ {
+ continue;
}
+
+ invlist = add_cp_to_invlist(invlist, c);
}
}
}
if (RExC_parse == name_start || *RExC_parse != ch) {
/* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
- vFAIL2("Sequence %.3s... not terminated",parse_start);
+ vFAIL2("Sequence %.3s... not terminated", parse_start);
}
if (!SIZE_ONLY) {
upgraded to UTF-8. Otherwise would only return NULL if regbranch() returns
NULL, which cannot happen. */
STATIC regnode *
-S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
+S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
/* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
* 2 is like 1, but indicates that nextchar() has been called to advance
* RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
* this flag alerts us to the need to check for that */
{
- regnode *ret; /* Will be the head of the group. */
+ regnode *ret = NULL; /* Will be the head of the group. */
regnode *br;
regnode *lastbr;
regnode *ender = NULL;
vFAIL("Unmatched (");
}
- if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
+ if (paren == 'r') { /* Atomic script run */
+ paren = '>';
+ goto parse_rest;
+ }
+ else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
char *start_verb = RExC_parse + 1;
STRLEN verb_len;
char *start_arg = NULL;
switch ( *start_verb ) {
case 'A': /* (*ACCEPT) */
- if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
+ if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
op = ACCEPT;
internal_argval = RExC_nestroot;
}
break;
case 'C': /* (*COMMIT) */
- if ( memEQs(start_verb,verb_len,"COMMIT") )
+ if ( memEQs(start_verb, verb_len,"COMMIT") )
op = COMMIT;
break;
case 'F': /* (*FAIL) */
- if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
+ if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
op = OPFAIL;
}
break;
case ':': /* (*:NAME) */
case 'M': /* (*MARK:NAME) */
- if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
+ if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
op = MARKPOINT;
arg_required = 1;
}
break;
case 'P': /* (*PRUNE) */
- if ( memEQs(start_verb,verb_len,"PRUNE") )
+ if ( memEQs(start_verb, verb_len,"PRUNE") )
op = PRUNE;
break;
case 'S': /* (*SKIP) */
- if ( memEQs(start_verb,verb_len,"SKIP") )
+ if ( memEQs(start_verb, verb_len,"SKIP") )
op = SKIP;
break;
case 'T': /* (*THEN) */
/* [19:06] <TimToady> :: is then */
- if ( memEQs(start_verb,verb_len,"THEN") ) {
+ if ( memEQs(start_verb, verb_len,"THEN") ) {
op = CUTGROUP;
RExC_seen |= REG_CUTGROUP_SEEN;
}
break;
case 'a':
- if (memEQs(start_verb, verb_len, "atomic")) {
+ if ( memEQs(start_verb, verb_len, "asr")
+ || memEQs(start_verb, verb_len, "atomic_script_run"))
+ {
+ paren = 'r'; /* Mnemonic: recursed run */
+ goto script_run;
+ }
+ else if (memEQs(start_verb, verb_len, "atomic")) {
paren = 't'; /* AtOMIC */
goto alpha_assertions;
}
if ( memEQs(start_verb, verb_len, "sr")
|| memEQs(start_verb, verb_len, "script_run"))
{
+ regnode * atomic;
+
paren = 's';
+ script_run:
+
/* This indicates Unicode rules. */
REQUIRE_UNI_RULES(flagp, NULL);
RExC_parse = start_arg;
+ if (RExC_in_script_run) {
+
+ /* Nested script runs are treated as no-ops, because
+ * if the nested one fails, the outer one must as
+ * well. It could fail sooner, and avoid (??{}) with
+ * side effects, but that is explicitly documented as
+ * undefined behavior. */
+
+ ret = NULL;
+
+ if (paren == 's') {
+ paren = ':';
+ goto parse_rest;
+ }
+
+ /* But, the atomic part of a nested atomic script run
+ * isn't a no-op, but can be treated just like a '(?>'
+ * */
+ paren = '>';
+ goto parse_rest;
+ }
+
+ /* By doing this here, we avoid extra warnings for nested
+ * script runs */
if (PASS2) {
Perl_ck_warner_d(aTHX_
packWARN(WARN_EXPERIMENTAL__SCRIPT_RUN),
}
- if (RExC_in_script_run) {
- paren = ':';
- nextchar(pRExC_state);
- ret = NULL;
+ if (paren == 's') {
+ /* Here, we're starting a new regular script run */
+ ret = reg_node(pRExC_state, SROPEN);
+ RExC_in_script_run = 1;
+ is_open = 1;
goto parse_rest;
}
- RExC_in_script_run = 1;
+
+ /* Here, we are starting an atomic script run. This is
+ * handled by recursing to deal with the atomic portion
+ * separately, enclosed in SROPEN ... SRCLOSE nodes */
ret = reg_node(pRExC_state, SROPEN);
- is_open = 1;
- goto parse_rest;
+ RExC_in_script_run = 1;
+
+ atomic = reg(pRExC_state, 'r', &flags, depth);
+ if (flags & (RESTART_PASS1|NEED_UTF8)) {
+ *flagp = flags & (RESTART_PASS1|NEED_UTF8);
+ return NULL;
+ }
+
+ REGTAIL(pRExC_state, ret, atomic);
+
+ REGTAIL(pRExC_state, atomic,
+ reg_node(pRExC_state, SRCLOSE));
+
+ RExC_in_script_run = 0;
+ return ret;
}
break;
ARG(ret) = add_data( pRExC_state,
STR_WITH_LEN("S"));
RExC_rxi->data->data[ARG(ret)]=(void*)sv;
- ret->flags = 1;
+ FLAGS(ret) = 1;
} else {
- ret->flags = 0;
+ FLAGS(ret) = 0;
}
if ( internal_argval != -1 )
ARG2L_SET(ret, internal_argval);
SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
}
} else {
- (void)SvUPGRADE(sv_dat,SVt_PVNV);
+ (void)SvUPGRADE(sv_dat, SVt_PVNV);
sv_setpvn(sv_dat, (char *)&(RExC_npar),
sizeof(I32));
SvIOK_on(sv_dat);
RExC_parse++;
is_neg = TRUE;
}
+ endptr = RExC_end;
if (grok_atoUV(RExC_parse, &unum, &endptr)
&& unum <= I32_MAX
) {
RExC_flags & RXf_PMf_COMPILETIME
);
if (!SIZE_ONLY) {
- ret->flags = 2;
+ FLAGS(ret) = 2;
}
REGTAIL(pRExC_state, ret, eval);
/* deal with the length of this later - MJD */
|| RExC_parse[1] == '<'
|| RExC_parse[1] == '{'))
|| ( RExC_parse[0] == '*' /* (?(*...)) */
- && ( memBEGINs(RExC_parse +1,
- (Size_t) (RExC_end - (RExC_parse + 1)),
- "pla:")
- || memBEGINs(RExC_parse +1,
- (Size_t) (RExC_end - (RExC_parse + 1)),
- "plb")
- || memBEGINs(RExC_parse +1,
- (Size_t) (RExC_end - (RExC_parse + 1)),
- "nla")
- || memBEGINs(RExC_parse +1,
- (Size_t) (RExC_end - (RExC_parse + 1)),
- "nlb")
- || memBEGINs(RExC_parse +1,
- (Size_t) (RExC_end - (RExC_parse + 1)),
- "positive_lookahead")
- || memBEGINs(RExC_parse +1,
- (Size_t) (RExC_end - (RExC_parse + 1)),
- "positive_lookbehind")
- || memBEGINs(RExC_parse +1,
- (Size_t) (RExC_end - (RExC_parse + 1)),
- "negative_lookahead")
- || memBEGINs(RExC_parse +1,
- (Size_t) (RExC_end - (RExC_parse + 1)),
- "negative_lookbehind"))))
- ) { /* Lookahead or eval. */
- I32 flag;
- regnode *tail;
-
- ret = reg_node(pRExC_state, LOGICAL);
- if (!SIZE_ONLY)
- ret->flags = 1;
-
- tail = reg(pRExC_state, 1, &flag, depth+1);
- RETURN_NULL_ON_RESTART(flag,flagp);
- REGTAIL(pRExC_state, ret, tail);
- goto insert_if;
- }
+ && ( memBEGINs(RExC_parse + 1,
+ (Size_t) (RExC_end - (RExC_parse + 1)),
+ "pla:")
+ || memBEGINs(RExC_parse + 1,
+ (Size_t) (RExC_end - (RExC_parse + 1)),
+ "plb:")
+ || memBEGINs(RExC_parse + 1,
+ (Size_t) (RExC_end - (RExC_parse + 1)),
+ "nla:")
+ || memBEGINs(RExC_parse + 1,
+ (Size_t) (RExC_end - (RExC_parse + 1)),
+ "nlb:")
+ || memBEGINs(RExC_parse + 1,
+ (Size_t) (RExC_end - (RExC_parse + 1)),
+ "positive_lookahead:")
+ || memBEGINs(RExC_parse + 1,
+ (Size_t) (RExC_end - (RExC_parse + 1)),
+ "positive_lookbehind:")
+ || memBEGINs(RExC_parse + 1,
+ (Size_t) (RExC_end - (RExC_parse + 1)),
+ "negative_lookahead:")
+ || memBEGINs(RExC_parse + 1,
+ (Size_t) (RExC_end - (RExC_parse + 1)),
+ "negative_lookbehind:"))))
+ ) { /* Lookahead or eval. */
+ I32 flag;
+ regnode *tail;
+
+ ret = reg_node(pRExC_state, LOGICAL);
+ if (!SIZE_ONLY)
+ FLAGS(ret) = 1;
+
+ tail = reg(pRExC_state, 1, &flag, depth+1);
+ RETURN_FAIL_ON_RESTART(flag, flagp);
+ REGTAIL(pRExC_state, ret, tail);
+ goto insert_if;
+ }
else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
|| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
{
RExC_rxi->data->data[num]=(void*)sv_dat;
SvREFCNT_inc_simple_void(sv_dat);
}
- ret = reganode(pRExC_state,NGROUPP,num);
+ ret = reganode(pRExC_state, NGROUPP, num);
goto insert_if_check_paren;
}
else if (memBEGINs(RExC_parse,
(STRLEN) (RExC_end - RExC_parse),
"DEFINE"))
{
- ret = reganode(pRExC_state,DEFINEP,0);
+ ret = reganode(pRExC_state, DEFINEP, 0);
RExC_parse += DEFINE_len;
is_define = 1;
goto insert_if_check_paren;
}
else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
UV uv;
+ endptr = RExC_end;
if (grok_atoUV(RExC_parse, &uv, &endptr)
&& uv <= I32_MAX
) {
if (sv_dat)
parno = 1 + *((I32 *)SvPVX(sv_dat));
}
- ret = reganode(pRExC_state,INSUBP,parno);
+ ret = reganode(pRExC_state, INSUBP, parno);
goto insert_if_check_paren;
}
else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
/* (?(1)...) */
char c;
UV uv;
+ endptr = RExC_end;
if (grok_atoUV(RExC_parse, &uv, &endptr)
&& uv <= I32_MAX
) {
nextchar(pRExC_state);
insert_if:
REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
- br = regbranch(pRExC_state, &flags, 1,depth+1);
+ br = regbranch(pRExC_state, &flags, 1, depth+1);
if (br == NULL) {
- RETURN_NULL_ON_RESTART(flags,flagp);
- FAIL2("panic: regbranch returned NULL, flags=%#" UVxf,
+ RETURN_FAIL_ON_RESTART(flags, flagp);
+ FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
(UV) flags);
} else
REGTAIL(pRExC_state, br, reganode(pRExC_state,
/* Fake one for optimizer. */
lastbr = reganode(pRExC_state, IFTHEN, 0);
- if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
- RETURN_NULL_ON_RESTART(flags,flagp);
- FAIL2("panic: regbranch returned NULL, flags=%#" UVxf,
+ if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
+ RETURN_FAIL_ON_RESTART(flags, flagp);
+ FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
(UV) flags);
}
REGTAIL(pRExC_state, ret, lastbr);
RExC_open_parens[parno]= ret;
}
}
+
Set_Node_Length(ret, 1); /* MJD */
Set_Node_Offset(ret, RExC_parse); /* MJD */
is_open = 1;
parse_rest:
/* Pick up the branches, linking them together. */
parse_start = RExC_parse; /* MJD */
- br = regbranch(pRExC_state, &flags, 1,depth+1);
+ br = regbranch(pRExC_state, &flags, 1, depth+1);
/* branch_len = (paren != 0); */
if (br == NULL) {
- RETURN_NULL_ON_RESTART(flags,flagp);
- FAIL2("panic: regbranch returned NULL, flags=%#" UVxf, (UV) flags);
+ RETURN_FAIL_ON_RESTART(flags, flagp);
+ FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
}
if (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
lastbr = br;
while (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
- ender = reganode(pRExC_state, LONGJMP,0);
+ ender = reganode(pRExC_state, LONGJMP, 0);
/* Append to the previous. */
REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
br = regbranch(pRExC_state, &flags, 0, depth+1);
if (br == NULL) {
- RETURN_NULL_ON_RESTART(flags,flagp);
- FAIL2("panic: regbranch returned NULL, flags=%#" UVxf, (UV) flags);
+ RETURN_FAIL_ON_RESTART(flags, flagp);
+ FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
}
REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
lastbr = br;
if (RExC_nestroot == parno)
RExC_nestroot = 0;
}
- Set_Node_Offset(ender,RExC_parse+1); /* MJD */
- Set_Node_Length(ender,1); /* MJD */
+ Set_Node_Offset(ender, RExC_parse+1); /* MJD */
+ Set_Node_Length(ender, 1); /* MJD */
break;
case 's':
ender = reg_node(pRExC_state, SRCLOSE);
if (paren == '>' || paren == 't') {
node = SUSPEND, flag = 0;
}
- reginsert(pRExC_state, node,ret, depth+1);
+
+ reginsert(pRExC_state, node, ret, depth+1);
Set_Node_Cur_Length(ret, parse_start);
Set_Node_Offset(ret, parse_start + 1);
- ret->flags = flag;
+ FLAGS(ret) = flag;
REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
}
}
ret = NULL;
else {
if (!SIZE_ONLY && RExC_extralen)
- ret = reganode(pRExC_state, BRANCHJ,0);
+ ret = reganode(pRExC_state, BRANCHJ, 0);
else {
ret = reg_node(pRExC_state, BRANCH);
Set_Node_Length(ret, 1);
FALSE /* Don't force to /x */ );
while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
flags &= ~TRYAGAIN;
- latest = regpiece(pRExC_state, &flags,depth+1);
+ latest = regpiece(pRExC_state, &flags, depth+1);
if (latest == NULL) {
if (flags & TRYAGAIN)
continue;
- RETURN_NULL_ON_RESTART(flags,flagp);
- FAIL2("panic: regpiece returned NULL, flags=%#" UVxf, (UV) flags);
+ RETURN_FAIL_ON_RESTART(flags, flagp);
+ FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
}
else if (ret == NULL)
ret = latest;
DEBUG_PARSE("piec");
- ret = regatom(pRExC_state, &flags,depth+1);
+ ret = regatom(pRExC_state, &flags, depth+1);
if (ret == NULL) {
- RETURN_NULL_ON_RESTART_OR_FLAGS(flags,flagp,TRYAGAIN);
- FAIL2("panic: regatom returned NULL, flags=%#" UVxf, (UV) flags);
+ RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
+ FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
}
op = *RExC_parse;
maxpos = next;
RExC_parse++;
if (isDIGIT(*RExC_parse)) {
+ endptr = RExC_end;
if (!grok_atoUV(RExC_parse, &uv, &endptr))
vFAIL("Invalid quantifier in {,}");
if (uv >= REG_INFTY)
else
maxpos = RExC_parse;
if (isDIGIT(*maxpos)) {
+ endptr = RExC_end;
if (!grok_atoUV(maxpos, &uv, &endptr))
vFAIL("Invalid quantifier in {,}");
if (uv >= REG_INFTY)
else {
regnode * const w = reg_node(pRExC_state, WHILEM);
- w->flags = 0;
+ FLAGS(w) = 0;
REGTAIL(pRExC_state, ret, w);
if (!SIZE_ONLY && RExC_extralen) {
- reginsert(pRExC_state, LONGJMP,ret, depth+1);
- reginsert(pRExC_state, NOTHING,ret, depth+1);
+ reginsert(pRExC_state, LONGJMP, ret, depth+1);
+ reginsert(pRExC_state, NOTHING, ret, depth+1);
NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
}
- reginsert(pRExC_state, CURLYX,ret, depth+1);
+ reginsert(pRExC_state, CURLYX, ret, depth+1);
/* MJD hk */
Set_Node_Offset(ret, parse_start+1);
Set_Node_Length(ret,
RExC_whilem_seen++, RExC_extralen += 3;
MARK_NAUGHTY_EXP(1, 4); /* compound interest */
}
- ret->flags = 0;
+ FLAGS(ret) = 0;
if (min > 0)
*flagp = WORST;
* *node_p, nor *code_point_p, nor *flagp.
*
* If <cp_count> is not NULL, the caller wants to know the length (in code
- * points) that this \N sequence matches. This is set even if the function
- * returns FALSE, as detailed below.
+ * points) that this \N sequence matches. This is set, and the input is
+ * parsed for errors, even if the function returns FALSE, as detailed below.
*
* There are 5 possibilities here, as detailed in the next 5 paragraphs.
*
*
* The fourth possibility is that \N resolves to a sequence of more than one
* code points. *cp_count will be set to the number of code points in the
- * sequence. *node_p * will be set to a generated node returned by this
+ * sequence. *node_p will be set to a generated node returned by this
* function calling S_reg().
*
* The final possibility is that it is premature to be calling this function;
*/
char * endbrace; /* points to '}' following the name */
- char *endchar; /* Points to '.' or '}' ending cur char in the input
- stream */
char* p = RExC_parse; /* Temporary */
+ SV * substitute_parse = NULL;
+ char *orig_end;
+ char *save_start;
+ I32 flags;
+ Size_t count = 0; /* code point count kept internally by this function */
+
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_GROK_BSLASH_N;
* [^\n]. The latter is assumed when the {...} following the \N is a legal
* quantifier, or there is no '{' at all */
if (*p != '{' || regcurly(p)) {
- RExC_parse = p;
+ RExC_parse = p;
if (cp_count) {
*cp_count = -1;
}
- if (! node_p) {
+ if (! node_p) {
return FALSE;
}
- *node_p = reg_node(pRExC_state, REG_ANY);
- *flagp |= HASWIDTH|SIMPLE;
- MARK_NAUGHTY(1);
+ *node_p = reg_node(pRExC_state, REG_ANY);
+ *flagp |= HASWIDTH|SIMPLE;
+ MARK_NAUGHTY(1);
Set_Node_Length(*node_p, 1); /* MJD */
- return TRUE;
+ return TRUE;
}
- /* Here, we have decided it should be a named character or sequence */
-
/* The test above made sure that the next real character is a '{', but
* under the /x modifier, it could be separated by space (or a comment and
* \n) and this is not allowed (for consistency with \x{...} and the
* tokenizer handling of \N{NAME}). */
if (*RExC_parse != '{') {
- vFAIL("Missing braces on \\N{}");
+ vFAIL("Missing braces on \\N{}");
}
- RExC_parse++; /* Skip past the '{' */
+ RExC_parse++; /* Skip past the '{' */
endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
if (! endbrace) { /* no trailing brace */
vFAIL2("Missing right brace on \\%c{}", 'N');
}
- else if (!( endbrace == RExC_parse /* nothing between the {} */
- || memBEGINs(RExC_parse, /* U+ (bad hex is checked below
- for a better error msg) */
- (STRLEN) (RExC_end - RExC_parse),
- "U+")))
- {
- RExC_parse = endbrace; /* position msg's '<--HERE' */
- vFAIL("\\N{NAME} must be resolved by the lexer");
- }
+ /* Here, we have decided it should be a named character or sequence */
REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
semantics */
*cp_count = 0;
}
nextchar(pRExC_state);
- if (! node_p) {
+ if (! node_p) {
return FALSE;
}
- *node_p = reg_node(pRExC_state,NOTHING);
+ *node_p = reg_node(pRExC_state, NOTHING);
return TRUE;
}
- RExC_parse += 2; /* Skip past the 'U+' */
+ /* If we haven't got something that begins with 'U+', then it didn't get lexed. */
+ if ( endbrace - RExC_parse < 2
+ || strnNE(RExC_parse, "U+", 2))
+ {
+ RExC_parse = endbrace; /* position msg's '<--HERE' */
+ vFAIL("\\N{NAME} must be resolved by the lexer");
+ }
- /* Because toke.c has generated a special construct for us guaranteed not
- * to have NULs, we can use a str function */
- endchar = RExC_parse + strcspn(RExC_parse, ".}");
+ /* The code below is purposely indented because of future changes coming */
- /* Code points are separated by dots. If none, there is only one code
- * point, and is terminated by the brace */
+ /* We can get to here when the input is \N{U+...} or when toke.c has
+ * converted a name to the \N{U+...} form. This includes changing a
+ * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
- if (endchar >= endbrace) {
- STRLEN length_of_hex;
- I32 grok_hex_flags;
+ RExC_parse += 2; /* Skip past the 'U+' */
- /* Here, exactly one code point. If that isn't what is wanted, fail */
- if (! code_point_p) {
- RExC_parse = p;
- return FALSE;
- }
+ /* Code points are separated by dots. The '}' terminates the whole
+ * thing. */
- /* Convert code point from hex */
- length_of_hex = (STRLEN)(endchar - RExC_parse);
- grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
- | PERL_SCAN_DISALLOW_PREFIX
-
- /* No errors in the first pass (See [perl
- * #122671].) We let the code below find the
- * errors when there are multiple chars. */
- | ((SIZE_ONLY)
- ? PERL_SCAN_SILENT_ILLDIGIT
- : 0);
-
- /* This routine is the one place where both single- and double-quotish
- * \N{U+xxxx} are evaluated. The value is a Unicode code point which
- * must be converted to native. */
- *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
- &length_of_hex,
- &grok_hex_flags,
- NULL));
-
- /* The tokenizer should have guaranteed validity, but it's possible to
- * bypass it by using single quoting, so check. Don't do the check
- * here when there are multiple chars; we do it below anyway. */
- if (length_of_hex == 0
- || length_of_hex != (STRLEN)(endchar - RExC_parse) )
- {
- RExC_parse += length_of_hex; /* Includes all the valid */
- RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
- ? UTF8SKIP(RExC_parse)
- : 1;
- /* Guard against malformed utf8 */
- if (RExC_parse >= endchar) {
- RExC_parse = endchar;
+ do { /* Loop until the ending brace */
+ UV cp = 0;
+ char * start_digit; /* The first of the current code point */
+ if (! isXDIGIT(*RExC_parse)) {
+ RExC_parse++;
+ vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
- vFAIL("Invalid hexadecimal number in \\N{U+...}");
- }
- RExC_parse = endbrace + 1;
- return TRUE;
- }
- else { /* Is a multiple character sequence */
- SV * substitute_parse;
- STRLEN len;
- char *orig_end = RExC_end;
- char *save_start = RExC_start;
- I32 flags;
+ start_digit = RExC_parse;
+ count++;
- /* Count the code points, if desired, in the sequence */
- if (cp_count) {
- *cp_count = 0;
- while (RExC_parse < endbrace) {
- /* Point to the beginning of the next character in the sequence. */
- RExC_parse = endchar + 1;
- endchar = RExC_parse + strcspn(RExC_parse, ".}");
- (*cp_count)++;
+ /* Loop through the hex digits of the current code point */
+ do {
+ /* Adding this digit will shift the result 4 bits. If that
+ * result would be above the legal max, it's overflow */
+ if (cp > MAX_LEGAL_CP >> 4) {
+
+ /* Find the end of the code point */
+ do {
+ RExC_parse ++;
+ } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_');
+
+ /* Be sure to synchronize this message with the similar one
+ * in utf8.c */
+ vFAIL4("Use of code point 0x%.*s is not allowed; the"
+ " permissible max is 0x%" UVxf,
+ (int) (RExC_parse - start_digit), start_digit,
+ MAX_LEGAL_CP);
+ }
+
+ /* Accumulate this (valid) digit into the running total */
+ cp = (cp << 4) + READ_XDIGIT(RExC_parse);
+
+ /* READ_XDIGIT advanced the input pointer. Ignore a single
+ * underscore separator */
+ if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) {
+ RExC_parse++;
+ }
+ } while (isXDIGIT(*RExC_parse));
+
+ /* Here, have accumulated the next code point */
+ if (RExC_parse >= endbrace) { /* If done ... */
+ if (count != 1) {
+ goto do_concat;
+ }
+
+ /* Here, it is a single code point; fail if the caller doesn't want that */
+ if (! code_point_p) {
+ RExC_parse = p;
+ return FALSE;
+ }
+
+ /* A single code point is easy to handle; just return it */
+ *code_point_p = UNI_TO_NATIVE(cp);
+ RExC_parse = endbrace;
+ nextchar(pRExC_state);
+ return TRUE;
}
- }
- /* Fail if caller doesn't want to handle a multi-code-point sequence.
- * But don't backup up the pointer if the caller wants to know how many
- * code points there are (they can then handle things) */
- if (! node_p) {
- if (! cp_count) {
- RExC_parse = p;
+ /* Here, the only legal thing would be a multiple character
+ * sequence (of the form "\N{U+c1.c2. ... }"). So the next
+ * character must be a dot (and the one after that can't be the
+ * endbrace, or we'd have something like \N{U+100.} ) */
+ if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
+ RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
+ ? UTF8SKIP(RExC_parse)
+ : 1;
+ if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */
+ RExC_parse = endbrace;
+ }
+ vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
- return FALSE;
- }
- /* What is done here is to convert this to a sub-pattern of the form
- * \x{char1}\x{char2}... and then call reg recursively to parse it
- * (enclosing in "(?: ... )" ). That way, it retains its atomicness,
- * while not having to worry about special handling that some code
- * points may have. */
+ /* Here, it looks like it's really a multiple character sequence. Fail
+ * if that's not what the caller wants. But continue with counting
+ * and error checking if they still want a count */
+ if (! node_p && ! cp_count) {
+ return FALSE;
+ }
- substitute_parse = newSVpvs("?:");
+ /* What is done here is to convert this to a sub-pattern of the
+ * form \x{char1}\x{char2}... and then call reg recursively to
+ * parse it (enclosing in "(?: ... )" ). That way, it retains its
+ * atomicness, while not having to worry about special handling
+ * that some code points may have. We don't create a subpattern,
+ * but go through the motions of code point counting and error
+ * checking, if the caller doesn't want a node returned. */
- while (RExC_parse < endbrace) {
+ if (node_p && count == 1) {
+ substitute_parse = newSVpvs("?:");
+ }
- /* Convert to notation the rest of the code understands */
- sv_catpv(substitute_parse, "\\x{");
- sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
- sv_catpv(substitute_parse, "}");
+ do_concat:
- /* Point to the beginning of the next character in the sequence. */
- RExC_parse = endchar + 1;
- endchar = RExC_parse + strcspn(RExC_parse, ".}");
+ if (node_p) {
+ /* Convert to notation the rest of the code understands */
+ sv_catpvs(substitute_parse, "\\x{");
+ sv_catpvn(substitute_parse, start_digit,
+ RExC_parse - start_digit);
+ sv_catpvs(substitute_parse, "}");
+ }
- }
- sv_catpv(substitute_parse, ")");
+ /* Move to after the dot (or ending brace the final time through.)
+ * */
+ RExC_parse++;
+ count++;
- len = SvCUR(substitute_parse);
+ } while (RExC_parse < endbrace);
- /* Don't allow empty number */
- if (len < (STRLEN) 8) {
- RExC_parse = endbrace;
- vFAIL("Invalid hexadecimal number in \\N{U+...}");
- }
+ if (! node_p) { /* Doesn't want the node */
+ assert (cp_count);
- RExC_parse = RExC_start = RExC_adjusted_start
- = SvPV_nolen(substitute_parse);
- RExC_end = RExC_parse + len;
+ *cp_count = count;
+ return FALSE;
+ }
+
+ sv_catpvs(substitute_parse, ")");
- /* The values are Unicode, and therefore not subject to recoding, but
- * have to be converted to native on a non-Unicode (meaning non-ASCII)
- * platform. */
#ifdef EBCDIC
+ /* The values are Unicode, and therefore have to be converted to native
+ * on a non-Unicode (meaning non-ASCII) platform. */
RExC_recode_x_to_native = 1;
#endif
- *node_p = reg(pRExC_state, 1, &flags, depth+1);
-
- /* Restore the saved values */
- RExC_start = RExC_adjusted_start = save_start;
- RExC_parse = endbrace;
- RExC_end = orig_end;
+ /* Here, we have the string the name evaluates to, ready to be parsed,
+ * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
+ * constructs. This can be called from within a substitute parse already.
+ * The error reporting mechanism doesn't work for 2 levels of this, but the
+ * code above has validated this new construct, so there should be no
+ * errors generated by the below. And this isn't an exact copy, so the
+ * mechanism to seamlessly deal with this won't work. XXX Maybe should
+ * turn off all warnings for safety? */
+ save_start = RExC_start;
+ orig_end = RExC_end;
+
+ RExC_parse = RExC_start = SvPVX(substitute_parse);
+ RExC_end = RExC_parse + SvCUR(substitute_parse);
+
+ *node_p = reg(pRExC_state, 1, &flags, depth+1);
+
+ /* Restore the saved values */
+ RExC_start = save_start;
+ RExC_parse = endbrace;
+ RExC_end = orig_end;
#ifdef EBCDIC
- RExC_recode_x_to_native = 0;
+ RExC_recode_x_to_native = 0;
#endif
- SvREFCNT_dec_NN(substitute_parse);
- if (! *node_p) {
- RETURN_X_ON_RESTART(FALSE, flags,flagp);
- FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#" UVxf,
- (UV) flags);
- }
- *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
-
- nextchar(pRExC_state);
+ SvREFCNT_dec_NN(substitute_parse);
- return TRUE;
+ if (! *node_p) {
+ RETURN_X_ON_RESTART(FALSE, flags, flagp);
+ FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
+ (UV) flags);
}
+ *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
+
+ nextchar(pRExC_state);
+
+ return TRUE;
}
* in which case return I32_MAX (rather than possibly 32-bit wrapping) */
static I32
-S_backref_value(char *p)
+S_backref_value(char *p, char *e)
{
- const char* endptr;
+ const char* endptr = e;
UV val;
if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
return (I32)val;
case '[':
{
char * const oregcomp_parse = ++RExC_parse;
- ret = regclass(pRExC_state, flagp,depth+1,
+ ret = regclass(pRExC_state, flagp, depth+1,
FALSE, /* means parse the whole char class */
TRUE, /* allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
NULL,
NULL);
if (ret == NULL) {
- RETURN_NULL_ON_RESTART_FLAGP_OR_FLAGS(flagp,NEED_UTF8);
- FAIL2("panic: regclass returned NULL to regatom, flags=%#" UVxf,
+ RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp, NEED_UTF8);
+ FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
(UV) *flagp);
}
if (*RExC_parse != ']') {
}
case '(':
nextchar(pRExC_state);
- ret = reg(pRExC_state, 2, &flags,depth+1);
+ ret = reg(pRExC_state, 2, &flags, depth+1);
if (ret == NULL) {
if (flags & TRYAGAIN) {
if (RExC_parse >= RExC_end) {
}
goto tryagain;
}
- RETURN_NULL_ON_RESTART(flags,flagp);
- FAIL2("panic: reg returned NULL to regatom, flags=%#" UVxf,
+ RETURN_FAIL_ON_RESTART(flags, flagp);
+ FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
(UV) flags);
}
*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
* /\A/ from /^/ in split. We check ret because first pass we
* have no regop struct to set the flags on. */
if (PASS2)
- ret->flags = 1;
+ FLAGS(ret) = 1;
*flagp |= SIMPLE;
goto finish_meta_pat;
case 'G':
case 'P':
RExC_parse--;
- ret = regclass(pRExC_state, flagp,depth+1,
+ ret = regclass(pRExC_state, flagp, depth+1,
TRUE, /* means just parse this element */
FALSE, /* don't allow multi-char folds */
FALSE, /* don't silence non-portable warnings. It
TRUE, /* Allow an optimized regnode result */
NULL,
NULL);
- RETURN_NULL_ON_RESTART_FLAGP(flagp);
+ RETURN_FAIL_ON_RESTART_FLAGP(flagp);
/* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
* multi-char folds are allowed. */
if (!ret)
- FAIL2("panic: regclass returned NULL to regatom, flags=%#" UVxf,
+ FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
(UV) *flagp);
RExC_parse--;
break;
}
- RETURN_NULL_ON_RESTART_FLAGP(flagp);
+ RETURN_FAIL_ON_RESTART_FLAGP(flagp);
/* Here, evaluates to a single code point. Go get that */
RExC_parse = parse_start;
{
RExC_parse++;
/* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
- vFAIL2("Sequence %.2s... not terminated",parse_start);
+ vFAIL2("Sequence %.2s... not terminated", parse_start);
} else {
RExC_parse += 2;
ret = handle_named_backref(pRExC_state,
if (RExC_parse >= RExC_end) {
goto unterminated_g;
}
- num = S_backref_value(RExC_parse);
+ num = S_backref_value(RExC_parse, RExC_end);
if (num == 0)
vFAIL("Reference to invalid group 0");
else if (num == I32_MAX) {
}
}
else {
- num = S_backref_value(RExC_parse);
+ num = S_backref_value(RExC_parse, RExC_end);
/* bare \NNN might be backref or octal - if it is larger
* than or equal RExC_npar then it is assumed to be an
* octal escape. Note RExC_npar is +1 from the actual
&& *RExC_parse != '8'
/* cannot be an octal escape it it starts with 9 */
&& *RExC_parse != '9'
- )
- {
- /* Probably not a backref, instead likely to be an
- * octal character escape, e.g. \35 or \777.
+ ) {
+ /* Probably not meant to be a backref, instead likely
+ * to be an octal character escape, e.g. \35 or \777.
* The above logic should make it obvious why using
* octal escapes in patterns is problematic. - Yves */
RExC_parse = parse_start;
* need to figure this out until pass 2) */
bool maybe_exactfu = PASS2;
+ /* To see if RExC_uni_semantics changes during parsing of the node.
+ * */
+ bool uni_semantics_at_node_start;
+
/* The node_type may change below, but since the size of the node
* doesn't change, it works */
ret = reg_node(pRExC_state, node_type);
|| UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
|| UTF8_IS_START(UCHARAT(RExC_parse)));
+ uni_semantics_at_node_start = cBOOL(RExC_uni_semantics);
+
/* Here, we have a literal character. Find the maximal string of
* them in the input that we can fit into a single EXACTish node.
* We quit at the first non-literal or when the node gets full, or
*/
switch ((U8)*++p) {
+
/* These are all the special escapes. */
case 'A': /* Start assertion */
case 'b': case 'B': /* Word-boundary assertion*/
) {
if (*flagp & NEED_UTF8)
FAIL("panic: grok_bslash_N set NEED_UTF8");
- RETURN_NULL_ON_RESTART_FLAGP(flagp);
+ RETURN_FAIL_ON_RESTART_FLAGP(flagp);
/* Here, it wasn't a single code point. Go close
* up this EXACTish node. The switch() prior to
* pattern. */
/* NOTE, RExC_npar is 1 more than the actual number of
- * parens we have seen so far, hence the < RExC_npar below. */
-
- if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
+ * parens we have seen so far, hence the "<" as opposed
+ * to "<=" */
+ if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
{ /* Not to be treated as an octal constant, go
find backref */
--p;
} /* End of switch on '\' */
break;
case '{':
- /* Currently we allow an lbrace at the start of a construct
- * without raising a warning. This is because we think we
- * will never want such a brace to be meant to be other
- * than taken literally. */
+ /* Trying to gain new uses for '{' without breaking too
+ * much existing code is hard. The solution currently
+ * adopted is:
+ * 1) If there is no ambiguity that a '{' should always
+ * be taken literally, at the start of a construct, we
+ * just do so.
+ * 2) If the literal '{' conflicts with our desired use
+ * of it as a metacharacter, we die. The deprecation
+ * cycles for this have come and gone.
+ * 3) If there is ambiguity, we raise a simple warning.
+ * This could happen, for example, if the user
+ * intended it to introduce a quantifier, but slightly
+ * misspelled the quantifier. Without this warning,
+ * the quantifier would silently be taken as a literal
+ * string of characters instead of a meta construct */
if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
-
- /* But, we raise a fatal warning otherwise, as the
- * deprecation cycle has come and gone. Except that it
- * turns out that some heavily-relied on upstream
- * software, notably GNU Autoconf, have failed to fix
- * their uses. For these, don't make it fatal unless
- * we anticipate using the '{' for something else.
- * This happens after any alpha, and for a looser {m,n}
- * quantifier specification */
if ( RExC_strict
|| ( p > parse_start + 1
&& isALPHA_A(*(p - 1))
"illegal here");
}
if (PASS2) {
- ckWARNregdep(p + 1,
- "Unescaped left brace in regex is "
- "deprecated here (and will be fatal "
- "in Perl 5.30), passed through");
+ ckWARNreg(p + 1, "Unescaped left brace in regex is"
+ " passed through");
}
}
goto normal_default;
ender = 's';
added_len = 2;
}
+ else if ( uni_semantics_at_node_start
+ != RExC_uni_semantics)
+ {
+ /* Here, we are supposed to be using Unicode
+ * rules, but this folding node is not. This
+ * happens during pass 1 when the node started
+ * out not under Unicode rules, but a \N{} was
+ * encountered during the processing of it,
+ * causing Unicode rules to be switched into.
+ * Pass 1 continues uninterrupted, as by the
+ * time we get to pass 2, we will know enough
+ * to generate the correct folds. Except in
+ * this one case, we need to restart the node,
+ * because the fold of the sharp s requires 2
+ * characters, and the sizing needs to account
+ * for that. */
+ p = oldp;
+ goto loopdone;
+ }
else {
RExC_seen_unfolded_sharp_s = 1;
maybe_exactfu = FALSE;
len = s - s0 + 1;
}
else {
- if (! PL_NonL1NonFinalFold) {
- PL_NonL1NonFinalFold = _new_invlist_C_array(
- NonL1_Perl_Non_Final_Folds_invlist);
- }
/* Point to the first byte of the final character */
s = (char *) utf8_hop((U8 *) s, -1);
/* Position parse to next real character */
skip_to_be_ignored_text(pRExC_state, &RExC_parse,
FALSE /* Don't force to /x */ );
- if (PASS2 && *RExC_parse == '{' && OP(ret) != SBOL && ! regcurly(RExC_parse)) {
- ckWARNregdep(RExC_parse + 1, "Unescaped left brace in regex is deprecated here (and will be fatal in Perl 5.30), passed through");
+ if ( PASS2 && *RExC_parse == '{'
+ && OP(ret) != SBOL && ! regcurly(RExC_parse))
+ {
+ if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
+ RExC_parse++;
+ vFAIL("Unescaped left brace in regex is illegal here");
+ }
+ ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
+ " passed through");
}
return(ret);
* routine. q.v. */
#define ADD_POSIX_WARNING(p, text) STMT_START { \
if (posix_warnings) { \
- if (! RExC_warn_text ) RExC_warn_text = (AV *) sv_2mortal((SV *) newAV()); \
- av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \
+ if (! RExC_warn_text ) RExC_warn_text = \
+ (AV *) sv_2mortal((SV *) newAV()); \
+ av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \
WARNING_PREFIX \
text \
REPORT_LOCATION, \
ADD_POSIX_WARNING(p, "there is no terminating ']'");
}
- if (posix_warnings && RExC_warn_text && av_top_index(RExC_warn_text) > -1) {
+ if ( posix_warnings
+ && RExC_warn_text
+ && av_top_index(RExC_warn_text) > -1)
+ {
*posix_warnings = RExC_warn_text;
}
}
set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
}
- REQUIRE_UNI_RULES(flagp, NULL); /* The use of this operator implies /u.
- This is required so that the compile
- time values are valid in all runtime
- cases */
+ /* The use of this operator implies /u. This is required so that the
+ * compile time values are valid in all runtime cases */
+ REQUIRE_UNI_RULES(flagp, NULL);
/* This will return only an ANYOF regnode, or (unlikely) something smaller
* (such as EXACT). Thus we can skip most everything if just sizing. We
/* regclass() can only return RESTART_PASS1 and NEED_UTF8
* if multi-char folds are allowed. */
- if (!regclass(pRExC_state, flagp,depth+1,
+ if (!regclass(pRExC_state, flagp, depth+1,
is_posix_class, /* parse the whole char
class only if not a
posix class */
¤t,
&posix_warnings
))
- FAIL2("panic: regclass returned NULL to handle_sets, "
+ FAIL2("panic: regclass returned failure to handle_sets, "
"flags=%#" UVxf, (UV) *flagp);
/* function call leaves parse pointing to the ']', except
case '\\':
/* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
* multi-char folds are allowed. */
- if (!regclass(pRExC_state, flagp,depth+1,
+ if (!regclass(pRExC_state, flagp, depth+1,
TRUE, /* means parse just the next thing */
FALSE, /* don't allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
¤t,
NULL))
{
- FAIL2("panic: regclass returned NULL to handle_sets, "
+ FAIL2("panic: regclass returned failure to handle_sets, "
"flags=%#" UVxf, (UV) *flagp);
}
/* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
* multi-char folds are allowed. */
- if (!regclass(pRExC_state, flagp,depth+1,
+ if (!regclass(pRExC_state, flagp, depth+1,
is_posix_class, /* parse the whole char
class only if not a
posix class */
NULL
))
{
- FAIL2("panic: regclass returned NULL to handle_sets, "
+ FAIL2("panic: regclass returned failure to handle_sets, "
"flags=%#" UVxf, (UV) *flagp);
}
* fence. Get rid of it */
fence_ptr = av_pop(fence_stack);
assert(fence_ptr);
- fence = SvIV(fence_ptr) - 1;
+ fence = SvIV(fence_ptr);
SvREFCNT_dec_NN(fence_ptr);
fence_ptr = NULL;
if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
|| ((final = av_pop(stack)) == NULL)
|| ! IS_OPERAND(final)
- || SvTYPE(final) != SVt_INVLIST
+ || ! is_invlist(final)
|| av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
{
bad_syntax:
RExC_flags &= ~RXf_PMf_FOLD;
/* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char
* folds are allowed. */
- node = regclass(pRExC_state, flagp,depth+1,
+ node = regclass(pRExC_state, flagp, depth+1,
FALSE, /* means parse the whole char class */
FALSE, /* don't allow multi-char folds */
TRUE, /* silence non-portable warnings. The above may very
NULL
);
if (!node)
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#" UVxf,
+ FAIL2("panic: regclass returned failure to handle_sets, flags=%#" UVxf,
PTR2UV(flagp));
/* Fix up the node type if we are in locale. (We have pretended we are
STATIC void
S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
{
- /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
- * innocent-looking character class, like /[ks]/i won't have to go out to
- * disk to find the possible matches.
+ /* This adds the Latin1/above-Latin1 folding rules.
*
* This should be called only for a Latin1-range code points, cp, which is
* known to be involved in a simple fold with other code points above
* Latin1. It would give false results if /aa has been specified.
* Multi-char folds are outside the scope of this, and must be handled
- * specially.
- *
- * XXX It would be better to generate these via regen, in case a new
- * version of the Unicode standard adds new mappings, though that is not
- * really likely, and may be caught by the default: case of the switch
- * below. */
+ * specially. */
PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
+ /* The rules that are valid for all Unicode versions are hard-coded in */
switch (cp) {
case 'k':
case 'K':
LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
break;
-#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
+ default: /* Other code points are checked against the data for the
+ current Unicode version */
+ {
+ Size_t folds_to_count;
+ unsigned int first_folds_to;
+ const unsigned int * remaining_folds_to_list;
+ UV folded_cp;
- case LATIN_SMALL_LETTER_SHARP_S:
- *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
- break;
+ if (isASCII(cp)) {
+ folded_cp = toFOLD(cp);
+ }
+ else {
+ U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
+ Size_t dummy_len;
+ folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
+ }
-#endif
+ if (folded_cp > 255) {
+ *invlist = add_cp_to_invlist(*invlist, folded_cp);
+ }
-#if UNICODE_MAJOR_VERSION < 3 \
- || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
+ folds_to_count = _inverse_folds(folded_cp, &first_folds_to,
+ &remaining_folds_to_list);
+ if (folds_to_count == 0) {
- /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
- * U+0131. */
- case 'i':
- case 'I':
- *invlist =
- add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
-# if UNICODE_DOT_DOT_VERSION == 1
- *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
-# endif
- break;
-#endif
+ /* Use deprecated warning to increase the chances of this being
+ * output */
+ if (PASS2) {
+ ckWARN2reg_d(RExC_parse,
+ "Perl folding rules are not up-to-date for 0x%02X;"
+ " please use the perlbug utility to report;", cp);
+ }
+ }
+ else {
+ unsigned int i;
- default:
- /* Use deprecated warning to increase the chances of this being
- * output */
- if (PASS2) {
- ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
+ if (first_folds_to > 255) {
+ *invlist = add_cp_to_invlist(*invlist, first_folds_to);
+ }
+ for (i = 0; i < folds_to_count - 1; i++) {
+ if (remaining_folds_to_list[i] > 255) {
+ *invlist = add_cp_to_invlist(*invlist,
+ remaining_folds_to_list[i]);
+ }
+ }
}
break;
+ }
}
}
AV* posix_warnings = NULL;
const bool do_posix_warnings = return_posix_warnings
|| (PASS2 && ckWARN(WARN_REGEXP));
+ U8 op = END; /* The returned node-type, initialized to an impossible
+ one. */
+ U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
+ U32 posixl = 0; /* bit field of posix classes matched under /l */
GET_RE_DEBUG_FLAGS_DECL;
allow_multi_folds = FALSE;
#endif
- /* Assume we are going to generate an ANYOF node. */
- ret = reganode(pRExC_state,
- (LOC)
- ? ANYOFL
- : ANYOF,
- 0);
-
if (SIZE_ONLY) {
- RExC_size += ANYOF_SKIP;
listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
}
else {
- ANYOF_FLAGS(ret) = 0;
-
- RExC_emit += ANYOF_SKIP;
listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
initial_listsv_len = SvCUR(listsv);
SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
if (*flagp & NEED_UTF8)
FAIL("panic: grok_bslash_N set NEED_UTF8");
- RETURN_NULL_ON_RESTART_FLAGP(flagp);
+ RETURN_FAIL_ON_RESTART_FLAGP(flagp);
if (cp_count < 0) {
vFAIL("\\N in a character class must be a named character: \\N{...}");
case 'P':
{
char *e;
+ char *i;
/* We will handle any undefined properties ourselves */
U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
* anyway, to save a little time */
|_CORE_SWASH_INIT_ACCEPT_INVLIST;
+ SvREFCNT_dec(swash); /* Free any left-overs */
if (RExC_parse >= RExC_end)
vFAIL2("Empty \\%c", (U8)value);
if (*RExC_parse == '{') {
}
RExC_parse++;
+
+ /* White space is allowed adjacent to the braces and after
+ * any '^', even when not under /x */
while (isSPACE(*RExC_parse)) {
RExC_parse++;
}
n = e - RExC_parse;
while (isSPACE(*(RExC_parse + n - 1)))
n--;
+
} /* The \p isn't immediately followed by a '{' */
else if (! isALPHA(*RExC_parse)) {
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
n = 1;
}
if (!SIZE_ONLY) {
- SV* invlist;
- char* name;
+ char* name = RExC_parse;
char* base_name; /* name after any packages are stripped */
char* lookup_name = NULL;
const char * const colon_colon = "::";
+ bool invert;
+
+ SV* invlist;
+
+ /* Temporary workaround for [perl #133136]. For this
+ * precise input that is in the .t that is failing, load
+ * utf8.pm, which is what the test wants, so that that
+ * .t passes */
+ if ( memEQs(RExC_start, e + 1 - RExC_start,
+ "foo\\p{Alnum}")
+ && ! hv_common(GvHVn(PL_incgv),
+ NULL,
+ "utf8.pm", sizeof("utf8.pm") - 1,
+ 0, HV_FETCH_ISEXISTS, NULL, 0))
+ {
+ require_pv("utf8.pm");
+ }
+ invlist = parse_uniprop_string(name, n, FOLD, &invert);
+ if (invlist) {
+ if (invert) {
+ value ^= 'P' ^ 'p';
+ }
+ }
+ else {
/* Try to get the definition of the property into
* <invlist>. If /i is in effect, the effective property
* 2f833f5208e26b208886e51e09e2c072b5eabb46 */
name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
SAVEFREEPV(name);
+
+ for (i = RExC_parse; i < RExC_parse + n; i++) {
+ if (isCNTRL(*i) && *i != '\t') {
+ RExC_parse = e + 1;
+ vFAIL2("Can't find Unicode property definition \"%s\"", name);
+ }
+ }
+
if (FOLD) {
lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
/* Look up the property name, and get its swash and
* inversion list, if the property is found */
- SvREFCNT_dec(swash); /* Free any left-overs */
swash = _core_swash_init("utf8",
(lookup_name)
? lookup_name
/* We don't know yet what this matches, so have to flag
* it */
- ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
+ anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
}
else {
{
has_user_defined_property = TRUE;
}
- else if
+ }
+ }
+ if (invlist) {
+ if (! has_user_defined_property &&
/* We warn on matching an above-Unicode code point
* if the match would return true, except don't
* warn for \p{All}, which has exactly one element
* = 0 */
(_invlist_contains_cp(invlist, 0x110000)
&& (! (_invlist_len(invlist) == 1
- && *invlist_array(invlist) == 0)))
+ && *invlist_array(invlist) == 0))))
{
warn_super = TRUE;
}
-
/* Invert if asking for the complement */
if (value == 'P') {
_invlist_union_complement_2nd(properties,
/* The swash can't be used as-is, because we've
* inverted things; delay removing it to here after
* have copied its invlist above */
- SvREFCNT_dec_NN(swash);
+ if (! swash) {
+ SvREFCNT_dec_NN(invlist);
+ }
+ SvREFCNT_dec(swash);
swash = NULL;
}
else {
_invlist_union(properties, invlist, &properties);
+ if (! swash) {
+ SvREFCNT_dec_NN(invlist);
+ }
}
- }
- }
+ }
+ } /* End of actually getting the values in pass 2 */
+
RExC_parse = e + 1;
namedclass = ANYOF_UNIPROP; /* no official name, but it's
named */
* by locale, and hence are dealt with separately */
if (! need_class) {
need_class = 1;
- if (SIZE_ONLY) {
- RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
- }
- else {
- RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
- }
- ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
- ANYOF_POSIXL_ZERO(ret);
+ anyof_flags |= ANYOF_MATCHES_POSIXL;
/* We can't change this into some other type of node
* (unless this is the only element, in which case there
/* Coverity thinks it is possible for this to be negative; both
* jhi and khw think it's not, but be safer */
- assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
+ assert(! (anyof_flags & ANYOF_MATCHES_POSIXL)
|| (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
/* See if it already matches the complement of this POSIX
* class */
- if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
- && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
- ? -1
- : 1)))
+ if ( (anyof_flags & ANYOF_MATCHES_POSIXL)
+ && POSIXL_TEST(posixl, namedclass + ((namedclass % 2)
+ ? -1
+ : 1)))
{
posixl_matches_all = TRUE;
break; /* No need to continue. Since it matches both
}
/* Add this class to those that should be checked at runtime */
- ANYOF_POSIXL_SET(ret, namedclass);
+ POSIXL_SET(posixl, namedclass);
/* The above-Latin1 characters are not subject to locale rules.
* Just add them, in the second pass, to the
}
}
else if ( UNI_SEMANTICS
+ || AT_LEAST_ASCII_RESTRICTED
|| classnum == _CC_ASCII
|| (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT
|| classnum == _CC_XDIGIT)))
{
- /* We usually have to worry about /d and /a affecting what
- * POSIX classes match, with special code needed for /d
- * because we won't know until runtime what all matches.
- * But there is no extra work needed under /u, and
- * [:ascii:] is unaffected by /a and /d; and :digit: and
- * :xdigit: don't have runtime differences under /d. So we
- * can special case these, and avoid some extra work below,
- * and at runtime. */
+ /* We usually have to worry about /d affecting what POSIX
+ * classes match, with special code needed because we won't
+ * know until runtime what all matches. But there is no
+ * extra work needed under /u and /a; and [:ascii:] is
+ * unaffected by /d; and :digit: and :xdigit: don't have
+ * runtime differences under /d. So we can special case
+ * these, and avoid some extra work below, and at runtime.
+ * */
_invlist_union_maybe_complement_2nd(
simple_posixes,
- PL_XPosix_ptrs[classnum],
+ ((AT_LEAST_ASCII_RESTRICTED)
+ ? PL_Posix_ptrs[classnum]
+ : PL_XPosix_ptrs[classnum]),
namedclass % 2 != 0,
&simple_posixes);
}
char *save_end = RExC_end;
char *save_parse = RExC_parse;
char *save_start = RExC_start;
- STRLEN prefix_end = 0; /* We copy the character class after a
- prefix supplied here. This is the size
- + 1 of that prefix */
+ Size_t constructed_prefix_len = 0; /* This gives the length of the
+ constructed portion of the
+ substitute parse. */
bool first_time = TRUE; /* First multi-char occurrence doesn't get
a "|" */
I32 reg_flags;
assert(! invert);
- assert(RExC_precomp_adj == 0); /* Only one level of recursion allowed */
+ /* Only one level of recursion allowed */
+ assert(RExC_copy_start_in_constructed == RExC_precomp);
#if 0 /* Have decided not to deal with multi-char folds in inverted classes,
because too confusing */
if (invert) {
- sv_catpv(substitute_parse, "(?:");
+ sv_catpvs(substitute_parse, "(?:");
}
#endif
&PL_sv_undef)
{
if (! first_time) {
- sv_catpv(substitute_parse, "|");
+ sv_catpvs(substitute_parse, "|");
}
first_time = FALSE;
/* If the character class contains anything else besides these
* multi-character folds, have to include it in recursive parsing */
if (element_count) {
- sv_catpv(substitute_parse, "|[");
- prefix_end = SvCUR(substitute_parse);
+ sv_catpvs(substitute_parse, "|[");
+ constructed_prefix_len = SvCUR(substitute_parse);
sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
/* Put in a closing ']' only if not going off the end, as otherwise
* we are adding something that really isn't there */
if (RExC_parse < RExC_end) {
- sv_catpv(substitute_parse, "]");
+ sv_catpvs(substitute_parse, "]");
}
}
- sv_catpv(substitute_parse, ")");
+ sv_catpvs(substitute_parse, ")");
#if 0
if (invert) {
/* This is a way to get the parse to skip forward a whole named
* sequence instead of matching the 2nd character when it fails the
* first */
- sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
+ sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
}
#endif
/* Set up the data structure so that any errors will be properly
* reported. See the comments at the definition of
* REPORT_LOCATION_ARGS for details */
- RExC_precomp_adj = orig_parse - RExC_precomp;
- RExC_start = RExC_parse = SvPV(substitute_parse, len);
- RExC_adjusted_start = RExC_start + prefix_end;
+ RExC_copy_start_in_input = (char *) orig_parse;
+ RExC_start = RExC_parse = SvPV(substitute_parse, len);
+ RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
RExC_end = RExC_parse + len;
RExC_in_multi_char_class = 1;
- RExC_emit = (regnode *)orig_emit;
ret = reg(pRExC_state, 1, ®_flags, depth+1);
/* And restore so can parse the rest of the pattern */
RExC_parse = save_parse;
- RExC_start = RExC_adjusted_start = save_start;
- RExC_precomp_adj = 0;
+ RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
RExC_end = save_end;
RExC_in_multi_char_class = 0;
SvREFCNT_dec_NN(multi_char_matches);
* an optimization */
if (op != END) {
- /* Throw away this ANYOF regnode, and emit the calculated one,
+ /* Emit the calculated regnode,
* which should correspond to the beginning, not current, state of
* the parse */
const char * cur_parse = RExC_parse;
RExC_parse = (char *)orig_parse;
- if ( SIZE_ONLY) {
- if (! LOC) {
-
- /* To get locale nodes to not use the full ANYOF size would
- * require moving the code above that writes the portions
- * of it that aren't in other nodes to after this point.
- * e.g. ANYOF_POSIXL_SET */
- RExC_size = orig_size;
- }
- }
- else {
- RExC_emit = (regnode *)orig_emit;
if (PL_regkind[op] == POSIXD) {
if (op == POSIXL) {
RExC_contains_locale = 1;
op += NPOSIXD - POSIXD;
}
}
- }
ret = reg_node(pRExC_state, op);
}
}
- if (SIZE_ONLY)
+ /* Assume we are going to generate an ANYOF-type node. The node type
+ * depends on whether any POSIX classes were recorded in 'posixl', and
+ * otherwise on whether LOC is set. */
+ op = (posixl)
+ ? ANYOFPOSIXL
+ : (LOC)
+ ? ANYOFL
+ : ANYOF;
+ ret = reganode(pRExC_state, op, 0);
+
+ if (SIZE_ONLY) {
+ /* The sizing pass must account for exactly as many regnodes as the
+ * emitting pass advances RExC_emit by just below */
+ RExC_size += (op == ANYOFPOSIXL) ? ANYOF_POSIXL_SKIP : ANYOF_SKIP;
return ret;
+ }
+
/****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
+ RExC_emit += (op == ANYOFPOSIXL) ? ANYOF_POSIXL_SKIP : ANYOF_SKIP;
+
+ /* Copy the flags and POSIX-class bits accumulated during parsing into the
+ * newly created node */
+ ANYOF_FLAGS(ret) = anyof_flags;
+ if (posixl) {
+ ANYOF_POSIXL_SET_TO_BITMAP(ret, posixl);
+ }
/* If folding, we calculate all characters that could fold to or from the
* ones already on the list */
_invlist_intersection(PL_utf8_foldable, cp_foldable_list,
&fold_intersection);
- /* The folds for all the Latin1 characters are hard-coded into this
- * program, but we have to go out to disk to get the others. */
- if (invlist_highest(cp_foldable_list) >= 256) {
-
- /* This is a hash that for a particular fold gives all
- * characters that are involved in it */
- if (! PL_utf8_foldclosures) {
- _load_PL_utf8_foldclosures();
- }
- }
-
/* Now look at the foldable characters in this class individually */
invlist_iterinit(fold_intersection);
while (invlist_iternext(fold_intersection, &start, &end)) {
UV j;
+ UV folded;
/* Look at every character in the range */
for (j = start; j <= end; j++) {
U8 foldbuf[UTF8_MAXBYTES_CASE+1];
STRLEN foldlen;
- SV** listp;
+ unsigned int k;
+ Size_t folds_to_count;
+ unsigned int first_folds_to;
+ const unsigned int * remaining_folds_to_list;
if (j < 256) {
* rules hard-coded for it. First, get its fold. This is
* the simple fold, as the multi-character folds have been
* handled earlier and separated out */
- _to_uni_fold_flags(j, foldbuf, &foldlen,
+ folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
(ASCII_FOLD_RESTRICTED)
? FOLD_FLAGS_NOMIX_ASCII
: 0);
- /* Single character fold of above Latin1. Add everything in
- * its fold closure to the list that this node should match.
- * The fold closures data structure is a hash with the keys
- * being the UTF-8 of every character that is folded to, like
- * 'k', and the values each an array of all code points that
- * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
- * Multi-character folds are not included */
- if ((listp = hv_fetch(PL_utf8_foldclosures,
- (char *) foldbuf, foldlen, FALSE)))
- {
- AV* list = (AV*) *listp;
- IV k;
- for (k = 0; k <= av_tindex_skip_len_mg(list); k++) {
- SV** c_p = av_fetch(list, k, FALSE);
- UV c;
- assert(c_p);
-
- c = SvUV(*c_p);
-
- /* /aa doesn't allow folds between ASCII and non- */
- if ((ASCII_FOLD_RESTRICTED
- && (isASCII(c) != isASCII(j))))
- {
- continue;
- }
+ /* Single character fold of above Latin1. Add everything
+ * in its fold closure to the list that this node should
+ * match. */
+ folds_to_count = _inverse_folds(folded, &first_folds_to,
+ &remaining_folds_to_list);
+ for (k = 0; k <= folds_to_count; k++) {
+ UV c = (k == 0) /* First time through use itself */
+ ? folded
+ : (k == 1) /* 2nd time use, the first fold */
+ ? first_folds_to
+
+ /* Then the remaining ones */
+ : remaining_folds_to_list[k-2];
+
+ /* /aa doesn't allow folds between ASCII and non- */
+ if (( ASCII_FOLD_RESTRICTED
+ && (isASCII(c) != isASCII(j))))
+ {
+ continue;
+ }
- /* Folds under /l which cross the 255/256 boundary
- * are added to a separate list. (These are valid
- * only when the locale is UTF-8.) */
- if (c < 256 && LOC) {
- *use_list = add_cp_to_invlist(*use_list, c);
- continue;
- }
+ /* Folds under /l which cross the 255/256 boundary are
+ * added to a separate list. (These are valid only
+ * when the locale is UTF-8.) */
+ if (c < 256 && LOC) {
+ *use_list = add_cp_to_invlist(*use_list, c);
+ continue;
+ }
- if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
- {
- cp_list = add_cp_to_invlist(cp_list, c);
- }
- else {
- /* Similarly folds involving non-ascii Latin1
- * characters under /d are added to their list */
- has_upper_latin1_only_utf8_matches
- = add_cp_to_invlist(
- has_upper_latin1_only_utf8_matches,
- c);
- }
+ if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
+ {
+ cp_list = add_cp_to_invlist(cp_list, c);
+ }
+ else {
+ /* Similarly folds involving non-ascii Latin1
+ * characters under /d are added to their list */
+ has_upper_latin1_only_utf8_matches
+ = add_cp_to_invlist(
+ has_upper_latin1_only_utf8_matches,
+ c);
}
}
}
}
}
if (posixes || nposixes) {
-
- /* We have to adjust /a and /aa */
- if (AT_LEAST_ASCII_RESTRICTED) {
-
- /* Under /a and /aa, nothing above ASCII matches these */
- if (posixes) {
- _invlist_intersection(posixes,
- PL_XPosix_ptrs[_CC_ASCII],
- &posixes);
- }
-
- /* Under /a and /aa, everything above ASCII matches these
- * complements */
- if (nposixes) {
- _invlist_union_complement_2nd(nposixes,
- PL_XPosix_ptrs[_CC_ASCII],
- &nposixes);
- }
- }
-
if (! DEPENDS_SEMANTICS) {
/* For everything but /d, we can just add the current 'posixes' and
*
* Handle the case where there something like \W separately */
if (nposixes) {
- SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
+ SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
/* A complemented posix class matches all upper Latin1
* characters if not in UTF-8. And it matches just certain
* at compile time. Besides not inverting folded locale now, we can't
* invert if there are things such as \w, which aren't known until runtime
* */
- if (cp_list
- && invert
+ if ( cp_list
+ && invert
&& OP(ret) != ANYOFD
&& ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
SV *si = NULL; /* Input swash initialization string */
SV* invlist = NULL;
- RXi_GET_DECL(prog,progi);
+ RXi_GET_DECL(prog, progi);
const struct reg_data * const data = prog ? progi->data : NULL;
PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
));
}
else if (! *output_invlist) {
- *output_invlist = invlist_clone(invlist);
+ *output_invlist = invlist_clone(invlist, NULL);
}
else {
_invlist_union(*output_invlist, invlist, output_invlist);
* RExC_emit */
regnode * const ret = RExC_emit;
+
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGNODE_GUTS;
/*
- reginsert - insert an operator in front of already-emitted operand
*
-* Means relocating the operand.
+* That means that on exit 'operand' is the offset of the newly inserted
+* operator, and the original operand has been relocated.
*
* IMPORTANT NOTE - it is the *callers* responsibility to correctly
* set up NEXT_OFF() of the inserted node if needed. Something like this:
* if (PASS2)
* NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
*
-* ALSO NOTE - operand->flags will be set to 0 as well.
+* ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
*/
STATIC void
S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *operand, U32 depth)
PERL_UNUSED_CONTEXT;
PERL_UNUSED_ARG(depth);
/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
- DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
+ DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
if (SIZE_ONLY) {
RExC_size += size;
return;
* regex, it can't move. RExC_close_parens[0] is the end
* of the regex, it *can* move. */
if ( paren && RExC_open_parens[paren] >= operand ) {
- /*DEBUG_PARSE_FMT("open"," - %d",size);*/
+ /*DEBUG_PARSE_FMT("open"," - %d", size);*/
RExC_open_parens[paren] += size;
} else {
/*DEBUG_PARSE_FMT("open"," - %s","ok");*/
}
if ( RExC_close_parens[paren] >= operand ) {
- /*DEBUG_PARSE_FMT("close"," - %d",size);*/
+ /*DEBUG_PARSE_FMT("close"," - %d", size);*/
RExC_close_parens[paren] += size;
} else {
/*DEBUG_PARSE_FMT("close"," - %s","ok");*/
if (RExC_offsets) { /* MJD 20010112 */
MJD_OFFSET_DEBUG(
("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
- "reg_insert",
+ "reginsert",
__LINE__,
PL_reg_name[op],
- (UV)(dst - RExC_emit_start) > RExC_offsets[0]
+ (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
? "Overwriting end of array!\n" : "OK",
- (UV)(src - RExC_emit_start),
- (UV)(dst - RExC_emit_start),
+ (UV)REGNODE_OFFSET(src),
+ (UV)REGNODE_OFFSET(dst),
(UV)RExC_offsets[0]));
- Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
- Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
+ Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
+ Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
}
#endif
}
"reginsert",
__LINE__,
PL_reg_name[op],
- (UV)(place - RExC_emit_start) > RExC_offsets[0]
+ (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
? "Overwriting end of array!\n" : "OK",
- (UV)(place - RExC_emit_start),
+ (UV)REGNODE_OFFSET(place),
(UV)(RExC_parse - RExC_start),
(UV)RExC_offsets[0]));
Set_Node_Offset(place, RExC_parse);
}
#endif
src = NEXTOPER(place);
- place->flags = 0;
- FILL_ADVANCE_NODE(place, op);
+ FLAGS(place) = 0;
+ FILL_NODE(place, op);
+
+ /* Zero out any arguments in the new node */
Zero(src, offset, regnode);
}
STATIC U8
S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
- const regnode *val,U32 depth)
+ const regnode *val, U32 depth)
{
regnode *scan;
U8 exact = PSEUDO;
for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
if (flags & (1<<bit)) {
if (!set++ && lead)
- Perl_re_printf( aTHX_ "%s",lead);
- Perl_re_printf( aTHX_ "%s ",PL_reg_intflags_name[bit]);
+ Perl_re_printf( aTHX_ "%s", lead);
+ Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]);
}
}
if (lead) {
if (set)
Perl_re_printf( aTHX_ "\n");
else
- Perl_re_printf( aTHX_ "%s[none-set]\n",lead);
+ Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
}
}
continue;
}
if (!set++ && lead)
- Perl_re_printf( aTHX_ "%s",lead);
- Perl_re_printf( aTHX_ "%s ",PL_reg_extflags_name[bit]);
+ Perl_re_printf( aTHX_ "%s", lead);
+ Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]);
}
}
if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
if (!set++ && lead) {
- Perl_re_printf( aTHX_ "%s",lead);
+ Perl_re_printf( aTHX_ "%s", lead);
}
switch (cs) {
case REGEX_UNICODE_CHARSET:
if (set)
Perl_re_printf( aTHX_ "\n");
else
- Perl_re_printf( aTHX_ "%s[none-set]\n",lead);
+ Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
}
}
#endif
int i;
SV * const sv = sv_newmortal();
SV *dsv= sv_newmortal();
- RXi_GET_DECL(r,ri);
+ RXi_GET_DECL(r, ri);
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGDUMP;
Perl_re_printf( aTHX_ "with eval ");
Perl_re_printf( aTHX_ "\n");
DEBUG_FLAGS_r({
- regdump_extflags("r->extflags: ",r->extflags);
- regdump_intflags("r->intflags: ",r->intflags);
+ regdump_extflags("r->extflags: ", r->extflags);
+ regdump_intflags("r->intflags: ", r->intflags);
});
#else
PERL_ARGS_ASSERT_REGDUMP;
{
#ifdef DEBUGGING
int k;
- RXi_GET_DECL(prog,progi);
+ RXi_GET_DECL(prog, progi);
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGPROP;
const reg_trie_data * const trie
= (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
- Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
+ Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
DEBUG_TRIE_COMPILE_r({
if (trie->jump)
sv_catpvs(sv, "(JUMP)");
const bool inverted = flags & ANYOF_INVERT;
- if (OP(o) == ANYOFL) {
+ if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
sv_catpvs(sv, "{utf8-locale-reqd}");
}
sv_catpvs(sv, "{");
}
else if (do_sep) {
- Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
}
sv_catsv(sv, unresolved);
if (inverted) {
/* This is output in a separate [] */
if (do_sep) {
- Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
}
/* And, for easy of understanding, it is shown in the
U8 index = FLAGS(o) * 2;
if (index < C_ARRAY_LENGTH(anyofs)) {
if (*anyofs[index] != '[') {
- sv_catpv(sv, "[");
+ sv_catpvs(sv, "[");
}
sv_catpv(sv, anyofs[index]);
if (*anyofs[index] != '[') {
- sv_catpv(sv, "]");
+ sv_catpvs(sv, "]");
}
}
else {
"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
PL_colors[4],
RX_UTF8(r) ? "utf8 " : "",
- PL_colors[5],PL_colors[0],
+ PL_colors[5], PL_colors[0],
s,
PL_colors[1],
(strlen(s) > PL_dump_re_max_len ? "..." : ""));
drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
SvREFCNT_inc_void(drx->qr_anoncv);
if (srx->recurse_locinput)
- Newx(drx->recurse_locinput,srx->nparens + 1,char *);
+ Newx(drx->recurse_locinput, srx->nparens + 1, char *);
return dsv;
}
Perl_regfree_internal(pTHX_ REGEXP * const rx)
{
struct regexp *const r = ReANY(rx);
- RXi_GET_DECL(r,ri);
+ RXi_GET_DECL(r, ri);
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGFREE_INTERNAL;
RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
- PL_colors[4],PL_colors[5],s);
+ PL_colors[4], PL_colors[5], s);
}
});
+
#ifdef RE_TRACK_PATTERN_OFFSETS
if (ri->u.offsets)
Safefree(ri->u.offsets); /* 20010421 MJD */
Safefree(ri);
}
-#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
-#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
-#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
+#define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t))
+#define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t))
+#define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
/*
re_dup_guts - duplicate a regexp.
RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
if (r->recurse_locinput)
- Newx(ret->recurse_locinput,r->nparens + 1,char *);
+ Newx(ret->recurse_locinput, r->nparens + 1, char *);
if (ret->pprivate)
- RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
+ RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
if (RX_MATCH_COPIED(dstr))
ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
struct regexp *const r = ReANY(rx);
regexp_internal *reti;
int len;
- RXi_GET_DECL(r,ri);
+ RXi_GET_DECL(r, ri);
PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
}
#else
- SetProgLen(reti,len);
+ SetProgLen(reti, len);
#endif
return (void*)reti;
#endif
STATIC void
-S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
+S_re_croak2(pTHX_ bool utf8, const char* pat1, const char* pat2,...)
{
va_list args;
STRLEN l1 = strlen(pat1);
va_start(args, pat2);
msv = vmess(buf, &args);
va_end(args);
- message = SvPV_const(msv,l1);
+ message = SvPV_const(msv, l1);
if (l1 > 512)
l1 = 512;
Copy(message, buf, l1 , char);
* don't change the caller's list) */
if (nonbitmap_invlist) {
assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
- invlist = invlist_clone(nonbitmap_invlist);
+ invlist = invlist_clone(nonbitmap_invlist, NULL);
}
else { /* Worst case size is every other code point is matched */
invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
/* And this flag for matching all non-ASCII 0xFF and below */
if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
{
- not_utf8 = invlist_clone(PL_UpperLatin1);
+ not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
}
}
- else if (OP(node) == ANYOFL) {
+ else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
/* If either of these flags are set, what matches isn't
* determinable except during execution, so don't know enough here
posixes = newSVpvs("");
for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
- if (ANYOF_POSIXL_TEST(node,i)) {
+ if (ANYOF_POSIXL_TEST(node, i)) {
sv_catpv(posixes, anyofs[i]);
}
}
/* Since this list is passed in, we have to make a copy before
* modifying it */
- only_utf8_locale = invlist_clone(only_utf8_locale_invlist);
+ only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
_invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
const regnode *next;
const regnode *optstart= NULL;
- RXi_GET_DECL(r,ri);
+ RXi_GET_DECL(r, ri);
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_DUMPUNTIL;
#ifdef DEBUG_DUMPUNTIL
- Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n",indent,node-start,
- last ? last-start : 0,plast ? plast-start : 0);
+ Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start,
+ last ? last-start : 0, plast ? plast-start : 0);
#endif
if (plast && plast < last)
I32 word_idx;
SvPVCLEAR(sv);
for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
- SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
+ SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
Perl_re_indentf( aTHX_ "%s ",
indent+3,
#endif /* DEBUGGING */
+#ifndef PERL_IN_XSUB_RE
+
+#include "uni_keywords.h"
+
+void
+Perl_init_uniprops(pTHX)
+{
+ /* Set up the inversion list global variables.
+ *
+ * Every PL_* variable assigned below receives an inversion list built by
+ * _new_invlist_C_array() from a static C array. Most of those arrays are
+ * reached through uni_prop_ptrs[], indexed by a UNI_* constant; the rest
+ * are named directly. NOTE(review): the arrays and UNI_* constants are
+ * presumably supplied by the "uni_keywords.h" include just above and by
+ * other generated headers -- confirm.
+ *
+ * Takes no arguments other than the interpreter context and returns
+ * nothing; its only visible effect is the assignments themselves. */
+
+ /* Character-class tables indexed by _CC_* class number: the XPOSIX*
+ * (plus ASCII, CASED and VERTSPACE) property arrays */
+ PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
+ PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
+ PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
+ PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
+ PL_XPosix_ptrs[_CC_CASED] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
+ PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
+ PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
+ PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
+ PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
+ PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
+ PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
+ PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
+ PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
+ PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
+ PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
+ PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
+
+ /* The same class numbers, but built from the POSIX* property arrays */
+ PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
+ PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
+ PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
+ PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
+ /* Not a new list: this slot shares the pointer stored for _CC_ALPHA just
+ * above, so it must stay after that assignment. NOTE(review): whatever
+ * frees these tables must take care not to free the shared list twice --
+ * confirm against the teardown code */
+ PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
+ PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
+ PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
+ PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
+ PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
+ PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
+ PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
+ PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
+ PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
+ /* This is the only slot that gets no list at all */
+ PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
+ PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
+ PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
+
+ /* Lists built from the _Perl_GCB/SB/WB/LB/SCX arrays. NOTE(review):
+ * presumably the Unicode break-property and Script_Extensions tables --
+ * confirm */
+ PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
+ PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
+ PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
+ PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
+ PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
+
+ /* Code point range lists, taken from directly-named arrays */
+ PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
+ PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
+ PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
+
+ PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
+
+ /* Lists built from the _Perl_IDStart/_Perl_IDCont property arrays */
+ PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
+ PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
+
+ /* Lists built from the _Perl_Charname_Begin/_Continue property arrays */
+ PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
+ PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
+
+ /* Lists built from the fold-related property arrays */
+ PL_utf8_foldable = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
+ PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
+ UNI__PERL_FOLDS_TO_MULTI_CHAR]);
+ PL_NonL1NonFinalFold = _new_invlist_C_array(
+ NonL1_Perl_Non_Final_Folds_invlist);
+
+ /* Lists built from the case mapping, case folding and fold-closure
+ * arrays, plus the 'M' property array */
+ PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
+ PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
+ PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
+ PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
+ PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
+ PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
+ PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
+
+ /* The below are used only by deprecated functions. They could be removed */
+ PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
+ PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
+ PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
+}
+
+SV *
+Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len,
+                                const bool to_fold, bool * invert)
+{
+    /* Parse the interior meat of \p{} passed to this in 'name' with length
+     * 'name_len', and return an inversion list if a property with 'name' is
+     * found, or NULL if not.  'name' points to the input with leading and
+     * trailing space trimmed.  'to_fold' indicates if /i is in effect.
+     *
+     * 'name' is only ever examined within its first 'name_len' bytes, so it
+     * need not be NUL-terminated.
+     *
+     * When the return is an inversion list, '*invert' will be set to a boolean
+     * indicating if it should be inverted or not.  '*invert' is not written
+     * when the return is NULL.
+     *
+     * A deprecation warning may be raised as a side effect if the property
+     * that was found is flagged as deprecated.
+     *
+     * This currently doesn't handle all cases.  A NULL return indicates the
+     * caller should try a different approach
+     */
+
+    char* lookup_name;
+    bool stricter = FALSE;
+    bool is_nv_type = FALSE;    /* nv= or numeric_value=, or possibly one
+                                   of the cjk numeric properties (though
+                                   it requires extra effort to compile
+                                   them) */
+    unsigned int i;
+    unsigned int j = 0, lookup_len;
+    int equals_pos = -1;        /* Offset in 'lookup_name' just past the
+                                   '=', or negative if none */
+    int slash_pos = -1;         /* Offset in 'lookup_name' just past the
+                                   '/', or negative if none */
+    int table_index = 0;
+    bool starts_with_In_or_Is = FALSE;
+    Size_t lookup_offset = 0;
+
+    PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
+
+    /* The input will be modified into 'lookup_name'.  This buffer is never
+     * NUL-terminated; 'j' is the number of bytes written to it so far.
+     * NOTE(review): SAVEFREEPV presumably arranges for the buffer to be freed
+     * later, which is why none of the returns below free it -- confirm */
+    Newx(lookup_name, name_len, char);
+    SAVEFREEPV(lookup_name);
+
+    /* Parse the input. */
+    for (i = 0; i < name_len; i++) {
+        char cur = name[i];
+
+        /* These characters can be freely ignored in most situations.  Later it
+         * may turn out we shouldn't have ignored them, and we have to reparse,
+         * but we don't have enough information yet to make that decision */
+        if (cur == '-' || cur == '_' || isSPACE_A(cur)) {
+            continue;
+        }
+
+        /* Case differences are also ignored.  Our lookup routine assumes
+         * everything is lowercase */
+        if (isUPPER_A(cur)) {
+            lookup_name[j++] = toLOWER(cur);
+            continue;
+        }
+
+        /* A double colon is either an error, or a package qualifier to a
+         * subroutine user-defined property; neither of which do we currently
+         * handle
+         *
+         * But a single colon is a synonym for '=' */
+        if (cur == ':') {
+            if (i < name_len - 1 && name[i+1] == ':') {
+                return NULL;
+            }
+            cur = '=';
+        }
+
+        /* Otherwise, this character is part of the name. */
+        lookup_name[j++] = cur;
+
+        /* Only the equals sign needs further processing */
+        if (cur == '=') {
+            equals_pos = j; /* Note the offset just past it in 'lookup_name' */
+            break;
+        }
+    }
+
+    /* Here, we are either done with the whole property name, if it was simple;
+     * or are positioned just after the '=' if it is compound. */
+
+    if (equals_pos >= 0) {
+        assert(! stricter); /* We shouldn't have set this yet */
+
+        /* Space immediately after the '=' is ignored.  ('i' still indexes the
+         * '=' itself because the loop above was left with 'break') */
+        i++;
+        for (; i < name_len; i++) {
+            if (! isSPACE_A(name[i])) {
+                break;
+            }
+        }
+
+        /* Certain properties need special handling.  They may optionally be
+         * prefixed by 'is'.  Ignore that prefix for the purposes of checking
+         * if this is one of those properties.  Only the 'j' bytes written so
+         * far are valid, so that is the length to test against */
+        if (memBEGINPs(lookup_name, j, "is")) {
+            lookup_offset = 2;
+        }
+
+        /* Then check if it is one of these properties.  This is hard-coded
+         * because easier this way, and the list is unlikely to change.  There
+         * are several properties like this in the Unihan DB, which is unlikely
+         * to be compiled, and they all end with 'numeric'.  The interiors
+         * aren't checked for the precise property.  This would stop working if
+         * a cjk property were to be created that ended with 'numeric' and
+         * wasn't a numeric type.  ('j - 1' excludes the trailing '=') */
+        is_nv_type = memEQs(lookup_name + lookup_offset,
+                            j - 1 - lookup_offset, "numericvalue")
+                  || memEQs(lookup_name + lookup_offset,
+                            j - 1 - lookup_offset, "nv")
+                  || (   memENDPs(lookup_name + lookup_offset,
+                                  j - 1 - lookup_offset, "numeric")
+                      && (   memBEGINPs(lookup_name + lookup_offset,
+                                        j - 1 - lookup_offset, "cjk")
+                          || memBEGINPs(lookup_name + lookup_offset,
+                                        j - 1 - lookup_offset, "k")));
+        if (   is_nv_type
+            || memEQs(lookup_name + lookup_offset,
+                      j - 1 - lookup_offset, "canonicalcombiningclass")
+            || memEQs(lookup_name + lookup_offset,
+                      j - 1 - lookup_offset, "ccc")
+            || memEQs(lookup_name + lookup_offset,
+                      j - 1 - lookup_offset, "age")
+            || memEQs(lookup_name + lookup_offset,
+                      j - 1 - lookup_offset, "in")
+            || memEQs(lookup_name + lookup_offset,
+                      j - 1 - lookup_offset, "presentin"))
+        {
+            unsigned int k;
+
+            /* What makes these properties special is that the stuff after the
+             * '=' is a number.  Therefore, we can't throw away '-'
+             * willy-nilly, as those could be a minus sign.  Other stricter
+             * rules also apply.  However, these properties all can have the
+             * rhs not be a number, in which case they contain at least one
+             * alphabetic.  In those cases, the stricter rules don't apply.
+             * But the numeric type properties can have the alphas [Ee] to
+             * signify an exponent, and it is still a number with stricter
+             * rules.  So look for an alpha that signifies not-strict */
+            stricter = TRUE;
+            for (k = i; k < name_len; k++) {
+                if (   isALPHA_A(name[k])
+                    && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
+                {
+                    stricter = FALSE;
+                    break;
+                }
+            }
+        }
+
+        if (stricter) {
+
+            /* A number may have a leading '+' or '-'.  The latter is retained.
+             * The value may be empty (nothing follows the '='), in which case
+             * there is no character to look at */
+            if (i < name_len && name[i] == '+') {
+                i++;
+            }
+            else if (i < name_len && name[i] == '-') {
+                lookup_name[j++] = '-';
+                i++;
+            }
+
+            /* Skip leading zeros including single underscores separating the
+             * zeros, or between the final leading zero and the first other
+             * digit */
+            for (; i < name_len - 1; i++) {
+                if (   name[i] != '0'
+                    && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
+                {
+                    break;
+                }
+            }
+        }
+    }
+    else {  /* No '=' */
+
+        /* We are now in a position to determine if this property should have
+         * been parsed using stricter rules.  Only a few are like that, and
+         * unlikely to change. */
+        if (   memBEGINPs(lookup_name, j, "perl")
+            && memNEs(lookup_name + 4, j - 4, "space")
+            && memNEs(lookup_name + 4, j - 4, "word"))
+        {
+            stricter = TRUE;
+
+            /* We set the inputs back to 0 and the code below will reparse,
+             * using strict */
+            i = j = 0;
+        }
+    }
+
+    /* Here, we have either finished the property, or are positioned to parse
+     * the remainder, and we know if stricter rules apply.  Finish out, if not
+     * already done */
+    for (; i < name_len; i++) {
+        char cur = name[i];
+
+        /* In all instances, case differences are ignored, and we normalize to
+         * lowercase */
+        if (isUPPER_A(cur)) {
+            lookup_name[j++] = toLOWER(cur);
+            continue;
+        }
+
+        /* An underscore is skipped, but not under strict rules unless it
+         * separates two digits */
+        if (cur == '_') {
+            if (    stricter
+                && (     i == 0 || (int) i == equals_pos || i == name_len- 1
+                    || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
+            {
+                lookup_name[j++] = '_';
+            }
+            continue;
+        }
+
+        /* Hyphens are skipped except under strict */
+        if (cur == '-' && ! stricter)  {
+            continue;
+        }
+
+        /* XXX Bug in documentation.  It says white space skipped adjacent to
+         * non-word char.  Maybe we should, but shouldn't skip it next to a dot
+         * in a number */
+        if (isSPACE_A(cur) && ! stricter) {
+            continue;
+        }
+
+        lookup_name[j++] = cur;
+
+        /* Unless this is a non-trailing slash, we are done with it */
+        if (i >= name_len - 1 || cur != '/') {
+            continue;
+        }
+
+        slash_pos = j;
+
+        /* A slash in the 'numeric value' property indicates that what follows
+         * is a denominator.  It can have a leading '+' and '0's that should be
+         * skipped.  But we have never allowed a negative denominator, so treat
+         * a minus like every other character.  (No need to rule out a second
+         * '/', as that won't match anything anyway */
+        if (is_nv_type) {
+            i++;
+            if (i < name_len && name[i] == '+') {
+                i++;
+            }
+
+            /* Skip leading zeros including underscores separating digits */
+            for (; i < name_len - 1; i++) {
+                if (   name[i] != '0'
+                    && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
+                {
+                    break;
+                }
+            }
+
+            /* If the input ran out (it ended in "/+"), there is no denominator
+             * at all, so this can't name a property.  Bail out rather than
+             * copy a byte from beyond the end of 'name' */
+            if (i >= name_len) {
+                return NULL;
+            }
+
+            /* Store the first real character in the denominator */
+            lookup_name[j++] = name[i];
+        }
+    }
+
+    /* Here, we are completely done parsing the input 'name', and 'lookup_name'
+     * contains a copy, normalized.
+     *
+     * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
+     * different from without the underscores.  */
+    if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
+           || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
+        && UNLIKELY(name[name_len-1] == '_'))
+    {
+        lookup_name[j++] = '&';
+    }
+    else if (name_len > 2 && name[0] == 'I' && (   name[1] == 'n'
+                                                || name[1] == 's'))
+    {
+
+        /* Also, if the original input began with 'In' or 'Is', it could be a
+         * subroutine call instead of a property name, which currently isn't
+         * handled by this function.  Subroutine calls can't happen if there is
+         * an '=' in the name */
+        if (equals_pos < 0 && get_cvn_flags(name, name_len, GV_NOTQUAL) != NULL)
+        {
+            return NULL;
+        }
+
+        starts_with_In_or_Is = TRUE;
+    }
+
+    lookup_len = j;     /* Use a more mnemonic name starting here */
+
+    /* Get the index into our pointer table of the inversion list corresponding
+     * to the property */
+    table_index = match_uniprop((U8 *) lookup_name, lookup_len);
+
+    /* If it didn't find the property */
+    if (table_index == 0) {
+
+        /* If didn't find the property, we try again stripping off any initial
+         * 'In' or 'Is'.  The saved offsets are relative to the start of
+         * 'lookup_name', so they shift along with it */
+        if (starts_with_In_or_Is) {
+            lookup_name += 2;
+            lookup_len -= 2;
+            equals_pos -= 2;
+            slash_pos -= 2;
+
+            table_index = match_uniprop((U8 *) lookup_name, lookup_len);
+        }
+
+        if (table_index == 0) {
+            char * canonical;
+
+            /* If not found, and not a numeric type property, isn't a legal
+             * property */
+            if (! is_nv_type) {
+                return NULL;
+            }
+
+            /* But the numeric type properties need more work to decide.  What
+             * we do is make sure we have the number in canonical form and look
+             * that up. */
+
+            if (slash_pos < 0) {    /* No slash */
+
+                /* When it isn't a rational, take the input, convert it to a
+                 * NV, then create a canonical string representation of that
+                 * NV. */
+
+                NV value;
+
+                /* Nothing after the '=' can't be a number.  Reject it here
+                 * rather than pass a zero length, pointing at the end of an
+                 * unterminated buffer, to the conversion below */
+                if (lookup_len <= (unsigned int) equals_pos) {
+                    return NULL;
+                }
+
+                /* Get the value.  The whole of the rest of the string must
+                 * have been consumed for it to be acceptable */
+                if (my_atof3(lookup_name + equals_pos, &value,
+                             lookup_len - equals_pos)
+                          != lookup_name + lookup_len)
+                {
+                    return NULL;
+                }
+
+                /* If the value is an integer, the canonical value is integral */
+                if (Perl_ceil(value) == value) {
+                    canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
+                                                equals_pos, lookup_name, value);
+                }
+                else {  /* Otherwise, it is %e with a known precision */
+                    char * exp_ptr;
+
+                    canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
+                                                equals_pos, lookup_name,
+                                                PL_E_FORMAT_PRECISION, value);
+
+                    /* The exponent generated is expecting two digits, whereas
+                     * %e on some systems will generate three.  Remove leading
+                     * zeros in excess of 2 from the exponent.  We start
+                     * looking for them after the '=' */
+                    exp_ptr = strchr(canonical + equals_pos, 'e');
+                    if (exp_ptr) {
+                        char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
+                        SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
+
+                        assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
+
+                        if (excess_exponent_len > 0) {
+                            SSize_t leading_zeros = strspn(cur_ptr, "0");
+                            SSize_t excess_leading_zeros
+                                    = MIN(leading_zeros, excess_exponent_len);
+                            if (excess_leading_zeros > 0) {
+                                Move(cur_ptr + excess_leading_zeros,
+                                     cur_ptr,
+                                     strlen(cur_ptr) - excess_leading_zeros
+                                       + 1,  /* Copy the NUL as well */
+                                     char);
+                            }
+                        }
+                    }
+                }
+            }
+            else {  /* Has a slash.  Create a rational in canonical form  */
+                UV numerator, denominator, gcd, trial;
+                const char * end_ptr;
+                const char * sign = "";
+
+                /* We can't just find the numerator, denominator, and do the
+                 * division, then use the method above, because that is
+                 * inexact.  And the input could be a rational that is within
+                 * epsilon (given our precision) of a valid rational, and would
+                 * then incorrectly compare valid.
+                 *
+                 * We're only interested in the part after the '=' */
+                const char * this_lookup_name = lookup_name + equals_pos;
+                lookup_len -= equals_pos;
+                slash_pos -= equals_pos;
+
+                /* Handle any leading minus */
+                if (this_lookup_name[0] == '-') {
+                    sign = "-";
+                    this_lookup_name++;
+                    lookup_len--;
+                    slash_pos--;
+                }
+
+                /* Convert the numerator to numeric */
+                end_ptr = this_lookup_name + slash_pos;
+                if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
+                    return NULL;
+                }
+
+                /* It better have included all characters before the slash */
+                if (*end_ptr != '/') {
+                    return NULL;
+                }
+
+                /* Set to look at just the denominator */
+                this_lookup_name += slash_pos;
+                lookup_len -= slash_pos;
+                end_ptr = this_lookup_name + lookup_len;
+
+                /* Convert the denominator to numeric */
+                if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
+                    return NULL;
+                }
+
+                /* It better be the rest of the characters, and don't divide by
+                 * 0 */
+                if (   end_ptr != this_lookup_name + lookup_len
+                    || denominator == 0)
+                {
+                    return NULL;
+                }
+
+                /* Get the greatest common divisor using
+                   http://en.wikipedia.org/wiki/Euclidean_algorithm */
+                gcd = numerator;
+                trial = denominator;
+                while (trial != 0) {
+                    UV temp = trial;
+                    trial = gcd % trial;
+                    gcd = temp;
+                }
+
+                /* If already in lowest possible terms, we have already tried
+                 * looking this up */
+                if (gcd == 1) {
+                    return NULL;
+                }
+
+                /* Reduce the rational, which should put it in canonical form.
+                 * Then look it up */
+                numerator /= gcd;
+                denominator /= gcd;
+
+                canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
+                        equals_pos, lookup_name, sign, numerator, denominator);
+            }
+
+            /* Here, we have the number in canonical form.  Try that */
+            table_index = match_uniprop((U8 *) canonical, strlen(canonical));
+            if (table_index == 0) {
+                return NULL;
+            }
+        }
+    }
+
+    /* The return is an index into a table of ptrs.  A negative return
+     * signifies that the real index is the absolute value, but the result
+     * needs to be inverted */
+    if (table_index < 0) {
+        *invert = TRUE;
+        table_index = -table_index;
+    }
+    else {
+        *invert = FALSE;
+    }
+
+    /* Out-of band indices indicate a deprecated property.  The proper index is
+     * modulo it with the table size.  And dividing by the table size yields
+     * an offset into a table constructed to contain the corresponding warning
+     * message */
+    if (table_index > MAX_UNI_KEYWORD_INDEX) {
+        Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
+        table_index %= MAX_UNI_KEYWORD_INDEX;
+        Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
+                "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
+                (int) name_len, name, deprecated_property_msgs[warning_offset]);
+    }
+
+    /* In a few properties, a different property is used under /i.  These are
+     * unlikely to change, so are hard-coded here. */
+    if (to_fold) {
+        if (   table_index == UNI_XPOSIXUPPER
+            || table_index == UNI_XPOSIXLOWER
+            || table_index == UNI_TITLE)
+        {
+            table_index = UNI_CASED;
+        }
+        else if (   table_index == UNI_UPPERCASELETTER
+                 || table_index == UNI_LOWERCASELETTER
+#  ifdef UNI_TITLECASELETTER   /* Missing from early Unicodes */
+                 || table_index == UNI_TITLECASELETTER
+#  endif
+        ) {
+            table_index = UNI_CASEDLETTER;
+        }
+        else if (  table_index == UNI_POSIXUPPER
+                || table_index == UNI_POSIXLOWER)
+        {
+            table_index = UNI_POSIXALPHA;
+        }
+    }
+
+    /* Create and return the inversion list */
+    return _new_invlist_C_array(uni_prop_ptrs[table_index]);
+}
+
+#endif
+
/*
* ex: set ts=8 sts=4 sw=4 et:
*/