#define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
+#ifndef MAX
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#endif
+
/* this is a chain of data about sub patterns we are processing that
need to be handled separately/specially in study_chunk. Its so
we can simulate recursion without losing state. */
U32 flags; /* RXf_* are we folding, multilining? */
U32 pm_flags; /* PMf_* stuff from the calling PMOP */
char *precomp; /* uncompiled string. */
+ char *precomp_end; /* pointer to end of uncompiled string. */
REGEXP *rx_sv; /* The SV that is the regexp. */
regexp *rx; /* perl core regexp structure */
regexp_internal *rxi; /* internal data for regexp object
char *start; /* Start of input for compile */
char *end; /* End of input for compile */
char *parse; /* Input-scan pointer. */
+ char *adjusted_start; /* 'start', adjusted. See code use */
+ STRLEN precomp_adj; /* an offset beyond precomp. See code use */
SSize_t whilem_seen; /* number of WHILEM in this expr */
regnode *emit_start; /* Start of emitted-code area */
regnode *emit_bound; /* First regnode outside of the
scan_frame *frame_head;
scan_frame *frame_last;
U32 frame_count;
- U32 strict;
#ifdef ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
#endif
bool seen_unfolded_sharp_s;
+ bool strict;
};
#define RExC_flags (pRExC_state->flags)
#define RExC_pm_flags (pRExC_state->pm_flags)
#define RExC_precomp (pRExC_state->precomp)
+#define RExC_precomp_adj (pRExC_state->precomp_adj)
+#define RExC_adjusted_start (pRExC_state->adjusted_start)
+#define RExC_precomp_end (pRExC_state->precomp_end)
#define RExC_rx_sv (pRExC_state->rx_sv)
#define RExC_rx (pRExC_state->rx)
#define RExC_rxi (pRExC_state->rxi)
#define REPORT_LOCATION " in regex; marked by " MARKER1 \
" in m/%"UTF8f MARKER2 "%"UTF8f"/"
-#define REPORT_LOCATION_ARGS(offset) \
- UTF8fARG(UTF, offset, RExC_precomp), \
- UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
+/* The code in this file in places uses one level of recursion with parsing
+ * rebased to an alternate string constructed by us in memory. This can take
+ * the form of something that is completely different from the input, or
+ * something that uses the input as part of the alternate. In the first case,
+ * there should be no possibility of an error, as we are in complete control of
+ * the alternate string. But in the second case we don't control the input
+ * portion, so there may be errors in that. Here's an example:
+ * /[abc\x{DF}def]/ui
+ * is handled specially because \x{df} folds to a sequence of more than one
+ * character, 'ss'. What is done is to create and parse an alternate string,
+ * which looks like this:
+ * /(?:\x{DF}|[abc\x{DF}def])/ui
+ * where it uses the input unchanged in the middle of something it constructs,
+ * which is a branch for the DF outside the character class, and clustering
+ * parens around the whole thing. (It knows enough to skip the DF inside the
+ * class while in this substitute parse.) 'abc' and 'def' may have errors that
+ * need to be reported. The general situation looks like this:
+ *
+ * sI tI xI eI
+ * Input: ----------------------------------------------------
+ * Constructed: ---------------------------------------------------
+ * sC tC xC eC EC
+ *
+ * The input string sI..eI is the input pattern. The string sC..EC is the
+ * constructed substitute parse string. The portions sC..tC and eC..EC are
+ * constructed by us. The portion tC..eC is an exact duplicate of the input
+ * pattern tI..eI. In the diagram, these are vertically aligned. Suppose that
+ * while parsing, we find an error at xC. We want to display a message showing
+ * the real input string. Thus we need to find the point xI in it which
+ * corresponds to xC. xC >= tC, since the portion of the string sC..tC has
+ * been constructed by us, and so shouldn't have errors. We get:
+ *
+ * xI = sI + (tI - sI) + (xC - tC)
+ *
+ * and, the offset into sI is:
+ *
+ * (xI - sI) = (tI - sI) + (xC - tC)
+ *
+ * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
+ * and we save tC as RExC_adjusted_start.
+ *
+ * During normal processing of the input pattern, everything points to that,
+ * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
+ */
+
+#define tI_sI RExC_precomp_adj
+#define tC RExC_adjusted_start
+#define sC RExC_precomp
+#define xI_offset(xC) ((IV) (tI_sI + (xC - tC)))
+#define xI(xC) (sC + xI_offset(xC))
+#define eC RExC_precomp_end
+
+#define REPORT_LOCATION_ARGS(xC) \
+ UTF8fARG(UTF, \
+ (xI(xC) > eC) /* Don't run off end */ \
+ ? eC - sC /* Length before the <--HERE */ \
+ : xI_offset(xC), \
+ sC), /* The input pattern printed up to the <--HERE */ \
+ UTF8fARG(UTF, \
+ (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \
+ (xI(xC) > eC) ? eC : xI(xC)) /* pattern after <--HERE */
/* Used to point after bad bytes for an error message, but avoid skipping
* past a nul byte. */
*/
#define _FAIL(code) STMT_START { \
const char *ellipses = ""; \
- IV len = RExC_end - RExC_precomp; \
+ IV len = RExC_precomp_end - RExC_precomp; \
\
if (!SIZE_ONLY) \
SAVEFREESV(RExC_rx_sv); \
* Simple_vFAIL -- like FAIL, but marks the current location in the scan
*/
#define Simple_vFAIL(m) STMT_START { \
- const IV offset = \
- (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
- m, REPORT_LOCATION_ARGS(offset)); \
+ m, REPORT_LOCATION_ARGS(RExC_parse)); \
} STMT_END
/*
* Like Simple_vFAIL(), but accepts two arguments.
*/
#define Simple_vFAIL2(m,a1) STMT_START { \
- const IV offset = RExC_parse - RExC_precomp; \
- S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
- REPORT_LOCATION_ARGS(offset)); \
+ S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
+ REPORT_LOCATION_ARGS(RExC_parse)); \
} STMT_END
/*
* Like Simple_vFAIL(), but accepts three arguments.
*/
#define Simple_vFAIL3(m, a1, a2) STMT_START { \
- const IV offset = RExC_parse - RExC_precomp; \
S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
- REPORT_LOCATION_ARGS(offset)); \
+ REPORT_LOCATION_ARGS(RExC_parse)); \
} STMT_END
/*
* Like Simple_vFAIL(), but accepts four arguments.
*/
#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
- const IV offset = RExC_parse - RExC_precomp; \
- S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
- REPORT_LOCATION_ARGS(offset)); \
+ S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
+ REPORT_LOCATION_ARGS(RExC_parse)); \
} STMT_END
#define vFAIL4(m,a1,a2,a3) STMT_START { \
} STMT_END
/* A specialized version of vFAIL2 that works with UTF8f */
-#define vFAIL2utf8f(m, a1) STMT_START { \
- const IV offset = RExC_parse - RExC_precomp; \
- if (!SIZE_ONLY) \
- SAVEFREESV(RExC_rx_sv); \
- S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
- REPORT_LOCATION_ARGS(offset)); \
+#define vFAIL2utf8f(m, a1) STMT_START { \
+ if (!SIZE_ONLY) \
+ SAVEFREESV(RExC_rx_sv); \
+ S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
+ REPORT_LOCATION_ARGS(RExC_parse)); \
} STMT_END
#define vFAIL3utf8f(m, a1, a2) STMT_START { \
- const IV offset = RExC_parse - RExC_precomp; \
if (!SIZE_ONLY) \
SAVEFREESV(RExC_rx_sv); \
S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
- REPORT_LOCATION_ARGS(offset)); \
+ REPORT_LOCATION_ARGS(RExC_parse)); \
} STMT_END
/* These have asserts in them because of [perl #122671] Many warnings in
/* m is not necessarily a "literal string", in this macro */
#define reg_warn_non_literal_string(loc, m) STMT_START { \
- const IV offset = loc - RExC_precomp; \
- __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
- m, REPORT_LOCATION_ARGS(offset)); \
+ __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
+ "%s" REPORT_LOCATION, \
+ m, REPORT_LOCATION_ARGS(loc)); \
} STMT_END
#define ckWARNreg(loc,m) STMT_START { \
- const IV offset = loc - RExC_precomp; \
- __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
- REPORT_LOCATION_ARGS(offset)); \
+ __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
+ m REPORT_LOCATION, \
+ REPORT_LOCATION_ARGS(loc)); \
} STMT_END
#define vWARN(loc, m) STMT_START { \
- const IV offset = loc - RExC_precomp; \
- __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
- REPORT_LOCATION_ARGS(offset)); \
+ __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
+ m REPORT_LOCATION, \
+ REPORT_LOCATION_ARGS(loc)); \
} STMT_END
#define vWARN_dep(loc, m) STMT_START { \
- const IV offset = loc - RExC_precomp; \
- __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
- REPORT_LOCATION_ARGS(offset)); \
+ __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
+ m REPORT_LOCATION, \
+ REPORT_LOCATION_ARGS(loc)); \
} STMT_END
#define ckWARNdep(loc,m) STMT_START { \
- const IV offset = loc - RExC_precomp; \
- __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
- m REPORT_LOCATION, \
- REPORT_LOCATION_ARGS(offset)); \
+ __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
+ m REPORT_LOCATION, \
+ REPORT_LOCATION_ARGS(loc)); \
} STMT_END
-#define ckWARNregdep(loc,m) STMT_START { \
- const IV offset = loc - RExC_precomp; \
- __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
- m REPORT_LOCATION, \
- REPORT_LOCATION_ARGS(offset)); \
+#define ckWARNregdep(loc,m) STMT_START { \
+ __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
+ WARN_REGEXP), \
+ m REPORT_LOCATION, \
+ REPORT_LOCATION_ARGS(loc)); \
} STMT_END
-#define ckWARN2reg_d(loc,m, a1) STMT_START { \
- const IV offset = loc - RExC_precomp; \
- __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
- m REPORT_LOCATION, \
- a1, REPORT_LOCATION_ARGS(offset)); \
+#define ckWARN2reg_d(loc,m, a1) STMT_START { \
+ __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
+ m REPORT_LOCATION, \
+ a1, REPORT_LOCATION_ARGS(loc)); \
} STMT_END
-#define ckWARN2reg(loc, m, a1) STMT_START { \
- const IV offset = loc - RExC_precomp; \
- __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
- a1, REPORT_LOCATION_ARGS(offset)); \
+#define ckWARN2reg(loc, m, a1) STMT_START { \
+ __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
+ m REPORT_LOCATION, \
+ a1, REPORT_LOCATION_ARGS(loc)); \
} STMT_END
-#define vWARN3(loc, m, a1, a2) STMT_START { \
- const IV offset = loc - RExC_precomp; \
- __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
- a1, a2, REPORT_LOCATION_ARGS(offset)); \
+#define vWARN3(loc, m, a1, a2) STMT_START { \
+ __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
+ m REPORT_LOCATION, \
+ a1, a2, REPORT_LOCATION_ARGS(loc)); \
} STMT_END
-#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
- const IV offset = loc - RExC_precomp; \
- __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
- a1, a2, REPORT_LOCATION_ARGS(offset)); \
+#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
+ __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
+ m REPORT_LOCATION, \
+ a1, a2, \
+ REPORT_LOCATION_ARGS(loc)); \
} STMT_END
#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
- const IV offset = loc - RExC_precomp; \
- __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
- a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
+ __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
+ m REPORT_LOCATION, \
+ a1, a2, a3, \
+ REPORT_LOCATION_ARGS(loc)); \
} STMT_END
#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
- const IV offset = loc - RExC_precomp; \
- __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
- a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
+ __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
+ m REPORT_LOCATION, \
+ a1, a2, a3, \
+ REPORT_LOCATION_ARGS(loc)); \
} STMT_END
#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
- const IV offset = loc - RExC_precomp; \
- __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
- a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
+ __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
+ m REPORT_LOCATION, \
+ a1, a2, a3, a4, \
+ REPORT_LOCATION_ARGS(loc)); \
} STMT_END
/* Macros for recording node offsets. 20001227 mjd@plover.com
PerlIO_printf(Perl_debug_log,"\n"); \
});
+/* =========================================================
+ * BEGIN edit_distance stuff.
+ *
+ * This calculates how many single character changes of any type are needed to
+ * transform a string into another one. It is taken from version 3.1 of
+ *
+ * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
+ */
+
+/* Our unsorted dictionary linked list. */
+/* Note we use UVs, not chars. */
+
+struct dictionary{
+ UV key;
+ UV value;
+ struct dictionary* next;
+};
+typedef struct dictionary item;
+
+
+PERL_STATIC_INLINE item*
+push(UV key,item* curr)
+{
+ item* head;
+ Newxz(head, 1, item);
+ head->key = key;
+ head->value = 0;
+ head->next = curr;
+ return head;
+}
+
+
+PERL_STATIC_INLINE item*
+find(item* head, UV key)
+{
+ item* iterator = head;
+ while (iterator){
+ if (iterator->key == key){
+ return iterator;
+ }
+ iterator = iterator->next;
+ }
+
+ return NULL;
+}
+
+PERL_STATIC_INLINE item*
+uniquePush(item* head,UV key)
+{
+ item* iterator = head;
+
+ while (iterator){
+ if (iterator->key == key) {
+ return head;
+ }
+ iterator = iterator->next;
+ }
+
+ return push(key,head);
+}
+
+PERL_STATIC_INLINE void
+dict_free(item* head)
+{
+ item* iterator = head;
+
+ while (iterator) {
+ item* temp = iterator;
+ iterator = iterator->next;
+ Safefree(temp);
+ }
+
+ head = NULL;
+}
+
+/* End of Dictionary Stuff */
+
+/* All calculations/work are done here */
+STATIC int
+S_edit_distance(const UV* src,
+ const UV* tgt,
+ const STRLEN x, /* length of src[] */
+ const STRLEN y, /* length of tgt[] */
+ const SSize_t maxDistance
+)
+{
+ item *head = NULL;
+ UV swapCount,swapScore,targetCharCount,i,j;
+ UV *scores;
+ UV score_ceil = x + y;
+
+ PERL_ARGS_ASSERT_EDIT_DISTANCE;
+
+ /* intialize matrix start values */
+ Newxz(scores, ( (x + 2) * (y + 2)), UV);
+ scores[0] = score_ceil;
+ scores[1 * (y + 2) + 0] = score_ceil;
+ scores[0 * (y + 2) + 1] = score_ceil;
+ scores[1 * (y + 2) + 1] = 0;
+ head = uniquePush(uniquePush(head,src[0]),tgt[0]);
+
+ /* work loops */
+ /* i = src index */
+ /* j = tgt index */
+ for (i=1;i<=x;i++) {
+ if (i < x)
+ head = uniquePush(head,src[i]);
+ scores[(i+1) * (y + 2) + 1] = i;
+ scores[(i+1) * (y + 2) + 0] = score_ceil;
+ swapCount = 0;
+
+ for (j=1;j<=y;j++) {
+ if (i == 1) {
+ if(j < y)
+ head = uniquePush(head,tgt[j]);
+ scores[1 * (y + 2) + (j + 1)] = j;
+ scores[0 * (y + 2) + (j + 1)] = score_ceil;
+ }
+
+ targetCharCount = find(head,tgt[j-1])->value;
+ swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
+
+ if (src[i-1] != tgt[j-1]){
+ scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
+ }
+ else {
+ swapCount = j;
+ scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
+ }
+ }
+
+ find(head,src[i-1])->value = i;
+ }
+
+ {
+ IV score = scores[(x+1) * (y + 2) + (y + 1)];
+ dict_free(head);
+ Safefree(scores);
+ return (maxDistance != 0 && maxDistance < score)?(-1):score;
+ }
+}
+
+/* END of edit_distance() stuff
+ * ========================================================= */
+
/* is c a control character for which we have a mnemonic? */
#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
* returned list must, and will, contain every code point that is a
* possibility. */
- SV* invlist = sv_2mortal(_new_invlist(0));
+ SV* invlist = NULL;
SV* only_utf8_locale_invlist = NULL;
unsigned int i;
const U32 n = ARG(node);
/* Here, no compile-time swash, and there are things that won't be
* known until runtime -- we have to assume it could be anything */
+ invlist = sv_2mortal(_new_invlist(1));
return _add_range_to_invlist(invlist, 0, UV_MAX);
}
else if (ary[3] && ary[3] != &PL_sv_undef) {
}
/* Get the code points valid only under UTF-8 locales */
- if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
+ if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
&& ary[2] && ary[2] != &PL_sv_undef)
{
only_utf8_locale_invlist = ary[2];
}
}
+ if (! invlist) {
+ invlist = sv_2mortal(_new_invlist(0));
+ }
+
/* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
* code points, and an inversion list for the others, but if there are code
* points that should match only conditionally on the target string being
/* Add in the points from the bit map */
for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
if (ANYOF_BITMAP_TEST(node, i)) {
- invlist = add_cp_to_invlist(invlist, i);
+ unsigned int start = i++;
+
+ for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
+ /* empty */
+ }
+ invlist = _add_range_to_invlist(invlist, start, i-1);
new_node_has_latin1 = TRUE;
}
}
/* If this can match all upper Latin1 code points, have to add them
- * as well */
- if (OP(node) == ANYOFD
+ * as well. But don't add them if inverting, as when that gets done below,
+ * it would exclude all these characters, including the ones it shouldn't
+ * that were added just above */
+ if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
&& (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
{
_invlist_union(invlist, PL_UpperLatin1, &invlist);
if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
_invlist_invert(invlist);
}
- else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
+ else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
/* Under /li, any 0-255 could fold to any other 0-255, depending on the
* locale. We can skip this if there are no 0-255 at all. */
else {
anded_flags = ANYOF_FLAGS(and_with)
&( ANYOF_COMMON_FLAGS
- |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER);
+ |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
+ |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
+ if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
+ anded_flags &=
+ ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
+ }
}
}
if (OP(or_with) != ANYOFD) {
ored_flags
|= ANYOF_FLAGS(or_with)
- & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+ & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
+ |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
+ if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
+ ored_flags |=
+ ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
+ }
}
}
* (unassigned, private use, surrogates, controls and formats). This
* is a much large number. */
- const U32 max_match = (LOC)
- ? 127
- : (! UNI_SEMANTICS)
- ? 63
- : (invlist_highest(ssc->invlist) < 256)
- ? 127
- : ((NON_OTHER_COUNT + 1) / 2) - 1;
U32 count = 0; /* Running total of number of code points matched by
'ssc' */
UV start, end; /* Start and end points of current range in inversion
list */
+ const U32 max_code_points = (LOC)
+ ? 256
+ : (( ! UNI_SEMANTICS
+ || invlist_highest(ssc->invlist) < 256)
+ ? 128
+ : NON_OTHER_COUNT);
+ const U32 max_match = max_code_points / 2;
PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
invlist_iterinit(ssc->invlist);
while (invlist_iternext(ssc->invlist, &start, &end)) {
-
- /* /u is the only thing that we expect to match above 255; so if not /u
- * and even if there are matches above 255, ignore them. This catches
- * things like \d under /d which does match the digits above 255, but
- * since the pattern is /d, it is not likely to be expecting them */
- if (! UNI_SEMANTICS) {
- if (start > 255) {
- break;
- }
- end = MIN(end, 255);
+ if (start >= max_code_points) {
+ break;
}
+ end = MIN(end, max_code_points - 1);
count += end - start + 1;
- if (count > max_match) {
+ if (count >= max_match) {
invlist_iterfinish(ssc->invlist);
return FALSE;
}
* by the time we reach here */
assert(! (ANYOF_FLAGS(ssc)
& ~( ANYOF_COMMON_FLAGS
- |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)));
+ |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
+ |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
* The adjacent nodes actually may be separated by NOTHING-kind nodes, and
* these get optimized out
*
+ * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
+ * as possible, even if that means splitting an existing node so that its first
+ * part is moved to the preceeding node. This would maximise the efficiency of
+ * memEQ during matching. Elsewhere in this file, khw proposes splitting
+ * EXACTFish nodes into portions that don't change under folding vs those that
+ * do. Those portions that don't change may be the only things in the pattern that
+ * could be used to find fixed and floating strings.
+ *
* If a node is to match under /i (folded), the number of characters it matches
* can be different than its character length if it contains a multi-character
* fold. *min_subtract is set to the total delta number of characters of the
Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
"Quantifier unexpected on zero-length expression "
"in regex m/%"UTF8f"/",
- UTF8fARG(UTF, RExC_end - RExC_precomp,
+ UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
RExC_precomp));
(void)ReREFCNT_inc(RExC_rx_sv);
}
/* Initialize these here instead of as-needed, as is quick and avoids
* having to test them each time otherwise */
if (! PL_AboveLatin1) {
+#ifdef DEBUGGING
+ char * dump_len_string;
+#endif
+
PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
PL_InBitmap = _new_invlist(2);
PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
NUM_ANYOF_CODE_POINTS - 1);
+#ifdef DEBUGGING
+ dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
+ if ( ! dump_len_string
+ || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
+ {
+ PL_dump_re_max_len = 0;
+ }
+#endif
}
pRExC_state->code_blocks = NULL;
}
RExC_precomp = exp;
+ RExC_precomp_adj = 0;
RExC_flags = rx_flags;
RExC_pm_flags = pm_flags;
if (runtime_code) {
- if (TAINTING_get && TAINT_get)
+ assert(TAINTING_get || !TAINT_get);
+ if (TAINT_get)
Perl_croak(aTHX_ "Eval-group in insecure regular expression");
if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
/* First pass: determine size, legality. */
RExC_parse = exp;
- RExC_start = exp;
+ RExC_start = RExC_adjusted_start = exp;
RExC_end = exp + plen;
+ RExC_precomp_end = RExC_end;
RExC_naughty = 0;
RExC_npar = 1;
RExC_nestroot = 0;
RExC_recurse_count = 0;
pRExC_state->code_index = 0;
+ /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
+ * code makes sure the final byte is an uncounted NUL. But should this
+ * ever not be the case, lots of things could read beyond the end of the
+ * buffer: loops like
+ * while(isFOO(*RExC_parse)) RExC_parse++;
+ * strchr(RExC_parse, "foo");
+ * etc. So it is worth noting. */
+ assert(*RExC_end == '\0');
+
DEBUG_PARSE_r(
PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
RExC_lastnum=0;
assert (RExC_parse <= RExC_end);
if (RExC_parse == RExC_end) NOOP;
else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
- /* skip IDFIRST by using do...while */
+ /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
+ * using do...while */
if (UTF)
do {
RExC_parse += UTF8SKIP(RExC_parse);
#ifndef PERL_IN_XSUB_RE
+STATIC void
+S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
+{
+ /* Replaces the inversion list in 'src' with the one in 'dest'. It steals
+ * the list from 'src', so 'src' is made to have a NULL list. This is
+ * similar to what SvSetMagicSV() would do, if it were implemented on
+ * inversion lists, though this routine avoids a copy */
+
+ const UV src_len = _invlist_len(src);
+ const bool src_offset = *get_invlist_offset_addr(src);
+ const STRLEN src_byte_len = SvLEN(src);
+ char * array = SvPVX(src);
+
+ const int oldtainted = TAINT_get;
+
+ PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
+
+ assert(SvTYPE(src) == SVt_INVLIST);
+ assert(SvTYPE(dest) == SVt_INVLIST);
+ assert(! invlist_is_iterating(src));
+ assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
+
+ /* Make sure it ends in the right place with a NUL, as our inversion list
+ * manipulations aren't careful to keep this true, but sv_usepvn_flags()
+ * asserts it */
+ array[src_byte_len - 1] = '\0';
+
+ TAINT_NOT; /* Otherwise it breaks */
+ sv_usepvn_flags(dest,
+ (char *) array,
+ src_byte_len - 1,
+
+ /* This flag is documented to cause a copy to be avoided */
+ SV_HAS_TRAILING_NUL);
+ TAINT_set(oldtainted);
+ SvPV_set(src, 0);
+ SvLEN_set(src, 0);
+ SvCUR_set(src, 0);
+
+ /* Finish up copying over the other fields in an inversion list */
+ *get_invlist_offset_addr(dest) = src_offset;
+ invlist_set_len(dest, src_len, src_offset);
+ *get_invlist_previous_index_addr(dest) = 0;
+ invlist_iterfinish(dest);
+}
+
PERL_STATIC_INLINE IV*
S_get_invlist_previous_index_addr(SV* invlist)
{
}
PERL_STATIC_INLINE void
-S_invlist_trim(SV* const invlist)
+S_invlist_trim(SV* invlist)
{
+ /* Free the not currently-being-used space in an inversion list */
+
+ /* But don't free up the space needed for the 0 UV that is always at the
+ * beginning of the list, nor the trailing NUL */
+ const UV min_size = TO_INTERNAL_SIZE(1) + 1;
+
PERL_ARGS_ASSERT_INVLIST_TRIM;
assert(SvTYPE(invlist) == SVt_INVLIST);
- /* Change the length of the inversion list to how many entries it currently
- * has */
- SvPV_shrink_to_cur((SV *) invlist);
+ SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
}
+PERL_STATIC_INLINE void
+S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
+{
+ PERL_ARGS_ASSERT_INVLIST_CLEAR;
+
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
+ invlist_set_len(invlist, 0, 0);
+ invlist_trim(invlist);
+}
+
+#endif /* ifndef PERL_IN_XSUB_RE */
+
PERL_STATIC_INLINE bool
S_invlist_is_iterating(SV* const invlist)
{
return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
}
-#endif /* ifndef PERL_IN_XSUB_RE */
-
PERL_STATIC_INLINE UV
S_invlist_max(SV* const invlist)
{
/* Searches the inversion list for the entry that contains the input code
* point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
* return value is the index into the list's array of the range that
- * contains <cp> */
+ * contains <cp>, that is, 'i' such that
+ * array[i] <= cp < array[i+1]
+ */
IV low = 0;
IV mid;
array = invlist_array(invlist);
mid = invlist_previous_index(invlist);
- assert(mid >=0 && mid <= highest_element);
+ assert(mid >=0);
+ if (mid > highest_element) {
+ mid = highest_element;
+ }
/* <mid> contains the cache of the result of the previous call to this
* function (0 the first time). See if this call is for the same result,
/* Take the union of two inversion lists and point <output> to it. *output
* SHOULD BE DEFINED upon input, and if it points to one of the two lists,
* the reference count to that list will be decremented if not already a
- * temporary (mortal); otherwise *output will be made correspondingly
- * mortal. The first list, <a>, may be NULL, in which case a copy of the
- * second list is returned. If <complement_b> is TRUE, the union is taken
- * of the complement (inversion) of <b> instead of b itself.
+ * temporary (mortal); otherwise just its contents will be modified to be
+ * the union. The first list, <a>, may be NULL, in which case a copy of
+ * the second list is returned. If <complement_b> is TRUE, the union is
+ * taken of the complement (inversion) of <b> instead of b itself.
*
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
assert(a != b);
- /* If either one is empty, the union is the other one */
- if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
- bool make_temp = FALSE; /* Should we mortalize the result? */
+ len_b = _invlist_len(b);
+ if (len_b == 0) {
- if (*output == a) {
- if (a != NULL) {
- if (! (make_temp = cBOOL(SvTEMP(a)))) {
- SvREFCNT_dec_NN(a);
- }
+ /* Here, 'b' is empty. If the output is the complement of 'b', the
+ * union is all possible code points, and we need not even look at 'a'.
+ * It's easiest to create a new inversion list that matches everything.
+ * */
+ if (complement_b) {
+ SV* everything = _new_invlist(1);
+ _append_range_to_invlist(everything, 0, UV_MAX);
+
+ /* If the output didn't exist, just point it at the new list */
+ if (*output == NULL) {
+ *output = everything;
+ return;
}
- }
- if (*output != b) {
- *output = invlist_clone(b);
- if (complement_b) {
- _invlist_invert(*output);
+
+ /* Otherwise, replace its contents with the new list */
+ invlist_replace_list_destroys_src(*output, everything);
+ SvREFCNT_dec_NN(everything);
+ return;
+ }
+
+ /* Here, we don't want the complement of 'b', and since it is empty,
+ * the union will come entirely from 'a'. If 'a' is NULL or empty, the
+ * output will be empty */
+
+ if (a == NULL) {
+ *output = _new_invlist(0);
+ return;
+ }
+
+ if (_invlist_len(a) == 0) {
+ invlist_clear(*output);
+ return;
+ }
+
+ /* Here, 'a' is not empty, and entirely determines the union. If the
+ * output is not to overwrite 'b', we can just return 'a'. */
+ if (*output != b) {
+
+ /* If the output is to overwrite 'a', we have a no-op, as it's
+ * already in 'a' */
+ if (*output == a) {
+ return;
}
- } /* else *output already = b; */
- if (make_temp) {
- sv_2mortal(*output);
+ /* But otherwise we have to copy 'a' to the output */
+ *output = invlist_clone(a);
+ return;
}
+
+ /* Here, 'b' is to be overwritten by the output, which will be 'a' */
+ u = invlist_clone(a);
+ invlist_replace_list_destroys_src(*output, u);
+ SvREFCNT_dec_NN(u);
+
return;
}
- else if ((len_b = _invlist_len(b)) == 0) {
- bool make_temp = FALSE;
- if (*output == b) {
- if (! (make_temp = cBOOL(SvTEMP(b)))) {
- SvREFCNT_dec_NN(b);
+
+ if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
+
+ /* Here, 'a' is empty (and b is not). That means the union will come
+ * entirely from 'b'. If the output is not to overwrite 'a', we can
+ * just return what's in 'b'. */
+ if (*output != a) {
+
+ /* If the output is to overwrite 'b', it's already in 'b', but
+ * otherwise we have to copy 'b' to the output */
+ if (*output != b) {
+ *output = invlist_clone(b);
}
- }
- /* The complement of an empty list is a list that has everything in it,
- * so the union with <a> includes everything too */
- if (complement_b) {
- if (a == *output) {
- if (! (make_temp = cBOOL(SvTEMP(a)))) {
- SvREFCNT_dec_NN(a);
- }
+ /* And if the output is to be the inversion of 'b', do that */
+ if (complement_b) {
+ _invlist_invert(*output);
}
- *output = _new_invlist(1);
- _append_range_to_invlist(*output, 0, UV_MAX);
+
+ return;
}
- else if (*output != a) {
- *output = invlist_clone(a);
+
+ /* Here, 'a', which is empty or even NULL, is to be overwritten by the
+ * output, which will either be 'b' or the complement of 'b' */
+
+ if (a == NULL) {
+ *output = invlist_clone(b);
}
- /* else *output already = a; */
+ else {
+ u = invlist_clone(b);
+ invlist_replace_list_destroys_src(*output, u);
+ SvREFCNT_dec_NN(u);
+ }
- if (make_temp) {
- sv_2mortal(*output);
+ if (complement_b) {
+ _invlist_invert(*output);
}
+
return;
}
/* Here, have chosen which of the two inputs to look at. Only output
* if the running count changes to/from 0, which marks the
- * beginning/end of a range in that's in the set */
+ * beginning/end of a range that's in the set */
if (cp_in_set) {
if (count == 0) {
array_u[i_u++] = cp;
* 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
* decrementing to 0 insures that we look at the remainder of the
* non-exhausted set */
- if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
+ if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
|| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
{
count--;
len_u += (len_a - i_a) + (len_b - i_b);
}
- /* Set result to final length, which can change the pointer to array_u, so
- * re-find it */
+ /* Set the result to the final length, which can change the pointer to
+ * array_u, so re-find it. (Note that it is unlikely that this will
+ * change, as we are shrinking the space, not enlarging it) */
if (len_u != _invlist_len(u)) {
invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
invlist_trim(u);
/* When 'count' is 0, the list that was exhausted (if one was shorter than
* the other) ended with everything above it not in its set. That means
* that the remaining part of the union is precisely the same as the
- * non-exhausted list, so can just copy it unchanged. (If both list were
+ * non-exhausted list, so can just copy it unchanged. (If both lists were
* exhausted at the same time, then the operations below will be both 0.)
*/
if (count == 0) {
}
}
- /* We may be removing a reference to one of the inputs. If so, the output
- * is made mortal if the input was. (Mortal SVs shouldn't have their ref
- * count decremented) */
- if (a == *output || b == *output) {
+ /* If the output is not to overwrite either of the inputs, just return the
+ * calculated union */
+ if (a != *output && b != *output) {
+ *output = u;
+ }
+ else {
+ /* Here, the output is to be the same as one of the input scalars,
+ * hence replacing it. The simple thing to do is to free the input
+ * scalar, making it instead be the output one. But experience has
+ * shown [perl #127392] that if the input is a mortal, we can get a
+ * huge build-up of these during regex compilation before they get
+ * freed. So for that case, replace just the input's interior with
+ * the output's, and then free the output */
+
assert(! invlist_is_iterating(*output));
- if ((SvTEMP(*output))) {
- sv_2mortal(u);
+
+ if (! SvTEMP(*output)) {
+ SvREFCNT_dec_NN(*output);
+ *output = u;
}
else {
- SvREFCNT_dec_NN(*output);
+ invlist_replace_list_destroys_src(*output, u);
+ SvREFCNT_dec_NN(u);
}
}
- *output = u;
-
return;
}
/* Take the intersection of two inversion lists and point <i> to it. *i
* SHOULD BE DEFINED upon input, and if it points to one of the two lists,
* the reference count to that list will be decremented if not already a
- * temporary (mortal); otherwise *i will be made correspondingly mortal.
- * The first list, <a>, may be NULL, in which case an empty list is
- * returned. If <complement_b> is TRUE, the result will be the
- * intersection of <a> and the complement (or inversion) of <b> instead of
- * <b> directly.
+ * temporary (mortal); otherwise just its contents will be modified to be
+ * the intersection. The first list, <a>, may be NULL, in which case an
+ * empty list is returned. If <complement_b> is TRUE, the result will be
+ * the intersection of <a> and the complement (or inversion) of <b> instead
+ * of <b> directly.
*
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
/* Special case if either one is empty */
len_a = (a == NULL) ? 0 : _invlist_len(a);
if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
- bool make_temp = FALSE;
-
if (len_a != 0 && complement_b) {
- /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
- * be empty. Here, also we are using 'b's complement, which hence
- * must be every possible code point. Thus the intersection is
- * simply 'a'. */
- if (*i != a) {
- if (*i == b) {
- if (! (make_temp = cBOOL(SvTEMP(b)))) {
- SvREFCNT_dec_NN(b);
- }
- }
+ /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
+ * must be empty. Here, also we are using 'b's complement, which
+ * hence must be every possible code point. Thus the intersection
+ * is simply 'a'. */
- *i = invlist_clone(a);
+ if (*i == a) { /* No-op */
+ return;
}
- /* else *i is already 'a' */
- if (make_temp) {
- sv_2mortal(*i);
+ /* If not overwriting either input, just make a copy of 'a' */
+ if (*i != b) {
+ *i = invlist_clone(a);
+ return;
}
+
+ /* Here we are overwriting 'b' with 'a's contents */
+ r = invlist_clone(a);
+ invlist_replace_list_destroys_src(*i, r);
+ SvREFCNT_dec_NN(r);
return;
}
/* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
* intersection must be empty */
- if (*i == a) {
- if (! (make_temp = cBOOL(SvTEMP(a)))) {
- SvREFCNT_dec_NN(a);
- }
- }
- else if (*i == b) {
- if (! (make_temp = cBOOL(SvTEMP(b)))) {
- SvREFCNT_dec_NN(b);
- }
- }
- *i = _new_invlist(0);
- if (make_temp) {
- sv_2mortal(*i);
+ if (*i == NULL) {
+ *i = _new_invlist(0);
+ return;
}
+ invlist_clear(*i);
return;
}
* everything that remains in the non-exhausted set.
* 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
* remains 1. And the intersection has nothing more. */
- if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
+ if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
|| (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
{
count++;
len_r += (len_a - i_a) + (len_b - i_b);
}
- /* Set result to final length, which can change the pointer to array_r, so
- * re-find it */
+ /* Set the result to the final length, which can change the pointer to
+ * array_r, so re-find it. (Note that it is unlikely that this will
+ * change, as we are shrinking the space, not enlarging it) */
if (len_r != _invlist_len(r)) {
invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
invlist_trim(r);
}
}
- /* We may be removing a reference to one of the inputs. If so, the output
- * is made mortal if the input was. (Mortal SVs shouldn't have their ref
- * count decremented) */
- if (a == *i || b == *i) {
+ /* If the output is not to overwrite either of the inputs, just return the
+ * calculated intersection */
+ if (a != *i && b != *i) {
+ *i = r;
+ }
+ else {
+ /* Here, the output is to be the same as one of the input scalars,
+ * hence replacing it. The simple thing to do is to free the input
+ * scalar, making it instead be the output one. But experience has
+ * shown [perl #127392] that if the input is a mortal, we can get a
+ * huge build-up of these during regex compilation before they get
+ * freed. So for that case, replace just the input's interior with
+ * the output's, and then free the output. A short-cut in this case
+ * is if the output is empty, we can just set the input to be empty */
+
assert(! invlist_is_iterating(*i));
- if (SvTEMP(*i)) {
- sv_2mortal(r);
+
+ if (! SvTEMP(*i)) {
+ SvREFCNT_dec_NN(*i);
+ *i = r;
}
else {
- SvREFCNT_dec_NN(*i);
+ if (len_r) {
+ invlist_replace_list_destroys_src(*i, r);
+ }
+ else {
+ invlist_clear(*i);
+ }
+ SvREFCNT_dec_NN(r);
}
}
- *i = r;
-
return;
}
: array[len - 1] - 1;
}
-#ifndef PERL_IN_XSUB_RE
-SV *
-Perl__invlist_contents(pTHX_ SV* const invlist)
+STATIC SV *
+S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
{
/* Get the contents of an inversion list into a string SV so that they can
- * be printed out. It uses the format traditionally done for debug tracing
- */
+ * be printed out. If 'traditional_style' is TRUE, it uses the format
+ * traditionally done for debug tracing; otherwise it uses a format
+ * suitable for just copying to the output, with blanks between ranges and
+ * a dash between range components */
UV start, end;
- SV* output = newSVpvs("\n");
+ SV* output;
+ const char intra_range_delimiter = (traditional_style ? '\t' : '-');
+ const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
+
+ if (traditional_style) {
+ output = newSVpvs("\n");
+ }
+ else {
+ output = newSVpvs("");
+ }
- PERL_ARGS_ASSERT__INVLIST_CONTENTS;
+ PERL_ARGS_ASSERT_INVLIST_CONTENTS;
assert(! invlist_is_iterating(invlist));
invlist_iterinit(invlist);
while (invlist_iternext(invlist, &start, &end)) {
if (end == UV_MAX) {
- Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
+ Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%cINFINITY%c",
+ start, intra_range_delimiter,
+ inter_range_delimiter);
}
else if (end != start) {
- Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
- start, end);
+ Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c%04"UVXf"%c",
+ start,
+ intra_range_delimiter,
+ end, inter_range_delimiter);
}
else {
- Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
+ Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c",
+ start, inter_range_delimiter);
}
}
+ if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
+ SvCUR_set(output, SvCUR(output) - 1);
+ }
+
return output;
}
-#endif
#ifndef PERL_IN_XSUB_RE
void
}
#endif
-#ifdef PERL_ARGS_ASSERT__INVLISTEQ
+#if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
bool
-S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
+Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
{
/* Return a boolean as to if the two passed in inversion lists are
* identical. The final argument, if TRUE, says to take the complement of
cs = REGEX_UNICODE_CHARSET;
}
- while (*RExC_parse) {
+ while (RExC_parse < RExC_end) {
/* && strchr("iogcmsx", *RExC_parse) */
/* (?g), (?gc) and (?o) are useless here
and must be globally applied -- japhy */
NOT_REACHED; /*NOTREACHED*/
}
- ++RExC_parse;
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
}
vFAIL("Sequence (?... not terminated");
#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
#endif
+PERL_STATIC_INLINE regnode *
+S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
+ I32 *flagp,
+ char * parse_start,
+ char ch
+ )
+{
+ regnode *ret;
+ char* name_start = RExC_parse;
+ U32 num = 0;
+ SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY
+ ? REG_RSN_RETURN_NULL
+ : REG_RSN_RETURN_DATA);
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
+
+ if (RExC_parse == name_start || *RExC_parse != ch) {
+ /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
+ vFAIL2("Sequence %.3s... not terminated",parse_start);
+ }
+
+ if (!SIZE_ONLY) {
+ num = add_data( pRExC_state, STR_WITH_LEN("S"));
+ RExC_rxi->data->data[num]=(void*)sv_dat;
+ SvREFCNT_inc_simple_void(sv_dat);
+ }
+ RExC_sawback = 1;
+ ret = reganode(pRExC_state,
+ ((! FOLD)
+ ? NREF
+ : (ASCII_FOLD_RESTRICTED)
+ ? NREFFA
+ : (AT_LEAST_UNI_SEMANTICS)
+ ? NREFFU
+ : (LOC)
+ ? NREFFL
+ : NREFF),
+ num);
+ *flagp |= HASWIDTH;
+
+ Set_Node_Offset(ret, parse_start+1);
+ Set_Node_Cur_Length(ret, parse_start);
+
+ nextchar(pRExC_state);
+ return ret;
+}
+
/* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan
needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be
*flagp = 0; /* Tentatively. */
+ /* Having this true makes it feasible to have a lot fewer tests for the
+ * parse pointer being in scope. For example, we can write
+ * while(isFOO(*RExC_parse)) RExC_parse++;
+ * instead of
+ * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
+ */
+ assert(*RExC_end == '\0');
/* Make an OPEN node, if parenthesized. */
if (paren) {
* indivisible */
bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
+ if (RExC_parse >= RExC_end) {
+ vFAIL("Unmatched (");
+ }
+
if ( *RExC_parse == '*') { /* (*VERB:ARG) */
- char *start_verb = RExC_parse;
- STRLEN verb_len = 0;
+ char *start_verb = RExC_parse + 1;
+ STRLEN verb_len;
char *start_arg = NULL;
unsigned char op = 0;
int arg_required = 0;
int internal_argval = -1; /* if >-1 we are not allowed an argument*/
if (has_intervening_patws) {
- RExC_parse++;
+ RExC_parse++; /* past the '*' */
vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
}
- while ( *RExC_parse && *RExC_parse != ')' ) {
+ while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
if ( *RExC_parse == ':' ) {
start_arg = RExC_parse + 1;
break;
}
- RExC_parse++;
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
}
- ++start_verb;
verb_len = RExC_parse - start_verb;
if ( start_arg ) {
- RExC_parse++;
- while ( *RExC_parse && *RExC_parse != ')' )
- RExC_parse++;
- if ( *RExC_parse != ')' )
+ if (RExC_parse >= RExC_end) {
+ goto unterminated_verb_pattern;
+ }
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ while ( RExC_parse < RExC_end && *RExC_parse != ')' )
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
+ unterminated_verb_pattern:
vFAIL("Unterminated verb pattern argument");
if ( RExC_parse == start_arg )
start_arg = NULL;
} else {
- if ( *RExC_parse != ')' )
+ if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
vFAIL("Unterminated verb pattern");
}
+ /* Here, we know that RExC_parse < RExC_end */
+
switch ( *start_verb ) {
case 'A': /* (*ACCEPT) */
if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
vFAIL("In '(?...)', the '(' and '?' must be adjacent");
}
- RExC_parse++;
- paren = *RExC_parse++;
+ RExC_parse++; /* past the '?' */
+ paren = *RExC_parse; /* might be a trailing NUL, if not
+ well-formed */
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ if (RExC_parse > RExC_end) {
+ paren = '\0';
+ }
ret = NULL; /* For look-ahead/behind. */
switch (paren) {
case 'P': /* (?P...) variants for those used to PCRE/Python */
- paren = *RExC_parse++;
- if ( paren == '<') /* (?P<...>) named capture */
+ paren = *RExC_parse;
+ if ( paren == '<') { /* (?P<...>) named capture */
+ RExC_parse++;
+ if (RExC_parse >= RExC_end) {
+ vFAIL("Sequence (?P<... not terminated");
+ }
goto named_capture;
+ }
else if (paren == '>') { /* (?P>name) named recursion */
+ RExC_parse++;
+ if (RExC_parse >= RExC_end) {
+ vFAIL("Sequence (?P>... not terminated");
+ }
goto named_recursion;
}
else if (paren == '=') { /* (?P=...) named backref */
- /* this pretty much dupes the code for \k<NAME> in
- * regatom(), if you change this make sure you change that
- * */
- char* name_start = RExC_parse;
- U32 num = 0;
- SV *sv_dat = reg_scan_name(pRExC_state,
- SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
- if (RExC_parse == name_start || *RExC_parse != ')')
- /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
- vFAIL2("Sequence %.3s... not terminated",parse_start);
-
- if (!SIZE_ONLY) {
- num = add_data( pRExC_state, STR_WITH_LEN("S"));
- RExC_rxi->data->data[num]=(void*)sv_dat;
- SvREFCNT_inc_simple_void(sv_dat);
- }
- RExC_sawback = 1;
- ret = reganode(pRExC_state,
- ((! FOLD)
- ? NREF
- : (ASCII_FOLD_RESTRICTED)
- ? NREFFA
- : (AT_LEAST_UNI_SEMANTICS)
- ? NREFFU
- : (LOC)
- ? NREFFL
- : NREFF),
- num);
- *flagp |= HASWIDTH;
-
- Set_Node_Offset(ret, parse_start+1);
- Set_Node_Cur_Length(ret, parse_start);
-
- nextchar(pRExC_state);
- return ret;
+ RExC_parse++;
+ return handle_named_backref(pRExC_state, flagp,
+ parse_start, ')');
}
- --RExC_parse;
RExC_parse += SKIP_IF_CHAR(RExC_parse);
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL3("Sequence (%.*s...) not recognized",
char *name_start;
SV *svname;
paren= '>';
+ /* FALLTHROUGH */
case '\'': /* (?'...') */
- name_start= RExC_parse;
- svname = reg_scan_name(pRExC_state,
+ name_start = RExC_parse;
+ svname = reg_scan_name(pRExC_state,
SIZE_ONLY /* reverse test from the others */
? REG_RSN_RETURN_NAME
: REG_RSN_RETURN_NULL);
- if (RExC_parse == name_start || *RExC_parse != paren)
+ if ( RExC_parse == name_start
+ || RExC_parse >= RExC_end
+ || *RExC_parse != paren)
+ {
vFAIL2("Sequence (?%c... not terminated",
paren=='>' ? '<' : paren);
+ }
if (SIZE_ONLY) {
HE *he_str;
SV *sv_dat = NULL;
RExC_seen |= REG_LOOKBEHIND_SEEN;
RExC_in_lookbehind++;
RExC_parse++;
+ assert(RExC_parse < RExC_end);
/* FALLTHROUGH */
case '=': /* (?=...) */
RExC_seen_zerolen++;
SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
}
- if (RExC_parse == RExC_end || *RExC_parse != ')')
+ if (RExC_parse >= RExC_end || *RExC_parse != ')')
vFAIL("Sequence (?&... not terminated");
goto gen_recurse_regop;
/* NOTREACHED */
/* FALLTHROUGH */
case '1': case '2': case '3': case '4': /* (?1) */
case '5': case '6': case '7': case '8': case '9':
- RExC_parse--;
+ RExC_parse = (char *) seqstart + 1; /* Point to the digit */
parse_recursion:
{
bool is_neg = FALSE;
NOT_REACHED; /*NOTREACHED*/
}
*flagp |= POSTPONED;
- paren = *RExC_parse++;
+ paren = '{';
+ RExC_parse++;
/* FALLTHROUGH */
case '{': /* (?{...}) */
{
int is_define= 0;
const int DEFINE_len = sizeof("DEFINE") - 1;
if (RExC_parse[0] == '?') { /* (?(?...)) */
- if (
- RExC_parse[1] == '=' ||
- RExC_parse[1] == '!' ||
- RExC_parse[1] == '<' ||
- RExC_parse[1] == '{'
+ if ( RExC_parse < RExC_end - 1
+ && ( RExC_parse[1] == '='
+ || RExC_parse[1] == '!'
+ || RExC_parse[1] == '<'
+ || RExC_parse[1] == '{')
) { /* Lookahead or eval. */
I32 flag;
regnode *tail;
U32 num = 0;
SV *sv_dat=reg_scan_name(pRExC_state,
SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
- if (RExC_parse == name_start || *RExC_parse != ch)
+ if ( RExC_parse == name_start
+ || RExC_parse >= RExC_end
+ || *RExC_parse != ch)
+ {
vFAIL2("Sequence (?(%c... not terminated",
(ch == '>' ? '<' : ch));
+ }
RExC_parse++;
if (!SIZE_ONLY) {
num = add_data( pRExC_state, STR_WITH_LEN("S"));
else
lastbr = NULL;
if (c != ')') {
- if (RExC_parse>RExC_end)
+ if (RExC_parse >= RExC_end)
vFAIL("Switch (?(condition)... not terminated");
else
vFAIL("Switch (?(condition)... contains too many branches");
case '[': /* (?[ ... ]) */
return handle_regex_sets(pRExC_state, NULL, flagp, depth,
oregcomp_parse);
- case 0:
+ case 0: /* A NUL */
RExC_parse--; /* for vFAIL to print correctly */
vFAIL("Sequence (? incomplete");
break;
default: /* e.g., (?i) */
- --RExC_parse;
+ RExC_parse = (char *) seqstart + 1;
parse_flags:
parse_lparen_question_flags(pRExC_state);
if (UCHARAT(RExC_parse) != ':') {
- if (*RExC_parse)
+ if (RExC_parse < RExC_end)
nextchar(pRExC_state);
*flagp = TRYAGAIN;
return NULL;
/* We can't back off the size because we have to reserve
* enough space for all the things we are about to throw
- * away, but we can shrink it by the ammount we are about
+ * away, but we can shrink it by the amount we are about
* to re-use here */
RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
}
ret = reganode(pRExC_state, OPFAIL, 0);
return ret;
}
- else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?')
+ else if (min == max && *RExC_parse == '?')
{
if (PASS2) {
ckWARN2reg(RExC_parse + 1,
"Useless use of greediness modifier '%c'",
*RExC_parse);
}
- /* Absorb the modifier, so later code doesn't see nor use it */
- nextchar(pRExC_state);
}
do_curly:
(void)ReREFCNT_inc(RExC_rx_sv);
}
- if (RExC_parse < RExC_end && *RExC_parse == '?') {
+ if (*RExC_parse == '?') {
nextchar(pRExC_state);
reginsert(pRExC_state, MINMOD, ret, depth+1);
REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
}
- else
- if (RExC_parse < RExC_end && *RExC_parse == '+') {
+ else if (*RExC_parse == '+') {
regnode *ender;
nextchar(pRExC_state);
ender = reg_node(pRExC_state, SUCCEED);
REGTAIL(pRExC_state, ret, ender);
}
- if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
+ if (ISMULT2(RExC_parse)) {
RExC_parse++;
vFAIL("Nested quantifiers");
}
UV * code_point_p,
int * cp_count,
I32 * flagp,
+ const bool strict,
const U32 depth
)
{
semantics */
if (endbrace == RExC_parse) { /* empty: \N{} */
+ if (strict) {
+ RExC_parse++; /* Position after the "}" */
+ vFAIL("Zero length \\N{}");
+ }
if (cp_count) {
*cp_count = 0;
}
RExC_parse += 2; /* Skip past the 'U+' */
+ /* Because toke.c has generated a special construct for us guaranteed not
+ * to have NULs, we can use a str function */
endchar = RExC_parse + strcspn(RExC_parse, ".}");
/* Code points are separated by dots. If none, there is only one code
SV * substitute_parse;
STRLEN len;
char *orig_end = RExC_end;
+ char *save_start = RExC_start;
I32 flags;
/* Count the code points, if desired, in the sequence */
}
sv_catpv(substitute_parse, ")");
- RExC_parse = SvPV(substitute_parse, len);
+ RExC_parse = RExC_start = RExC_adjusted_start = SvPV(substitute_parse,
+ len);
/* Don't allow empty number */
if (len < (STRLEN) 8) {
}
/* Restore the saved values */
+ RExC_start = RExC_adjusted_start = save_start;
RExC_parse = endbrace;
RExC_end = orig_end;
RExC_override_recoding = 0;
* it returns U+FFFD (Replacement character) and sets *encp to NULL.
*/
STATIC UV
-S_reg_recode(pTHX_ const char value, SV **encp)
+S_reg_recode(pTHX_ const U8 value, SV **encp)
{
STRLEN numlen = 1;
- SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
+ SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP);
const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
const STRLEN newlen = SvCUR(sv);
UV uv = UNICODE_REPLACEMENT;
tryagain:
parse_start = RExC_parse;
+ assert(RExC_parse < RExC_end);
switch ((U8)*RExC_parse) {
case '^':
RExC_seen_zerolen++;
FALSE, /* don't silence non-portable warnings. */
(bool) RExC_strict,
TRUE, /* Allow an optimized regnode result */
+ NULL,
NULL);
if (ret == NULL) {
if (*flagp & (RESTART_PASS1|NEED_UTF8))
ret = reg(pRExC_state, 2, &flags,depth+1);
if (ret == NULL) {
if (flags & TRYAGAIN) {
- if (RExC_parse == RExC_end) {
+ if (RExC_parse >= RExC_end) {
/* Make parent create an empty node if needed. */
*flagp |= TRYAGAIN;
return(NULL);
required, as the default for this switch is to jump to the
literal text handling code.
*/
- switch ((U8)*++RExC_parse) {
+ RExC_parse++;
+ switch ((U8)*RExC_parse) {
/* Special Escapes */
case 'A':
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, op);
*flagp |= SIMPLE;
- if (*(RExC_parse + 1) != '{') {
+ if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
FLAGS(ret) = TRADITIONAL_BOUND;
if (PASS2 && op > BOUNDA) { /* /aa is same as /a */
OP(ret) = BOUNDA;
}
FLAGS(ret) = GCB_BOUND;
break;
+ case 'l':
+ if (length != 2 || *(RExC_parse + 1) != 'b') {
+ goto bad_bound_type;
+ }
+ FLAGS(ret) = LB_BOUND;
+ break;
case 's':
if (length != 2 || *(RExC_parse + 1) != 'b') {
goto bad_bound_type;
non-portables */
(bool) RExC_strict,
TRUE, /* Allow an optimized regnode result */
+ NULL,
NULL);
if (*flagp & RESTART_PASS1)
return NULL;
NULL, /* Don't need a count of how many code
points */
flagp,
+ RExC_strict,
depth)
) {
break;
case 'k': /* Handle \k<NAME> and \k'NAME' */
parse_named_seq:
{
- char ch= RExC_parse[1];
- if (ch != '<' && ch != '\'' && ch != '{') {
+ char ch;
+ if ( RExC_parse >= RExC_end - 1
+ || (( ch = RExC_parse[1]) != '<'
+ && ch != '\''
+ && ch != '{'))
+ {
RExC_parse++;
/* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
vFAIL2("Sequence %.2s... not terminated",parse_start);
} else {
- /* this pretty much dupes the code for (?P=...) in reg(), if
- you change this make sure you change that */
- char* name_start = (RExC_parse += 2);
- U32 num = 0;
- SV *sv_dat = reg_scan_name(pRExC_state,
- SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
- ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
- if (RExC_parse == name_start || *RExC_parse != ch)
- /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
- vFAIL2("Sequence %.3s... not terminated",parse_start);
-
- if (!SIZE_ONLY) {
- num = add_data( pRExC_state, STR_WITH_LEN("S"));
- RExC_rxi->data->data[num]=(void*)sv_dat;
- SvREFCNT_inc_simple_void(sv_dat);
- }
-
- RExC_sawback = 1;
- ret = reganode(pRExC_state,
- ((! FOLD)
- ? NREF
- : (ASCII_FOLD_RESTRICTED)
- ? NREFFA
- : (AT_LEAST_UNI_SEMANTICS)
- ? NREFFU
- : (LOC)
- ? NREFFL
- : NREFF),
- num);
- *flagp |= HASWIDTH;
-
- /* override incorrect value set in reganode MJD */
- Set_Node_Offset(ret, parse_start+1);
- Set_Node_Cur_Length(ret, parse_start);
- nextchar(pRExC_state);
-
+ RExC_parse += 2;
+ ret = handle_named_backref(pRExC_state,
+ flagp,
+ parse_start,
+ (ch == '<')
+ ? '>'
+ : (ch == '{')
+ ? '}'
+ : '\'');
}
break;
}
goto parse_named_seq;
}
+ if (RExC_parse >= RExC_end) {
+ goto unterminated_g;
+ }
num = S_backref_value(RExC_parse);
if (num == 0)
vFAIL("Reference to invalid group 0");
if (isDIGIT(*RExC_parse))
vFAIL("Reference to nonexistent group");
else
+ unterminated_g:
vFAIL("Unterminated \\g... pattern");
}
reparse:
/* We look for the EXACTFish to EXACT node optimizaton only if
- * folding. (And we don't need to figure this out until pass 2) */
+ * folding. (And we don't need to figure this out until pass 2).
+ * XXX It might actually make sense to split the node into portions
+ * that are exact and ones that aren't, so that we could later use
+ * the exact ones to find the longest fixed and floating strings.
+ * One would want to join them back into a larger node. One could
+ * use a pseudo regnode like 'EXACT_ORIG_FOLD' */
maybe_exact = FOLD && PASS2;
/* XXX The node can hold up to 255 bytes, yet this only goes to
NULL, /* Don't need a count of
how many code points */
flagp,
+ RExC_strict,
depth)
) {
if (*flagp & NEED_UTF8)
p += numlen;
if (PASS2 /* like \08, \178 */
&& numlen < 3
- && p < RExC_end
&& isDIGIT(*p) && ckWARN(WARN_REGEXP))
{
reg_warn_non_literal_string(
recode_encoding:
if (! RExC_override_recoding) {
SV* enc = _get_encoding();
- ender = reg_recode((const char)(U8)ender, &enc);
+ ender = reg_recode((U8)ender, &enc);
if (!enc && PASS2)
ckWARNreg(p, "Invalid escape in the specified encoding");
REQUIRE_UTF8(flagp);
goto not_fold_common;
}
else /* A regular FOLD code point */
- if (! ( UTF
+ if (! ( UTF
#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
|| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
|| UNICODE_DOT_DOT_VERSION > 0)
- /* See comments for join_exact() as to why we fold this
- * non-UTF at compile time */
- || (node_type == EXACTFU
- && ender == LATIN_SMALL_LETTER_SHARP_S)
+ /* See comments for join_exact() as to why we fold
+ * this non-UTF at compile time */
+ || ( node_type == EXACTFU
+ && ender == LATIN_SMALL_LETTER_SHARP_S)
#endif
)) {
/* Here, are folding and are not UTF-8 encoded; therefore
if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
}
- else if (end >= NUM_ANYOF_CODE_POINTS) {
- ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
- }
/* Quit if are above what we should change */
if (start >= NUM_ANYOF_CODE_POINTS) {
#define POSIXCC_DONE(c) ((c) == ':')
#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
+#define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
+
+#define WARNING_PREFIX "Assuming NOT a POSIX class since "
+#define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
+#define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
+
+#define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
+
+/* 'posix_warnings' and 'warn_text' are names of variables in the following
+ * routine. q.v. */
+#define ADD_POSIX_WARNING(p, text) STMT_START { \
+ if (posix_warnings) { \
+ if (! warn_text) warn_text = newAV(); \
+ av_push(warn_text, Perl_newSVpvf(aTHX_ \
+ WARNING_PREFIX \
+ text \
+ REPORT_LOCATION, \
+ REPORT_LOCATION_ARGS(p))); \
+ } \
+ } STMT_END
-PERL_STATIC_INLINE I32
-S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
+STATIC int
+S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
+
+ const char * const s, /* Where the putative posix class begins.
+ Normally, this is one past the '['. This
+ parameter exists so it can be somewhere
+ besides RExC_parse. */
+ char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
+ NULL */
+ AV ** posix_warnings, /* Where to place any generated warnings, or
+ NULL */
+ const bool check_only /* Don't die if error */
+)
{
- I32 namedclass = OOB_NAMEDCLASS;
+ /* This parses what the caller thinks may be one of the three POSIX
+ * constructs:
+ * 1) a character class, like [:blank:]
+ * 2) a collating symbol, like [. .]
+ * 3) an equivalence class, like [= =]
+ * In the latter two cases, it croaks if it finds a syntactically legal
+ * one, as these are not handled by Perl.
+ *
+ * The main purpose is to look for a POSIX character class. It returns:
+ * a) the class number
+ * if it is a completely syntactically and semantically legal class.
+ * 'updated_parse_ptr', if not NULL, is set to point to just after the
+ * closing ']' of the class
+ * b) OOB_NAMEDCLASS
+ * if it appears that one of the three POSIX constructs was meant, but
+ * its specification was somehow defective. 'updated_parse_ptr', if
+ * not NULL, is set to point to the character just after the end
+ * character of the class. See below for handling of warnings.
+ * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
+ * if it doesn't appear that a POSIX construct was intended.
+ * 'updated_parse_ptr' is not changed. No warnings nor errors are
+ * raised.
+ *
+ * In b) there may be errors or warnings generated. If 'check_only' is
+ * TRUE, then any errors are discarded. Warnings are returned to the
+ * caller via an AV* created into '*posix_warnings' if it is not NULL. If
+ * instead it is NULL, warnings are suppressed. This is done in all
+ * passes. The reason for this is that the rest of the parsing is heavily
+ * dependent on whether this routine found a valid posix class or not. If
+ * it did, the closing ']' is absorbed as part of the class. If no class,
+ * or an invalid one is found, any ']' will be considered the terminator of
+ * the outer bracketed character class, leading to very different results.
+ * In particular, a '(?[ ])' construct will likely have a syntax error if
+ * the class is parsed other than intended, and this will happen in pass1,
+ * before the warnings would normally be output. This mechanism allows the
+ * caller to output those warnings in pass1 just before dieing, giving a
+ * much better clue as to what is wrong.
+ *
+ * The reason for this function, and its complexity is that a bracketed
+ * character class can contain just about anything. But it's easy to
+ * mistype the very specific posix class syntax but yielding a valid
+ * regular bracketed class, so it silently gets compiled into something
+ * quite unintended.
+ *
+ * The solution adopted here maintains backward compatibility except that
+ * it adds a warning if it looks like a posix class was intended but
+ * improperly specified. The warning is not raised unless what is input
+ * very closely resembles one of the 14 legal posix classes. To do this,
+ * it uses fuzzy parsing. It calculates how many single-character edits it
+ * would take to transform what was input into a legal posix class. Only
+ * if that number is quite small does it think that the intention was a
+ * posix class. Obviously these are heuristics, and there will be cases
+ * where it errs on one side or another, and they can be tweaked as
+ * experience informs.
+ *
+ * The syntax for a legal posix class is:
+ *
+ * qr/(?xa: \[ : \^? [:lower:]{4,6} : \] )/
+ *
+ * What this routine considers syntactically to be an intended posix class
+ * is this (the comments indicate some restrictions that the pattern
+ * doesn't show):
+ *
+ * qr/(?x: \[? # The left bracket, possibly
+ * # omitted
+ * \h* # possibly followed by blanks
+ * (?: \^ \h* )? # possibly a misplaced caret
+ * [:;]? # The opening class character,
+ * # possibly omitted. A typo
+ * # semi-colon can also be used.
+ * \h*
+ * \^? # possibly a correctly placed
+ * # caret, but not if there was also
+ * # a misplaced one
+ * \h*
+ * .{3,15} # The class name. If there are
+ * # deviations from the legal syntax,
+ * # its edit distance must be close
+ * # to a real class name in order
+ * # for it to be considered to be
+ * # an intended posix class.
+ * \h*
+ * [:punct:]? # The closing class character,
+ * # possibly omitted. If not a colon
+ * # nor semi colon, the class name
+ * # must be even closer to a valid
+ * # one
+ * \h*
+ * \]? # The right bracket, possibly
+ * # omitted.
+ * )/
+ *
+ * In the above, \h must be ASCII-only.
+ *
+ * These are heuristics, and can be tweaked as field experience dictates.
+ * There will be cases when someone didn't intend to specify a posix class
+ * that this warns as being so. The goal is to minimize these, while
+ * maximizing the catching of things intended to be a posix class that
+ * aren't parsed as such.
+ */
+
+ const char* p = s;
+ const char * const e = RExC_end;
+ unsigned complement = 0; /* If to complement the class */
+ bool found_problem = FALSE; /* Assume OK until proven otherwise */
+ bool has_opening_bracket = FALSE;
+ bool has_opening_colon = FALSE;
+ int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
+ valid class */
+ AV* warn_text = NULL; /* any warning messages */
+ const char * possible_end = NULL; /* used for a 2nd parse pass */
+ const char* name_start; /* ptr to class name first char */
+
+ /* If the number of single-character typos the input name is away from a
+ * legal name is no more than this number, it is considered to have meant
+ * the legal name */
+ int max_distance = 2;
+
+ /* to store the name. The size determines the maximum length before we
+ * decide that no posix class was intended. Should be at least
+ * sizeof("alphanumeric") */
+ UV input_text[15];
+
+ PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
+
+ if (p >= e) {
+ return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ }
+
+ if (*(p - 1) != '[') {
+ ADD_POSIX_WARNING(p, "it doesn't start with a '['");
+ found_problem = TRUE;
+ }
+ else {
+ has_opening_bracket = TRUE;
+ }
+
+ /* They could be confused and think you can put spaces between the
+ * components */
+ if (isBLANK(*p)) {
+ found_problem = TRUE;
+
+ do {
+ p++;
+ } while (p < e && isBLANK(*p));
- PERL_ARGS_ASSERT_REGPPOSIXCC;
+ ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
+ }
- if (value == '[' && RExC_parse + 1 < RExC_end &&
- /* I smell either [: or [= or [. -- POSIX has been here, right? */
- POSIXCC(UCHARAT(RExC_parse)))
+ /* For [. .] and [= =]. These are quite different internally from [: :],
+ * so they are handled separately. */
+ if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
+ and 1 for at least one char in it
+ */
{
- const char c = UCHARAT(RExC_parse);
- char* const s = RExC_parse++;
+ const char open_char = *p;
+ const char * temp_ptr = p + 1;
+
+ /* These two constructs are not handled by perl, and if we find a
+ * syntactically valid one, we croak. khw, who wrote this code, finds
+ * this explanation of them very unclear:
+ * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
+ * And searching the rest of the internet wasn't very helpful either.
+ * It looks like just about any byte can be in these constructs,
+ * depending on the locale. But unless the pattern is being compiled
+ * under /l, which is very rare, Perl runs under the C or POSIX locale.
+ * In that case, it looks like [= =] isn't allowed at all, and that
+ * [. .] could be any single code point, but for longer strings the
+ * constituent characters would have to be the ASCII alphabetics plus
+ * the minus-hyphen. Any sensible locale definition would limit itself
+ * to these. And any portable one definitely should. Trying to parse
+ * the general case is a nightmare (see [perl #127604]). So, this code
+ * looks only for interiors of these constructs that match:
+ * qr/.|[-\w]{2,}/
+ * Using \w relaxes the apparent rules a little, without adding much
+ * danger of mistaking something else for one of these constructs.
+ *
+ * [. .] in some implementations described on the internet is usable to
+ * escape a character that otherwise is special in bracketed character
+ * classes. For example [.].] means a literal right bracket instead of
+ * the ending of the class
+ *
+ * [= =] can legitimately contain a [. .] construct, but we don't
+ * handle this case, as that [. .] construct will later get parsed
+ * itself and croak then. And [= =] is checked for even when not under
+ * /l, as Perl has long done so.
+ *
+ * The code below relies on there being a trailing NUL, so it doesn't
+ * have to keep checking if the parse ptr < e.
+ */
+ if (temp_ptr[1] == open_char) {
+ temp_ptr++;
+ }
+ else while ( temp_ptr < e
+ && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
+ {
+ temp_ptr++;
+ }
- while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
- RExC_parse++;
- if (RExC_parse == RExC_end) {
- if (strict) {
+ if (*temp_ptr == open_char) {
+ temp_ptr++;
+ if (*temp_ptr == ']') {
+ temp_ptr++;
+ if (! found_problem && ! check_only) {
+ RExC_parse = (char *) temp_ptr;
+ vFAIL3("POSIX syntax [%c %c] is reserved for future "
+ "extensions", open_char, open_char);
+ }
- /* Try to give a better location for the error (than the end of
- * the string) by looking for the matching ']' */
- RExC_parse = s;
- while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
- RExC_parse++;
+ /* Here, the syntax wasn't completely valid, or else the call
+ * is to check-only */
+ if (updated_parse_ptr) {
+ *updated_parse_ptr = (char *) temp_ptr;
}
- vFAIL2("Unmatched '%c' in POSIX class", c);
+
+ return OOB_NAMEDCLASS;
}
- /* Grandfather lone [:, [=, [. */
- RExC_parse = s;
}
- else {
- const char* const t = RExC_parse++; /* skip over the c */
- assert(*t == c);
-
- if (UCHARAT(RExC_parse) == ']') {
- const char *posixcc = s + 1;
- RExC_parse++; /* skip over the ending ] */
-
- if (*s == ':') {
- const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
- const I32 skip = t - posixcc;
-
- /* Initially switch on the length of the name. */
- switch (skip) {
- case 4:
- if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
- this is the Perl \w
- */
- namedclass = ANYOF_WORDCHAR;
- break;
- case 5:
- /* Names all of length 5. */
- /* alnum alpha ascii blank cntrl digit graph lower
- print punct space upper */
- /* Offset 4 gives the best switch position. */
- switch (posixcc[4]) {
- case 'a':
- if (memEQ(posixcc, "alph", 4)) /* alpha */
- namedclass = ANYOF_ALPHA;
- break;
- case 'e':
- if (memEQ(posixcc, "spac", 4)) /* space */
- namedclass = ANYOF_SPACE;
- break;
- case 'h':
- if (memEQ(posixcc, "grap", 4)) /* graph */
- namedclass = ANYOF_GRAPH;
- break;
- case 'i':
- if (memEQ(posixcc, "asci", 4)) /* ascii */
- namedclass = ANYOF_ASCII;
- break;
- case 'k':
- if (memEQ(posixcc, "blan", 4)) /* blank */
- namedclass = ANYOF_BLANK;
- break;
- case 'l':
- if (memEQ(posixcc, "cntr", 4)) /* cntrl */
- namedclass = ANYOF_CNTRL;
- break;
- case 'm':
- if (memEQ(posixcc, "alnu", 4)) /* alnum */
- namedclass = ANYOF_ALPHANUMERIC;
- break;
- case 'r':
- if (memEQ(posixcc, "lowe", 4)) /* lower */
- namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
- else if (memEQ(posixcc, "uppe", 4)) /* upper */
- namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
- break;
- case 't':
- if (memEQ(posixcc, "digi", 4)) /* digit */
- namedclass = ANYOF_DIGIT;
- else if (memEQ(posixcc, "prin", 4)) /* print */
- namedclass = ANYOF_PRINT;
- else if (memEQ(posixcc, "punc", 4)) /* punct */
- namedclass = ANYOF_PUNCT;
- break;
- }
- break;
- case 6:
- if (memEQ(posixcc, "xdigit", 6))
- namedclass = ANYOF_XDIGIT;
- break;
- }
- if (namedclass == OOB_NAMEDCLASS)
- vFAIL2utf8f(
- "POSIX class [:%"UTF8f":] unknown",
- UTF8fARG(UTF, t - s - 1, s + 1));
+ /* If we find something that started out to look like one of these
+ * constructs, but isn't, we continue below so that it can be checked
+ * for being a class name with a typo of '.' or '=' instead of a colon.
+ * */
+ }
- /* The #defines are structured so each complement is +1 to
- * the normal one */
- if (complement) {
- namedclass++;
- }
- assert (posixcc[skip] == ':');
- assert (posixcc[skip+1] == ']');
- } else if (!SIZE_ONLY) {
- /* [[=foo=]] and [[.foo.]] are still future. */
-
- /* adjust RExC_parse so the warning shows after
- the class closes */
- while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
- RExC_parse++;
- vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
- }
- } else {
- /* Maternal grandfather:
- * "[:" ending in ":" but not in ":]" */
- if (strict) {
- vFAIL("Unmatched '[' in POSIX class");
- }
+ /* Here, we think there is a possibility that a [: :] class was meant, and
+ * we have the first real character. It could be they think the '^' comes
+ * first */
+ if (*p == '^') {
+ found_problem = TRUE;
+ ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
+ complement = 1;
+ p++;
- /* Grandfather lone [:, [=, [. */
- RExC_parse = s;
- }
- }
+ if (isBLANK(*p)) {
+ found_problem = TRUE;
+
+ do {
+ p++;
+ } while (p < e && isBLANK(*p));
+
+ ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
+ }
}
- return namedclass;
-}
+ /* But the first character should be a colon, which they could have easily
+ * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
+ * distinguish from a colon, so treat that as a colon). */
+ if (*p == ':') {
+ p++;
+ has_opening_colon = TRUE;
+ }
+ else if (*p == ';') {
+ found_problem = TRUE;
+ p++;
+ ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
+ has_opening_colon = TRUE;
+ }
+ else {
+ found_problem = TRUE;
+ ADD_POSIX_WARNING(p, "there must be a starting ':'");
-STATIC bool
-S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
-{
- /* This applies some heuristics at the current parse position (which should
- * be at a '[') to see if what follows might be intended to be a [:posix:]
- * class. It returns true if it really is a posix class, of course, but it
- * also can return true if it thinks that what was intended was a posix
- * class that didn't quite make it.
- *
- * It will return true for
- * [:alphanumerics:
- * [:alphanumerics] (as long as the ] isn't followed immediately by a
- * ')' indicating the end of the (?[
- * [:any garbage including %^&$ punctuation:]
- *
- * This is designed to be called only from S_handle_regex_sets; it could be
- * easily adapted to be called from the spot at the beginning of regclass()
- * that checks to see in a normal bracketed class if the surrounding []
- * have been omitted ([:word:] instead of [[:word:]]). But doing so would
- * change long-standing behavior, so I (khw) didn't do that */
- char* p = RExC_parse + 1;
- char first_char = *p;
+ /* Consider an initial punctuation (not one of the recognized ones) to
+ * be a left terminator */
+ if (*p != '^' && *p != ']' && isPUNCT(*p)) {
+ p++;
+ }
+ }
- PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
+ /* They may think that you can put spaces between the components */
+ if (isBLANK(*p)) {
+ found_problem = TRUE;
- assert(*(p - 1) == '[');
+ do {
+ p++;
+ } while (p < e && isBLANK(*p));
- if (! POSIXCC(first_char)) {
- return FALSE;
+ ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
}
- p++;
- while (p < RExC_end && isWORDCHAR(*p)) p++;
+ if (*p == '^') {
- if (p >= RExC_end) {
- return FALSE;
+ /* We consider something like [^:^alnum:]] to not have been intended to
+ * be a posix class, but XXX maybe we should */
+ if (complement) {
+ return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ }
+
+ complement = 1;
+ p++;
}
- if (p - RExC_parse > 2 /* Got at least 1 word character */
- && (*p == first_char
- || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
- {
- return TRUE;
+ /* Again, they may think that you can put spaces between the components */
+ if (isBLANK(*p)) {
+ found_problem = TRUE;
+
+ do {
+ p++;
+ } while (p < e && isBLANK(*p));
+
+ ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
}
- p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
+ if (*p == ']') {
+
+ /* XXX This ']' may be a typo, and something else was meant. But
+ * treating it as such creates enough complications, that that
+ * possibility isn't currently considered here. So we assume that the
+ * ']' is what is intended, and if we've already found an initial '[',
+ * this leaves this construct looking like [:] or [:^], which almost
+ * certainly weren't intended to be posix classes */
+ if (has_opening_bracket) {
+ return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ }
+
+ /* But this function can be called when we parse the colon for
+ * something like qr/[alpha:]]/, so we back up to look for the
+ * beginning */
+ p--;
+
+ if (*p == ';') {
+ found_problem = TRUE;
+ ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
+ }
+ else if (*p != ':') {
+
+ /* XXX We are currently very restrictive here, so this code doesn't
+ * consider the possibility that, say, /[alpha.]]/ was intended to
+ * be a posix class. */
+ return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ }
+
+ /* Here we have something like 'foo:]'. There was no initial colon,
+ * and we back up over 'foo. XXX Unlike the going forward case, we
+ * don't handle typos of non-word chars in the middle */
+ has_opening_colon = FALSE;
+ p--;
+
+ while (p > RExC_start && isWORDCHAR(*p)) {
+ p--;
+ }
+ p++;
+
+ /* Here, we have positioned ourselves to where we think the first
+ * character in the potential class is */
+ }
+
+ /* Now the interior really starts. There are certain key characters that
+ * can end the interior, or these could just be typos. To catch both
+ * cases, we may have to do two passes. In the first pass, we keep on
+ * going unless we come to a sequence that matches
+ * qr/ [[:punct:]] [[:blank:]]* \] /xa
+ * This means it takes a sequence to end the pass, so two typos in a row if
+ * that wasn't what was intended. If the class is perfectly formed, just
+ * this one pass is needed. We also stop if there are too many characters
+ * being accumulated, but this number is deliberately set higher than any
+ * real class. It is set high enough so that someone who thinks that
+ * 'alphanumeric' is a correct name would get warned that it wasn't.
+ * While doing the pass, we keep track of where the key characters were in
+ * it. If we don't find an end to the class, and one of the key characters
+ * was found, we redo the pass, but stop when we get to that character.
+ * Thus the key character was considered a typo in the first pass, but a
+ * terminator in the second. If two key characters are found, we stop at
+ * the second one in the first pass. Again this can miss two typos, but
+ * catches a single one
+ *
+ * In the first pass, 'possible_end' starts as NULL, and then gets set to
+ * point to the first key character. For the second pass, it starts as -1.
+ * */
+
+ name_start = p;
+ parse_name:
+ {
+ bool has_blank = FALSE;
+ bool has_upper = FALSE;
+ bool has_terminating_colon = FALSE;
+ bool has_terminating_bracket = FALSE;
+ bool has_semi_colon = FALSE;
+ unsigned int name_len = 0;
+ int punct_count = 0;
+
+ while (p < e) {
+
+ /* Squeeze out blanks when looking up the class name below */
+ if (isBLANK(*p) ) {
+ has_blank = TRUE;
+ found_problem = TRUE;
+ p++;
+ continue;
+ }
+
+ /* The name will end with a punctuation */
+ if (isPUNCT(*p)) {
+ const char * peek = p + 1;
+
+ /* Treat any non-']' punctuation followed by a ']' (possibly
+ * with intervening blanks) as trying to terminate the class.
+ * ']]' is very likely to mean a class was intended (but
+ * missing the colon), but the warning message that gets
+ * generated shows the error position better if we exit the
+ * loop at the bottom (eventually), so skip it here. */
+ if (*p != ']') {
+ if (peek < e && isBLANK(*peek)) {
+ has_blank = TRUE;
+ found_problem = TRUE;
+ do {
+ peek++;
+ } while (peek < e && isBLANK(*peek));
+ }
+
+ if (peek < e && *peek == ']') {
+ has_terminating_bracket = TRUE;
+ if (*p == ':') {
+ has_terminating_colon = TRUE;
+ }
+ else if (*p == ';') {
+ has_semi_colon = TRUE;
+ has_terminating_colon = TRUE;
+ }
+ else {
+ found_problem = TRUE;
+ }
+ p = peek + 1;
+ goto try_posix;
+ }
+ }
+
+ /* Here we have punctuation we thought didn't end the class.
+ * Keep track of the position of the key characters that are
+ * more likely to have been class-enders */
+ if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
+
+ /* Allow just one such possible class-ender not actually
+ * ending the class. */
+ if (possible_end) {
+ break;
+ }
+ possible_end = p;
+ }
+
+ /* If we have too many punctuation characters, no use in
+ * keeping going */
+ if (++punct_count > max_distance) {
+ break;
+ }
+
+ /* Treat the punctuation as a typo. */
+ input_text[name_len++] = *p;
+ p++;
+ }
+ else if (isUPPER(*p)) { /* Use lowercase for lookup */
+ input_text[name_len++] = toLOWER(*p);
+ has_upper = TRUE;
+ found_problem = TRUE;
+ p++;
+ } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
+ input_text[name_len++] = *p;
+ p++;
+ }
+ else {
+ input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
+ p+= UTF8SKIP(p);
+ }
+
+ /* The declaration of 'input_text' is how long we allow a potential
+ * class name to be, before saying they didn't mean a class name at
+ * all */
+ if (name_len >= C_ARRAY_LENGTH(input_text)) {
+ break;
+ }
+ }
+
+ /* We get to here when the possible class name hasn't been properly
+ * terminated before:
+ * 1) we ran off the end of the pattern; or
+ * 2) found two characters, each of which might have been intended to
+ * be the name's terminator
+ * 3) found so many punctuation characters in the purported name,
+ * that the edit distance to a valid one is exceeded
+ * 4) we decided it was more characters than anyone could have
+ * intended to be one. */
+
+ found_problem = TRUE;
+
+ /* In the final two cases, we know that looking up what we've
+ * accumulated won't lead to a match, even a fuzzy one. */
+ if ( name_len >= C_ARRAY_LENGTH(input_text)
+ || punct_count > max_distance)
+ {
+ /* If there was an intermediate key character that could have been
+ * an intended end, redo the parse, but stop there */
+ if (possible_end && possible_end != (char *) -1) {
+ possible_end = (char *) -1; /* Special signal value to say
+ we've done a first pass */
+ p = name_start;
+ goto parse_name;
+ }
+
+ /* Otherwise, it can't have meant to have been a class */
+ return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ }
+
+ /* If we ran off the end, and the final character was a punctuation
+ * one, back up one, to look at that final one just below. Later, we
+ * will restore the parse pointer if appropriate */
+ if (name_len && p == e && isPUNCT(*(p-1))) {
+ p--;
+ name_len--;
+ }
+
+ if (p < e && isPUNCT(*p)) {
+ if (*p == ']') {
+ has_terminating_bracket = TRUE;
+
+ /* If this is a 2nd ']', and the first one is just below this
+ * one, consider that to be the real terminator. This gives a
+ * uniform and better positioning for the warning message */
+ if ( possible_end
+ && possible_end != (char *) -1
+ && *possible_end == ']'
+ && name_len && input_text[name_len - 1] == ']')
+ {
+ name_len--;
+ p = possible_end;
+
+ /* And this is actually equivalent to having done the 2nd
+ * pass now, so set it to not try again */
+ possible_end = (char *) -1;
+ }
+ }
+ else {
+ if (*p == ':') {
+ has_terminating_colon = TRUE;
+ }
+ else if (*p == ';') {
+ has_semi_colon = TRUE;
+ has_terminating_colon = TRUE;
+ }
+ p++;
+ }
+ }
+
+ try_posix:
+
+ /* Here, we have a class name to look up. We can short circuit the
+ * stuff below for short names that can't possibly be meant to be a
+ * class name. (We can do this on the first pass, as any second pass
+ * will yield an even shorter name) */
+ if (name_len < 3) {
+ return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ }
+
+ /* Find which class it is. Initially switch on the length of the name.
+ * */
+ switch (name_len) {
+ case 4:
+ if (memEQ(name_start, "word", 4)) {
+ /* this is not POSIX, this is the Perl \w */
+ class_number = ANYOF_WORDCHAR;
+ }
+ break;
+ case 5:
+ /* Names all of length 5: alnum alpha ascii blank cntrl digit
+ * graph lower print punct space upper
+ * Offset 4 gives the best switch position. */
+ switch (name_start[4]) {
+ case 'a':
+ if (memEQ(name_start, "alph", 4)) /* alpha */
+ class_number = ANYOF_ALPHA;
+ break;
+ case 'e':
+ if (memEQ(name_start, "spac", 4)) /* space */
+ class_number = ANYOF_SPACE;
+ break;
+ case 'h':
+ if (memEQ(name_start, "grap", 4)) /* graph */
+ class_number = ANYOF_GRAPH;
+ break;
+ case 'i':
+ if (memEQ(name_start, "asci", 4)) /* ascii */
+ class_number = ANYOF_ASCII;
+ break;
+ case 'k':
+ if (memEQ(name_start, "blan", 4)) /* blank */
+ class_number = ANYOF_BLANK;
+ break;
+ case 'l':
+ if (memEQ(name_start, "cntr", 4)) /* cntrl */
+ class_number = ANYOF_CNTRL;
+ break;
+ case 'm':
+ if (memEQ(name_start, "alnu", 4)) /* alnum */
+ class_number = ANYOF_ALPHANUMERIC;
+ break;
+ case 'r':
+ if (memEQ(name_start, "lowe", 4)) /* lower */
+ class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
+ else if (memEQ(name_start, "uppe", 4)) /* upper */
+ class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
+ break;
+ case 't':
+ if (memEQ(name_start, "digi", 4)) /* digit */
+ class_number = ANYOF_DIGIT;
+ else if (memEQ(name_start, "prin", 4)) /* print */
+ class_number = ANYOF_PRINT;
+ else if (memEQ(name_start, "punc", 4)) /* punct */
+ class_number = ANYOF_PUNCT;
+ break;
+ }
+ break;
+ case 6:
+ if (memEQ(name_start, "xdigit", 6))
+ class_number = ANYOF_XDIGIT;
+ break;
+ }
+
+ /* If the name exactly matches a posix class name the class number will
+ * here be set to it, and the input almost certainly was meant to be a
+ * posix class, so we can skip further checking. If instead the syntax
+ * is exactly correct, but the name isn't one of the legal ones, we
+ * will return that as an error below. But if neither of these apply,
+ * it could be that no posix class was intended at all, or that one
+ * was, but there was a typo. We tease these apart by doing fuzzy
+ * matching on the name */
+ if (class_number == OOB_NAMEDCLASS && found_problem) {
+ const UV posix_names[][6] = {
+ { 'a', 'l', 'n', 'u', 'm' },
+ { 'a', 'l', 'p', 'h', 'a' },
+ { 'a', 's', 'c', 'i', 'i' },
+ { 'b', 'l', 'a', 'n', 'k' },
+ { 'c', 'n', 't', 'r', 'l' },
+ { 'd', 'i', 'g', 'i', 't' },
+ { 'g', 'r', 'a', 'p', 'h' },
+ { 'l', 'o', 'w', 'e', 'r' },
+ { 'p', 'r', 'i', 'n', 't' },
+ { 'p', 'u', 'n', 'c', 't' },
+ { 's', 'p', 'a', 'c', 'e' },
+ { 'u', 'p', 'p', 'e', 'r' },
+ { 'w', 'o', 'r', 'd' },
+ { 'x', 'd', 'i', 'g', 'i', 't' }
+ };
+ /* The names of the above all have added NULs to make them the same
+ * size, so we need to also have the real lengths */
+ const UV posix_name_lengths[] = {
+ sizeof("alnum") - 1,
+ sizeof("alpha") - 1,
+ sizeof("ascii") - 1,
+ sizeof("blank") - 1,
+ sizeof("cntrl") - 1,
+ sizeof("digit") - 1,
+ sizeof("graph") - 1,
+ sizeof("lower") - 1,
+ sizeof("print") - 1,
+ sizeof("punct") - 1,
+ sizeof("space") - 1,
+ sizeof("upper") - 1,
+ sizeof("word") - 1,
+ sizeof("xdigit")- 1
+ };
+ unsigned int i;
+ int temp_max = max_distance; /* Use a temporary, so if we
+ reparse, we haven't changed the
+ outer one */
+
+ /* Use a smaller max edit distance if we are missing one of the
+ * delimiters */
+ if ( has_opening_bracket + has_opening_colon < 2
+ || has_terminating_bracket + has_terminating_colon < 2)
+ {
+ temp_max--;
+ }
+
+ /* See if the input name is close to a legal one */
+ for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
+
+ /* Short circuit call if the lengths are too far apart to be
+ * able to match */
+ if (abs( (int) (name_len - posix_name_lengths[i]))
+ > temp_max)
+ {
+ continue;
+ }
+
+ if (edit_distance(input_text,
+ posix_names[i],
+ name_len,
+ posix_name_lengths[i],
+ temp_max
+ )
+ > -1)
+ { /* If it is close, it probably was intended to be a class */
+ goto probably_meant_to_be;
+ }
+ }
+
+ /* Here the input name is not close enough to a valid class name
+ * for us to consider it to be intended to be a posix class. If
+ * we haven't already done so, and the parse found a character that
+ * could have been terminators for the name, but which we absorbed
+ * as typos during the first pass, repeat the parse, signalling it
+ * to stop at that character */
+ if (possible_end && possible_end != (char *) -1) {
+ possible_end = (char *) -1;
+ p = name_start;
+ goto parse_name;
+ }
+
+ /* Here neither pass found a close-enough class name */
+ return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ }
+
+ probably_meant_to_be:
+
+ /* Here we think that a posix specification was intended. Update any
+ * parse pointer */
+ if (updated_parse_ptr) {
+ *updated_parse_ptr = (char *) p;
+ }
+
+ /* If a posix class name was intended but incorrectly specified, we
+ * output or return the warnings */
+ if (found_problem) {
+
+ /* We set flags for these issues in the parse loop above instead of
+ * adding them to the list of warnings, because we can parse it
+ * twice, and we only want one warning instance */
+ if (has_upper) {
+ ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
+ }
+ if (has_blank) {
+ ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
+ }
+ if (has_semi_colon) {
+ ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
+ }
+ else if (! has_terminating_colon) {
+ ADD_POSIX_WARNING(p, "there is no terminating ':'");
+ }
+ if (! has_terminating_bracket) {
+ ADD_POSIX_WARNING(p, "there is no terminating ']'");
+ }
+
+ if (warn_text) {
+ if (posix_warnings) {
+ /* mortalize to avoid a leak with FATAL warnings */
+ *posix_warnings = (AV *) sv_2mortal((SV *) warn_text);
+ }
+ else {
+ SvREFCNT_dec_NN(warn_text);
+ }
+ }
+ }
+ else if (class_number != OOB_NAMEDCLASS) {
+ /* If it is a known class, return the class. The class number
+ * #defines are structured so each complement is +1 to the normal
+ * one */
+ return class_number + complement;
+ }
+ else if (! check_only) {
+
+ /* Here, it is an unrecognized class. This is an error (unless the
+ * call is to check only, which we've already handled above) */
+ const char * const complement_string = (complement)
+ ? "^"
+ : "";
+ RExC_parse = (char *) p;
+ vFAIL3utf8f("POSIX class [:%s%"UTF8f":] unknown",
+ complement_string,
+ UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
+ }
+ }
- return (p
- && p - RExC_parse > 2 /* [:] evaluates to colon;
- [::] is a bad posix class. */
- && first_char == *(p - 1));
+ return OOB_NAMEDCLASS;
}
+#undef ADD_POSIX_WARNING
STATIC unsigned int
S_regex_set_precedence(const U8 my_operator) {
const bool save_fold = FOLD; /* Temporary */
char *save_end, *save_parse; /* Temporaries */
const bool in_locale = LOC; /* we turn off /l during processing */
+ AV* posix_warnings = NULL;
GET_RE_DEBUG_FLAGS_DECL;
default:
break;
case '\\':
- /* Skip the next byte (which could cause us to end up in
- * the middle of a UTF-8 character, but since none of those
- * are confusable with anything we currently handle in this
- * switch (invariants all), it's safe. We'll just hit the
- * default: case next time and keep on incrementing until
- * we find one of the invariants we do handle. */
+ /* Skip past this, so the next character gets skipped, after
+ * the switch */
RExC_parse++;
if (*RExC_parse == 'c') {
/* Skip the \cX notation for control characters */
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
}
break;
+
case '[':
{
- /* If this looks like it is a [:posix:] class, leave the
- * parse pointer at the '[' to fool regclass() into
- * thinking it is part of a '[[:posix:]]'. That function
- * will use strict checking to force a syntax error if it
- * doesn't work out to a legitimate class */
- bool is_posix_class
- = could_it_be_a_POSIX_class(pRExC_state);
+ /* See if this is a [:posix:] class. */
+ bool is_posix_class = (OOB_NAMEDCLASS
+ < handle_possible_posix(pRExC_state,
+ RExC_parse + 1,
+ NULL,
+ NULL,
+ TRUE /* checking only */));
+ /* If it is a posix class, leave the parse pointer at the
+ * '[' to fool regclass() into thinking it is part of a
+ * '[[:posix:]]'. */
if (! is_posix_class) {
RExC_parse++;
}
TRUE, /* silence non-portable warnings. */
TRUE, /* strict */
FALSE, /* Require return to be an ANYOF */
- ¤t
+ ¤t,
+ &posix_warnings
))
FAIL2("panic: regclass returned NULL to handle_sets, "
"flags=%#"UVxf"", (UV) *flagp);
case ']':
if (depth--) break;
RExC_parse++;
- if (RExC_parse < RExC_end
- && *RExC_parse == ')')
- {
+ if (*RExC_parse == ')') {
node = reganode(pRExC_state, ANYOF, 0);
RExC_size += ANYOF_SKIP;
nextchar(pRExC_state);
}
no_close:
+ /* We output the messages even if warnings are off, because we'll fail
+ * the very next thing, and these give a likely diagnosis for that */
+ if (posix_warnings && av_tindex(posix_warnings) >= 0) {
+ output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
+ }
+
FAIL("Syntax error in (?[...])");
}
Perl_ck_warner_d(aTHX_
packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
"The regex_sets feature is experimental" REPORT_LOCATION,
- UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
- UTF8fARG(UTF,
- RExC_end - RExC_start - (RExC_parse - RExC_precomp),
- RExC_precomp + (RExC_parse - RExC_precomp)));
+ REPORT_LOCATION_ARGS(RExC_parse));
/* Everything in this construct is a metacharacter. Operands begin with
* either a '\' (for an escape sequence), or a '[' for a bracketed
case '(':
- if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?'))
+ if ( RExC_parse < RExC_end - 1
+ && (UCHARAT(RExC_parse + 1) == '?'))
{
/* If is a '(?', could be an embedded '(?flags:(?[...])'.
* This happens when we have some thing like
* inversion list, and RExC_parse points to the trailing
* ']'; the next character should be the ')' */
RExC_parse++;
- assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
+ assert(UCHARAT(RExC_parse) == ')');
/* Then the ')' matching the original '(' handled by this
* case: statement */
RExC_parse++;
- assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
+ assert(UCHARAT(RExC_parse) == ')');
RExC_parse++;
RExC_flags = save_flags;
FALSE, /* don't silence non-portable warnings. */
TRUE, /* strict */
FALSE, /* Require return to be an ANYOF */
- ¤t))
+ ¤t,
+ NULL))
{
FAIL2("panic: regclass returned NULL to handle_sets, "
"flags=%#"UVxf"", (UV) *flagp);
case '[': /* Is a bracketed character class */
{
- bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
-
+ /* See if this is a [:posix:] class. */
+ bool is_posix_class = (OOB_NAMEDCLASS
+ < handle_possible_posix(pRExC_state,
+ RExC_parse + 1,
+ NULL,
+ NULL,
+ TRUE /* checking only */));
+ /* If it is a posix class, leave the parse pointer at the '['
+ * to fool regclass() into thinking it is part of a
+ * '[[:posix:]]'. */
if (! is_posix_class) {
RExC_parse++;
}
/* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
* multi-char folds are allowed. */
- if(!regclass(pRExC_state, flagp,depth+1,
- is_posix_class, /* parse the whole char class
- only if not a posix class */
- FALSE, /* don't allow multi-char folds */
- FALSE, /* don't silence non-portable warnings. */
- TRUE, /* strict */
- FALSE, /* Require return to be an ANYOF */
- ¤t
- ))
+ if (!regclass(pRExC_state, flagp,depth+1,
+ is_posix_class, /* parse the whole char
+ class only if not a
+ posix class */
+ FALSE, /* don't allow multi-char folds */
+ TRUE, /* silence non-portable warnings. */
+ TRUE, /* strict */
+ FALSE, /* Require return to be an ANYOF */
+ ¤t,
+ NULL
+ ))
{
FAIL2("panic: regclass returned NULL to handle_sets, "
"flags=%#"UVxf"", (UV) *flagp);
}
lhs = av_pop(stack);
- assert(IS_OPERAND(lhs));
+
+ if (! IS_OPERAND(lhs)) {
+
+ /* This can happen when there is an empty (), like in
+ * /(?[[0]+()+])/ */
+ goto bad_syntax;
+ }
switch (stacked_operator) {
case '&':
av_push(stack, rhs);
goto redo_curchar;
- case '!': /* Highest priority, right associative, so just push
- onto stack */
- av_push(stack, newSVuv(curchar));
+ case '!': /* Highest priority, right associative */
+
+ /* If what's already at the top of the stack is another '!",
+ * they just cancel each other out */
+ if ( (top_ptr = av_fetch(stack, top_index, FALSE))
+ && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
+ {
+ only_to_avoid_leaks = av_pop(stack);
+ SvREFCNT_dec(only_to_avoid_leaks);
+ }
+ else { /* Otherwise, since it's right associative, just push
+ onto the stack */
+ av_push(stack, newSVuv(curchar));
+ }
break;
default:
they're valid on this machine */
FALSE, /* similarly, no need for strict */
FALSE, /* Require return to be an ANYOF */
+ NULL,
NULL
);
if (!node)
assert(OP(node) == ANYOF);
OP(node) = ANYOFL;
- ANYOF_FLAGS(node) |= ANYOF_LOC_REQ_UTF8;
+ ANYOF_FLAGS(node)
+ |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
}
if (save_fold) {
}
}
+STATIC void
+S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings)
+{
+ /* If the final parameter is NULL, output the elements of the array given
+ * by '*posix_warnings' as REGEXP warnings. Otherwise, the elements are
+ * pushed onto it, (creating if necessary) */
+
+ SV * msg;
+ const bool first_is_fatal = ! return_posix_warnings
+ && ckDEAD(packWARN(WARN_REGEXP));
+
+ PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS;
+
+ while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
+ if (return_posix_warnings) {
+ if (! *return_posix_warnings) { /* mortalize to not leak if
+ warnings are fatal */
+ *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV());
+ }
+ av_push(*return_posix_warnings, msg);
+ }
+ else {
+ if (first_is_fatal) { /* Avoid leaking this */
+ av_undef(posix_warnings); /* This isn't necessary if the
+ array is mortal, but is a
+ fail-safe */
+ (void) sv_2mortal(msg);
+ if (PASS2) {
+ SAVEFREESV(RExC_rx_sv);
+ }
+ }
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
+ SvREFCNT_dec_NN(msg);
+ }
+ }
+}
+
STATIC AV *
S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
{
#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
STMT_START { \
if (do_skip) { \
- while ( p < RExC_end \
- && isBLANK_A(UCHARAT(p))) \
+ while (isBLANK_A(UCHARAT(p))) \
{ \
p++; \
} \
const bool strict,
bool optimizable, /* ? Allow a non-ANYOF return
node */
- SV** ret_invlist /* Return an inversion list, not a node */
+ SV** ret_invlist, /* Return an inversion list, not a node */
+ AV** return_posix_warnings
)
{
/* parse a bracketed class specification. Most of these will produce an
UV value = OOB_UNICODE, save_value = OOB_UNICODE;
regnode *ret;
STRLEN numlen;
- IV namedclass = OOB_NAMEDCLASS;
+ int namedclass = OOB_NAMEDCLASS;
char *rangebegin = NULL;
bool need_class = 0;
SV *listsv = NULL;
bool has_user_defined_property = FALSE;
/* inversion list of code points this node matches only when the target
- * string is in UTF-8. (Because is under /d) */
- SV* depends_list = NULL;
+ * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
+ * /d) */
+ SV* has_upper_latin1_only_utf8_matches = NULL;
/* Inversion list of code points this node matches regardless of things
* like locale, folding, utf8ness of the target string */
const char * orig_parse = RExC_parse;
const SSize_t orig_size = RExC_size;
bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
+
+ /* This variable is used to mark where the end in the input is of something
+ * that looks like a POSIX construct but isn't. During the parse, when
+ * something looks like it could be such a construct is encountered, it is
+ * checked for being one, but not if we've already checked this area of the
+ * input. Only after this position is reached do we check again */
+ char *not_posix_region_end = RExC_parse - 1;
+
+ AV* posix_warnings = NULL;
+ const bool do_posix_warnings = return_posix_warnings
+ || (PASS2 && ckWARN(WARN_REGEXP));
+
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGCLASS;
ret = reganode(pRExC_state,
(LOC)
? ANYOFL
- : (DEPENDS_SEMANTICS)
- ? ANYOFD
- : ANYOF,
+ : ANYOF,
0);
if (SIZE_ONLY) {
SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
- if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
+ assert(RExC_parse <= RExC_end);
+
+ if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
RExC_parse++;
invert = TRUE;
allow_multi_folds = FALSE;
}
/* Check that they didn't say [:posix:] instead of [[:posix:]] */
- if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
- const char *s = RExC_parse;
- const char c = *s++;
-
- if (*s == '^') {
- s++;
+ if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
+ int maybe_class = handle_possible_posix(pRExC_state,
+ RExC_parse,
+ ¬_posix_region_end,
+ NULL,
+ TRUE /* checking only */);
+ if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
+ SAVEFREESV(RExC_rx_sv);
+ ckWARN4reg(not_posix_region_end,
+ "POSIX syntax [%c %c] belongs inside character classes%s",
+ *RExC_parse, *RExC_parse,
+ (maybe_class == OOB_NAMEDCLASS)
+ ? ((POSIXCC_NOTYET(*RExC_parse))
+ ? " (but this one isn't implemented)"
+ : " (but this one isn't fully valid)")
+ : ""
+ );
+ (void)ReREFCNT_inc(RExC_rx_sv);
}
- while (isWORDCHAR(*s))
- s++;
- if (*s && c == *s && s[1] == ']') {
- SAVEFREESV(RExC_rx_sv);
- ckWARN3reg(s+2,
- "POSIX syntax [%c %c] belongs inside character classes",
- c, c);
- (void)ReREFCNT_inc(RExC_rx_sv);
- }
}
/* If the caller wants us to just parse a single element, accomplish this
goto charclassloop;
while (1) {
+
+ if ( posix_warnings
+ && av_tindex(posix_warnings) >= 0
+ && RExC_parse > not_posix_region_end)
+ {
+ /* Warnings about posix class issues are considered tentative until
+ * we are far enough along in the parse that we can no longer
+ * change our mind, at which point we either output them or add
+ * them, if it has so specified, to what gets returned to the
+ * caller. This is done each time through the loop so that a later
+ * class won't zap them before they have been dealt with. */
+ output_or_return_posix_warnings(pRExC_state, posix_warnings,
+ return_posix_warnings);
+ }
+
if (RExC_parse >= stop_ptr) {
break;
}
element_count++;
non_portable_endpoint = 0;
}
- if (UTF) {
+ if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
&numlen, UTF8_ALLOW_DEFAULT);
else
value = UCHARAT(RExC_parse++);
- if (value == '['
- && RExC_parse < RExC_end
- && POSIXCC(UCHARAT(RExC_parse)))
- {
- namedclass = regpposixcc(pRExC_state, value, strict);
- }
- else if (value == '\\') {
+ if (value == '[') {
+ char * posix_class_end;
+ namedclass = handle_possible_posix(pRExC_state,
+ RExC_parse,
+ &posix_class_end,
+ do_posix_warnings ? &posix_warnings : NULL,
+ FALSE /* die if error */);
+ if (namedclass > OOB_NAMEDCLASS) {
+
+ /* If there was an earlier attempt to parse this particular
+ * posix class, and it failed, it was a false alarm, as this
+ * successful one proves */
+ if ( posix_warnings
+ && av_tindex(posix_warnings) >= 0
+ && not_posix_region_end >= RExC_parse
+ && not_posix_region_end <= posix_class_end)
+ {
+ av_undef(posix_warnings);
+ }
+
+ RExC_parse = posix_class_end;
+ }
+ else if (namedclass == OOB_NAMEDCLASS) {
+ not_posix_region_end = posix_class_end;
+ }
+ else {
+ namedclass = OOB_NAMEDCLASS;
+ }
+ }
+ else if ( RExC_parse - 1 > not_posix_region_end
+ && MAYBE_POSIXCC(value))
+ {
+ (void) handle_possible_posix(
+ pRExC_state,
+ RExC_parse - 1, /* -1 because parse has already been
+ advanced */
+ ¬_posix_region_end,
+ do_posix_warnings ? &posix_warnings : NULL,
+ TRUE /* checking only */);
+ }
+ else if (value == '\\') {
/* Is a backslash; get the code point of the char after it */
+
+ if (RExC_parse >= RExC_end) {
+ vFAIL("Unmatched [");
+ }
+
if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
&value, /* Yes single value */
&cp_count, /* Multiple code pt count */
flagp,
+ strict,
depth)
) {
vFAIL("\\N in a character class must be a named character: \\N{...}");
}
else if (cp_count == 0) {
- if (strict) {
- RExC_parse++; /* Position after the "}" */
- vFAIL("Zero length \\N{}");
- }
- else if (PASS2) {
+ if (PASS2) {
ckWARNreg(RExC_parse,
"Ignoring zero length \\N{} in character class");
}
|_CORE_SWASH_INIT_ACCEPT_INVLIST;
if (RExC_parse >= RExC_end)
- vFAIL2("Empty \\%c{}", (U8)value);
+ vFAIL2("Empty \\%c", (U8)value);
if (*RExC_parse == '{') {
const U8 c = (U8)value;
e = strchr(RExC_parse, '}');
SV* invlist;
char* name;
char* base_name; /* name after any packages are stripped */
+ char* lookup_name = NULL;
const char * const colon_colon = "::";
/* Try to get the definition of the property into
* will have its name be <__NAME_i>. The design is
* discussed in commit
* 2f833f5208e26b208886e51e09e2c072b5eabb46 */
- name = savepv(Perl_form(aTHX_
- "%s%.*s%s\n",
- (FOLD) ? "__" : "",
- (int)n,
- RExC_parse,
- (FOLD) ? "_i" : ""
- ));
+ name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
+ SAVEFREEPV(name);
+ if (FOLD) {
+ lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
+ }
/* Look up the property name, and get its swash and
* inversion list, if the property is found */
- if (swash) { /* Return any left-overs */
- SvREFCNT_dec_NN(swash);
- }
- swash = _core_swash_init("utf8", name, &PL_sv_undef,
+ SvREFCNT_dec(swash); /* Free any left-overs */
+ swash = _core_swash_init("utf8",
+ (lookup_name)
+ ? lookup_name
+ : name,
+ &PL_sv_undef,
1, /* binary */
0, /* not tr/// */
NULL, /* No inversion list */
&swash_init_flags
);
+ if (lookup_name) {
+ Safefree(lookup_name);
+ }
if (! swash || ! (invlist = _get_swash_invlist(swash))) {
HV* curpkg = (IN_PERL_COMPILETIME)
? PL_curstash
pkgname,
name);
n = strlen(full_name);
- Safefree(name);
name = savepvn(full_name, n);
+ SAVEFREEPV(name);
}
}
- Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
+ Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%"UTF8f"%s\n",
(value == 'p' ? '+' : '!'),
- UTF8fARG(UTF, n, name));
+ (FOLD) ? "__" : "",
+ UTF8fARG(UTF, n, name),
+ (FOLD) ? "_i" : "");
has_user_defined_property = TRUE;
optimizable = FALSE; /* Will have to leave this an
ANYOF node */
- /* We don't know yet, so have to assume that the
- * property could match something in the upper Latin1
- * range, hence something that isn't utf8. Note that
- * this would cause things in <depends_list> to match
- * inappropriately, except that any \p{}, including
- * this one forces Unicode semantics, which means there
- * is no <depends_list> */
- ANYOF_FLAGS(ret)
- |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
+ /* We don't know yet what this matches, so have to flag
+ * it */
+ ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
}
else {
_invlist_union(properties, invlist, &properties);
}
}
- Safefree(name);
}
RExC_parse = e + 1;
namedclass = ANYOF_UNIPROP; /* no official name, but it's
recode_encoding:
if (! RExC_override_recoding) {
SV* enc = _get_encoding();
- value = reg_recode((const char)(U8)value, &enc);
+ value = reg_recode((U8)value, &enc);
if (!enc) {
if (strict) {
vFAIL("Invalid escape in the specified encoding");
"\"%.*s\" is more clearly written simply as \"%s\"",
(int) (RExC_parse - rangebegin),
rangebegin,
- cntrl_to_mnemonic((char) value)
+ cntrl_to_mnemonic((U8) value)
);
}
}
range = 0; /* this range (if it was one) is done now */
} /* End of loop through all the text within the brackets */
+
+ if ( posix_warnings && av_tindex(posix_warnings) >= 0) {
+ output_or_return_posix_warnings(pRExC_state, posix_warnings,
+ return_posix_warnings);
+ }
+
/* If anything in the class expands to more than one character, we have to
* deal with them by building up a substitute parse string, and recursively
* calling reg() on it, instead of proceeding */
STRLEN len;
char *save_end = RExC_end;
char *save_parse = RExC_parse;
+ char *save_start = RExC_start;
+ STRLEN prefix_end = 0; /* We copy the character class after a
+ prefix supplied here. This is the size
+ + 1 of that prefix */
bool first_time = TRUE; /* First multi-char occurrence doesn't get
a "|" */
I32 reg_flags;
assert(! invert);
+ assert(RExC_precomp_adj == 0); /* Only one level of recursion allowed */
+
#if 0 /* Have decided not to deal with multi-char folds in inverted classes,
because too confusing */
if (invert) {
* multi-character folds, have to include it in recursive parsing */
if (element_count) {
sv_catpv(substitute_parse, "|[");
+ prefix_end = SvCUR(substitute_parse);
sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
- sv_catpv(substitute_parse, "]");
+
+ /* Put in a closing ']' only if not going off the end, as otherwise
+ * we are adding something that really isn't there */
+ if (RExC_parse < RExC_end) {
+ sv_catpv(substitute_parse, "]");
+ }
}
sv_catpv(substitute_parse, ")");
}
#endif
- RExC_parse = SvPV(substitute_parse, len);
+ /* Set up the data structure so that any errors will be properly
+ * reported. See the comments at the definition of
+ * REPORT_LOCATION_ARGS for details */
+ RExC_precomp_adj = orig_parse - RExC_precomp;
+ RExC_start = RExC_parse = SvPV(substitute_parse, len);
+ RExC_adjusted_start = RExC_start + prefix_end;
RExC_end = RExC_parse + len;
RExC_in_multi_char_class = 1;
RExC_override_recoding = 1;
*flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
- RExC_parse = save_parse;
+ /* And restore so can parse the rest of the pattern */
+ RExC_parse = save_parse;
+ RExC_start = RExC_adjusted_start = save_start;
+ RExC_precomp_adj = 0;
RExC_end = save_end;
RExC_in_multi_char_class = 0;
RExC_override_recoding = 0;
if (UNLIKELY(posixl_matches_all)) {
op = SANY;
}
- else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
- \w or [:digit:] or \p{foo}
- */
+ else if (namedclass > OOB_NAMEDCLASS) { /* this is a single named
+ class, like \w or [:digit:]
+ or \p{foo} */
/* All named classes are mapped into POSIXish nodes, with its FLAG
* argument giving which class it is */
PL_fold_latin1[j]);
}
else {
- depends_list =
- add_cp_to_invlist(depends_list,
- PL_fold_latin1[j]);
+ has_upper_latin1_only_utf8_matches
+ = add_cp_to_invlist(
+ has_upper_latin1_only_utf8_matches,
+ PL_fold_latin1[j]);
}
}
else {
/* Similarly folds involving non-ascii Latin1
* characters under /d are added to their list */
- depends_list = add_cp_to_invlist(depends_list,
- c);
+ has_upper_latin1_only_utf8_matches
+ = add_cp_to_invlist(
+ has_upper_latin1_only_utf8_matches,
+ c);
}
}
}
cp_list = posixes;
}
- if (depends_list) {
- _invlist_union(depends_list, nonascii_but_latin1_properties,
- &depends_list);
+ if (has_upper_latin1_only_utf8_matches) {
+ _invlist_union(has_upper_latin1_only_utf8_matches,
+ nonascii_but_latin1_properties,
+ &has_upper_latin1_only_utf8_matches);
SvREFCNT_dec_NN(nonascii_but_latin1_properties);
}
else {
- depends_list = nonascii_but_latin1_properties;
+ has_upper_latin1_only_utf8_matches
+ = nonascii_but_latin1_properties;
}
}
}
* class that isn't a Unicode property, and which matches above Unicode, \W
* or [\x{110000}] for example.
* (Note that in this case, unlike the Posix one above, there is no
- * <depends_list>, because having a Unicode property forces Unicode
- * semantics */
+ * <has_upper_latin1_only_utf8_matches>, because having a Unicode property
+ * forces Unicode semantics */
if (properties) {
if (cp_list) {
/* If it matters to the final outcome, see if a non-property
* component of the class matches above Unicode. If so, the
* warning gets suppressed. This is true even if just a single
- * such code point is specified, as though not strictly correct if
+ * such code point is specified, as, though not strictly correct if
* another such code point is matched against, the fact that they
* are using above-Unicode code points indicates they should know
* the issues involved */
* fetching). We know to set the flag if we have a non-NULL list for UTF-8
* locales, or the class matches at least one 0-255 range code point */
if (LOC && FOLD) {
+
+ /* Some things on the list might be unconditionally included because of
+ * other components. Remove them, and clean up the list if it goes to
+ * 0 elements */
+ if (only_utf8_locale_list && cp_list) {
+ _invlist_subtract(only_utf8_locale_list, cp_list,
+ &only_utf8_locale_list);
+
+ if (_invlist_len(only_utf8_locale_list) == 0) {
+ SvREFCNT_dec_NN(only_utf8_locale_list);
+ only_utf8_locale_list = NULL;
+ }
+ }
if (only_utf8_locale_list) {
- ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
+ ANYOF_FLAGS(ret)
+ |= ANYOFL_FOLD
+ |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
}
else if (cp_list) { /* Look to see if a 0-255 code point is in list */
UV start, end;
invlist_iterinit(cp_list);
if (invlist_iternext(cp_list, &start, &end) && start < 256) {
- ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
+ ANYOF_FLAGS(ret) |= ANYOFL_FOLD;
}
invlist_iterfinish(cp_list);
}
}
+#define MATCHES_ALL_NON_UTF8_NON_ASCII(ret) \
+ ( DEPENDS_SEMANTICS \
+ && (ANYOF_FLAGS(ret) \
+ & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
+
+ /* See if we can simplify things under /d */
+ if ( has_upper_latin1_only_utf8_matches
+ || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
+ {
+ /* But not if we are inverting, as that screws it up */
+ if (! invert) {
+ if (has_upper_latin1_only_utf8_matches) {
+ if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
+
+ /* Here, we have both the flag and inversion list. Any
+ * character in 'has_upper_latin1_only_utf8_matches'
+ * matches when UTF-8 is in effect, but it also matches
+ * when UTF-8 is not in effect because of
+ * MATCHES_ALL_NON_UTF8_NON_ASCII. Therefore it matches
+ * unconditionally, so can be added to the regular list,
+ * and 'has_upper_latin1_only_utf8_matches' cleared */
+ _invlist_union(cp_list,
+ has_upper_latin1_only_utf8_matches,
+ &cp_list);
+ SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
+ has_upper_latin1_only_utf8_matches = NULL;
+ }
+ else if (cp_list) {
+
+ /* Here, 'cp_list' gives chars that always match, and
+ * 'has_upper_latin1_only_utf8_matches' gives chars that
+ * were specified to match only if the target string is in
+ * UTF-8. It may be that these overlap, so we can subtract
+ * the unconditionally matching from the conditional ones,
+ * to make the conditional list as small as possible,
+ * perhaps even clearing it, in which case more
+ * optimizations are possible later */
+ _invlist_subtract(has_upper_latin1_only_utf8_matches,
+ cp_list,
+ &has_upper_latin1_only_utf8_matches);
+ if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
+ SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
+ has_upper_latin1_only_utf8_matches = NULL;
+ }
+ }
+ }
+
+ /* Similarly, if the unconditional matches include every upper
+ * latin1 character, we can clear that flag to permit later
+ * optimizations */
+ if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
+ SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
+ _invlist_subtract(only_non_utf8_list, cp_list,
+ &only_non_utf8_list);
+ if (_invlist_len(only_non_utf8_list) == 0) {
+ ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+ }
+ SvREFCNT_dec_NN(only_non_utf8_list);
+ only_non_utf8_list = NULL;;
+ }
+ }
+
+ /* If we haven't gotten rid of all conditional matching, we change the
+ * regnode type to indicate that */
+ if ( has_upper_latin1_only_utf8_matches
+ || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
+ {
+ OP(ret) = ANYOFD;
+ optimizable = FALSE;
+ }
+ }
+#undef MATCHES_ALL_NON_UTF8_NON_ASCII
+
/* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
* at compile time. Besides not inverting folded locale now, we can't
* invert if there are things such as \w, which aren't known until runtime
* */
if (cp_list
&& invert
+ && OP(ret) != ANYOFD
&& ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
- && ! depends_list
&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
{
_invlist_invert(cp_list);
* adjacent such nodes. And if the class is equivalent to things like /./,
* expensive run-time swashes can be avoided. Now that we have more
* complete information, we can find things necessarily missed by the
- * earlier code. I (khw) did some benchmarks and found essentially no
- * speed difference between using a POSIXA node versus an ANYOF node, so
- * there is no reason to optimize, for example [A-Za-z0-9_] into
- * [[:word:]]/a (although if we did it in the sizing pass it would save
- * space). _invlistEQ() could be used if one ever wanted to do something
- * like this at this point in the code */
-
- if (optimizable && cp_list && ! invert && ! depends_list) {
+ * earlier code. Another possible "optimization" that isn't done is that
+ * something like [Ee] could be changed into an EXACTFU. khw tried this
+ * and found that the ANYOF is faster, including for code points not in the
+ * bitmap. This still might make sense to do, provided it got joined with
+ * an adjacent node(s) to create a longer EXACTFU one. This could be
+ * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join
+ * routine would know is joinable. If that didn't happen, the node type
+ * could then be made a straight ANYOF */
+
+ if (optimizable && cp_list && ! invert) {
UV start, end;
U8 op = END; /* The optimzation node-type */
+ int posix_class = -1; /* Illegal value */
const char * cur_parse= RExC_parse;
invlist_iterinit(cp_list);
}
invlist_iterfinish(cp_list);
+ if (op == END) {
+ const UV cp_list_len = _invlist_len(cp_list);
+ const UV* cp_list_array = invlist_array(cp_list);
+
+ /* Here, didn't find an optimization. See if this matches any of
+ * the POSIX classes. These run slightly faster for above-Unicode
+ * code points, so don't bother with POSIXA ones nor the 2 that
+ * have no above-Unicode matches. We can avoid these checks unless
+ * the ANYOF matches at least as high as the lowest POSIX one
+ * (which was manually found to be \v. The actual code point may
+ * increase in later Unicode releases, if a higher code point is
+ * assigned to be \v, but this code will never break. It would
+ * just mean we could execute the checks for posix optimizations
+ * unnecessarily) */
+
+ if (cp_list_array[cp_list_len-1] > 0x2029) {
+ for (posix_class = 0;
+ posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
+ posix_class++)
+ {
+ int try_inverted;
+ if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) {
+ continue;
+ }
+ for (try_inverted = 0; try_inverted < 2; try_inverted++) {
+
+ /* Check if matches normal or inverted */
+ if (_invlistEQ(cp_list,
+ PL_XPosix_ptrs[posix_class],
+ try_inverted))
+ {
+ op = (try_inverted)
+ ? NPOSIXU
+ : POSIXU;
+ *flagp |= HASWIDTH|SIMPLE;
+ goto found_posix;
+ }
+ }
+ }
+ found_posix: ;
+ }
+ }
+
if (op != END) {
RExC_parse = (char *)orig_parse;
RExC_emit = (regnode *)orig_emit;
TRUE /* downgradable to EXACT */
);
}
+ else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
+ FLAGS(ret) = posix_class;
+ }
SvREFCNT_dec_NN(cp_list);
return ret;
/* Here, the bitmap has been populated with all the Latin1 code points that
* always match. Can now add to the overall list those that match only
- * when the target string is UTF-8 (<depends_list>). */
- if (depends_list) {
+ * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>).
+ * */
+ if (has_upper_latin1_only_utf8_matches) {
if (cp_list) {
- _invlist_union(cp_list, depends_list, &cp_list);
- SvREFCNT_dec_NN(depends_list);
+ _invlist_union(cp_list,
+ has_upper_latin1_only_utf8_matches,
+ &cp_list);
+ SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
}
else {
- cp_list = depends_list;
+ cp_list = has_upper_latin1_only_utf8_matches;
}
- ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
+ ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
}
/* If there is a swash and more than one element, we can't use the swash in
if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
assert(! (ANYOF_FLAGS(node)
- & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
- |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
+ & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
}
else {
AV * const av = newAV();
SV *rv;
- assert(ANYOF_FLAGS(node)
- & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
- |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
-
av_store(av, 0, (runtime_defns)
? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
if (swash) {
bool doinit,
SV** listsvp,
SV** only_utf8_locale_ptr,
- SV* exclude_list)
+ SV** output_invlist)
{
/* For internal core use only.
* swash exists, by calling this function with 'doinit' set to false, in
* which case the components that will be used to eventually create the
* swash are returned (in a printable form).
- * If <exclude_list> is not NULL, it is an inversion list of things to
- * exclude from what's returned in <listsvp>.
+ * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
+ * store an inversion list of code points that should match only if the
+ * execution-time locale is a UTF-8 one.
+ * If <output_invlist> is not NULL, it is where this routine is to store an
+ * inversion list of the code points that would be instead returned in
+ * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
+ * when this parameter is used, is just the non-code point data that
+ * will go into creating the swash. This currently should be just
+ * user-defined properties whose definitions were not known at compile
+ * time. Using this parameter allows for easier manipulation of the
+ * swash's data by the caller. It is illegal to call this function with
+ * this parameter set, but not <listsvp>
+ *
* Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
* that, in spite of this function's name, the swash it returns may include
* the bitmap data as well */
SV *sw = NULL;
SV *si = NULL; /* Input swash initialization string */
- SV* invlist = NULL;
+ SV* invlist = NULL;
RXi_GET_DECL(prog,progi);
const struct reg_data * const data = prog ? progi->data : NULL;
PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
-
- assert(ANYOF_FLAGS(node)
- & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
- |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
+ assert(! output_invlist || listsvp);
if (data && data->count) {
const U32 n = ARG(node);
si = *ary; /* ary[0] = the string to initialize the swash with */
- /* Elements 3 and 4 are either both present or both absent. [3] is
- * any inversion list generated at compile time; [4] indicates if
- * that inversion list has any user-defined properties in it. */
if (av_tindex(av) >= 2) {
if (only_utf8_locale_ptr
&& ary[2]
*only_utf8_locale_ptr = NULL;
}
+ /* Elements 3 and 4 are either both present or both absent. [3]
+ * is any inversion list generated at compile time; [4]
+ * indicates if that inversion list has any user-defined
+ * properties in it. */
if (av_tindex(av) >= 3) {
invlist = ary[3];
if (SvUV(ary[4])) {
/* If requested, return a printable version of what this swash matches */
if (listsvp) {
- SV* matches_string = newSVpvs("");
+ SV* matches_string = NULL;
/* The swash should be used, if possible, to get the data, as it
* contains the resolved data. But this function can be called at
if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
&& (si && si != &PL_sv_undef))
{
- sv_catsv(matches_string, si);
+ /* Here, we only have 'si' (and possibly some passed-in data in
+ * 'invlist', which is handled below) If the caller only wants
+ * 'si', use that. */
+ if (! output_invlist) {
+ matches_string = newSVsv(si);
+ }
+ else {
+ /* But if the caller wants an inversion list of the node, we
+ * need to parse 'si' and place as much as possible in the
+ * desired output inversion list, making 'matches_string' only
+ * contain the currently unresolvable things */
+ const char *si_string = SvPVX(si);
+ STRLEN remaining = SvCUR(si);
+ UV prev_cp = 0;
+ U8 count = 0;
+
+ /* Ignore everything before the first new-line */
+ while (*si_string != '\n' && remaining > 0) {
+ si_string++;
+ remaining--;
+ }
+ assert(remaining > 0);
+
+ si_string++;
+ remaining--;
+
+ while (remaining > 0) {
+
+ /* The data consists of just strings defining user-defined
+ * property names, but in prior incarnations, and perhaps
+ * somehow from pluggable regex engines, it could still
+ * hold hex code point definitions. Each component of a
+ * range would be separated by a tab, and each range by a
+ * new-line. If these are found, instead add them to the
+ * inversion list */
+ I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
+ |PERL_SCAN_SILENT_NON_PORTABLE;
+ STRLEN len = remaining;
+ UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
+
+ /* If the hex decode routine found something, it should go
+ * up to the next \n */
+ if ( *(si_string + len) == '\n') {
+ if (count) { /* 2nd code point on line */
+ *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
+ }
+ else {
+ *output_invlist = add_cp_to_invlist(*output_invlist, cp);
+ }
+ count = 0;
+ goto prepare_for_next_iteration;
+ }
+
+ /* If the hex decode was instead for the lower range limit,
+ * save it, and go parse the upper range limit */
+ if (*(si_string + len) == '\t') {
+ assert(count == 0);
+
+ prev_cp = cp;
+ count = 1;
+ prepare_for_next_iteration:
+ si_string += len + 1;
+ remaining -= len + 1;
+ continue;
+ }
+
+ /* Here, didn't find a legal hex number. Just add it from
+ * here to the next \n */
+
+ remaining -= len;
+ while (*(si_string + len) != '\n' && remaining > 0) {
+ remaining--;
+ len++;
+ }
+ if (*(si_string + len) == '\n') {
+ len++;
+ remaining--;
+ }
+ if (matches_string) {
+ sv_catpvn(matches_string, si_string, len - 1);
+ }
+ else {
+ matches_string = newSVpvn(si_string, len - 1);
+ }
+ si_string += len;
+ sv_catpvs(matches_string, " ");
+ } /* end of loop through the text */
+
+ assert(matches_string);
+ if (SvCUR(matches_string)) { /* Get rid of trailing blank */
+ SvCUR_set(matches_string, SvCUR(matches_string) - 1);
+ }
+ } /* end of has an 'si' but no swash */
}
- /* Add the inversion list to whatever we have. This may have come from
- * the swash, or from an input parameter */
- if (invlist) {
- if (exclude_list) {
- SV* clone = invlist_clone(invlist);
- _invlist_subtract(clone, exclude_list, &clone);
- sv_catsv(matches_string, _invlist_contents(clone));
- SvREFCNT_dec_NN(clone);
+ /* If we have a swash in place, its equivalent inversion list was above
+ * placed into 'invlist'. If not, this variable may contain a stored
+ * inversion list which is information beyond what is in 'si' */
+ if (invlist) {
+
+ /* Again, if the caller doesn't want the output inversion list, put
+ * everything in 'matches-string' */
+ if (! output_invlist) {
+ if ( ! matches_string) {
+ matches_string = newSVpvs("\n");
+ }
+ sv_catsv(matches_string, invlist_contents(invlist,
+ TRUE /* traditional style */
+ ));
+ }
+ else if (! *output_invlist) {
+ *output_invlist = invlist_clone(invlist);
}
else {
- sv_catsv(matches_string, _invlist_contents(invlist));
+ _invlist_union(*output_invlist, invlist, output_invlist);
}
- }
+ }
+
*listsvp = matches_string;
}
{
PERL_ARGS_ASSERT_NEXTCHAR;
- assert( ! UTF
- || UTF8_IS_INVARIANT(*RExC_parse)
- || UTF8_IS_START(*RExC_parse));
+ if (RExC_parse < RExC_end) {
+ assert( ! UTF
+ || UTF8_IS_INVARIANT(*RExC_parse)
+ || UTF8_IS_START(*RExC_parse));
- RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
- skip_to_be_ignored_text(pRExC_state, &RExC_parse,
- FALSE /* Don't assume /x */ );
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ FALSE /* Don't assume /x */ );
+ }
}
STATIC regnode *
- regtail - set the next-pointer at the end of a node chain of p to val.
- SEE ALSO: regtail_study
*/
-/* TODO: All three parms should be const */
STATIC void
-S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
- const regnode *val,U32 depth)
+S_regtail(pTHX_ RExC_state_t * pRExC_state,
+ const regnode * const p,
+ const regnode * const val,
+ const U32 depth)
{
regnode *scan;
GET_RE_DEBUG_FLAGS_DECL;
return;
/* Find last node. */
- scan = p;
+ scan = (regnode *) p;
for (;;) {
regnode * const temp = regnext(scan);
DEBUG_PARSE_r({
#endif /* DEBUGGING */
}
+/* Should be synchronized with ANYOF_ #defines in regcomp.h */
+#ifdef DEBUGGING
+
+# if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \
+ || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \
+ || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \
+ || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \
+ || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \
+ || _CC_VERTSPACE != 15
+# error Need to adjust order of anyofs[]
+# endif
+static const char * const anyofs[] = {
+ "\\w",
+ "\\W",
+ "\\d",
+ "\\D",
+ "[:alpha:]",
+ "[:^alpha:]",
+ "[:lower:]",
+ "[:^lower:]",
+ "[:upper:]",
+ "[:^upper:]",
+ "[:punct:]",
+ "[:^punct:]",
+ "[:print:]",
+ "[:^print:]",
+ "[:alnum:]",
+ "[:^alnum:]",
+ "[:graph:]",
+ "[:^graph:]",
+ "[:cased:]",
+ "[:^cased:]",
+ "\\s",
+ "\\S",
+ "[:blank:]",
+ "[:^blank:]",
+ "[:xdigit:]",
+ "[:^xdigit:]",
+ "[:cntrl:]",
+ "[:^cntrl:]",
+ "[:ascii:]",
+ "[:^ascii:]",
+ "\\v",
+ "\\V"
+};
+#endif
+
/*
- regprop - printable representation of opcode, with run time support
*/
{
#ifdef DEBUGGING
int k;
-
- /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
- static const char * const anyofs[] = {
-#if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
- || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
- || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
- || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
- || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
- #error Need to adjust order of anyofs[]
-#endif
- "\\w",
- "\\W",
- "\\d",
- "\\D",
- "[:alpha:]",
- "[:^alpha:]",
- "[:lower:]",
- "[:^lower:]",
- "[:upper:]",
- "[:^upper:]",
- "[:punct:]",
- "[:^punct:]",
- "[:print:]",
- "[:^print:]",
- "[:alnum:]",
- "[:^alnum:]",
- "[:graph:]",
- "[:^graph:]",
- "[:cased:]",
- "[:^cased:]",
- "\\s",
- "\\S",
- "[:blank:]",
- "[:^blank:]",
- "[:xdigit:]",
- "[:^xdigit:]",
- "[:cntrl:]",
- "[:^cntrl:]",
- "[:ascii:]",
- "[:^ascii:]",
- "\\v",
- "\\V"
- };
RXi_GET_DECL(prog,progi);
GET_RE_DEBUG_FLAGS_DECL;
if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
sv_catpvs(sv, "[");
(void) put_charclass_bitmap_innards(sv,
- (IS_ANYOF_TRIE(op))
+ ((IS_ANYOF_TRIE(op))
? ANYOF_BITMAP(o)
- : TRIE_BITMAP(trie),
- NULL);
+ : TRIE_BITMAP(trie)),
+ NULL,
+ NULL,
+ NULL
+ );
sv_catpvs(sv, "]");
}
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
else if (k == ANYOF) {
const U8 flags = ANYOF_FLAGS(o);
- int do_sep = 0;
- SV* bitmap_invlist; /* Will hold what the bit map contains */
+ bool do_sep = FALSE; /* Do we need to separate various components of
+ the output? */
+ /* Set if there is still an unresolved user-defined property */
+ SV *unresolved = NULL;
+
+ /* Things that are ignored except when the runtime locale is UTF-8 */
+ SV *only_utf8_locale_invlist = NULL;
+
+ /* Code points that don't fit in the bitmap */
+ SV *nonbitmap_invlist = NULL;
+ /* And things that aren't in the bitmap, but are small enough to be */
+ SV* bitmap_range_not_in_bitmap = NULL;
if (OP(o) == ANYOFL) {
- if (flags & ANYOF_LOC_REQ_UTF8) {
- sv_catpvs(sv, "{utf8-loc}");
+ if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
+ sv_catpvs(sv, "{utf8-locale-reqd}");
}
- else {
- sv_catpvs(sv, "{loc}");
+ if (flags & ANYOFL_FOLD) {
+ sv_catpvs(sv, "{i}");
}
}
- if (flags & ANYOF_LOC_FOLD)
- sv_catpvs(sv, "{i}");
- Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
- if (flags & ANYOF_INVERT)
- sv_catpvs(sv, "^");
- /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
- * */
- do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
- &bitmap_invlist);
+ /* If there is stuff outside the bitmap, get it */
+ if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
+ (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
+ &unresolved,
+ &only_utf8_locale_invlist,
+ &nonbitmap_invlist);
+ /* The non-bitmap data may contain stuff that could fit in the
+ * bitmap. This could come from a user-defined property being
+ * finally resolved when this call was done; or much more likely
+ * because there are matches that require UTF-8 to be valid, and so
+ * aren't in the bitmap. This is teased apart later */
+ _invlist_intersection(nonbitmap_invlist,
+ PL_InBitmap,
+ &bitmap_range_not_in_bitmap);
+ /* Leave just the things that don't fit into the bitmap */
+ _invlist_subtract(nonbitmap_invlist,
+ PL_InBitmap,
+ &nonbitmap_invlist);
+ }
- /* output any special charclass tests (used entirely under use
- * locale) * */
- if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
- int i;
- for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
- if (ANYOF_POSIXL_TEST(o,i)) {
- sv_catpv(sv, anyofs[i]);
- do_sep = 1;
- }
- }
+ /* Obey this flag to add all above-the-bitmap code points */
+ if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
+ nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
+ NUM_ANYOF_CODE_POINTS,
+ UV_MAX);
}
- if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
- |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
- |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
- |ANYOF_LOC_FOLD)))
- {
+ /* Ready to start outputting. First, the initial left bracket */
+ Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+
+ /* Then all the things that could fit in the bitmap */
+ do_sep = put_charclass_bitmap_innards(sv,
+ ANYOF_BITMAP(o),
+ bitmap_range_not_in_bitmap,
+ only_utf8_locale_invlist,
+ o);
+ SvREFCNT_dec(bitmap_range_not_in_bitmap);
+
+ /* If there are user-defined properties which haven't been defined yet,
+ * output them, in a separate [] from the bitmap range stuff */
+ if (unresolved) {
if (do_sep) {
Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
- if (flags & ANYOF_INVERT)
- /*make sure the invert info is in each */
- sv_catpvs(sv, "^");
}
-
- if (OP(o) == ANYOFD
- && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
- {
- sv_catpvs(sv, "{non-utf8-latin1-all}");
+ if (flags & ANYOF_INVERT) {
+ sv_catpvs(sv, "^");
}
+ sv_catsv(sv, unresolved);
+ do_sep = TRUE;
+ SvREFCNT_dec_NN(unresolved);
+ }
- if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
- sv_catpvs(sv, "{above_bitmap_all}");
-
- if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
- SV *lv; /* Set if there is something outside the bit map. */
- bool byte_output = FALSE; /* If something has been output */
- SV *only_utf8_locale;
-
- /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
- * is used to guarantee that nothing in the bitmap gets
- * returned */
- (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
- &lv, &only_utf8_locale,
- bitmap_invlist);
- if (lv && lv != &PL_sv_undef) {
- char *s = savesvpv(lv);
- char * const origs = s;
-
- while (*s && *s != '\n')
- s++;
-
- if (*s == '\n') {
- const char * const t = ++s;
+ /* And, finally, add the above-the-bitmap stuff */
+ if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
+ SV* contents;
- if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
- sv_catpvs(sv, "{outside bitmap}");
- }
- else {
- sv_catpvs(sv, "{utf8}");
- }
+ /* See if truncation size is overridden */
+ const STRLEN dump_len = (PL_dump_re_max_len)
+ ? PL_dump_re_max_len
+ : 256;
- if (byte_output) {
- sv_catpvs(sv, " ");
- }
-
- while (*s) {
- if (*s == '\n') {
+ /* This is output in a separate [] */
+ if (do_sep) {
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
+ }
- /* Truncate very long output */
- if (s - origs > 256) {
- Perl_sv_catpvf(aTHX_ sv,
- "%.*s...",
- (int) (s - origs - 1),
- t);
- goto out_dump;
- }
- *s = ' ';
- }
- else if (*s == '\t') {
- *s = '-';
- }
- s++;
- }
- if (s[-1] == ' ')
- s[-1] = 0;
+ /* And, for easy of understanding, it is always output not-shown as
+ * complemented */
+ if (flags & ANYOF_INVERT) {
+ _invlist_invert(nonbitmap_invlist);
+ _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
+ }
- sv_catpv(sv, t);
- }
+ contents = invlist_contents(nonbitmap_invlist,
+ FALSE /* output suitable for catsv */
+ );
- out_dump:
+ /* If the output is shorter than the permissible maximum, just do it. */
+ if (SvCUR(contents) <= dump_len) {
+ sv_catsv(sv, contents);
+ }
+ else {
+ const char * contents_string = SvPVX(contents);
+ STRLEN i = dump_len;
- Safefree(origs);
- SvREFCNT_dec_NN(lv);
+ /* Otherwise, start at the permissible max and work back to the
+ * first break possibility */
+ while (i > 0 && contents_string[i] != ' ') {
+ i--;
}
-
- if ((flags & ANYOF_LOC_FOLD)
- && only_utf8_locale
- && only_utf8_locale != &PL_sv_undef)
- {
- UV start, end;
- int max_entries = 256;
-
- sv_catpvs(sv, "{utf8 locale}");
- invlist_iterinit(only_utf8_locale);
- while (invlist_iternext(only_utf8_locale,
- &start, &end)) {
- put_range(sv, start, end, FALSE);
- max_entries --;
- if (max_entries < 0) {
- sv_catpvs(sv, "...");
- break;
- }
- }
- invlist_iterfinish(only_utf8_locale);
+ if (i == 0) { /* Fail-safe. Use the max if we couldn't
+ find a legal break */
+ i = dump_len;
}
+
+ sv_catpvn(sv, contents_string, i);
+ sv_catpvs(sv, "...");
}
- }
- SvREFCNT_dec(bitmap_invlist);
+ SvREFCNT_dec_NN(contents);
+ SvREFCNT_dec_NN(nonbitmap_invlist);
+ }
+ /* And finally the matching, closing ']' */
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
}
else if (k == POSIXD || k == NPOSIXD) {
const char * const bounds[] = {
"", /* Traditional */
"{gcb}",
+ "{lb}",
"{sb}",
"{wb}"
};
+ assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
sv_catpv(sv, bounds[FLAGS(o)]);
}
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
}
else if (isPRINT(c)) {
const char string = (char) c;
- if (isBACKSLASHED_PUNCT(c))
+
+ /* We use {phrase} as metanotation in the class, so also escape literal
+ * braces */
+ if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
sv_catpvs(sv, "\\");
sv_catpvn(sv, &string, 1);
}
+ else if (isMNEMONIC_CNTRL(c)) {
+ Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
+ }
else {
- const char * const mnemonic = cntrl_to_mnemonic((char) c);
- if (mnemonic) {
- Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
- }
- else {
- Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
- }
+ Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
}
}
S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
{
/* Appends to 'sv' a displayable version of the range of code points from
- * 'start' to 'end'. It assumes that only ASCII printables are displayable
- * as-is (though some of these will be escaped by put_code_point()). */
+ * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
+ * that have them, when they occur at the beginning or end of the range.
+ * It uses hex to output the remaining code points, unless 'allow_literals'
+ * is true, in which case the printable ASCII ones are output as-is (though
+ * some of these will be escaped by put_code_point()).
+ *
+ * NOTE: This is designed only for printing ranges of code points that fit
+ * inside an ANYOF bitmap. Higher code points are simply suppressed
+ */
const unsigned int min_range_count = 3;
if (end - start < min_range_count) {
- /* Individual chars in short ranges */
+ /* Output chars individually when they occur in short ranges */
for (; start <= end; start++) {
put_code_point(sv, start);
}
/* If permitted by the input options, and there is a possibility that
* this range contains a printable literal, look to see if there is
- * one. */
+ * one. */
if (allow_literals && start <= MAX_PRINT_A) {
- /* If the range begin isn't an ASCII printable, effectively split
- * the range into two parts:
+ /* If the character at the beginning of the range isn't an ASCII
+ * printable, effectively split the range into two parts:
* 1) the portion before the first such printable,
* 2) the rest
* and output them separately. */
temp_end = end + 1;
}
- /* Output the first part of the split range, the part that
- * doesn't have printables, with no looking for literals
- * (otherwise we would infinitely recurse) */
+ /* Output the first part of the split range: the part that
+ * doesn't have printables, with the parameter set to not look
+ * for literals (otherwise we would infinitely recurse) */
put_range(sv, start, temp_end - 1, FALSE);
/* The 2nd part of the range (if any) starts here. */
start = temp_end;
- /* We continue instead of dropping down because even if the 2nd
- * part is non-empty, it could be so short that we want to
- * output it specially, as tested for at the top of this loop.
- * */
+ /* We do a continue, instead of dropping down, because even if
+ * the 2nd part is non-empty, it could be so short that we want
+ * to output it as individual characters, as tested for at the
+ * top of this loop. */
continue;
}
temp_end--;
}
- /* And separately output the range that doesn't have mnemonics */
+ /* And separately output the interior range that doesn't start or
+ * end with mnemonics */
put_range(sv, start, temp_end, FALSE);
/* Then output the mnemonic trailing controls */
: NUM_ANYOF_CODE_POINTS - 1;
#if NUM_ANYOF_CODE_POINTS > 256
format = (this_end < 256)
- ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
+ ? "\\x%02"UVXf"-\\x%02"UVXf""
: "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
#else
- format = "\\x{%02"UVXf"}-\\x{%02"UVXf"}";
+ format = "\\x%02"UVXf"-\\x%02"UVXf"";
#endif
GCC_DIAG_IGNORE(-Wformat-nonliteral);
Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
}
}
-STATIC bool
-S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
+STATIC void
+S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
{
- /* Appends to 'sv' a displayable version of the innards of the bracketed
- * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
- * output anything, and bitmap_invlist, if not NULL, will point to an
- * inversion list of what is in the bit map */
+ /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
+ * 'invlist' */
- int i;
UV start, end;
- unsigned int punct_count = 0;
- SV* invlist = NULL;
- SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
bool allow_literals = TRUE;
- PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
-
- invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
-
- /* Worst case is exactly every-other code point is in the list */
- *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
-
- /* Convert the bit map to an inversion list, keeping track of how many
- * ASCII puncts are set, including an extra amount for the backslashed
- * ones. */
- for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
- if (BITMAP_TEST(bitmap, i)) {
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
- if (isPUNCT_A(i)) {
- punct_count++;
- if isBACKSLASHED_PUNCT(i) {
- punct_count++;
- }
- }
- }
- }
-
- /* Nothing to output */
- if (_invlist_len(*invlist_ptr) == 0) {
- SvREFCNT_dec(invlist);
- return FALSE;
- }
+ PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
/* Generally, it is more readable if printable characters are output as
* literals, but if a range (nearly) spans all of them, it's best to output
* it as a single range. This code will use a single range if all but 2
- * printables are in it */
- invlist_iterinit(*invlist_ptr);
- while (invlist_iternext(*invlist_ptr, &start, &end)) {
+ * ASCII printables are in it */
+ invlist_iterinit(invlist);
+ while (invlist_iternext(invlist, &start, &end)) {
- /* If range starts beyond final printable, it doesn't have any in it */
+ /* If the range starts beyond the final printable, it doesn't have any
+ * in it */
if (start > MAX_PRINT_A) {
break;
}
break;
}
}
- invlist_iterfinish(*invlist_ptr);
-
- /* The legibility of the output depends mostly on how many punctuation
- * characters are output. There are 32 possible ASCII ones, and some have
- * an additional backslash, bringing it to currently 36, so if any more
- * than 18 are to be output, we can instead output it as its complement,
- * yielding fewer puncts, and making it more legible. But give some weight
- * to the fact that outputting it as a complement is less legible than a
- * straight output, so don't complement unless we are somewhat over the 18
- * mark */
- if (allow_literals && punct_count > 22) {
- sv_catpvs(sv, "^");
-
- /* Add everything remaining to the list, so when we invert it just
- * below, it will be excluded */
- _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
- _invlist_invert(*invlist_ptr);
- }
+ invlist_iterfinish(invlist);
/* Here we have figured things out. Output each range */
- invlist_iterinit(*invlist_ptr);
- while (invlist_iternext(*invlist_ptr, &start, &end)) {
+ invlist_iterinit(invlist);
+ while (invlist_iternext(invlist, &start, &end)) {
if (start >= NUM_ANYOF_CODE_POINTS) {
break;
}
put_range(sv, start, end, allow_literals);
}
- invlist_iterfinish(*invlist_ptr);
+ invlist_iterfinish(invlist);
- return TRUE;
+ return;
+}
+
+STATIC SV*
+S_put_charclass_bitmap_innards_common(pTHX_
+ SV* invlist, /* The bitmap */
+ SV* posixes, /* Under /l, things like [:word:], \S */
+ SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
+ SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
+ SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
+ const bool invert /* Is the result to be inverted? */
+)
+{
+ /* Create and return an SV containing a displayable version of the bitmap
+ * and associated information determined by the input parameters. */
+
+ SV * output;
+
+ PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
+
+ if (invert) {
+ output = newSVpvs("^");
+ }
+ else {
+ output = newSVpvs("");
+ }
+
+ /* First, the code points in the bitmap that are unconditionally there */
+ put_charclass_bitmap_innards_invlist(output, invlist);
+
+ /* Traditionally, these have been placed after the main code points */
+ if (posixes) {
+ sv_catsv(output, posixes);
+ }
+
+ if (only_utf8 && _invlist_len(only_utf8)) {
+ Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
+ put_charclass_bitmap_innards_invlist(output, only_utf8);
+ }
+
+ if (not_utf8 && _invlist_len(not_utf8)) {
+ Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
+ put_charclass_bitmap_innards_invlist(output, not_utf8);
+ }
+
+ if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
+ Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
+ put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
+
+ /* This is the only list in this routine that can legally contain code
+ * points outside the bitmap range. The call just above to
+ * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
+ * output them here. There's about a half-dozen possible, and none in
+ * contiguous ranges longer than 2 */
+ if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
+ UV start, end;
+ SV* above_bitmap = NULL;
+
+ _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
+
+ invlist_iterinit(above_bitmap);
+ while (invlist_iternext(above_bitmap, &start, &end)) {
+ UV i;
+
+ for (i = start; i <= end; i++) {
+ put_code_point(output, i);
+ }
+ }
+ invlist_iterfinish(above_bitmap);
+ SvREFCNT_dec_NN(above_bitmap);
+ }
+ }
+
+ /* If the only thing we output is the '^', clear it */
+ if (invert && SvCUR(output) == 1) {
+ SvCUR_set(output, 0);
+ }
+
+ return output;
+}
+
+STATIC bool
+S_put_charclass_bitmap_innards(pTHX_ SV *sv,
+ char *bitmap,
+ SV *nonbitmap_invlist,
+ SV *only_utf8_locale_invlist,
+ const regnode * const node)
+{
+ /* Appends to 'sv' a displayable version of the innards of the bracketed
+ * character class defined by the other arguments:
+ * 'bitmap' points to the bitmap.
+ * 'nonbitmap_invlist' is an inversion list of the code points that are in
+ * the bitmap range, but for some reason aren't in the bitmap; NULL if
+ * none. The reasons for this could be that they require some
+ * condition such as the target string being or not being in UTF-8
+ * (under /d), or because they came from a user-defined property that
+ * was not resolved at the time of the regex compilation (under /u)
+ * 'only_utf8_locale_invlist' is an inversion list of the code points that
+ * are valid only if the runtime locale is a UTF-8 one; NULL if none
+ * 'node' is the regex pattern node. It is needed only when the above two
+ * parameters are not null, and is passed so that this routine can
+ * tease apart the various reasons for them.
+ *
+ * It returns TRUE if there was actually something output. (It may be that
+ * the bitmap, etc is empty.)
+ *
+ * When called for outputting the bitmap of a non-ANYOF node, just pass the
+ * bitmap, with the succeeding parameters set to NULL.
+ *
+ */
+
+ /* In general, it tries to display the 'cleanest' representation of the
+ * innards, choosing whether to display them inverted or not, regardless of
+ * whether the class itself is to be inverted. However, there are some
+ * cases where it can't try inverting, as what actually matches isn't known
+ * until runtime, and hence the inversion isn't either. */
+ bool inverting_allowed = TRUE;
+
+ int i;
+ STRLEN orig_sv_cur = SvCUR(sv);
+
+ SV* invlist; /* Inversion list we accumulate of code points that
+ are unconditionally matched */
+ SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
+ UTF-8 */
+ SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
+ */
+ SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
+ SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
+ is UTF-8 */
+
+ SV* as_is_display; /* The output string when we take the inputs
+ literally */
+ SV* inverted_display; /* The output string when we invert the inputs */
+
+ U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
+
+ bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
+ to match? */
+ /* We are biased in favor of displaying things without them being inverted,
+ * as that is generally easier to understand */
+ const int bias = 5;
+
+ PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
+
+ /* Start off with whatever code points are passed in. (We clone, so we
+ * don't change the caller's list) */
+ if (nonbitmap_invlist) {
+ assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
+ invlist = invlist_clone(nonbitmap_invlist);
+ }
+ else { /* Worst case size is every other code point is matched */
+ invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
+ }
+
+ if (flags) {
+ if (OP(node) == ANYOFD) {
+
+ /* This flag indicates that the code points below 0x100 in the
+ * nonbitmap list are precisely the ones that match only when the
+ * target is UTF-8 (they should all be non-ASCII). */
+ if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
+ {
+ _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
+ _invlist_subtract(invlist, only_utf8, &invlist);
+ }
+
+ /* And this flag for matching all non-ASCII 0xFF and below */
+ if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
+ {
+ if (invert) {
+ not_utf8 = _new_invlist(0);
+ }
+ else {
+ not_utf8 = invlist_clone(PL_UpperLatin1);
+ }
+ inverting_allowed = FALSE; /* XXX needs more work to be able
+ to allow this */
+ }
+ }
+ else if (OP(node) == ANYOFL) {
+
+ /* If either of these flags are set, what matches isn't
+ * determinable except during execution, so don't know enough here
+ * to invert */
+ if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
+ inverting_allowed = FALSE;
+ }
+
+ /* What the posix classes match also varies at runtime, so these
+ * will be output symbolically. */
+ if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
+ int i;
+
+ posixes = newSVpvs("");
+ for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
+ if (ANYOF_POSIXL_TEST(node,i)) {
+ sv_catpv(posixes, anyofs[i]);
+ }
+ }
+ }
+ }
+ }
+
+ /* Accumulate the bit map into the unconditional match list */
+ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
+ if (BITMAP_TEST(bitmap, i)) {
+ int start = i++;
+ for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) {
+ /* empty */
+ }
+ invlist = _add_range_to_invlist(invlist, start, i-1);
+ }
+ }
+
+ /* Make sure that the conditional match lists don't have anything in them
+ * that match unconditionally; otherwise the output is quite confusing.
+ * This could happen if the code that populates these misses some
+ * duplication. */
+ if (only_utf8) {
+ _invlist_subtract(only_utf8, invlist, &only_utf8);
+ }
+ if (not_utf8) {
+ _invlist_subtract(not_utf8, invlist, ¬_utf8);
+ }
+
+ if (only_utf8_locale_invlist) {
+
+ /* Since this list is passed in, we have to make a copy before
+ * modifying it */
+ only_utf8_locale = invlist_clone(only_utf8_locale_invlist);
+
+ _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
+
+ /* And, it can get really weird for us to try outputting an inverted
+ * form of this list when it has things above the bitmap, so don't even
+ * try */
+ if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
+ inverting_allowed = FALSE;
+ }
+ }
+
+ /* Calculate what the output would be if we take the input as-is */
+ as_is_display = put_charclass_bitmap_innards_common(invlist,
+ posixes,
+ only_utf8,
+ not_utf8,
+ only_utf8_locale,
+ invert);
+
+ /* If have to take the output as-is, just do that */
+ if (! inverting_allowed) {
+ sv_catsv(sv, as_is_display);
+ }
+ else { /* But otherwise, create the output again on the inverted input, and
+ use whichever version is shorter */
+
+ int inverted_bias, as_is_bias;
+
+ /* We will apply our bias to whichever of the the results doesn't have
+ * the '^' */
+ if (invert) {
+ invert = FALSE;
+ as_is_bias = bias;
+ inverted_bias = 0;
+ }
+ else {
+ invert = TRUE;
+ as_is_bias = 0;
+ inverted_bias = bias;
+ }
+
+ /* Now invert each of the lists that contribute to the output,
+ * excluding from the result things outside the possible range */
+
+ /* For the unconditional inversion list, we have to add in all the
+ * conditional code points, so that when inverted, they will be gone
+ * from it */
+ _invlist_union(only_utf8, invlist, &invlist);
+ _invlist_union(only_utf8_locale, invlist, &invlist);
+ _invlist_invert(invlist);
+ _invlist_intersection(invlist, PL_InBitmap, &invlist);
+
+ if (only_utf8) {
+ _invlist_invert(only_utf8);
+ _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
+ }
+
+ if (not_utf8) {
+ _invlist_invert(not_utf8);
+ _invlist_intersection(not_utf8, PL_UpperLatin1, ¬_utf8);
+ }
+
+ if (only_utf8_locale) {
+ _invlist_invert(only_utf8_locale);
+ _invlist_intersection(only_utf8_locale,
+ PL_InBitmap,
+ &only_utf8_locale);
+ }
+
+ inverted_display = put_charclass_bitmap_innards_common(
+ invlist,
+ posixes,
+ only_utf8,
+ not_utf8,
+ only_utf8_locale, invert);
+
+ /* Use the shortest representation, taking into account our bias
+ * against showing it inverted */
+ if (SvCUR(inverted_display) + inverted_bias
+ < SvCUR(as_is_display) + as_is_bias)
+ {
+ sv_catsv(sv, inverted_display);
+ }
+ else {
+ sv_catsv(sv, as_is_display);
+ }
+
+ SvREFCNT_dec_NN(as_is_display);
+ SvREFCNT_dec_NN(inverted_display);
+ }
+
+ SvREFCNT_dec_NN(invlist);
+ SvREFCNT_dec(only_utf8);
+ SvREFCNT_dec(not_utf8);
+ SvREFCNT_dec(posixes);
+ SvREFCNT_dec(only_utf8_locale);
+
+ return SvCUR(sv) > orig_sv_cur;
}
#define CLEAR_OPTSTART \