struct scan_frame *next_frame; /* next frame */
} scan_frame;
+/* Certain characters are output as a sequence with the first being a
+ * backslash. */
+#define isBACKSLASHED_PUNCT(c) \
+ ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
+
+
struct RExC_state_t {
U32 flags; /* RXf_* are we folding, multilining? */
U32 pm_flags; /* PMf_* stuff from the calling PMOP */
REPORT_LOCATION_ARGS(offset)); \
} STMT_END
+#define vWARN(loc, m) STMT_START { \
+ const IV offset = loc - RExC_precomp; \
+ __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
+ REPORT_LOCATION_ARGS(offset)); \
+} STMT_END
+
#define vWARN_dep(loc, m) STMT_START { \
const IV offset = loc - RExC_precomp; \
__ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
PerlIO_printf(Perl_debug_log,"\n"); \
});
-#ifdef DEBUGGING
-
/* is c a control character for which we have a mnemonic? */
#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
return NULL;
}
-#endif
-
/* Mark that we cannot extend a found fixed substring at this point.
Update the longest found anchored substring and the longest found
floating substrings if needed. */
SAVEFREEPV(RExC_recurse);
}
-reStudy:
+ reStudy:
r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
DEBUG_r(
RExC_study_chunk_recursed_count= 0;
ret = reganode(pRExC_state,NGROUPP,num);
goto insert_if_check_paren;
}
- else if (strnEQ(RExC_parse, "DEFINE",
- MIN(DEFINE_len, RExC_end - RExC_parse)))
+ else if (RExC_end - RExC_parse >= DEFINE_len
+ && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
{
ret = reganode(pRExC_state,DEFINEP,0);
RExC_parse += DEFINE_len;
PERL_ARGS_ASSERT_REGATOM;
-tryagain:
+ tryagain:
switch ((U8)*RExC_parse) {
case '^':
RExC_seen_zerolen++;
FALSE, /* means parse the whole char class */
TRUE, /* allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
- RExC_strict,
+ (bool) RExC_strict,
NULL);
if (*RExC_parse != ']') {
RExC_parse = oregcomp_parse;
arg = ANYOF_WORDCHAR;
goto join_posix;
- case 'b':
+ case 'B':
RExC_seen_zerolen++;
RExC_seen |= REG_LOOKBEHIND_SEEN;
- op = BOUND + get_regex_charset(RExC_flags);
- if (op > BOUNDA) { /* /aa is same as /a */
- op = BOUNDA;
+ op = NBOUND + get_regex_charset(RExC_flags);
+ if (op > NBOUNDA) { /* /aa is same as /a */
+ op = NBOUNDA;
}
- else if (op == BOUNDL) {
+ else if (op == NBOUNDL) {
RExC_contains_locale = 1;
}
ret = reg_node(pRExC_state, op);
- FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
if ((U8) *(RExC_parse + 1) == '{') {
/* diag_listed_as: Use "%s" instead of "%s" */
- vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
+ vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
}
goto finish_meta_pat;
- case 'B':
+
+ case 'b':
RExC_seen_zerolen++;
RExC_seen |= REG_LOOKBEHIND_SEEN;
- op = NBOUND + get_regex_charset(RExC_flags);
- if (op > NBOUNDA) { /* /aa is same as /a */
- op = NBOUNDA;
+ op = BOUND + get_regex_charset(RExC_flags);
+ if (op > BOUNDA) { /* /aa is same as /a */
+ op = BOUNDA;
}
- else if (op == NBOUNDL) {
+ else if (op == BOUNDL) {
RExC_contains_locale = 1;
}
ret = reg_node(pRExC_state, op);
- FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
if ((U8) *(RExC_parse + 1) == '{') {
/* diag_listed_as: Use "%s" instead of "%s" */
- vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
+ vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
}
goto finish_meta_pat;
FALSE, /* don't silence non-portable warnings.
It would be a bug if these returned
non-portables */
- RExC_strict,
+ (bool) RExC_strict,
NULL);
/* regclass() can only return RESTART_UTF8 if multi-char folds
are allowed. */
&result,
&error_msg,
PASS2, /* out warnings */
- RExC_strict,
+ (bool) RExC_strict,
TRUE, /* Output warnings
for non-
portables */
&result,
&error_msg,
PASS2, /* out warnings */
- RExC_strict,
+ (bool) RExC_strict,
TRUE, /* Silence warnings
for non-
portables */
separate for a while from the non-complemented
versions because of complications with /d
matching */
+ SV* simple_posixes = NULL; /* But under some conditions, the classes can be
+ treated more simply than the general case,
+ leading to less compilation and execution
+ work */
UV element_count = 0; /* Number of distinct elements in the class.
Optimizations may be possible if this is tiny */
AV * multi_char_matches = NULL; /* Code points that fold to more than one
* runtime locale is UTF-8 */
SV* only_utf8_locale_list = NULL;
-#ifdef EBCDIC
- /* In a range, counts how many 0-2 of the ends of it came from literals,
- * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
- UV literal_endpoint = 0;
+ /* In a range, if one of the endpoints is non-character-set portable,
+ * meaning that it hard-codes a code point that may mean a different
+ * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
+ * mnemonic '\t' which each mean the same character no matter which
+ * character set the platform is on. */
+ unsigned int non_portable_endpoint = 0;
/* Is the range unicode? which means on a platform that isn't 1-1 native
* to Unicode (i.e. non-ASCII), each code point in it should be considered
* to be a Unicode value. */
bool unicode_range = FALSE;
-#endif
bool invert = FALSE; /* Is this class to be complemented */
bool warn_super = ALWAYS_WARN_SUPER;
if (!range) {
rangebegin = RExC_parse;
element_count++;
-#ifdef EBCDIC
- literal_endpoint = 0;
-#endif
+ non_portable_endpoint = 0;
}
if (UTF) {
value = utf8n_to_uvchr((U8*)RExC_parse,
{
namedclass = regpposixcc(pRExC_state, value, strict);
}
- else if (value != '\\') {
-#ifdef EBCDIC
- literal_endpoint++;
-#endif
- }
- else {
+ else if (value == '\\') {
/* Is a backslash; get the code point of the char after it */
if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
value = utf8n_to_uvchr((U8*)RExC_parse,
prevvalue = save_prevvalue;
continue; /* Back to top of loop to get next char */
}
+
/* Here, is a single code point, and <value> contains it */
-#ifdef EBCDIC
- /* We consider named characters to be literal characters,
- * and they are Unicode */
- literal_endpoint++;
- unicode_range = TRUE;
-#endif
+ unicode_range = TRUE; /* \N{} are Unicode */
}
break;
case 'p':
vFAIL(error_msg);
}
}
+ non_portable_endpoint++;
if (IN_ENCODING && value < 0x100) {
goto recode_encoding;
}
vFAIL(error_msg);
}
}
+ non_portable_endpoint++;
if (IN_ENCODING && value < 0x100)
goto recode_encoding;
break;
case 'c':
value = grok_bslash_c(*RExC_parse++, PASS2);
+ non_portable_endpoint++;
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7':
(void)ReREFCNT_inc(RExC_rx_sv);
}
}
+ non_portable_endpoint++;
if (IN_ENCODING && value < 0x100)
goto recode_encoding;
break;
&cp_list);
}
}
- else { /* Garden variety class. If is NASCII, NDIGIT, ...
+ else if (UNI_SEMANTICS
+ || classnum == _CC_ASCII
+ || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
+ || classnum == _CC_XDIGIT)))
+ {
+ /* We usually have to worry about /d and /a affecting what
+ * POSIX classes match, with special code needed for /d
+ * because we won't know until runtime what all matches.
+ * But there is no extra work needed under /u, and
+ * [:ascii:] is unaffected by /a and /d; and :digit: and
+ * :xdigit: don't have runtime differences under /d. So we
+ * can special case these, and avoid some extra work below,
+ * and at runtime. */
+ _invlist_union_maybe_complement_2nd(
+ simple_posixes,
+ PL_XPosix_ptrs[classnum],
+ namedclass % 2 != 0,
+ &simple_posixes);
+ }
+ else { /* Garden variety class. If is NUPPER, NALPHA, ...
complement and use nposixes */
SV** posixes_ptr = namedclass % 2 == 0
? &posixes
: &nposixes;
- SV** source_ptr = &PL_XPosix_ptrs[classnum];
_invlist_union_maybe_complement_2nd(
*posixes_ptr,
- *source_ptr,
+ PL_XPosix_ptrs[classnum],
namedclass % 2 != 0,
posixes_ptr);
}
if (range) {
#ifdef EBCDIC
/* For unicode ranges, we have to test that the Unicode as opposed
- * to the native values are not decreasing. (Above 255, and there
- * is no difference between native and Unicode) */
+ * to the native values are not decreasing. (Above 255, there is
+ * no difference between native and Unicode) */
if (unicode_range && prevvalue < 255 && value < 255) {
if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
goto backwards_range;
}
}
+ if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
+ if (range) {
+
+ /* If the range starts above 255, everything is portable and
+ * likely to be so for any forseeable character set, so don't
+ * warn. */
+ if (unicode_range && non_portable_endpoint && prevvalue < 256) {
+ vWARN(RExC_parse, "Both or neither range ends should be Unicode");
+ }
+ else if (prevvalue != value) {
+
+ /* Under strict, ranges that stop and/or end in an ASCII
+ * printable should have each end point be a portable value
+ * for it (preferably like 'A', but we don't warn if it is
+ * a (portable) Unicode name or code point), and the range
+ * must be be all digits or all letters of the same case.
+ * Otherwise, the range is non-portable and unclear as to
+ * what it contains */
+ if ((isPRINT_A(prevvalue) || isPRINT_A(value))
+ && (non_portable_endpoint
+ || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
+ || (isLOWER_A(prevvalue) && isLOWER_A(value))
+ || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
+ {
+ vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
+ }
+ else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
+
+ /* But the nature of Unicode and languages mean we
+ * can't do the same checks for above-ASCII ranges,
+ * except in the case of digit ones. These should
+ * contain only digits from the same group of 10. The
+ * ASCII case is handled just above. 0x660 is the
+ * first digit character beyond ASCII. Hence here, the
+ * range could be a range of digits. Find out. */
+ IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
+ prevvalue);
+ IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
+ value);
+
+ /* If the range start and final points are in the same
+ * inversion list element, it means that either both
+ * are not digits, or both are digits in a consecutive
+ * sequence of digits. (So far, Unicode has kept all
+ * such sequences as distinct groups of 10, but assert
+ * to make sure). If the end points are not in the
+ * same element, neither should be a digit. */
+ if (index_start == index_final) {
+ assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
+ || invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
+ - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
+ == 10);
+ }
+ else if ((index_start >= 0
+ && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
+ || (index_final >= 0
+ && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
+ {
+ vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
+ }
+ }
+ }
+ }
+ if ((! range || prevvalue == value) && non_portable_endpoint) {
+ if (isPRINT_A(value)) {
+ char literal[3];
+ unsigned d = 0;
+ if (isBACKSLASHED_PUNCT(value)) {
+ literal[d++] = '\\';
+ }
+ literal[d++] = (char) value;
+ literal[d++] = '\0';
+
+ vWARN4(RExC_parse,
+ "\"%.*s\" is more clearly written simply as \"%s\"",
+ (int) (RExC_parse - rangebegin),
+ rangebegin,
+ literal
+ );
+ }
+ else if isMNEMONIC_CNTRL(value) {
+ vWARN4(RExC_parse,
+ "\"%.*s\" is more clearly written simply as \"%s\"",
+ (int) (RExC_parse - rangebegin),
+ rangebegin,
+ cntrl_to_mnemonic((char) value)
+ );
+ }
+ }
+ }
+
/* Deal with this element of the class */
if (! SIZE_ONLY) {
+
#ifndef EBCDIC
cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
prevvalue, value);
if ((UNLIKELY(prevvalue == 0) && value >= 255)
|| ! (prevvalue < 256
&& (unicode_range
- || (literal_endpoint == 2
+ || (! non_portable_endpoint
&& ((isLOWER_A(prevvalue) && isLOWER_A(value))
|| (isUPPER_A(prevvalue)
&& isUPPER_A(value)))))))
op = POSIXA;
}
}
- else if (prevvalue == 'A') {
- if (value == 'Z'
+ else if (AT_LEAST_ASCII_RESTRICTED || ! FOLD) {
+ /* We can optimize A-Z or a-z, but not if they could match
+ * something like the KELVIN SIGN under /i (/a means they
+ * can't) */
+ if (prevvalue == 'A') {
+ if (value == 'Z'
#ifdef EBCDIC
- && literal_endpoint == 2
+ && ! non_portable_end_point
#endif
- ) {
- arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
- op = POSIXA;
+ ) {
+ arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
+ op = POSIXA;
+ }
}
- }
- else if (prevvalue == 'a') {
- if (value == 'z'
+ else if (prevvalue == 'a') {
+ if (value == 'z'
#ifdef EBCDIC
- && literal_endpoint == 2
+ && ! non_portable_end_point
#endif
- ) {
- arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
- op = POSIXA;
+ ) {
+ arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
+ op = POSIXA;
+ }
}
}
}
SvREFCNT_dec(posixes);
SvREFCNT_dec(nposixes);
+ SvREFCNT_dec(simple_posixes);
SvREFCNT_dec(cp_list);
SvREFCNT_dec(cp_foldable_list);
return ret;
* classes. The lists are kept separate up to now because we don't want to
* fold the classes (folding of those is automatically handled by the swash
* fetching code) */
+ if (simple_posixes) {
+ _invlist_union(cp_list, simple_posixes, &cp_list);
+ SvREFCNT_dec_NN(simple_posixes);
+ }
if (posixes || nposixes) {
if (posixes && AT_LEAST_ASCII_RESTRICTED) {
/* Under /a and /aa, nothing above ASCII matches these */
sv_catpvs(sv, "{non-utf8-latin1-all}");
}
+ if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
+ sv_catpvs(sv, "{above_bitmap_all}");
+
if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
SV *lv; /* Set if there is something outside the bit map. */
- bool byte_output = FALSE; /* If something in the bitmap has
- been output */
+ bool byte_output = FALSE; /* If something has been output */
SV *only_utf8_locale;
/* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
invlist_iterfinish(only_utf8_locale);
}
}
-
- if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
- sv_catpvs(sv, "{above_bitmap_all}");
}
SvREFCNT_dec(bitmap_invlist);
}
#ifdef DEBUGGING
-/* Certain characters are output as a sequence with the first being a
- * backslash. */
-#define isBACKSLASHED_PUNCT(c) \
- ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
STATIC void
S_put_code_point(pTHX_ SV *sv, UV c)