unsigned int i;
const U32 n = ARG(node);
bool new_node_has_latin1 = FALSE;
- const U8 flags = OP(node) == ANYOFHb ? 0 : ANYOF_FLAGS(node);
+ const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHb))
+ ? 0
+ : ANYOF_FLAGS(node);
PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
* another SSC or a regular ANYOF class. Can create false positives. */
SV* anded_cp_list;
- U8 and_with_flags = (OP(and_with) == ANYOFHb) ? 0 : ANYOF_FLAGS(and_with);
+ U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHb)
+ ? 0
+ : ANYOF_FLAGS(and_with);
U8 anded_flags;
PERL_ARGS_ASSERT_SSC_AND;
SV* ored_cp_list;
U8 ored_flags;
- U8 or_with_flags = (OP(or_with) == ANYOFHb) ? 0 : ANYOF_FLAGS(or_with);
+ U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHb)
+ ? 0
+ : ANYOF_FLAGS(or_with);
PERL_ARGS_ASSERT_SSC_OR;
&& ! upper_latin1_only_utf8_matches
&& anyof_flags == 0)
{
+ U8 low_utf8[UTF8_MAXBYTES+1];
UV highest_cp = invlist_highest(cp_list);
- /* If the lowest and highest code point in the class have the same
- * UTF-8 first byte, then all do, and we can store that byte for
- * regexec.c to use so that it can more quickly scan the target
- * string for potential matches for this class. We co-opt the
- * flags field for this, and make the node ANYOFb. We do accept
- * here very large code points (for future use), but don't do
- * this optimization for them, as it would cause other
- * complications */
op = ANYOFH;
+
+ /* Currently the maximum code point allowed by the system is
+ * IV_MAX. Higher ones are reserved for future internal use. This
+ * particular regnode can be used for higher ones, but we can't
+ * calculate the code point of those. IV_MAX suffices though, as
+ * its UTF-8 representation has a large first byte */
+ (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
+
+ /* We store the lowest possible first byte of the UTF-8
+ * representation, using the flags field. This allows for quick
+ * ruling out of some inputs without having to convert from UTF-8
+ * to code point. For EBCDIC, this has to be I8. */
+ anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
+
+ /* If the lowest and highest code point in the class have the same
+ * UTF-8 first byte, then all have that byte, and we can get an
+ * exact first byte instead of a minimum. We signal this with a
+ * different regnode */
if (highest_cp <= IV_MAX) {
- U8 low_utf8[UTF8_MAXBYTES+1];
U8 high_utf8[UTF8_MAXBYTES+1];
- (void) uvchr_to_utf8(low_utf8, start[0]);
- (void) uvchr_to_utf8(high_utf8, invlist_highest(cp_list));
+ (void) uvchr_to_utf8(high_utf8, highest_cp);
if (low_utf8[0] == high_utf8[0]) {
+
+ /* No need to convert to I8 for EBCDIC as this is an exact
+ * match */
anyof_flags = low_utf8[0];
op = ANYOFHb;
}
/* 2: embedded, otherwise 1 */
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
else if (k == ANYOF) {
- const U8 flags = (OP(o) == ANYOFHb) ? 0 : ANYOF_FLAGS(o);
+ const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHb)
+ ? 0
+ : ANYOF_FLAGS(o);
bool do_sep = FALSE; /* Do we need to separate various components of
the output? */
/* Set if there is still an unresolved user-defined property */
/* And finally the matching, closing ']' */
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
- if (OP(o) == ANYOFHb) {
- Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x)", FLAGS(o));
+ if (inRANGE(OP(o), ANYOFH, ANYOFHb)) {
+ Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x", FLAGS(o));
+ if (OP(o) == ANYOFH) {
+ /* Not strictly true for 32-bit or EBCDIC, but good
+ * enough */
+ Perl_sv_catpvf(aTHX_ sv, "..\\xff");
+ }
+ Perl_sv_catpvf(aTHX_ sv, ")");
}
case ANYOFH:
if (utf8_target) { /* Can't possibly match a non-UTF-8 target */
REXEC_FBC_CLASS_SCAN(TRUE,
- reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
+ ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
+ && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
}
break;
case ANYOFH:
if ( ! utf8_target
|| NEXTCHR_IS_EOS
+ || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8((U8) *locinput)
|| ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
utf8_target))
{
if (utf8_target) { /* ANYOFH only can match UTF-8 targets */
while ( hardcount < max
&& scan < this_eol
+ && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p)
&& reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
{
scan += UTF8SKIP(scan);
S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
{
dVAR;
- const char flags = (OP(n) == ANYOFHb) ? 0 : ANYOF_FLAGS(n);
+ const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHb))
+ ? 0
+ : ANYOF_FLAGS(n);
bool match = FALSE;
UV c = *p;