#define PL_Dir (vTHX->IDir)
#define PL_Env (vTHX->IEnv)
#define PL_HasMultiCharFold (vTHX->IHasMultiCharFold)
+#define PL_InBitmap (vTHX->IInBitmap)
#define PL_LIO (vTHX->ILIO)
#define PL_Latin1 (vTHX->ILatin1)
#define PL_Mem (vTHX->IMem)
PERLVAR(I, Latin1, SV *)
PERLVAR(I, UpperLatin1, SV *) /* Code points 128 - 255 */
PERLVAR(I, AboveLatin1, SV *)
+PERLVAR(I, InBitmap, SV *)
PERLVAR(I, NonL1NonFinalFold, SV *)
PERLVAR(I, HasMultiCharFold, SV *)
SvREFCNT_dec(PL_utf8_foldable);
SvREFCNT_dec(PL_utf8_foldclosures);
SvREFCNT_dec(PL_AboveLatin1);
+ SvREFCNT_dec(PL_InBitmap);
SvREFCNT_dec(PL_UpperLatin1);
SvREFCNT_dec(PL_Latin1);
SvREFCNT_dec(PL_NonL1NonFinalFold);
PL_utf8_idcont = NULL;
PL_utf8_foldclosures = NULL;
PL_AboveLatin1 = NULL;
+ PL_InBitmap = NULL;
PL_HasMultiCharFold = NULL;
PL_Latin1 = NULL;
PL_NonL1NonFinalFold = NULL;
/* Similarly for these */
if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
- invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
+ _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
}
if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
PL_HasMultiCharFold =
_new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
+
+ /* This is calculated here, because the Perl program that generates the
+ * static global ones doesn't currently have access to
+ * NUM_ANYOF_CODE_POINTS */
+ PL_InBitmap = _new_invlist(2);
+ PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
+ NUM_ANYOF_CODE_POINTS - 1);
}
#endif
UV high;
int i;
- if (end == UV_MAX && start <= 256) {
+ if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
}
- else if (end >= 256) {
+ else if (end >= NUM_ANYOF_CODE_POINTS) {
ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
}
* *invlist_ptr; similarly for code points above the bitmap if we have
* a flag to match all of them anyways */
if (change_invlist) {
- _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
+ _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
}
if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
- _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
+ _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
}
/* If have completely emptied it, remove it completely */
/* Add everything remaining to the list, so when we invert it just
* below, it will be excluded */
- *invlist_ptr = _add_range_to_invlist(*invlist_ptr,
- NUM_ANYOF_CODE_POINTS, UV_MAX);
+ _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
_invlist_invert(*invlist_ptr);
}
U16 arg2;
};
-#define NUM_ANYOF_CODE_POINTS 256
+/* This gives the number of code points that can be in the bitmap of an ANYOF
+ * node. The shift number must currently be one of: 8..12. It can't be less
+ * than 8 (256) because some code relies on it being at least that. Above 12
+ * (4096), and you start running into warnings that some data structure widths
+ * have been exceeded, though the test suite as of this writing still passes
+ * for up through 16, which is as high as anyone would ever want to go,
+ * encompassing all of the Unicode BMP, and thus including all the economically
+ * important world scripts. At 12 most of them are included: Arabic,
+ * Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han,
+ * Japanese, nor Korean. (The regarglen structure in regnodes.h is a U8, and
+ * the trie types TRIEC and AHOCORASICKC are larger than U8 for shift values
+ * above 12.) Be sure to benchmark before changing, as larger sizes do
+ * significantly slow down the test suite */
+#define NUM_ANYOF_CODE_POINTS (1 << 8)
#define ANYOF_BITMAP_SIZE (NUM_ANYOF_CODE_POINTS / 8) /* 8 bits/Byte */
}
else if (flags & ANYOF_LOCALE_FLAGS) {
if ((flags & ANYOF_LOC_FOLD)
+ && c < 256
&& ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
{
match = TRUE;
}
- else if (ANYOF_POSIXL_TEST_ANY_SET(n)) {
+ else if (ANYOF_POSIXL_TEST_ANY_SET(n)
+ && c < 256
+ ) {
/* The data structure is arranged so bits 0, 2, 4, ... are set
* if the class includes the Posix character class given by
PL_Latin1 = sv_dup_inc(proto_perl->ILatin1, param);
PL_UpperLatin1 = sv_dup_inc(proto_perl->IUpperLatin1, param);
PL_AboveLatin1 = sv_dup_inc(proto_perl->IAboveLatin1, param);
+ PL_InBitmap = sv_dup_inc(proto_perl->IInBitmap, param);
PL_NonL1NonFinalFold = sv_dup_inc(proto_perl->INonL1NonFinalFold, param);
PL_HasMultiCharFold = sv_dup_inc(proto_perl->IHasMultiCharFold, param);