uv &= UTF_START_MASK(expectlen);
/* Now, loop through the remaining bytes, accumulating each into the
- * working total as we go */
+ * working total as we go. (I khw tried unrolling the loop for up to 4
+ * bytes, but there was no performance improvement) */
for (++s; s < send; s++) {
uv = UTF8_ACCUMULATE(uv, *s);
}
* validating routine */
if (!is_utf8_char_buf(p, p + UTF8SKIP(p)))
return FALSE;
- if (!*swash)
- *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
+ if (!*swash) {
+ U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+ *swash = _core_swash_init("utf8", swashname, &PL_sv_undef, 1, 0, NULL, &flags);
+ }
return swash_fetch(*swash, p, TRUE) != 0;
}
}
bool
-Perl_is_utf8_X_begin(pTHX_ const U8 *p)
+Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p)
{
dVAR;
- PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
+ PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN;
- return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
+ return is_utf8_common(p, &PL_utf8_X_regular_begin, "_X_Regular_Begin");
}
bool
bool
Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
{
+ /* If no code points in the Unicode version being worked on match
+ * GCB=Prepend, this will set PL_utf8_X_prepend to &PL_sv_undef during its
+ * first call. Otherwise, it will set it to a swash created for it.
+ * swash_fetch() hence can't be used without checking first if it is valid
+ * to do so. */
+
dVAR;
+ bool initialized = cBOOL(PL_utf8_X_prepend);
+ bool ret;
PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
- return is_utf8_common(p, &PL_utf8_X_prepend, "_X_GCB_Prepend");
+ if (PL_utf8_X_prepend == &PL_sv_undef) {
+ return FALSE;
+ }
+
+ if ((ret = is_utf8_common(p, &PL_utf8_X_prepend, "_X_GCB_Prepend"))
+ || initialized)
+ {
+ return ret;
+ }
+
+ /* Here the code point being checked was not a prepend, and we hadn't
+ * initialized PL_utf8_X_prepend, so we don't know if it is just this
+ * particular input code point that didn't match, or if the table is
+ * completely empty. The is_utf8_common() call did the initialization, so
+ * we can inspect the swash's inversion list to find out. If there are no
+ * elements in its inversion list, it's empty, and nothing will ever match,
+ * so set things up so we can skip the check in future calls. */
+ if (_invlist_len(_get_swash_invlist(PL_utf8_X_prepend)) == 0) {
+ SvREFCNT_dec(PL_utf8_X_prepend);
+ PL_utf8_X_prepend = &PL_sv_undef;
+ }
+
+ return FALSE;
}
bool
-Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
+Perl_is_utf8_X_special_begin(pTHX_ const U8 *p)
{
dVAR;
- PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
+ PERL_ARGS_ASSERT_IS_UTF8_X_SPECIAL_BEGIN;
- return is_utf8_common(p, &PL_utf8_X_non_hangul, "_X_HST_Not_Applicable");
+ return is_utf8_common(p, &PL_utf8_X_special_begin, "_X_Special_Begin");
}
bool
return is_utf8_common(p, &PL_utf8_X_L, "_X_GCB_L");
}
+bool
+Perl_is_utf8_X_RI(pTHX_ const U8 *p)
+{
+ dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_X_RI;
+
+ return is_utf8_common(p, &PL_utf8_X_RI, "_X_RI");
+}
+
/* These constants are for finding GCB=LV and GCB=LVT. These are for the
* pre-composed Hangul syllables, which are all in a contiguous block and
* arranged there in such a way so as to facilitate alorithmic determination of
uvuni_to_utf8(tmpbuf, uv1);
if (!*swashp) /* load on-demand */
- *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
+ *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
if (special) {
/* It might be "special" (sometimes, but not always,
Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
{
/* Initialize and return a swash, creating it if necessary. It does this
- * by calling utf8_heavy.pl in the general case.
+ * by calling utf8_heavy.pl in the general case. The returned value may be
+ * the swash's inversion list instead if the input parameters allow it.
+ * Which is returned should be immaterial to callers, as the only
+ * operations permitted on a swash, swash_fetch() and
+ * _get_swash_invlist(), handle both these transparently.
*
* This interface should only be used by functions that won't destroy or
* adversely change the swash, as doing so affects all other uses of the
* came from a user-defined property. (I O)
* _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
* when the swash cannot be located, to simply return NULL. (I)
+ * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
+ * return of an inversion list instead of a swash hash if this routine
+ * thinks that would result in faster execution of swash_fetch() later
+ * on. (I)
*
* Thus there are three possible inputs to find the swash: <name>,
* <listsv>, and <invlist>. At least one must be specified. The result
dVAR;
SV* retval = &PL_sv_undef;
HV* swash_hv = NULL;
+ const int invlist_swash_boundary =
+ (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
+ ? 512 /* Based on some benchmarking, but not extensive, see commit
+ message */
+ : -1; /* Never return just an inversion list */
assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
assert(! invlist || minbits == 1);
* inversion list, or create one for it */
if (swash_hv) {
- swash_invlistsvp = hv_fetchs(swash_hv, "I", FALSE);
+ swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
if (swash_invlistsvp) {
swash_invlist = *swash_invlistsvp;
invlist_in_swash_is_valid = TRUE;
}
else {
- /* Here, there is no swash already. Set up a minimal one */
- swash_hv = newHV();
- retval = newRV_inc(MUTABLE_SV(swash_hv));
+ /* Here, there is no swash already. Set up a minimal one, if
+ * we are going to return a swash */
+ if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
+ swash_hv = newHV();
+ retval = newRV_inc(MUTABLE_SV(swash_hv));
+ }
swash_invlist = invlist;
}
}
/* Here, we have computed the union of all the passed-in data. It may
* be that there was an inversion list in the swash which didn't get
* touched; otherwise save the one computed one */
- if (! invlist_in_swash_is_valid) {
- if (! hv_stores(MUTABLE_HV(SvRV(retval)), "INVLIST", swash_invlist))
+ if (! invlist_in_swash_is_valid
+ && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
+ {
+ if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
{
Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
}
}
+
+ if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
+ SvREFCNT_dec(retval);
+ retval = newRV_inc(swash_invlist);
+ }
}
return retval;
PERL_ARGS_ASSERT_SWASH_FETCH;
+ /* If it really isn't a hash, it isn't really swash; must be an inversion
+ * list */
+ if (SvTYPE(hv) != SVt_PVHV) {
+ return _invlist_contains_cp((SV*)hv,
+ (do_utf8)
+ ? valid_utf8_to_uvchr(ptr, NULL)
+ : c);
+ }
+
/* Convert to utf8 if not already */
if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
U8 *l, *lend, *x, *xend, *s, *send;
STRLEN lcur, xcur, scur;
HV *const hv = MUTABLE_HV(SvRV(swash));
- SV** const invlistsvp = hv_fetchs(hv, "INVLIST", FALSE);
+ SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
SV** listsvp = NULL; /* The string containing the main body of the table */
SV** extssvp = NULL;
PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
- if (! SvROK(swash) || SvTYPE(SvRV(swash)) != SVt_PVHV) {
+ if (! SvROK(swash)) {
return NULL;
}
- ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "INVLIST", FALSE);
+ /* If it really isn't a hash, it isn't really swash; must be an inversion
+ * list */
+ if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
+ return SvRV(swash);
+ }
+ ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
if (! ptr) {
return NULL;
}