X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/08bc774e50070edc2d51d5c5ad3fb50bf97f2361..62e4c90a271e4c9a7e8d172f3d36399885df56bc:/utf8.c diff --git a/utf8.c b/utf8.c index 2eb673e..dd103cd 100644 --- a/utf8.c +++ b/utf8.c @@ -1957,8 +1957,10 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, * validating routine */ if (!is_utf8_char_buf(p, p + UTF8SKIP(p))) return FALSE; - if (!*swash) - *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0); + if (!*swash) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + *swash = _core_swash_init("utf8", swashname, &PL_sv_undef, 1, 0, NULL, &flags); + } return swash_fetch(*swash, p, TRUE) != 0; } @@ -2208,13 +2210,13 @@ Perl_is_utf8_mark(pTHX_ const U8 *p) } bool -Perl_is_utf8_X_begin(pTHX_ const U8 *p) +Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p) { dVAR; - PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN; + PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN; - return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin"); + return is_utf8_common(p, &PL_utf8_X_regular_begin, "_X_Regular_Begin"); } bool @@ -2230,21 +2232,51 @@ Perl_is_utf8_X_extend(pTHX_ const U8 *p) bool Perl_is_utf8_X_prepend(pTHX_ const U8 *p) { + /* If no code points in the Unicode version being worked on match + * GCB=Prepend, this will set PL_utf8_X_prepend to &PL_sv_undef during its + * first call. Otherwise, it will set it to a swash created for it. + * swash_fetch() hence can't be used without checking first if it is valid + * to do so. */ + dVAR; + bool initialized = cBOOL(PL_utf8_X_prepend); + bool ret; PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND; - return is_utf8_common(p, &PL_utf8_X_prepend, "_X_GCB_Prepend"); + if (PL_utf8_X_prepend == &PL_sv_undef) { + return FALSE; + } + + if ((ret = is_utf8_common(p, &PL_utf8_X_prepend, "_X_GCB_Prepend")) + || initialized) + { + return ret; + } + + /* Here the code point being checked was not a prepend, and we hadn't + * initialized PL_utf8_X_prepend, so we don't know if it is just this + * particular input code point that didn't match, or if the table is + * completely empty. The is_utf8_common() call did the initialization, so + * we can inspect the swash's inversion list to find out. If there are no + * elements in its inversion list, it's empty, and nothing will ever match, + * so set things up so we can skip the check in future calls. */ + if (_invlist_len(_get_swash_invlist(PL_utf8_X_prepend)) == 0) { + SvREFCNT_dec(PL_utf8_X_prepend); + PL_utf8_X_prepend = &PL_sv_undef; + } + + return FALSE; } bool -Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p) +Perl_is_utf8_X_special_begin(pTHX_ const U8 *p) { dVAR; - PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL; + PERL_ARGS_ASSERT_IS_UTF8_X_SPECIAL_BEGIN; - return is_utf8_common(p, &PL_utf8_X_non_hangul, "_X_HST_Not_Applicable"); + return is_utf8_common(p, &PL_utf8_X_special_begin, "_X_Special_Begin"); } bool @@ -2257,6 +2289,16 @@ Perl_is_utf8_X_L(pTHX_ const U8 *p) return is_utf8_common(p, &PL_utf8_X_L, "_X_GCB_L"); } +bool +Perl_is_utf8_X_RI(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_RI; + + return is_utf8_common(p, &PL_utf8_X_RI, "_X_RI"); +} + /* These constants are for finding GCB=LV and GCB=LVT. These are for the * pre-composed Hangul syllables, which are all in a contiguous block and * arranged there in such a way so as to facilitate alorithmic determination of @@ -2433,7 +2475,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, uvuni_to_utf8(tmpbuf, uv1); if (!*swashp) /* load on-demand */ - *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0); + *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL); if (special) { /* It might be "special" (sometimes, but not always, @@ -2922,7 +2964,11 @@ SV* Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p) { /* Initialize and return a swash, creating it if necessary. It does this - * by calling utf8_heavy.pl in the general case. + * by calling utf8_heavy.pl in the general case. The returned value may be + * the swash's inversion list instead if the input parameters allow it. + * Which is returned should be immaterial to callers, as the only + * operations permitted on a swash, swash_fetch() and + * _get_swash_invlist(), handle both these transparently. * * This interface should only be used by functions that won't destroy or * adversely change the swash, as doing so affects all other uses of the @@ -2947,6 +2993,10 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m * came from a user-defined property. (I O) * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking * when the swash cannot be located, to simply return NULL. (I) + * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a + * return of an inversion list instead of a swash hash if this routine + * thinks that would result in faster execution of swash_fetch() later + * on. (I) * * Thus there are three possible inputs to find the swash: , * , and . At least one must be specified. The result @@ -2958,6 +3008,11 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m dVAR; SV* retval = &PL_sv_undef; HV* swash_hv = NULL; + const int invlist_swash_boundary = + (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST) + ? 512 /* Based on some benchmarking, but not extensive, see commit + message */ + : -1; /* Never return just an inversion list */ assert(listsv != &PL_sv_undef || strNE(name, "") || invlist); assert(! invlist || minbits == 1); @@ -3093,9 +3148,12 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m } else { - /* Here, there is no swash already. Set up a minimal one */ - swash_hv = newHV(); - retval = newRV_inc(MUTABLE_SV(swash_hv)); + /* Here, there is no swash already. Set up a minimal one, if + * we are going to return a swash */ + if ((int) _invlist_len(invlist) > invlist_swash_boundary) { + swash_hv = newHV(); + retval = newRV_inc(MUTABLE_SV(swash_hv)); + } swash_invlist = invlist; } } @@ -3103,12 +3161,19 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m /* Here, we have computed the union of all the passed-in data. It may * be that there was an inversion list in the swash which didn't get * touched; otherwise save the one computed one */ - if (! invlist_in_swash_is_valid) { + if (! invlist_in_swash_is_valid + && (int) _invlist_len(swash_invlist) > invlist_swash_boundary) + { if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist)) { Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); } } + + if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) { + SvREFCNT_dec(retval); + retval = newRV_inc(swash_invlist); + } } return retval; @@ -3174,6 +3239,15 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) PERL_ARGS_ASSERT_SWASH_FETCH; + /* If it really isn't a hash, it isn't really swash; must be an inversion + * list */ + if (SvTYPE(hv) != SVt_PVHV) { + return _invlist_contains_cp((SV*)hv, + (do_utf8) + ? valid_utf8_to_uvchr(ptr, NULL) + : c); + } + /* Convert to utf8 if not already */ if (!do_utf8 && !UNI_IS_INVARIANT(c)) { tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c); @@ -4147,12 +4221,17 @@ Perl__get_swash_invlist(pTHX_ SV* const swash) PERL_ARGS_ASSERT__GET_SWASH_INVLIST; - if (! SvROK(swash) || SvTYPE(SvRV(swash)) != SVt_PVHV) { + if (! SvROK(swash)) { return NULL; } - ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE); + /* If it really isn't a hash, it isn't really swash; must be an inversion + * list */ + if (SvTYPE(SvRV(swash)) != SVt_PVHV) { + return SvRV(swash); + } + ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE); if (! ptr) { return NULL; }