X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/08bc774e50070edc2d51d5c5ad3fb50bf97f2361..0c415a7950ced3bdd13d9361e7154695c677851b:/utf8.c diff --git a/utf8.c b/utf8.c index 2eb673e..184e429 100644 --- a/utf8.c +++ b/utf8.c @@ -507,7 +507,13 @@ determinable reasonable value. The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other flags) malformation is found. If this flag is set, the routine assumes that the caller will raise a warning, and this function will silently just set -C to C<-1> and return zero. +C to C<-1> (cast to C) and return zero. + +Note that this API requires disambiguation between successful decoding a NUL +character, and an error return (unless the UTF8_CHECK_ONLY flag is set), as +in both cases, 0 is returned. To disambiguate, upon a zero return, see if the +first byte of C is 0 as well. If so, the input was a NUL; if not, the input +had an error. Certain code points are considered problematic. These are Unicode surrogates, Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF. @@ -1096,10 +1102,7 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e) if (e < s) goto warn_and_return; while (s < e) { - if (!UTF8_IS_INVARIANT(*s)) - s += UTF8SKIP(s); - else - s++; + s += UTF8SKIP(s); len++; } @@ -1957,8 +1960,10 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, * validating routine */ if (!is_utf8_char_buf(p, p + UTF8SKIP(p))) return FALSE; - if (!*swash) - *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0); + if (!*swash) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + *swash = _core_swash_init("utf8", swashname, &PL_sv_undef, 1, 0, NULL, &flags); + } return swash_fetch(*swash, p, TRUE) != 0; } @@ -2208,13 +2213,13 @@ Perl_is_utf8_mark(pTHX_ const U8 *p) } bool -Perl_is_utf8_X_begin(pTHX_ const U8 *p) +Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p) { dVAR; - PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN; + PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN; - return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin"); + return is_utf8_common(p, &PL_utf8_X_regular_begin, "_X_Regular_Begin"); } bool @@ -2227,146 +2232,6 @@ Perl_is_utf8_X_extend(pTHX_ const U8 *p) return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend"); } -bool -Perl_is_utf8_X_prepend(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND; - - return is_utf8_common(p, &PL_utf8_X_prepend, "_X_GCB_Prepend"); -} - -bool -Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL; - - return is_utf8_common(p, &PL_utf8_X_non_hangul, "_X_HST_Not_Applicable"); -} - -bool -Perl_is_utf8_X_L(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_X_L; - - return is_utf8_common(p, &PL_utf8_X_L, "_X_GCB_L"); -} - -/* These constants are for finding GCB=LV and GCB=LVT. These are for the - * pre-composed Hangul syllables, which are all in a contiguous block and - * arranged there in such a way so as to facilitate alorithmic determination of - * their characteristics. As such, they don't need a swash, but can be - * determined by simple arithmetic. Almost all are GCB=LVT, but every 28th one - * is a GCB=LV */ -#define SBASE 0xAC00 /* Start of block */ -#define SCount 11172 /* Length of block */ -#define TCount 28 - -#if 0 /* This routine is not currently used */ -bool -Perl_is_utf8_X_LV(pTHX_ const U8 *p) -{ - /* Unlike most other similarly named routines here, this does not create a - * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */ - - dVAR; - - UV cp = valid_utf8_to_uvchr(p, NULL); - - PERL_ARGS_ASSERT_IS_UTF8_X_LV; - - /* The earliest Unicode releases did not have these precomposed Hangul - * syllables. Set to point to undef in that case, so will return false on - * every call */ - if (! PL_utf8_X_LV) { /* Set up if this is the first time called */ - PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0); - if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) { - SvREFCNT_dec(PL_utf8_X_LV); - PL_utf8_X_LV = &PL_sv_undef; - } - } - - return (PL_utf8_X_LV != &PL_sv_undef - && cp >= SBASE && cp < SBASE + SCount - && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */ -} -#endif - -bool -Perl_is_utf8_X_LVT(pTHX_ const U8 *p) -{ - /* Unlike most other similarly named routines here, this does not create a - * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */ - - dVAR; - - UV cp = valid_utf8_to_uvchr(p, NULL); - - PERL_ARGS_ASSERT_IS_UTF8_X_LVT; - - /* The earliest Unicode releases did not have these precomposed Hangul - * syllables. Set to point to undef in that case, so will return false on - * every call */ - if (! PL_utf8_X_LVT) { /* Set up if this is the first time called */ - PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0); - if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) { - SvREFCNT_dec(PL_utf8_X_LVT); - PL_utf8_X_LVT = &PL_sv_undef; - } - } - - return (PL_utf8_X_LVT != &PL_sv_undef - && cp >= SBASE && cp < SBASE + SCount - && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */ -} - -bool -Perl_is_utf8_X_T(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_X_T; - - return is_utf8_common(p, &PL_utf8_X_T, "_X_GCB_T"); -} - -bool -Perl_is_utf8_X_V(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_X_V; - - return is_utf8_common(p, &PL_utf8_X_V, "_X_GCB_V"); -} - -bool -Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V; - - return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V"); -} - -bool -Perl__is_utf8_quotemeta(pTHX_ const U8 *p) -{ - /* For exclusive use of pp_quotemeta() */ - - dVAR; - - PERL_ARGS_ASSERT__IS_UTF8_QUOTEMETA; - - return is_utf8_common(p, &PL_utf8_quotemeta, "_Perl_Quotemeta"); -} - /* =for apidoc to_utf8_case @@ -2433,7 +2298,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, uvuni_to_utf8(tmpbuf, uv1); if (!*swashp) /* load on-demand */ - *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0); + *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL); if (special) { /* It might be "special" (sometimes, but not always, @@ -2922,7 +2787,11 @@ SV* Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p) { /* Initialize and return a swash, creating it if necessary. It does this - * by calling utf8_heavy.pl in the general case. + * by calling utf8_heavy.pl in the general case. The returned value may be + * the swash's inversion list instead if the input parameters allow it. + * Which is returned should be immaterial to callers, as the only + * operations permitted on a swash, swash_fetch() and + * _get_swash_invlist(), handle both these transparently. * * This interface should only be used by functions that won't destroy or * adversely change the swash, as doing so affects all other uses of the @@ -2947,6 +2816,10 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m * came from a user-defined property. (I O) * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking * when the swash cannot be located, to simply return NULL. (I) + * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a + * return of an inversion list instead of a swash hash if this routine + * thinks that would result in faster execution of swash_fetch() later + * on. (I) * * Thus there are three possible inputs to find the swash: , * , and . At least one must be specified. The result @@ -2958,6 +2831,11 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m dVAR; SV* retval = &PL_sv_undef; HV* swash_hv = NULL; + const int invlist_swash_boundary = + (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST) + ? 512 /* Based on some benchmarking, but not extensive, see commit + message */ + : -1; /* Never return just an inversion list */ assert(listsv != &PL_sv_undef || strNE(name, "") || invlist); assert(! invlist || minbits == 1); @@ -2978,25 +2856,31 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m ENTER; SAVEHINTS(); save_re_context(); + /* We might get here via a subroutine signature which uses a utf8 + * parameter name, at which point PL_subname will have been set + * but not yet used. */ + save_item(PL_subname); if (PL_parser && PL_parser->error_count) SAVEI8(PL_parser->error_count), PL_parser->error_count = 0; method = gv_fetchmeth(stash, "SWASHNEW", 8, -1); if (!method) { /* demand load utf8 */ ENTER; errsv_save = newSVsv(ERRSV); + SAVEFREESV(errsv_save); /* It is assumed that callers of this routine are not passing in * any user derived data. */ /* Need to do this after save_re_context() as it will set * PL_tainted to 1 while saving $1 etc (see the code after getrx: * in Perl_magic_get). Even line to create errsv_save can turn on * PL_tainted. */ - SAVEBOOL(PL_tainted); - PL_tainted = 0; +#ifndef NO_TAINT_SUPPORT + SAVEBOOL(TAINT_get); + TAINT_NOT; +#endif Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len), NULL); if (!SvTRUE(ERRSV)) sv_setsv(ERRSV, errsv_save); - SvREFCNT_dec(errsv_save); LEAVE; } SPAGAIN; @@ -3009,6 +2893,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m mPUSHi(none); PUTBACK; errsv_save = newSVsv(ERRSV); + SAVEFREESV(errsv_save); /* If we already have a pointer to the method, no need to use * call_method() to repeat the lookup. */ if (method ? call_sv(MUTABLE_SV(method), G_SCALAR) @@ -3019,7 +2904,6 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m } if (!SvTRUE(ERRSV)) sv_setsv(ERRSV, errsv_save); - SvREFCNT_dec(errsv_save); LEAVE; POPSTACK; if (IN_PERL_COMPILETIME) { @@ -3093,9 +2977,12 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m } else { - /* Here, there is no swash already. Set up a minimal one */ - swash_hv = newHV(); - retval = newRV_inc(MUTABLE_SV(swash_hv)); + /* Here, there is no swash already. Set up a minimal one, if + * we are going to return a swash */ + if ((int) _invlist_len(invlist) > invlist_swash_boundary) { + swash_hv = newHV(); + retval = newRV_inc(MUTABLE_SV(swash_hv)); + } swash_invlist = invlist; } } @@ -3103,12 +2990,19 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m /* Here, we have computed the union of all the passed-in data. It may * be that there was an inversion list in the swash which didn't get * touched; otherwise save the one computed one */ - if (! invlist_in_swash_is_valid) { + if (! invlist_in_swash_is_valid + && (int) _invlist_len(swash_invlist) > invlist_swash_boundary) + { if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist)) { Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); } } + + if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) { + SvREFCNT_dec(retval); + retval = newRV_inc(swash_invlist); + } } return retval; @@ -3174,6 +3068,15 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) PERL_ARGS_ASSERT_SWASH_FETCH; + /* If it really isn't a hash, it isn't really swash; must be an inversion + * list */ + if (SvTYPE(hv) != SVt_PVHV) { + return _invlist_contains_cp((SV*)hv, + (do_utf8) + ? valid_utf8_to_uvchr(ptr, NULL) + : c); + } + /* Convert to utf8 if not already */ if (!do_utf8 && !UNI_IS_INVARIANT(c)) { tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c); @@ -3711,7 +3614,7 @@ HV* Perl__swash_inversion_hash(pTHX_ SV* const swash) { - /* Subject to change or removal. For use only in one place in regcomp.c. + /* Subject to change or removal. For use only in regcomp.c and regexec.c * Can't be used on a property that is subject to user override, as it * relies on the value of SPECIALS in the swash which would be set by * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set @@ -4147,12 +4050,17 @@ Perl__get_swash_invlist(pTHX_ SV* const swash) PERL_ARGS_ASSERT__GET_SWASH_INVLIST; - if (! SvROK(swash) || SvTYPE(SvRV(swash)) != SVt_PVHV) { + if (! SvROK(swash)) { return NULL; } - ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE); + /* If it really isn't a hash, it isn't really swash; must be an inversion + * list */ + if (SvTYPE(SvRV(swash)) != SVt_PVHV) { + return SvRV(swash); + } + ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE); if (! ptr) { return NULL; } @@ -4394,9 +4302,11 @@ scanning won't continue past that goal. Correspondingly for C with respect C. If C is non-NULL and the pointer it points to is not NULL, that pointer is -considered an end pointer beyond which scanning of C will not continue under -any circumstances. This means that if both C and C are specified, and -C +considered an end pointer to the position 1 byte past the maximum point +in C beyond which scanning will not continue under any circumstances. +(This routine assumes that UTF-8 encoded input strings are not malformed; +malformed input can cause it to read past C). +This means that if both C and C are specified, and C is less than C+C, the match will never be successful because it can never get as far as its goal (and in fact is asserted against). Correspondingly for