This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8.c: Add a const to a function parameter
[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c
index 5fca6f7..ed5b027 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -146,7 +146,7 @@ For details, see the description for L</uvchr_to_utf8_flags>.
 #define MASK    UTF_CONTINUATION_MASK
 
 U8 *
-Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
+Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
 {
     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
 
@@ -2428,13 +2428,13 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
        uvchr_to_utf8(p, c);
        return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
     }
-    else {  /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
+    else {  /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
               the special flags. */
        U8 utf8_c[UTF8_MAXBYTES + 1];
 
       needs_full_generality:
        uvchr_to_utf8(utf8_c, c);
-       return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
+       return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c), p, lenp, flags);
     }
 }
 
@@ -2541,11 +2541,20 @@ S_warn_on_first_deprecated_use(pTHX_ const char * const name,
                Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
             }
 
+            if (instr(file, "mathoms.c")) {
+                Perl_warner(aTHX_ WARN_DEPRECATED,
+                            "In %s, line %d, starting in Perl v5.30, %s()"
+                            " will be removed.  Avoid this message by"
+                            " converting to use %s().\n",
+                            file, line, name, alternative);
+            }
+            else {
                 Perl_warner(aTHX_ WARN_DEPRECATED,
                             "In %s, line %d, starting in Perl v5.30, %s() will"
                             " require an additional parameter.  Avoid this"
                             " message by converting to use %s().\n",
                             file, line, name, alternative);
+            }
         }
     }
 }
@@ -2565,7 +2574,6 @@ Perl__is_utf8_FOO(pTHX_       U8   classnum,
     warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
 
     if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) {
-        SV * invlist;
 
         switch (classnum) {
             case _CC_WORDCHAR:
@@ -2598,14 +2606,18 @@ Perl__is_utf8_FOO(pTHX_       U8   classnum,
                 return is_VERTWS_high(p);
             case _CC_IDFIRST:
                 if (! PL_utf8_perl_idstart) {
-                    invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
+                    PL_utf8_perl_idstart
+                                = _new_invlist_C_array(_Perl_IDStart_invlist);
                 }
-                return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist);
+                return is_utf8_common(p, &PL_utf8_perl_idstart,
+                                      "_Perl_IDStart", NULL);
             case _CC_IDCONT:
                 if (! PL_utf8_perl_idcont) {
-                    invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
+                    PL_utf8_perl_idcont
+                                = _new_invlist_C_array(_Perl_IDCont_invlist);
                 }
-                return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist);
+                return is_utf8_common(p, &PL_utf8_perl_idcont,
+                                      "_Perl_IDCont", NULL);
         }
     }
 
@@ -2718,10 +2730,12 @@ Perl__is_utf8_mark(pTHX_ const U8 *p)
 /*
 =for apidoc to_utf8_case
 
-Instead use the appropriate one of L</toUPPER_utf8>,
-L</toTITLE_utf8>,
-L</toLOWER_utf8>,
-or L</toFOLD_utf8>.
+Instead use the appropriate one of L</toUPPER_utf8_safe>,
+L</toTITLE_utf8_safe>,
+L</toLOWER_utf8_safe>,
+or L</toFOLD_utf8_safe>.
+
+This function will be removed in Perl v5.28.
 
 C<p> contains the pointer to the UTF-8 string encoding
 the character that is being converted.  This routine assumes that the character
@@ -2754,9 +2768,19 @@ UV
 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
                        SV **swashp, const char *normal, const char *special)
 {
+    STRLEN len_cp;
+    UV cp;
+    const U8 * e = p + UTF8SKIP(p);
+
     PERL_ARGS_ASSERT_TO_UTF8_CASE;
 
-    return _to_utf8_case(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, swashp, normal, special);
+    cp = utf8n_to_uvchr(p, e - p, &len_cp, UTF8_CHECK_ONLY);
+    if (len_cp == (STRLEN) -1) {
+        _force_out_malformed_utf8_message(p, e,
+                                   _UTF8_NO_CONFIDENCE_IN_CURLEN, 1 /* Die */ );
+    }
+
+    return _to_utf8_case(cp, p, ustrp, lenp, swashp, normal, special);
 }
 
     /* change namve uv1 to 'from' */
@@ -2989,6 +3013,84 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c
     return original;
 }
 
+STATIC U32
+S_check_and_deprecate(pTHX_ const U8 *p,
+                            const U8 **e,
+                            const unsigned int type,    /* See below */
+                            const bool use_locale,      /* Is this a 'LC_'
+                                                           macro call? */
+                            const char * const file,
+                            const unsigned line)
+{
+    /* This is a temporary function to deprecate the unsafe calls to the case
+     * changing macros and functions.  It keeps all the special stuff in just
+     * one place.
+     *
+     * It updates *e with the pointer to the end of the input string.  If using
+     * the old-style macros, *e is NULL on input, and so this function assumes
+     * the input string is long enough to hold the entire UTF-8 sequence, and
+     * sets *e accordingly, but it then returns a flag to pass the
+     * utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid
+     * using the full length if possible.
+     *
+     * It also does the assert that *e > p when *e is not NULL.  This should be
+     * migrated to the callers when this function gets deleted.
+     *
+     * The 'type' parameter is used for the caller to specify which case
+     * changing function this is called from: */
+
+#       define DEPRECATE_TO_UPPER 0
+#       define DEPRECATE_TO_TITLE 1
+#       define DEPRECATE_TO_LOWER 2
+#       define DEPRECATE_TO_FOLD  3
+
+    U32 utf8n_flags = 0;
+    const char * name;
+    const char * alternative;
+
+    PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE;
+
+    if (*e == NULL) {
+        utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
+        *e = p + UTF8SKIP(p);
+
+        /* For mathoms.c calls, we use the function name we know is stored
+         * there.  It could be part of a larger path */
+        if (type == DEPRECATE_TO_UPPER) {
+            name = instr(file, "mathoms.c")
+                   ? "to_utf8_upper"
+                   : "toUPPER_utf8";
+            alternative = "toUPPER_utf8_safe";
+        }
+        else if (type == DEPRECATE_TO_TITLE) {
+            name = instr(file, "mathoms.c")
+                   ? "to_utf8_title"
+                   : "toTITLE_utf8";
+            alternative = "toTITLE_utf8_safe";
+        }
+        else if (type == DEPRECATE_TO_LOWER) {
+            name = instr(file, "mathoms.c")
+                   ? "to_utf8_lower"
+                   : "toLOWER_utf8";
+            alternative = "toLOWER_utf8_safe";
+        }
+        else if (type == DEPRECATE_TO_FOLD) {
+            name = instr(file, "mathoms.c")
+                   ? "to_utf8_fold"
+                   : "toFOLD_utf8";
+            alternative = "toFOLD_utf8_safe";
+        }
+        else Perl_croak(aTHX_ "panic: Unexpected case change type");
+
+        warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
+    }
+    else {
+        assert (p < *e);
+    }
+
+    return utf8n_flags;
+}
+
 /* The process for changing the case is essentially the same for the four case
  * change types, except there are complications for folding.  Otherwise the
  * difference is only which case to change to.  To make sure that they all do
@@ -3019,6 +3121,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c
  * going on. */
 #define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func,    \
                                L1_func_extra_param)                          \
+                                                                             \
     if (flags & (locale_flags)) {                                            \
         /* Treat a UTF-8 locale as not being in locale at all */             \
         if (IN_UTF8_CTYPE_LOCALE) {                                          \
@@ -3037,7 +3140,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c
             return L1_func(*p, ustrp, lenp, L1_func_extra_param);            \
         }                                                                    \
     }                                                                        \
-    else if UTF8_IS_DOWNGRADEABLE_START(*p) {                                \
+    else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) {                          \
         if (flags & (locale_flags)) {                                        \
             result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p,         \
                                                                  *(p+1)));   \
@@ -3047,8 +3150,13 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c
                            ustrp, lenp,  L1_func_extra_param);               \
         }                                                                    \
     }                                                                        \
-    else {  /* malformed UTF-8 */                                            \
-        result = valid_utf8_to_uvchr(p, NULL);                               \
+    else {  /* malformed UTF-8 or ord above 255 */                           \
+        STRLEN len_result;                                                   \
+        result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY);     \
+        if (len_result == (STRLEN) -1) {                                     \
+            _force_out_malformed_utf8_message(p, e, utf8n_flags,             \
+                                                            1 /* Die */ );   \
+        }
 
 #define CASE_CHANGE_BODY_END(locale_flags, change_macro)                     \
         result = change_macro(result, p, ustrp, lenp);                       \
@@ -3075,7 +3183,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c
 /*
 =for apidoc to_utf8_upper
 
-Instead use L</toUPPER_utf8>.
+Instead use L</toUPPER_utf8_safe>.
 
 =cut */
 
@@ -3084,9 +3192,17 @@ Instead use L</toUPPER_utf8>.
  *         be used. */
 
 UV
-Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
+Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
+                                const U8 *e,
+                                U8* ustrp,
+                                STRLEN *lenp,
+                                bool flags,
+                                const char * const file,
+                                const int line)
 {
     UV result;
+    const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER,
+                                                cBOOL(flags), file, line);
 
     PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
 
@@ -3099,7 +3215,7 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags
 /*
 =for apidoc to_utf8_title
 
-Instead use L</toTITLE_utf8>.
+Instead use L</toTITLE_utf8_safe>.
 
 =cut */
 
@@ -3110,9 +3226,17 @@ Instead use L</toTITLE_utf8>.
  */
 
 UV
-Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
+Perl__to_utf8_title_flags(pTHX_ const U8 *p,
+                                const U8 *e,
+                                U8* ustrp,
+                                STRLEN *lenp,
+                                bool flags,
+                                const char * const file,
+                                const int line)
 {
     UV result;
+    const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE,
+                                                cBOOL(flags), file, line);
 
     PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
 
@@ -3124,7 +3248,7 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags
 /*
 =for apidoc to_utf8_lower
 
-Instead use L</toLOWER_utf8>.
+Instead use L</toLOWER_utf8_safe>.
 
 =cut */
 
@@ -3134,9 +3258,17 @@ Instead use L</toLOWER_utf8>.
  */
 
 UV
-Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
+Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
+                                const U8 *e,
+                                U8* ustrp,
+                                STRLEN *lenp,
+                                bool flags,
+                                const char * const file,
+                                const int line)
 {
     UV result;
+    const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER,
+                                                cBOOL(flags), file, line);
 
     PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
 
@@ -3147,7 +3279,7 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags
 /*
 =for apidoc to_utf8_fold
 
-Instead use L</toFOLD_utf8>.
+Instead use L</toFOLD_utf8_safe>.
 
 =cut */
 
@@ -3162,9 +3294,17 @@ Instead use L</toFOLD_utf8>.
  */
 
 UV
-Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
+Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
+                               const U8 *e,
+                               U8* ustrp,
+                               STRLEN *lenp,
+                               U8 flags,
+                               const char * const file,
+                               const int line)
 {
     UV result;
+    const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD,
+                                                cBOOL(flags), file, line);
 
     PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
 
@@ -4626,7 +4766,6 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
             invlist = _new_invlist(0);
         }
         else {
-            while (isSPACE(*l)) l++;
             l = (U8 *) after_atou;
 
             /* Get the 0th element, which is needed to setup the inversion list */
@@ -5151,7 +5290,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
                     *foldbuf1 = toFOLD(*p1);
                 }
                 else if (u1) {
-                    _to_utf8_fold_flags(p1, foldbuf1, &n1, flags_for_folder);
+                    _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder);
                 }
                 else {  /* Not UTF-8, get UTF-8 fold */
                     _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
@@ -5175,7 +5314,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
                     *foldbuf2 = toFOLD(*p2);
                 }
                 else if (u2) {
-                    _to_utf8_fold_flags(p2, foldbuf2, &n2, flags_for_folder);
+                    _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder);
                 }
                 else {
                     _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);