This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Use fnc to force out malformed warnings
authorKarl Williamson <khw@cpan.org>
Fri, 9 Dec 2016 15:45:18 +0000 (08:45 -0700)
committerKarl Williamson <khw@cpan.org>
Fri, 23 Dec 2016 20:21:31 +0000 (13:21 -0700)
The previous commit added a function to do this task.  This current
commit changes the several places in the core that have here-to-fore
done this in an ad-hoc (and not as reliable) manner to use the new
function.

A couple of messages in toke.c are left in so as to avoid changing
diagnostics unnecessarily.  If those messages had been created in the
project after the enhanced malformation warnings were created, they
would have been phrased differently.

The reason some of the methods weren't so reliable, is they relied on
fatalizing the warnng message.  However if warnings are turned off, it
never gets to the point of outputting, hence doesn't necessarily die.

regexec.c
t/op/lex.t
t/uni/parser.t
toke.c

index f6f293d..e9c74e6 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -9225,10 +9225,14 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
         STRLEN c_len = 0;
      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
         STRLEN c_len = 0;
-       c = utf8n_to_uvchr(p, p_end - p, &c_len, (  UTF8_ALLOW_DEFAULT
-                                                  | UTF8_CHECK_ONLY));
-       if (c_len == (STRLEN)-1)
-           Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
+        const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
+       c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
+       if (c_len == (STRLEN)-1) {
+            _force_out_malformed_utf8_message(p, p_end,
+                                              utf8n_flags,
+                                              1 /* 1 means die */ );
+            NOT_REACHED; /* NOTREACHED */
+        }
         if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) {
             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
         }
         if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) {
             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
         }
index bd6bb0f..e50f0eb 100644 (file)
@@ -254,8 +254,9 @@ SKIP:
       or skip "These tests won't work on EBCIDIC", 3;
     fresh_perl_is(
         "BEGIN{\$^H=hex ~0}\xF3",
       or skip "These tests won't work on EBCIDIC", 3;
     fresh_perl_is(
         "BEGIN{\$^H=hex ~0}\xF3",
-        "Integer overflow in hexadecimal number at - line 1.\n" .
-        "Malformed UTF-8 character: \\xf3 (too short; 1 byte available, need 4) at - line 1.",
+        "Integer overflow in hexadecimal number at - line 1.\n"
+      . "Malformed UTF-8 character: \\xf3 (too short; 1 byte available, need 4) at - line 1.\n"
+      . "Malformed UTF-8 character (fatal) at - line 1.",
         {},
         '[perl #128996] - use of PL_op after op is freed'
     );
         {},
         '[perl #128996] - use of PL_op after op is freed'
     );
@@ -267,7 +268,7 @@ SKIP:
     );
     fresh_perl_like(
         qq(BEGIN{\$^H=0x800000}\n   0m 0\xB5\xB500\xB5\0),
     );
     fresh_perl_like(
         qq(BEGIN{\$^H=0x800000}\n   0m 0\xB5\xB500\xB5\0),
-        qr/Unrecognized character \\x\{0\}; marked by <-- HERE after    0m.*<-- HERE near column 12 at - line 2./,
+        qr/Malformed UTF-8 character: \\xb5 \(unexpected continuation byte 0xb5, with no preceding start byte\)/,
         {},
         '[perl #129000] read before buffer'
     );
         {},
         '[perl #129000] read before buffer'
     );
index 6c524b2..624fdd0 100644 (file)
@@ -191,6 +191,8 @@ like( $@, qr/Bad name after Foo'/, 'Bad name after Foo\'' );
 
 {
     no warnings 'utf8';
 
 {
     no warnings 'utf8';
+    local $SIG{__WARN__} = sub { }; # The eval will also output a warning,
+                                    # which we ignore
     my $malformed_to_be = ($::IS_EBCDIC)   # Overlong sequence
                            ? "\x{74}\x{41}"
                            : "\x{c0}\x{a0}";
     my $malformed_to_be = ($::IS_EBCDIC)   # Overlong sequence
                            ? "\x{74}\x{41}"
                            : "\x{c0}\x{a0}";
diff --git a/toke.c b/toke.c
index f0a7dbc..e0a6376 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -1032,13 +1032,11 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
                } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
                    p++;
                    highhalf++;
                } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
                    p++;
                    highhalf++;
-               } else if (! UTF8_IS_INVARIANT(c)) {
-                   /* malformed UTF-8 */
-                   ENTER;
-                   SAVESPTR(PL_warnhook);
-                   PL_warnhook = PERL_WARNHOOK_FATAL;
-                   utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
-                   LEAVE;
+                } else if (! UTF8_IS_INVARIANT(c)) {
+                    _force_out_malformed_utf8_message((U8 *) p, (U8 *) e,
+                                                      0,
+                                                      1 /* 1 means die */ );
+                    NOT_REACHED; /* NOTREACHED */
                }
            }
            if (!highhalf)
                }
            }
            if (!highhalf)
@@ -1428,12 +1426,11 @@ Perl_lex_peek_unichar(pTHX_ U32 flags)
        }
        unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
        if (retlen == (STRLEN)-1) {
        }
        unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
        if (retlen == (STRLEN)-1) {
-           /* malformed UTF-8 */
-           ENTER;
-           SAVESPTR(PL_warnhook);
-           PL_warnhook = PERL_WARNHOOK_FATAL;
-           utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
-           LEAVE;
+            _force_out_malformed_utf8_message((U8 *) s,
+                                              (U8 *) bufend,
+                                              0,
+                                              1 /* 1 means die */ );
+            NOT_REACHED; /* NOTREACHED */
        }
        return unichar;
     } else {
        }
        return unichar;
     } else {
@@ -2554,15 +2551,10 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
                                      e - backslash_ptr,
                                      &first_bad_char_loc))
     {
                                      e - backslash_ptr,
                                      &first_bad_char_loc))
     {
-        /* If warnings are on, this will print a more detailed analysis of what
-         * is wrong than the error message below */
-        utf8n_to_uvchr(first_bad_char_loc,
-                       e - ((char *) first_bad_char_loc),
-                       NULL, 0);
-
-        /* We deliberately don't try to print the malformed character, which
-         * might not print very well; it also may be just the first of many
-         * malformations, so don't print what comes after it */
+        _force_out_malformed_utf8_message(first_bad_char_loc,
+                                          (U8 *) PL_parser->bufend,
+                                          0,
+                                          0 /* 0 means don't die */ );
         yyerror_pv(Perl_form(aTHX_
             "Malformed UTF-8 character immediately after '%.*s'",
             (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr),
         yyerror_pv(Perl_form(aTHX_
             "Malformed UTF-8 character immediately after '%.*s'",
             (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr),
@@ -2695,15 +2687,10 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
         STRLEN len;
         const char* const str = SvPV_const(res, len);
         if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
         STRLEN len;
         const char* const str = SvPV_const(res, len);
         if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
-            /* If warnings are on, this will print a more detailed analysis of
-             * what is wrong than the error message below */
-            utf8n_to_uvchr(first_bad_char_loc,
-                           (char *) first_bad_char_loc - str,
-                           NULL, 0);
-
-            /* We deliberately don't try to print the malformed character,
-             * which might not print very well; it also may be just the first
-             * of many malformations, so don't print what comes after it */
+            _force_out_malformed_utf8_message(first_bad_char_loc,
+                                              (U8 *) PL_parser->bufend,
+                                              0,
+                                              0 /* 0 means don't die */ );
             yyerror_pv(
               Perl_form(aTHX_
                 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
             yyerror_pv(
               Perl_form(aTHX_
                 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
@@ -4902,11 +4889,10 @@ Perl_yylex(pTHX)
     default:
        if (UTF) {
             if (! isUTF8_CHAR((U8 *) s, (U8 *) PL_bufend)) {
     default:
        if (UTF) {
             if (! isUTF8_CHAR((U8 *) s, (U8 *) PL_bufend)) {
-                ENTER;
-                SAVESPTR(PL_warnhook);
-                PL_warnhook = PERL_WARNHOOK_FATAL;
-                utf8n_to_uvchr((U8*)s, PL_bufend-s, NULL, 0);
-                LEAVE;
+                _force_out_malformed_utf8_message((U8 *) s, (U8 *) PL_bufend,
+                                                  0,
+                                                  1 /* 1 means die */ );
+                NOT_REACHED; /* NOTREACHED */
             }
             if (isIDFIRST_utf8((U8*)s)) {
                 goto keylookup;
             }
             if (isIDFIRST_utf8((U8*)s)) {
                 goto keylookup;