toke.c: Fail on malformed UTF-8 in \N{} input

author Karl Williamson <public@khwilliamson.com>

Thu, 1 Nov 2012 18:06:32 +0000 (12:06 -0600)

committer Karl Williamson <public@khwilliamson.com>

Sun, 11 Nov 2012 17:11:34 +0000 (10:11 -0700)
author Karl Williamson <public@khwilliamson.com>
Thu, 1 Nov 2012 18:06:32 +0000 (12:06 -0600)
committer Karl Williamson <public@khwilliamson.com>
Sun, 11 Nov 2012 17:11:34 +0000 (10:11 -0700)
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index 0f98ee8..ba30c17 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -2702,6 +2702,14 @@ message.
  
  See also L<Encode/"Handling Malformed Data">.
  
+=item Malformed UTF-8 character immediately after '%s'
+
+(F) You said C<use utf8>, but the program file doesn't comply with UTF-8
+encoding rules.  The message prints out the properly encoded characters
+just before the first bad one.  If C<utf8> warnings are enabled, a
+warning is generated that gives more details about the type of
+malformation.
+
  =item Malformed UTF-8 returned by \N
  
  (F) The charnames handler returned malformed UTF-8.
diff --git a/t/uni/parser.t b/t/uni/parser.t

index 79e4612..fb7b720 100644 (file)
--- a/t/uni/parser.t
+++ b/t/uni/parser.t
@@ -7,7 +7,7 @@ BEGIN {
      require './test.pl';
  }
  
-plan (tests => 47);
+plan (tests => 48);
  
  use utf8;
  use open qw( :utf8 :std );
@@ -145,3 +145,10 @@ eval q{ Ｆｏｏ::$bar };
  like( $@, qr/Bad name after Ｆｏｏ::/, 'Bad name after Ｆｏｏ::' );
  eval q{ Ｆｏｏ''bar };
  like( $@, qr/Bad name after Ｆｏｏ'/, 'Bad name after Ｆｏｏ\'' );
+
+{
+    no warnings 'utf8';
+    my $malformed_to_be = "\x{c0}\x{a0}";   # Overlong sequence
+    CORE::evalbytes "use charnames ':full'; use utf8; my \$x = \"\\N{abc$malformed_to_be}\"";
+    like( $@, qr/Malformed UTF-8 character immediately after '\\N{abc' at .* within string/, 'Malformed UTF-8 input to \N{}');
+}
diff --git a/toke.c b/toke.c

index 0f946aa..c71cdb7 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -2654,9 +2654,28 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
  
      PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
  
-    res = new_constant( NULL, 0, "charnames",
-                        /* includes all of: \N{...} */
-                        res, NULL, s - 3, e - s + 4 );
+    if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
+                                     e - backslash_ptr,
+                                     &first_bad_char_loc))
+    {
+        /* If warnings are on, this will print a more detailed analysis of what
+         * is wrong than the error message below */
+        utf8n_to_uvuni(first_bad_char_loc,
+                       e - ((char *) first_bad_char_loc),
+                       NULL, 0);
+
+        /* We deliberately don't try to print the malformed character, which
+         * might not print very well; it also may be just the first of many
+         * malformations, so don't print what comes after it */
+        yyerror(Perl_form(aTHX_
+            "Malformed UTF-8 character immediately after '%.*s'",
+            (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
+       return NULL;
+    }
+
+    res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
+                        /* include the <}> */
+                        e - backslash_ptr + 1);
      if (! SvPOK(res)) {
          return NULL;
      }
@@ -8931,7 +8950,8 @@ S_checkcomma(pTHX_ const char *s, const char *name, const char *what)
  /* Either returns sv, or mortalizes sv and returns a new SV*.
     Best used as sv=new_constant(..., sv, ...).
     If s, pv are NULL, calls subroutine with one argument,
-   and type is used with error messages only. */
+   and <type> is used with error messages only.
+   <type> is assumed to be well formed UTF-8 */
  
  STATIC SV *
  S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
author	Karl Williamson <public@khwilliamson.com>
	Thu, 1 Nov 2012 18:06:32 +0000 (12:06 -0600)
committer	Karl Williamson <public@khwilliamson.com>
	Sun, 11 Nov 2012 17:11:34 +0000 (10:11 -0700)
pod/perldiag.pod		patch \| blob \| blame \| history
t/uni/parser.t		patch \| blob \| blame \| history
toke.c		patch \| blob \| blame \| history