PATCH: [perl #132163] regex assertion failure

author Karl Williamson <khw@cpan.org>

Tue, 6 Mar 2018 19:32:58 +0000 (12:32 -0700)

committer Karl Williamson <khw@cpan.org>

Tue, 6 Mar 2018 21:37:46 +0000 (14:37 -0700)
author Karl Williamson <khw@cpan.org>
Tue, 6 Mar 2018 19:32:58 +0000 (12:32 -0700)
committer Karl Williamson <khw@cpan.org>
Tue, 6 Mar 2018 21:37:46 +0000 (14:37 -0700)
diff --git a/MANIFEST b/MANIFEST

index b41aa29..c11c8f5 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -5485,6 +5485,7 @@ t/lib/croak/pp                    Test croak calls from pp.c
  t/lib/croak/pp_ctl             Test croak calls from pp_ctl.c
  t/lib/croak/pp_hot             Test croak calls from pp_hot.c
  t/lib/croak/pp_sys             Test croak calls from pp_sys.c
+t/lib/croak/regcomp            Test croak calls from regcomp.c
  t/lib/croak/toke               Test croak calls from toke.c
  t/lib/croak/toke_l1            Test croak calls from toke.c; file is not UTF-8 encoded
  t/lib/cygwin.t                 Builtin cygwin function tests
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index 891989c..72b3632 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -7151,7 +7151,10 @@ modifier is not presently meaningful in substitutions.
  use the /g modifier.  Currently, /c is meaningful only when /g is
  used.  (This may change in the future.)
  
-=item Use of code point 0x%s is not allowed; the permissible max is 0x%s.
+=item Use of code point 0x%s is not allowed; the permissible max is 0x%x
+
+=item Use of code point 0x%s is not allowed; the permissible max is 0x%x
+in regex; marked by <-- HERE in m/%s/
  
  (F) You used a code point that is not allowed, because it is too large.
  Unicode only allows code points up to 0x10FFFF, but Perl allows much
diff --git a/regcomp.c b/regcomp.c

index f8d9771..92c8fe4 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -12307,15 +12307,13 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
    */
  
      char * endbrace;    /* points to '}' following the name */
-    char * endchar;     /* Points to '.' or '}' ending cur char in the input
-                           stream */
      char* p = RExC_parse; /* Temporary */
  
      SV * substitute_parse;
-    STRLEN len;
      char *orig_end;
      char *save_start;
      I32 flags;
+    Size_t count = 0;   /* code point count kept internally by this function */
  
      GET_RE_DEBUG_FLAGS_DECL;
  
@@ -12400,139 +12398,165 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
          vFAIL("\\N{NAME} must be resolved by the lexer");
      }
  
+        /* This code purposely indented below because of future changes coming */
+
+        /* We can get to here when the input is \N{U+...} or when toke.c has
+         * converted a name to the \N{U+...} form.  This include changing a
+         * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
+
          RExC_parse += 2;    /* Skip past the 'U+' */
  
-        /* Because toke.c has generated a special construct for us guaranteed
-         * not to have NULs, we can use a str function */
-        endchar = RExC_parse + strcspn(RExC_parse, ".}");
+        /* Code points are separated by dots.  The '}' terminates the whole
+         * thing. */
+
+        do {    /* Loop until the ending brace */
+            UV cp = 0;
+            char * start_digit;     /* The first of the current code point */
+            if (! isXDIGIT(*RExC_parse)) {
+                RExC_parse++;
+                vFAIL("Invalid hexadecimal number in \\N{U+...}");
+            }
+
+            start_digit = RExC_parse;
+            count++;
  
-        /* Code points are separated by dots.  If none, there is only one code
-         * point, and is terminated by the brace */
+            /* Loop through the hex digits of the current code point */
+            do {
+                /* Adding this digit will shift the result 4 bits.  If that
+                 * result would be above IV_MAX, it's overflow */
+                if (cp > IV_MAX >> 4) {
+
+                    /* Find the end of the code point */
+                    do {
+                        RExC_parse ++;
+                    } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_');
+
+                    /* Be sure to synchronize this message with the similar one
+                     * in utf8.c */
+                    vFAIL4("Use of code point 0x%.*s is not allowed; the"
+                        " permissible max is 0x%" UVxf,
+                        (int) (RExC_parse - start_digit), start_digit, IV_MAX);
+                }
  
-        if (endchar >= endbrace) {
-            STRLEN length_of_hex;
-            I32 grok_hex_flags;
+                /* Accumulate this (valid) digit into the running total */
+                cp  = (cp << 4) + READ_XDIGIT(RExC_parse);
  
-            /* Here, exactly one code point.  If that isn't what is wanted,
-             * fail */
-            if (! code_point_p) {
-                RExC_parse = p;
-                return FALSE;
+                /* READ_XDIGIT advanced the input pointer.  Ignore a single
+                 * underscore separator */
+                if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) {
+                    RExC_parse++;
+                }
+            } while (isXDIGIT(*RExC_parse));
+
+            /* Here, have accumulated the next code point */
+            if (RExC_parse >= endbrace) {   /* If done ... */
+                if (count != 1) {
+                    goto do_concat;
+                }
+
+                /* Here, is a single code point; fail if doesn't want that */
+                if (! code_point_p) {
+                    RExC_parse = p;
+                    return FALSE;
+                }
+
+                /* A single code point is easy to handle; just return it */
+                *code_point_p = UNI_TO_NATIVE(cp);
+                RExC_parse = endbrace;
+                nextchar(pRExC_state);
+                return TRUE;
              }
  
-            /* Convert code point from hex */
-            length_of_hex = (STRLEN)(endchar - RExC_parse);
-            grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
-                            | PERL_SCAN_DISALLOW_PREFIX
-
-                                /* No errors in the first pass (See [perl
-                                * #122671].)  We let the code below find the
-                                * errors when there are multiple chars. */
-                            | ((SIZE_ONLY)
-                                ? PERL_SCAN_SILENT_ILLDIGIT
-                                : 0);
-
-            /* This routine is the one place where both single- and
-             * double-quotish \N{U+xxxx} are evaluated.  The value is a Unicode
-             * code point which must be converted to native. */
-            *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
-                                            &length_of_hex,
-                                            &grok_hex_flags,
-                                            NULL));
-
-            /* The tokenizer should have guaranteed validity, but it's possible
-             * to bypass it by using single quoting, so check.  Don't do the
-             * check here when there are multiple chars; we do it below anyway.
-             * */
-            if (length_of_hex == 0
-                || length_of_hex != (STRLEN)(endchar - RExC_parse) )
-            {
-                RExC_parse += length_of_hex;    /* Includes all the valid */
+            /* Here, the only legal thing would be a multiple character
+             * sequence (of the form "\N{U+c1.c2. ... }".   So the next
+             * character must be a dot (and the one after that can't be the
+             * endbrace, or we'd have something like \N{U+100.} ) */
+            if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
                  RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
                                  ? UTF8SKIP(RExC_parse)
                                  : 1;
-                /* Guard against malformed utf8 */
-                if (RExC_parse >= endchar) {
-                    RExC_parse = endchar;
+                if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */
+                    RExC_parse = endbrace;
                  }
                  vFAIL("Invalid hexadecimal number in \\N{U+...}");
              }
  
-            RExC_parse = endbrace + 1;
-            return TRUE;
-        }
-
-        /* Here, is a multiple character sequence */
-
-        /* Count the code points, if desired, in the sequence */
-        if (cp_count) {
-            *cp_count = 0;
-            while (RExC_parse < endbrace) {
-                /* Point to the beginning of the next character in the sequence. */
-                RExC_parse = endchar + 1;
-                endchar = RExC_parse + strcspn(RExC_parse, ".}");
-                (*cp_count)++;
-            }
-        }
+            /* Here, looks like its really a multiple character sequence.  Fail
+             * if that's not what the caller wants. */
+            if (! node_p) {
+
+                /* But even if failing, we count the code points if requested, and
+                 * don't back up up the pointer as the caller is expected to
+                 * handle this situation */
+                if (cp_count) {
+                    char * dot = RExC_parse + 1;
+                    do {
+                        dot = (char *) memchr(dot, '.', endbrace - dot);
+                        if (! dot) {
+                            break;
+                        }
+                        count++;
+                        dot++;
+                    } while (dot < endbrace);
+                    count++;
  
-        /* Fail if caller doesn't want to handle a multi-code-point sequence.
-         * But don't backup up the pointer if the caller wants to know how many
-         * code points there are (they can then handle things) */
-        if (! node_p) {
-            if (! cp_count) {
-                RExC_parse = p;
+                    *cp_count = count;
+                    RExC_parse = endbrace;
+                    nextchar(pRExC_state);
+                }
+                else {  /* Back up the pointer. */
+                    RExC_parse = p;
+                }
+                return FALSE;
              }
-            return FALSE;
-        }
  
-        /* What is done here is to convert this to a sub-pattern of the form
-         * \x{char1}\x{char2}...  and then call reg recursively to parse it
-         * (enclosing in "(?: ... )" ).  That way, it retains its atomicness,
-         * while not having to worry about special handling that some code
-         * points may have. */
+            /* What is done here is to convert this to a sub-pattern of the
+             * form \x{char1}\x{char2}...  and then call reg recursively to
+             * parse it (enclosing in "(?: ... )" ).  That way, it retains its
+             * atomicness, while not having to worry about special handling
+             * that some code points may have. */
  
-        substitute_parse = newSVpvs("?:");
+            if (count == 1) {
+                substitute_parse = newSVpvs("?:");
+            }
  
-        while (RExC_parse < endbrace) {
+          do_concat:
  
              /* Convert to notation the rest of the code understands */
              sv_catpv(substitute_parse, "\\x{");
-            sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
+            sv_catpvn(substitute_parse, start_digit, RExC_parse - start_digit);
              sv_catpv(substitute_parse, "}");
  
-            /* Point to the beginning of the next character in the sequence. */
-            RExC_parse = endchar + 1;
-            endchar = RExC_parse + strcspn(RExC_parse, ".}");
-
-        }
-        sv_catpv(substitute_parse, ")");
+            /* Move to after the dot (or ending brace the final time through.)
+             * */
+            RExC_parse++;
  
-        len = SvCUR(substitute_parse);
+        } while (RExC_parse < endbrace);
  
-        /* Don't allow empty number */
-        if (len < (STRLEN) 8) {
-            RExC_parse = endbrace;
-           vFAIL("Invalid hexadecimal number in \\N{U+...}");
-       }
+        sv_catpv(substitute_parse, ")");
  
-        /* The values are Unicode, and therefore not subject to recoding, but
-         * have to be converted to native on a non-Unicode (meaning non-ASCII)
-         * platform. */
  #ifdef EBCDIC
+        /* The values are Unicode, and therefore have to be converted to native
+         * on a non-Unicode (meaning non-ASCII) platform. */
          RExC_recode_x_to_native = 1;
  #endif
  
+    /* Here, we have the string the name evaluates to, ready to be parsed,
+     * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
+     * constructs.  This can be called from within a substitute parse already.
+     * The error reporting mechanism doesn't work for 2 levels of this, but the
+     * code above has validated this new construct, so there should be no
+     * errors generated by the below.*/
      save_start = RExC_start;
      orig_end = RExC_end;
  
-    RExC_parse = RExC_start = RExC_adjusted_start = SvPV(substitute_parse,
-                                                         len);
-    RExC_end = RExC_parse + len;
+    RExC_parse = RExC_start = SvPVX(substitute_parse);
+    RExC_end = RExC_parse + SvCUR(substitute_parse);
  
      *node_p = reg(pRExC_state, 1, &flags, depth+1);
  
      /* Restore the saved values */
-    RExC_start = RExC_adjusted_start = save_start;
+    RExC_start = save_start;
      RExC_parse = endbrace;
      RExC_end = orig_end;
  #ifdef EBCDIC
diff --git a/t/lib/croak/regcomp b/t/lib/croak/regcomp

new file mode 100644 (file)

index 0000000..19586d5
--- /dev/null
+++ b/t/lib/croak/regcomp
@@ -0,0 +1,56 @@
+__END__
+# NAME \N{U+too large} on 64-bit machine
+# SKIP ? use Config; $Config{uvsize} < 8 && "Not 64 bit"
+qr/\N{U+7FFFFFFFFFFFFFFF}/;
+qr/\N{U+1_0000_0000_0000_0000}/;
+EXPECT
+Use of code point 0x1_0000_0000_0000_0000 is not allowed; the permissible max is 0x7fffffffffffffff in regex; marked by <-- HERE in m/\N{U+1_0000_0000_0000_0000 <-- HERE }/ at - line 2.
+########
+# NAME \N{U+too large} on 32-bit machine
+# SKIP ? use Config; $Config{uvsize} > 4 && "Not 32 bit"
+qr/\N{U+7FFFFFFF}/;
+qr/\N{U+1_0000_0000}/;
+EXPECT
+Use of code point 0x1_0000_0000 is not allowed; the permissible max is 0x7fffffff in regex; marked by <-- HERE in m/\N{U+1_0000_0000 <-- HERE }/ at - line 2.
+########
+# NAME \N{U+100.too large} on 64-bit machine
+# SKIP ? use Config; $Config{uvsize} < 8 && "Not 64 bit"
+qr/\N{U+100.7FFFFFFFFFFFFFFF}/;
+qr/\N{U+100.1_0000_0000_0000_0000}/;
+EXPECT
+Use of code point 0x1_0000_0000_0000_0000 is not allowed; the permissible max is 0x7fffffffffffffff in regex; marked by <-- HERE in m/\N{U+100.1_0000_0000_0000_0000 <-- HERE }/ at - line 2.
+########
+# NAME \N{U+100.too large} on 32-bit machine
+# SKIP ? use Config; $Config{uvsize} > 4 && "Not 32 bit"
+qr/\N{U+100.7FFFFFFF}/;
+qr/\N{U+100.1_0000_0000}/;
+EXPECT
+Use of code point 0x1_0000_0000 is not allowed; the permissible max is 0x7fffffff in regex; marked by <-- HERE in m/\N{U+100.1_0000_0000 <-- HERE }/ at - line 2.
+########
+# NAME \N{U+.}
+my $p00="\\N{U+.}"; qr/$p00/;
+EXPECT
+Invalid hexadecimal number in \N{U+...} in regex; marked by <-- HERE in m/\N{U+. <-- HERE }/ at - line 1.
+########
+# NAME \N{U+100.}
+my $p00="\\N{U+100.}"; qr/$p00/;
+EXPECT
+Invalid hexadecimal number in \N{U+...} in regex; marked by <-- HERE in m/\N{U+100. <-- HERE }/ at - line 1.
+########
+# NAME \N{U+_100}
+my $p00="\\N{U+_100}"; qr/$p00/;
+EXPECT
+Invalid hexadecimal number in \N{U+...} in regex; marked by <-- HERE in m/\N{U+_ <-- HERE 100}/ at - line 1.
+########
+# NAME \N{U+100_}
+my $p00="\\N{U+100_}"; qr/$p00/;
+EXPECT
+Invalid hexadecimal number in \N{U+...} in regex; marked by <-- HERE in m/\N{U+100_ <-- HERE }/ at - line 1.
+########
+# NAME [ß\N{U+.}]
+my $p00="[ß\\N{U+.}]"; qr/$p00/ui;
+# The sharp s under /i recodes the parse, and this was causing a segfault when
+# the error message referred to the original pattern
+EXPECT
+Invalid hexadecimal number in \N{U+...} in regex; marked by <-- HERE in m/[ß\N{U+. <-- HERE }]/ at - line 1.
+########
diff --git a/utf8.c b/utf8.c

index eede51e..aacfc57 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -36,6 +36,8 @@
  static const char malformed_text[] = "Malformed UTF-8 character";
  static const char unees[] =
                          "Malformed UTF-8 character (unexpected end of string)";
+
+/* Be sure to synchronize this message with the similar one in regcomp.c */
  static const char cp_above_legal_max[] =
                          "Use of code point 0x%" UVXf " is not allowed; the"
                          " permissible max is 0x%" UVXf;
author	Karl Williamson <khw@cpan.org>
	Tue, 6 Mar 2018 19:32:58 +0000 (12:32 -0700)
committer	Karl Williamson <khw@cpan.org>
	Tue, 6 Mar 2018 21:37:46 +0000 (14:37 -0700)
MANIFEST		patch \| blob \| blame \| history
pod/perldiag.pod		patch \| blob \| blame \| history
regcomp.c		patch \| blob \| blame \| history
t/lib/croak/regcomp	[new file with mode: 0644]	patch \| blob
utf8.c		patch \| blob \| blame \| history