toke.c: Allow \N{} handling fcn to be used elsewhere in core

[perl5.git] / toke.c
diff --git a/toke.c b/toke.c

index fc87252..af3a5eb 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -41,8 +41,8 @@ Individual members of C<PL_parser> have their own documentation.
  #include "dquote_inline.h"
  #include "invlist_inline.h"
  
-#define new_constant(a,b,c,d,e,f,g)    \
-       S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
+#define new_constant(a,b,c,d,e,f,g, h) \
+       S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g, h)
  
  #define pl_yylval      (PL_parser->yylval)
  
@@ -1829,14 +1829,14 @@ S_incline(pTHX_ const char *s, const char *end)
                     }
                     else if (GvAV(cfgv)) {
                         AV * const av = GvAV(cfgv);
-                       const I32 start = CopLINE(PL_curcop)+1;
-                       I32 items = AvFILLp(av) - start;
+                       const line_t start = CopLINE(PL_curcop)+1;
+                       SSize_t items = AvFILLp(av) - start;
                         if (items > 0) {
                             AV * const av2 = GvAVn(gv2);
                             SV **svp = AvARRAY(av) + start;
-                           I32 l = (I32)line_num+1;
-                           while (items--)
-                               av_store(av2, l++, SvREFCNT_inc(*svp++));
+                           Size_t l = line_num+1;
+                           while (items-- && l < SSize_t_MAX && l == (line_t)l)
+                               av_store(av2, (SSize_t)l++, SvREFCNT_inc(*svp++));
                         }
                     }
                 }
@@ -2068,6 +2068,7 @@ S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
      SV * const sv = newSVpvn_utf8(start, len,
                      ! IN_BYTES
                    &&  UTF
+                  &&  len != 0
                    &&  is_utf8_non_invariant_string((const U8*)start, len));
      return sv;
  }
@@ -2331,7 +2332,7 @@ S_tokeq(pTHX_ SV *sv)
      SvCUR_set(sv, d - SvPVX_const(sv));
    finish:
      if ( PL_hints & HINT_NEW_STRING )
-       return new_constant(NULL, 0, "q", sv, pv, "q", 1);
+       return new_constant(NULL, 0, "q", sv, pv, "q", 1, NULL);
      return sv;
  }
  
@@ -2575,16 +2576,8 @@ S_sublex_done(pTHX)
         const line_t l = CopLINE(PL_curcop);
         LEAVE;
          if (PL_parser->sub_error_count != PL_error_count) {
-            const char * const name = OutCopFILE(PL_curcop);
              if (PL_parser->sub_no_recover) {
-                const char * msg = "";
-                if (PL_in_eval) {
-                    SV *errsv = ERRSV;
-                    if (SvCUR(ERRSV)) {
-                        msg = Perl_form(aTHX_ "%" SVf, SVfARG(errsv));
-                    }
-                }
-                abort_execution(msg, name);
+                yyquit();
                  NOT_REACHED;
              }
          }
@@ -2598,33 +2591,69 @@ S_sublex_done(pTHX)
  }
  
  STATIC SV*
-S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
+S_get_and_check_backslash_N_name_wrapper(pTHX_ const char* s, const char* const e)
+{
+    /* This justs wraps get_and_check_backslash_N_name() to output any error
+     * message it returns. */
+
+    const char * error_msg = NULL;
+    SV * result;
+
+    PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME_WRAPPER;
+
+    /* charnames doesn't work well if there have been errors found */
+    if (PL_error_count > 0) {
+       return NULL;
+    }
+
+    result = get_and_check_backslash_N_name(s, e, cBOOL(UTF), &error_msg);
+
+    if (error_msg) {
+        yyerror_pv(error_msg, UTF ? SVf_UTF8 : 0);
+    }
+
+    return result;
+}
+
+SV*
+Perl_get_and_check_backslash_N_name(pTHX_ const char* s,
+                                          const char* const e,
+                                          const bool is_utf8,
+                                          const char ** error_msg)
  {
      /* <s> points to first character of interior of \N{}, <e> to one beyond the
       * interior, hence to the "}".  Finds what the name resolves to, returning
-     * an SV* containing it; NULL if no valid one found */
-
-    SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
+     * an SV* containing it; NULL if no valid one found.
+     *
+     * 'is_utf8' is TRUE if we know we want the result to be UTF-8 even if it
+     * doesn't have to be. */
  
+    SV* res;
      HV * table;
      SV **cvp;
      SV *cv;
      SV *rv;
      HV *stash;
      const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
+    dVAR;
  
      PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
  
+    assert(e >= s);
+    assert(s > (char *) 3);
+
+    res = newSVpvn_flags(s, e - s, (is_utf8) ? SVf_UTF8 : 0);
+
      if (!SvCUR(res)) {
          SvREFCNT_dec_NN(res);
          /* diag_listed_as: Unknown charname '%s' */
-        yyerror("Unknown charname ''");
+        *error_msg = Perl_form(aTHX_ "Unknown charname ''");
          return NULL;
      }
  
      res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
                          /* include the <}> */
-                        e - backslash_ptr + 1);
+                        e - backslash_ptr + 1, error_msg);
      if (! SvPOK(res)) {
          SvREFCNT_dec_NN(res);
          return NULL;
@@ -2653,7 +2682,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
       * characters that begin a character name alias are alphabetic, otherwise
       * would have to create a isCHARNAME_BEGIN macro */
  
-    if (! UTF) {
+    if (! is_utf8) {
          if (! isALPHAU(*s)) {
              goto bad_charname;
          }
@@ -2727,18 +2756,15 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
          /* diag_listed_as: charnames alias definitions may not contain
                             trailing white-space; marked by <-- HERE in %s
           */
-        yyerror_pv(
-            Perl_form(aTHX_
+        *error_msg = Perl_form(aTHX_
              "charnames alias definitions may not contain trailing "
              "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
              (int)(s - backslash_ptr + 1), backslash_ptr,
-            (int)(e - s + 1), s + 1
-            ),
-        UTF ? SVf_UTF8 : 0);
+            (int)(e - s + 1), s + 1);
          return NULL;
      }
  
-    if (SvUTF8(res)) { /* Don't accept malformed input */
+    if (SvUTF8(res)) { /* Don't accept malformed charname value */
          const U8* first_bad_char_loc;
          STRLEN len;
          const char* const str = SvPV_const(res, len);
@@ -2751,13 +2777,10 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
                                                0 /* 0 means don't die */ );
              /* diag_listed_as: Malformed UTF-8 returned by \N{%s}
                                 immediately after '%s' */
-            yyerror_pv(
-              Perl_form(aTHX_
+            *error_msg = Perl_form(aTHX_
                  "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
                   (int) (e - backslash_ptr + 1), backslash_ptr,
-                 (int) ((char *) first_bad_char_loc - str), str
-              ),
-              SVf_UTF8);
+                 (int) ((char *) first_bad_char_loc - str), str);
              return NULL;
          }
      }
@@ -2770,13 +2793,10 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
           * that this print won't run off the end of the string */
          /* diag_listed_as: Invalid character in \N{...}; marked by <-- HERE
                             in \N{%s} */
-        yyerror_pv(
-          Perl_form(aTHX_
+        *error_msg = Perl_form(aTHX_
              "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
              (int)(s - backslash_ptr + 1), backslash_ptr,
-            (int)(e - s + 1), s + 1
-          ),
-          UTF ? SVf_UTF8 : 0);
+            (int)(e - s + 1), s + 1);
          return NULL;
      }
  
@@ -2784,14 +2804,11 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
          /* diag_listed_as: charnames alias definitions may not contain a
                             sequence of multiple spaces; marked by <-- HERE
                             in %s */
-        yyerror_pv(
-          Perl_form(aTHX_
+        *error_msg = Perl_form(aTHX_
              "charnames alias definitions may not contain a sequence of "
              "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
              (int)(s - backslash_ptr + 1), backslash_ptr,
-            (int)(e - s + 1), s + 1
-          ),
-          UTF ? SVf_UTF8 : 0);
+            (int)(e - s + 1), s + 1);
          return NULL;
  }
  
@@ -3770,7 +3787,7 @@ S_scan_const(pTHX_ char *start)
                     }
                 }
                 else /* Here is \N{NAME} but not \N{U+...}. */
-                     if ((res = get_and_check_backslash_N_name(s, e)))
+                     if ((res = get_and_check_backslash_N_name_wrapper(s, e)))
                  {
                      STRLEN len;
                      const char *str = SvPV_const(res, len);
@@ -4124,7 +4141,7 @@ S_scan_const(pTHX_ char *start)
             }
  
             sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
-                               type, typelen);
+                               type, typelen, NULL);
         }
          pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
      }
@@ -5099,6 +5116,14 @@ Perl_yylex(pTHX)
  
         return yylex();
      case LEX_FORMLINE:
+        if (PL_parser->sub_error_count != PL_error_count) {
+            /* There was an error parsing a formline, which tends to
+               mess up the parser.
+               Unlike interpolated sub-parsing, we can't treat any of
+               these as recoverable, so no need to check sub_no_recover.
+            */
+            yyquit();
+        }
         assert(PL_lex_formbrack);
         s = scan_formline(PL_bufptr);
         if (!PL_lex_formbrack)
@@ -6518,6 +6543,7 @@ Perl_yylex(pTHX)
                 SAVEI32(PL_lex_formbrack);
                 PL_parser->form_lex_state = PL_lex_state;
                 PL_lex_formbrack = PL_lex_brackets + 1;
+                PL_parser->sub_error_count = PL_error_count;
                 goto leftbracket;
             }
         }
@@ -7248,10 +7274,7 @@ Perl_yylex(pTHX)
             else {                      /* no override */
                 tmp = -tmp;
                 if (tmp == KEY_dump) {
-                   Perl_ck_warner_d(aTHX_ packWARN2(WARN_MISC,WARN_DEPRECATED),
-                                    "dump() better written as CORE::dump(). "
-                                     "dump() will no longer be available "
-                                     "in Perl 5.30");
+                   Perl_croak(aTHX_ "dump() must be written as CORE::dump() as of Perl 5.30");
                 }
                 gv = NULL;
                 gvp = 0;
@@ -8741,9 +8764,9 @@ Perl_yylex(pTHX)
                 /* Look for a prototype */
                 if (*s == '(' && !is_sigsub) {
                     s = scan_str(s,FALSE,FALSE,FALSE,NULL);
-                   COPLINE_SET_FROM_MULTI_END;
                     if (!s)
                         Perl_croak(aTHX_ "Prototype not terminated");
+                   COPLINE_SET_FROM_MULTI_END;
                     (void)validate_proto(PL_subname, PL_lex_stuff,
                                          ckWARN(WARN_ILLEGALPROTO), 0);
                     have_proto = TRUE;
@@ -9176,11 +9199,15 @@ S_checkcomma(pTHX_ const char *s, const char *name, const char *what)
     Best used as sv=new_constant(..., sv, ...).
     If s, pv are NULL, calls subroutine with one argument,
     and <type> is used with error messages only.
-   <type> is assumed to be well formed UTF-8 */
+   <type> is assumed to be well formed UTF-8.
+
+   If error_msg is not NULL, *error_msg will be set to any error encountered.
+   Otherwise yyerror() will be used to output it */
  
  STATIC SV *
  S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
-              SV *sv, SV *pv, const char *type, STRLEN typelen)
+              SV *sv, SV *pv, const char *type, STRLEN typelen,
+               const char ** error_msg)
  {
      dSP;
      HV * table = GvHV(PL_hintgv);               /* ^H */
@@ -9195,13 +9222,6 @@ S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
      if (*key == 'c') { assert (strEQ(key, "charnames")); }
      assert(type || s);
  
-    /* charnames doesn't work well if there have been errors found */
-    if (PL_error_count > 0 && *key == 'c')
-    {
-       SvREFCNT_dec_NN(sv);
-       return &PL_sv_undef;
-    }
-
      sv_2mortal(sv);                    /* Parent created it permanently */
      if (!table
         || ! (PL_hints & HINT_LOCALIZE_HH)
@@ -9256,7 +9276,12 @@ S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
                                      (type ? type: s), why1, why2, why3);
              }
          }
-       yyerror_pv(msg, UTF ? SVf_UTF8 : 0);
+        if (error_msg) {
+            *error_msg = msg;
+        }
+        else {
+            yyerror_pv(msg, UTF ? SVf_UTF8 : 0);
+        }
         return SvREFCNT_inc_simple_NN(sv);
      }
    now_ok:
@@ -10331,7 +10356,7 @@ S_scan_heredoc(pTHX_ char *s)
         while (ss < se) {
             /* newline only? Copy and move on */
             if (*ss == '\n') {
-               sv_catpv(newstr,"\n");
+               sv_catpvs(newstr,"\n");
                 ss++;
                 linecount++;
  
@@ -10613,14 +10638,11 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re
      const char * opening_delims = "([{<";
      const char * closing_delims = ")]}>";
  
+    /* The only non-UTF character that isn't a stand alone grapheme is
+     * white-space, hence can't be a delimiter. */
      const char * non_grapheme_msg = "Use of unassigned code point or"
                                      " non-standalone grapheme for a delimiter"
-                                    " will be a fatal error starting in Perl"
-                                    " 5.30";
-    /* The only non-UTF character that isn't a stand alone grapheme is
-     * white-space, hence can't be a delimiter.  So can skip for non-UTF-8 */
-    bool check_grapheme = UTF && ckWARN_d(WARN_DEPRECATED);
-
+                                    " is not allowed";
      PERL_ARGS_ASSERT_SCAN_STR;
  
      /* skip space before the delimiter */
@@ -10639,26 +10661,12 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re
      }
      else {
         termcode = utf8_to_uvchr_buf((U8*)s, (U8*)PL_bufend, &termlen);
-        if (check_grapheme) {
-            if (   UNLIKELY(UNICODE_IS_SUPER(termcode))
-                || UNLIKELY(UNICODE_IS_NONCHAR(termcode)))
-            {
-                /* These are considered graphemes, and since the ending
-                 * delimiter will be the same, we don't have to check the other
-                 * end */
-                check_grapheme = FALSE;
-            }
-            else if (UNLIKELY(! _is_grapheme((U8 *) start,
-                                             (U8 *) s,
-                                             (U8 *) PL_bufend,
-                                             termcode)))
-            {
-                Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), "%s", non_grapheme_msg);
-
-                /* Don't have to check the other end, as have already warned at
-                 * this one */
-                check_grapheme = FALSE;
-            }
+        if (UTF && UNLIKELY(! _is_grapheme((U8 *) start,
+                                           (U8 *) s,
+                                           (U8 *) PL_bufend,
+                                                  termcode)))
+        {
+            yyerror(non_grapheme_msg);
          }
  
         Copy(s, termstr, termlen, U8);
@@ -10724,14 +10732,13 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re
                      if (   s + termlen <= PL_bufend
                          && memEQ(s + 1, (char*)termstr + 1, termlen - 1))
                      {
-                        if (   check_grapheme
+                        if (   UTF
                              && UNLIKELY(! _is_grapheme((U8 *) start,
-                                                              (U8 *) s,
-                                                              (U8 *) PL_bufend,
+                                                       (U8 *) s,
+                                                       (U8 *) PL_bufend,
                                                                termcode)))
                          {
-                            Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
-                                        "%s", non_grapheme_msg);
+                            yyerror(non_grapheme_msg);
                          }
                         break;
                      }
@@ -11264,9 +11271,10 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
             }
             if (just_zero && (PL_hints & HINT_NEW_INTEGER))
                 sv = new_constant(start, s - start, "integer",
-                                 sv, NULL, NULL, 0);
+                                 sv, NULL, NULL, 0, NULL);
             else if (PL_hints & HINT_NEW_BINARY)
-               sv = new_constant(start, s - start, "binary", sv, NULL, NULL, 0);
+               sv = new_constant(start, s - start, "binary",
+                                  sv, NULL, NULL, 0, NULL);
         }
         break;
  
@@ -11471,7 +11479,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
             const char *const key = floatit ? "float" : "integer";
             const STRLEN keylen = floatit ? 5 : 7;
             sv = S_new_constant(aTHX_ PL_tokenbuf, d - PL_tokenbuf,
-                               key, keylen, sv, NULL, NULL, 0);
+                               key, keylen, sv, NULL, NULL, 0, NULL);
         }
         break;