This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Allow \N{} handling fcn to be used elsewhere in core
[perl5.git] / toke.c
diff --git a/toke.c b/toke.c
index fc87252..af3a5eb 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -41,8 +41,8 @@ Individual members of C<PL_parser> have their own documentation.
 #include "dquote_inline.h"
 #include "invlist_inline.h"
 
-#define new_constant(a,b,c,d,e,f,g)    \
-       S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
+#define new_constant(a,b,c,d,e,f,g, h) \
+       S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g, h)
 
 #define pl_yylval      (PL_parser->yylval)
 
@@ -1829,14 +1829,14 @@ S_incline(pTHX_ const char *s, const char *end)
                    }
                    else if (GvAV(cfgv)) {
                        AV * const av = GvAV(cfgv);
-                       const I32 start = CopLINE(PL_curcop)+1;
-                       I32 items = AvFILLp(av) - start;
+                       const line_t start = CopLINE(PL_curcop)+1;
+                       SSize_t items = AvFILLp(av) - start;
                        if (items > 0) {
                            AV * const av2 = GvAVn(gv2);
                            SV **svp = AvARRAY(av) + start;
-                           I32 l = (I32)line_num+1;
-                           while (items--)
-                               av_store(av2, l++, SvREFCNT_inc(*svp++));
+                           Size_t l = line_num+1;
+                           while (items-- && l < SSize_t_MAX && l == (line_t)l)
+                               av_store(av2, (SSize_t)l++, SvREFCNT_inc(*svp++));
                        }
                    }
                }
@@ -2068,6 +2068,7 @@ S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
     SV * const sv = newSVpvn_utf8(start, len,
                     ! IN_BYTES
                   &&  UTF
+                  &&  len != 0
                   &&  is_utf8_non_invariant_string((const U8*)start, len));
     return sv;
 }
@@ -2331,7 +2332,7 @@ S_tokeq(pTHX_ SV *sv)
     SvCUR_set(sv, d - SvPVX_const(sv));
   finish:
     if ( PL_hints & HINT_NEW_STRING )
-       return new_constant(NULL, 0, "q", sv, pv, "q", 1);
+       return new_constant(NULL, 0, "q", sv, pv, "q", 1, NULL);
     return sv;
 }
 
@@ -2575,16 +2576,8 @@ S_sublex_done(pTHX)
        const line_t l = CopLINE(PL_curcop);
        LEAVE;
         if (PL_parser->sub_error_count != PL_error_count) {
-            const char * const name = OutCopFILE(PL_curcop);
             if (PL_parser->sub_no_recover) {
-                const char * msg = "";
-                if (PL_in_eval) {
-                    SV *errsv = ERRSV;
-                    if (SvCUR(ERRSV)) {
-                        msg = Perl_form(aTHX_ "%" SVf, SVfARG(errsv));
-                    }
-                }
-                abort_execution(msg, name);
+                yyquit();
                 NOT_REACHED;
             }
         }
@@ -2598,33 +2591,69 @@ S_sublex_done(pTHX)
 }
 
 STATIC SV*
-S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
+S_get_and_check_backslash_N_name_wrapper(pTHX_ const char* s, const char* const e)
+{
+    /* This justs wraps get_and_check_backslash_N_name() to output any error
+     * message it returns. */
+
+    const char * error_msg = NULL;
+    SV * result;
+
+    PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME_WRAPPER;
+
+    /* charnames doesn't work well if there have been errors found */
+    if (PL_error_count > 0) {
+       return NULL;
+    }
+
+    result = get_and_check_backslash_N_name(s, e, cBOOL(UTF), &error_msg);
+
+    if (error_msg) {
+        yyerror_pv(error_msg, UTF ? SVf_UTF8 : 0);
+    }
+
+    return result;
+}
+
+SV*
+Perl_get_and_check_backslash_N_name(pTHX_ const char* s,
+                                          const char* const e,
+                                          const bool is_utf8,
+                                          const char ** error_msg)
 {
     /* <s> points to first character of interior of \N{}, <e> to one beyond the
      * interior, hence to the "}".  Finds what the name resolves to, returning
-     * an SV* containing it; NULL if no valid one found */
-
-    SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
+     * an SV* containing it; NULL if no valid one found.
+     *
+     * 'is_utf8' is TRUE if we know we want the result to be UTF-8 even if it
+     * doesn't have to be. */
 
+    SV* res;
     HV * table;
     SV **cvp;
     SV *cv;
     SV *rv;
     HV *stash;
     const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
+    dVAR;
 
     PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
 
+    assert(e >= s);
+    assert(s > (char *) 3);
+
+    res = newSVpvn_flags(s, e - s, (is_utf8) ? SVf_UTF8 : 0);
+
     if (!SvCUR(res)) {
         SvREFCNT_dec_NN(res);
         /* diag_listed_as: Unknown charname '%s' */
-        yyerror("Unknown charname ''");
+        *error_msg = Perl_form(aTHX_ "Unknown charname ''");
         return NULL;
     }
 
     res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
                         /* include the <}> */
-                        e - backslash_ptr + 1);
+                        e - backslash_ptr + 1, error_msg);
     if (! SvPOK(res)) {
         SvREFCNT_dec_NN(res);
         return NULL;
@@ -2653,7 +2682,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
      * characters that begin a character name alias are alphabetic, otherwise
      * would have to create a isCHARNAME_BEGIN macro */
 
-    if (! UTF) {
+    if (! is_utf8) {
         if (! isALPHAU(*s)) {
             goto bad_charname;
         }
@@ -2727,18 +2756,15 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
         /* diag_listed_as: charnames alias definitions may not contain
                            trailing white-space; marked by <-- HERE in %s
          */
-        yyerror_pv(
-            Perl_form(aTHX_
+        *error_msg = Perl_form(aTHX_
             "charnames alias definitions may not contain trailing "
             "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
             (int)(s - backslash_ptr + 1), backslash_ptr,
-            (int)(e - s + 1), s + 1
-            ),
-        UTF ? SVf_UTF8 : 0);
+            (int)(e - s + 1), s + 1);
         return NULL;
     }
 
-    if (SvUTF8(res)) { /* Don't accept malformed input */
+    if (SvUTF8(res)) { /* Don't accept malformed charname value */
         const U8* first_bad_char_loc;
         STRLEN len;
         const char* const str = SvPV_const(res, len);
@@ -2751,13 +2777,10 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
                                               0 /* 0 means don't die */ );
             /* diag_listed_as: Malformed UTF-8 returned by \N{%s}
                                immediately after '%s' */
-            yyerror_pv(
-              Perl_form(aTHX_
+            *error_msg = Perl_form(aTHX_
                 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
                  (int) (e - backslash_ptr + 1), backslash_ptr,
-                 (int) ((char *) first_bad_char_loc - str), str
-              ),
-              SVf_UTF8);
+                 (int) ((char *) first_bad_char_loc - str), str);
             return NULL;
         }
     }
@@ -2770,13 +2793,10 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
          * that this print won't run off the end of the string */
         /* diag_listed_as: Invalid character in \N{...}; marked by <-- HERE
                            in \N{%s} */
-        yyerror_pv(
-          Perl_form(aTHX_
+        *error_msg = Perl_form(aTHX_
             "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
             (int)(s - backslash_ptr + 1), backslash_ptr,
-            (int)(e - s + 1), s + 1
-          ),
-          UTF ? SVf_UTF8 : 0);
+            (int)(e - s + 1), s + 1);
         return NULL;
     }
 
@@ -2784,14 +2804,11 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
         /* diag_listed_as: charnames alias definitions may not contain a
                            sequence of multiple spaces; marked by <-- HERE
                            in %s */
-        yyerror_pv(
-          Perl_form(aTHX_
+        *error_msg = Perl_form(aTHX_
             "charnames alias definitions may not contain a sequence of "
             "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
             (int)(s - backslash_ptr + 1), backslash_ptr,
-            (int)(e - s + 1), s + 1
-          ),
-          UTF ? SVf_UTF8 : 0);
+            (int)(e - s + 1), s + 1);
         return NULL;
 }
 
@@ -3770,7 +3787,7 @@ S_scan_const(pTHX_ char *start)
                    }
                }
                else /* Here is \N{NAME} but not \N{U+...}. */
-                     if ((res = get_and_check_backslash_N_name(s, e)))
+                     if ((res = get_and_check_backslash_N_name_wrapper(s, e)))
                 {
                     STRLEN len;
                     const char *str = SvPV_const(res, len);
@@ -4124,7 +4141,7 @@ S_scan_const(pTHX_ char *start)
            }
 
            sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
-                               type, typelen);
+                               type, typelen, NULL);
        }
         pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
     }
@@ -5099,6 +5116,14 @@ Perl_yylex(pTHX)
 
        return yylex();
     case LEX_FORMLINE:
+        if (PL_parser->sub_error_count != PL_error_count) {
+            /* There was an error parsing a formline, which tends to
+               mess up the parser.
+               Unlike interpolated sub-parsing, we can't treat any of
+               these as recoverable, so no need to check sub_no_recover.
+            */
+            yyquit();
+        }
        assert(PL_lex_formbrack);
        s = scan_formline(PL_bufptr);
        if (!PL_lex_formbrack)
@@ -6518,6 +6543,7 @@ Perl_yylex(pTHX)
                SAVEI32(PL_lex_formbrack);
                PL_parser->form_lex_state = PL_lex_state;
                PL_lex_formbrack = PL_lex_brackets + 1;
+                PL_parser->sub_error_count = PL_error_count;
                goto leftbracket;
            }
        }
@@ -7248,10 +7274,7 @@ Perl_yylex(pTHX)
            else {                      /* no override */
                tmp = -tmp;
                if (tmp == KEY_dump) {
-                   Perl_ck_warner_d(aTHX_ packWARN2(WARN_MISC,WARN_DEPRECATED),
-                                    "dump() better written as CORE::dump(). "
-                                     "dump() will no longer be available "
-                                     "in Perl 5.30");
+                   Perl_croak(aTHX_ "dump() must be written as CORE::dump() as of Perl 5.30");
                }
                gv = NULL;
                gvp = 0;
@@ -8741,9 +8764,9 @@ Perl_yylex(pTHX)
                /* Look for a prototype */
                if (*s == '(' && !is_sigsub) {
                    s = scan_str(s,FALSE,FALSE,FALSE,NULL);
-                   COPLINE_SET_FROM_MULTI_END;
                    if (!s)
                        Perl_croak(aTHX_ "Prototype not terminated");
+                   COPLINE_SET_FROM_MULTI_END;
                    (void)validate_proto(PL_subname, PL_lex_stuff,
                                         ckWARN(WARN_ILLEGALPROTO), 0);
                    have_proto = TRUE;
@@ -9176,11 +9199,15 @@ S_checkcomma(pTHX_ const char *s, const char *name, const char *what)
    Best used as sv=new_constant(..., sv, ...).
    If s, pv are NULL, calls subroutine with one argument,
    and <type> is used with error messages only.
-   <type> is assumed to be well formed UTF-8 */
+   <type> is assumed to be well formed UTF-8.
+
+   If error_msg is not NULL, *error_msg will be set to any error encountered.
+   Otherwise yyerror() will be used to output it */
 
 STATIC SV *
 S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
-              SV *sv, SV *pv, const char *type, STRLEN typelen)
+              SV *sv, SV *pv, const char *type, STRLEN typelen,
+               const char ** error_msg)
 {
     dSP;
     HV * table = GvHV(PL_hintgv);               /* ^H */
@@ -9195,13 +9222,6 @@ S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
     if (*key == 'c') { assert (strEQ(key, "charnames")); }
     assert(type || s);
 
-    /* charnames doesn't work well if there have been errors found */
-    if (PL_error_count > 0 && *key == 'c')
-    {
-       SvREFCNT_dec_NN(sv);
-       return &PL_sv_undef;
-    }
-
     sv_2mortal(sv);                    /* Parent created it permanently */
     if (!table
        || ! (PL_hints & HINT_LOCALIZE_HH)
@@ -9256,7 +9276,12 @@ S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
                                     (type ? type: s), why1, why2, why3);
             }
         }
-       yyerror_pv(msg, UTF ? SVf_UTF8 : 0);
+        if (error_msg) {
+            *error_msg = msg;
+        }
+        else {
+            yyerror_pv(msg, UTF ? SVf_UTF8 : 0);
+        }
        return SvREFCNT_inc_simple_NN(sv);
     }
   now_ok:
@@ -10331,7 +10356,7 @@ S_scan_heredoc(pTHX_ char *s)
        while (ss < se) {
            /* newline only? Copy and move on */
            if (*ss == '\n') {
-               sv_catpv(newstr,"\n");
+               sv_catpvs(newstr,"\n");
                ss++;
                linecount++;
 
@@ -10613,14 +10638,11 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re
     const char * opening_delims = "([{<";
     const char * closing_delims = ")]}>";
 
+    /* The only non-UTF character that isn't a stand alone grapheme is
+     * white-space, hence can't be a delimiter. */
     const char * non_grapheme_msg = "Use of unassigned code point or"
                                     " non-standalone grapheme for a delimiter"
-                                    " will be a fatal error starting in Perl"
-                                    " 5.30";
-    /* The only non-UTF character that isn't a stand alone grapheme is
-     * white-space, hence can't be a delimiter.  So can skip for non-UTF-8 */
-    bool check_grapheme = UTF && ckWARN_d(WARN_DEPRECATED);
-
+                                    " is not allowed";
     PERL_ARGS_ASSERT_SCAN_STR;
 
     /* skip space before the delimiter */
@@ -10639,26 +10661,12 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re
     }
     else {
        termcode = utf8_to_uvchr_buf((U8*)s, (U8*)PL_bufend, &termlen);
-        if (check_grapheme) {
-            if (   UNLIKELY(UNICODE_IS_SUPER(termcode))
-                || UNLIKELY(UNICODE_IS_NONCHAR(termcode)))
-            {
-                /* These are considered graphemes, and since the ending
-                 * delimiter will be the same, we don't have to check the other
-                 * end */
-                check_grapheme = FALSE;
-            }
-            else if (UNLIKELY(! _is_grapheme((U8 *) start,
-                                             (U8 *) s,
-                                             (U8 *) PL_bufend,
-                                             termcode)))
-            {
-                Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), "%s", non_grapheme_msg);
-
-                /* Don't have to check the other end, as have already warned at
-                 * this one */
-                check_grapheme = FALSE;
-            }
+        if (UTF && UNLIKELY(! _is_grapheme((U8 *) start,
+                                           (U8 *) s,
+                                           (U8 *) PL_bufend,
+                                                  termcode)))
+        {
+            yyerror(non_grapheme_msg);
         }
 
        Copy(s, termstr, termlen, U8);
@@ -10724,14 +10732,13 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re
                     if (   s + termlen <= PL_bufend
                         && memEQ(s + 1, (char*)termstr + 1, termlen - 1))
                     {
-                        if (   check_grapheme
+                        if (   UTF
                             && UNLIKELY(! _is_grapheme((U8 *) start,
-                                                              (U8 *) s,
-                                                              (U8 *) PL_bufend,
+                                                       (U8 *) s,
+                                                       (U8 *) PL_bufend,
                                                               termcode)))
                         {
-                            Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
-                                        "%s", non_grapheme_msg);
+                            yyerror(non_grapheme_msg);
                         }
                        break;
                     }
@@ -11264,9 +11271,10 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
            }
            if (just_zero && (PL_hints & HINT_NEW_INTEGER))
                sv = new_constant(start, s - start, "integer",
-                                 sv, NULL, NULL, 0);
+                                 sv, NULL, NULL, 0, NULL);
            else if (PL_hints & HINT_NEW_BINARY)
-               sv = new_constant(start, s - start, "binary", sv, NULL, NULL, 0);
+               sv = new_constant(start, s - start, "binary",
+                                  sv, NULL, NULL, 0, NULL);
        }
        break;
 
@@ -11471,7 +11479,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
            const char *const key = floatit ? "float" : "integer";
            const STRLEN keylen = floatit ? 5 : 7;
            sv = S_new_constant(aTHX_ PL_tokenbuf, d - PL_tokenbuf,
-                               key, keylen, sv, NULL, NULL, 0);
+                               key, keylen, sv, NULL, NULL, 0, NULL);
        }
        break;