toke.c: White-space only

[perl5.git] / toke.c
diff --git a/toke.c b/toke.c

index e0a6376..f018d36 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -527,11 +527,17 @@ S_no_op(pTHX_ const char *const what, char *s)
         if (is_first)
             Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
                     "\t(Missing semicolon on previous line?)\n");
-       else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
+        else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
+                                                           PL_bufend,
+                                                           UTF))
+        {
             const char *t;
-           for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
-                                                            t += UTF ? UTF8SKIP(t) : 1)
+           for (t = PL_oldoldbufptr;
+                 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
+                 t += UTF ? UTF8SKIP(t) : 1)
+            {
                 NOOP;
+            }
             if (t < PL_bufptr && isSPACE(*t))
                 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
                         "\t(Do you need to predeclare %" UTF8f "?)\n",
@@ -933,7 +939,8 @@ Perl_lex_grow_linestr(pTHX_ STRLEN len)
      /* Is the lex_shared linestr SV the same as the current linestr SV?
       * Only in this case does re_eval_start need adjusting, since it
       * points within lex_shared->ls_linestr's buffer */
-    current = (linestr == PL_parser->lex_shared->ls_linestr);
+    current = (   !PL_parser->lex_shared->ls_linestr
+               || linestr == PL_parser->lex_shared->ls_linestr);
  
      bufend_pos = PL_parser->bufend - buf;
      bufptr_pos = PL_parser->bufptr - buf;
@@ -1286,6 +1293,8 @@ Perl_lex_next_chunk(pTHX_ U32 flags)
      STRLEN linestart_pos, last_uni_pos, last_lop_pos;
      bool got_some_for_debugger = 0;
      bool got_some;
+    const U8* first_bad_char_loc;
+
      if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
         Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
      if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
@@ -1350,6 +1359,18 @@ Perl_lex_next_chunk(pTHX_ U32 flags)
      new_bufend_pos = SvCUR(linestr);
      PL_parser->bufend = buf + new_bufend_pos;
      PL_parser->bufptr = buf + bufptr_pos;
+
+    if (UTF && ! is_utf8_string_loc((U8 *) PL_parser->bufptr,
+                                    PL_parser->bufend - PL_parser->bufptr,
+                                    &first_bad_char_loc))
+    {
+        _force_out_malformed_utf8_message(first_bad_char_loc,
+                                          (U8 *) PL_parser->bufend,
+                                          0,
+                                          1 /* 1 means die */ );
+        NOT_REACHED; /* NOTREACHED */
+    }
+
      PL_parser->oldbufptr = buf + oldbufptr_pos;
      PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
      PL_parser->linestart = buf + linestart_pos;
@@ -1871,7 +1892,7 @@ S_check_uni(pTHX)
      while (isSPACE(*PL_last_uni))
         PL_last_uni++;
      s = PL_last_uni;
-    while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
+    while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
         s += UTF ? UTF8SKIP(s) : 1;
      if ((t = strchr(s, '(')) && t < PL_bufptr)
         return;
@@ -2041,7 +2062,7 @@ S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
  
      start = skipspace(start);
      s = start;
-    if (isIDFIRST_lazy_if(s,UTF)
+    if (   isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
          || (allow_pack && *s == ':' && s[1] == ':') )
      {
         s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
@@ -2434,7 +2455,7 @@ S_sublex_push(pTHX)
      if (is_heredoc)
         CopLINE_set(PL_curcop, (line_t)PL_multi_start);
      PL_copline = NOLINE;
-    
+
      Newxz(shared, 1, LEXSHARED);
      shared->ls_prev = PL_parser->lex_shared;
      PL_parser->lex_shared = shared;
@@ -2835,6 +2856,11 @@ S_scan_const(pTHX_ char *start)
                                             when the source isn't utf8, as for
                                             example when it is entirely composed
                                             of hex constants */
+    STRLEN utf8_variant_count = 0;      /* When not in UTF-8, this counts the
+                                           number of characters found so far
+                                           that will expand (into 2 bytes)
+                                           should we have to convert to
+                                           UTF-8) */
      SV *res;                           /* result from charnames */
      STRLEN offset_to_max;   /* The offset in the output to where the range
                                 high-end character is temporarily placed */
@@ -2849,7 +2875,7 @@ S_scan_const(pTHX_ char *start)
       * the needed size, SvGROW() is called.  Its size parameter each time is
       * based on the best guess estimate at the time, namely the length used so
       * far, plus the length the current construct will occupy, plus room for
-     * the trailing NUL, plus one byte for every input byte still unscanned */ 
+     * the trailing NUL, plus one byte for every input byte still unscanned */
  
      UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
                         before set */
@@ -3181,8 +3207,7 @@ S_scan_const(pTHX_ char *start)
             if (!esc)
                 in_charclass = TRUE;
         }
-
-       else if (*s == ']' && PL_lex_inpat &&  in_charclass) {
+       else if (*s == ']' && PL_lex_inpat && in_charclass) {
             char *s1 = s-1;
             int esc = 0;
             while (s1 >= start && *s1-- == '\\')
@@ -3226,8 +3251,12 @@ S_scan_const(pTHX_ char *start)
            (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
            */
         else if (*s == '@' && s[1]) {
-           if (UTF ? isIDFIRST_utf8((U8*)s+1) : isWORDCHAR_A(s[1]))
+           if (UTF
+               ? isIDFIRST_utf8_safe(s+1, send)
+               : isWORDCHAR_A(s[1]))
+            {
                 break;
+            }
             if (strchr(":'{$", s[1]))
                 break;
             if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
@@ -3371,38 +3400,57 @@ S_scan_const(pTHX_ char *start)
                 }
                 else {
                     if (!has_utf8 && uv > 255) {
-                       /* Might need to recode whatever we have accumulated so
-                        * far if it contains any chars variant in utf8 or
-                        * utf-ebcdic. */
-                         
-                       SvCUR_set(sv, d - SvPVX_const(sv));
-                       SvPOK_on(sv);
-                       *d = '\0';
-                       /* See Note on sizing above.  */
-                       sv_utf8_upgrade_flags_grow(
-                                       sv,
-                                       SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
-                                                  /* Above-latin1 in string
-                                                   * implies no encoding */
-                                                  |SV_UTF8_NO_ENCODING,
-                                       UVCHR_SKIP(uv) + (STRLEN)(send - s) + 1);
-                       d = SvPVX(sv) + SvCUR(sv);
-                       has_utf8 = TRUE;
+
+                        /* Here, 'uv' won't fit unless we convert to UTF-8.
+                         * If we've only seen invariants so far, all we have to
+                         * do is turn on the flag */
+                        if (utf8_variant_count == 0) {
+                            SvUTF8_on(sv);
+                        }
+                        else {
+                            SvCUR_set(sv, d - SvPVX_const(sv));
+                            SvPOK_on(sv);
+                            *d = '\0';
+
+                            sv_utf8_upgrade_flags_grow(
+                                           sv,
+                                           SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
+                                                      /* Above-latin1 in string
+                                                       * implies no encoding */
+                                                      |SV_UTF8_NO_ENCODING,
+
+                                           /* Since we're having to grow here,
+                                            * make sure we have enough room for
+                                            * this escape and a NUL, so the
+                                            * code immediately below won't have
+                                            * to actually grow again */
+                                          UVCHR_SKIP(uv)
+                                        + (STRLEN)(send - s) + 1);
+                            d = SvPVX(sv) + SvCUR(sv);
+                        }
+
+                        has_utf8 = TRUE;
                      }
  
-                    if (has_utf8) {
+                    if (! has_utf8) {
+                       *d++ = (char)uv;
+                        utf8_variant_count++;
+                    }
+                   else {
                         /* Usually, there will already be enough room in 'sv'
                          * since such escapes are likely longer than any UTF-8
                          * sequence they can end up as.  This isn't the case on
                          * EBCDIC where \x{40000000} contains 12 bytes, and the
                          * UTF-8 for it contains 14.  And, we have to allow for
                          * a trailing NUL.  It probably can't happen on ASCII
-                        * platforms, but be safe */
-                        const STRLEN needed = d - SvPVX(sv) + UVCHR_SKIP(uv)
+                        * platforms, but be safe.  See Note on sizing above. */
+                        const STRLEN needed = d - SvPVX(sv)
+                                            + UVCHR_SKIP(uv)
+                                            + (send - s)
                                              + 1;
                          if (UNLIKELY(needed > SvLEN(sv))) {
                              SvCUR_set(sv, d - SvPVX_const(sv));
-                            d = sv_grow(sv, needed) + SvCUR(sv);
+                            d = SvCUR(sv) + SvGROW(sv, needed);
                          }
  
                         d = (char*)uvchr_to_utf8((U8*)d, uv);
@@ -3413,9 +3461,6 @@ S_scan_const(pTHX_ char *start)
                                 (PL_lex_repl ? OPpTRANS_FROM_UTF
                                              : OPpTRANS_TO_UTF);
                         }
-                    }
-                   else {
-                       *d++ = (char)uv;
                     }
                 }
  #ifdef EBCDIC
@@ -3463,7 +3508,7 @@ S_scan_const(pTHX_ char *start)
                   * braces */
                 s++;
                 if (*s != '{') {
-                   yyerror("Missing braces on \\N{}"); 
+                   yyerror("Missing braces on \\N{}");
                     continue;
                 }
                 s++;
@@ -3529,15 +3574,25 @@ S_scan_const(pTHX_ char *start)
                         if (! has_utf8 && (   uv > 0xFF
                                             || PL_lex_inwhat != OP_TRANS))
                          {
+                           /* See Note on sizing above.  */
+                            const STRLEN extra = OFFUNISKIP(uv) + (send - e) + 1;
+
                             SvCUR_set(sv, d - SvPVX_const(sv));
                             SvPOK_on(sv);
                             *d = '\0';
-                           /* See Note on sizing above.  */
-                           sv_utf8_upgrade_flags_grow(
-                                    sv,
-                                    SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                   OFFUNISKIP(uv) + (STRLEN)(send - e) + 1);
-                           d = SvPVX(sv) + SvCUR(sv);
+
+                            if (utf8_variant_count == 0) {
+                                SvUTF8_on(sv);
+                                d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
+                            }
+                            else {
+                                sv_utf8_upgrade_flags_grow(
+                                               sv,
+                                               SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+                                               extra);
+                                d = SvPVX(sv) + SvCUR(sv);
+                            }
+
                             has_utf8 = TRUE;
                         }
  
@@ -3696,21 +3751,31 @@ S_scan_const(pTHX_ char *start)
                           /* Upgrade destination to be utf8 if this new
                            * component is */
                         if (! has_utf8 && SvUTF8(res)) {
+                           /* See Note on sizing above.  */
+                            const STRLEN extra = len + (send - s) + 1;
+
                             SvCUR_set(sv, d - SvPVX_const(sv));
                             SvPOK_on(sv);
                             *d = '\0';
-                           /* See Note on sizing above.  */
-                           sv_utf8_upgrade_flags_grow(sv,
+
+                            if (utf8_variant_count == 0) {
+                                SvUTF8_on(sv);
+                                d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
+                            }
+                            else {
+                                sv_utf8_upgrade_flags_grow(sv,
                                                 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                               len + (STRLEN)(send - s) + 1);
-                           d = SvPVX(sv) + SvCUR(sv);
+                                               extra);
+                                d = SvPVX(sv) + SvCUR(sv);
+                            }
                             has_utf8 = TRUE;
                         } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
  
                             /* See Note on sizing above.  (NOTE: SvCUR() is not
                              * set correctly here). */
+                            const STRLEN extra = len + (send - e) + 1;
                             const STRLEN off = d - SvPVX_const(sv);
-                           d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
+                           d = off + SvGROW(sv, off + extra);
                         }
                         Copy(str, d, len, char);
                         d += len;
@@ -3774,42 +3839,61 @@ S_scan_const(pTHX_ char *start)
           * to/from UTF-8.
           *
           * If the input has the same representation in UTF-8 as not, it will be
-         * a single byte, and we don't care about UTF8ness; or if neither
-         * source nor output is UTF-8, just copy the byte */
-        if (NATIVE_BYTE_IS_INVARIANT((U8)(*s)) || (! this_utf8 && ! has_utf8))
-        {
+         * a single byte, and we don't care about UTF8ness; just copy the byte */
+        if (NATIVE_BYTE_IS_INVARIANT((U8)(*s))) {
             *d++ = *s++;
          }
-        else {
-           STRLEN len  = 1;
+        else if (! this_utf8 && ! has_utf8) {
+            /* If neither source nor output is UTF-8, is also a single byte,
+             * just copy it; but this byte counts should we later have to
+             * convert to UTF-8 */
+           *d++ = *s++;
+            utf8_variant_count++;
+        }
+        else if (this_utf8 && has_utf8) {   /* Both UTF-8, can just copy */
+           const STRLEN len = UTF8SKIP(s);
  
-           /* One might think that it is wasted effort in the case of the
-            * source being utf8 (this_utf8 == TRUE) to take the next character
-            * in the source, convert it to an unsigned value, and then convert
-            * it back again.  But the source has not been validated here.  The
-            * routine that does the conversion checks for errors like
-            * malformed utf8 */
+            /* We expect the source to have already been checked for
+             * malformedness */
+            assert(isUTF8_CHAR((U8 *) s, (U8 *) send));
  
+            Copy(s, d, len, U8);
+            d += len;
+            s += len;
+        }
+        else { /* UTF8ness matters and doesn't match, need to convert */
+           STRLEN len = 1;
             const UV nextuv   = (this_utf8)
                                  ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
                                  : (UV) ((U8) *s);
-           const STRLEN need = UVCHR_SKIP(nextuv);
+           STRLEN need = UVCHR_SKIP(nextuv);
+
             if (!has_utf8) {
                 SvCUR_set(sv, d - SvPVX_const(sv));
                 SvPOK_on(sv);
                 *d = '\0';
-               /* See Note on sizing above.  */
-               sv_utf8_upgrade_flags_grow(sv,
-                                       SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                       need + (STRLEN)(send - s) + 1);
-               d = SvPVX(sv) + SvCUR(sv);
+
+                /* See Note on sizing above. */
+                need += (STRLEN)(send - s) + 1;
+
+                if (utf8_variant_count == 0) {
+                    SvUTF8_on(sv);
+                    d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need);
+                }
+                else {
+                    sv_utf8_upgrade_flags_grow(sv,
+                                               SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+                                               need);
+                    d = SvPVX(sv) + SvCUR(sv);
+                }
                 has_utf8 = TRUE;
             } else if (need > len) {
                 /* encoded value larger than old, may need extra space (NOTE:
                  * SvCUR() is not set correctly here).   See Note on sizing
                  * above.  */
+                const STRLEN extra = need + (send - s) + 1;
                 const STRLEN off = d - SvPVX_const(sv);
-               d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
+               d = off + SvGROW(sv, off + extra);
             }
             s += len;
  
@@ -3963,7 +4047,7 @@ S_intuit_more(pTHX_ char *s)
             case '&':
             case '$':
                 weight -= seen[un_char] * 10;
-               if (isWORDCHAR_lazy_if(s+1,UTF)) {
+               if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
                     int len;
                      char *tmp = PL_bufend;
                      PL_bufend = (char*)send;
@@ -4194,7 +4278,7 @@ Perl_filter_add(pTHX_ filter_t funcp, SV *datasv)
                 STRLEN const last_lop_pos =
                     PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
                 av_push(PL_rsfp_filters, linestr);
-               PL_parser->linestr = 
+               PL_parser->linestr =
                     newSVpvn(SvPVX(linestr), ++s-SvPVX(linestr));
                 buf = SvPVX(PL_parser->linestr);
                 PL_parser->bufend = buf + SvCUR(PL_parser->linestr);
@@ -4438,12 +4522,18 @@ static void
  S_check_scalar_slice(pTHX_ char *s)
  {
      s++;
-    while (*s == ' ' || *s == '\t') s++;
-    if (*s == 'q' && s[1] == 'w'
-     && !isWORDCHAR_lazy_if(s+2,UTF))
+    while (SPACE_OR_TAB(*s)) s++;
+    if (*s == 'q' && s[1] == 'w' && !isWORDCHAR_lazy_if_safe(s+2,
+                                                             PL_bufend,
+                                                             UTF))
+    {
         return;
-    while (*s && (isWORDCHAR_lazy_if(s,UTF) || strchr(" \t$#+-'\"", *s)))
-       s += UTF ? UTF8SKIP(s) : 1;
+    }
+    while (    isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)
+           || (*s && strchr(" \t$#+-'\"", *s)))
+    {
+        s += UTF ? UTF8SKIP(s) : 1;
+    }
      if (*s == '}' || *s == ']')
         pl_yylval.ival = OPpSLICEWARNING;
  }
@@ -4850,7 +4940,7 @@ Perl_yylex(pTHX)
                  break;
              }
              s = skipspace(s);
-            if (isIDFIRST_lazy_if(s, UTF)) {
+            if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
                  char *dest = PL_tokenbuf + 1;
                  /* read var name, including sigil, into PL_tokenbuf */
                  PL_tokenbuf[0] = sigil;
@@ -4894,7 +4984,7 @@ Perl_yylex(pTHX)
                                                    1 /* 1 means die */ );
                  NOT_REACHED; /* NOTREACHED */
              }
-            if (isIDFIRST_utf8((U8*)s)) {
+            if (isIDFIRST_utf8_safe(s, PL_bufend)) {
                  goto keylookup;
              }
          }
@@ -5427,7 +5517,7 @@ Perl_yylex(pTHX)
                     PL_expect = XPOSTDEREF;
                     TOKEN(ARROW);
                 }
-               if (isIDFIRST_lazy_if(s,UTF)) {
+               if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
                     s = force_word(s,METHOD,FALSE,TRUE);
                     TOKEN(ARROW);
                 }
@@ -5612,7 +5702,7 @@ Perl_yylex(pTHX)
          grabattrs:
             s = skipspace(s);
             attrs = NULL;
-           while (isIDFIRST_lazy_if(s,UTF)) {
+            while (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
                 I32 tmp;
                 SV *sv;
                 d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &len);
@@ -5818,7 +5908,7 @@ Perl_yylex(pTHX)
                 while (d < PL_bufend && SPACE_OR_TAB(*d))
                     d++;
             }
-           if (d < PL_bufend && isIDFIRST_lazy_if(d,UTF)) {
+            if (d < PL_bufend && isIDFIRST_lazy_if_safe(d, PL_bufend, UTF)) {
                 d = scan_word(d, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1,
                               FALSE, &len);
                 while (d < PL_bufend && SPACE_OR_TAB(*d))
@@ -5938,13 +6028,19 @@ Perl_yylex(pTHX)
                     }
                     else
                         /* skip plain q word */
-                       while (t < PL_bufend && isWORDCHAR_lazy_if(t,UTF))
+                       while (   t < PL_bufend
+                               && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
+                        {
                             t += UTF ? UTF8SKIP(t) : 1;
+                        }
                 }
-               else if (isWORDCHAR_lazy_if(t,UTF)) {
+               else if (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) {
                     t += UTF ? UTF8SKIP(t) : 1;
-                   while (t < PL_bufend && isWORDCHAR_lazy_if(t,UTF))
+                   while (   t < PL_bufend
+                           && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
+                    {
                         t += UTF ? UTF8SKIP(t) : 1;
+                    }
                 }
                 while (t < PL_bufend && isSPACE(*t))
                     t++;
@@ -6039,8 +6135,9 @@ Perl_yylex(pTHX)
         }
         s--;
         if (PL_expect == XOPERATOR) {
-           if (PL_bufptr == PL_linestart && ckWARN(WARN_SEMICOLON)
-               && isIDFIRST_lazy_if(s,UTF))
+           if (   PL_bufptr == PL_linestart
+                && ckWARN(WARN_SEMICOLON)
+               && isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))
             {
                 CopLINE_dec(PL_curcop);
                 Perl_warner(aTHX_ packWARN(WARN_SEMICOLON), "%s", PL_warn_nosemi);
@@ -6322,7 +6419,10 @@ Perl_yylex(pTHX)
             POSTDEREF('$');
         }
  
-       if (s[1] == '#' && (isIDFIRST_lazy_if(s+2,UTF) || strchr("{$:+-@", s[2]))) {
+       if (   s[1] == '#'
+            && (   isIDFIRST_lazy_if_safe(s+2, PL_bufend, UTF)
+                || strchr("{$:+-@", s[2])))
+        {
             PL_tokenbuf[0] = '@';
             s = scan_ident(s + 1, PL_tokenbuf + 1,
                            sizeof PL_tokenbuf - 1, FALSE);
@@ -6371,8 +6471,12 @@ Perl_yylex(pTHX)
                     if (ckWARN(WARN_SYNTAX)) {
                         char *t = s+1;
  
-                       while (isSPACE(*t) || isWORDCHAR_lazy_if(t,UTF) || *t == '$')
+                        while (   isSPACE(*t)
+                               || isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)
+                               || *t == '$')
+                        {
                             t += UTF ? UTF8SKIP(t) : 1;
+                        }
                         if (*t++ == ',') {
                             PL_bufptr = skipspace(PL_bufptr); /* XXX can realloc */
                             while (t < PL_bufend && *t != ']')
@@ -6393,17 +6497,21 @@ Perl_yylex(pTHX)
                             do {
                                 t++;
                             } while (isSPACE(*t));
-                           if (isIDFIRST_lazy_if(t,UTF)) {
+                           if (isIDFIRST_lazy_if_safe(t, PL_bufend, UTF)) {
                                 STRLEN len;
                                 t = scan_word(t, tmpbuf, sizeof tmpbuf, TRUE,
                                               &len);
                                 while (isSPACE(*t))
                                     t++;
-                               if (*t == ';'
-                                       && get_cvn_flags(tmpbuf, len, UTF ? SVf_UTF8 : 0))
+                                if (  *t == ';'
+                                    && get_cvn_flags(tmpbuf, len, UTF
+                                                                  ? SVf_UTF8
+                                                                  : 0))
+                                {
                                     Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
                                         "You need to quote \"%" UTF8f "\"",
                                          UTF8fARG(UTF, len, tmpbuf));
+                                }
                             }
                         }
                 }
@@ -6416,9 +6524,12 @@ Perl_yylex(pTHX)
                     PL_expect = XOPERATOR;
                 else if (strchr("$@\"'`q", *s))
                     PL_expect = XTERM;          /* e.g. print $fh "foo" */
-               else if (strchr("&*<%", *s) && isIDFIRST_lazy_if(s+1,UTF))
+               else if (   strchr("&*<%", *s)
+                         && isIDFIRST_lazy_if_safe(s+1, PL_bufend, UTF))
+                {
                     PL_expect = XTERM;          /* e.g. print $fh &sub */
-               else if (isIDFIRST_lazy_if(s,UTF)) {
+                }
+               else if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
                     char tmpbuf[sizeof PL_tokenbuf];
                     int t2;
                     scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
@@ -6517,10 +6628,10 @@ Perl_yylex(pTHX)
          }
         else {
             /* Disable warning on "study /blah/" */
-           if (PL_oldoldbufptr == PL_last_uni
-            && (*PL_last_uni != 's' || s - PL_last_uni < 5
-                || memNE(PL_last_uni, "study", 5)
-                || isWORDCHAR_lazy_if(PL_last_uni+5,UTF)
+           if (    PL_oldoldbufptr == PL_last_uni
+                && (   *PL_last_uni != 's' || s - PL_last_uni < 5
+                    || memNE(PL_last_uni, "study", 5)
+                    || isWORDCHAR_lazy_if_safe(PL_last_uni+5, PL_bufend, UTF)
              ))
                 check_uni();
             s = scan_pat(s,OP_MATCH);
@@ -7047,8 +7158,8 @@ Perl_yylex(pTHX)
                     s = skipspace(s);
  
                     /* Two barewords in a row may indicate method call. */
-
-                   if ((isIDFIRST_lazy_if(s,UTF) || *s == '$')
+                   if (   (   isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
+                            || *s == '$')
                          && (tmp = intuit_method(s, lex ? NULL : sv, cv)))
                      {
                         goto method;
@@ -7133,9 +7244,11 @@ Perl_yylex(pTHX)
  
                 /* If followed by a bareword, see if it looks like indir obj. */
  
-               if (tmp == 1 && !orig_keyword
-                       && (isIDFIRST_lazy_if(s,UTF) || *s == '$')
-                       && (tmp = intuit_method(s, lex ? NULL : sv, cv))) {
+               if (   tmp == 1
+                    && !orig_keyword
+                    && (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF) || *s == '$')
+                    && (tmp = intuit_method(s, lex ? NULL : sv, cv)))
+                {
                   method:
                     if (lex && !off) {
                         assert(cSVOPx(pl_yylval.opval)->op_sv == sv);
@@ -7629,7 +7742,9 @@ Perl_yylex(pTHX)
                 return REPORT(0);
             pl_yylval.ival = CopLINE(PL_curcop);
             s = skipspace(s);
-           if (PL_expect == XSTATE && isIDFIRST_lazy_if(s,UTF)) {
+            if (   PL_expect == XSTATE
+                && isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))
+            {
                 char *p = s;
  
                 if ((PL_bufend - p) >= 3
@@ -7642,7 +7757,7 @@ Perl_yylex(pTHX)
                     p += 3;
                 p = skipspace(p);
                  /* skip optional package name, as in "for my abc $x (..)" */
-               if (isIDFIRST_lazy_if(p,UTF)) {
+               if (isIDFIRST_lazy_if_safe(p, PL_bufend, UTF)) {
                     p = scan_word(p, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len);
                     p = skipspace(p);
                 }
@@ -7885,7 +8000,7 @@ Perl_yylex(pTHX)
             }
             PL_in_my = (U16)tmp;
             s = skipspace(s);
-           if (isIDFIRST_lazy_if(s,UTF)) {
+            if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
                 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len);
                 if (len == 3 && strEQs(PL_tokenbuf, "sub"))
                     goto really_sub;
@@ -7935,10 +8050,10 @@ Perl_yylex(pTHX)
  
         case KEY_open:
             s = skipspace(s);
-           if (isIDFIRST_lazy_if(s,UTF)) {
-          const char *t;
-          d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE,
-              &len);
+            if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
+                const char *t;
+                d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE,
+                              &len);
                 for (t=d; isSPACE(*t);)
                     t++;
                 if ( *t && strchr("|&*+-=!?:.", *t) && ckWARN_d(WARN_PRECEDENCE)
@@ -8095,9 +8210,13 @@ Perl_yylex(pTHX)
             {
                 *PL_tokenbuf = '\0';
                 s = force_word(s,BAREWORD,TRUE,TRUE);
-               if (isIDFIRST_lazy_if(PL_tokenbuf,UTF))
+                if (isIDFIRST_lazy_if_safe(PL_tokenbuf,
+                                           PL_tokenbuf + sizeof(PL_tokenbuf),
+                                           UTF))
+                {
                     gv_stashpvn(PL_tokenbuf, strlen(PL_tokenbuf),
                                  GV_ADD | (UTF ? SVf_UTF8 : 0));
+                }
                 else if (*s == '<')
                     yyerror("<> at require-statement should be quotes");
             }
@@ -8105,7 +8224,7 @@ Perl_yylex(pTHX)
                 orig_keyword = 0;
                 pl_yylval.ival = 1;
             }
-           else 
+           else
                 pl_yylval.ival = 0;
             PL_expect = PL_nexttoke ? XOPERATOR : XTERM;
             PL_bufptr = s;
@@ -8299,7 +8418,7 @@ Perl_yylex(pTHX)
                 s = skipspace(s);
                  d = SvPVX(PL_linestr)+off;
  
-               if (isIDFIRST_lazy_if(s,UTF)
+                if (   isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
                      || *s == '\''
                      || (*s == ':' && s[1] == ':'))
                 {
@@ -8745,10 +8864,10 @@ S_checkcomma(pTHX_ const char *s, const char *name, const char *what)
         s++;
      while (s < PL_bufend && isSPACE(*s))
         s++;
-    if (isIDFIRST_lazy_if(s,UTF)) {
+    if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
         const char * const w = s;
          s += UTF ? UTF8SKIP(s) : 1;
-       while (isWORDCHAR_lazy_if(s,UTF))
+       while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
             s += UTF ? UTF8SKIP(s) : 1;
         while (s < PL_bufend && isSPACE(*s))
             s++;
@@ -8921,21 +9040,23 @@ S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
  
  PERL_STATIC_INLINE void
  S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package,
-                    bool is_utf8, bool check_dollar) {
+                    bool is_utf8, bool check_dollar)
+{
      PERL_ARGS_ASSERT_PARSE_IDENT;
  
-    for (;;) {
+    while (*s < PL_bufend) {
          if (*d >= e)
              Perl_croak(aTHX_ "%s", ident_too_long);
-        if (is_utf8 && isIDFIRST_utf8((U8*)*s)) {
+        if (is_utf8 && isIDFIRST_utf8_safe(*s, PL_bufend)) {
               /* The UTF-8 case must come first, otherwise things
               * like c\N{COMBINING TILDE} would start failing, as the
               * isWORDCHAR_A case below would gobble the 'c' up.
               */
  
              char *t = *s + UTF8SKIP(*s);
-            while (isIDCONT_utf8((U8*)t))
+            while (isIDCONT_utf8_safe((const U8*) t, (const U8*) PL_bufend)) {
                  t += UTF8SKIP(t);
+            }
              if (*d + (t - *s) > e)
                  Perl_croak(aTHX_ "%s", ident_too_long);
              Copy(*s, *d, t - *s, char);
@@ -8947,7 +9068,10 @@ S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package,
                  *(*d)++ = *(*s)++;
              } while (isWORDCHAR_A(**s) && *d < e);
          }
-        else if (allow_package && **s == '\'' && isIDFIRST_lazy_if(*s+1,is_utf8)) {
+        else if (   allow_package
+                 && **s == '\''
+                 && isIDFIRST_lazy_if_safe((*s)+1, PL_bufend, is_utf8))
+        {
              *(*d)++ = ':';
              *(*d)++ = ':';
              (*s)++;
@@ -8998,10 +9122,10 @@ S_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STRLEN
   *      Because all ASCII characters have the same representation whether
   *      encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and
   *      '{' without knowing if is UTF-8 or not. */
-#define VALID_LEN_ONE_IDENT(s, is_utf8)                                       \
-    (isGRAPH_A(*(s)) || ((is_utf8)                                            \
-                         ? isIDFIRST_utf8((U8*) (s))                          \
-                         : (isGRAPH_L1(*s)                                    \
+#define VALID_LEN_ONE_IDENT(s, e, is_utf8)                                  \
+    (isGRAPH_A(*(s)) || ((is_utf8)                                          \
+                         ? isIDFIRST_utf8_safe(s, e)                        \
+                         : (isGRAPH_L1(*s)                                  \
                              && LIKELY((U8) *(s) != LATIN1_TO_NATIVE(0xAD)))))
  
  STATIC char *
@@ -9042,7 +9166,7 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
      /* Here, it is not a run-of-the-mill identifier name */
  
      if (*s == '$' && s[1]
-        && (isIDFIRST_lazy_if(s+1,is_utf8)
+        && (   isIDFIRST_lazy_if_safe(s+1, PL_bufend, is_utf8)
              || isDIGIT_A((U8)s[1])
              || s[1] == '$'
              || s[1] == '{'
@@ -9065,7 +9189,7 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
      if ((s <= PL_bufend - (is_utf8)
                            ? UTF8SKIP(s)
                            : 1)
-        && VALID_LEN_ONE_IDENT(s, is_utf8))
+        && VALID_LEN_ONE_IDENT(s, PL_bufend, is_utf8))
      {
          if (is_utf8) {
              const STRLEN skip = UTF8SKIP(s);
@@ -9093,7 +9217,7 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
          bool skip;
          char *s2;
          /* If we were processing {...} notation then...  */
-       if (isIDFIRST_lazy_if(d,is_utf8)) {
+        if (isIDFIRST_lazy_if_safe(d, e, is_utf8)) {
              /* if it starts as a valid identifier, assume that it is one.
                 (the later check for } being at the expected point will trap
                 cases where this doesn't pan out.)  */
@@ -9146,7 +9270,7 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
              s2 = peekspace(s);
          else
              s2 = s;
-           
+
          /* Expect to find a closing } after consuming any trailing whitespace.
           */
          if (*s2 == '}') {
@@ -9209,7 +9333,7 @@ S_pmflag(pTHX_ const char* const valid_flags, U32 * pmfl, char** s, char* charse
      STRLEN charlen = UTF ? UTF8SKIP(*s) : 1;
  
      if ( charlen != 1 || ! strchr(valid_flags, c) ) {
-        if (isWORDCHAR_lazy_if(*s, UTF)) {
+        if (isWORDCHAR_lazy_if_safe( *s, PL_bufend, UTF)) {
              yyerror_pv(Perl_form(aTHX_ "Unknown regexp modifier \"/%.*s\"", (int)charlen, *s),
                         UTF ? SVf_UTF8 : 0);
              (*s) += charlen;
@@ -9353,7 +9477,7 @@ S_scan_pat(pTHX_ char *start, I32 type)
      /* issue a warning if /c is specified,but /g is not */
      if ((pm->op_pmflags & PMf_CONTINUE) && !(pm->op_pmflags & PMf_GLOBAL))
      {
-        Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), 
+        Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
                        "Use of /c modifier is meaningless without /g" );
      }
  
@@ -9594,10 +9718,12 @@ S_scan_heredoc(pTHX_ char *s)
             s++, term = '\'';
         else
             term = '"';
-       if (!isWORDCHAR_lazy_if(s,UTF))
+       if (! isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
             deprecate("bare << to mean <<\"\"");
         peek = s;
-       while (isWORDCHAR_lazy_if(peek,UTF)) {
+        while (
+               isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF))
+        {
             peek += UTF ? UTF8SKIP(peek) : 1;
         }
         len = (peek - s >= e - d) ? (e - d) : (peek - s);
@@ -9712,7 +9838,7 @@ S_scan_heredoc(pTHX_ char *s)
  
                     /* Only valid if it's preceded by whitespace only */
                     while (backup != myolds && --backup >= myolds) {
-                       if (*backup != ' ' && *backup != '\t') {
+                       if (! SPACE_OR_TAB(*backup)) {
                             break;
                         }
  
@@ -9848,14 +9974,14 @@ S_scan_heredoc(pTHX_ char *s)
  
                 /* Only valid if it's preceded by whitespace only */
                 while (backup != s && --backup >= s) {
-                   if (*backup != ' ' && *backup != '\t') {
+                   if (! SPACE_OR_TAB(*backup)) {
                         break;
                     }
                     indent_len++;
                 }
  
                 /* All whitespace or none! */
-               if (backup == found || *backup == ' ' || *backup == '\t') {
+               if (backup == found || SPACE_OR_TAB(*backup)) {
                     Newxz(indent, indent_len + 1, char);
                     memcpy(indent, backup, indent_len);
                     SvREFCNT_dec(PL_linestr);
@@ -10014,8 +10140,9 @@ S_scan_inputsymbol(pTHX_ char *start)
      if (*d == '$' && d[1]) d++;
  
      /* allow <Pkg'VALUE> or <Pkg::VALUE> */
-    while (*d && (isWORDCHAR_lazy_if(d,UTF) || *d == '\'' || *d == ':'))
+    while (isWORDCHAR_lazy_if_safe(d, e, UTF) || *d == '\'' || *d == ':') {
         d += UTF ? UTF8SKIP(d) : 1;
+    }
  
      /* If we've tried to read what we allow filehandles to look like, and
         there's still text left, then it must be a glob() and not a getline.
@@ -10183,6 +10310,14 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re
      const char * opening_delims = "([{<";
      const char * closing_delims = ")]}>";
  
+    const char * non_grapheme_msg = "Use of unassigned code point or"
+                                    " non-standalone grapheme for a delimiter"
+                                    " will be a fatal error starting in Perl"
+                                    " v5.30";
+    /* The only non-UTF character that isn't a stand alone grapheme is
+     * white-space, hence can't be a delimiter.  So can skip for non-UTF-8 */
+    bool check_grapheme = UTF && ckWARN_d(WARN_DEPRECATED);
+
      PERL_ARGS_ASSERT_SCAN_STR;
  
      /* skip space before the delimiter */
@@ -10201,6 +10336,28 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re
      }
      else {
         termcode = utf8_to_uvchr_buf((U8*)s, (U8*)PL_bufend, &termlen);
+        if (check_grapheme) {
+            if (   UNLIKELY(UNICODE_IS_SUPER(termcode))
+                || UNLIKELY(UNICODE_IS_NONCHAR(termcode)))
+            {
+                /* These are considered graphemes, and since the ending
+                 * delimiter will be the same, we don't have to check the other
+                 * end */
+                check_grapheme = FALSE;
+            }
+            else if (UNLIKELY(! _is_grapheme((U8 *) start,
+                                             (U8 *) s,
+                                             (U8 *) PL_bufend,
+                                             termcode)))
+            {
+                Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), "%s", non_grapheme_msg);
+
+                /* Don't have to check the other end, as have already warned at
+                 * this one */
+                check_grapheme = FALSE;
+            }
+        }
+
         Copy(s, termstr, termlen, U8);
      }
  
@@ -10259,6 +10416,15 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re
                     if (termlen == 1)
                         break;
                     if (s+termlen <= PL_bufend && memEQ(s, (char*)termstr, termlen))
+                        if (   check_grapheme
+                            && UNLIKELY(! _is_grapheme((U8 *) start,
+                                                              (U8 *) s,
+                                                              (U8 *) PL_bufend,
+                                                              termcode)))
+                        {
+                            Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
+                                        "%s", non_grapheme_msg);
+                        }
                         break;
                 }
                 else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF)
@@ -10335,7 +10501,7 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re
             CopLINE_set(PL_curcop, (line_t)PL_multi_start);
             return NULL;
         }
-       s = PL_bufptr;
+       s = start = PL_bufptr;
      }
  
      /* at this point, we have successfully read the delimited string */
@@ -11075,8 +11241,7 @@ S_scan_formline(pTHX_ char *s)
         PL_expect = XSTATE;
         if (needargs) {
             const char *s2 = s;
-           while (*s2 == '\r' || *s2 == ' ' || *s2 == '\t' || *s2 == '\f'
-               || *s2 == '\v')
+           while (isSPACE(*s2) && *s2 != '\n')
                 s2++;
             if (*s2 == '{') {
                 PL_expect = XTERMBLOCK;
@@ -11887,7 +12052,7 @@ Perl_parse_label(pTHX_ U32 flags)
         STRLEN wlen, bufptr_pos;
         lex_read_space(0);
         t = s = PL_bufptr;
-        if (!isIDFIRST_lazy_if(s, UTF))
+        if (!isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))
             goto no_label;
         t = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &wlen);
         if (word_takes_any_delimiter(s, wlen))