Add rt69056.t to MANIFEST

[perl5.git] / toke.c
diff --git a/toke.c b/toke.c

index 750e4d4..de163fb 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -124,16 +124,14 @@ static I32 utf16rev_textfilter(pTHX_ int idx, SV *sv, int maxlen);
  #   define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8))
  #endif
  
+/* The maximum number of characters preceding the unrecognized one to display */
+#define UNRECOGNIZED_PRECEDE_COUNT 10
+
  /* In variables named $^X, these are the legal values for X.
   * 1999-02-27 mjd-perl-patch@plover.com */
  #define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
  
-/* On MacOS, respect nonbreaking spaces */
-#ifdef MACOS_TRADITIONAL
-#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\312'||(c)=='\t')
-#else
  #define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
-#endif
  
  /* LEX_* are values for PL_lex_state, the state of the lexer.
   * They are arranged oddly so that the guard on the switch statement
@@ -824,7 +822,7 @@ S_incline(pTHX_ const char *s)
      n = s;
      while (isDIGIT(*s))
         s++;
-    if (!SPACE_OR_TAB(*s) && *s != '\n' && *s != '\0')
+    if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
         return;
      while (SPACE_OR_TAB(*s))
         s++;
@@ -1386,7 +1384,9 @@ S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
  {
      dVAR;
      SV * const sv = newSVpvn_utf8(start, len,
-                                 UTF && !IN_BYTES
+                                 !IN_BYTES
+                                 && UTF
+                                 && !is_ascii_string((const U8*)start, len)
                                   && is_utf8_string((const U8*)start, len));
      return sv;
  }
@@ -1932,7 +1932,9 @@ S_sublex_done(pTHX)
                   handle \cV (control characters)
                   handle printf-style backslashes (\f, \r, \n, etc)
               } (end switch)
+             continue
           } (end if backslash)
+          handle regular character
      } (end while character to read)
                 
  */
@@ -1942,13 +1944,32 @@ S_scan_const(pTHX_ char *start)
  {
      dVAR;
      register char *send = PL_bufend;           /* end of the constant */
-    SV *sv = newSV(send - start);              /* sv for the constant */
+    SV *sv = newSV(send - start);              /* sv for the constant.  See
+                                                  note below on sizing. */
      register char *s = start;                  /* start of the constant */
      register char *d = SvPVX(sv);              /* destination for copies */
      bool dorange = FALSE;                      /* are we in a translit range? */
      bool didrange = FALSE;                     /* did we just finish a range? */
      I32  has_utf8 = FALSE;                     /* Output constant is UTF8 */
-    I32  this_utf8 = UTF;                      /* The source string is assumed to be UTF8 */
+    I32  this_utf8 = UTF;                      /* Is the source string assumed
+                                                  to be UTF8?  But, this can
+                                                  show as true when the source
+                                                  isn't utf8, as for example
+                                                  when it is entirely composed
+                                                  of hex constants */
+
+    /* Note on sizing:  The scanned constant is placed into sv, which is
+     * initialized by newSV() assuming one byte of output for every byte of
+     * input.  This routine expects newSV() to allocate an extra byte for a
+     * trailing NUL, which this routine will append if it gets to the end of
+     * the input.  There may be more bytes of input than output (eg., \N{LATIN
+     * CAPITAL LETTER A}), or more output than input if the constant ends up
+     * recoded to utf8, but each time a construct is found that might increase
+     * the needed size, SvGROW() is called.  Its size parameter each time is
+     * based on the best guess estimate at the time, namely the length used so
+     * far, plus the length the current construct will occupy, plus room for
+     * the trailing NUL, plus one byte for every input byte still unscanned */ 
+
      UV uv;
  #ifdef EBCDIC
      UV literal_endpoint = 0;
@@ -2228,18 +2249,18 @@ S_scan_const(pTHX_ char *start)
                     goto default_action;
                 }
  
-           /* \132 indicates an octal constant */
+           /* eg. \132 indicates the octal constant 0x132 */
             case '0': case '1': case '2': case '3':
             case '4': case '5': case '6': case '7':
                 {
                      I32 flags = 0;
                      STRLEN len = 3;
-                   uv = grok_oct(s, &len, &flags, NULL);
+                   uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
                     s += len;
                 }
                 goto NUM_ESCAPE_INSERT;
  
-           /* \x24 indicates a hex constant */
+           /* eg. \x24 indicates the hex constant 0x24 */
             case 'x':
                 ++s;
                 if (*s == '{') {
@@ -2254,67 +2275,47 @@ S_scan_const(pTHX_ char *start)
                         continue;
                     }
                      len = e - s;
-                   uv = grok_hex(s, &len, &flags, NULL);
+                   uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
                     s = e + 1;
                 }
                 else {
                     {
                         STRLEN len = 2;
                          I32 flags = PERL_SCAN_DISALLOW_PREFIX;
-                       uv = grok_hex(s, &len, &flags, NULL);
+                       uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
                         s += len;
                     }
                 }
  
               NUM_ESCAPE_INSERT:
-               /* Insert oct or hex escaped character.
-                * There will always enough room in sv since such
-                * escapes will be longer than any UTF-8 sequence
-                * they can end up as. */
+               /* Insert oct, hex, or \N{U+...} escaped character.  There will
+                * always be enough room in sv since such escapes will be
+                * longer than any UTF-8 sequence they can end up as, except if
+                * they force us to recode the rest of the string into utf8 */
                 
-               /* We need to map to chars to ASCII before doing the tests
-                  to cover EBCDIC
-               */
-               if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(uv))) {
+               /* Here uv is the ordinal of the next character being added in
+                * unicode (converted from native).  (It has to be done before
+                * here because \N is interpreted as unicode, and oct and hex
+                * as native.) */
+               if (!UNI_IS_INVARIANT(uv)) {
                     if (!has_utf8 && uv > 255) {
-                       /* Might need to recode whatever we have
-                        * accumulated so far if it contains any
-                        * hibit chars.
-                        *
-                        * (Can't we keep track of that and avoid
-                        *  this rescan? --jhi)
-                        */
-                       int hicount = 0;
-                       U8 *c;
-                       for (c = (U8 *) SvPVX(sv); c < (U8 *)d; c++) {
-                           if (!NATIVE_IS_INVARIANT(*c)) {
-                               hicount++;
-                           }
-                       }
-                       if (hicount) {
-                           const STRLEN offset = d - SvPVX_const(sv);
-                           U8 *src, *dst;
-                           d = SvGROW(sv, SvLEN(sv) + hicount + 1) + offset;
-                           src = (U8 *)d - 1;
-                           dst = src+hicount;
-                           d  += hicount;
-                           while (src >= (const U8 *)SvPVX_const(sv)) {
-                               if (!NATIVE_IS_INVARIANT(*src)) {
-                                   const U8 ch = NATIVE_TO_ASCII(*src);
-                                   *dst-- = (U8)UTF8_EIGHT_BIT_LO(ch);
-                                   *dst-- = (U8)UTF8_EIGHT_BIT_HI(ch);
-                               }
-                               else {
-                                   *dst-- = *src;
-                               }
-                               src--;
-                           }
-                        }
+                       /* Might need to recode whatever we have accumulated so
+                        * far if it contains any chars variant in utf8 or
+                        * utf-ebcdic. */
+                         
+                       SvCUR_set(sv, d - SvPVX_const(sv));
+                       SvPOK_on(sv);
+                       *d = '\0';
+                       /* See Note on sizing above.  */
+                       sv_utf8_upgrade_flags_grow(sv,
+                                       SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+                                       UNISKIP(uv) + (STRLEN)(send - s) + 1);
+                       d = SvPVX(sv) + SvCUR(sv);
+                       has_utf8 = TRUE;
                      }
  
-                    if (has_utf8 || uv > 255) {
-                       d = (char*)uvchr_to_utf8((U8*)d, uv);
-                       has_utf8 = TRUE;
+                    if (has_utf8) {
+                       d = (char*)uvuni_to_utf8((U8*)d, uv);
                         if (PL_lex_inwhat == OP_TRANS &&
                             PL_sublex_info.sub_op) {
                             PL_sublex_info.sub_op->op_private |=
@@ -2335,7 +2336,8 @@ S_scan_const(pTHX_ char *start)
                 }
                 continue;
  
-           /* \N{LATIN SMALL LETTER A} is a named character */
+           /* \N{LATIN SMALL LETTER A} is a named character, and so is
+            * \N{U+0041} */
             case 'N':
                 ++s;
                 if (*s == '{') {
@@ -2350,7 +2352,8 @@ S_scan_const(pTHX_ char *start)
                         goto cont_scan;
                     }
                     if (e > s + 2 && s[1] == 'U' && s[2] == '+') {
-                       /* \N{U+...} */
+                       /* \N{U+...} The ... is a unicode value even on EBCDIC
+                        * machines */
                         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
                           PERL_SCAN_DISALLOW_PREFIX;
                         s += 3;
@@ -2388,22 +2391,24 @@ S_scan_const(pTHX_ char *start)
                          }
                     }
  #endif
+                   /* If destination is not in utf8 but this new character is,
+                    * recode the dest to utf8 */
                     if (!has_utf8 && SvUTF8(res)) {
-                       const char * const ostart = SvPVX_const(sv);
-                       SvCUR_set(sv, d - ostart);
+                       SvCUR_set(sv, d - SvPVX_const(sv));
                         SvPOK_on(sv);
                         *d = '\0';
-                       sv_utf8_upgrade(sv);
-                       /* this just broke our allocation above... */
-                       SvGROW(sv, (STRLEN)(send - start));
+                       /* See Note on sizing above.  */
+                       sv_utf8_upgrade_flags_grow(sv,
+                                           SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+                                           len + (STRLEN)(send - s) + 1);
                         d = SvPVX(sv) + SvCUR(sv);
                         has_utf8 = TRUE;
-                   }
-                   if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
-                       const char * const odest = SvPVX_const(sv);
+                   } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
  
-                       SvGROW(sv, (SvLEN(sv) + len - (e - s + 4)));
-                       d = SvPVX(sv) + (d - odest);
+                       /* See Note on sizing above.  (NOTE: SvCUR() is not set
+                        * correctly here). */
+                       const STRLEN off = d - SvPVX_const(sv);
+                       d = SvGROW(sv, off + len + (STRLEN)(send - s) + 1) + off;
                     }
  #ifdef EBCDIC
                     if (!dorange)
@@ -2468,20 +2473,41 @@ S_scan_const(pTHX_ char *start)
  #endif
  
      default_action:
-       /* If we started with encoded form, or already know we want it
-          and then encode the next character */
-       if ((has_utf8 || this_utf8) && !NATIVE_IS_INVARIANT((U8)(*s))) {
+       /* If we started with encoded form, or already know we want it,
+          then encode the next character */
+       if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
             STRLEN len  = 1;
+
+
+           /* One might think that it is wasted effort in the case of the
+            * source being utf8 (this_utf8 == TRUE) to take the next character
+            * in the source, convert it to an unsigned value, and then convert
+            * it back again.  But the source has not been validated here.  The
+            * routine that does the conversion checks for errors like
+            * malformed utf8 */
+
             const UV nextuv   = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
             const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
-           s += len;
-           if (need > len) {
-               /* encoded value larger than old, need extra space (NOTE: SvCUR() not set here) */
+           if (!has_utf8) {
+               SvCUR_set(sv, d - SvPVX_const(sv));
+               SvPOK_on(sv);
+               *d = '\0';
+               /* See Note on sizing above.  */
+               sv_utf8_upgrade_flags_grow(sv,
+                                       SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+                                       need + (STRLEN)(send - s) + 1);
+               d = SvPVX(sv) + SvCUR(sv);
+               has_utf8 = TRUE;
+           } else if (need > len) {
+               /* encoded value larger than old, may need extra space (NOTE:
+                * SvCUR() is not set correctly here).   See Note on sizing
+                * above.  */
                 const STRLEN off = d - SvPVX_const(sv);
-               d = SvGROW(sv, SvLEN(sv) + (need-len)) + off;
+               d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
             }
+           s += len;
+
             d = (char*)uvchr_to_utf8((U8*)d, nextuv);
-           has_utf8 = TRUE;
  #ifdef EBCDIC
             if (uv > 255 && !dorange)
                 native_range = FALSE;
@@ -2800,7 +2826,7 @@ S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
        bare_package:
             start_force(PL_curforce);
             NEXTVAL_NEXTTOKE.opval = (OP*)newSVOP(OP_CONST, 0,
-                                                  newSVpvn(tmpbuf,len));
+                                                 S_newSV_maybe_utf8(aTHX_ tmpbuf, len));
             NEXTVAL_NEXTTOKE.opval->op_private = OPpCONST_BARE;
             if (PL_madskills)
                 curmad('X', newSVpvn(start,SvPVX(PL_linestr) + soff - start));
@@ -3631,8 +3657,17 @@ Perl_yylex(pTHX)
      default:
         if (isIDFIRST_lazy_if(s,UTF))
             goto keylookup;
-       len = UTF ? Perl_utf8_length(aTHX_ (U8 *) PL_linestart, (U8 *) s) : (STRLEN) (s - PL_linestart);
-       Perl_croak(aTHX_ "Unrecognized character \\x%02X in column %d", *s & 255, (int) len + 1);
+       {
+        unsigned char c = *s;
+        len = UTF ? Perl_utf8_length(aTHX_ (U8 *) PL_linestart, (U8 *) s) : (STRLEN) (s - PL_linestart);
+        if (len > UNRECOGNIZED_PRECEDE_COUNT) {
+            d = UTF ? (char *) Perl_utf8_hop(aTHX_ (U8 *) s, -UNRECOGNIZED_PRECEDE_COUNT) : s - UNRECOGNIZED_PRECEDE_COUNT;
+        } else {
+            d = PL_linestart;
+        }      
+        *s = '\0';
+        Perl_croak(aTHX_ "Unrecognized character \\x%02X; marked by <-- HERE after %s<-- HERE near column %d", c, d, (int) len + 1);
+    }
      case 4:
      case 26:
         goto fake_eof;                  /* emulate EOF on ^D or ^Z */
@@ -3922,7 +3957,6 @@ Perl_yylex(pTHX)
                         *s = '#';       /* Don't try to parse shebang line */
                 }
  #endif /* ALTERNATE_SHEBANG */
-#ifndef MACOS_TRADITIONAL
                 if (!d &&
                     *s == '#' &&
                     ipathend > ipath &&
@@ -3938,7 +3972,7 @@ Perl_yylex(pTHX)
                     while (s < PL_bufend && isSPACE(*s))
                         s++;
                     if (s < PL_bufend) {
-                       Newxz(newargv,PL_origargc+3,char*);
+                       Newx(newargv,PL_origargc+3,char*);
                         newargv[1] = s;
                         while (s < PL_bufend && !isSPACE(*s))
                             s++;
@@ -3953,7 +3987,6 @@ Perl_yylex(pTHX)
                     PERL_FPU_POST_EXEC
                     Perl_croak(aTHX_ "Can't exec %s", ipath);
                 }
-#endif
                 if (d) {
                     while (*d && !isSPACE(*d))
                         d++;
@@ -3968,7 +4001,14 @@ Perl_yylex(pTHX)
                         const char *d1 = d;
  
                         do {
-                           if (*d1 == 'M' || *d1 == 'm' || *d1 == 'C') {
+                           bool baduni = FALSE;
+                           if (*d1 == 'C') {
+                               const char *d2 = d1 + 1;
+                               if (parse_unicode_opts((const char **)&d2)
+                                   != PL_unicode)
+                                   baduni = TRUE;
+                           }
+                           if (baduni || *d1 == 'M' || *d1 == 'm') {
                                 const char * const m = d1;
                                 while (*d1 && !isSPACE(*d1))
                                     d1++;
@@ -4016,9 +4056,6 @@ Perl_yylex(pTHX)
        "\t(Maybe you didn't strip carriage returns after a network transfer?)\n");
  #endif
      case ' ': case '\t': case '\f': case 013:
-#ifdef MACOS_TRADITIONAL
-    case '\312':
-#endif
  #ifdef PERL_MAD
         PL_realtokenstart = -1;
         if (!PL_thiswhite)
@@ -4261,7 +4298,10 @@ Perl_yylex(pTHX)
         BOop(OP_BIT_XOR);
      case '[':
         PL_lex_brackets++;
-       /* FALL THROUGH */
+       {
+           const char tmp = *s++;
+           OPERATOR(tmp);
+       }
      case '~':
         if (s[1] == '~'
             && (PL_expect == XOPERATOR || PL_expect == XTERMORDORDOR))
@@ -4311,6 +4351,7 @@ Perl_yylex(pTHX)
                     case KEY_or:
                     case KEY_and:
                     case KEY_for:
+                   case KEY_foreach:
                     case KEY_unless:
                     case KEY_if:
                     case KEY_while:
@@ -4346,11 +4387,6 @@ Perl_yylex(pTHX)
                     if (len == 6 && strnEQ(SvPVX(sv), "unique", len)) {
                         sv_free(sv);
                         if (PL_in_my == KEY_our) {
-#ifdef USE_ITHREADS
-                           GvUNIQUE_on(cGVOPx_gv(pl_yylval.opval));
-#else
-                           /* skip to avoid loading attributes.pm */
-#endif
                             deprecate(":unique");
                         }
                         else
@@ -4365,7 +4401,7 @@ Perl_yylex(pTHX)
                     }
                     else if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "locked", len)) {
                         sv_free(sv);
-                       CvLOCKED_on(PL_compcv);
+                       deprecate(":locked");
                     }
                     else if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "method", len)) {
                         sv_free(sv);
@@ -4778,10 +4814,6 @@ Perl_yylex(pTHX)
         pl_yylval.ival = 0;
         OPERATOR(ASSIGNOP);
      case '!':
-       if (PL_expect == XSTATE && s[1] == '!' && s[2] == '!') {
-           s += 3;
-           LOP(OP_DIE,XTERM);
-       }
         s++;
         {
             const char tmp = *s++;
@@ -5033,10 +5065,6 @@ Perl_yylex(pTHX)
             AOPERATOR(DORDOR);
         }
       case '?':                 /* may either be conditional or pattern */
-       if (PL_expect == XSTATE && s[1] == '?' && s[2] == '?') {
-           s += 3;
-           LOP(OP_WARN,XTERM);
-       }
         if (PL_expect == XOPERATOR) {
              char tmp = *s++;
              if(tmp == '?') {
@@ -5252,14 +5280,17 @@ Perl_yylex(pTHX)
         /* Is this a label? */
         if (!tmp && PL_expect == XSTATE
               && d < PL_bufend && *d == ':' && *(d + 1) != ':') {
+           tmp = keyword(PL_tokenbuf, len, 0);
+           if (tmp)
+               Perl_croak(aTHX_ "Can't use keyword '%s' as a label", PL_tokenbuf);
             s = d + 1;
             pl_yylval.pval = CopLABEL_alloc(PL_tokenbuf);
             CLINE;
             TOKEN(LABEL);
         }
-
-       /* Check for keywords */
-       tmp = keyword(PL_tokenbuf, len, 0);
+       else
+           /* Check for keywords */
+           tmp = keyword(PL_tokenbuf, len, 0);
  
         /* Is this a word before a => operator? */
         if (*d == '=' && d[1] == '>') {
@@ -5670,10 +5701,22 @@ Perl_yylex(pTHX)
  
                 /* Call it a bare word */
  
-               bareword:
                 if (PL_hints & HINT_STRICT_SUBS)
                     pl_yylval.opval->op_private |= OPpCONST_STRICT;
                 else {
+               bareword:
+                   /* after "print" and similar functions (corresponding to
+                    * "F? L" in opcode.pl), whatever wasn't already parsed as
+                    * a filehandle should be subject to "strict subs".
+                    * Likewise for the optional indirect-object argument to system
+                    * or exec, which can't be a bareword */
+                   if ((PL_last_lop_op == OP_PRINT
+                           || PL_last_lop_op == OP_PRTF
+                           || PL_last_lop_op == OP_SAY
+                           || PL_last_lop_op == OP_SYSTEM
+                           || PL_last_lop_op == OP_EXEC)
+                           && (PL_hints & HINT_STRICT_SUBS))
+                       pl_yylval.opval->op_private |= OPpCONST_STRICT;
                     if (lastchar != '-') {
                         if (ckWARN(WARN_RESERVED)) {
                             d = PL_tokenbuf;