This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Don't remap \N{} for EBCDIC
[perl5.git] / toke.c
diff --git a/toke.c b/toke.c
index 4f3eee9..388d3f0 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -753,9 +753,9 @@ Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
        parser->linestr = flags & LEX_START_COPIED
                            ? SvREFCNT_inc_simple_NN(line)
                            : newSVpvn_flags(s, len, SvUTF8(line));
-       sv_catpvs(parser->linestr, "\n;");
+       sv_catpvn(parser->linestr, "\n;", rsfp ? 1 : 2);
     } else {
-       parser->linestr = newSVpvs("\n;");
+       parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
     }
     parser->oldoldbufptr =
        parser->oldbufptr =
@@ -1053,7 +1053,7 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
                    ENTER;
                    SAVESPTR(PL_warnhook);
                    PL_warnhook = PERL_WARNHOOK_FATAL;
-                   utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
+                   utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
                    LEAVE;
                }
            }
@@ -1073,7 +1073,7 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
                }
                else {
                     assert(p < e -1 );
-                   *bufptr++ = TWO_BYTE_UTF8_TO_UNI(*p, *(p+1));
+                   *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
                    p += 2;
                 }
            }
@@ -1437,13 +1437,13 @@ Perl_lex_peek_unichar(pTHX_ U32 flags)
                bufend = PL_parser->bufend;
            }
        }
-       unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
+       unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
        if (retlen == (STRLEN)-1) {
            /* malformed UTF-8 */
            ENTER;
            SAVESPTR(PL_warnhook);
            PL_warnhook = PERL_WARNHOOK_FATAL;
-           utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
+           utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
            LEAVE;
        }
        return unichar;
@@ -1869,13 +1869,11 @@ STATIC char *
 S_skipspace2(pTHX_ char *s, SV **svp)
 {
     char *start;
-    const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
     const I32 startoff = s - SvPVX(PL_linestr);
 
     PERL_ARGS_ASSERT_SKIPSPACE2;
 
     s = skipspace(s);
-    PL_bufptr = SvPVX(PL_linestr) + bufptroff;
     if (!PL_madskills || !svp)
        return s;
     start = SvPVX(PL_linestr) + startoff;
@@ -1908,7 +1906,7 @@ S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
            sv_setpvn(sv, buf, len);
        (void)SvIOK_on(sv);
        SvIV_set(sv, 0);
-       av_store(av, (I32)CopLINE(PL_curcop), sv);
+       av_store(av, CopLINE(PL_curcop), sv);
     }
 }
 
@@ -2763,7 +2761,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
     {
         /* If warnings are on, this will print a more detailed analysis of what
          * is wrong than the error message below */
-        utf8n_to_uvuni(first_bad_char_loc,
+        utf8n_to_uvchr(first_bad_char_loc,
                        e - ((char *) first_bad_char_loc),
                        NULL, 0);
 
@@ -2838,7 +2836,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
             }
             s++;
         } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
-            if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*s, *(s+1))))) {
+            if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
                 goto bad_charname;
             }
             s += 2;
@@ -2871,8 +2869,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
                 s++;
             }
             else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
-                if (! isCHARNAME_CONT(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*s,
-                                                                    *(s+1)))))
+                if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
                 {
                     goto bad_charname;
                 }
@@ -2906,7 +2903,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
         if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
             /* If warnings are on, this will print a more detailed analysis of
              * what is wrong than the error message below */
-            utf8n_to_uvuni(first_bad_char_loc,
+            utf8n_to_uvchr(first_bad_char_loc,
                            (char *) first_bad_char_loc - str,
                            NULL, 0);
 
@@ -3107,7 +3104,7 @@ S_scan_const(pTHX_ char *start)
                    char *e = d++;
                    while (e-- > c)
                        *(e + 1) = *e;
-                   *c = (char)UTF_TO_NATIVE(0xff);
+                   *c = (char) ILLEGAL_UTF8_BYTE;
                    /* mark the range as done, and continue */
                    dorange = FALSE;
                    didrange = TRUE;
@@ -3161,16 +3158,16 @@ S_scan_const(pTHX_ char *start)
 
 #ifdef EBCDIC
                if (literal_endpoint == 2 &&
-                   ((isLOWER(min) && isLOWER(max)) ||
-                    (isUPPER(min) && isUPPER(max)))) {
-                   if (isLOWER(min)) {
+                   ((isLOWER_A(min) && isLOWER_A(max)) ||
+                    (isUPPER_A(min) && isUPPER_A(max)))) {
+                   if (isLOWER_A(min)) {
                        for (i = min; i <= max; i++)
-                           if (isLOWER(i))
-                               *d++ = NATIVE_TO_NEED(has_utf8,i);
+                           if (isLOWER_A(i))
+                               *d++ = i;
                    } else {
                        for (i = min; i <= max; i++)
-                           if (isUPPER(i))
-                               *d++ = NATIVE_TO_NEED(has_utf8,i);
+                           if (isUPPER_A(i))
+                               *d++ = i;
                    }
                }
                else
@@ -3178,13 +3175,7 @@ S_scan_const(pTHX_ char *start)
                    for (i = min; i <= max; i++)
 #ifdef EBCDIC
                         if (has_utf8) {
-                            const U8 ch = (U8)NATIVE_TO_UTF(i);
-                            if (UNI_IS_INVARIANT(ch))
-                                *d++ = (U8)i;
-                            else {
-                                *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
-                                *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
-                            }
+                            append_utf8_from_native_byte(i, &d);
                         }
                         else
 #endif
@@ -3194,7 +3185,7 @@ S_scan_const(pTHX_ char *start)
                 if (uvmax) {
                     d = (char*)uvchr_to_utf8((U8*)d, 0x100);
                     if (uvmax > 0x101)
-                        *d++ = (char)UTF_TO_NATIVE(0xff);
+                        *d++ = (char) ILLEGAL_UTF8_BYTE;
                     if (uvmax > 0x100)
                         d = (char*)uvchr_to_utf8((U8*)d, uvmax);
                 }
@@ -3219,7 +3210,7 @@ S_scan_const(pTHX_ char *start)
                    && !native_range
 #endif
                    ) {
-                   *d++ = (char)UTF_TO_NATIVE(0xff);   /* use illegal utf8 byte--see pmtrans */
+                   *d++ = (char) ILLEGAL_UTF8_BYTE;    /* use illegal utf8 byte--see pmtrans */
                    s++;
                    continue;
                }
@@ -3262,7 +3253,7 @@ S_scan_const(pTHX_ char *start)
        else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
            if (s[2] == '#') {
                while (s+1 < send && *s != ')')
-                   *d++ = NATIVE_TO_NEED(has_utf8,*s++);
+                   *d++ = *s++;
            }
            else if (!PL_lex_casemods &&
                     (    s[2] == '{' /* This should match regcomp.c */
@@ -3276,7 +3267,7 @@ S_scan_const(pTHX_ char *start)
        else if (*s == '#' && PL_lex_inpat && !in_charclass &&
          ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
            while (s+1 < send && *s != '\n')
-               *d++ = NATIVE_TO_NEED(has_utf8,*s++);
+               *d++ = *s++;
        }
 
        /* no further processing of single-quoted regex */
@@ -3351,7 +3342,7 @@ S_scan_const(pTHX_ char *start)
                        || s[1] != '{'
                        || regcurly(s + 1, FALSE)))
            {
-               *d++ = NATIVE_TO_NEED(has_utf8,'\\');
+               *d++ = '\\';
                goto default_action;
            }
 
@@ -3380,7 +3371,7 @@ S_scan_const(pTHX_ char *start)
                {
                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
                     STRLEN len = 3;
-                   uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
+                   uv = grok_oct(s, &len, &flags, NULL);
                    s += len;
                     if (len < 3 && s < send && isDIGIT(*s)
                         && ckWARN(WARN_MISC))
@@ -3432,9 +3423,8 @@ S_scan_const(pTHX_ char *start)
                 * UTF-8 sequence they can end up as, except if they force us
                 * to recode the rest of the string into utf8 */
                
-               /* Here uv is the ordinal of the next character being added in
-                * unicode (converted from native). */
-               if (!UNI_IS_INVARIANT(uv)) {
+               /* Here uv is the ordinal of the next character being added */
+               if (!NATIVE_IS_INVARIANT(uv)) {
                    if (!has_utf8 && uv > 255) {
                        /* Might need to recode whatever we have accumulated so
                         * far if it contains any chars variant in utf8 or
@@ -3452,7 +3442,7 @@ S_scan_const(pTHX_ char *start)
                     }
 
                     if (has_utf8) {
-                       d = (char*)uvuni_to_utf8((U8*)d, uv);
+                       d = (char*)uvchr_to_utf8((U8*)d, uv);
                        if (PL_lex_inwhat == OP_TRANS &&
                            PL_sublex_info.sub_op) {
                            PL_sublex_info.sub_op->op_private |=
@@ -3485,16 +3475,6 @@ S_scan_const(pTHX_ char *start)
                 * now, while preserving the fact that it was a named character
                 * so that the regex compiler knows this */
 
-               /* This section of code doesn't generally use the
-                * NATIVE_TO_NEED() macro to transform the input.  I (khw) did
-                * a close examination of this macro and determined it is a
-                * no-op except on utfebcdic variant characters.  Every
-                * character generated by this that would normally need to be
-                * enclosed by this macro is invariant, so the macro is not
-                * needed, and would complicate use of copy().  XXX There are
-                * other parts of this file where the macro is used
-                * inconsistently, but are saved by it being a no-op */
-
                /* The structure of this section of code (besides checking for
                 * errors and upgrading to utf8) is:
                 *  Further disambiguate between the two meanings of \N, and if
@@ -3582,11 +3562,13 @@ S_scan_const(pTHX_ char *start)
                            has_utf8 = TRUE;
                        }
 
-                       /* Add the string to the output */
+                        /* Add the (Unicode) code point to the output. */
                        if (UNI_IS_INVARIANT(uv)) {
-                           *d++ = (char) uv;
+                           *d++ = (char) LATIN1_TO_NATIVE(uv);
                        }
-                       else d = (char*)uvuni_to_utf8((U8*)d, uv);
+                       else {
+                            d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
+                        }
                    }
                }
                else /* Here is \N{NAME} but not \N{U+...}. */
@@ -3646,19 +3628,16 @@ S_scan_const(pTHX_ char *start)
                                 char hex_string[2 * UTF8_MAXBYTES + 5];
 
                                 /* Get the first character of the result. */
-                                U32 uv = utf8n_to_uvuni((U8 *) str,
+                                U32 uv = utf8n_to_uvchr((U8 *) str,
                                                         len,
                                                         &char_length,
                                                         UTF8_ALLOW_ANYUV);
                                 /* Convert first code point to hex, including
-                                 * the boiler plate before it.  For all these,
-                                 * we convert to native format so that
-                                 * downstream code can continue to assume the
-                                 * input is native */
+                                 * the boiler plate before it. */
                                 output_length =
                                     my_snprintf(hex_string, sizeof(hex_string),
-                                            "\\N{U+%X",
-                                            (unsigned int) UNI_TO_NATIVE(uv));
+                                                "\\N{U+%X",
+                                                (unsigned int) uv);
 
                                 /* Make sure there is enough space to hold it */
                                 d = off + SvGROW(sv, off
@@ -3673,15 +3652,15 @@ S_scan_const(pTHX_ char *start)
                                 * its ordinal in hex */
                                 while ((str += char_length) < str_end) {
                                     const STRLEN off = d - SvPVX_const(sv);
-                                    U32 uv = utf8n_to_uvuni((U8 *) str,
+                                    U32 uv = utf8n_to_uvchr((U8 *) str,
                                                             str_end - str,
                                                             &char_length,
                                                             UTF8_ALLOW_ANYUV);
                                     output_length =
                                         my_snprintf(hex_string,
-                                            sizeof(hex_string),
-                                            ".%X",
-                                            (unsigned int) UNI_TO_NATIVE(uv));
+                                                    sizeof(hex_string),
+                                                    ".%X",
+                                                    (unsigned int) uv);
 
                                     d = off + SvGROW(sv, off
                                                         + output_length
@@ -3746,25 +3725,25 @@ S_scan_const(pTHX_ char *start)
 
            /* printf-style backslashes, formfeeds, newlines, etc */
            case 'b':
-               *d++ = NATIVE_TO_NEED(has_utf8,'\b');
+               *d++ = '\b';
                break;
            case 'n':
-               *d++ = NATIVE_TO_NEED(has_utf8,'\n');
+               *d++ = '\n';
                break;
            case 'r':
-               *d++ = NATIVE_TO_NEED(has_utf8,'\r');
+               *d++ = '\r';
                break;
            case 'f':
-               *d++ = NATIVE_TO_NEED(has_utf8,'\f');
+               *d++ = '\f';
                break;
            case 't':
-               *d++ = NATIVE_TO_NEED(has_utf8,'\t');
+               *d++ = '\t';
                break;
            case 'e':
-               *d++ = ASCII_TO_NEED(has_utf8,'\033');
+               *d++ = ASCII_TO_NATIVE('\033');
                break;
            case 'a':
-               *d++ = ASCII_TO_NEED(has_utf8,'\007');
+               *d++ = '\a';
                break;
            } /* end switch */
 
@@ -3818,7 +3797,7 @@ S_scan_const(pTHX_ char *start)
 #endif
        }
        else {
-           *d++ = NATIVE_TO_NEED(has_utf8,*s++);
+           *d++ = *s++;
        }
     } /* while loop to process each character */
 
@@ -5595,14 +5574,19 @@ Perl_yylex(pTHX)
                s = SKIPSPACE0(s);
            }
            else {
-/*             if (PL_madskills && PL_lex_formbrack) { */
-                   d = s;
-                   while (d < PL_bufend && *d != '\n')
-                       d++;
-                   if (d < PL_bufend)
-                       d++;
-                   else if (d > PL_bufend) /* Found by Ilya: feed random input to Perl. */
+#endif
+                   if (PL_madskills) d = s;
+                   while (s < PL_bufend && *s != '\n')
+                       s++;
+                   if (s < PL_bufend)
+                   {
+                       s++;
+                       if (s < PL_bufend)
+                           incline(s);
+                   }
+                   else if (s > PL_bufend) /* Found by Ilya: feed random input to Perl. */
                      Perl_croak(aTHX_ "panic: input overflow");
+#ifdef PERL_MAD
                    if (PL_madskills && CopLINE(PL_curcop) >= 1) {
                        if (!PL_thiswhite)
                            PL_thiswhite = newSVpvs("");
@@ -5610,20 +5594,9 @@ Perl_yylex(pTHX)
                            sv_setpvs(PL_thiswhite, "");
                            PL_faketokens = 0;
                        }
-                       sv_catpvn(PL_thiswhite, s, d - s);
+                       sv_catpvn(PL_thiswhite, d, s - d);
                    }
-                   s = d;
-/*             }
-               *s = '\0';
-               PL_bufend = s; */
            }
-#else
-           while (s < PL_bufend && *s != '\n')
-               s++;
-           if (s < PL_bufend)
-               s++;
-           else if (s > PL_bufend) /* Found by Ilya: feed random input to Perl. */
-             Perl_croak(aTHX_ "panic: input overflow");
 #endif
        }
        goto retry;
@@ -7575,21 +7548,12 @@ Perl_yylex(pTHX)
        case KEY___END__: {
            GV *gv;
            if (PL_rsfp && (!PL_in_eval || PL_tokenbuf[2] == 'D')) {
-               const char *pname = "main";
-               STRLEN plen = 4;
-               U32 putf8 = 0;
-               if (PL_tokenbuf[2] == 'D')
-               {
-                   HV * const stash =
-                       PL_curstash ? PL_curstash : PL_defstash;
-                   pname = HvNAME_get(stash);
-                   plen  = HvNAMELEN (stash);
-                   if(HvNAMEUTF8(stash)) putf8 = SVf_UTF8;
-               }
-               gv = gv_fetchpvn_flags(
-                       Perl_form(aTHX_ "%*s::DATA", (int)plen, pname),
-                       plen+6, GV_ADD|putf8, SVt_PVIO
-               );
+               HV * const stash = PL_tokenbuf[2] == 'D' && PL_curstash
+                                       ? PL_curstash
+                                       : PL_defstash;
+               gv = (GV *)*hv_fetchs(stash, "DATA", 1);
+               if (!isGV(gv))
+                   gv_init(gv,stash,"DATA",4,0);
                GvMULTI_on(gv);
                if (!GvIO(gv))
                    GvIOp(gv) = newIO();
@@ -10373,11 +10337,15 @@ intro_sym:
 
 
 /* scan_str
-   takes: start position in buffer
-         keep_quoted preserve \ on the embedded delimiter(s)
-         keep_delims preserve the delimiters around the string
-         re_reparse  compiling a run-time /(?{})/:
-                       collapse // to /,  and skip encoding src
+   takes:
+       start                   position in buffer
+       keep_quoted             preserve \ on the embedded delimiter(s)
+       keep_delims             preserve the delimiters around the string
+       re_reparse              compiling a run-time /(?{})/:
+                                  collapse // to /,  and skip encoding src
+       deprecate_escaped_meta  issue a deprecation warning for cer-
+                               tain paired metacharacters that appear
+                               escaped within it
    returns: position to continue reading from buffer
    side-effects: multi_start, multi_close, lex_repl or lex_stuff, and
        updates the read buffer.
@@ -10419,9 +10387,7 @@ intro_sym:
 
 STATIC char *
 S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse,
-        bool deprecate_escaped_meta /* Should we issue a deprecation warning
-                                       for certain paired metacharacters that
-                                       appear escaped within it */
+                bool deprecate_escaped_meta
     )
 {
     dVAR;
@@ -11851,7 +11817,7 @@ Perl_scan_vstring(pTHX_ const char *s, const char *const e, SV *sv)
            /* Append native character for the rev point */
            tmpend = uvchr_to_utf8(tmpbuf, rev);
            sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
-           if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(rev)))
+           if (!NATIVE_IS_INVARIANT(rev))
                 SvUTF8_on(sv);
            if (pos + 1 < e && *pos == '.' && isDIGIT(pos[1]))
                 s = ++pos;