This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate Memoize 0.64. Few tweaks were required in
[perl5.git] / toke.c
diff --git a/toke.c b/toke.c
index cd6ed1d..fca0f73 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -1,6 +1,6 @@
 /*    toke.c
  *
- *    Copyright (c) 1991-2000, Larry Wall
+ *    Copyright (c) 1991-2001, Larry Wall
  *
  *    You may distribute under the terms of either the GNU General Public
  *    License or the Artistic License, as specified in the README file.
@@ -36,8 +36,12 @@ static I32 utf16rev_textfilter(pTHXo_ int idx, SV *sv, int maxlen);
 #define XFAKEBRACK 128
 #define XENUMMASK 127
 
-/*#define UTF (SvUTF8(PL_linestr) && !(PL_hints & HINT_BYTE))*/
-#define UTF (PL_hints & HINT_UTF8)
+#ifdef EBCDIC
+/* For now 'use utf8' does not affect tokenizer on EBCDIC */
+#define UTF (PL_linestr && DO_UTF8(PL_linestr))
+#else
+#define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8))
+#endif
 
 /* In variables name $^X, these are the legal values for X.
  * 1999-02-27 mjd-perl-patch@plover.com */
@@ -79,7 +83,7 @@ static I32 utf16rev_textfilter(pTHXo_ int idx, SV *sv, int maxlen);
 #  endif
 YYSTYPE* yylval_pointer[YYMAXLEVEL];
 int* yychar_pointer[YYMAXLEVEL];
-int yyactlevel = 0;
+int yyactlevel = -1;
 #  undef yylval
 #  undef yychar
 #  define yylval (*yylval_pointer[yyactlevel])
@@ -126,31 +130,42 @@ int yyactlevel = 0;
  * Also see LOP and lop() below.
  */
 
-#define TOKEN(retval) return (PL_bufptr = s,(int)retval)
-#define OPERATOR(retval) return (PL_expect = XTERM,PL_bufptr = s,(int)retval)
-#define AOPERATOR(retval) return ao((PL_expect = XTERM,PL_bufptr = s,(int)retval))
-#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s,(int)retval)
-#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s,(int)retval)
-#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s,(int)retval)
-#define TERM(retval) return (CLINE, PL_expect = XOPERATOR,PL_bufptr = s,(int)retval)
-#define LOOPX(f) return(yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)LOOPEX)
-#define FTST(f) return(yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)UNIOP)
-#define FUN0(f) return(yylval.ival = f,PL_expect = XOPERATOR,PL_bufptr = s,(int)FUNC0)
-#define FUN1(f) return(yylval.ival = f,PL_expect = XOPERATOR,PL_bufptr = s,(int)FUNC1)
-#define BOop(f) return ao((yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)BITOROP))
-#define BAop(f) return ao((yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)BITANDOP))
-#define SHop(f) return ao((yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)SHIFTOP))
-#define PWop(f) return ao((yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)POWOP))
-#define PMop(f) return(yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)MATCHOP)
-#define Aop(f) return ao((yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)ADDOP))
-#define Mop(f) return ao((yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)MULOP))
-#define Eop(f) return(yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)EQOP)
-#define Rop(f) return(yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)RELOP)
+/* Note that REPORT() and REPORT2() will be expressions that supply
+ * their own trailing comma, not suitable for statements as such. */
+#ifdef DEBUGGING /* Serve -DT. */
+#   define REPORT(x,retval) tokereport(x,s,(int)retval),
+#   define REPORT2(x,retval) tokereport(x,s, yylval.ival),
+#else
+#   define REPORT(x,retval)
+#   define REPORT2(x,retval)
+#endif
+
+#define TOKEN(retval) return (REPORT2("token",retval) PL_bufptr = s,(int)retval)
+#define OPERATOR(retval) return (REPORT2("operator",retval) PL_expect = XTERM, PL_bufptr = s,(int)retval)
+#define AOPERATOR(retval) return ao((REPORT2("aop",retval) PL_expect = XTERM, PL_bufptr = s,(int)retval))
+#define PREBLOCK(retval) return (REPORT2("preblock",retval) PL_expect = XBLOCK,PL_bufptr = s,(int)retval)
+#define PRETERMBLOCK(retval) return (REPORT2("pretermblock",retval) PL_expect = XTERMBLOCK,PL_bufptr = s,(int)retval)
+#define PREREF(retval) return (REPORT2("preref",retval) PL_expect = XREF,PL_bufptr = s,(int)retval)
+#define TERM(retval) return (CLINE, REPORT2("term",retval) PL_expect = XOPERATOR, PL_bufptr = s,(int)retval)
+#define LOOPX(f) return(yylval.ival=f, REPORT("loopx",f) PL_expect = XTERM,PL_bufptr = s,(int)LOOPEX)
+#define FTST(f) return(yylval.ival=f, REPORT("ftst",f) PL_expect = XTERM,PL_bufptr = s,(int)UNIOP)
+#define FUN0(f) return(yylval.ival = f, REPORT("fun0",f) PL_expect = XOPERATOR,PL_bufptr = s,(int)FUNC0)
+#define FUN1(f) return(yylval.ival = f, REPORT("fun1",f) PL_expect = XOPERATOR,PL_bufptr = s,(int)FUNC1)
+#define BOop(f) return ao((yylval.ival=f, REPORT("bitorop",f) PL_expect = XTERM,PL_bufptr = s,(int)BITOROP))
+#define BAop(f) return ao((yylval.ival=f, REPORT("bitandop",f) PL_expect = XTERM,PL_bufptr = s,(int)BITANDOP))
+#define SHop(f) return ao((yylval.ival=f, REPORT("shiftop",f) PL_expect = XTERM,PL_bufptr = s,(int)SHIFTOP))
+#define PWop(f) return ao((yylval.ival=f, REPORT("powop",f) PL_expect = XTERM,PL_bufptr = s,(int)POWOP))
+#define PMop(f) return(yylval.ival=f, REPORT("matchop",f) PL_expect = XTERM,PL_bufptr = s,(int)MATCHOP)
+#define Aop(f) return ao((yylval.ival=f, REPORT("add",f) PL_expect = XTERM,PL_bufptr = s,(int)ADDOP))
+#define Mop(f) return ao((yylval.ival=f, REPORT("mul",f) PL_expect = XTERM,PL_bufptr = s,(int)MULOP))
+#define Eop(f) return(yylval.ival=f, REPORT("eq",f) PL_expect = XTERM,PL_bufptr = s,(int)EQOP)
+#define Rop(f) return(yylval.ival=f, REPORT("rel",f) PL_expect = XTERM,PL_bufptr = s,(int)RELOP)
 
 /* This bit of chicanery makes a unary function followed by
  * a parenthesis into a function with one argument, highest precedence.
  */
 #define UNI(f) return(yylval.ival = f, \
+       REPORT("uni",f) \
        PL_expect = XTERM, \
        PL_bufptr = s, \
        PL_last_uni = PL_oldbufptr, \
@@ -158,6 +173,7 @@ int yyactlevel = 0;
        (*s == '(' || (s = skipspace(s), *s == '(') ? (int)FUNC1 : (int)UNIOP) )
 
 #define UNIBRACK(f) return(yylval.ival = f, \
+        REPORT("uni",f) \
        PL_bufptr = s, \
        PL_last_uni = PL_oldbufptr, \
        (*s == '(' || (s = skipspace(s), *s == '(') ? (int)FUNC1 : (int)UNIOP) )
@@ -165,6 +181,28 @@ int yyactlevel = 0;
 /* grandfather return to old style */
 #define OLDLOP(f) return(yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)LSTOP)
 
+#ifdef DEBUGGING
+
+STATIC void
+S_tokereport(pTHX_ char *thing, char* s, I32 rv)
+{
+    DEBUG_T({
+        SV* report = newSVpv(thing, 0);
+        Perl_sv_catpvf(aTHX_ report, ":line %d:%"IVdf":", CopLINE(PL_curcop),
+               (IV)rv);
+
+        if (s - PL_bufptr > 0)
+            sv_catpvn(report, PL_bufptr, s - PL_bufptr);
+        else {
+            if (PL_oldbufptr && *PL_oldbufptr)
+                sv_catpv(report, PL_tokenbuf);
+        }
+        PerlIO_printf(Perl_debug_log, "### %s\n", SvPV_nolen(report));
+    });
+}
+
+#endif
+
 /*
  * S_ao
  *
@@ -359,6 +397,8 @@ Perl_lex_start(pTHX_ SV *line)
     SAVEPPTR(PL_bufend);
     SAVEPPTR(PL_oldbufptr);
     SAVEPPTR(PL_oldoldbufptr);
+    SAVEPPTR(PL_last_lop);
+    SAVEPPTR(PL_last_uni);
     SAVEPPTR(PL_linestart);
     SAVESPTR(PL_linestr);
     SAVEPPTR(PL_lex_brackstack);
@@ -401,6 +441,7 @@ Perl_lex_start(pTHX_ SV *line)
     SvTEMP_off(PL_linestr);
     PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = PL_linestart = SvPVX(PL_linestr);
     PL_bufend = PL_bufptr + SvCUR(PL_linestr);
+    PL_last_lop = PL_last_uni = Nullch;
     SvREFCNT_dec(PL_rs);
     PL_rs = newSVpvn("\n", 1);
     PL_rsfp = 0;
@@ -444,7 +485,7 @@ S_incline(pTHX_ char *s)
        s += 4;
     else
        return;
-    if (*s == ' ' || *s == '\t')
+    if (SPACE_OR_TAB(*s))
        s++;
     else
        return;
@@ -500,7 +541,7 @@ S_skipspace(pTHX_ register char *s)
     for (;;) {
        STRLEN prevlen;
        SSize_t oldprevlen, oldoldprevlen;
-       SSize_t oldloplen, oldunilen;
+       SSize_t oldloplen = 0, oldunilen = 0;
        while (s < PL_bufend && isSPACE(*s)) {
            if (*s++ == '\n' && PL_in_eval && !PL_rsfp)
                incline(s);
@@ -546,6 +587,7 @@ S_skipspace(pTHX_ register char *s)
            PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = s = PL_linestart
                = SvPVX(PL_linestr);
            PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
+           PL_last_lop = PL_last_uni = Nullch;
 
            /* Close the filehandle.  Could be from -P preprocessor,
             * STDIN, or a regular file.  If we were reading code from
@@ -677,6 +719,7 @@ S_lop(pTHX_ I32 f, int x, char *s)
 {
     yylval.ival = f;
     CLINE;
+    REPORT("lop", f)
     PL_expect = x;
     PL_bufptr = s;
     PL_last_lop = PL_oldbufptr;
@@ -803,7 +846,7 @@ Perl_str_to_version(pTHX_ SV *sv)
        STRLEN skip;
        UV n;
        if (utf)
-           n = utf8_to_uv((U8*)start, len, &skip, 0);
+           n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
        else {
            n = *(U8*)start;
            skip = 1;
@@ -882,8 +925,11 @@ S_tokeq(pTHX_ SV *sv)
     if (s == send)
        goto finish;
     d = s;
-    if ( PL_hints & HINT_NEW_STRING )
+    if ( PL_hints & HINT_NEW_STRING ) {
        pv = sv_2mortal(newSVpvn(SvPVX(pv), len));
+       if (SvUTF8(sv))
+           SvUTF8_on(pv);
+    }
     while (s < send) {
        if (*s == '\\') {
            if (s + 1 < send && (s[1] == '\\'))
@@ -1000,8 +1046,11 @@ S_sublex_push(pTHX)
     SAVEI32(PL_lex_inwhat);
     SAVECOPLINE(PL_curcop);
     SAVEPPTR(PL_bufptr);
+    SAVEPPTR(PL_bufend);
     SAVEPPTR(PL_oldbufptr);
     SAVEPPTR(PL_oldoldbufptr);
+    SAVEPPTR(PL_last_lop);
+    SAVEPPTR(PL_last_uni);
     SAVEPPTR(PL_linestart);
     SAVESPTR(PL_linestr);
     SAVEPPTR(PL_lex_brackstack);
@@ -1013,6 +1062,7 @@ S_sublex_push(pTHX)
     PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
        = SvPVX(PL_linestr);
     PL_bufend += SvCUR(PL_linestr);
+    PL_last_lop = PL_last_uni = Nullch;
     SAVEFREESV(PL_linestr);
 
     PL_lex_dojoin = FALSE;
@@ -1045,8 +1095,11 @@ STATIC I32
 S_sublex_done(pTHX)
 {
     if (!PL_lex_starts++) {
+       SV *sv = newSVpvn("",0);
+       if (SvUTF8(PL_linestr))
+           SvUTF8_on(sv);
        PL_expect = XOPERATOR;
-       yylval.opval = (OP*)newSVOP(OP_CONST, 0, newSVpvn("",0));
+       yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
        return THING;
     }
 
@@ -1061,6 +1114,7 @@ S_sublex_done(pTHX)
        PL_lex_inpat = 0;
        PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
        PL_bufend += SvCUR(PL_linestr);
+       PL_last_lop = PL_last_uni = Nullch;
        SAVEFREESV(PL_linestr);
        PL_lex_dojoin = FALSE;
        PL_lex_brackets = 0;
@@ -1173,21 +1227,22 @@ S_scan_const(pTHX_ char *start)
     register char *d = SvPVX(sv);              /* destination for copies */
     bool dorange = FALSE;                      /* are we in a translit range? */
     bool didrange = FALSE;                     /* did we just finish a range? */
-    bool has_utf8 = FALSE;                     /* embedded \x{} */
+    I32  has_utf8 = FALSE;                     /* Output constant is UTF8 */
+    I32  this_utf8 = UTF;                      /* The source string is assumed to be UTF8 */
     UV uv;
 
-    I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
-       ? (PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
-       : UTF;
-    I32 this_utf8 = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
-       ? (PL_sublex_info.sub_op->op_private & (PL_lex_repl ?
-                                               OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF))
-       : UTF;
     const char *leaveit =      /* set of acceptably-backslashed characters */
        PL_lex_inpat
            ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#"
            : "";
 
+    if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
+       /* If we are doing a trans and we know we want UTF8 set expectation */
+       has_utf8   = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
+       this_utf8  = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
+    }
+
+
     while (s < send || dorange) {
         /* get transliterations out of the way (they're most literal) */
        if (PL_lex_inwhat == OP_TRANS) {
@@ -1197,6 +1252,18 @@ S_scan_const(pTHX_ char *start)
                I32 min;                        /* first character in range */
                I32 max;                        /* last character in range */
 
+               if (has_utf8) {
+                   char *c = (char*)utf8_hop((U8*)d, -1);
+                   char *e = d++;
+                   while (e-- > c)
+                       *(e + 1) = *e;
+                   *c = UTF_TO_NATIVE(0xff);
+                   /* mark the range as done, and continue */
+                   dorange = FALSE;
+                   didrange = TRUE;
+                   continue;
+               }
+
                i = d - SvPVX(sv);              /* remember current offset */
                SvGROW(sv, SvLEN(sv) + 256);    /* never more than 256 chars in a range */
                d = SvPVX(sv) + i;              /* refresh d after realloc */
@@ -1211,17 +1278,17 @@ S_scan_const(pTHX_ char *start)
                               (char)min, (char)max);
                 }
 
-#ifndef ASCIIish
+#ifdef EBCDIC
                if ((isLOWER(min) && isLOWER(max)) ||
                    (isUPPER(min) && isUPPER(max))) {
                    if (isLOWER(min)) {
                        for (i = min; i <= max; i++)
                            if (isLOWER(i))
-                               *d++ = i;
+                               *d++ = NATIVE_TO_NEED(has_utf8,i);
                    } else {
                        for (i = min; i <= max; i++)
                            if (isUPPER(i))
-                               *d++ = i;
+                               *d++ = NATIVE_TO_NEED(has_utf8,i);
                    }
                }
                else
@@ -1240,8 +1307,8 @@ S_scan_const(pTHX_ char *start)
                if (didrange) {
                    Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
                }
-               if (utf) {
-                   *d++ = (char)0xff;  /* use illegal utf8 byte--see pmtrans */
+               if (has_utf8) {
+                   *d++ = UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
                    s++;
                    continue;
                }
@@ -1260,7 +1327,7 @@ S_scan_const(pTHX_ char *start)
        else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
            if (s[2] == '#') {
                while (s < send && *s != ')')
-                   *d++ = *s++;
+                   *d++ = NATIVE_TO_NEED(has_utf8,*s++);
            }
            else if (s[2] == '{' /* This should match regcomp.c */
                     || ((s[2] == 'p' || s[2] == '?') && s[3] == '{'))
@@ -1283,7 +1350,7 @@ S_scan_const(pTHX_ char *start)
                    yyerror("Sequence (?{...}) not terminated or not {}-balanced");
                }
                while (s < regparse)
-                   *d++ = *s++;
+                   *d++ = NATIVE_TO_NEED(has_utf8,*s++);
            }
        }
 
@@ -1291,7 +1358,7 @@ S_scan_const(pTHX_ char *start)
        else if (*s == '#' && PL_lex_inpat &&
          ((PMOP*)PL_lex_inpat)->op_pmflags & PMf_EXTENDED) {
            while (s+1 < send && *s != '\n')
-               *d++ = *s++;
+               *d++ = NATIVE_TO_NEED(has_utf8,*s++);
        }
 
        /* check for embedded arrays
@@ -1311,28 +1378,7 @@ S_scan_const(pTHX_ char *start)
                break;          /* in regexp, $ might be tail anchor */
        }
 
-       /* (now in tr/// code again) */
-
-       if (*s & 0x80 && (this_utf8 || has_utf8)) {
-           STRLEN len = (STRLEN) -1;
-           UV uv;
-           if (this_utf8) {
-               uv = utf8_to_uv((U8*)s, send - s, &len, UTF8_CHECK_ONLY);
-           }
-           if (len == (STRLEN)-1) {
-               /* Illegal UTF8 (a high-bit byte), make it valid. */
-               char *old_pvx = SvPVX(sv);
-               /* need space for one extra char (NOTE: SvCUR() not set here) */
-               d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
-               d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
-           }
-           else {
-               while (len--)
-                   *d++ = *s++;
-           }
-           has_utf8 = TRUE;
-           continue;
-       }
+       /* End of else if chain - OP_TRANS rejoin rest */
 
        /* backslashes */
        if (*s == '\\' && s+1 < send) {
@@ -1340,8 +1386,8 @@ S_scan_const(pTHX_ char *start)
 
            /* some backslashes we leave behind */
            if (*leaveit && *s && strchr(leaveit, *s)) {
-               *d++ = '\\';
-               *d++ = *s++;
+               *d++ = NATIVE_TO_NEED(has_utf8,'\\');
+               *d++ = NATIVE_TO_NEED(has_utf8,*s++);
                continue;
            }
 
@@ -1378,8 +1424,7 @@ S_scan_const(pTHX_ char *start)
                               "Unrecognized escape \\%c passed through",
                               *s);
                    /* default action is to copy the quoted character */
-                   *d++ = *s++;
-                   continue;
+                   goto default_action;
                }
 
            /* \132 indicates an octal constant */
@@ -1397,15 +1442,14 @@ S_scan_const(pTHX_ char *start)
                ++s;
                if (*s == '{') {
                    char* e = strchr(s, '}');
+                   STRLEN len = 1;             /* allow underscores */
+
                    if (!e) {
                        yyerror("Missing right brace on \\x{}");
-                       e = s;
-                   }
-                   else {
-                       STRLEN len = 1;         /* allow underscores */
-                       uv = (UV)scan_hex(s + 1, e - s - 1, &len);
-                       has_utf8 = TRUE;
+                       ++s;
+                       continue;
                    }
+                   uv = (UV)scan_hex(s + 1, e - s - 1, &len);
                    s = e + 1;
                }
                else {
@@ -1418,52 +1462,66 @@ S_scan_const(pTHX_ char *start)
 
              NUM_ESCAPE_INSERT:
                /* Insert oct or hex escaped character.
-                * There will always enough room in sv since such escapes will
-                * be longer than any utf8 sequence they can end up as
-                */
-               if (uv > 127 || has_utf8) {
-                   if (!this_utf8 && !has_utf8 && uv > 255) {
-                       /* might need to recode whatever we have accumulated so far
-                        * if it contains any hibit chars
+                * There will always enough room in sv since such
+                * escapes will be longer than any UTF-8 sequence
+                * they can end up as. */
+               
+               /* We need to map to chars to ASCII before doing the tests
+                  to cover EBCDIC
+               */
+               if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(uv))) {
+                   if (!has_utf8 && uv > 255) {
+                       /* Might need to recode whatever we have
+                        * accumulated so far if it contains any
+                        * hibit chars.
+                        *
+                        * (Can't we keep track of that and avoid
+                        *  this rescan? --jhi)
                         */
-                       int hicount = 0;
-                       char *c;
-                       for (c = SvPVX(sv); c < d; c++) {
-                           if (*c & 0x80)
+                       int hicount = 0;
+                       U8 *c;
+                       for (c = (U8 *) SvPVX(sv); c < (U8 *)d; c++) {
+                           if (!NATIVE_IS_INVARIANT(*c)) {
                                hicount++;
+                           }
                        }
                        if (hicount) {
-                           char *old_pvx = SvPVX(sv);
-                           char *src, *dst;
-                           d = SvGROW(sv, SvCUR(sv) + hicount + 1) + (d - old_pvx);
-
-                           src = d - 1;
-                           d += hicount;
-                           dst = d - 1;
-
-                           while (src < dst) {
-                               if (*src & 0x80) {
-                                   dst--;
-                                   uv_to_utf8((U8*)dst, (U8)*src--);
-                                   dst--;
+                           STRLEN offset = d - SvPVX(sv);
+                           U8 *src, *dst;
+                           d = SvGROW(sv, SvLEN(sv) + hicount + 1) + offset;
+                           src = (U8 *)d - 1;
+                           dst = src+hicount;
+                           d  += hicount;
+                           while (src >= (U8 *)SvPVX(sv)) {
+                               if (!NATIVE_IS_INVARIANT(*src)) {
+                                   U8 ch = NATIVE_TO_ASCII(*src);
+                                   *dst-- = UTF8_EIGHT_BIT_LO(ch);
+                                   *dst-- = UTF8_EIGHT_BIT_HI(ch);
                                }
                                else {
-                                   *dst-- = *src--;
+                                   *dst-- = *src;
                                }
+                               src--;
                            }
                         }
                     }
 
                     if (has_utf8 || uv > 255) {
-                       d = (char*)uv_to_utf8((U8*)d, uv);
+                       d = (char*)uvchr_to_utf8((U8*)d, uv);
                        has_utf8 = TRUE;
+                       if (PL_lex_inwhat == OP_TRANS &&
+                           PL_sublex_info.sub_op) {
+                           PL_sublex_info.sub_op->op_private |=
+                               (PL_lex_repl ? OPpTRANS_FROM_UTF
+                                            : OPpTRANS_TO_UTF);
+                       }
                     }
                    else {
                        *d++ = (char)uv;
                    }
                }
                else {
-                   *d++ = (char)uv;
+                   *d++ = (char) uv;
                }
                continue;
 
@@ -1484,6 +1542,8 @@ S_scan_const(pTHX_ char *start)
                    res = newSVpvn(s + 1, e - s - 1);
                    res = new_constant( Nullch, 0, "charnames",
                                        res, Nullsv, "\\N{...}" );
+                   if (has_utf8)
+                       sv_utf8_upgrade(res);
                    str = SvPV(res,len);
                    if (!has_utf8 && SvUTF8(res)) {
                        char *ostart = SvPVX(sv);
@@ -1499,7 +1559,7 @@ S_scan_const(pTHX_ char *start)
                    if (len > e - s + 4) {
                        char *odest = SvPVX(sv);
 
-                       SvGROW(sv, (SvCUR(sv) + len - (e - s + 4)));
+                       SvGROW(sv, (SvLEN(sv) + len - (e - s + 4)));
                        d = SvPVX(sv) + (d - odest);
                    }
                    Copy(str, d, len, char);
@@ -1515,66 +1575,79 @@ S_scan_const(pTHX_ char *start)
            /* \c is a control character */
            case 'c':
                s++;
-#ifdef EBCDIC
-               *d = *s++;
-               if (isLOWER(*d))
-                  *d = toUPPER(*d);
-               *d = toCTRL(*d);
-               d++;
-#else
                {
                    U8 c = *s++;
-                   *d++ = toCTRL(c);
-               }
+#ifdef EBCDIC
+                   if (isLOWER(c))
+                       c = toUPPER(c);
 #endif
+                   *d++ = NATIVE_TO_NEED(has_utf8,toCTRL(c));
+               }
                continue;
 
            /* printf-style backslashes, formfeeds, newlines, etc */
            case 'b':
-               *d++ = '\b';
+               *d++ = NATIVE_TO_NEED(has_utf8,'\b');
                break;
            case 'n':
-               *d++ = '\n';
+               *d++ = NATIVE_TO_NEED(has_utf8,'\n');
                break;
            case 'r':
-               *d++ = '\r';
+               *d++ = NATIVE_TO_NEED(has_utf8,'\r');
                break;
            case 'f':
-               *d++ = '\f';
+               *d++ = NATIVE_TO_NEED(has_utf8,'\f');
                break;
            case 't':
-               *d++ = '\t';
+               *d++ = NATIVE_TO_NEED(has_utf8,'\t');
                break;
-#ifdef EBCDIC
            case 'e':
-               *d++ = '\047';  /* CP 1047 */
+               *d++ = ASCII_TO_NEED(has_utf8,'\033');
                break;
            case 'a':
-               *d++ = '\057';  /* CP 1047 */
+               *d++ = ASCII_TO_NEED(has_utf8,'\007');
                break;
-#else
-           case 'e':
-               *d++ = '\033';
-               break;
-           case 'a':
-               *d++ = '\007';
-               break;
-#endif
            } /* end switch */
 
            s++;
            continue;
        } /* end if (backslash) */
 
-       *d++ = *s++;
+    default_action:
+       /* If we started with encoded form, or already know we want it
+          and then encode the next character */
+       if ((has_utf8 || this_utf8) && !NATIVE_IS_INVARIANT((U8)(*s))) {
+           STRLEN len  = 1;
+           UV uv       = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
+           STRLEN need = UNISKIP(NATIVE_TO_UNI(uv));
+           s += len;
+           if (need > len) {
+               /* encoded value larger than old, need extra space (NOTE: SvCUR() not set here) */
+               STRLEN off = d - SvPVX(sv);
+               d = SvGROW(sv, SvLEN(sv) + (need-len)) + off;
+           }
+           d = (char*)uvchr_to_utf8((U8*)d, uv);
+           has_utf8 = TRUE;
+       }
+       else {
+           *d++ = NATIVE_TO_NEED(has_utf8,*s++);
+       }
     } /* while loop to process each character */
 
     /* terminate the string and set up the sv */
     *d = '\0';
     SvCUR_set(sv, d - SvPVX(sv));
+    if (SvCUR(sv) >= SvLEN(sv))
+      Perl_croak(aTHX_ "panic: constant overflowed allocated space");
+
     SvPOK_on(sv);
-    if (has_utf8)
+    if (has_utf8) {
        SvUTF8_on(sv);
+       if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
+               PL_sublex_info.sub_op->op_private |=
+                   (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
+       }
+    }
 
     /* shrink the sv if we allocated more than we used */
     if (SvCUR(sv) + 5 < SvLEN(sv)) {
@@ -2058,23 +2131,21 @@ S_find_in_my_stash(pTHX_ char *pkgname, I32 len)
 */
 
 #ifdef USE_PURE_BISON
-#ifdef __SC__
-#pragma segment Perl_yylex_r
-#endif
 int
 Perl_yylex_r(pTHX_ YYSTYPE *lvalp, int *lcharp)
 {
     int r;
 
+    yyactlevel++;
     yylval_pointer[yyactlevel] = lvalp;
     yychar_pointer[yyactlevel] = lcharp;
-    yyactlevel++;
     if (yyactlevel >= YYMAXLEVEL)
        Perl_croak(aTHX_ "panic: YYMAXLEVEL");
 
     r = Perl_yylex(aTHX);
 
-    yyactlevel--;
+    if (yyactlevel > 0)
+       yyactlevel--;
 
     return r;
 }
@@ -2083,13 +2154,8 @@ Perl_yylex_r(pTHX_ YYSTYPE *lvalp, int *lcharp)
 #ifdef __SC__
 #pragma segment Perl_yylex
 #endif
-
 int
-#ifdef USE_PURE_BISON
-Perl_yylex(pTHX_ YYSTYPE *lvalp, int *lcharp)
-#else
 Perl_yylex(pTHX)
-#endif
 {
     register char *s;
     register char *d;
@@ -2097,6 +2163,7 @@ Perl_yylex(pTHX)
     STRLEN len;
     GV *gv = Nullgv;
     GV **gvp = 0;
+    bool bof = FALSE;
 
     /* check if there's an identifier for us to look at */
     if (PL_pending_ident) {
@@ -2105,7 +2172,7 @@ Perl_yylex(pTHX)
        PL_pending_ident = 0;
 
        DEBUG_T({ PerlIO_printf(Perl_debug_log,
-              "### Tokener saw identifier '%s'\n", PL_tokenbuf); })
+              "### Tokener saw identifier '%s'\n", PL_tokenbuf); });
 
        /* if we're in a my(), we can't allow dynamics here.
           $foo'bar has already been turned into $foo::bar, so
@@ -2246,7 +2313,7 @@ Perl_yylex(pTHX)
        }
        DEBUG_T({ PerlIO_printf(Perl_debug_log,
               "### Next token after '%s' was known, type %"IVdf"\n", PL_bufptr,
-              (IV)PL_nexttype[PL_nexttoke]); })
+              (IV)PL_nexttype[PL_nexttoke]); });
 
        return(PL_nexttype[PL_nexttoke]);
 
@@ -2280,7 +2347,7 @@ Perl_yylex(pTHX)
        }
        else {
            DEBUG_T({ PerlIO_printf(Perl_debug_log,
-              "### Saw case modifier at '%s'\n", PL_bufptr); })
+              "### Saw case modifier at '%s'\n", PL_bufptr); });
            s = PL_bufptr + 1;
            if (strnEQ(s, "L\\u", 3) || strnEQ(s, "U\\l", 3))
                tmp = *s, *s = s[2], s[2] = tmp;        /* misordered... */
@@ -2332,7 +2399,7 @@ Perl_yylex(pTHX)
        if (PL_bufptr == PL_bufend)
            return sublex_done();
        DEBUG_T({ PerlIO_printf(Perl_debug_log,
-              "### Interpolated variable at '%s'\n", PL_bufptr); })
+              "### Interpolated variable at '%s'\n", PL_bufptr); });
        PL_expect = XTERM;
        PL_lex_dojoin = (*PL_bufptr == '@');
        PL_lex_state = LEX_INTERPNORMAL;
@@ -2432,7 +2499,7 @@ Perl_yylex(pTHX)
     DEBUG_T( {
        PerlIO_printf(Perl_debug_log, "### Tokener expecting %s at %s\n",
                      exp_name[PL_expect], s);
-    } )
+    } );
 
   retry:
     switch (*s) {
@@ -2451,7 +2518,7 @@ Perl_yylex(pTHX)
                yyerror("Missing right curly or square bracket");
             DEBUG_T( { PerlIO_printf(Perl_debug_log,
                         "### Tokener got EOF\n");
-            } )
+            } );
            TOKEN(0);
        }
        if (s++ < PL_bufend)
@@ -2507,6 +2574,7 @@ Perl_yylex(pTHX)
            sv_catpv(PL_linestr, "\n");
            PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
            PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
+           PL_last_lop = PL_last_uni = Nullch;
            if (PERLDB_LINE && PL_curstash != PL_debstash) {
                SV *sv = NEWSV(85,0);
 
@@ -2517,8 +2585,35 @@ Perl_yylex(pTHX)
            goto retry;
        }
        do {
-           bool bof = PL_rsfp ? TRUE : FALSE;
-           if (bof) {
+           bof = PL_rsfp ? TRUE : FALSE;
+           if ((s = filter_gets(PL_linestr, PL_rsfp, 0)) == Nullch) {
+             fake_eof:
+               if (PL_rsfp) {
+                   if (PL_preprocess && !PL_in_eval)
+                       (void)PerlProc_pclose(PL_rsfp);
+                   else if ((PerlIO *)PL_rsfp == PerlIO_stdin())
+                       PerlIO_clearerr(PL_rsfp);
+                   else
+                       (void)PerlIO_close(PL_rsfp);
+                   PL_rsfp = Nullfp;
+                   PL_doextract = FALSE;
+               }
+               if (!PL_in_eval && (PL_minus_n || PL_minus_p)) {
+                   sv_setpv(PL_linestr,PL_minus_p ? ";}continue{print" : "");
+                   sv_catpv(PL_linestr,";}");
+                   PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
+                   PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
+                   PL_last_lop = PL_last_uni = Nullch;
+                   PL_minus_n = PL_minus_p = 0;
+                   goto retry;
+               }
+               PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
+               PL_last_lop = PL_last_uni = Nullch;
+               sv_setpv(PL_linestr,"");
+               TOKEN(';');     /* not infinite loop because rsfp is NULL now */
+           }
+           /* if it looks like the start of a BOM, check if it in fact is */
+           else if (bof && (!*s || *(U8*)s == 0xEF || *(U8*)s >= 0xFE)) {
 #ifdef PERLIO_IS_STDIO
 #  ifdef __GNU_LIBRARY__
 #    if __GNU_LIBRARY__ == 1 /* Linux glibc5 */
@@ -2538,38 +2633,14 @@ Perl_yylex(pTHX)
                 * Workaround?  Maybe attach some extra state to PL_rsfp?
                 */
                if (!PL_preprocess)
-                   bof = PerlIO_tell(PL_rsfp) == 0;
+                   bof = PerlIO_tell(PL_rsfp) == SvCUR(PL_linestr);
 #else
-               bof = PerlIO_tell(PL_rsfp) == 0;
+               bof = PerlIO_tell(PL_rsfp) == SvCUR(PL_linestr);
 #endif
-           }
-           s = filter_gets(PL_linestr, PL_rsfp, 0);
-           if (s == Nullch) {
-             fake_eof:
-               if (PL_rsfp) {
-                   if (PL_preprocess && !PL_in_eval)
-                       (void)PerlProc_pclose(PL_rsfp);
-                   else if ((PerlIO *)PL_rsfp == PerlIO_stdin())
-                       PerlIO_clearerr(PL_rsfp);
-                   else
-                       (void)PerlIO_close(PL_rsfp);
-                   PL_rsfp = Nullfp;
-                   PL_doextract = FALSE;
-               }
-               if (!PL_in_eval && (PL_minus_n || PL_minus_p)) {
-                   sv_setpv(PL_linestr,PL_minus_p ? ";}continue{print" : "");
-                   sv_catpv(PL_linestr,";}");
-                   PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
+               if (bof) {
                    PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
-                   PL_minus_n = PL_minus_p = 0;
-                   goto retry;
+                   s = swallow_bom((U8*)s);
                }
-               PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
-               sv_setpv(PL_linestr,"");
-               TOKEN(';');     /* not infinite loop because rsfp is NULL now */
-           } else if (bof) {
-               PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
-               s = swallow_bom((U8*)s);
            }
            if (PL_doextract) {
                if (*s == '#' && s[1] == '!' && instr(s,"perl"))
@@ -2580,6 +2651,7 @@ Perl_yylex(pTHX)
                    sv_setpv(PL_linestr, "");
                    PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
                    PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
+                   PL_last_lop = PL_last_uni = Nullch;
                    PL_doextract = FALSE;
                }
            }
@@ -2594,6 +2666,7 @@ Perl_yylex(pTHX)
            av_store(CopFILEAV(PL_curcop),(I32)CopLINE(PL_curcop),sv);
        }
        PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
+       PL_last_lop = PL_last_uni = Nullch;
        if (CopLINE(PL_curcop) == 1) {
            while (s < PL_bufend && isSPACE(*s))
                s++;
@@ -2737,6 +2810,7 @@ Perl_yylex(pTHX)
                            sv_setpv(PL_linestr, "");
                            PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
                            PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
+                           PL_last_lop = PL_last_uni = Nullch;
                            PL_preambled = FALSE;
                            if (PERLDB_LINE)
                                (void)gv_fetchfile(PL_origfilename);
@@ -2777,6 +2851,8 @@ Perl_yylex(pTHX)
                s++;
            if (s < d)
                s++;
+           else if (s > d) /* Found by Ilya: feed random input to Perl. */
+             Perl_croak(aTHX_ "panic: input overflow");
            incline(s);
            if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
                PL_bufptr = s;
@@ -2804,7 +2880,7 @@ Perl_yylex(pTHX)
                s = force_word(PL_bufptr,WORD,FALSE,FALSE,FALSE);
                 DEBUG_T( { PerlIO_printf(Perl_debug_log,
                             "### Saw unary minus before =>, forcing word '%s'\n", s);
-                } )
+                } );
                OPERATOR('-');          /* unary minus */
            }
            PL_last_uni = PL_oldbufptr;
@@ -2848,16 +2924,17 @@ Perl_yylex(pTHX)
            if (ftst) {
                PL_last_lop_op = ftst;
                DEBUG_T( { PerlIO_printf(Perl_debug_log,
-                        "### Saw file test %c\n", ftst);
-               } )
+                        "### Saw file test %c\n", (int)ftst);
+               } );
                FTST(ftst);
            }
            else {
                /* Assume it was a minus followed by a one-letter named
                 * subroutine call (or a -bareword), then. */
                DEBUG_T( { PerlIO_printf(Perl_debug_log,
-                        "### %c looked like a file test but was not\n", ftst);
-               } )
+                       "### %c looked like a file test but was not\n",
+                       (int)ftst);
+               } );
                s -= 2;
            }
        }
@@ -2988,10 +3065,6 @@ Perl_yylex(pTHX)
                if (*d == '(') {
                    d = scan_str(d,TRUE,TRUE);
                    if (!d) {
-                       if (PL_lex_stuff) {
-                           SvREFCNT_dec(PL_lex_stuff);
-                           PL_lex_stuff = Nullsv;
-                       }
                        /* MUST advance bufptr here to avoid bogus
                           "at end of line" context messages from yyerror().
                         */
@@ -3011,9 +3084,25 @@ Perl_yylex(pTHX)
                    PL_lex_stuff = Nullsv;
                }
                else {
-                   attrs = append_elem(OP_LIST, attrs,
-                                       newSVOP(OP_CONST, 0,
-                                               newSVpvn(s, len)));
+                   if (!PL_in_my && len == 6 && strnEQ(s, "lvalue", len))
+                       CvLVALUE_on(PL_compcv);
+                   else if (!PL_in_my && len == 6 && strnEQ(s, "locked", len))
+                       CvLOCKED_on(PL_compcv);
+                   else if (!PL_in_my && len == 6 && strnEQ(s, "method", len))
+                       CvMETHOD_on(PL_compcv);
+#ifdef USE_ITHREADS
+                   else if (PL_in_my == KEY_our && len == 6 && strnEQ(s, "shared", len))
+                       GvSHARED_on(cGVOPx_gv(yylval.opval));
+#endif
+                   /* After we've set the flags, it could be argued that
+                      we don't need to do the attributes.pm-based setting
+                      process, and shouldn't bother appending recognized
+                      flags. To experiment with that, uncomment the
+                      following "else": */
+                   else
+                       attrs = append_elem(OP_LIST, attrs,
+                                           newSVOP(OP_CONST, 0,
+                                                   newSVpvn(s, len)));
                }
                s = skipspace(d);
                if (*s == ':' && s[1] != ':')
@@ -3142,8 +3231,16 @@ Perl_yylex(pTHX)
                else
                    PL_lex_brackstack[PL_lex_brackets++] = XOPERATOR;
                s = skipspace(s);
-               if (*s == '}')
+               if (*s == '}') {
+                   if (PL_expect == XREF && PL_lex_state == LEX_INTERPNORMAL) {
+                       PL_expect = XTERM;
+                       /* This hack is to get the ${} in the message. */
+                       PL_bufptr = s+1;
+                       yyerror("syntax error");
+                       break;
+                   }
                    OPERATOR(HASHBRACK);
+               }
                /* This hack serves to disambiguate a pair of curlies
                 * as being a block or an anon hash.  Normally, expectation
                 * determines that, but in cases where we're not in a
@@ -3605,7 +3702,7 @@ Perl_yylex(pTHX)
        s = scan_num(s, &yylval);
         DEBUG_T( { PerlIO_printf(Perl_debug_log,
                     "### Saw number in '%s'\n", s);
-        } )
+        } );
        if (PL_expect == XOPERATOR)
            no_op("Number",s);
        TERM(THING);
@@ -3613,8 +3710,8 @@ Perl_yylex(pTHX)
     case '\'':
        s = scan_str(s,FALSE,FALSE);
         DEBUG_T( { PerlIO_printf(Perl_debug_log,
-                    "### Saw string in '%s'\n", s);
-        } )
+                    "### Saw string before '%s'\n", s);
+        } );
        if (PL_expect == XOPERATOR) {
            if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) {
                PL_expect = XTERM;
@@ -3632,8 +3729,8 @@ Perl_yylex(pTHX)
     case '"':
        s = scan_str(s,FALSE,FALSE);
         DEBUG_T( { PerlIO_printf(Perl_debug_log,
-                    "### Saw string in '%s'\n", s);
-        } )
+                    "### Saw string before '%s'\n", s);
+        } );
        if (PL_expect == XOPERATOR) {
            if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) {
                PL_expect = XTERM;
@@ -3647,7 +3744,7 @@ Perl_yylex(pTHX)
            missingterm((char*)0);
        yylval.ival = OP_CONST;
        for (d = SvPV(PL_lex_stuff, len); len; len--, d++) {
-           if (*d == '$' || *d == '@' || *d == '\\' || *d & 0x80) {
+           if (*d == '$' || *d == '@' || *d == '\\' || !UTF8_IS_INVARIANT((U8)*d)) {
                yylval.ival = OP_STRINGIFY;
                break;
            }
@@ -3657,8 +3754,8 @@ Perl_yylex(pTHX)
     case '`':
        s = scan_str(s,FALSE,FALSE);
         DEBUG_T( { PerlIO_printf(Perl_debug_log,
-                    "### Saw backtick string in '%s'\n", s);
-        } )
+                    "### Saw backtick string before '%s'\n", s);
+        } );
        if (PL_expect == XOPERATOR)
            no_op("Backticks",s);
        if (!s)
@@ -3688,7 +3785,7 @@ Perl_yylex(pTHX)
                TERM(THING);
            }
            /* avoid v123abc() or $h{v1}, allow C<print v10;> */
-           else if (!isALPHA(*start) && (PL_expect == XTERM || PL_expect == XREF)) {
+           else if (!isALPHA(*start) && (PL_expect == XTERM || PL_expect == XREF || PL_expect == XSTATE)) {
                char c = *start;
                GV *gv;
                *start = '\0';
@@ -3774,6 +3871,8 @@ Perl_yylex(pTHX)
            CLINE;
            yylval.opval = (OP*)newSVOP(OP_CONST, 0, newSVpv(PL_tokenbuf,0));
            yylval.opval->op_private = OPpCONST_BARE;
+           if (UTF && !IN_BYTES && is_utf8_string((U8*)PL_tokenbuf, len))
+             SvUTF8_on(((SVOP*)yylval.opval)->op_sv);
            TERM(WORD);
        }
 
@@ -3915,10 +4014,10 @@ Perl_yylex(pTHX)
                    /* If not a declared subroutine, it's an indirect object. */
                    /* (But it's an indir obj regardless for sort.) */
 
-                   if ((PL_last_lop_op == OP_SORT ||
-                         (!immediate_paren && (!gv || !GvCVu(gv)))) &&
+                   if ( !immediate_paren && (PL_last_lop_op == OP_SORT ||
+                         ((!gv || !GvCVu(gv)) &&
                         (PL_last_lop_op != OP_MAPSTART &&
-                        PL_last_lop_op != OP_GREPSTART))
+                        PL_last_lop_op != OP_GREPSTART))))
                    {
                        PL_expect = (PL_last_lop == PL_oldoldbufptr) ? XTERM : XOPERATOR;
                        goto bareword;
@@ -3933,6 +4032,8 @@ Perl_yylex(pTHX)
                if (*s == '=' && s[1] == '>') {
                    CLINE;
                    sv_setpv(((SVOP*)yylval.opval)->op_sv, PL_tokenbuf);
+                   if (UTF && !IN_BYTES && is_utf8_string((U8*)PL_tokenbuf, len))
+                     SvUTF8_on(((SVOP*)yylval.opval)->op_sv);
                    TERM(WORD);
                }
 
@@ -4094,16 +4195,26 @@ Perl_yylex(pTHX)
                        loc = PerlIO_tell(PL_rsfp);
                        (void)PerlIO_seek(PL_rsfp, 0L, 0);
                    }
+#ifdef NETWARE
+                       if (PerlLIO_setmode(PL_rsfp, O_TEXT) != -1) {
+#else
                    if (PerlLIO_setmode(PerlIO_fileno(PL_rsfp), O_TEXT) != -1) {
-#if defined(__BORLANDC__)
+#endif /* NETWARE */
+#ifdef PERLIO_IS_STDIO /* really? */
+#  if defined(__BORLANDC__)
                        /* XXX see note in do_binmode() */
-                       ((FILE*)PL_rsfp)->flags |= _F_BIN;
+                       ((FILE*)PL_rsfp)->flags &= ~_F_BIN;
+#  endif
 #endif
                        if (loc > 0)
                            PerlIO_seek(PL_rsfp, loc, 0);
                    }
                }
 #endif
+#ifdef PERLIO_LAYERS
+               if (UTF && !IN_BYTES)
+                   PerlIO_apply_layers(aTHX_ PL_rsfp, NULL, ":utf8");
+#endif
                PL_rsfp = Nullfp;
            }
            goto fake_eof;
@@ -4634,6 +4745,7 @@ Perl_yylex(pTHX)
                int warned = 0;
                d = SvPV_force(PL_lex_stuff, len);
                while (len) {
+                   SV *sv;
                    for (; isSPACE(*d) && len; --len, ++d) ;
                    if (len) {
                        char *b = d;
@@ -4654,8 +4766,11 @@ Perl_yylex(pTHX)
                        else {
                            for (; !isSPACE(*d) && len; --len, ++d) ;
                        }
+                       sv = newSVpvn(b, d-b);
+                       if (DO_UTF8(PL_lex_stuff))
+                           SvUTF8_on(sv);
                        words = append_elem(OP_LIST, words,
-                                           newSVOP(OP_CONST, 0, tokeq(newSVpvn(b, d-b))));
+                                           newSVOP(OP_CONST, 0, tokeq(sv)));
                    }
                }
                if (words) {
@@ -4663,9 +4778,10 @@ Perl_yylex(pTHX)
                    force_next(THING);
                }
            }
-           if (PL_lex_stuff)
+           if (PL_lex_stuff) {
                SvREFCNT_dec(PL_lex_stuff);
-           PL_lex_stuff = Nullsv;
+               PL_lex_stuff = Nullsv;
+           }
            PL_expect = XTERM;
            TOKEN('(');
 
@@ -4885,7 +5001,7 @@ Perl_yylex(pTHX)
          really_sub:
            {
                char tmpbuf[sizeof PL_tokenbuf];
-               SSize_t tboffset;
+               SSize_t tboffset = 0;
                expectation attrful;
                bool have_name, have_proto;
                int key = tmp;
@@ -4933,12 +5049,8 @@ Perl_yylex(pTHX)
                    char *p;
 
                    s = scan_str(s,FALSE,FALSE);
-                   if (!s) {
-                       if (PL_lex_stuff)
-                           SvREFCNT_dec(PL_lex_stuff);
-                       PL_lex_stuff = Nullsv;
+                   if (!s)
                        Perl_croak(aTHX_ "Prototype not terminated");
-                   }
                    /* strip spaces */
                    d = SvPVX(PL_lex_stuff);
                    tmp = 0;
@@ -5533,7 +5645,7 @@ Perl_keyword(pTHX_ register char *d, I32 len)
            if (strEQ(d,"rindex"))              return -KEY_rindex;
            break;
        case 7:
-           if (strEQ(d,"require"))             return -KEY_require;
+           if (strEQ(d,"require"))             return KEY_require;
            if (strEQ(d,"reverse"))             return -KEY_reverse;
            if (strEQ(d,"readdir"))             return -KEY_readdir;
            break;
@@ -5906,9 +6018,9 @@ S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_packag
            *d++ = *s++;
            *d++ = *s++;
        }
-       else if (UTF && *(U8*)s >= 0xc0 && isALNUM_utf8((U8*)s)) {
+       else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) {
            char *t = s + UTF8SKIP(s);
-           while (*t & 0x80 && is_utf8_mark((U8*)t))
+           while (UTF8_IS_CONTINUED(*t) && is_utf8_mark((U8*)t))
                t += UTF8SKIP(t);
            if (d + (t - s) > e)
                Perl_croak(aTHX_ ident_too_long);
@@ -5958,9 +6070,9 @@ S_scan_ident(pTHX_ register char *s, register char *send, char *dest, STRLEN des
                *d++ = *s++;
                *d++ = *s++;
            }
-           else if (UTF && *(U8*)s >= 0xc0 && isALNUM_utf8((U8*)s)) {
+           else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) {
                char *t = s + UTF8SKIP(s);
-               while (*t & 0x80 && is_utf8_mark((U8*)t))
+               while (UTF8_IS_CONTINUED(*t) && is_utf8_mark((U8*)t))
                    t += UTF8SKIP(t);
                if (d + (t - s) > e)
                    Perl_croak(aTHX_ ident_too_long);
@@ -6013,7 +6125,7 @@ S_scan_ident(pTHX_ register char *s, register char *send, char *dest, STRLEN des
                e = s;
                while ((e < send && isALNUM_lazy_if(e,UTF)) || *e == ':') {
                    e += UTF8SKIP(e);
-                   while (e < send && *e & 0x80 && is_utf8_mark((U8*)e))
+                   while (e < send && UTF8_IS_CONTINUED(*e) && is_utf8_mark((U8*)e))
                        e += UTF8SKIP(e);
                }
                Copy(s, d, e - s, char);
@@ -6105,12 +6217,8 @@ S_scan_pat(pTHX_ char *start, I32 type)
     char *s;
 
     s = scan_str(start,FALSE,FALSE);
-    if (!s) {
-       if (PL_lex_stuff)
-           SvREFCNT_dec(PL_lex_stuff);
-       PL_lex_stuff = Nullsv;
+    if (!s)
        Perl_croak(aTHX_ "Search pattern not terminated");
-    }
 
     pm = (PMOP*)newPMOP(type, 0);
     if (PL_multi_open == '?')
@@ -6142,12 +6250,8 @@ S_scan_subst(pTHX_ char *start)
 
     s = scan_str(start,FALSE,FALSE);
 
-    if (!s) {
-       if (PL_lex_stuff)
-           SvREFCNT_dec(PL_lex_stuff);
-       PL_lex_stuff = Nullsv;
+    if (!s)
        Perl_croak(aTHX_ "Substitution pattern not terminated");
-    }
 
     if (s[-1] == PL_multi_open)
        s--;
@@ -6155,12 +6259,10 @@ S_scan_subst(pTHX_ char *start)
     first_start = PL_multi_start;
     s = scan_str(s,FALSE,FALSE);
     if (!s) {
-       if (PL_lex_stuff)
+       if (PL_lex_stuff) {
            SvREFCNT_dec(PL_lex_stuff);
-       PL_lex_stuff = Nullsv;
-       if (PL_lex_repl)
-           SvREFCNT_dec(PL_lex_repl);
-       PL_lex_repl = Nullsv;
+           PL_lex_stuff = Nullsv;
+       }
        Perl_croak(aTHX_ "Substitution replacement not terminated");
     }
     PL_multi_start = first_start;      /* so whole substitution is taken together */
@@ -6209,35 +6311,24 @@ S_scan_trans(pTHX_ char *start)
     I32 squash;
     I32 del;
     I32 complement;
-    I32 utf8;
-    I32 count = 0;
 
     yylval.ival = OP_NULL;
 
     s = scan_str(start,FALSE,FALSE);
-    if (!s) {
-       if (PL_lex_stuff)
-           SvREFCNT_dec(PL_lex_stuff);
-       PL_lex_stuff = Nullsv;
+    if (!s)
        Perl_croak(aTHX_ "Transliteration pattern not terminated");
-    }
     if (s[-1] == PL_multi_open)
        s--;
 
     s = scan_str(s,FALSE,FALSE);
     if (!s) {
-       if (PL_lex_stuff)
+       if (PL_lex_stuff) {
            SvREFCNT_dec(PL_lex_stuff);
-       PL_lex_stuff = Nullsv;
-       if (PL_lex_repl)
-           SvREFCNT_dec(PL_lex_repl);
-       PL_lex_repl = Nullsv;
+           PL_lex_stuff = Nullsv;
+       }
        Perl_croak(aTHX_ "Transliteration replacement not terminated");
     }
 
-    New(803,tbl,256,short);
-    o = newPVOP(OP_TRANS, 0, (char*)tbl);
-
     complement = del = squash = 0;
     while (strchr("cds", *s)) {
        if (*s == 'c')
@@ -6248,7 +6339,12 @@ S_scan_trans(pTHX_ char *start)
            squash = OPpTRANS_SQUASH;
        s++;
     }
-    o->op_private = del|squash|complement;
+
+    New(803, tbl, complement&&!del?258:256, short);
+    o = newPVOP(OP_TRANS, 0, (char*)tbl);
+    o->op_private = del|squash|complement|
+      (DO_UTF8(PL_lex_stuff)? OPpTRANS_FROM_UTF : 0)|
+      (DO_UTF8(PL_lex_repl) ? OPpTRANS_TO_UTF   : 0);
 
     PL_lex_op = o;
     yylval.ival = OP_TRANS;
@@ -6390,6 +6486,7 @@ S_scan_heredoc(pTHX_ register char *s)
        sv_setsv(PL_linestr,herewas);
        PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = s = PL_linestart = SvPVX(PL_linestr);
        PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
+       PL_last_lop = PL_last_uni = Nullch;
     }
     else
        sv_setpvn(tmpstr,"",0);   /* avoid "uninitialized" warning */
@@ -6401,6 +6498,7 @@ S_scan_heredoc(pTHX_ register char *s)
        }
        CopLINE_inc(PL_curcop);
        PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
+       PL_last_lop = PL_last_uni = Nullch;
 #ifndef PERL_STRICT_CR
        if (PL_bufend - PL_linestart >= 2) {
            if ((PL_bufend[-2] == '\r' && PL_bufend[-1] == '\n') ||
@@ -6442,6 +6540,8 @@ retval:
        Renew(SvPVX(tmpstr), SvLEN(tmpstr), char);
     }
     SvREFCNT_dec(herewas);
+    if (UTF && !IN_BYTES && is_utf8_string((U8*)SvPVX(tmpstr), SvCUR(tmpstr)))
+       SvUTF8_on(tmpstr);
     PL_lex_stuff = tmpstr;
     yylval.ival = op_type;
     return s;
@@ -6599,11 +6699,11 @@ S_scan_inputsymbol(pTHX_ char *start)
    delimiter.  It allows quoting of delimiters, and if the string has
    balanced delimiters ([{<>}]) it allows nesting.
 
-   The lexer always reads these strings into lex_stuff, except in the
-   case of the operators which take *two* arguments (s/// and tr///)
-   when it checks to see if lex_stuff is full (presumably with the 1st
-   arg to s or tr) and if so puts the string into lex_repl.
-
+   On success, the SV with the resulting string is put into lex_stuff or,
+   if that is already non-NULL, into lex_repl. The second case occurs only
+   when parsing the RHS of the special constructs s/// and tr/// (y///).
+   For convenience, the terminating delimiter character is stuffed into
+   SvIVX of the SV.
 */
 
 STATIC char *
@@ -6626,7 +6726,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
 
     /* after skipping whitespace, the next character is the terminator */
     term = *s;
-    if ((term & 0x80) && UTF)
+    if (!UTF8_IS_INVARIANT((U8)term) && UTF)
        has_utf8 = TRUE;
 
     /* mark where we are */
@@ -6673,7 +6773,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
                   have found the terminator */
                else if (*s == term)
                    break;
-               else if (!has_utf8 && (*s & 0x80) && UTF)
+               else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF)
                    has_utf8 = TRUE;
                *to = *s;
            }
@@ -6702,7 +6802,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
                    break;
                else if (*s == PL_multi_open)
                    brackets++;
-               else if (!has_utf8 && (*s & 0x80) && UTF)
+               else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF)
                    has_utf8 = TRUE;
                *to = *s;
            }
@@ -6757,6 +6857,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
 
        /* having changed the buffer, we must update PL_bufend */
        PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
+       PL_last_lop = PL_last_uni = Nullch;
     }
 
     /* at this point, we have successfully read the delimited string */
@@ -6793,11 +6894,11 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
 
   Read a number in any of the formats that Perl accepts:
 
-  0(x[0-7A-F]+)|([0-7]+)|(b[01])
-  [\d_]+(\.[\d_]*)?[Ee](\d+)
-
-  Underbars (_) are allowed in decimal numbers.  If -w is on,
-  underbars before a decimal point must be at three digit intervals.
+  \d(_?\d)*(\.(\d(_?\d)*)?)?[Ee][\+\-]?(\d(_?\d)*)     12 12.34 12.
+  \.\d(_?\d)*[Ee][\+\-]?(\d(_?\d)*)                    .34
+  0b[01](_?[01])*
+  0[0-7](_?[0-7])*
+  0x[0-9A-Fa-f](_?[0-9A-Fa-f])*
 
   Like most scan_ routines, it uses the PL_tokenbuf buffer to hold the
   thing it reads.
@@ -6867,8 +6968,17 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp)
            else if (s[1] == '.' || s[1] == 'e' || s[1] == 'E')
                goto decimal;
            /* so it must be octal */
-           else
+           else {
                shift = 3;
+               s++;
+           }
+
+           if (*s == '_') {
+              if (ckWARN(WARN_SYNTAX))
+                  Perl_warner(aTHX_ WARN_SYNTAX,
+                              "Misplaced _ in number");
+              lastub = s++;
+           }
 
            base = bases[shift];
            Base = Bases[shift];
@@ -6886,9 +6996,12 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp)
                default:
                    goto out;
 
-               /* _ are ignored */
+               /* _ are ignored -- but warned about if consecutive */
                case '_':
-                   s++;
+                   if (ckWARN(WARN_SYNTAX) && lastub && s == lastub + 1)
+                       Perl_warner(aTHX_ WARN_SYNTAX,
+                                   "Misplaced _ in number");
+                   lastub = s++;
                    break;
 
                /* 8 and 9 are not octal */
@@ -6955,6 +7068,13 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp)
             the number.
          */
          out:
+
+           /* final misplaced underbar check */
+           if (s[-1] == '_') {
+               if (ckWARN(WARN_SYNTAX))
+                   Perl_warner(aTHX_ WARN_SYNTAX, "Misplaced _ in number");
+           }
+
            sv = NEWSV(92,0);
            if (overflowed) {
                if (ckWARN(WARN_PORTABLE) && n > 4294967295.0)
@@ -6994,9 +7114,10 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp)
               if -w is on
            */
            if (*s == '_') {
-               if (ckWARN(WARN_SYNTAX) && lastub && s - lastub != 3)
-                   Perl_warner(aTHX_ WARN_SYNTAX, "Misplaced _ in number");
-               lastub = ++s;
+               if (ckWARN(WARN_SYNTAX) && lastub && s == lastub + 1)
+                   Perl_warner(aTHX_ WARN_SYNTAX,
+                               "Misplaced _ in number");
+               lastub = s++;
            }
            else {
                /* check for end of fixed-length buffer */
@@ -7008,7 +7129,7 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp)
        }
 
        /* final misplaced underbar check */
-       if (lastub && s - lastub != 3) {
+       if (lastub && s == lastub + 1) {
            if (ckWARN(WARN_SYNTAX))
                Perl_warner(aTHX_ WARN_SYNTAX, "Misplaced _ in number");
        }
@@ -7021,16 +7142,34 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp)
            floatit = TRUE;
            *d++ = *s++;
 
-           /* copy, ignoring underbars, until we run out of
-              digits.  Note: no misplaced underbar checks!
+           if (*s == '_') {
+               if (ckWARN(WARN_SYNTAX))
+                   Perl_warner(aTHX_ WARN_SYNTAX,
+                               "Misplaced _ in number");
+               lastub = s;
+           }
+
+           /* copy, ignoring underbars, until we run out of digits.
            */
            for (; isDIGIT(*s) || *s == '_'; s++) {
                /* fixed length buffer check */
                if (d >= e)
                    Perl_croak(aTHX_ number_too_long);
-               if (*s != '_')
+               if (*s == '_') {
+                  if (ckWARN(WARN_SYNTAX) && lastub && s == lastub + 1)
+                      Perl_warner(aTHX_ WARN_SYNTAX,
+                                  "Misplaced _ in number");
+                  lastub = s;
+               }
+               else
                    *d++ = *s;
            }
+           /* fractional part ending in underbar? */
+           if (s[-1] == '_') {
+               if (ckWARN(WARN_SYNTAX))
+                   Perl_warner(aTHX_ WARN_SYNTAX,
+                               "Misplaced _ in number");
+           }
            if (*s == '.' && isDIGIT(s[1])) {
                /* oops, it's really a v-string, but without the "v" */
                s = start - 1;
@@ -7039,22 +7178,48 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp)
        }
 
        /* read exponent part, if present */
-       if (*s && strchr("eE",*s) && strchr("+-0123456789",s[1])) {
+       if (*s && strchr("eE",*s) && strchr("+-0123456789_", s[1])) {
            floatit = TRUE;
            s++;
 
            /* regardless of whether user said 3E5 or 3e5, use lower 'e' */
            *d++ = 'e';         /* At least some Mach atof()s don't grok 'E' */
 
+           /* stray preinitial _ */
+           if (*s == '_') {
+               if (ckWARN(WARN_SYNTAX))
+                   Perl_warner(aTHX_ WARN_SYNTAX,
+                               "Misplaced _ in number");
+               lastub = s++;
+           }
+
            /* allow positive or negative exponent */
            if (*s == '+' || *s == '-')
                *d++ = *s++;
 
-           /* read digits of exponent (no underbars :-) */
-           while (isDIGIT(*s)) {
-               if (d >= e)
-                   Perl_croak(aTHX_ number_too_long);
-               *d++ = *s++;
+           /* stray initial _ */
+           if (*s == '_') {
+               if (ckWARN(WARN_SYNTAX))
+                   Perl_warner(aTHX_ WARN_SYNTAX,
+                               "Misplaced _ in number");
+               lastub = s++;
+           }
+
+           /* read digits of exponent */
+           while (isDIGIT(*s) || *s == '_') {
+               if (isDIGIT(*s)) {
+                   if (d >= e)
+                       Perl_croak(aTHX_ number_too_long);
+                   *d++ = *s++;
+               }
+               else {
+                  if (ckWARN(WARN_SYNTAX) &&
+                      ((lastub && s == lastub + 1) ||
+                       (!isDIGIT(s[1]) && s[1] != '_')))
+                      Perl_warner(aTHX_ WARN_SYNTAX,
+                                  "Misplaced _ in number");
+                  lastub = s++;
+               }
            }
        }
 
@@ -7073,8 +7238,8 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp)
         */
 
        if (!floatit) {
-           IV iv;
-           UV uv;
+           IV iv = 0;
+           UV uv = 0;
            errno = 0;
            if (*PL_tokenbuf == '-')
                iv = Strtol(PL_tokenbuf, (char**)NULL, 10);
@@ -7162,7 +7327,6 @@ vstring:
                UV rev;
                U8 tmpbuf[UTF8_MAXLEN+1];
                U8 *tmpend;
-               bool utf8 = FALSE;
                s++;                            /* get past 'v' */
 
                sv = NEWSV(92,5);
@@ -7188,9 +7352,11 @@ vstring:
                                            "Integer overflow in decimal number");
                        }
                    }
-                   tmpend = uv_to_utf8(tmpbuf, rev);
-                   utf8 = utf8 || rev > 127;
+                   /* Append native character for the rev point */
+                   tmpend = uvchr_to_utf8(tmpbuf, rev);
                    sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
+                   if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(rev)))
+                       SvUTF8_on(sv);
                    if (*pos == '.' && isDIGIT(pos[1]))
                        s = ++pos;
                    else {
@@ -7200,13 +7366,8 @@ vstring:
                    while (isDIGIT(*pos) || *pos == '_')
                        pos++;
                }
-
                SvPOK_on(sv);
                SvREADONLY_on(sv);
-               if (utf8) {
-                   SvUTF8_on(sv);
-                   sv_utf8_downgrade(sv, TRUE);
-               }
            }
        }
        break;
@@ -7272,6 +7433,7 @@ S_scan_formline(pTHX_ register char *s)
            s = filter_gets(PL_linestr, PL_rsfp, 0);
            PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = PL_linestart = SvPVX(PL_linestr);
            PL_bufend = PL_bufptr + SvCUR(PL_linestr);
+           PL_last_lop = PL_last_uni = Nullch;
            if (!s) {
                s = PL_bufptr;
                yyerror("Format not terminated");
@@ -7367,6 +7529,9 @@ Perl_start_subparse(pTHX_ I32 is_format, U32 flags)
     return oldsavestack_ix;
 }
 
+#ifdef __SC__
+#pragma segment Perl_yylex
+#endif
 int
 Perl_yywarn(pTHX_ char *s)
 {
@@ -7455,6 +7620,9 @@ Perl_yyerror(pTHX_ char *s)
     PL_in_my_stash = Nullhv;
     return 0;
 }
+#ifdef __SC__
+#pragma segment Main
+#endif
 
 STATIC char*
 S_swallow_bom(pTHX_ U8 *s)