X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/6f3cc9b8480ec4a90c106da77b8c428003fdfdb8..ea035a6903a44a6856d789d8a40f47888278b7bb:/toke.c diff --git a/toke.c b/toke.c index 2cb6407..00fe0b5 100644 --- a/toke.c +++ b/toke.c @@ -36,8 +36,12 @@ static I32 utf16rev_textfilter(pTHXo_ int idx, SV *sv, int maxlen); #define XFAKEBRACK 128 #define XENUMMASK 127 -/*#define UTF (SvUTF8(PL_linestr) && !(PL_hints & HINT_BYTE))*/ -#define UTF (PL_hints & HINT_UTF8) +#ifdef EBCDIC +/* For now 'use utf8' does not affect tokenizer on EBCDIC */ +#define UTF (PL_linestr && DO_UTF8(PL_linestr)) +#else +#define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8)) +#endif /* In variables name $^X, these are the legal values for X. * 1999-02-27 mjd-perl-patch@plover.com */ @@ -126,40 +130,42 @@ int yyactlevel = -1; * Also see LOP and lop() below. */ +/* Note that REPORT() and REPORT2() will be expressions that supply + * their own trailing comma, not suitable for statements as such. */ #ifdef DEBUGGING /* Serve -DT. */ -# define REPORT(x,retval) tokereport(x,s,(int)retval) -# define REPORT2(x,retval) tokereport(x,s, yylval.ival) +# define REPORT(x,retval) tokereport(x,s,(int)retval), +# define REPORT2(x,retval) tokereport(x,s, yylval.ival), #else -# define REPORT(x,retval) 1 -# define REPORT2(x,retval) 1 +# define REPORT(x,retval) +# define REPORT2(x,retval) #endif -#define TOKEN(retval) return (REPORT2("token",retval), PL_bufptr = s,(int)retval) -#define OPERATOR(retval) return (REPORT2("operator",retval), PL_expect = XTERM, PL_bufptr = s,(int)retval) -#define AOPERATOR(retval) return ao((REPORT2("aop",retval), PL_expect = XTERM, PL_bufptr = s,(int)retval)) -#define PREBLOCK(retval) return (REPORT2("preblock",retval), PL_expect = XBLOCK,PL_bufptr = s,(int)retval) -#define PRETERMBLOCK(retval) return (REPORT2("pretermblock",retval), PL_expect = XTERMBLOCK,PL_bufptr = s,(int)retval) -#define PREREF(retval) return (REPORT2("preref",retval), PL_expect = XREF,PL_bufptr = s,(int)retval) -#define TERM(retval) return (CLINE, REPORT2("term",retval), PL_expect = XOPERATOR, PL_bufptr = s,(int)retval) -#define LOOPX(f) return(yylval.ival=f, REPORT("loopx",f), PL_expect = XTERM,PL_bufptr = s,(int)LOOPEX) -#define FTST(f) return(yylval.ival=f, REPORT("ftst",f), PL_expect = XTERM,PL_bufptr = s,(int)UNIOP) -#define FUN0(f) return(yylval.ival = f, REPORT("fun0",f), PL_expect = XOPERATOR,PL_bufptr = s,(int)FUNC0) -#define FUN1(f) return(yylval.ival = f, REPORT("fun1",f), PL_expect = XOPERATOR,PL_bufptr = s,(int)FUNC1) -#define BOop(f) return ao((yylval.ival=f, REPORT("bitorop",f), PL_expect = XTERM,PL_bufptr = s,(int)BITOROP)) -#define BAop(f) return ao((yylval.ival=f, REPORT("bitandop",f), PL_expect = XTERM,PL_bufptr = s,(int)BITANDOP)) -#define SHop(f) return ao((yylval.ival=f, REPORT("shiftop",f), PL_expect = XTERM,PL_bufptr = s,(int)SHIFTOP)) -#define PWop(f) return ao((yylval.ival=f, REPORT("powop",f), PL_expect = XTERM,PL_bufptr = s,(int)POWOP)) -#define PMop(f) return(yylval.ival=f, REPORT("matchop",f), PL_expect = XTERM,PL_bufptr = s,(int)MATCHOP) -#define Aop(f) return ao((yylval.ival=f, REPORT("add",f), PL_expect = XTERM,PL_bufptr = s,(int)ADDOP)) -#define Mop(f) return ao((yylval.ival=f, REPORT("mul",f), PL_expect = XTERM,PL_bufptr = s,(int)MULOP)) -#define Eop(f) return(yylval.ival=f, REPORT("eq",f), PL_expect = XTERM,PL_bufptr = s,(int)EQOP) -#define Rop(f) return(yylval.ival=f, REPORT("rel",f), PL_expect = XTERM,PL_bufptr = s,(int)RELOP) +#define TOKEN(retval) return (REPORT2("token",retval) PL_bufptr = s,(int)retval) +#define OPERATOR(retval) return (REPORT2("operator",retval) PL_expect = XTERM, PL_bufptr = s,(int)retval) +#define AOPERATOR(retval) return ao((REPORT2("aop",retval) PL_expect = XTERM, PL_bufptr = s,(int)retval)) +#define PREBLOCK(retval) return (REPORT2("preblock",retval) PL_expect = XBLOCK,PL_bufptr = s,(int)retval) +#define PRETERMBLOCK(retval) return (REPORT2("pretermblock",retval) PL_expect = XTERMBLOCK,PL_bufptr = s,(int)retval) +#define PREREF(retval) return (REPORT2("preref",retval) PL_expect = XREF,PL_bufptr = s,(int)retval) +#define TERM(retval) return (CLINE, REPORT2("term",retval) PL_expect = XOPERATOR, PL_bufptr = s,(int)retval) +#define LOOPX(f) return(yylval.ival=f, REPORT("loopx",f) PL_expect = XTERM,PL_bufptr = s,(int)LOOPEX) +#define FTST(f) return(yylval.ival=f, REPORT("ftst",f) PL_expect = XTERM,PL_bufptr = s,(int)UNIOP) +#define FUN0(f) return(yylval.ival = f, REPORT("fun0",f) PL_expect = XOPERATOR,PL_bufptr = s,(int)FUNC0) +#define FUN1(f) return(yylval.ival = f, REPORT("fun1",f) PL_expect = XOPERATOR,PL_bufptr = s,(int)FUNC1) +#define BOop(f) return ao((yylval.ival=f, REPORT("bitorop",f) PL_expect = XTERM,PL_bufptr = s,(int)BITOROP)) +#define BAop(f) return ao((yylval.ival=f, REPORT("bitandop",f) PL_expect = XTERM,PL_bufptr = s,(int)BITANDOP)) +#define SHop(f) return ao((yylval.ival=f, REPORT("shiftop",f) PL_expect = XTERM,PL_bufptr = s,(int)SHIFTOP)) +#define PWop(f) return ao((yylval.ival=f, REPORT("powop",f) PL_expect = XTERM,PL_bufptr = s,(int)POWOP)) +#define PMop(f) return(yylval.ival=f, REPORT("matchop",f) PL_expect = XTERM,PL_bufptr = s,(int)MATCHOP) +#define Aop(f) return ao((yylval.ival=f, REPORT("add",f) PL_expect = XTERM,PL_bufptr = s,(int)ADDOP)) +#define Mop(f) return ao((yylval.ival=f, REPORT("mul",f) PL_expect = XTERM,PL_bufptr = s,(int)MULOP)) +#define Eop(f) return(yylval.ival=f, REPORT("eq",f) PL_expect = XTERM,PL_bufptr = s,(int)EQOP) +#define Rop(f) return(yylval.ival=f, REPORT("rel",f) PL_expect = XTERM,PL_bufptr = s,(int)RELOP) /* This bit of chicanery makes a unary function followed by * a parenthesis into a function with one argument, highest precedence. */ #define UNI(f) return(yylval.ival = f, \ - REPORT("uni",f), \ + REPORT("uni",f) \ PL_expect = XTERM, \ PL_bufptr = s, \ PL_last_uni = PL_oldbufptr, \ @@ -167,7 +173,7 @@ int yyactlevel = -1; (*s == '(' || (s = skipspace(s), *s == '(') ? (int)FUNC1 : (int)UNIOP) ) #define UNIBRACK(f) return(yylval.ival = f, \ - REPORT("uni",f), \ + REPORT("uni",f) \ PL_bufptr = s, \ PL_last_uni = PL_oldbufptr, \ (*s == '(' || (s = skipspace(s), *s == '(') ? (int)FUNC1 : (int)UNIOP) ) @@ -177,7 +183,7 @@ int yyactlevel = -1; STATIC void S_tokereport(pTHX_ char *thing, char* s, I32 rv) -{ +{ SV *report; DEBUG_T({ report = newSVpv(thing, 0); @@ -387,6 +393,8 @@ Perl_lex_start(pTHX_ SV *line) SAVEPPTR(PL_bufend); SAVEPPTR(PL_oldbufptr); SAVEPPTR(PL_oldoldbufptr); + SAVEPPTR(PL_last_lop); + SAVEPPTR(PL_last_uni); SAVEPPTR(PL_linestart); SAVESPTR(PL_linestr); SAVEPPTR(PL_lex_brackstack); @@ -429,6 +437,7 @@ Perl_lex_start(pTHX_ SV *line) SvTEMP_off(PL_linestr); PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = PL_linestart = SvPVX(PL_linestr); PL_bufend = PL_bufptr + SvCUR(PL_linestr); + PL_last_lop = PL_last_uni = Nullch; SvREFCNT_dec(PL_rs); PL_rs = newSVpvn("\n", 1); PL_rsfp = 0; @@ -574,6 +583,7 @@ S_skipspace(pTHX_ register char *s) PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = s = PL_linestart = SvPVX(PL_linestr); PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr); + PL_last_lop = PL_last_uni = Nullch; /* Close the filehandle. Could be from -P preprocessor, * STDIN, or a regular file. If we were reading code from @@ -705,7 +715,7 @@ S_lop(pTHX_ I32 f, int x, char *s) { yylval.ival = f; CLINE; - REPORT("lop", f); + REPORT("lop", f) PL_expect = x; PL_bufptr = s; PL_last_lop = PL_oldbufptr; @@ -832,7 +842,7 @@ Perl_str_to_version(pTHX_ SV *sv) STRLEN skip; UV n; if (utf) - n = utf8_to_uv((U8*)start, len, &skip, 0); + n = utf8n_to_uvchr((U8*)start, len, &skip, 0); else { n = *(U8*)start; skip = 1; @@ -911,8 +921,11 @@ S_tokeq(pTHX_ SV *sv) if (s == send) goto finish; d = s; - if ( PL_hints & HINT_NEW_STRING ) + if ( PL_hints & HINT_NEW_STRING ) { pv = sv_2mortal(newSVpvn(SvPVX(pv), len)); + if (SvUTF8(sv)) + SvUTF8_on(pv); + } while (s < send) { if (*s == '\\') { if (s + 1 < send && (s[1] == '\\')) @@ -1031,6 +1044,8 @@ S_sublex_push(pTHX) SAVEPPTR(PL_bufptr); SAVEPPTR(PL_oldbufptr); SAVEPPTR(PL_oldoldbufptr); + SAVEPPTR(PL_last_lop); + SAVEPPTR(PL_last_uni); SAVEPPTR(PL_linestart); SAVESPTR(PL_linestr); SAVEPPTR(PL_lex_brackstack); @@ -1042,6 +1057,7 @@ S_sublex_push(pTHX) PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr); PL_bufend += SvCUR(PL_linestr); + PL_last_lop = PL_last_uni = Nullch; SAVEFREESV(PL_linestr); PL_lex_dojoin = FALSE; @@ -1093,6 +1109,7 @@ S_sublex_done(pTHX) PL_lex_inpat = 0; PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr); PL_bufend += SvCUR(PL_linestr); + PL_last_lop = PL_last_uni = Nullch; SAVEFREESV(PL_linestr); PL_lex_dojoin = FALSE; PL_lex_brackets = 0; @@ -1205,30 +1222,42 @@ S_scan_const(pTHX_ char *start) register char *d = SvPVX(sv); /* destination for copies */ bool dorange = FALSE; /* are we in a translit range? */ bool didrange = FALSE; /* did we just finish a range? */ - bool has_utf8 = (PL_linestr && SvUTF8(PL_linestr)); - /* the constant is UTF8 */ + I32 has_utf8 = FALSE; /* Output constant is UTF8 */ + I32 this_utf8 = UTF; /* The source string is assumed to be UTF8 */ UV uv; - I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) - ? (PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF)) - : UTF; - I32 this_utf8 = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) - ? (PL_sublex_info.sub_op->op_private & (PL_lex_repl ? - OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF)) - : UTF; const char *leaveit = /* set of acceptably-backslashed characters */ PL_lex_inpat ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#" : ""; + if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { + /* If we are doing a trans and we know we want UTF8 set expectation */ + has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF); + this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); + } + + while (s < send || dorange) { /* get transliterations out of the way (they're most literal) */ if (PL_lex_inwhat == OP_TRANS) { /* expand a range A-Z to the full set of characters. AIE! */ if (dorange) { - UV i; /* current expanded character */ - UV min; /* first character in range */ - UV max; /* last character in range */ + I32 i; /* current expanded character */ + I32 min; /* first character in range */ + I32 max; /* last character in range */ + + if (has_utf8) { + char *c = (char*)utf8_hop((U8*)d, -1); + char *e = d++; + while (e-- > c) + *(e + 1) = *e; + *c = UTF_TO_NATIVE(0xff); + /* mark the range as done, and continue */ + dorange = FALSE; + didrange = TRUE; + continue; + } i = d - SvPVX(sv); /* remember current offset */ SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */ @@ -1240,22 +1269,21 @@ S_scan_const(pTHX_ char *start) if (min > max) { Perl_croak(aTHX_ - "Invalid [] range \"\\x%"UVxf"-\\x%"UVxf"\" in transliteration operator", - min, max); + "Invalid [] range \"%c-%c\" in transliteration operator", + (char)min, (char)max); } -#ifdef ALPHAS_HAVE_GAPS - /* BROKEN FOR EBCDIC, see regcomp.c:reglass() */ +#ifdef EBCDIC if ((isLOWER(min) && isLOWER(max)) || (isUPPER(min) && isUPPER(max))) { if (isLOWER(min)) { for (i = min; i <= max; i++) if (isLOWER(i)) - *d++ = i; + *d++ = NATIVE_TO_NEED(has_utf8,i); } else { for (i = min; i <= max; i++) if (isUPPER(i)) - *d++ = i; + *d++ = NATIVE_TO_NEED(has_utf8,i); } } else @@ -1274,8 +1302,8 @@ S_scan_const(pTHX_ char *start) if (didrange) { Perl_croak(aTHX_ "Ambiguous range in transliteration operator"); } - if (utf) { - *d++ = (char)0xff; /* use illegal utf8 byte--see pmtrans */ + if (has_utf8) { + *d++ = UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */ s++; continue; } @@ -1294,7 +1322,7 @@ S_scan_const(pTHX_ char *start) else if (*s == '(' && PL_lex_inpat && s[1] == '?') { if (s[2] == '#') { while (s < send && *s != ')') - *d++ = *s++; + *d++ = NATIVE_TO_NEED(has_utf8,*s++); } else if (s[2] == '{' /* This should match regcomp.c */ || ((s[2] == 'p' || s[2] == '?') && s[3] == '{')) @@ -1317,7 +1345,7 @@ S_scan_const(pTHX_ char *start) yyerror("Sequence (?{...}) not terminated or not {}-balanced"); } while (s < regparse) - *d++ = *s++; + *d++ = NATIVE_TO_NEED(has_utf8,*s++); } } @@ -1325,7 +1353,7 @@ S_scan_const(pTHX_ char *start) else if (*s == '#' && PL_lex_inpat && ((PMOP*)PL_lex_inpat)->op_pmflags & PMf_EXTENDED) { while (s+1 < send && *s != '\n') - *d++ = *s++; + *d++ = NATIVE_TO_NEED(has_utf8,*s++); } /* check for embedded arrays @@ -1345,14 +1373,16 @@ S_scan_const(pTHX_ char *start) break; /* in regexp, $ might be tail anchor */ } + /* End of else if chain - OP_TRANS rejoin rest */ + /* backslashes */ if (*s == '\\' && s+1 < send) { s++; /* some backslashes we leave behind */ if (*leaveit && *s && strchr(leaveit, *s)) { - *d++ = '\\'; - *d++ = *s++; + *d++ = NATIVE_TO_NEED(has_utf8,'\\'); + *d++ = NATIVE_TO_NEED(has_utf8,*s++); continue; } @@ -1428,15 +1458,13 @@ S_scan_const(pTHX_ char *start) NUM_ESCAPE_INSERT: /* Insert oct or hex escaped character. * There will always enough room in sv since such - * escapes will be longer than any UT-F8 sequence + * escapes will be longer than any UTF-8 sequence * they can end up as. */ - - /* This spot is wrong for EBCDIC. Characters like - * the lowercase letters and digits are >127 in EBCDIC, - * so here they would need to be mapped to the Unicode - * repertoire. --jhi */ - if (uv > 127) { + /* We need to map to chars to ASCII before doing the tests + to cover EBCDIC + */ + if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(uv))) { if (!has_utf8 && uv > 255) { /* Might need to recode whatever we have * accumulated so far if it contains any @@ -1445,46 +1473,42 @@ S_scan_const(pTHX_ char *start) * (Can't we keep track of that and avoid * this rescan? --jhi) */ - int hicount = 0; - char *c; - - for (c = SvPVX(sv); c < d; c++) { - if (UTF8_IS_CONTINUED(*c)) + int hicount = 0; + U8 *c; + for (c = (U8 *) SvPVX(sv); c < (U8 *)d; c++) { + if (!NATIVE_IS_INVARIANT(*c)) { hicount++; + } } if (hicount) { - char *old_pvx = SvPVX(sv); - char *src, *dst; - - d = SvGROW(sv, - SvCUR(sv) + hicount + 1) + - (d - old_pvx); - - src = d - 1; - d += hicount; - dst = d - 1; - - while (src < dst) { - if (UTF8_IS_CONTINUED(*src)) { - *dst-- = UTF8_EIGHT_BIT_LO(*src); - *dst-- = UTF8_EIGHT_BIT_HI(*src--); + STRLEN offset = d - SvPVX(sv); + U8 *src, *dst; + d = SvGROW(sv, SvLEN(sv) + hicount + 1) + offset; + src = (U8 *)d - 1; + dst = src+hicount; + d += hicount; + while (src >= (U8 *)SvPVX(sv)) { + if (!NATIVE_IS_INVARIANT(*src)) { + U8 ch = NATIVE_TO_ASCII(*src); + *dst-- = UTF8_EIGHT_BIT_LO(ch); + *dst-- = UTF8_EIGHT_BIT_HI(ch); } else { - *dst-- = *src--; + *dst-- = *src; } + src--; } } } if (has_utf8 || uv > 255) { - d = (char*)uv_to_utf8((U8*)d, uv); + d = (char*)uvchr_to_utf8((U8*)d, uv); has_utf8 = TRUE; if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { PL_sublex_info.sub_op->op_private |= (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); - utf = TRUE; } } else { @@ -1492,7 +1516,7 @@ S_scan_const(pTHX_ char *start) } } else { - *d++ = (char)uv; + *d++ = (char) uv; } continue; @@ -1530,7 +1554,7 @@ S_scan_const(pTHX_ char *start) if (len > e - s + 4) { char *odest = SvPVX(sv); - SvGROW(sv, (SvCUR(sv) + len - (e - s + 4))); + SvGROW(sv, (SvLEN(sv) + len - (e - s + 4))); d = SvPVX(sv) + (d - odest); } Copy(str, d, len, char); @@ -1546,51 +1570,38 @@ S_scan_const(pTHX_ char *start) /* \c is a control character */ case 'c': s++; -#ifdef EBCDIC - *d = *s++; - if (isLOWER(*d)) - *d = toUPPER(*d); - *d = toCTRL(*d); - d++; -#else { U8 c = *s++; - *d++ = toCTRL(c); - } +#ifdef EBCDIC + if (isLOWER(c)) + c = toUPPER(c); #endif + *d++ = NATIVE_TO_NEED(has_utf8,toCTRL(c)); + } continue; /* printf-style backslashes, formfeeds, newlines, etc */ case 'b': - *d++ = '\b'; + *d++ = NATIVE_TO_NEED(has_utf8,'\b'); break; case 'n': - *d++ = '\n'; + *d++ = NATIVE_TO_NEED(has_utf8,'\n'); break; case 'r': - *d++ = '\r'; + *d++ = NATIVE_TO_NEED(has_utf8,'\r'); break; case 'f': - *d++ = '\f'; + *d++ = NATIVE_TO_NEED(has_utf8,'\f'); break; case 't': - *d++ = '\t'; - break; -#ifdef EBCDIC - case 'e': - *d++ = '\047'; /* CP 1047 */ + *d++ = NATIVE_TO_NEED(has_utf8,'\t'); break; - case 'a': - *d++ = '\057'; /* CP 1047 */ - break; -#else case 'e': - *d++ = '\033'; + *d++ = ASCII_TO_NEED(has_utf8,'\033'); break; case 'a': - *d++ = '\007'; + *d++ = ASCII_TO_NEED(has_utf8,'\007'); break; -#endif } /* end switch */ s++; @@ -1598,41 +1609,40 @@ S_scan_const(pTHX_ char *start) } /* end if (backslash) */ default_action: - if (UTF8_IS_CONTINUED(*s) && (this_utf8 || has_utf8)) { - STRLEN len = (STRLEN) -1; - UV uv; - if (this_utf8) { - uv = utf8_to_uv((U8*)s, send - s, &len, 0); - } - if (len == (STRLEN)-1) { - /* Illegal UTF8 (a high-bit byte), make it valid. */ - char *old_pvx = SvPVX(sv); - /* need space for one extra char (NOTE: SvCUR() not set here) */ - d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx); - d = (char*)uv_to_utf8((U8*)d, (U8)*s++); - } - else { - while (len--) - *d++ = *s++; - } - has_utf8 = TRUE; - if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { - PL_sublex_info.sub_op->op_private |= - (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); - utf = TRUE; - } - continue; - } - - *d++ = *s++; + /* If we started with encoded form, or already know we want it + and then encode the next character */ + if ((has_utf8 || this_utf8) && !NATIVE_IS_INVARIANT((U8)(*s))) { + STRLEN len = 1; + UV uv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s); + STRLEN need = UNISKIP(NATIVE_TO_UNI(uv)); + s += len; + if (need > len) { + /* encoded value larger than old, need extra space (NOTE: SvCUR() not set here) */ + STRLEN off = d - SvPVX(sv); + d = SvGROW(sv, SvLEN(sv) + (need-len)) + off; + } + d = (char*)uvchr_to_utf8((U8*)d, uv); + has_utf8 = TRUE; + } + else { + *d++ = NATIVE_TO_NEED(has_utf8,*s++); + } } /* while loop to process each character */ /* terminate the string and set up the sv */ *d = '\0'; SvCUR_set(sv, d - SvPVX(sv)); + if (SvCUR(sv) >= SvLEN(sv)) + Perl_croak(aTHX_ "panic:constant overflowed allocated space"); + SvPOK_on(sv); - if (has_utf8) + if (has_utf8) { SvUTF8_on(sv); + if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { + PL_sublex_info.sub_op->op_private |= + (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); + } + } /* shrink the sv if we allocated more than we used */ if (SvCUR(sv) + 5 < SvLEN(sv)) { @@ -2559,6 +2569,7 @@ Perl_yylex(pTHX) sv_catpv(PL_linestr, "\n"); PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr); PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr); + PL_last_lop = PL_last_uni = Nullch; if (PERLDB_LINE && PL_curstash != PL_debstash) { SV *sv = NEWSV(85,0); @@ -2587,10 +2598,12 @@ Perl_yylex(pTHX) sv_catpv(PL_linestr,";}"); PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr); PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr); + PL_last_lop = PL_last_uni = Nullch; PL_minus_n = PL_minus_p = 0; goto retry; } PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr); + PL_last_lop = PL_last_uni = Nullch; sv_setpv(PL_linestr,""); TOKEN(';'); /* not infinite loop because rsfp is NULL now */ } @@ -2633,6 +2646,7 @@ Perl_yylex(pTHX) sv_setpv(PL_linestr, ""); PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr); PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr); + PL_last_lop = PL_last_uni = Nullch; PL_doextract = FALSE; } } @@ -2647,6 +2661,7 @@ Perl_yylex(pTHX) av_store(CopFILEAV(PL_curcop),(I32)CopLINE(PL_curcop),sv); } PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr); + PL_last_lop = PL_last_uni = Nullch; if (CopLINE(PL_curcop) == 1) { while (s < PL_bufend && isSPACE(*s)) s++; @@ -2790,6 +2805,7 @@ Perl_yylex(pTHX) sv_setpv(PL_linestr, ""); PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr); PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr); + PL_last_lop = PL_last_uni = Nullch; PL_preambled = FALSE; if (PERLDB_LINE) (void)gv_fetchfile(PL_origfilename); @@ -3679,7 +3695,7 @@ Perl_yylex(pTHX) case '\'': s = scan_str(s,FALSE,FALSE); DEBUG_T( { PerlIO_printf(Perl_debug_log, - "### Saw string in '%s'\n", s); + "### Saw string before '%s'\n", s); } ) if (PL_expect == XOPERATOR) { if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) { @@ -3698,7 +3714,7 @@ Perl_yylex(pTHX) case '"': s = scan_str(s,FALSE,FALSE); DEBUG_T( { PerlIO_printf(Perl_debug_log, - "### Saw string in '%s'\n", s); + "### Saw string before '%s'\n", s); } ) if (PL_expect == XOPERATOR) { if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) { @@ -3713,7 +3729,7 @@ Perl_yylex(pTHX) missingterm((char*)0); yylval.ival = OP_CONST; for (d = SvPV(PL_lex_stuff, len); len; len--, d++) { - if (*d == '$' || *d == '@' || *d == '\\' || UTF8_IS_CONTINUED(*d)) { + if (*d == '$' || *d == '@' || *d == '\\' || !UTF8_IS_INVARIANT((U8)*d)) { yylval.ival = OP_STRINGIFY; break; } @@ -3723,7 +3739,7 @@ Perl_yylex(pTHX) case '`': s = scan_str(s,FALSE,FALSE); DEBUG_T( { PerlIO_printf(Perl_debug_log, - "### Saw backtick string in '%s'\n", s); + "### Saw backtick string before '%s'\n", s); } ) if (PL_expect == XOPERATOR) no_op("Backticks",s); @@ -4745,11 +4761,7 @@ Perl_yylex(pTHX) TOKEN('('); case KEY_qq: - case KEY_qu: s = scan_str(s,FALSE,FALSE); - if (tmp == KEY_qu && - is_utf8_string((U8*)SvPVX(PL_lex_stuff), SvCUR(PL_lex_stuff))) - SvUTF8_on(PL_lex_stuff); if (!s) missingterm((char*)0); yylval.ival = OP_STRINGIFY; @@ -5582,7 +5594,6 @@ Perl_keyword(pTHX_ register char *d, I32 len) if (strEQ(d,"q")) return KEY_q; if (strEQ(d,"qr")) return KEY_qr; if (strEQ(d,"qq")) return KEY_qq; - if (strEQ(d,"qu")) return KEY_qu; if (strEQ(d,"qw")) return KEY_qw; if (strEQ(d,"qx")) return KEY_qx; } @@ -6293,9 +6304,6 @@ S_scan_trans(pTHX_ char *start) Perl_croak(aTHX_ "Transliteration replacement not terminated"); } - New(803,tbl,256,short); - o = newPVOP(OP_TRANS, 0, (char*)tbl); - complement = del = squash = 0; while (strchr("cds", *s)) { if (*s == 'c') @@ -6306,6 +6314,9 @@ S_scan_trans(pTHX_ char *start) squash = OPpTRANS_SQUASH; s++; } + + New(803, tbl, complement&&!del?258:256, short); + o = newPVOP(OP_TRANS, 0, (char*)tbl); o->op_private = del|squash|complement| (DO_UTF8(PL_lex_stuff)? OPpTRANS_FROM_UTF : 0)| (DO_UTF8(PL_lex_repl) ? OPpTRANS_TO_UTF : 0); @@ -6450,6 +6461,7 @@ S_scan_heredoc(pTHX_ register char *s) sv_setsv(PL_linestr,herewas); PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = s = PL_linestart = SvPVX(PL_linestr); PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr); + PL_last_lop = PL_last_uni = Nullch; } else sv_setpvn(tmpstr,"",0); /* avoid "uninitialized" warning */ @@ -6461,6 +6473,7 @@ S_scan_heredoc(pTHX_ register char *s) } CopLINE_inc(PL_curcop); PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr); + PL_last_lop = PL_last_uni = Nullch; #ifndef PERL_STRICT_CR if (PL_bufend - PL_linestart >= 2) { if ((PL_bufend[-2] == '\r' && PL_bufend[-1] == '\n') || @@ -6688,7 +6701,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) /* after skipping whitespace, the next character is the terminator */ term = *s; - if (UTF8_IS_CONTINUED(term) && UTF) + if (!UTF8_IS_INVARIANT((U8)term) && UTF) has_utf8 = TRUE; /* mark where we are */ @@ -6735,7 +6748,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) have found the terminator */ else if (*s == term) break; - else if (!has_utf8 && UTF8_IS_CONTINUED(*s) && UTF) + else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) has_utf8 = TRUE; *to = *s; } @@ -6764,7 +6777,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) break; else if (*s == PL_multi_open) brackets++; - else if (!has_utf8 && UTF8_IS_CONTINUED(*s) && UTF) + else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) has_utf8 = TRUE; *to = *s; } @@ -6819,6 +6832,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) /* having changed the buffer, we must update PL_bufend */ PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr); + PL_last_lop = PL_last_uni = Nullch; } /* at this point, we have successfully read the delimited string */ @@ -7221,7 +7235,7 @@ vstring: while (isDIGIT(*pos) || *pos == '_') pos++; if (!isALPHA(*pos)) { - UV rev, revmax = 0; + UV rev; U8 tmpbuf[UTF8_MAXLEN+1]; U8 *tmpend; s++; /* get past 'v' */ @@ -7249,10 +7263,11 @@ vstring: "Integer overflow in decimal number"); } } - tmpend = uv_to_utf8(tmpbuf, rev); - if (rev > revmax) - revmax = rev; + /* Append native character for the rev point */ + tmpend = uvchr_to_utf8(tmpbuf, rev); sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf); + if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(rev))) + SvUTF8_on(sv); if (*pos == '.' && isDIGIT(pos[1])) s = ++pos; else { @@ -7262,14 +7277,8 @@ vstring: while (isDIGIT(*pos) || *pos == '_') pos++; } - SvPOK_on(sv); SvREADONLY_on(sv); - if (revmax > 127) { - SvUTF8_on(sv); - if (revmax < 256) - sv_utf8_downgrade(sv, TRUE); - } } } break; @@ -7335,6 +7344,7 @@ S_scan_formline(pTHX_ register char *s) s = filter_gets(PL_linestr, PL_rsfp, 0); PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = PL_linestart = SvPVX(PL_linestr); PL_bufend = PL_bufptr + SvCUR(PL_linestr); + PL_last_lop = PL_last_uni = Nullch; if (!s) { s = PL_bufptr; yyerror("Format not terminated");