#define XFAKEBRACK 0x80
#ifdef USE_UTF8_SCRIPTS
-# define UTF (!IN_BYTES)
+# define UTF cBOOL(!IN_BYTES)
#else
-# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
+# define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
#endif
/* The maximum number of characters preceding the unrecognized one to display */
* a keyword (do this if the word is a label, e.g. goto FOO)
* int allow_pack : if true, : characters will also be allowed (require,
* use, etc. do this)
- * int allow_initial_tick : used by the "sub" lexer only.
*/
STATIC char *
* symbol meaning, e.g. \x{2E} would be confused with a dot. But
* in spite of this, we do have to process \N here while the proper
* charnames handler is in scope. See bugs #56444 and #62056.
+ *
* There is a complication because \N in a pattern may also stand
* for 'match a non-nl', and not mean a charname, in which case its
* processing should be deferred to the regex compiler. To be a
continue;
case 'N':
- /* In a non-pattern \N must be a named character, like \N{LATIN
- * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
- * mean to match a non-newline. For non-patterns, named
- * characters are converted to their string equivalents. In
- * patterns, named characters are not converted to their
- * ultimate forms for the same reasons that other escapes
- * aren't. Instead, they are converted to the \N{U+...} form
- * to get the value from the charnames that is in effect right
- * now, while preserving the fact that it was a named character
- * so that the regex compiler knows this */
-
- /* The structure of this section of code (besides checking for
+ /* In a non-pattern \N must be like \N{U+0041}, or it can be a
+ * named character, like \N{LATIN SMALL LETTER A}, or a named
+ * sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND
+ * GRAVE}. For convenience all three forms are referred to as
+ * "named characters" below.
+ *
+ * For patterns, \N also can mean to match a non-newline. Code
+ * before this 'switch' statement should already have handled
+ * this situation, and hence this code only has to deal with
+ * the named character cases.
+ *
+ * For non-patterns, the named characters are converted to
+ * their string equivalents. In patterns, named characters are
+ * not converted to their ultimate forms for the same reasons
+ * that other escapes aren't. Instead, they are converted to
+ * the \N{U+...} form to get the value from the charnames that
+ * is in effect right now, while preserving the fact that it
+ * was a named character, so that the regex compiler knows
+ * this.
+ *
+ * The structure of this section of code (besides checking for
* errors and upgrading to utf8) is:
- * Further disambiguate between the two meanings of \N, and if
- * not a charname, go process it elsewhere
- * If of form \N{U+...}, pass it through if a pattern;
- * otherwise convert to utf8
- * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
- * pattern; otherwise convert to utf8 */
-
- /* Here, s points to the 'N'; the test below is guaranteed to
- * succeed if we are being called on a pattern as we already
- * know from a test above that the next character is a '{'.
- * On a non-pattern \N must mean 'named sequence, which
- * requires braces */
+ * If the named character is of the form \N{U+...}, pass it
+ * through if a pattern; otherwise convert the code point
+ * to utf8
+ * Otherwise must be some \N{NAME}: convert to \N{U+c1.c2...}
+ * if a pattern; otherwise convert to utf8
+ *
+ * If the regex compiler should ever need to differentiate
+ * between the \N{U+...} and \N{name} forms, that could easily
+ * be done here by stripping any leading zeros from the
+ * \N{U+...} case, and adding them to the other one. */
+
+ /* Here, 's' points to the 'N'; the test below is guaranteed to
+ * succeed if we are being called on a pattern, as we already
+ * know from a test above that the next character is a '{'. A
+ * non-pattern \N must mean 'named character', which requires
+ * braces */
s++;
if (*s != '{') {
yyerror("Missing braces on \\N{}");
| PERL_SCAN_DISALLOW_PREFIX;
STRLEN len;
- /* For \N{U+...}, the '...' is a unicode value even on
- * EBCDIC machines */
s += 2; /* Skip to next char after the 'U+' */
len = e - s;
uv = grok_hex(s, &len, &flags, NULL);
}
if (PL_lex_inpat) {
-
- /* On non-EBCDIC platforms, pass through to the regex
- * compiler unchanged. The reason we evaluated the
- * number above is to make sure there wasn't a syntax
- * error. But on EBCDIC we convert to native so
- * downstream code can continue to assume it's native
- */
s -= 5; /* Include the '\N{U+' */
#ifdef EBCDIC
- d += my_snprintf(d, e - s + 1 + 1, /* includes the }
+ /* On EBCDIC platforms, in \N{U+...}, the '...' is a
+ * Unicode value, so convert to native so downstream
+ * code can continue to assume it's native */
+ d += my_snprintf(d, e - s + 1 + 1, /* includes the '}'
and the \0 */
- "\\N{U+%X}",
- (unsigned int) UNI_TO_NATIVE(uv));
+ "\\N{U+%X}",
+ (unsigned int) UNI_TO_NATIVE(uv));
#else
- Copy(s, d, e - s + 1, char); /* 1 = include the } */
+ /* On non-EBCDIC platforms, pass it through unchanged.
+ * The reason we evaluated the number above is to make
+ * sure there wasn't a syntax error. */
+ Copy(s, d, e - s + 1, char); /* +1 is for the '}' */
d += e - s + 1;
#endif
}
else { /* Not a pattern: convert the hex to string */
- /* If destination is not in utf8, unconditionally
+ /* If the destination is not in utf8, unconditionally
* recode it to be so. This is because \N{} implies
* Unicode semantics, and scalars have to be in utf8
* to guarantee those semantics */
* through the string. Each character takes up
* 2 hex digits plus either a trailing dot or
* the "}" */
+ const char initial_text[] = "\\N{U+";
+ const STRLEN initial_len = sizeof(initial_text)
+ - 1;
d = off + SvGROW(sv, off
+ 3 * len
- + 6 /* For the "\N{U+", and
- trailing NUL */
+
+ /* +1 for trailing NUL */
+ + initial_len + 1
+
+ (STRLEN)(send - e));
- Copy("\\N{U+", d, 5, char);
- d += 5;
+ Copy(initial_text, d, initial_len, char);
+ d += initial_len;
while (str < str_end) {
char hex_string[4];
int len =
d += 3;
str++;
}
- d--; /* We will overwrite below the final
+ d--; /* Below, we will overwrite the final
dot with a right brace */
}
else {
if (PL_expect != XOPERATOR) {
if (s[1] != '<' && !strchr(s,'>'))
check_uni();
- if (s[1] == '<')
+ if (s[1] == '<' && s[2] != '>')
s = scan_heredoc(s);
else
s = scan_inputsymbol(s);
? GvCV(gv)
: SvROK(gv) && SvTYPE(SvRV(gv)) == SVt_PVCV
? (CV *)SvRV(gv)
- : (CV *)gv
+ : ((CV *)gv)
: rv2cv_op_cv(rv2cv_op, RV2CVOPCV_RETURN_STUB);
}
}
else {
if (has_colon) {
+ /* "my" variable %s can't be in a package */
/* PL_no_myglob is constant */
GCC_DIAG_IGNORE(-Wformat-nonliteral);
yyerror_pv(Perl_form(aTHX_ PL_no_myglob,
- PL_in_my == KEY_my ? "my" : "state", PL_tokenbuf),
+ PL_in_my == KEY_my ? "my" : "state",
+ *PL_tokenbuf == '&' ? "subroutin" : "variabl",
+ PL_tokenbuf),
UTF ? SVf_UTF8 : 0);
GCC_DIAG_RESTORE;
}
s++;
if (*s == ',') {
GV* gv;
+ PADOFFSET off;
if (keyword(w, s - w, 0))
return;
gv = gv_fetchpvn_flags(w, s - w, ( UTF ? SVf_UTF8 : 0 ), SVt_PVCV);
if (gv && GvCVu(gv))
return;
+ if (s - w <= 254) {
+ char tmpbuf[256];
+ Copy(w, tmpbuf+1, s - w, char);
+ *tmpbuf = '&';
+ off = pad_findmy_pvn(tmpbuf, s-w+1, UTF ? SVf_UTF8 : 0);
+ if (off != NOT_IN_PAD) return;
+ }
Perl_croak(aTHX_ "No comma allowed after %s", what);
}
}
}
static bool
-S_pmflag(pTHX_ const char* const valid_flags, U32 * pmfl, char** s, char* charset) {
-
- /* Adds, subtracts to/from 'pmfl' based on regex modifier flags found in
- * the parse starting at 's', based on the subset that are valid in this
- * context input to this routine in 'valid_flags'. Advances s. Returns
- * TRUE if the input should be treated as a valid flag, so the next char
- * may be as well; otherwise FALSE. 'charset' should point to a NUL upon
- * first call on the current regex. This routine will set it to any
+S_pmflag(pTHX_ const char* const valid_flags, U32 * pmfl, char** s, char* charset, unsigned int * x_mod_count) {
+
+ /* Adds, subtracts to/from 'pmfl' based on the next regex modifier flag
+ * found in the parse starting at 's', based on the subset that are valid
+ * in this context input to this routine in 'valid_flags'. Advances s.
+ * Returns TRUE if the input should be treated as a valid flag, so the next
+ * char may be as well; otherwise FALSE. 'charset' should point to a NUL
+ * upon first call on the current regex. This routine will set it to any
* charset modifier found. The caller shouldn't change it. This way,
* another charset modifier encountered in the parse can be detected as an
* error, as we have decided to allow only one */
switch (c) {
- CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl);
+ CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl, *x_mod_count);
case GLOBAL_PAT_MOD: *pmfl |= PMf_GLOBAL; break;
case CONTINUE_PAT_MOD: *pmfl |= PMf_CONTINUE; break;
case ONCE_PAT_MOD: *pmfl |= PMf_KEEP; break;
const char * const valid_flags =
(const char *)((type == OP_QR) ? QR_PAT_MODS : M_PAT_MODS);
char charset = '\0'; /* character set modifier */
+ unsigned int x_mod_count = 0;
PERL_ARGS_ASSERT_SCAN_PAT;
pm->op_pmflags |= PMf_IS_QR;
}
- while (*s && S_pmflag(aTHX_ valid_flags, &(pm->op_pmflags), &s, &charset)) {};
+ while (*s && S_pmflag(aTHX_ valid_flags, &(pm->op_pmflags),
+ &s, &charset, &x_mod_count))
+ {};
/* issue a warning if /c is specified,but /g is not */
if ((pm->op_pmflags & PMf_CONTINUE) && !(pm->op_pmflags & PMf_GLOBAL))
{
"Use of /c modifier is meaningless without /g" );
}
+ STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
+
PL_lex_op = (OP*)pm;
pl_yylval.ival = OP_MATCH;
return s;
line_t first_line;
I32 es = 0;
char charset = '\0'; /* character set modifier */
+ unsigned int x_mod_count = 0;
char *t;
PERL_ARGS_ASSERT_SCAN_SUBST;
s++;
es++;
}
- else if (! S_pmflag(aTHX_ S_PAT_MODS, &(pm->op_pmflags), &s, &charset))
+ else if (! S_pmflag(aTHX_ S_PAT_MODS, &(pm->op_pmflags),
+ &s, &charset, &x_mod_count))
{
break;
}
}
+ STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
+
if ((pm->op_pmflags & PMf_CONTINUE)) {
Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), "Use of /c modifier is meaningless in s///" );
}
else if (PL_bufend - PL_linestart == 1 && PL_bufend[-1] == '\r')
PL_bufend[-1] = '\n';
#endif
- if (*s == term && memEQ(s,PL_tokenbuf + 1,len)) {
+ if (*s == term && PL_bufend-s >= len
+ && memEQ(s,PL_tokenbuf + 1,len)) {
SvREFCNT_dec(PL_linestr);
PL_linestr = linestr_save;
PL_linestart = SvPVX(linestr_save);
This code handles:
<> read from ARGV
+ <<>> read from ARGV without magic open
<FH> read from filehandle
<pkg::FH> read from package qualified filehandle
<pkg'FH> read from package qualified filehandle
char *s = start; /* current position in buffer */
char *end;
I32 len;
+ bool nomagicopen = FALSE;
char *d = PL_tokenbuf; /* start of temp holding space */
const char * const e = PL_tokenbuf + sizeof PL_tokenbuf; /* end of temp holding space */
end = strchr(s, '\n');
if (!end)
end = PL_bufend;
- s = delimcpy(d, e, s + 1, end, '>', &len); /* extract until > */
+ if (s[1] == '<' && s[2] == '>' && s[3] == '>') {
+ nomagicopen = TRUE;
+ *d = '\0';
+ len = 0;
+ s += 3;
+ }
+ else
+ s = delimcpy(d, e, s + 1, end, '>', &len); /* extract until > */
/* die if we didn't have space for the contents of the <>,
or if it didn't end, or if we see a newline
op_append_elem(OP_LIST,
newGVOP(OP_GV, 0, gv),
newCVREF(0, newGVOP(OP_GV, 0, gv_readline))))
- : (OP*)newUNOP(OP_READLINE, 0, newGVOP(OP_GV, 0, gv));
+ : (OP*)newUNOP(OP_READLINE, nomagicopen ? OPf_SPECIAL : 0, newGVOP(OP_GV, 0, gv));
pl_yylval.ival = OP_NULL;
}
}
PL_bufptr = s;
if (d == PL_tokenbuf+1)
return NULL;
- *d = 0;
var = newOP(sigil == '$' ? OP_PADSV : sigil == '@' ? OP_PADAV : OP_PADHV,
OPf_MOD | (OPpLVAL_INTRO<<8));
var->op_targ = allocmy(PL_tokenbuf, d - PL_tokenbuf, UTF ? SVf_UTF8 : 0);