=head1 Lexer interface
This is the lower layer of the Perl parser, managing characters and tokens.
-=for apidoc AmU|yy_parser *|PL_parser
+=for apidoc AmnU|yy_parser *|PL_parser
Pointer to a structure encapsulating the state of the parsing operation
currently in progress. The pointer can be locally changed to perform
{ SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
{ SIGSUB, TOKENTYPE_NONE, "SIGSUB" },
{ SUB, TOKENTYPE_NONE, "SUB" },
+ { SUBLEXEND, TOKENTYPE_NONE, "SUBLEXEND" },
+ { SUBLEXSTART, TOKENTYPE_NONE, "SUBLEXSTART" },
{ THING, TOKENTYPE_OPVAL, "THING" },
{ UMINUS, TOKENTYPE_NONE, "UMINUS" },
{ UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
#endif
/*
-=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
+=for apidoc lex_start
Creates and initialises a new lexer/parser state object, supplying
a context in which to lex and parse from a new source of Perl code.
parser->lex_state = LEX_NORMAL;
parser->expect = XSTATE;
parser->rsfp = rsfp;
- parser->recheck_utf8_validity = FALSE;
+ parser->recheck_utf8_validity = TRUE;
parser->rsfp_filters =
!(flags & LEX_START_SAME_FILTER) || !oparser
? NULL
/*
-=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
+=for apidoc AmnxUN|SV *|PL_parser-E<gt>linestr
Buffer scalar containing the chunk currently under consideration of the
text currently being lexed. This is always a plain string scalar (for
of these pointers is usually preferable to examination of the scalar
through normal scalar means.
-=for apidoc AmxU|char *|PL_parser-E<gt>bufend
+=for apidoc AmnxUN|char *|PL_parser-E<gt>bufend
Direct pointer to the end of the chunk of text currently being lexed, the
end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
always located at the end of the buffer, and does not count as part of
the buffer's contents.
-=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
+=for apidoc AmnxUN|char *|PL_parser-E<gt>bufptr
Points to the current position of lexing inside the lexer buffer.
Characters around this point may be freely examined, within
using the slightly higher-level functions L</lex_peek_unichar> and
L</lex_read_unichar>.
-=for apidoc AmxU|char *|PL_parser-E<gt>linestart
+=for apidoc AmnxUN|char *|PL_parser-E<gt>linestart
Points to the start of the current line inside the lexer buffer.
This is useful for indicating at which column an error occurred, and
*/
/*
-=for apidoc Amx|bool|lex_bufutf8
+=for apidoc lex_bufutf8
Indicates whether the octets in the lexer buffer
(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
}
/*
-=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
+=for apidoc lex_grow_linestr
Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
at least C<len> octets (including terminating C<NUL>). Returns a
}
/*
-=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
+=for apidoc lex_stuff_pvn
Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
}
/*
-=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
+=for apidoc lex_stuff_pv
Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
}
/*
-=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
+=for apidoc lex_stuff_sv
Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
}
/*
-=for apidoc Amx|void|lex_unstuff|char *ptr
+=for apidoc lex_unstuff
Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
}
/*
-=for apidoc Amx|void|lex_read_to|char *ptr
+=for apidoc lex_read_to
Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
}
/*
-=for apidoc Amx|void|lex_discard_to|char *ptr
+=for apidoc lex_discard_to
Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
up to C<ptr>. The remaining content of the buffer will be moved, and
}
/*
-=for apidoc Amx|bool|lex_next_chunk|U32 flags
+=for apidoc lex_next_chunk
Reads in the next chunk of text to be lexed, appending it to
L</PL_parser-E<gt>linestr>. This should be called when lexing code has
PL_parser->last_lop = NULL;
last_uni_pos = last_lop_pos = 0;
*buf = 0;
- SvCUR(linestr) = 0;
+ SvCUR_set(linestr, 0);
} else {
old_bufend_pos = PL_parser->bufend - buf;
bufptr_pos = PL_parser->bufptr - buf;
}
/*
-=for apidoc Amx|I32|lex_peek_unichar|U32 flags
+=for apidoc lex_peek_unichar
Looks ahead one (Unicode) character in the text currently being lexed.
Returns the codepoint (unsigned integer value) of the next character,
}
/*
-=for apidoc Amx|I32|lex_read_unichar|U32 flags
+=for apidoc lex_read_unichar
Reads the next (Unicode) character in the text currently being lexed.
Returns the codepoint (unsigned integer value) of the character read,
}
/*
-=for apidoc Amx|void|lex_read_space|U32 flags
+=for apidoc lex_read_space
Reads optional spaces, in Perl style, in the text currently being
lexed. The spaces may include ordinary whitespace characters and
/*
-=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
+=for apidoc validate_proto
This function performs syntax checking on a prototype, C<proto>.
If C<warn> is true, any illegal characters or mismatched brackets
#define skipspace(s) skipspace_flags(s, 0)
#define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
-STATIC char *
-S_skipspace_flags(pTHX_ char *s, U32 flags)
+char *
+Perl_skipspace_flags(pTHX_ char *s, U32 flags)
{
PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
PL_in_eval &= ~EVAL_RE_REPARSING;
- return '(';
+ return SUBLEXSTART;
}
/*
PL_bufend = SvPVX(PL_linestr);
PL_bufend += SvCUR(PL_linestr);
PL_expect = XOPERATOR;
- return ')';
+ return SUBLEXEND;
}
}
ENTER_with_name("scan_const");
SAVEFREESV(sv);
+ /* A bunch of code in the loop below assumes that if s[n] exists and is not
+ * NUL, then s[n+1] exists. This assertion makes sure that assumption is
+ * valid */
+ assert(*send == '\0');
+
while (s < send
|| dorange /* Handle tr/// range at right edge of input */
) {
&& (range_min > 255 || ! convert_unicode)
#endif
) {
+ const STRLEN off = d - SvPVX(sv);
+ const STRLEN extra = 1 + (send - s) + 1;
+ char *e;
+
/* Move the high character one byte to the right; then
* insert between it and the range begin, an illegal
* byte which serves to indicate this is a range (using
* a '-' would be ambiguous). */
- char *e = d++;
+
+ if (off + extra > SvLEN(sv)) {
+ d = off + SvGROW(sv, off + extra);
+ max_ptr = d - off + offset_to_max;
+ }
+
+ e = d++;
while (e-- > max_ptr) {
*(e + 1) = *e;
}
* friends */
else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
if (s[2] == '#') {
- while (s+1 < send && *s != ')')
- *d++ = *s++;
+ if (s_is_utf8) {
+ PERL_UINT_FAST8_T len = UTF8SKIP(s);
+
+ while (s + len < send && *s != ')') {
+ Copy(s, d, len, U8);
+ d += len;
+ s += len;
+ len = UTF8_SAFE_SKIP(s, send);
+ }
+ }
+ else while (s+1 < send && *s != ')') {
+ *d++ = *s++;
+ }
}
else if (!PL_lex_casemods
&& ( s[2] == '{' /* This should match regcomp.c */
d += len;
s += len;
}
- else { /* UTF8ness matters and doesn't match, need to convert */
- STRLEN len = 1;
- const UV nextuv = (s_is_utf8)
- ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
- : (UV) ((U8) *s);
- STRLEN need = UVCHR_SKIP(nextuv);
-
- if (!d_is_utf8) {
- SvCUR_set(sv, d - SvPVX_const(sv));
- SvPOK_on(sv);
- *d = '\0';
+ else if (s_is_utf8) { /* UTF8ness matters: convert output to utf8 */
+ STRLEN need = send - s + 1; /* See Note on sizing above. */
- /* See Note on sizing above. */
- need += (STRLEN)(send - s) + 1;
+ SvCUR_set(sv, d - SvPVX_const(sv));
+ SvPOK_on(sv);
+ *d = '\0';
- if (utf8_variant_count == 0) {
- SvUTF8_on(sv);
- d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need);
- }
- else {
- sv_utf8_upgrade_flags_grow(sv,
- SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
- need);
- d = SvPVX(sv) + SvCUR(sv);
- }
- d_is_utf8 = TRUE;
- } else if (need > len) {
- /* encoded value larger than old, may need extra space (NOTE:
- * SvCUR() is not set correctly here). See Note on sizing
- * above. */
- const STRLEN extra = need + (send - s) + 1;
- const STRLEN off = d - SvPVX_const(sv);
+ if (utf8_variant_count == 0) {
+ SvUTF8_on(sv);
+ d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need);
+ }
+ else {
+ sv_utf8_upgrade_flags_grow(sv,
+ SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+ need);
+ d = SvPVX(sv) + SvCUR(sv);
+ }
+ d_is_utf8 = TRUE;
+ goto default_action; /* Redo, having upgraded so both are UTF-8 */
+ }
+ else { /* UTF8ness matters: convert this non-UTF8 source char to
+ UTF-8 for output. It will occupy 2 bytes, but don't include
+ the input byte since we haven't incremented 's' yet. See
+ Note on sizing above. */
+ const STRLEN off = d - SvPVX(sv);
+ const STRLEN extra = 2 + (send - s - 1) + 1;
+ if (off + extra > SvLEN(sv)) {
d = off + SvGROW(sv, off + extra);
}
- s += len;
-
- d = (char*)uvchr_to_utf8((U8*)d, nextuv);
+ *d++ = UTF8_EIGHT_BIT_HI(*s);
+ *d++ = UTF8_EIGHT_BIT_LO(*s);
+ s++;
}
} /* while loop to process each character */
+ {
+ const STRLEN off = d - SvPVX(sv);
+
+ /* See if room for the terminating NUL */
+ if (UNLIKELY(off >= SvLEN(sv))) {
+
+#ifndef DEBUGGING
+
+ if (off > SvLEN(sv))
+#endif
+ Perl_croak(aTHX_ "panic: constant overflowed allocated space,"
+ " %" UVuf " >= %" UVuf, (UV)off, (UV)SvLEN(sv));
+
+ /* Whew! Here we don't have room for the terminating NUL, but
+ * everything else so far has fit. It's not too late to grow
+ * to fit the NUL and continue on. But it is a bug, as the code
+ * above was supposed to have made room for this, so under
+ * DEBUGGING builds, we panic anyway. */
+ d = off + SvGROW(sv, off + 1);
+ }
+ }
+
/* terminate the string and set up the sv */
*d = '\0';
SvCUR_set(sv, d - SvPVX_const(sv));
- if (SvCUR(sv) >= SvLEN(sv))
- Perl_croak(aTHX_ "panic: constant overflowed allocated space, %" UVuf
- " >= %" UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
SvPOK_on(sv);
if (d_is_utf8) {
switch (PL_expect) {
case XOPERATOR:
- if (!PL_in_my || PL_lex_state != LEX_NORMAL)
+ if (!PL_in_my || (PL_lex_state != LEX_NORMAL && !PL_lex_brackets))
break;
PL_bufptr = s; /* update in case we back off */
if (*s == '=') {
}
PL_expect = XOPERATOR;
- if (PL_lex_state == LEX_NORMAL && isSPACE((char)tmp)) {
+ if ((PL_lex_state == LEX_NORMAL || PL_lex_brackets) && isSPACE((char)tmp)) {
const bool islop = (PL_last_lop == PL_oldoldbufptr);
if (!islop || PL_last_lop_op == OP_GREPSTART)
PL_expect = XOPERATOR;
if (!PL_tokenbuf[1]) {
PREREF('@');
}
- if (PL_lex_state == LEX_NORMAL)
+ if (PL_lex_state == LEX_NORMAL || PL_lex_brackets)
s = skipspace(s);
if ( (PL_expect != XREF || PL_oldoldbufptr == PL_last_lop)
&& intuit_more(s, PL_bufend))
Looks up an identifier in the pad or in a package
- is_sig indicates that this is a subroutine signature variable
+ PL_in_my == KEY_sigvar indicates that this is a subroutine signature variable
rather than a plain pad var.
Returns:
/* Returns a NUL terminated string, with the length of the string written to
*slp
*/
-STATIC char *
-S_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STRLEN *slp)
+char *
+Perl_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STRLEN *slp)
{
char *d = dest;
char * const e = d + destlen - 3; /* two-character token, ending NUL */
s = skipspace(s);
}
}
- if ((s <= PL_bufend - (is_utf8)
+ if ((s <= PL_bufend - ((is_utf8)
? UTF8SKIP(s)
- : 1)
+ : 1))
&& VALID_LEN_ONE_IDENT(s, PL_bufend, is_utf8))
{
if (is_utf8) {
PL_lex_state = LEX_INTERPEND;
PL_expect = XREF;
}
- if (PL_lex_state == LEX_NORMAL) {
+ if (PL_lex_state == LEX_NORMAL || PL_lex_brackets) {
if (ckWARN(WARN_AMBIGUOUS)
&& (keyword(dest, d - dest, 0)
|| get_cvn_flags(dest, d - dest, is_utf8
SvIVX of the SV.
*/
-STATIC char *
-S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re_reparse,
+char *
+Perl_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re_reparse,
char **delimp
)
{
const char *lastub = NULL; /* position of last underbar */
static const char* const number_too_long = "Number too long";
bool warned_about_underscore = 0;
+ I32 shift; /* shift per digit for hex/oct/bin, hoisted here for fp */
#define WARN_ABOUT_UNDERSCORE() \
do { \
if (!warned_about_underscore) { \
{
/* variables:
u holds the "number so far"
- shift the power of 2 of the base
- (hex == 4, octal == 3, binary == 1)
overflowed was the number more than we can hold?
Shift is used when we add a digit. It also serves as an "are
*/
NV n = 0.0;
UV u = 0;
- I32 shift;
bool overflowed = FALSE;
bool just_zero = TRUE; /* just plain 0 or binary number? */
+ bool has_digs = FALSE;
static const NV nvshift[5] = { 1.0, 2.0, 4.0, 8.0, 16.0 };
static const char* const bases[5] =
{ "", "binary", "", "octal", "hexadecimal" };
digit:
just_zero = FALSE;
+ has_digs = TRUE;
if (!overflowed) {
assert(shift >= 0);
x = u << shift; /* make room for the digit */
}
}
+ if (shift != 3 && !has_digs) {
+ /* 0x or 0b with no digits, treat it as an error.
+ Originally this backed up the parse before the b or
+ x, but that has the potential for silent changes in
+ behaviour, like for: "0x.3" and "0x+$foo".
+ */
+ const char *d = s;
+ char *oldbp = PL_bufptr;
+ if (*d) ++d; /* so the user sees the bad non-digit */
+ PL_bufptr = (char *)d; /* so yyerror reports the context */
+ yyerror(Perl_form(aTHX_ "No digits found for %s literal",
+ shift == 4 ? "hexadecimal" : "binary"));
+ PL_bufptr = oldbp;
+ }
+
if (overflowed) {
if (n > 4294967295.0)
Perl_ck_warner(aTHX_ packWARN(WARN_PORTABLE),
if (hexfp) {
floatit = TRUE;
*d++ = '0';
- *d++ = 'x';
- s = start + 2;
+ switch (shift) {
+ case 4:
+ *d++ = 'x';
+ s = start + 2;
+ break;
+ case 3:
+ s = start + 1;
+ break;
+ case 1:
+ *d++ = 'b';
+ s = start + 2;
+ break;
+ default:
+ NOT_REACHED; /* NOTREACHED */
+ }
}
/* read next group of digits and _ and copy into d */
}
/*
-=for apidoc Amx|void|wrap_keyword_plugin|Perl_keyword_plugin_t new_plugin|Perl_keyword_plugin_t *old_plugin_p
+=for apidoc wrap_keyword_plugin
Puts a C function into the chain of keyword plugins. This is the
preferred way to manipulate the L</PL_keyword_plugin> variable.
}
/*
-=for apidoc Amx|OP *|parse_arithexpr|U32 flags
+=for apidoc parse_arithexpr
Parse a Perl arithmetic expression. This may contain operators of precedence
down to the bit shift operators. The expression must be followed (and thus
}
/*
-=for apidoc Amx|OP *|parse_termexpr|U32 flags
+=for apidoc parse_termexpr
Parse a Perl term expression. This may contain operators of precedence
down to the assignment operators. The expression must be followed (and thus
}
/*
-=for apidoc Amx|OP *|parse_listexpr|U32 flags
+=for apidoc parse_listexpr
Parse a Perl list expression. This may contain operators of precedence
down to the comma operator. The expression must be followed (and thus
}
/*
-=for apidoc Amx|OP *|parse_fullexpr|U32 flags
+=for apidoc parse_fullexpr
Parse a single complete Perl expression. This allows the full
expression grammar, including the lowest-precedence operators such
}
/*
-=for apidoc Amx|OP *|parse_block|U32 flags
+=for apidoc parse_block
Parse a single complete Perl code block. This consists of an opening
brace, a sequence of statements, and a closing brace. The block
}
/*
-=for apidoc Amx|OP *|parse_barestmt|U32 flags
+=for apidoc parse_barestmt
Parse a single unadorned Perl statement. This may be a normal imperative
statement or a declaration that has compile-time effect. It does not
}
/*
-=for apidoc Amx|SV *|parse_label|U32 flags
+=for apidoc parse_label
Parse a single label, possibly optional, of the type that may prefix a
Perl statement. It is up to the caller to ensure that the dynamic parser
}
/*
-=for apidoc Amx|OP *|parse_fullstmt|U32 flags
+=for apidoc parse_fullstmt
Parse a single complete Perl statement. This may be a normal imperative
statement or a declaration that has compile-time effect, and may include
}
/*
-=for apidoc Amx|OP *|parse_stmtseq|U32 flags
+=for apidoc parse_stmtseq
Parse a sequence of zero or more Perl statements. These may be normal
imperative statements, including optional labels, or declarations
}
/*
+=for apidoc parse_subsignature
+
+Parse a subroutine signature declaration. This is the contents of the
+parentheses following a named or anonymous subroutine declaration when the
+C<signatures> feature is enabled. Note that this function neither expects
+nor consumes the opening and closing parentheses around the signature; it
+is the caller's job to handle these.
+
+This function must only be called during parsing of a subroutine; after
+L</start_subparse> has been called. It might allocate lexical variables on
+the pad for the current subroutine.
+
+The op tree to unpack the arguments from the stack at runtime is returned.
+This op tree should appear at the beginning of the compiled function. The
+caller may wish to use L</op_append_list> to build their function body
+after it, or splice it together with the body before calling L</newATTRSUB>.
+
+The C<flags> parameter is reserved for future use, and must always
+be zero.
+
+=cut
+*/
+
+OP *
+Perl_parse_subsignature(pTHX_ U32 flags)
+{
+ if (flags)
+ Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_subsignature");
+ return parse_recdescent_for_op(GRAMSUBSIGNATURE, LEX_FAKEEOF_NONEXPR);
+}
+
+/*
* ex: set ts=8 sts=4 sw=4 et:
*/