: ""), SVfARG(namesv));
GV *gv;
SvREFCNT_dec_NN(namesv);
- if (USE_UTF8_IN_NAMES)
+ if (is_utf8)
SvUTF8_on(err);
qerror(err);
gv = gv_fetchpvs("<none>::", GV_ADDMULTI, SVt_PVHV);
be a chain of identifiers, separated by C<::> (or by the slightly
archaic C<'>); all but the last are interpreted as names of packages,
to locate the namespace in which to look up the final identifier
-(see L<perlmod/Packages> for details). It's possible to substitute
-for a simple identifier, an expression that produces a reference
+(see L<perlmod/Packages> for details). For a more in-depth discussion
+on identifiers, see L<Identifier parsing>. It's possible to
+substitute for a simple identifier, an expression that produces a reference
to the value at runtime. This is described in more detail below
and in L<perlref>.
X<identifier>
significance to Perl. For instance, C<$$> is the current process
id.)
+=head2 Identifier parsing
+X<identifiers>
+
+Up until Perl 5.18, the actual rules of what a valid identifier
+was were a bit fuzzy. However, in general, anything defined here should
+work on previous versions of Perl, while the opposite -- edge cases
+that work in previous versions, but aren't defined here -- probably
+won't work on newer versions.
+As an important side note, please note that the following only applies
+to bareword identifiers as found in Perl source code, not identifiers
+introduced through symbolic references, which have much fewer
+restrictions.
+If working under the effect of the C<use utf8;> pragma, the following
+rules apply:
+
+ / (?[ ( \p{Word} & \p{XID_Start} ) + [_] ]) \p{XID_Continue}* /x
+
+If not under C<use utf8>, the source is treated as ASCII + 128 extra
+controls, and identifiers should match
+
+ / (?aa) (?!\d) \w+ /x
+
+That is, any word character in the ASCII range, as long as the first
+character is not a digit.
+
+There are two package separators in Perl: A double colon (C<::>) and a single
+quote (C<'>). Normal identifiers can start or end with a double colon, and
+can contain several parts delimited by double colons.
+Single quotes have similar rules, but with the exception that they are not
+legal at the end of an identifier: That is, C<$'foo> and C<$foo'bar> are
+legal, but C<$foo'bar'> are not.
+
+
+Finally, if the identifier is preceded by a sigil --
+More so, normal identifiers can start or end with any number
+of double colons (::), and can contain several parts delimited
+by double colons.
+And additionally, if the identifier is preceded by a sigil --
+that is, if the identifier is part of a variable name -- it
+may optionally be enclosed in braces.
+
+While you can mix double colons with singles quotes, the quotes must come
+after the colons: C<$::::'foo> and C<$foo::'bar> are legal, but C<$::'::foo>
+and C<$foo'::bar> are not.
+
+Put together, a grammar to match a basic identifier becomes
+
+ /
+ (?(DEFINE)
+ (?<variable>
+ (?&sigil)
+ (?:
+ (?&normal_identifier)
+ | \{ \s* (?&normal_identifier) \s* \}
+ )
+ )
+ (?<normal_identifier>
+ (?: :: )* '?
+ (?&basic_identifier)
+ (?: (?= (?: :: )+ '? | (?: :: )* ' ) (?&normal_identifier) )?
+ (?: :: )*
+ )
+ (?<basic_identifier>
+ # is use utf8 on?
+ (?(?{ (caller(0))[8] & $utf8::hint_bits })
+ (?&Perl_XIDS) \p{XID_Continue}*
+ | (?aa) (?!\d) \w+
+ )
+ )
+ (?<sigil> [&*\$\@\%])
+ (?<Perl_XIDS> (?[ ( \p{Word} & \p{XID_Start} ) + [_] ]) )
+ )
+ /x
+
+Meanwhile, special identifiers don't follow the above rules; For the most
+part, all of the identifiers in this category have a special meaning given
+by Perl. Because they have special parsing rules, these generally can't be
+fully-qualified. They come in four forms:
+
+=over
+
+=item A sigil, followed solely by digits matching \p{POSIX_Digit}, like C<$0>,
+C<$1>, or C<$10000>.
+
+=item A sigil, followed by either a caret and a single POSIX uppercase letter,
+like C<$^V> or C<$^W>, or a sigil followed by a literal control character
+matching the C<\p{POSIX_Cntrl}> property. Due to a historical oddity, if not
+running under C<use utf8>, the 128 extra controls in the C<[0x80-0xff]> range
+may also be used in length one variables.
+
+=item Similar to the above, a sigil, followed by bareword text in brackets,
+where the first character is either a caret followed by an uppercase letter,
+or a literal control, like C<${^GLOBAL_PHASE}> or C<${\7LOBAL_PHASE}>.
+
+=item A sigil followed by a single character matching the C<\p{POSIX_Punct}>
+property, like C<$!> or C<%+>.
+
+=back
+
=head2 Context
X<context> X<scalar context> X<list context>
=head1 Incompatible Changes
+=head2 Explicit rules for variable names and identifiers
+
+Due to an oversight, length-one variable names in 5.16 were completely
+unrestricted, and opened the door to several kinds of insanity. As of
+5.18, these now follow the rules of other identifiers, in addition
+to accepting characters that match the \p{POSIX_Punct} property.
+
There are no longer any differences in the parsing of identifiers specified
as $... or ${...}; previously, they were dealt with in different parts of
the core, and so had slightly different behavior. For instance,
Can't use global $! in "my" at - line 1, near "my $!"
Execution of - aborted due to compilation errors.
########
-# NAME my $<special_unicode>
-use utf8;
-BEGIN { binmode STDERR, ":utf8" }
-my $♠;
-EXPECT
-Can't use global $♠ in "my" at - line 3, near "my $♠"
-Execution of - aborted due to compilation errors.
-########
# NAME OP_HELEM fields
package Foo;
use fields qw(a b);
use 5.016;
use utf8;
use open qw( :utf8 :std );
+no warnings qw(misc reserved);
-plan (tests => 5);
+plan (tests => 65850);
# ${single:colon} should not be valid syntax
{
}
# The first character in ${...} should respect the rules
-TODO: {
- local $::TODO = "Fixed by the next commit";
+{
local $@;
use utf8;
eval '${☭asd} = 1';
like($@, qr/\QUnrecognized character/, q(the first character in ${...} isn't special))
}
+
+# Checking that at least some of the special variables work
+for my $v (qw( ^V ; < > ( ) {^GLOBAL_PHASE} ^W _ 1 4 0 [ ] ! @ / \ = )) {
+ local $@;
+ evalbytes "\$$v;";
+ is $@, '', "No syntax error for \$$v";
+
+ local $@;
+ eval "use utf8; \$$v;";
+ is $@, '', "No syntax error for \$$v under use utf8";
+}
+
+# Checking if the Latin-1 range behaves as expected, and that the behavior is the
+# same whenever under strict or not.
+for ( 0x80..0xff ) {
+ no warnings 'closure';
+ my $chr = chr;
+ my $esc = sprintf("%X", ord $chr);
+ utf8::downgrade($chr);
+ if ($chr !~ /\p{XIDS}/u) {
+ is evalbytes "no strict; \$$chr = 10",
+ 10,
+ sprintf("\\x%02x, part of the latin-1 range, is legal as a length-1 variable", $_);
+
+ utf8::upgrade($chr);
+ local $@;
+ eval "no strict; use utf8; \$$chr = 1";
+ like $@,
+ qr/\QUnrecognized character \x{\E\L$esc/,
+ sprintf("..but is illegal as a length-1 variable under use utf8", $_);
+ }
+ else {
+ {
+ no utf8;
+ local $@;
+ evalbytes "no strict; \$$chr = 1";
+ is($@, '', sprintf("\\x%02x, =~ \\p{XIDS}, latin-1, no utf8, no strict, is a valid length-1 variable", $_));
+
+ local $@;
+ evalbytes "use strict; \$$chr = 1";
+ is($@,
+ '',
+ sprintf("\\x%02x under no utf8 does not have to be required under strict, even though it matches XIDS", $_)
+ );
+
+ local $@;
+ evalbytes "\$a$chr = 1";
+ like($@,
+ qr/Unrecognized character /,
+ sprintf("...but under no utf8, it's not allowed in two-or-more character variables")
+ );
+
+ local $@;
+ evalbytes "\$a$chr = 1";
+ like($@,
+ qr/Unrecognized character /,
+ sprintf("...but under no utf8, it's not allowed in two-or-more character variables")
+ );
+ }
+ {
+ use utf8;
+ my $u = $chr;
+ utf8::upgrade($u);
+ local $@;
+ eval "no strict; \$$u = 1";
+ is($@, '', sprintf("\\x%02x, =~ \\p{XIDS}, UTF-8, use utf8, no strict, is a valid length-1 variable", $_));
+
+ local $@;
+ eval "use strict; \$$u = 1";
+ like($@,
+ qr/Global symbol "\$$u" requires explicit package name/,
+ sprintf("\\x%02x under utf8 has to be required under strict", $_)
+ );
+ }
+ }
+}
+
+{
+ use utf8;
+ my $ret = eval "my \$c\x{327} = 100; \$c\x{327}"; # c + cedilla
+ is($@, '', "ASCII character + combining character works as a variable name");
+ is($ret, 100, "...and returns the correct value");
+}
+
+# From Tom Christiansen's 'highly illegal variable names are now accidentally legal' mail
+for my $chr (
+ "\N{EM DASH}", "\x{F8FF}", "\N{POUND SIGN}", "\N{SOFT HYPHEN}",
+ "\N{THIN SPACE}", "\x{11_1111}", "\x{DC00}", "\N{COMBINING DIAERESIS}",
+ "\N{COMBINING ENCLOSING CIRCLE BACKSLASH}",
+ )
+{
+ no warnings 'non_unicode';
+ my $esc = sprintf("%x", ord $chr);
+ local $@;
+ eval "\$$chr = 1; \$$chr";
+ like($@,
+ qr/\QUnrecognized character \x{$esc};/,
+ "\\x{$esc} is illegal for a length-one identifier"
+ );
+}
+
+for my $i (0x100..0xffff) {
+ my $chr = chr($i);
+ my $esc = sprintf("%x", $i);
+ local $@;
+ eval "my \$$chr = q<test>; \$$chr;";
+ if ( $chr =~ /^\p{_Perl_IDStart}$/ ) {
+ is($@, '', sprintf("\\x{%04x} is XIDS, works as a length-1 variable", $i));
+ }
+ else {
+ like($@,
+ qr/\QUnrecognized character \x{$esc};/,
+ "\\x{$esc} isn't XIDS, illegal as a length-1 variable",
+ )
+ }
+}
\ No newline at end of file
#endif
switch (*s) {
default:
- if (isIDFIRST_lazy_if(s,UTF))
+ if (UTF ? isIDFIRST_utf8((U8*)s) : isALNUMC(*s))
goto keylookup;
{
SV *dsv = newSVpvs_flags("", SVs_TEMP);
for (d = s; isWORDCHAR_lazy_if(d,UTF);) {
d += UTF ? UTF8SKIP(d) : 1;
if (UTF) {
- while (UTF8_IS_CONTINUED(*d) && _is_utf8_mark((U8*)d)) {
+ while (isIDCONT_utf8((U8*)d)) {
d += UTF ? UTF8SKIP(d) : 1;
}
}
for (;;) {
if (*d >= e)
Perl_croak(aTHX_ "%s", ident_too_long);
- if (isWORDCHAR(**s)) /* UTF handled below */
- *(*d)++ = *(*s)++;
- else if (is_utf8 && UTF8_IS_START(**s) && isWORDCHAR_utf8((U8*)*s)) {
+ if (is_utf8 && isIDFIRST_utf8((U8*)*s)) {
+ /* The UTF-8 case must come first, otherwise things
+ * like c\N{COMBINING TILDE} would start failing, as the
+ * isWORDCHAR_A case below would gobble the 'c' up.
+ */
+
char *t = *s + UTF8SKIP(*s);
- while (UTF8_IS_CONTINUED(*t) && _is_utf8_mark((U8*)t))
+ while (isIDCONT_utf8((U8*)t))
t += UTF8SKIP(t);
if (*d + (t - *s) > e)
Perl_croak(aTHX_ "%s", ident_too_long);
*d += t - *s;
*s = t;
}
+ else if ( isWORDCHAR_A(**s) ) {
+ do {
+ *(*d)++ = *(*s)++;
+ } while isWORDCHAR_A(**s);
+ }
else if (allow_package && **s == '\'' && isIDFIRST_lazy_if(*s+1,is_utf8)) {
*(*d)++ = ':';
*(*d)++ = ':';
return s;
}
if (*s == '$' && s[1] &&
- (isWORDCHAR_lazy_if(s+1,is_utf8) || s[1] == '$' || s[1] == '{' || strnEQ(s+1,"::",2)) )
+ (isIDFIRST_lazy_if(s+1,is_utf8) || s[1] == '$' || s[1] == '{' || strnEQ(s+1,"::",2)) )
{
return s;
}
bracket = s;
s++;
}
- if (s < send) {
+
+#define VALID_LEN_ONE_IDENT(d, u) (isPUNCT_A((U8)*(d)) \
+ || isCNTRL_A((U8)*(d)) \
+ || isDIGIT_A((U8)*(d)) \
+ || (!(u) && !UTF8_IS_INVARIANT((U8)*(d))))
+ if (s < send
+ && (isIDFIRST_lazy_if(s, is_utf8) || VALID_LEN_ONE_IDENT(s, is_utf8)))
+ {
if (is_utf8) {
const STRLEN skip = UTF8SKIP(s);
STRLEN i;