From: Karl Williamson Date: Thu, 15 Jul 2010 23:28:28 +0000 (-0600) Subject: Add \o{} escape X-Git-Tag: v5.13.3~56 X-Git-Url: https://perl5.git.perl.org/perl5.git/commitdiff_plain/f0a2b745ce6c03aec6412d79ce0b782f20eddce4?ds=sidebyside Add \o{} escape This commit adds the new construct \o{} to express a character constant by its octal ordinal value, along with ancillary tests and documentation. A function to handle this is added to util.c, and it is called from the 3 parsing places it could occur. The function is a candidate for in-lining, though I doubt that it will ever be used frequently. --- diff --git a/embed.fnc b/embed.fnc index 8493dd7..37c7f2b 100644 --- a/embed.fnc +++ b/embed.fnc @@ -639,6 +639,7 @@ p |OP* |localize |NN OP *o|I32 lex ApdR |I32 |looks_like_number|NN SV *const sv Apd |UV |grok_bin |NN const char* start|NN STRLEN* len_p|NN I32* flags|NULLOK NV *result EXpR |char |grok_bslash_c |const char source|const bool output_warning +EXpR |char* |grok_bslash_o |NN const char* s|NN UV* uv|NN STRLEN* len|const bool output_warning Apd |UV |grok_hex |NN const char* start|NN STRLEN* len_p|NN I32* flags|NULLOK NV *result Apd |int |grok_number |NN const char *pv|STRLEN len|NULLOK UV *valuep ApdR |bool |grok_numeric_radix|NN const char **sp|NN const char *send diff --git a/embed.h b/embed.h index 8fb3cbe..fffdede 100644 --- a/embed.h +++ b/embed.h @@ -463,6 +463,7 @@ #define grok_bin Perl_grok_bin #if defined(PERL_CORE) || defined(PERL_EXT) #define grok_bslash_c Perl_grok_bslash_c +#define grok_bslash_o Perl_grok_bslash_o #endif #define grok_hex Perl_grok_hex #define grok_number Perl_grok_number @@ -2909,6 +2910,7 @@ #define grok_bin(a,b,c,d) Perl_grok_bin(aTHX_ a,b,c,d) #if defined(PERL_CORE) || defined(PERL_EXT) #define grok_bslash_c(a,b) Perl_grok_bslash_c(aTHX_ a,b) +#define grok_bslash_o(a,b,c,d) Perl_grok_bslash_o(aTHX_ a,b,c,d) #endif #define grok_hex(a,b,c,d) Perl_grok_hex(aTHX_ a,b,c,d) #define grok_number(a,b,c) Perl_grok_number(aTHX_ a,b,c) diff --git a/global.sym b/global.sym index aa61a69..3323815 100644 --- a/global.sym +++ b/global.sym @@ -283,6 +283,7 @@ Perl_vload_module Perl_looks_like_number Perl_grok_bin Perl_grok_bslash_c +Perl_grok_bslash_o Perl_grok_hex Perl_grok_number Perl_grok_numeric_radix diff --git a/pod/perl5133delta.pod b/pod/perl5133delta.pod index 476427e..d4db338 100644 --- a/pod/perl5133delta.pod +++ b/pod/perl5133delta.pod @@ -28,6 +28,17 @@ here, but most should go in the L section. [ List each enhancement as a =head2 entry ] +=head2 \o{...} + +The escape sequence C<"\o"> in double-quotish contexts is now defined. It +must be followed by braces enclosing an octal number of at least one digit. It +means the character whose ordinal value is that octal number. This construct +allows large octal ordinals beyond the current max of 0777 to be represented. +It also allows you to specify a character in octal which can safely be +concatenated with other regex snippets without danger of changing its meaning, +and one which won't ever be confused with being a backreference to a regex +capture group. See L + =head2 C<\N{I}> and C enhancements C<\N{}> and C now know about the abbreviated character @@ -84,7 +95,9 @@ anomalous behavior than their use in all other double-quotish contexts. Since all double-quotish contexts have the same behavior, namely to be equivalent to C<\x{100}> - C<\x{1FF}>, with no deprecation warning. Use of these values in the command line option C<"-0"> retains the current meaning to slurp input files -whole; previously, this was documented only for C<"-0777">. +whole; previously, this was documented only for C<"-0777">. It is recommended, +however, because of various ambiguities, to use the new L construct +to represent characters in octal. =head1 Deprecations diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 9e8a287..9f9fe4b 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -2510,6 +2510,10 @@ comment) between the C<\N> and the C<{> in a regex with the C modifier. This modifier does not change the requirement that the brace immediately follow the C<\N>. +=item Missing braces on \o{} + +(F) A C<\o> must be followed immediately by a C<{> in double-quotish context. + =item Missing comma after first argument to %s function (F) While certain functions allow you to specify a filehandle or an @@ -2978,6 +2982,11 @@ to UTC. If it's not, define the logical name F to translate to the number of seconds which need to be added to UTC to get local time. +=item Non-octal character '%c'. Resolved as "%s" + +(W digit) In parsing an octal numeric constant, a character was unexpectedly +encountered that isn't octal. The resulting value is as indicated. + =item Non-string passed as bitmask (W misc) A number has been passed as a bitmask argument to select(). @@ -3020,6 +3029,11 @@ versions of Perl are likely to eliminate this arbitrary limitation. In the meantime, try using scientific notation (e.g. "1e6" instead of "1_000_000"). +=item Number with no digits + +(F) Perl was looking for a number but found nothing that looked like a number. +This happens, for example with C<\o{}>, with no number between the braces. + =item Octal number in vector unsupported (F) Numbers with a leading C<0> are not currently allowed in vectors. diff --git a/pod/perlre.pod b/pod/perlre.pod index 5ea15a0..2e00f0b 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -229,11 +229,11 @@ also work: \f form feed (FF) \a alarm (bell) (BEL) \e escape (think troff) (ESC) - \033 octal char (example: ESC) \cK control char (example: VT) \x{}, \x00 character whose ordinal is the given hexadecimal number \N{name} named Unicode character \N{U+263D} Unicode character (example: FIRST QUARTER MOON) + \o{}, \000 character whose ordinal is the given octal number \l lowercase next char (think vi) \u uppercase next char (think vi) \L lowercase till \E (think vi) diff --git a/pod/perlrebackslash.pod b/pod/perlrebackslash.pod index 9d246bd..d460f7f 100644 --- a/pod/perlrebackslash.pod +++ b/pod/perlrebackslash.pod @@ -62,7 +62,7 @@ quoted constructs>. Those not usable within a bracketed character class (like C<[\da-z]>) are marked as C - \000 Octal escape sequence. + \000 Octal escape sequence. See also \o{}. \1 Absolute backreference. Not in []. \a Alarm or bell. \A Beginning of string. Not in []. @@ -86,6 +86,7 @@ as C \n (Logical) newline character. \N Any character but newline. Experimental. Not in []. \N{} Named or numbered (Unicode) character. + \o{} Octal escape sequence. \p{}, \pP Character with the given Unicode property. \P{}, \PP Character without the given Unicode property. \Q Quotemeta till \E. Not in []. @@ -207,33 +208,57 @@ match "as is". =head3 Octal escapes -Octal escapes consist of a backslash followed by three octal digits -matching the code point of the character you want to use. (In some contexts, -two or even one octal digits are also accepted, sometimes with a warning.) This -allows for 512 characters (C<\000> up to C<\777>) that can be expressed this -way. Enough in pre-Unicode days, -but most Unicode characters cannot be escaped this way. +There are two forms of octal escapes. Each is used to specify a character by +its ordinal, specified in octal notation. + +One form, available starting in Perl 5.14 looks like C<\o{...}>, where the dots +represent one or more octal digits. It can be used for any Unicode character. + +It was introduced to avoid the potential problems with the other form, +available in all Perls. That form consists of a backslash followed by three +octal digits. One problem with this form is that it can look exactly like an +old-style backreference (see +L +below.) You can avoid this by making the first of the three digits always a +zero, but that makes \077 the largest ordinal unambiguously specifiable by this +form. + +In some contexts, a backslash followed by two or even one octal digits may be +interpreted as an octal escape, sometimes with a warning, and because of some +bugs, sometimes with surprising results. Also, if you are creating a regex +out of smaller snippets concatentated together, and you use fewer than three +digits, the beginning of one snippet may be interpreted as adding digits to the +ending of the snippet before it. See L for more +discussion and examples of the snippet problem. Note that a character that is expressed as an octal escape is considered as a character without special meaning by the regex engine, and will match "as is". -=head4 Examples (assuming an ASCII platform) +To summarize, the C<\o{}> form is always safe to use, and the other form is +safe to use for ordinals up through \077 when you use exactly three digits to +specify them. - $str = "Perl"; - $str =~ /\120/; # Match, "\120" is "P". - $str =~ /\120+/; # Match, "\120" is "P", it is repeated at least once - $str =~ /P\053/; # No match, "\053" is "+" and taken literally. +Mnemonic: I<0>ctal or Ictal. -=head4 Caveat +=head4 Examples (assuming an ASCII platform) -Octal escapes potentially clash with old-style backreferences (see L below). They both consist of a backslash followed by numbers. So -Perl has to use heuristics to determine whether it is a backreference or an -octal escape. You can avoid ambiguity by using the C<\g> form for -backreferences, and by beginning octal escapes with a "0". (Since octal -escapes are 3 digits, this latter method works only up to C<\077>.) In the -absence of C<\g>, Perl uses the following rules: + $str = "Perl"; + $str =~ /\o{120}/; # Match, "\120" is "P". + $str =~ /\120/; # Same. + $str =~ /\o{120}+/; # Match, "\120" is "P", it's repeated at least once + $str =~ /\120+/; # Same. + $str =~ /P\053/; # No match, "\053" is "+" and taken literally. + /\o{23073}/ # Black foreground, white background smiling face. + /\o{4801234567}/ # Raises a warning, and yields chr(4) + +=head4 Disambiguation rules between old-style octal escapes and backreferences + +Octal escapes of the C<\000> form outside of bracketed character classes +potentially clash with old-style backreferences. (see L +below). They both consist of a backslash followed by numbers. So Perl has to +use heuristics to determine whether it is a backreference or an octal escape. +Perl uses the following rules to disambiguate: =over 4 @@ -258,18 +283,24 @@ the rest are matched as is. $pat .= ")" x 999; /^($pat)\1000$/; # Matches 'aa'; there are 1000 capture groups. /^$pat\1000$/; # Matches 'a@0'; there are 999 capture groups - # and \1000 is seen as \100 (a '@') and a '0'. + # and \1000 is seen as \100 (a '@') and a '0' =back +You can the force a backreference interpretation always by using the C<\g{...}> +form. You can the force an octal interpretation always by using the C<\o{...}> +form, or for numbers up through \077 (= 63 decimal), by using three digits, +beginning with a "0". + =head3 Hexadecimal escapes -Hexadecimal escapes start with C<\x> and are then either followed by a -two digit hexadecimal number, or a hexadecimal number of arbitrary length -surrounded by curly braces. The hexadecimal number is the code point of -the character you want to express. +Like octal escapes, there are two forms of hexadecimal escapes, but both start +with the same thing, C<\x>. This is followed by either exactly two hexadecimal +digits forming a number, or a hexadecimal number of arbitrary length surrounded +by curly braces. The hexadecimal number is the code point of the character you +want to express. -Note that a character that is expressed as a hexadecimal escape is considered +Note that a character that is expressed as one of these escapes is considered as a character without special meaning by the regex engine, and will match "as is". diff --git a/pod/perlretut.pod b/pod/perlretut.pod index eae266a..f218717 100644 --- a/pod/perlretut.pod +++ b/pod/perlretut.pod @@ -184,7 +184,8 @@ bytes. Here are some examples of escapes: "1000\t2000" =~ m(0\t2) # matches "1000\n2000" =~ /0\n20/ # matches "1000\t2000" =~ /\000\t2/ # doesn't match, "0" ne "\000" - "cat" =~ /\143\x61\x74/ # matches in ASCII, but a weird way to spell cat + "cat" =~ /\o{143}\x61\x74/ # matches in ASCII, but a weird way + # to spell cat If you've been around Perl a while, all this talk of escape sequences may seem familiar. Similar escape sequences are used in double-quoted @@ -1876,9 +1877,9 @@ much about Perl's internal representation of strings. But they do need to know 1) how to represent Unicode characters in a regexp and 2) that a matching operation will treat the string to be searched as a sequence of characters, not bytes. The answer to 1) is that Unicode characters -greater than C are represented using the C<\x{hex}> notation, -because the \0 octal and \x hex (without curly braces) don't go further -than 255. +greater than C are represented using the C<\x{hex}> notation, because +\x hex (without curly braces) doesn't go further than 255. (Starting in Perl +5.14) if you're an octal fan, you can also use C<\o{oct}>. /\x{263a}/; # match a Unicode smiley face :) diff --git a/proto.h b/proto.h index 6a5110e..1fc1180 100644 --- a/proto.h +++ b/proto.h @@ -1633,6 +1633,14 @@ PERL_CALLCONV UV Perl_grok_bin(pTHX_ const char* start, STRLEN* len_p, I32* flag PERL_CALLCONV char Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning) __attribute__warn_unused_result__; +PERL_CALLCONV char* Perl_grok_bslash_o(pTHX_ const char* s, UV* uv, STRLEN* len, const bool output_warning) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1) + __attribute__nonnull__(pTHX_2) + __attribute__nonnull__(pTHX_3); +#define PERL_ARGS_ASSERT_GROK_BSLASH_O \ + assert(s); assert(uv); assert(len) + PERL_CALLCONV UV Perl_grok_hex(pTHX_ const char* start, STRLEN* len_p, I32* flags, NV *result) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2) diff --git a/regcomp.c b/regcomp.c index 72af569..74f996b 100644 --- a/regcomp.c +++ b/regcomp.c @@ -7362,6 +7362,7 @@ tryagain: register UV ender; register char *p; char *s; + char *error_msg; STRLEN foldlen; U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; @@ -7462,6 +7463,26 @@ tryagain: ender = ASCII_TO_NATIVE('\007'); p++; break; + case 'o': + { + STRLEN brace_len = len; + if ((error_msg = grok_bslash_o(p, + &ender, + &brace_len, + SIZE_ONLY)) + != NULL) + { + vFAIL(error_msg); + } + p += brace_len; + if (PL_encoding && ender < 0x100) { + goto recode_encoding; + } + if (ender > 0xff) { + RExC_utf8 = 1; + } + break; + } case 'x': if (*++p == '{') { char* const e = strchr(p, '}'); @@ -7971,6 +7992,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) parseit: while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { + char* error_msg; charclassloop: @@ -8077,6 +8099,21 @@ parseit: case 'b': value = '\b'; break; case 'e': value = ASCII_TO_NATIVE('\033');break; case 'a': value = ASCII_TO_NATIVE('\007');break; + case 'o': + RExC_parse--; /* function expects to be pointed at the 'o' */ + if ((error_msg = grok_bslash_o(RExC_parse, + &value, + &numlen, + SIZE_ONLY)) + != NULL) + { + vFAIL(error_msg); + } + RExC_parse += numlen; + if (PL_encoding && value < 0x100) { + goto recode_encoding; + } + break; case 'x': if (*RExC_parse == '{') { I32 flags = PERL_SCAN_ALLOW_UNDERSCORES diff --git a/t/lib/warnings/regcomp b/t/lib/warnings/regcomp index f85aa44..3f80ccc 100644 --- a/t/lib/warnings/regcomp +++ b/t/lib/warnings/regcomp @@ -207,3 +207,33 @@ Useless (?-c) - don't use /gc modifier in regex; marked by <-- HERE in m/(?o-c < Useless (?o) - use /o modifier in regex; marked by <-- HERE in m/(?o <-- HERE gc)/ at - line 12. Useless (?g) - use /g modifier in regex; marked by <-- HERE in m/(?og <-- HERE c)/ at - line 12. Useless (?c) - use /gc modifier in regex; marked by <-- HERE in m/(?ogc <-- HERE )/ at - line 12. +######## +# regcomp.c [S_regatom] +$a = qr/\o{/; +EXPECT +Missing right brace on \o{ in regex; marked by <-- HERE in m/\ <-- HERE o{/ at - line 2. +######## +# regcomp.c [S_regatom] +$a = qr/\o/; +EXPECT +Missing braces on \o{} in regex; marked by <-- HERE in m/\ <-- HERE o/ at - line 2. +######## +# regcomp.c [S_regatom] +$a = qr/\o{}/; +EXPECT +Number with no digits in regex; marked by <-- HERE in m/\ <-- HERE o{}/ at - line 2. +######## +# regcomp.c [S_regclass] +$a = qr/[\o{]/; +EXPECT +Missing right brace on \o{ in regex; marked by <-- HERE in m/[\ <-- HERE o{]/ at - line 2. +######## +# regcomp.c [S_regclass] +$a = qr/[\o]/; +EXPECT +Missing braces on \o{} in regex; marked by <-- HERE in m/[\ <-- HERE o]/ at - line 2. +######## +# regcomp.c [S_regclass] +$a = qr/[\o{}]/; +EXPECT +Number with no digits in regex; marked by <-- HERE in m/[\ <-- HERE o{}]/ at - line 2. diff --git a/t/lib/warnings/toke b/t/lib/warnings/toke index 4bb131f..076270c 100644 --- a/t/lib/warnings/toke +++ b/t/lib/warnings/toke @@ -966,3 +966,26 @@ Use of := for an empty attribute list is deprecated at - line 36. Use of := for an empty attribute list is deprecated at - line 38. Use of := for an empty attribute list is deprecated at - line 41. Use of := for an empty attribute list is deprecated at - line 42. +######## +# toke.c +use warnings 'syntax' ; +my $a = "\o"; +my $a = "\o{"; +my $a = "\o{}"; +no warnings 'syntax' ; +my $a = "\o"; +my $a = "\o{"; +my $a = "\o{}"; +EXPECT +Missing braces on \o{} at - line 3, within string +Missing right brace on \o{ at - line 4, within string +Number with no digits at - line 5, within string +BEGIN not safe after errors--compilation aborted at - line 6. +######## +# toke.c +use warnings 'digit' ; +my $a = "\o{1238456}"; +no warnings 'digit' ; +my $a = "\o{1238456}"; +EXPECT +Non-octal character '8'. Resolved as "\o{123}" at - line 3. diff --git a/t/op/qq.t b/t/op/qq.t index 7b75b9b..0136608 100644 --- a/t/op/qq.t +++ b/t/op/qq.t @@ -5,7 +5,7 @@ BEGIN { @INC = '../lib'; } -print q(1..26 +print q(1..29 ); # This is() function is written to avoid "" @@ -64,6 +64,9 @@ is ("\x{10FFFD}", chr 1114109); is ("\400", chr 0x100); is ("\600", chr 0x180); is ("\777", chr 0x1FF); +is ("a\o{120}b", "a" . chr(0x50) . "b"); +is ("a\o{400}b", "a" . chr(0x100) . "b"); +is ("a\o{1000}b", "a" . chr(0x200) . "b"); # These kludged tests should change when we remove the temporary fatal error # in util.c for "\c{". And, the warning there should probably not be diff --git a/t/re/re_tests b/t/re/re_tests index fc29fb6..36a2f4c 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1460,5 +1460,11 @@ abc\N{def - c - \\N{NAME} must be resolved by the lexer [a\400] \x{100} y $& \x{100} [b\600] \x{180} y $& \x{180} [c\777] \x{1FF} y $& \x{1FF} +\o{120} \x{50} y $& \x{50} +\o{400} \x{100} y $& \x{100} +\o{1000} \x{200} y $& \x{200} +[a\o{120}] \x{50} y $& \x{50} +[a\o{400}] \x{100} y $& \x{100} +[a\o{1000}] \x{200} y $& \x{200} # vim: softtabstop=0 noexpandtab diff --git a/toke.c b/toke.c index b7b33e8..75fb327 100644 --- a/toke.c +++ b/toke.c @@ -2879,6 +2879,20 @@ S_scan_const(pTHX_ char *start) } goto NUM_ESCAPE_INSERT; + /* eg. \o{24} indicates the octal constant \024 */ + case 'o': + { + STRLEN len; + + char* error = grok_bslash_o(s, &uv, &len, 1); + s += len; + if (error) { + yyerror(error); + continue; + } + goto NUM_ESCAPE_INSERT; + } + /* eg. \x24 indicates the hex constant 0x24 */ case 'x': ++s; diff --git a/util.c b/util.c index b3b385e..6fdc653 100644 --- a/util.c +++ b/util.c @@ -3904,7 +3904,7 @@ Perl_report_evil_fh(pTHX_ const GV *gv, const IO *io, I32 op) char Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning) { - + U8 result; if (! isASCII(source)) { @@ -3935,6 +3935,72 @@ Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning) return result; } +char * +Perl_grok_bslash_o(pTHX_ const char *s, UV *uv, STRLEN *len, const bool output_warning) +{ + +/* Documentation to be supplied when interface nailed down finally + * This returns NULL on success, otherwise a pointer to an internal constant + * error message. On input: + * s points to a string that begins with o, and the previous character was + * a backslash. + * uv points to a UV that will hold the output value + * len will point to the next character in the string past the end of this + * construct + * output_warning says whether to output any warning messages, or suppress + * them + */ + char* e; + STRLEN numbers_len; + I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + /* XXX Until the message is improved in grok_oct, handle errors + * ourselves */ + | PERL_SCAN_SILENT_ILLDIGIT; + + PERL_ARGS_ASSERT_GROK_BSLASH_O; + + + assert(*s == 'o'); + s++; + + if (*s != '{') { + *len = 1; /* Move past the o */ + return "Missing braces on \\o{}"; + } + + e = strchr(s, '}'); + if (!e) { + *len = 2; /* Move past the o{ */ + return "Missing right brace on \\o{"; + } + + /* Return past the '}' no matter what is inside the braces */ + *len = e - s + 2; /* 2 = 1 for the o + 1 for the '}' */ + + s++; /* Point to first digit */ + + numbers_len = e - s; + if (numbers_len == 0) { + return "Number with no digits"; + } + + *uv = NATIVE_TO_UNI(grok_oct(s, &numbers_len, &flags, NULL)); + /* Note that if has non-octal, will ignore everything starting with that up + * to the '}' */ + + if (output_warning && numbers_len != (STRLEN) (e - s)) { + Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT), + /* diag_listed_as: Non-octal character '%c'. Resolved as "%s" */ + "Non-octal character '%c'. Resolved as \"\\o{%.*s}\"", + *(s + numbers_len), + (int) numbers_len, + s); + } + + return NULL; +} + /* To workaround core dumps from the uninitialised tm_zone we get the * system to give us a reasonable struct to copy. This fix means that * strftime uses the tm_zone and tm_gmtoff values returned by