This brings Perl regular expressions more into conformance with Unicode.
/x now accepts 5 additional characters as white space. Use of these
characters as literals under /x has been deprecated since 5.18, so now
we are free to change what they mean.
This commit eliminates the static function that processes the old
whitespace definition (and a generated macro that was used only for
this), using the already existing one for the new definition. It
refactors slightly the static function that skips comments to mesh
better with the needs of its callers, and calls it in one place where
before the code was essentially duplicated.
p5p discussion starting in
http://nntp.perl.org/group/perl.perl5.porters/214726 convinced me that
the (?[ ]) comments should be terminated the same way as regular /x
comments, and this was also done in this commit. No prior notice is
necessary as this is an experimental feature.
|NN regnode *scan|NN UV *min_subtract \
|NN bool *unfolded_multi_char \
|U32 flags|NULLOK regnode *val|U32 depth
-EsRn |char * |regwhite |NN RExC_state_t *pRExC_state \
- |NN char *p
-EsRn |char * |regpatws |NN RExC_state_t *pRExC_state \
+EsR |char * |regpatws |NN RExC_state_t *pRExC_state \
|NN char *p|const bool recognize_comment
Ei |void |alloc_maybe_populate_EXACT|NN RExC_state_t *pRExC_state \
|NN regnode *node|NN I32 *flagp|STRLEN len \
|UV code_point|bool downgradable
Ei |U8 |compute_EXACTish|NN RExC_state_t *pRExC_state
Es |char * |nextchar |NN RExC_state_t *pRExC_state
-Es |bool |reg_skipcomment|NN RExC_state_t *pRExC_state
+Ei |char * |reg_skipcomment|NN RExC_state_t *pRExC_state|NN char * p
Es |void |scan_commit |NN const RExC_state_t *pRExC_state \
|NN struct scan_data_t *data \
|NN SSize_t *minlenp \
#define reg_node(a,b) S_reg_node(aTHX_ a,b)
#define reg_recode(a,b) S_reg_recode(aTHX_ a,b)
#define reg_scan_name(a,b) S_reg_scan_name(aTHX_ a,b)
-#define reg_skipcomment(a) S_reg_skipcomment(aTHX_ a)
+#define reg_skipcomment(a,b) S_reg_skipcomment(aTHX_ a,b)
#define reganode(a,b,c) S_reganode(aTHX_ a,b,c)
#define regatom(a,b,c) S_regatom(aTHX_ a,b,c)
#define regbranch(a,b,c,d) S_regbranch(aTHX_ a,b,c,d)
#define regclass(a,b,c,d,e,f,g) S_regclass(aTHX_ a,b,c,d,e,f,g)
#define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d)
-#define regpatws S_regpatws
+#define regpatws(a,b,c) S_regpatws(aTHX_ a,b,c)
#define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c)
#define regpposixcc(a,b,c) S_regpposixcc(aTHX_ a,b,c)
#define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d)
#define reguni(a,b,c) S_reguni(aTHX_ a,b,c)
-#define regwhite S_regwhite
#define scan_commit(a,b,c,d) S_scan_commit(aTHX_ a,b,c,d)
#define set_ANYOF_arg(a,b,c,d,e,f,g) S_set_ANYOF_arg(aTHX_ a,b,c,d,e,f,g)
#define ssc_add_range(a,b,c) S_ssc_add_range(aTHX_ a,b,c)
[ List each enhancement as a =head2 entry ]
+=head2 C<qr/foo/x> now ignores any Unicode pattern white space
+
+The C</x> regular expression modifier allows the pattern to contain
+white space and comments, both of which are ignored, for improved
+readability. Until now, not all the white space characters that Unicode
+designates for this purpose were handled. The additional ones now
+recognized are
+U+0085 NEXT LINE,
+U+200E LEFT-TO-RIGHT MARK,
+U+200F RIGHT-TO-LEFT MARK,
+U+2028 LINE SEPARATOR,
+and
+U+2029 PARAGRAPH SEPARATOR.
+
=head1 Security
XXX Any security-related notices go here. In particular, any security
These had been deprecated since v5.18.
+=head2 5 additional characters are treated as white space under C</x> in
+regex patterns (unless escaped)
+
+The use of these characters with C</x> outside bracketed character
+classes and when not preceeded by a backslash has raised a deprecation
+warning since v5.18. Now they will be ignored. See L</qrE<sol>fooE<sol>x>
+for the list of the five characters.
+
+=head2 Comment lines within S<C<(?[ ])>> now are ended only by a C<\n>
+
+S<C<(?[ ])>> is an experimental feature, introduced in v5.18. It operates
+as if C</x> is always enabled. But there was a difference, comment
+lines (following a C<#> character) were terminated by anything matching
+C<\R> which includes all vertical whitespace, such as form feeds. For
+consistency, this is now changed to match what terminates comment lines
+outside S<C<(?[ ])>>, namely a C<\n> (even if escaped), which is the
+same as what terminates a heredoc string and formats.
+
=head1 Deprecations
XXX Any deprecated features, syntax, modules etc. should be listed here.
an invalid file specification to Perl, or you've found a case the
conversion routines don't handle. Drat.
-=item Escape literal pattern white space under /x
-
-(D deprecated) You compiled a regular expression pattern with C</x> to
-ignore white space, and you used, as a literal, one of the characters
-that Perl plans to eventually treat as white space. The character must
-be escaped somehow, or it will work differently on a future Perl that
-does treat it as white space. The easiest way is to insert a backslash
-immediately before it, or to enclose it with square brackets. This
-change is to bring Perl into conformance with Unicode recommendations.
-Here are the five characters that generate this warning:
-U+0085 NEXT LINE,
-U+200E LEFT-TO-RIGHT MARK,
-U+200F RIGHT-TO-LEFT MARK,
-U+2028 LINE SEPARATOR,
-and
-U+2029 PARAGRAPH SEPARATOR.
-
=item Eval-group in insecure regular expression
(F) Perl detected tainted data when trying to compile a regular
L<perluniprops/Properties accessible through \p{} and \P{}>.
X</x>
+The set of characters that are deemed whitespace are those that Unicode
+calls "Pattern White Space", namely:
+
+ U+0009 CHARACTER TABULATION
+ U+000A LINE FEED
+ U+000B LINE TABULATION
+ U+000C FORM FEED
+ U+000D CARRIAGE RETURN
+ U+0020 SPACE
+ U+0085 NEXT LINE
+ U+200E LEFT-TO-RIGHT MARK
+ U+200F RIGHT-TO-LEFT MARK
+ U+2028 LINE SEPARATOR
+ U+2029 PARAGRAPH SEPARATOR
+
=head3 Character set modifiers
C</d>, C</u>, C</a>, and C</l>, available starting in 5.14, are called
#define PERL_ARGS_ASSERT_REG_SCAN_NAME \
assert(pRExC_state)
-STATIC bool S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
- __attribute__nonnull__(pTHX_1);
+PERL_STATIC_INLINE char * S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state, char * p)
+ __attribute__nonnull__(pTHX_1)
+ __attribute__nonnull__(pTHX_2);
#define PERL_ARGS_ASSERT_REG_SKIPCOMMENT \
- assert(pRExC_state)
+ assert(pRExC_state); assert(p)
STATIC regnode* S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
__attribute__nonnull__(pTHX_1);
#define PERL_ARGS_ASSERT_REGINSERT \
assert(pRExC_state); assert(opnd)
-STATIC char * S_regpatws(RExC_state_t *pRExC_state, char *p, const bool recognize_comment)
+STATIC char * S_regpatws(pTHX_ RExC_state_t *pRExC_state, char *p, const bool recognize_comment)
__attribute__warn_unused_result__
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
+ __attribute__nonnull__(pTHX_1)
+ __attribute__nonnull__(pTHX_2);
#define PERL_ARGS_ASSERT_REGPATWS \
assert(pRExC_state); assert(p)
#define PERL_ARGS_ASSERT_REGUNI \
assert(pRExC_state); assert(s)
-STATIC char * S_regwhite(RExC_state_t *pRExC_state, char *p)
- __attribute__warn_unused_result__
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-#define PERL_ARGS_ASSERT_REGWHITE \
- assert(pRExC_state); assert(p)
-
STATIC void S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, struct scan_data_t *data, SSize_t *minlenp, int is_inf)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2)
: 0 )
/*** GENERATED CODE ***/
-#define is_PATWS_non_low_safe(s,e,is_utf8) \
-( ((e) > (s)) ? \
- ( (! is_utf8) ? \
- ( 0x85 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) \
- : (((e) - (s)) >= UTF8SKIP(s)) ? \
- ( ( 0xC2 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \
- ( ( 0x85 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 ) \
- : ( ( ( 0xE2 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) && ( 0x80 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ) && ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xFE ) == 0x8E || ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xFE ) == 0xA8 ) ) ? 3 : 0 )\
- : 0 ) \
-: 0 )
-
-/*** GENERATED CODE ***/
#define is_PATWS_cp(cp) \
( ( 0x09 <= NATIVE_TO_UNI(cp) && NATIVE_TO_UNI(cp) <= 0x0D ) || ( 0x0D < NATIVE_TO_UNI(cp) &&\
( 0x20 == NATIVE_TO_UNI(cp) || ( 0x20 < NATIVE_TO_UNI(cp) && \
* modifier. The other meaning does not, so use a temporary until we find
* out which we are being called with */
p = (RExC_flags & RXf_PMf_EXTENDED)
- ? regwhite( pRExC_state, RExC_parse )
+ ? regpatws(pRExC_state, RExC_parse,
+ TRUE) /* means recognize comments */
: RExC_parse;
/* Disambiguate between \N meaning a named character versus \N meaning
case '#':
if (RExC_flags & RXf_PMf_EXTENDED) {
- if ( reg_skipcomment( pRExC_state ) )
+ RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
+ if (RExC_parse < RExC_end)
goto tryagain;
}
/* FALLTHROUGH */
oldp = p;
if (RExC_flags & RXf_PMf_EXTENDED)
- p = regwhite( pRExC_state, p );
+ p = regpatws(pRExC_state, p,
+ TRUE); /* means recognize comments */
switch ((U8)*p) {
case '^':
case '$':
break;
default: /* A literal character */
- if (! SIZE_ONLY
- && RExC_flags & RXf_PMf_EXTENDED
- && ckWARN_d(WARN_DEPRECATED)
- && is_PATWS_non_low_safe(p, RExC_end, UTF))
- {
- vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
- "Escape literal pattern white space under /x");
- }
-
normal_default:
if (UTF8_IS_START(*p) && UTF) {
STRLEN numlen;
*/
if ( RExC_flags & RXf_PMf_EXTENDED)
- p = regwhite( pRExC_state, p );
+ p = regpatws(pRExC_state, p,
+ TRUE); /* means recognize comments */
/* If the next thing is a quantifier, it applies to this
* character only, which means that this character has to be in
}
STATIC char *
-S_regwhite( RExC_state_t *pRExC_state, char *p )
-{
- const char *e = RExC_end;
-
- PERL_ARGS_ASSERT_REGWHITE;
-
- while (p < e) {
- if (isSPACE(*p))
- ++p;
- else if (*p == '#') {
- bool ended = 0;
- do {
- if (*p++ == '\n') {
- ended = 1;
- break;
- }
- } while (p < e);
- if (!ended)
- RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
- }
- else
- break;
- }
- return p;
-}
-
-STATIC char *
-S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
+S_regpatws(pTHX_ RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
{
/* Returns the next non-pattern-white space, non-comment character (the
* latter only if 'recognize_comment is true) in the string p, which is
- * ended by RExC_end. If there is no line break ending a comment,
- * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */
+ * ended by RExC_end. See also reg_skipcomment */
const char *e = RExC_end;
PERL_ARGS_ASSERT_REGPATWS;
p += len;
}
else if (recognize_comment && *p == '#') {
- bool ended = 0;
- do {
- p++;
- if (is_LNBREAK_safe(p, e, UTF)) {
- ended = 1;
- break;
- }
- } while (p < e);
- if (!ended)
- RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
+ p = reg_skipcomment(pRExC_state, p);
}
else
break;
while (RExC_parse < RExC_end) {
SV* current = NULL;
RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE); /* means recognize comments */
+ TRUE); /* means recognize comments */
switch (*RExC_parse) {
case '?':
if (RExC_parse[1] == '[') depth++, RExC_parse++;
/* Skip white space */
RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE); /* means recognize comments */
+ TRUE /* means recognize comments */ );
if (RExC_parse >= RExC_end) {
Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
}
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
RExC_naughty++;
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
}
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
if (UCHARAT(RExC_parse) == ']') {
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
if (range) {
/* reg_skipcomment()
- Absorbs an /x style # comments from the input stream.
- Returns true if there is more text remaining in the stream.
- Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment
- terminates the pattern without including a newline.
+ Absorbs an /x style # comment from the input stream,
+ returning a pointer to the first character beyond the comment, or if the
+ comment terminates the pattern without anything following it, this returns
+ one past the final character of the pattern (in other words, RExC_end) and
+ sets the REG_RUN_ON_COMMENT_SEEN flag.
- Note its the callers responsibility to ensure that we are
+ Note it's the callers responsibility to ensure that we are
actually in /x mode
*/
-STATIC bool
-S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
+PERL_STATIC_INLINE char*
+S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state, char* p)
{
- bool ended = 0;
-
PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
- while (RExC_parse < RExC_end)
- if (*RExC_parse++ == '\n') {
- ended = 1;
- break;
+ assert(*p = '#');
+
+ while (p < RExC_end) {
+ if (*(++p) == '\n') {
+ return p+1;
}
- if (!ended) {
- /* we ran off the end of the pattern without ending
- the comment, so we have to add an \n when wrapping */
- RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
- return 0;
- } else
- return 1;
+ }
+
+ /* we ran off the end of the pattern without ending the comment, so we have
+ * to add an \n when wrapping */
+ RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
+ return p;
}
/* nextchar()
continue;
}
if (RExC_flags & RXf_PMf_EXTENDED) {
- if (isSPACE(*RExC_parse)) {
- RExC_parse++;
- continue;
- }
- else if (*RExC_parse == '#') {
- if ( reg_skipcomment( pRExC_state ) )
- continue;
- }
+ char * p = regpatws(pRExC_state, RExC_parse,
+ TRUE); /* means recognize comments */
+ if (p != RExC_parse) {
+ RExC_parse = p;
+ continue;
+ }
}
- return retval;
+ return retval;
}
}
\p{_Perl_Problematic_Locale_Foldeds_Start}
PATWS: pattern white space
-=> generic generic_non_low cp : safe
+=> generic cp : safe
\p{PatWS}
require './test.pl';
}
-plan tests => 732; # Update this when adding/deleting tests.
+plan tests => 733; # Update this when adding/deleting tests.
run_tests() unless caller;
ok("\x{101}" =~ /\x{101}/i, "A WITH MACRON l =~ l");
}
+ {
+ use utf8;
+ ok("abc" =~ /a\85b\85c/x, "NEL is white-space under /x");
+ }
+
} # End of sub run_tests
1;
);
my @deprecated = (
- "/(?x)latin1\\\x{85}\x{85}\\\x{85}/" => 'Escape literal pattern white space under /x {#} ' . "m/(?x)latin1\\\x{85}\x{85}{#}\\\x{85}/",
- 'use utf8; /(?x)utf8\\85\85\\85/' => 'Escape literal pattern white space under /x {#} ' . "m/(?x)utf8\\\N{NEXT LINE}\N{NEXT LINE}{#}\\\N{NEXT LINE}/",
);
while (my ($regex, $expect) = splice @death, 0, 2) {