Es |bool |to_byte_substr |NN regexp * prog
ERsn |I32 |reg_check_named_buff_matched |NN const regexp *rex \
|NN const regnode *scan
+EsnR |bool |isGCB |const PL_GCB_enum before|const PL_GCB_enum after
# ifdef DEBUGGING
Es |void |dump_exec_pos |NN const char *locinput|NN const regnode *scan|NN const char *loc_regeol\
|NN const char *loc_bostr|NN const char *loc_reg_starttry|const bool do_utf8
#define find_byclass(a,b,c,d,e) S_find_byclass(aTHX_ a,b,c,d,e)
#define isFOO_lc(a,b) S_isFOO_lc(aTHX_ a,b)
#define isFOO_utf8_lc(a,b) S_isFOO_utf8_lc(aTHX_ a,b)
+#define isGCB S_isGCB
#define reg_check_named_buff_matched S_reg_check_named_buff_matched
#define regcppop(a,b) S_regcppop(aTHX_ a,b)
#define regcppush(a,b,c) S_regcppush(aTHX_ a,b,c)
#define PL_DBtrace (vTHX->IDBtrace)
#define PL_Dir (vTHX->IDir)
#define PL_Env (vTHX->IEnv)
+#define PL_GCB_invlist (vTHX->IGCB_invlist)
#define PL_HasMultiCharFold (vTHX->IHasMultiCharFold)
#define PL_InBitmap (vTHX->IInBitmap)
#define PL_LIO (vTHX->ILIO)
PERLVARA(I, utf8_swash_ptrs, POSIX_SWASH_COUNT, SV *)
PERLVARA(I, Posix_ptrs, POSIX_CC_COUNT, SV *)
PERLVARA(I, XPosix_ptrs, POSIX_CC_COUNT, SV *)
+PERLVAR(I, GCB_invlist, SV *)
PERLVAR(I, last_swash_hv, HV *)
PERLVAR(I, last_swash_tmps, U8 *)
my @should_match = map { eval "\"$_\"" } @should_display;
# If a string can be represented in both non-ut8 and utf8, test both cases
+ my $display_upgrade = "";
UPGRADE:
for my $to_upgrade (0 .. 1) {
next UPGRADE if utf8::is_utf8($string);
utf8::upgrade($string);
+ $display_upgrade = " (utf8-upgraded)";
+ }
+
+ # The /l modifier has C after it to indicate the locale to try
+ my @modifiers = qw(a aa d lC u i);
+ push @modifiers, "l$utf8_locale" if defined $utf8_locale;
+
+ # Test for each of the regex modifiers.
+ for my $modifier (@modifiers) {
+ my $display_locale = "";
+
+ # For /l, set the locale to what it says to.
+ if ($modifier =~ / ^ l (.*) /x) {
+ my $locale = $1;
+ $display_locale = "(locale = $locale)";
+ use Config;
+ if (defined $Config{d_setlocale}) {
+ eval { require POSIX; import POSIX 'locale_h'; };
+ if (defined &POSIX::LC_CTYPE) {
+ POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
+ }
+ }
+ $modifier = 'l';
+ }
+
+ no warnings qw(locale regexp surrogate);
+ my $pattern = "(?$modifier:$break_pattern)";
+
+ # Actually do the test
+ my $matched = $string =~ qr/$pattern/;
+ print "not " unless $matched;
+
+ # Fancy display of test results
+ $matched = ($matched) ? "matched" : "failed to match";
+ print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale\n";
+
+ # Repeat with the first \B{} in the pattern. This makes sure the
+ # code in regexec.c:find_byclass() for \B gets executed
+ if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
+ my $B_pattern = "$1$2";
+ $matched = $string =~ qr/$B_pattern/;
+ print "not " unless $matched;
+ print "ok ", ++$Tests, " - \"$display_string\" $matched /$B_pattern/$display_upgrade; line $line $display_locale\n";
+ }
}
+ next if $break_type ne 'gcb';
+
# Finally, do the \X match.
my @matches = $string =~ /(\X)/g;
#include "perl.h"
#include "patchlevel.h" /* for local_patches */
#include "XSUB.h"
-#include "charclass_invlists.h"
#ifdef NETWARE
#include "nwutil.h"
PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(XPosixWord_invlist);
PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
+ PL_GCB_invlist = _new_invlist_C_array(Grapheme_Cluster_Break_invlist);
ENTER;
}
SvREFCNT_dec(PL_XPosix_ptrs[i]);
PL_XPosix_ptrs[i] = NULL;
}
+ PL_GCB_invlist = NULL;
if (!specialWARN(PL_compiling.cop_warnings))
PerlMemShared_free(PL_compiling.cop_warnings);
#endif
#include "handy.h"
+#include "charclass_invlists.h"
#if defined(USE_LARGE_FILES) && !defined(NO_64_BIT_RAWIO)
# if LSEEKSIZE == 8 && !defined(USE_64_BIT_RAWIO)
, => /a ASCII /aa safe {3,7} repeat in range
list ops /l locale /d dual | alternation
not /u Unicode [] character class
- and /e evaluate /ee rpts \b word boundary
+ and /e evaluate /ee rpts \b boundary
or xor /g global \z string end
/o compile pat once () capture
DEBUG (?:p) no capture
# Word Boundary Opcodes:
BOUND no Match "" at any word boundary using native
- charset rules for non-utf8
- BOUNDL no Match "" at any locale word boundary
- BOUNDU no Match "" at any word boundary using Unicode
- rules
- BOUNDA no Match "" at any word boundary using ASCII
- rules
+ charset rules for non-utf8, otherwise
+ Unicode rules
+ BOUNDL no Match "" at any boundary of a given type
+ using locale rules
+ BOUNDU no Match "" at any boundary of a given type
+ using Unicode rules
+ BOUNDA no Match "" at any boundary of a given type
+ using ASCII rules
NBOUND no Match "" at any word non-boundary using
- native charset rules for non-utf8
- NBOUNDL no Match "" at any locale word non-boundary
- NBOUNDU no Match "" at any word non-boundary using
+ native charset rules for non-utf8, otherwise
Unicode rules
- NBOUNDA no Match "" at any word non-boundary using
- ASCII rules
+ NBOUNDL no Match "" at any boundary of a given type
+ using locale rules
+ NBOUNDU no Match "" at any boundary of a given type
+ using using Unicode rules
+ NBOUNDA no Match "" at any boundary of a given type
+ using using ASCII rules
# [Special] alternatives:
REG_ANY no Match any one character (except newline).
enhancements. Particularly prominent performance optimisations could go
here, but most should go in the L</Performance Enhancements> section.
-[ List each enhancement as a =head2 entry ]
+=head2 qr/\b{gcb}/ is now handled in regular expressions
+
+C<gcb> stands for Grapheme Cluster Boundary. It is a Unicode property
+that finds the boundary between sequences of characters that look like a
+single character to a native speaker of a language. Perl has long had
+the ability to deal with these through the C<\X> regular escape
+sequence. Now, there is an alternative way of handling these. See
+L<perlrebackslash/\b{}, \b, \B{}, \B> for details.
=head1 Security
(F) Your machine doesn't implement the sockatmark() functionality,
neither as a system call nor an ioctl call (SIOCATMARK).
+=item '%s' is an unknown bound type in regex; marked by <-- HERE in m/%s/
+
+(F) You used C<\b{...}> or C<\B{...}> and the C<...> is not known to
+Perl. The current valid ones are given in
+L<perlrebackslash/\b{}, \b, \B{}, \B>.
+
=item "%s" is more clearly written simply as "%s" in regex; marked by <-- HERE in m/%s/
(W regexp) (only under C<S<use re 'strict'>> or within C<(?[...])>)
form if you wish to use an empty line as the terminator of the
here-document.
+=item Use of \b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale
+
+(W locale) You are matching a regular expression using locale rules,
+and a Unicode boundary is being matched, but the locale is not a Unicode
+one. This doesn't make sense. Perl will continue, assuming a Unicode
+(UTF-8) locale, but the results could well be wrong except if the locale
+happens to be ISO-8859-1 (Latin1) where this message is spurious and can
+be ignored.
+
=item Use of chdir('') or chdir(undef) as chdir() deprecated
(D deprecated) chdir() with no arguments is documented to change to
these circumstances, Perl discards all but the first character
of the returned sequence, which is not likely what you want.
+=item Using /u for '%s' instead of /%s in regex; marked by <-- HERE in m/%s/
+
+(W regexp) You used a Unicode boundary (C<\b{...}> or C<\B{...}>) in a
+portion of a regular expression where the character set modifiers C</a>
+or C</aa> are in effect. These two modifiers indicate an ASCII
+interpretation, and this doesn't make sense for a Unicode definiton.
+The generated regular expression will compile so that the boundary uses
+all of Unicode. No other portion of the regular expression is affected.
+
=item Using !~ with %s doesn't make sense
(F) Using the C<!~> operator with C<s///r>, C<tr///r> or C<y///r> is
=item 6
+the pattern uses a Unicode break (C<\b{...}> or C<\B{...}>); or
+
+=item 7
+
the pattern uses L</C<(?[ ])>>
=back
X<regular expression, zero-width assertion>
X<\b> X<\B> X<\A> X<\Z> X<\z> X<\G>
+ \b{} Match at Unicode boundary of specified type
+ \B{} Match where corresponding \b{} doesn't match
\b Match a word boundary
\B Match except at a word boundary
\A Match only at beginning of string
\G Match only at pos() (e.g. at the end-of-match position
of prior m//g)
+A Unicode boundary (C<\b{}>), available starting in v5.22, is a spot
+between two characters, or before the first character in the string, or
+after the final character in the string where certain criteria defined
+by Unicode are met. See L<perlrebackslash/\b{}, \b, \B{}, \B> for
+details.
+
A word boundary (C<\b>) is a spot between two characters
that has a C<\w> on one side of it and a C<\W> on the other side
of it (in either order), counting the imaginary characters off the
\1 Absolute backreference. Not in [].
\a Alarm or bell.
\A Beginning of string. Not in [].
- \b Word/non-word boundary. (Backspace in []).
- \B Not a word/non-word boundary. Not in [].
+ \b{}, \b Boundary. (\b is a backspace in []).
+ \B{}, \B Not a boundary.
\cX Control-X.
\C Single octet, even under UTF-8. Not in [].
(Deprecated)
=item [1]
C<\b> is the backspace character only inside a character class. Outside a
-character class, C<\b> is a word/non-word boundary.
+character class, C<\b> alone is a word-character/non-word-character
+boundary, and C<\b{}> is some other type of boundary.
=item [2]
Mnemonic: I<G>lobal.
-=item \b, \B
+=item \b{}, \b, \B{}, \B
-C<\b> matches at any place between a word and a non-word character; C<\B>
-matches at any place between characters where C<\b> doesn't match. C<\b>
+C<\b{...}>, available starting in v5.22, matches a boundary (between two
+characters, or before the first character of the string, or after the
+final character of the string) based on the Unicode rules for the
+boundary type specified inside the braces. The currently known boundary
+types are given a few paragraphs below. C<\B{...}> matches at any place
+between characters where C<\b{...}> of the same type doesn't match.
+
+C<\b> when not immediately followed by a C<"{"> matches at any place
+between a word (something matched by C<\w>) and a non-word character
+(C<\W>); C<\B> when not immediately followed by a C<"{"> matches at any
+place between characters where C<\b> doesn't match.
+
+C<\b>
and C<\B> assume there's a non-word character before the beginning and after
the end of the source string; so C<\b> will match at the beginning (or end)
of the source string if the source string begins (or ends) with a word
Do not use something like C<\b=head\d\b> and expect it to match the
beginning of a line. It can't, because for there to be a boundary before
the non-word "=", there must be a word character immediately previous.
-All boundary determinations look for word characters alone, not for
-non-words characters nor for string ends. It may help to understand how
+All plain C<\b> and C<\B> boundary determinations look for word
+characters alone, not for
+non-word characters nor for string ends. It may help to understand how
<\b> and <\B> work by equating them as follows:
\b really means (?:(?<=\w)(?!\w)|(?<!\w)(?=\w))
\B really means (?:(?<=\w)(?=\w)|(?<!\w)(?!\w))
+In contrast, C<\b{...}> always matches at the beginning and end of the
+line (and C<\B{...}> never does). The only boundary type currently
+"Grapheme Cluster Boundary". (Actually Perl always uses the improved
+"extended" grapheme cluster"). These are explained below under C<\X>.
+In fact, C<\X> is another way to get the same functionality. It is
+equivalent to C</.+?\b{gcb}/>. Use whichever is most convenient for
+your situation.
+
Mnemonic: I<b>oundary.
=back
The match is greedy and non-backtracking, so that the cluster is never
broken up into smaller components.
+See also L<C<\b{gcb}>|/\b{}, \b, \B{}, \B>.
+
Mnemonic: eI<X>tended Unicode character.
=back
^ Match string start (or line, if /m is used)
$ Match string end (or line, if /m is used) or before newline
+ \b{} Match boundary of type specified within the braces
+ \B{} Match wherever \b{} doesn't match
\b Match word boundary (between \w and \W)
\B Match except at word boundary (between \w and \w or \W and \W)
\A Match string start (regardless of /m)
[10] see UAX#15 "Unicode Normalization Forms"
[11] have Unicode::Normalize but not integrated to regexes
- [12] have \X but we don't have a "Grapheme Cluster Mode"
+ [12] have \X and \b{gcb} but we don't have a "Grapheme Cluster
+ Mode"
[14] see UAX#29, Word Boundaries
[15] This is covered in Chapter 3.13 (in Unicode 6.0)
=item *
-Matching any of several properties in regular expressions, namely C<\b>,
-C<\B>, C<\s>, C<\S>, C<\w>, C<\W>, and all the Posix character classes
+Matching any of several properties in regular expressions, namely
+C<\b> (without braces), C<\B> (without braces), C<\s>, C<\S>, C<\w>,
+C<\W>, and all the Posix character classes
I<except> C<[[:ascii:]]>.
Starting in Perl 5.14.0, regular expressions compiled within
the scope of C<unicode_strings> use character semantics
#define PERL_ARGS_ASSERT_ISFOO_UTF8_LC \
assert(character)
+STATIC bool S_isGCB(const PL_GCB_enum before, const PL_GCB_enum after)
+ __attribute__warn_unused_result__;
+
STATIC I32 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
__attribute__warn_unused_result__
__attribute__nonnull__(1)
#endif
#include "dquote_static.c"
-#include "charclass_invlists.h"
#include "inline_invlist.c"
#include "unicode_constants.h"
invert = 1;
/* FALLTHROUGH */
case 'b':
+ {
+ regex_charset charset = get_regex_charset(RExC_flags);
+
RExC_seen_zerolen++;
RExC_seen |= REG_LOOKBEHIND_SEEN;
- op = BOUND + get_regex_charset(RExC_flags);
- if (op > BOUNDA) { /* /aa is same as /a */
- op = BOUNDA;
- }
- else if (op == BOUNDL) {
- RExC_contains_locale = 1;
- }
+ op = BOUND + charset;
- if (invert) {
- op += NBOUND - BOUND;
+ if (op == BOUNDL) {
+ RExC_contains_locale = 1;
}
ret = reg_node(pRExC_state, op);
*flagp |= SIMPLE;
- if ((U8) *(RExC_parse + 1) == '{') {
- /* diag_listed_as: Use "%s" instead of "%s" */
- vFAIL3("Use \"\\%c\\{\" instead of \"\\%c{\"", *RExC_parse, *RExC_parse);
+ if (*(RExC_parse + 1) != '{') {
+ FLAGS(ret) = TRADITIONAL_BOUND;
+ if (PASS2 && op > BOUNDA) { /* /aa is same as /a */
+ OP(ret) = BOUNDA;
+ }
+ }
+ else {
+ STRLEN length;
+ char name = *RExC_parse;
+ char * endbrace;
+ RExC_parse += 2;
+ endbrace = strchr(RExC_parse, '}');
+
+ if (! endbrace) {
+ vFAIL2("Missing right brace on \\%c{}", name);
+ }
+ /* XXX Need to decide whether to take spaces or not. Should be
+ * consistent with \p{}, but that currently is SPACE, which
+ * means vertical too, which seems wrong
+ * while (isBLANK(*RExC_parse)) {
+ RExC_parse++;
+ }*/
+ if (endbrace == RExC_parse) {
+ RExC_parse++; /* After the '}' */
+ vFAIL2("Empty \\%c{}", name);
+ }
+ length = endbrace - RExC_parse;
+ /*while (isBLANK(*(RExC_parse + length - 1))) {
+ length--;
+ }*/
+ switch (*RExC_parse) {
+ case 'g':
+ if (length != 1
+ && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
+ {
+ goto bad_bound_type;
+ }
+ FLAGS(ret) = GCB_BOUND;
+ break;
+ default:
+ bad_bound_type:
+ RExC_parse = endbrace;
+ vFAIL2utf8f(
+ "'%"UTF8f"' is an unknown bound type",
+ UTF8fARG(UTF, length, endbrace - length));
+ NOT_REACHED; /*NOTREACHED*/
+ }
+ RExC_parse = endbrace;
+ RExC_uni_semantics = 1;
+
+ if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
+ OP(ret) = BOUNDU;
+ length += 4;
+
+ /* Don't have to worry about UTF-8, in this message because
+ * to get here the contents of the \b must be ASCII */
+ ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
+ "Using /u for '%.*s' instead of /%s",
+ (unsigned) length,
+ endbrace - length + 1,
+ (charset == REGEX_ASCII_RESTRICTED_CHARSET)
+ ? ASCII_RESTRICT_PAT_MODS
+ : ASCII_MORE_RESTRICT_PAT_MODS);
+ }
}
+
+ if (PASS2 && invert) {
+ OP(ret) += NBOUND - BOUND;
+ }
goto finish_meta_pat;
+ }
case 'D':
invert = 1;
Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
}
}
+ else if (k == BOUND || k == NBOUND) {
+ /* Must be synced with order of 'bound_type' in regcomp.h */
+ const char * const bounds[] = {
+ "", /* Traditional */
+ "{gcb}"
+ };
+ sv_catpv(sv, bounds[FLAGS(o)]);
+ }
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
else if (OP(o) == SBOL)
#endif /* DEBUG RELATED DEFINES */
+typedef enum {
+ TRADITIONAL_BOUND = _CC_WORDCHAR,
+ GCB_BOUND
+} bound_type;
+
/*
* Local variables:
* c-indentation-style: bsd
# in regcomp.c uses the enum value of the modifier as an offset from the /d
# version. The complements must come after the non-complements.
# BOUND, POSIX and their complements are affected, as well as EXACTF.
-BOUND BOUND, no ; Match "" at any word boundary using native charset rules for non-utf8
-BOUNDL BOUND, no ; Match "" at any locale word boundary
-BOUNDU BOUND, no ; Match "" at any word boundary using Unicode rules
-BOUNDA BOUND, no ; Match "" at any word boundary using ASCII rules
+BOUND BOUND, no ; Match "" at any word boundary using native charset rules for non-utf8, otherwise Unicode rules
+BOUNDL BOUND, no ; Match "" at any boundary of a given type using locale rules
+BOUNDU BOUND, no ; Match "" at any boundary of a given type using Unicode rules
+BOUNDA BOUND, no ; Match "" at any boundary of a given type using ASCII rules
# All NBOUND nodes are required by code in regexec.c to be greater than all BOUND ones
-NBOUND NBOUND, no ; Match "" at any word non-boundary using native charset rules for non-utf8
-NBOUNDL NBOUND, no ; Match "" at any locale word non-boundary
-NBOUNDU NBOUND, no ; Match "" at any word non-boundary using Unicode rules
-NBOUNDA NBOUND, no ; Match "" at any word non-boundary using ASCII rules
+NBOUND NBOUND, no ; Match "" at any word non-boundary using native charset rules for non-utf8, otherwise Unicode rules
+NBOUNDL NBOUND, no ; Match "" at any boundary of a given type using locale rules
+NBOUNDU NBOUND, no ; Match "" at any boundary of a given type using using Unicode rules
+NBOUNDA NBOUND, no ; Match "" at any boundary of a given type using using ASCII rules
#* [Special] alternatives:
REG_ANY REG_ANY, no 0 S ; Match any one character (except newline).
#include "re_top.h"
#endif
+#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
+ "Use of \\b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
+
/*
* pregcomp and pregexec -- regsub and regerror are not used in perl
*
PL_XPosix_ptrs[_CC_WORDCHAR], \
LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
-#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
- STMT_START { \
- LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
- "_X_regular_begin", \
- NULL, \
- LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
- LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
- "_X_extend", \
- NULL, \
- COMBINING_GRAVE_ACCENT_UTF8); \
- } STMT_END
-
#define PLACEHOLDER /* Something for the preprocessor to grab onto */
/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
} \
} STMT_END
-/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
- * These are for the pre-composed Hangul syllables, which are all in a
- * contiguous block and arranged there in such a way so as to facilitate
- * alorithmic determination of their characteristics. As such, they don't need
- * a swash, but can be determined by simple arithmetic. Almost all are
- * GCB=LVT, but every 28th one is a GCB=LV */
-#define SBASE 0xAC00 /* Start of block */
-#define SCount 11172 /* Length of block */
-#define TCount 28
-
#define SLAB_FIRST(s) (&(s)->states[0])
#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
+/* Takes a pointer to an inversion list, a pointer to its corresponding
+ * inversion map, and a code point, and returns the code point's value
+ * according to the two arrays. It assumes that all code points have a value.
+ * This is used as the base macro for macros for particular properties */
+#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
+ invmap[_invlist_search(invlist, cp)]
+
+/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
+ * of a code point, returning the value for the first code point in the string.
+ * And it takes the particular macro name that finds the desired value given a
+ * code point. Merely convert the UTF-8 to code point and call the cp macro */
+#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
+ (__ASSERT_(pos < strend) \
+ /* Note assumes is valid UTF-8 */ \
+ (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
+
+/* Returns the GCB value for the input code point */
+#define getGCB_VAL_CP(cp) \
+ _generic_GET_BREAK_VAL_CP( \
+ PL_GCB_invlist, \
+ Grapheme_Cluster_Break_invmap, \
+ (cp))
+
+/* Returns the GCB value for the first code point in the UTF-8 encoded string
+ * bounded by pos and strend */
+#define getGCB_VAL_UTF8(pos, strend) \
+ _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
/* We know what class REx starts with. Try to find this position... */
/* if reginfo->intuit, its a dryrun */
case BOUNDL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (FLAGS(c) != TRADITIONAL_BOUND) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ goto do_boundu;
+ }
+
FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
break;
+
case NBOUNDL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (FLAGS(c) != TRADITIONAL_BOUND) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ goto do_nboundu;
+ }
+
FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
break;
- case BOUND:
+
+ case BOUND: /* regcomp.c makes sure that this only has the traditional \b
+ meaning */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
break;
- case BOUNDA:
+
+ case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
+ meaning */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
FBC_BOUND_A(isWORDCHAR_A);
break;
- case NBOUND:
+
+ case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
+ meaning */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
break;
- case NBOUNDA:
+
+ case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
+ meaning */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
FBC_NBOUND_A(isWORDCHAR_A);
break;
- case BOUNDU:
- FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
- break;
+
case NBOUNDU:
- FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
+ if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
+ FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
+ break;
+ }
+
+ do_nboundu:
+
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case BOUNDU:
+ do_boundu:
+ switch((bound_type) FLAGS(c)) {
+ case TRADITIONAL_BOUND:
+ FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
+ break;
+ case GCB_BOUND:
+ if (s == reginfo->strbeg) { /* GCB always matches at begin and
+ end */
+ if (to_complement ^ cBOOL(reginfo->intuit
+ || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ s += (utf8_target) ? UTF8SKIP(s) : 1;
+ }
+
+ if (utf8_target) {
+ PL_GCB_enum before = getGCB_VAL_UTF8(
+ reghop3((U8*)s, -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend);
+ while (s < strend) {
+ PL_GCB_enum after = getGCB_VAL_UTF8((U8*) s,
+ (U8*) reginfo->strend);
+ if (to_complement ^ isGCB(before, after)) {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+ before = after;
+ }
+ s += UTF8SKIP(s);
+ }
+ }
+ else { /* Not utf8. Everything is a GCB except between CR and
+ LF */
+ while (s < strend) {
+ if (to_complement ^ (UCHARAT(s - 1) != '\r'
+ || UCHARAT(s) != '\n'))
+ {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+ s++;
+ }
+ }
+ }
+
+ if (to_complement ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) {
+ goto got_it;
+ }
+ break;
+ }
break;
+
case LNBREAK:
REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
is_LNBREAK_latin1_safe(s, strend)
return TRUE;
}
+/* This creates a single number by combining two, with 'before' being like the
+ * 10's digit, but this isn't necessarily base 10; it is base however many
+ * elements of the enum there are */
+#define GCBcase(before, after) ((PL_GCB_ENUM_COUNT * before) + after)
+
+STATIC bool
+S_isGCB(const PL_GCB_enum before, const PL_GCB_enum after)
+{
+ /* returns a boolean indicating if there is a Grapheme Cluster Boundary
+ * between the inputs. See http://www.unicode.org/reports/tr29/ */
+
+ switch (GCBcase(before, after)) {
+
+ /* Break at the start and end of text.
+ GB1. sot ÷
+ GB2. ÷ eot
+
+ Break before and after controls except between CR and LF
+ GB4. ( Control | CR | LF ) ÷
+ GB5. ÷ ( Control | CR | LF )
+
+ Otherwise, break everywhere.
+ GB10. Any ÷ Any */
+ default:
+ return TRUE;
+
+ /* Do not break between a CR and LF.
+ GB3. CR × LF */
+ case GCBcase(PL_GCB_CR, PL_GCB_LF):
+ return FALSE;
+
+ /* Do not break Hangul syllable sequences.
+ GB6. L × ( L | V | LV | LVT ) */
+ case GCBcase(PL_GCB_L, PL_GCB_L):
+ case GCBcase(PL_GCB_L, PL_GCB_V):
+ case GCBcase(PL_GCB_L, PL_GCB_LV):
+ case GCBcase(PL_GCB_L, PL_GCB_LVT):
+ return FALSE;
+
+ /* GB7. ( LV | V ) × ( V | T ) */
+ case GCBcase(PL_GCB_LV, PL_GCB_V):
+ case GCBcase(PL_GCB_LV, PL_GCB_T):
+ case GCBcase(PL_GCB_V, PL_GCB_V):
+ case GCBcase(PL_GCB_V, PL_GCB_T):
+ return FALSE;
+
+ /* GB8. ( LVT | T) × T */
+ case GCBcase(PL_GCB_LVT, PL_GCB_T):
+ case GCBcase(PL_GCB_T, PL_GCB_T):
+ return FALSE;
+
+ /* Do not break between regional indicator symbols.
+ GB8a. Regional_Indicator × Regional_Indicator */
+ case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_Regional_Indicator):
+ return FALSE;
+
+ /* Do not break before extending characters.
+ GB9. × Extend */
+ case GCBcase(PL_GCB_Other, PL_GCB_Extend):
+ case GCBcase(PL_GCB_Extend, PL_GCB_Extend):
+ case GCBcase(PL_GCB_L, PL_GCB_Extend):
+ case GCBcase(PL_GCB_LV, PL_GCB_Extend):
+ case GCBcase(PL_GCB_LVT, PL_GCB_Extend):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_Extend):
+ case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_Extend):
+ case GCBcase(PL_GCB_SpacingMark, PL_GCB_Extend):
+ case GCBcase(PL_GCB_T, PL_GCB_Extend):
+ case GCBcase(PL_GCB_V, PL_GCB_Extend):
+ return FALSE;
+
+ /* Do not break before SpacingMarks, or after Prepend characters.
+ GB9a. × SpacingMark */
+ case GCBcase(PL_GCB_Other, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_Extend, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_L, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_LV, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_LVT, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_SpacingMark, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_T, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_V, PL_GCB_SpacingMark):
+ return FALSE;
+
+ /* GB9b. Prepend × */
+ case GCBcase(PL_GCB_Prepend, PL_GCB_Other):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_L):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_LV):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_LVT):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_Prepend):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_Regional_Indicator):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_T):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_V):
+ return FALSE;
+ }
+
+ NOT_REACHED;
+}
+
/* returns -1 on failure, $+[0] on success */
STATIC SSize_t
S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
int to_complement; /* Invert the result? */
_char_class_number classnum;
bool is_utf8_pat = reginfo->is_utf8_pat;
+ bool match = FALSE;
+
#ifdef DEBUGGING
GET_RE_DEBUG_FLAGS_DECL;
break;
}
- /* XXX At that point regcomp.c would no longer * have to set the FLAGS fields of these */
case NBOUNDL: /* /\B/l */
to_complement = 1;
/* FALLTHROUGH */
case BOUNDL: /* /\b/l */
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if (FLAGS(scan) != TRADITIONAL_BOUND) {
+ if (! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ }
+ goto boundu;
+ }
+
if (utf8_target) {
if (locinput == reginfo->strbeg)
ln = isWORDCHAR_LC('\n');
/* FALLTHROUGH */
case BOUNDU: /* /\b/u */
+
+ boundu:
if (utf8_target) {
- bound_utf8:
+ bound_utf8:
+ switch((bound_type) FLAGS(scan)) {
+ case TRADITIONAL_BOUND:
ln = (locinput == reginfo->strbeg)
? isWORDCHAR_L1('\n')
: isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
n = (NEXTCHR_IS_EOS)
? isWORDCHAR_L1('\n')
: isWORDCHAR_utf8((U8*)locinput);
+
+ match = ln != n;
+ break;
+ case GCB_BOUND:
+ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
+ match = TRUE; /* GCB always matches at begin and
+ end */
+ }
+ else {
+ /* Find the gcb values of previous and current
+ * chars, then see if is a break point */
+ match = isGCB(getGCB_VAL_UTF8(
+ reghop3((U8*)locinput,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend),
+ getGCB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend));
+ }
+ break;
+ }
}
- else {
+ else { /* Not utf8 target */
+ switch((bound_type) FLAGS(scan)) {
+ case TRADITIONAL_BOUND:
ln = (locinput == reginfo->strbeg)
? isWORDCHAR_L1('\n')
: isWORDCHAR_L1(UCHARAT(locinput - 1));
n = (NEXTCHR_IS_EOS)
? isWORDCHAR_L1('\n')
: isWORDCHAR_L1(nextchr);
+ match = ln != n;
+ break;
+ case GCB_BOUND:
+ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
+ match = TRUE; /* GCB always matches at begin and
+ end */
+ }
+ else { /* Only CR-LF combo isn't a GCB in 0-255
+ range */
+ match = UCHARAT(locinput - 1) != '\r'
+ || UCHARAT(locinput) != '\n';
+ }
+ break;
+ }
}
- if (to_complement ^ (ln == n)) {
+ if (to_complement ^ ! match) {
sayNO;
}
break;
case CLUMP: /* Match \X: logical Unicode character. This is defined as
a Unicode extended Grapheme Cluster */
- /* From http://www.unicode.org/reports/tr29 (5.2 version). An
- extended Grapheme Cluster is:
-
- CR LF
- | Prepend* Begin Extend*
- | .
-
- Begin is: ( Special_Begin | ! Control )
- Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
- Extend is: ( Grapheme_Extend | Spacing_Mark )
- Control is: [ GCB_Control | CR | LF ]
- Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
-
- If we create a 'Regular_Begin' = Begin - Special_Begin, then
- we can rewrite
-
- Begin is ( Regular_Begin + Special Begin )
-
- It turns out that 98.4% of all Unicode code points match
- Regular_Begin. Doing it this way eliminates a table match in
- the previous implementation for almost all Unicode code points.
-
- There is a subtlety with Prepend* which showed up in testing.
- Note that the Begin, and only the Begin is required in:
- | Prepend* Begin Extend*
- Also, Begin contains '! Control'. A Prepend must be a
- '! Control', which means it must also be a Begin. What it
- comes down to is that if we match Prepend* and then find no
- suitable Begin afterwards, that if we backtrack the last
- Prepend, that one will be a suitable Begin.
- */
-
if (NEXTCHR_IS_EOS)
sayNO;
if (! utf8_target) {
}
else {
- /* Utf8: See if is ( CR LF ); already know that locinput <
- * reginfo->strend, so locinput+1 is in bounds */
- if ( nextchr == '\r' && locinput+1 < reginfo->strend
- && UCHARAT(locinput + 1) == '\n')
- {
- locinput += 2;
- }
- else {
- STRLEN len;
-
- /* In case have to backtrack to beginning, then match '.' */
- char *starting = locinput;
-
- /* In case have to backtrack the last prepend */
- char *previous_prepend = NULL;
+ /* Get the gcb type for the current character */
+ PL_GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend);
- LOAD_UTF8_CHARCLASS_GCB();
-
- /* Match (prepend)* */
- while (locinput < reginfo->strend
- && (len = is_GCB_Prepend_utf8(locinput)))
- {
- previous_prepend = locinput;
- locinput += len;
- }
-
- /* As noted above, if we matched a prepend character, but
- * the next thing won't match, back off the last prepend we
- * matched, as it is guaranteed to match the begin */
- if (previous_prepend
- && (locinput >= reginfo->strend
- || (! swash_fetch(PL_utf8_X_regular_begin,
- (U8*)locinput, utf8_target)
- && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
- )
- {
- locinput = previous_prepend;
- }
-
- /* Note that here we know reginfo->strend > locinput, as we
- * tested that upon input to this switch case, and if we
- * moved locinput forward, we tested the result just above
- * and it either passed, or we backed off so that it will
- * now pass */
- if (swash_fetch(PL_utf8_X_regular_begin,
- (U8*)locinput, utf8_target)) {
- locinput += UTF8SKIP(locinput);
+ /* Then scan through the input until we get to the first
+ * character whose type is supposed to be a gcb with the
+ * current character. (There is always a break at the
+ * end-of-input) */
+ locinput += UTF8SKIP(locinput);
+ while (locinput < reginfo->strend) {
+ PL_GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend);
+ if (isGCB(prev_gcb, cur_gcb)) {
+ break;
}
- else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
-
- /* Here did not match the required 'Begin' in the
- * second term. So just match the very first
- * character, the '.' of the final term of the regex */
- locinput = starting + UTF8SKIP(starting);
- goto exit_utf8;
- } else {
-
- /* Here is a special begin. It can be composed of
- * several individual characters. One possibility is
- * RI+ */
- if ((len = is_GCB_RI_utf8(locinput))) {
- locinput += len;
- while (locinput < reginfo->strend
- && (len = is_GCB_RI_utf8(locinput)))
- {
- locinput += len;
- }
- } else if ((len = is_GCB_T_utf8(locinput))) {
- /* Another possibility is T+ */
- locinput += len;
- while (locinput < reginfo->strend
- && (len = is_GCB_T_utf8(locinput)))
- {
- locinput += len;
- }
- } else {
- /* Here, neither RI+ nor T+; must be some other
- * Hangul. That means it is one of the others: L,
- * LV, LVT or V, and matches:
- * L* (L | LVT T* | V * V* T* | LV V* T*) */
-
- /* Match L* */
- while (locinput < reginfo->strend
- && (len = is_GCB_L_utf8(locinput)))
- {
- locinput += len;
- }
-
- /* Here, have exhausted L*. If the next character
- * is not an LV, LVT nor V, it means we had to have
- * at least one L, so matches L+ in the original
- * equation, we have a complete hangul syllable.
- * Are done. */
+ prev_gcb = cur_gcb;
+ locinput += UTF8SKIP(locinput);
+ }
- if (locinput < reginfo->strend
- && is_GCB_LV_LVT_V_utf8(locinput))
- {
- /* Otherwise keep going. Must be LV, LVT or V.
- * See if LVT, by first ruling out V, then LV */
- if (! is_GCB_V_utf8(locinput)
- /* All but every TCount one is LV */
- && (valid_utf8_to_uvchr((U8 *) locinput,
- NULL)
- - SBASE)
- % TCount != 0)
- {
- locinput += UTF8SKIP(locinput);
- } else {
-
- /* Must be V or LV. Take it, then match
- * V* */
- locinput += UTF8SKIP(locinput);
- while (locinput < reginfo->strend
- && (len = is_GCB_V_utf8(locinput)))
- {
- locinput += len;
- }
- }
- /* And any of LV, LVT, or V can be followed
- * by T* */
- while (locinput < reginfo->strend
- && (len = is_GCB_T_utf8(locinput)))
- {
- locinput += len;
- }
- }
- }
- }
-
- /* Match any extender */
- while (locinput < reginfo->strend
- && swash_fetch(PL_utf8_X_extend,
- (U8*)locinput, utf8_target))
- {
- locinput += UTF8SKIP(locinput);
- }
- }
- exit_utf8:
- if (locinput > reginfo->strend) sayNO;
}
break;
#define MEOL 5 /* 0x05 Same, assuming multiline: /$/m */
#define EOS 6 /* 0x06 Match "" at end of string: /\z/ */
#define GPOS 7 /* 0x07 Matches where last m//g left off. */
-#define BOUND 8 /* 0x08 Match "" at any word boundary using native charset rules for non-utf8 */
-#define BOUNDL 9 /* 0x09 Match "" at any locale word boundary */
-#define BOUNDU 10 /* 0x0a Match "" at any word boundary using Unicode rules */
-#define BOUNDA 11 /* 0x0b Match "" at any word boundary using ASCII rules */
-#define NBOUND 12 /* 0x0c Match "" at any word non-boundary using native charset rules for non-utf8 */
-#define NBOUNDL 13 /* 0x0d Match "" at any locale word non-boundary */
-#define NBOUNDU 14 /* 0x0e Match "" at any word non-boundary using Unicode rules */
-#define NBOUNDA 15 /* 0x0f Match "" at any word non-boundary using ASCII rules */
+#define BOUND 8 /* 0x08 Match "" at any word boundary using native charset rules for non-utf8, otherwise Unicode rules */
+#define BOUNDL 9 /* 0x09 Match "" at any boundary of a given type using locale rules */
+#define BOUNDU 10 /* 0x0a Match "" at any boundary of a given type using Unicode rules */
+#define BOUNDA 11 /* 0x0b Match "" at any boundary of a given type using ASCII rules */
+#define NBOUND 12 /* 0x0c Match "" at any word non-boundary using native charset rules for non-utf8, otherwise Unicode rules */
+#define NBOUNDL 13 /* 0x0d Match "" at any boundary of a given type using locale rules */
+#define NBOUNDU 14 /* 0x0e Match "" at any boundary of a given type using using Unicode rules */
+#define NBOUNDA 15 /* 0x0f Match "" at any boundary of a given type using using ASCII rules */
#define REG_ANY 16 /* 0x10 Match any one character (except newline). */
#define SANY 17 /* 0x11 Match any one character. */
#define CANY 18 /* 0x12 Match any one byte. */
for (i = 0; i < POSIX_CC_COUNT; i++) {
PL_XPosix_ptrs[i] = sv_dup_inc(proto_perl->IXPosix_ptrs[i], param);
}
+ PL_GCB_invlist = sv_dup_inc(proto_perl->IGCB_invlist, param);
PL_utf8_mark = sv_dup_inc(proto_perl->Iutf8_mark, param);
PL_utf8_X_regular_begin = sv_dup_inc(proto_perl->Iutf8_X_regular_begin, param);
PL_utf8_X_extend = sv_dup_inc(proto_perl->Iutf8_X_extend, param);
Wide character (U+100) in pattern match (m//) at - line 11.
Wide character (U+100) in pattern match (m//) at - line 12.
Wide character (U+100) in pattern match (m//) at - line 12.
+########
+# NAME \b{} in non-UTF-8 locale
+eval { require POSIX; POSIX->import("locale_h") };
+if ($@) {
+ print("SKIPPED\n# no POSIX\n"),exit;
+}
+use warnings 'locale';
+use locale;
+setlocale(&POSIX::LC_CTYPE, "C");
+"a" =~ /\b{gcb}/l;
+no warnings 'locale';
+"a" =~ /\b{gcb}/l;
+EXPECT
+Use of \b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 8.
+Use of \b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 8.
'/(?lil:foo)/' => 'Regexp modifier "l" may not appear twice {#} m/(?lil{#}:foo)/',
'/(?aaia:foo)/' => 'Regexp modifier "a" may appear a maximum of twice {#} m/(?aaia{#}:foo)/',
'/(?i-l:foo)/' => 'Regexp modifier "l" may not appear after the "-" {#} m/(?i-l{#}:foo)/',
-'/a\b{cde/' => 'Use "\b\{" instead of "\b{" {#} m/a\{#}b{cde/',
-'/a\B{cde/' => 'Use "\B\{" instead of "\B{" {#} m/a\{#}B{cde/',
'/((x)/' => 'Unmatched ( {#} m/({#}(x)/',
'/[z-a]/' => 'Invalid [] range "z-a" {#} m/[z-a{#}]/',
'/\p/' => 'Empty \p{} {#} m/\p{#}/',
-
'/\P{}/' => 'Empty \P{} {#} m/\P{{#}}/',
+
+'/a\b{cde/' => 'Missing right brace on \b{} {#} m/a\b{{#}cde/',
+'/a\B{cde/' => 'Missing right brace on \B{} {#} m/a\B{{#}cde/',
+
+ '/\b{}/' => 'Empty \b{} {#} m/\b{}{#}/',
+ '/\B{}/' => 'Empty \B{} {#} m/\B{}{#}/',
+
+ '/\b{gc}/' => "'gc' is an unknown bound type {#} m/\\b{gc{#}}/",
+ '/\B{gc}/' => "'gc' is an unknown bound type {#} m/\\B{gc{#}}/",
+
'/(?[[[:word]]])/' => "Unmatched ':' in POSIX class {#} m/(?[[[:word{#}]]])/",
'/(?[[:word]])/' => "Unmatched ':' in POSIX class {#} m/(?[[:word{#}]])/",
'/(?[[[:digit: ])/' => "Unmatched '[' in POSIX class {#} m/(?[[[:digit:{#} ])/",
'/(?[ \t + \e # ネ This was supposed to be a comment ])/' => 'Syntax error in (?[...]) in regex m/(?[ \t + \e # ネ This was supposed to be a comment ])/',
'm/(*ネ)ネ/' => q<Unknown verb pattern 'ネ' {#} m/(*ネ){#}ネ/>,
'/\cネ/' => "Character following \"\\c\" must be printable ASCII",
+ '/\b{ネ}/' => "'ネ' is an unknown bound type {#} m/\\b{ネ{#}}/",
+ '/\B{ネ}/' => "'ネ' is an unknown bound type {#} m/\\B{ネ{#}}/",
);
push @death, @death_utf8;
my @warning = (
'm/\b*\x{100}/' => '\b* matches null string many times {#} m/\b*{#}\x{100}/',
+ '/\b{g}/a' => "Using /u for '\\b{g}' instead of /a {#} m/\\b{g}{#}/",
+ '/\B{gcb}/a' => "Using /u for '\\B{gcb}' instead of /a {#} m/\\B{gcb}{#}/",
'm/[:blank:]\x{100}/' => 'POSIX syntax [: :] belongs inside character classes {#} m/[:blank:]{#}\x{100}/',
'm/[[:cntrl:]][:^ascii:]\x{100}/' => 'POSIX syntax [: :] belongs inside character classes {#} m/[[:cntrl:]][:^ascii:]{#}\x{100}/',
"m'\\y\\x{100}'" => 'Unrecognized escape \y passed through {#} m/\y{#}\x{100}/',
#define PERL_IN_UTF8_C
#include "perl.h"
#include "inline_invlist.c"
-#include "charclass_invlists.h"
static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";