|const bool to_fold \
|const bool runtime \
|const bool deferrable \
+ |NULLOK AV ** strings \
|NN bool * user_defined_ptr \
|NN SV * msg \
|const STRLEN level
ES |bool |handle_names_wildcard \
|NN const char * wname \
|const STRLEN wname_len \
- |NN SV ** prop_definition
+ |NN SV ** prop_definition \
+ |NN AV ** strings
ES |void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \
|NN SV** invlist
ES |regnode_offset|handle_named_backref|NN RExC_state_t *pRExC_state \
#define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b)
#define grok_bslash_N(a,b,c,d,e,f,g) S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g)
#define handle_named_backref(a,b,c,d) S_handle_named_backref(aTHX_ a,b,c,d)
-#define handle_names_wildcard(a,b,c) S_handle_names_wildcard(aTHX_ a,b,c)
+#define handle_names_wildcard(a,b,c,d) S_handle_names_wildcard(aTHX_ a,b,c,d)
#define handle_possible_posix(a,b,c,d,e) S_handle_possible_posix(aTHX_ a,b,c,d,e)
#define handle_regex_sets(a,b,c,d,e) S_handle_regex_sets(aTHX_ a,b,c,d,e)
#define handle_user_defined_property(a,b,c,d,e,f,g,h,i,j) S_handle_user_defined_property(aTHX_ a,b,c,d,e,f,g,h,i,j)
#define nextchar(a) S_nextchar(aTHX_ a)
#define output_posix_warnings(a,b) S_output_posix_warnings(aTHX_ a,b)
#define parse_lparen_question_flags(a) S_parse_lparen_question_flags(aTHX_ a)
-#define parse_uniprop_string(a,b,c,d,e,f,g,h,i) S_parse_uniprop_string(aTHX_ a,b,c,d,e,f,g,h,i)
+#define parse_uniprop_string(a,b,c,d,e,f,g,h,i,j) S_parse_uniprop_string(aTHX_ a,b,c,d,e,f,g,h,i,j)
#define populate_ANYOF_from_invlist(a,b) S_populate_ANYOF_from_invlist(aTHX_ a,b)
#define reg(a,b,c,d) S_reg(aTHX_ a,b,c,d)
#define reg2Lanode(a,b,c,d) S_reg2Lanode(aTHX_ a,b,c,d)
my $loose_name = get_loose_name($name);
use charnames ":loose";
is(charnames::string_vianame($loose_name), $utf8, "Verify string_vianame(\"$loose_name\") is the proper utf8");
+
+ like($utf8, qr/^\p{name=$name}$/, "Verify /\p{name=$name}\$/ is the proper utf8");
+ like($utf8, qr/^\p{name=$loose_name}$/, "Verify /\p{name=$loose_name}\$/ is the proper utf8");
+ like($utf8, qr!^\p{name=/\A$name\z/}!, "Verify /\p{name=/$\A$name\z/} is the proper utf8");
#diag("$name, $utf8");
}
close $fh;
=item In '(*VERB...)', the '(' and '*' must be adjacent in regex;
marked by S<<-- HERE> in m/%s/
+=item Inverting a character class which contains a multi-character
+sequence is illegal in regex; marked by <-- HERE in m/%s/
+
+(F) You wrote something like
+
+ qr/\P{name=KATAKANA LETTER AINU P}/
+ qr/[^\p{name=KATAKANA LETTER AINU P}]/
+
+This name actually evaluates to a sequence of two Katakana characters,
+not just a single one, and it is illegal to try to take the complement
+of a sequence. (Mathematically it would mean any sequence of characters
+from 0 to infinity in length that weren't these two in a row, and that
+is likely not of any real use.)
+
(F) The two-character sequence C<"(*"> in this context in a regular
expression pattern should be an indivisible token, with nothing
intervening between the C<"("> and the C<"*">, but you separated them.
can interpolate only with eval yes [1]
custom names yes no [2]
name aliases yes yes [3]
- named sequences yes not yet [4]
+ named sequences yes yes [4]
name value parsing exact Unicode loose [5]
=over
Some particular sequences of characters are given a single name, in
addition to their individual ones.
-It is planned to add support for named sequences to the C<\p{...}> form
-before 5.32; in the meantime, an accurate but not fully informative
-message is generated if use of one of these is attempted.
-
=item [5]
Exact name value matching means you have to specify case, hyphens,
warning if your pattern ends up causing every possible code point to
match.
-Starting in 5.32, the Name and Name Aliases properties are allowed to be
-matched. They are considered to be a single combination property, just
-as has long been the case for C<\N{}>. Loose matching doesn't work in
-exactly the same way for these as it does for the values of other
-properties. The rules are given in
+Starting in 5.32, the Name, Name Aliases, and Named Sequences properties
+are allowed to be matched. They are considered to be a single
+combination property, just as has long been the case for C<\N{}>. Loose
+matching doesn't work in exactly the same way for these as it does for
+the values of other properties. The rules are given in
L<https://www.unicode.org/reports/tr44/tr44-24.html#UAX44-LM2>. As a
result, Perl doesn't try loose matching for you, like it does in other
properties. All letters in names are uppercase, but you can add C<(?i)>
STATIC regnode_offset S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, char * parse_start, char ch);
#define PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF \
assert(pRExC_state); assert(flagp); assert(parse_start)
-STATIC bool S_handle_names_wildcard(pTHX_ const char * wname, const STRLEN wname_len, SV ** prop_definition);
+STATIC bool S_handle_names_wildcard(pTHX_ const char * wname, const STRLEN wname_len, SV ** prop_definition, AV ** strings);
#define PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD \
- assert(wname); assert(prop_definition)
+ assert(wname); assert(prop_definition); assert(strings)
STATIC int S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, const char* const s, char ** updated_parse_ptr, AV** posix_warnings, const bool check_only);
#define PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX \
assert(pRExC_state); assert(s)
STATIC void S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state);
#define PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS \
assert(pRExC_state)
-STATIC SV * S_parse_uniprop_string(pTHX_ const char * const name, Size_t name_len, const bool is_utf8, const bool to_fold, const bool runtime, const bool deferrable, bool * user_defined_ptr, SV * msg, const STRLEN level);
+STATIC SV * S_parse_uniprop_string(pTHX_ const char * const name, Size_t name_len, const bool is_utf8, const bool to_fold, const bool runtime, const bool deferrable, AV ** strings, bool * user_defined_ptr, SV * msg, const STRLEN level);
#define PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING \
assert(name); assert(user_defined_ptr); assert(msg)
STATIC void S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr);
/* If set TRUE, the property is user-defined as opposed to
* official Unicode */
bool user_defined = FALSE;
+ AV * strings = NULL;
SV * prop_definition = parse_uniprop_string(
name, n, UTF, FOLD,
* this call */
! cBOOL(ret_invlist),
+ &strings,
&user_defined,
msg,
0 /* Base level */
SvCUR(msg), SvPVX(msg)));
}
- if (! is_invlist(prop_definition)) {
+ assert(prop_definition || strings);
+
+ if (strings) {
+ if (! RExC_in_multi_char_class) {
+ if (invert ^ (value == 'P')) {
+ RExC_parse = e + 1;
+ vFAIL("Inverting a character class which contains"
+ " a multi-character sequence is illegal");
+ }
+
+ /* For each multi-character string ... */
+ while (av_tindex(strings) >= 0) {
+ /* ... Each entry is itself an array of code
+ * points. */
+ AV * this_string = (AV *) av_shift( strings);
+ STRLEN cp_count = av_tindex(this_string) + 1;
+ SV * final = newSV(cp_count * 4);
+ SvPVCLEAR(final);
+
+ /* Create another string of sequences of \x{...} */
+ while (av_tindex(this_string) >= 0) {
+ SV * character = av_shift(this_string);
+ UV cp = SvUV(character);
+
+ if (cp > 255) {
+ REQUIRE_UTF8(flagp);
+ }
+ Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
+ cp);
+ SvREFCNT_dec_NN(character);
+ }
+ SvREFCNT_dec_NN(this_string);
+
+ /* And add that to the list of such things */
+ multi_char_matches
+ = add_multi_match(multi_char_matches,
+ final,
+ cp_count);
+ }
+ }
+ SvREFCNT_dec_NN(strings);
+ }
+
+ if (! prop_definition) { /* If we got only a string,
+ this iteration didn't really
+ find a character */
+ element_count--;
+ }
+ else if (! is_invlist(prop_definition)) {
/* Here, the definition isn't known, so we have gotten
* returned a string that will be evaluated if and when
this_definition = parse_uniprop_string(s0, s - s0,
is_utf8, to_fold, runtime,
deferrable,
+ NULL,
user_defined_ptr, msg,
(name_len == 0)
? level /* Don't increase level
const bool runtime, /* TRUE if this is being called at run time */
const bool deferrable, /* TRUE if it's ok for the definition to not be
known at this call */
+ AV ** strings, /* To return string property values, like named
+ sequences */
bool *user_defined_ptr, /* Upon return from this function it will be
set to TRUE if any component is a
user-defined property */
/* Farm this out to a function just to make the current
* function less unwieldy */
if (handle_names_wildcard(revised_name, revised_name_len,
- &prop_definition))
+ &prop_definition,
+ strings))
{
return prop_definition;
}
to_fold,
runtime,
deferrable,
+ NULL,
user_defined_ptr,
msg,
level + 1);
}
cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
- if (character_len < SvCUR(character)) {
- /* Temporarily, named sequences aren't handled */
- goto failed;
+ if (character_len == SvCUR(character)) {
+ prop_definition = add_cp_to_invlist(NULL, cp);
+ }
+ else {
+ AV * this_string;
+
+ /* First of the remaining characters in the string. */
+ char * remaining = SvPVX(character) + character_len;
+
+ if (strings == NULL) {
+ goto failed; /* XXX Perhaps a specific msg instead, like
+ 'not available here' */
+ }
+
+ if (*strings == NULL) {
+ *strings = newAV();
+ }
+
+ this_string = newAV();
+ av_push(this_string, newSVuv(cp));
+
+ do {
+ cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
+ av_push(this_string, newSVuv(cp));
+ remaining += character_len;
+ } while (remaining < SvEND(character));
+
+ av_push(*strings, (SV *) this_string);
}
- prop_definition = add_cp_to_invlist(NULL, cp);
return prop_definition;
}
STATIC bool
S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
const STRLEN wname_len, /* Its length */
- SV ** prop_definition)
+ SV ** prop_definition,
+ AV ** strings)
{
/* Deal with Name property wildcard subpatterns; returns TRUE if there were
* any matches, adding them to prop_definition */
+ RX_OFFS(subpattern_re)->end;
char * cp_start;
char * cp_end;
- UV cp;
+ UV cp = 0; /* Silences some compilers */
+ AV * this_string = NULL;
+ bool is_multi = FALSE;
/* If matched nothing, advance to next possible match */
if (this_name_start == this_name_end) {
/* All code points are 5 digits long */
cp_start = cp_end - 4;
+ /* This shouldn't happen, as we found a \n, and the first \n is
+ * further along than what we subtracted */
+ assert(cp_start >= all_names_start);
+
+ if (cp_start == all_names_start) {
+ *prop_definition = add_cp_to_invlist(*prop_definition, 0);
+ continue;
+ }
+
+ /* If the character is a blank, we either have a named sequence, or
+ * something is wrong */
+ if (*(cp_start - 1) == ' ') {
+ cp_start = (char *) my_memrchr(all_names_start,
+ '\n',
+ cp_start - all_names_start);
+ cp_start++;
+ }
+
+ assert(cp_start != NULL && cp_start >= all_names_start + 2);
+
/* Except for the first line in the string, the sequence before the
* code point is \n\n. If that isn't the case here, we didn't
* match the name of a character. (We could have matched a named
* sequence, not currently handled */
- if ( cp_start > all_names_start + 1
- && (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n'))
- {
+ if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
continue;
}
- /* Calculate the code point from its 5 digits */
+ /* We matched! Add this to the list */
+ found_matches = TRUE;
+
+ /* Loop through all the code points in the sequence */
+ while (cp_start < cp_end) {
+
+ /* Calculate this code point from its 5 digits */
cp = (XDIGIT_VALUE(cp_start[0]) << 16)
+ (XDIGIT_VALUE(cp_start[1]) << 12)
+ (XDIGIT_VALUE(cp_start[2]) << 8)
+ (XDIGIT_VALUE(cp_start[3]) << 4)
+ XDIGIT_VALUE(cp_start[4]);
- /* We matched! Add this to the list */
- *prop_definition = add_cp_to_invlist(*prop_definition, cp);
- found_matches = TRUE;
+ cp_start += 6; /* Go past any blank */
+
+ if (cp_start < cp_end || is_multi) {
+ if (this_string == NULL) {
+ this_string = newAV();
+ }
+
+ is_multi = TRUE;
+ av_push(this_string, newSVuv(cp));
+ }
+ }
+
+ if (is_multi) { /* Was more than one code point */
+ if (*strings == NULL) {
+ *strings = newAV();
+ }
+
+ av_push(*strings, (SV *) this_string);
+ }
+ else { /* Only a single code point */
+ *prop_definition = add_cp_to_invlist(*prop_definition, cp);
+ }
} /* End of loop through the non-algorithmic names string */
}