+ /* For backcompat, allow an empty first line */
+ if (*s0 == '\n') {
+ s0++;
+ continue;
+ }
+
+ /* First character in the line may optionally be the operation */
+ if ( *s0 == '+'
+ || *s0 == '!'
+ || *s0 == '-'
+ || *s0 == '&')
+ {
+ op = *s0++;
+ }
+
+ /* If the line is one or two hex numbers separated by blank space, it's
+ * a range; otherwise it is either another user-defined property or an
+ * error */
+
+ s = s0;
+
+ if (! isXDIGIT(*s)) {
+ goto check_if_property;
+ }
+
+ do { /* Each new hex digit will add 4 bits. */
+ if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
+ s = strchr(s, '\n');
+ if (s == NULL) {
+ s = e;
+ }
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpv(msg, overflow_msg);
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
+ UTF8fARG(is_contents_utf8, s - s0, s0));
+ sv_catpvs(msg, "\"");
+ goto return_msg;
+ }
+
+ /* Accumulate this digit into the value */
+ min = (min << 4) + READ_XDIGIT(s);
+ } while (isXDIGIT(*s));
+
+ while (isBLANK(*s)) { s++; }
+
+ /* We allow comments at the end of the line */
+ if (*s == '#') {
+ s = strchr(s, '\n');
+ if (s == NULL) {
+ s = e;
+ }
+ s++;
+ }
+ else if (s < e && *s != '\n') {
+ if (! isXDIGIT(*s)) {
+ goto check_if_property;
+ }
+
+ /* Look for the high point of the range */
+ max = 0;
+ do {
+ if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
+ s = strchr(s, '\n');
+ if (s == NULL) {
+ s = e;
+ }
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpv(msg, overflow_msg);
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
+ UTF8fARG(is_contents_utf8, s - s0, s0));
+ sv_catpvs(msg, "\"");
+ goto return_msg;
+ }
+
+ max = (max << 4) + READ_XDIGIT(s);
+ } while (isXDIGIT(*s));
+
+ while (isBLANK(*s)) { s++; }
+
+ if (*s == '#') {
+ s = strchr(s, '\n');
+ if (s == NULL) {
+ s = e;
+ }
+ }
+ else if (s < e && *s != '\n') {
+ goto check_if_property;
+ }
+ }
+
+ if (max == -1) { /* The line only had one entry */
+ max = min;
+ }
+ else if (max < min) {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Illegal range in \"");
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
+ UTF8fARG(is_contents_utf8, s - s0, s0));
+ sv_catpvs(msg, "\"");
+ goto return_msg;
+ }
+
+#if 0 /* See explanation at definition above of get_extended_utf8_msg() */
+
+ if ( UNICODE_IS_PERL_EXTENDED(min)
+ || UNICODE_IS_PERL_EXTENDED(max))
+ {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+
+ /* If both code points are non-portable, warn only on the lower
+ * one. */
+ sv_catpv(msg, get_extended_utf8_msg(
+ (UNICODE_IS_PERL_EXTENDED(min))
+ ? min : max));
+ sv_catpvs(msg, " in \"");
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
+ UTF8fARG(is_contents_utf8, s - s0, s0));
+ sv_catpvs(msg, "\"");
+ }
+
+#endif
+
+ /* Here, this line contains a legal range */
+ this_definition = sv_2mortal(_new_invlist(2));
+ this_definition = _add_range_to_invlist(this_definition, min, max);
+ goto calculate;
+
+ check_if_property:
+
+ /* Here it isn't a legal range line. See if it is a legal property
+ * line. First find the end of the meat of the line */
+ s = strpbrk(s, "#\n");
+ if (s == NULL) {
+ s = e;
+ }
+
+ /* Ignore trailing blanks in keeping with the requirements of
+ * parse_uniprop_string() */
+ s--;
+ while (s > s0 && isBLANK_A(*s)) {
+ s--;
+ }
+ s++;
+
+ this_definition = parse_uniprop_string(s0, s - s0,
+ is_utf8, to_fold, runtime,
+ deferrable,
+ user_defined_ptr, msg,
+ (name_len == 0)
+ ? level /* Don't increase level
+ if input is empty */
+ : level + 1
+ );
+ if (this_definition == NULL) {
+ goto return_msg; /* 'msg' should have had the reason appended to
+ it by the above call */
+ }
+
+ if (! is_invlist(this_definition)) { /* Unknown at this time */
+ return newSVsv(this_definition);
+ }
+
+ if (*s != '\n') {
+ s = strchr(s, '\n');
+ if (s == NULL) {
+ s = e;
+ }
+ }
+
+ calculate:
+
+ switch (op) {
+ case '+':
+ _invlist_union(running_definition, this_definition,
+ &running_definition);
+ break;
+ case '-':
+ _invlist_subtract(running_definition, this_definition,
+ &running_definition);
+ break;
+ case '&':
+ _invlist_intersection(running_definition, this_definition,
+ &running_definition);
+ break;
+ case '!':
+ _invlist_union_complement_2nd(running_definition,
+ this_definition, &running_definition);
+ break;
+ default:
+ Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
+ __FILE__, __LINE__, op);
+ break;
+ }
+
+ /* Position past the '\n' */
+ s0 = s + 1;
+ } /* End of loop through the lines of 'contents' */
+
+ /* Here, we processed all the lines in 'contents' without error. If we
+ * didn't add any warnings, simply return success */
+ if (msgs_length_on_entry == SvCUR(msg)) {
+
+ /* If the expansion was empty, the answer isn't nothing: it's an empty
+ * inversion list */
+ if (running_definition == NULL) {
+ running_definition = _new_invlist(1);
+ }
+
+ return running_definition;
+ }
+
+ /* Otherwise, add some explanatory text, but we will return success */
+
+ return_msg:
+
+ if (name_len > 0) {
+ sv_catpvs(msg, " in expansion of ");
+ Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
+ }
+
+ return running_definition;
+}
+
+/* As explained below, certain operations need to take place in the first
+ * thread created.  These macros switch the interpreter context to the one
+ * stored in PL_user_def_props_aTHX and back again.  Without USE_ITHREADS
+ * there is no context to switch, so they become no-ops and the context
+ * values are NULL.
+ *
+ * DECLARATION_FOR_GLOBAL_CONTEXT
+ *      Declares 'save_aTHX', initialized to the aTHX in effect at that
+ *      point.  Its threaded expansion supplies its own terminating ';'.
+ * SWITCH_TO_GLOBAL_CONTEXT
+ *      Sets aTHX to PL_user_def_props_aTHX and makes that the current
+ *      context via PERL_SET_CONTEXT.
+ * RESTORE_CONTEXT
+ *      Sets aTHX back to 'save_aTHX' and makes that the current context.
+ *      Like SWITCH_TO_GLOBAL_CONTEXT, the expansion carries no trailing
+ *      ';'; the caller supplies it.  This way 'RESTORE_CONTEXT;' is a single
+ *      statement with or without USE_ITHREADS, and so is safe as the
+ *      unbraced body of an 'if'.
+ * CUR_CONTEXT
+ *      The context currently in effect (aTHX).
+ * ORIGINAL_CONTEXT
+ *      The context saved by DECLARATION_FOR_GLOBAL_CONTEXT (save_aTHX).
+ */
+#ifdef USE_ITHREADS
+#  define DECLARATION_FOR_GLOBAL_CONTEXT                                     \
+    PerlInterpreter * save_aTHX = aTHX;
+#  define SWITCH_TO_GLOBAL_CONTEXT                                           \
+    PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
+#  define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX))
+#  define CUR_CONTEXT aTHX
+#  define ORIGINAL_CONTEXT save_aTHX
+#else
+#  define DECLARATION_FOR_GLOBAL_CONTEXT
+#  define SWITCH_TO_GLOBAL_CONTEXT NOOP
+#  define RESTORE_CONTEXT NOOP
+#  define CUR_CONTEXT NULL
+#  define ORIGINAL_CONTEXT NULL
+#endif
+
+STATIC void
+S_delete_recursion_entry(pTHX_ void *key)
+{
+    /* Removes from PL_user_def_props the placeholder entry that is used to
+     * detect recursion while a user-defined property is being expanded.
+     * 'key' is the NUL-terminated hash key of that entry.  This is a
+     * stand-alone function so that it can be set up to be called even if
+     * the program unexpectedly quits.
+     *
+     * The lookup and deletion are performed with the context switched to
+     * the global one; the caller's context is restored before returning. */
+
+    dVAR;
+    const char * const prop_key = (const char *) key;
+    const STRLEN prop_key_len = strlen(prop_key);
+    SV ** entry_ptr;
+    DECLARATION_FOR_GLOBAL_CONTEXT;
+
+    SWITCH_TO_GLOBAL_CONTEXT;
+
+    entry_ptr = hv_fetch(PL_user_def_props, prop_key, prop_key_len, 0);
+    if (entry_ptr != NULL) {
+        SV * const entry = *entry_ptr;
+
+        /* An inversion list or a string is a permanent entry, not the one
+         * used to detect recursion, and so must be left in place.  Only
+         * something that is neither of those gets deleted */
+        if (! (is_invlist(entry) || SvPOK(entry))) {
+            (void) hv_delete(PL_user_def_props, prop_key, prop_key_len,
+                             G_DISCARD);
+        }
+    }
+
+    RESTORE_CONTEXT;
+}
+
+SV *
+Perl_parse_uniprop_string(pTHX_
+
+ /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
+ * now. If so, the return is an inversion list.
+ *
+ * If the property is user-defined, it is a subroutine, which in turn
+ * may call other subroutines. This function will call the whole nest of
+ * them to get the definition they return; if some aren't known at the time
+ * of the call to this function, the fully qualified name of the highest
+ * level sub is returned. It is an error to call this function at runtime
+ * without every sub defined.
+ *
+ * If an error was found, NULL is returned, and 'msg' gets a suitable
+ * message appended to it. (Appending allows the back trace of how we got
+ * to the faulty definition to be displayed through nested calls of
+ * user-defined subs.)
+ *
+ * The caller should NOT try to free any returned inversion list.
+ *
+ * Other parameters will be set on return as described below */
+
+ const char * const name, /* The first non-blank in the \p{}, \P{} */
+ const Size_t name_len, /* Its length in bytes, not including any
+ trailing space */
+ const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
+ const bool to_fold, /* ? Is this under /i */
+ const bool runtime, /* TRUE if this is being called at run time */
+ const bool deferrable, /* TRUE if it's ok for the definition to not be
+ known at this call */
+ bool *user_defined_ptr, /* Upon return from this function it will be
+ set to TRUE if any component is a
+ user-defined property */
+ SV * msg, /* Any error or warning msg(s) are appended to
+ this */
+ const STRLEN level) /* Recursion level of this call */
+{
+ dVAR;
+ char* lookup_name; /* normalized name for lookup in our tables */
+ unsigned lookup_len; /* Its length */
+ bool stricter = FALSE; /* Some properties have stricter name
+ normalization rules, which we decide upon
+ based on parsing */
+
+ /* nv= or numeric_value=, or possibly one of the cjk numeric properties
+ * (though it requires extra effort to download them from Unicode and
+ * compile perl to know about them) */
+ bool is_nv_type = FALSE;
+
+ unsigned int i, j = 0;
+ int equals_pos = -1; /* Where the '=' is found, or negative if none */
+ int slash_pos = -1; /* Where the '/' is found, or negative if none */
+ int table_index = 0; /* The entry number for this property in the table
+ of all Unicode property names */
+ bool starts_with_In_or_Is = FALSE; /* ? Does the name start with 'In' or
+ 'Is' */
+ Size_t lookup_offset = 0; /* Used to ignore the first few characters of
+ the normalized name in certain situations */
+ Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
+ part of a package name */
+ bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
+ property rather than a Unicode
+ one. */
+ SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
+ if an error. If it is an inversion list,
+ it is the definition. Otherwise it is a
+ string containing the fully qualified sub
+ name of 'name' */
+ bool invert_return = FALSE; /* ? Do we need to complement the result before
+ returning it */