+/*
+=head1 Unicode Support
+
+=for apidoc isSCRIPT_RUN
+
+Returns a bool as to whether or not the sequence of bytes from C<s> up to but
+not including C<send> form a "script run". C<utf8_target> is TRUE iff the
+sequence starting at C<s> is to be treated as UTF-8. To be precise, except for
+two degenerate cases given below, this function returns TRUE iff all code
+points in it come from any combination of three "scripts" given by the Unicode
+"Script Extensions" property: Common, Inherited, and possibly one other.
+Additionally all decimal digits must come from the same consecutive sequence of
+10.
+
+For example, if all the characters in the sequence are Greek, or Common, or
+Inherited, this function will return TRUE, provided any decimal digits in it
+are the ASCII digits "0".."9". For scripts (unlike Greek) that have their own
+digits defined this will accept either digits from that set or from 0..9, but
+not a combination of the two. Some scripts, such as Arabic, have more than one
+set of digits. All digits must come from the same set for this function to
+return TRUE.
+
+C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
+contain the script found, using the C<SCX_enum> typedef. Its value will be
+C<SCX_INVALID> if the function returns FALSE.
+
+If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
+will be C<SCX_INVALID>.
+
+If the sequence contains a single code point which is unassigned to a character
+in the version of Unicode being used, the function will return TRUE, and the
+script will be C<SCX_Unknown>. Any other combination of unassigned code points
+in the input sequence will result in the function treating the input as not
+being a script run.
+
+The returned script will be C<SCX_Inherited> iff all the code points in it are
+from the Inherited script.
+
+Otherwise, the returned script will be C<SCX_Common> iff all the code points in
+it are from the Inherited or Common scripts.
+
+=cut
+
+*/
+
+bool
+Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target, SCX_enum * ret_script)
+{
+ /* Basically, it looks at each character in the sequence to see if the
+ * above conditions are met; if not it fails. It uses an inversion map to
+ * find the enum corresponding to the script of each character. But this
+ * is complicated by the fact that a few code points can be in any of
+ * several scripts. The data has been constructed so that there are
+ * additional enum values (all negative) for these situations. The
+ * absolute value of those is an index into another table which contains
+ * pointers to auxiliary tables for each such situation. Each aux array
+ * lists all the scripts for the given situation. There is another,
+ * parallel, table that gives the number of entries in each aux table.
+ * These are all defined in charclass_invlists.h */
+
+ /* XXX Here are the additional things UTS 39 says could be done:
+ * Mark Chinese strings as “mixed script” if they contain both simplified
+ * (S) and traditional (T) Chinese characters, using the Unihan data in the
+ * Unicode Character Database [UCD]. The criterion can only be applied if
+ * the language of the string is known to be Chinese. So, for example, the
+ * string “写真だけの結婚式 ” is Japanese, and should not be marked as
+ * mixed script because of a mixture of S and T characters. Testing for
+ * whether a character is S or T needs to be based not on whether the
+ * character has a S or T variant , but whether the character is an S or T
+ * variant. khw notes that the sample contains a Hiragana character, and it
+ * is unclear if absence of any foreign script marks the script as
+ * "Chinese"
+ *
+ * Forbid sequences of the same nonspacing mark
+ *
+ * Check to see that all the characters are in the sets of exemplar
+ * characters for at least one language in the Unicode Common Locale Data
+ * Repository [CLDR]. */
+
+
+ /* Things that match /\d/u */
+ SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
+ UV * decimals_array = invlist_array(decimals_invlist);
+
+ /* What code point is the digit '0' of the script run? */
+ UV zero_of_run = 0;
+ SCX_enum script_of_run = SCX_INVALID; /* Illegal value */
+ SCX_enum script_of_char = SCX_INVALID;
+
+ /* If the script remains not fully determined from iteration to iteration,
+ * this is the current intersection of the possiblities. */
+ SCX_enum * intersection = NULL;
+ PERL_UINT_FAST8_T intersection_len = 0;
+
+ bool retval = TRUE;
+
+ assert(send >= s);
+
+ PERL_ARGS_ASSERT_ISSCRIPT_RUN;
+
+ /* All code points in 0..255 are either Common or Latin, so must be a
+ * script run. We can special case it */
+ if (! utf8_target && LIKELY(send > s)) {
+ if (ret_script == NULL) {
+ return TRUE;
+ }
+
+ /* If any character is Latin, the run is Latin */
+ while (s < send) {
+ if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
+ *ret_script = SCX_Latin;
+ return TRUE;
+ }
+ }
+
+ /* If all are Common ... */
+ *ret_script = SCX_Common;
+ return TRUE;
+ }
+
+ /* Look at each character in the sequence */
+ while (s < send) {
+ UV cp;
+
+ /* The code allows all scripts to use the ASCII digits. This is
+ * because they are used in commerce even in scripts that have their
+ * own set. Hence any ASCII ones found are ok, unless a digit from
+ * another set has already been encountered. (The other digit ranges
+ * in Common are not similarly blessed) */
+ if (UNLIKELY(isDIGIT(*s))) {
+ if (UNLIKELY(script_of_run == SCX_Unknown)) {
+ retval = FALSE;
+ break;
+ }
+ if (zero_of_run > 0) {
+ if (zero_of_run != '0') {
+ retval = FALSE;
+ break;
+ }
+ }
+ else {
+ zero_of_run = '0';
+ }
+ s++;
+ continue;
+ }
+
+ /* Here, isn't an ASCII digit. Find the code point of the character */
+ if (! UTF8_IS_INVARIANT(*s)) {
+ Size_t len;
+ cp = valid_utf8_to_uvchr((U8 *) s, &len);
+ s += len;
+ }
+ else {
+ cp = *(s++);
+ }
+
+ /* If is within the range [+0 .. +9] of the script's zero, it also is a
+ * digit in that script. We can skip the rest of this code for this
+ * character. */
+ if (UNLIKELY( zero_of_run > 0
+ && cp >= zero_of_run
+ && cp - zero_of_run <= 9))
+ {
+ continue;
+ }
+
+ /* Find the character's script. The correct values are hard-coded here
+ * for small-enough code points. */
+ if (cp < 0x2B9) { /* From inspection of Unicode db; extremely
+ unlikely to change */
+ if ( cp > 255
+ || ( isALPHA_L1(cp)
+ && LIKELY(cp != MICRO_SIGN_NATIVE)))
+ {
+ script_of_char = SCX_Latin;
+ }
+ else {
+ script_of_char = SCX_Common;
+ }
+ }
+ else {
+ script_of_char = _Perl_SCX_invmap[
+ _invlist_search(PL_SCX_invlist, cp)];
+ }
+
+ /* We arbitrarily accept a single unassigned character, but not in
+ * combination with anything else, and not a run of them. */
+ if ( UNLIKELY(script_of_run == SCX_Unknown)
+ || UNLIKELY( script_of_run != SCX_INVALID
+ && script_of_char == SCX_Unknown))
+ {
+ retval = FALSE;
+ break;
+ }
+
+ /* For the first character, or the run is inherited, the run's script
+ * is set to the char's */
+ if ( UNLIKELY(script_of_run == SCX_INVALID)
+ || UNLIKELY(script_of_run == SCX_Inherited))
+ {
+ script_of_run = script_of_char;
+ }
+
+ /* For the character's script to be Unknown, it must be the first
+ * character in the sequence (for otherwise a test above would have
+ * prevented us from reaching here), and we have set the run's script
+ * to it. Nothing further to be done for this character */
+ if (UNLIKELY(script_of_char == SCX_Unknown)) {
+ continue;
+ }
+
+ /* We accept 'inherited' script characters currently even at the
+ * beginning. (We know that no characters in Inherited are digits, or
+ * we'd have to check for that) */
+ if (UNLIKELY(script_of_char == SCX_Inherited)) {
+ continue;
+ }
+
+ /* If the run so far is Common, and the new character isn't, change the
+ * run's script to that of this character */
+ if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
+
+ /* But Common contains several sets of digits. Only the '0' set
+ * can be part of another script. */
+ if (zero_of_run > 0 && zero_of_run != '0') {
+ retval = FALSE;
+ break;
+ }
+
+ script_of_run = script_of_char;
+ }
+
+ /* All decimal digits must be from the same sequence of 10. Above, we
+ * handled any ASCII digits without descending to here. We also
+ * handled the case where we already knew what digit sequence is the
+ * one to use, and the character is in that sequence. Now that we know
+ * the script, we can use script_zeros[] to directly find which
+ * sequence the script uses, except in a few cases it returns 0 */
+ if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) {
+ zero_of_run = script_zeros[script_of_char];
+ }
+
+ /* Now we can see if the script of the character is the same as that of
+ * the run */
+ if (LIKELY(script_of_char == script_of_run)) {
+ /* By far the most common case */
+ goto scripts_match;
+ }
+
+
+ /* Here, the script of the run isn't Common. But characters in Common
+ * match any script */
+ if (script_of_char == SCX_Common) {
+ goto scripts_match;
+ }
+
+#ifndef HAS_SCX_AUX_TABLES
+
+ /* Too early a Unicode version to have a code point belonging to more
+ * than one script, so, if the scripts don't exactly match, fail */
+ retval = FALSE;
+ break;
+
+#else
+
+ /* Here there is no exact match between the character's script and the
+ * run's. And we've handled the special cases of scripts Unknown,
+ * Inherited, and Common.
+ *
+ * Negative script numbers signify that the value may be any of several
+ * scripts, and we need to look at auxiliary information to make our
+ * deterimination. But if both are non-negative, we can fail now */
+ if (LIKELY(script_of_char >= 0)) {
+ const SCX_enum * search_in;
+ PERL_UINT_FAST8_T search_in_len;
+ PERL_UINT_FAST8_T i;
+
+ if (LIKELY(script_of_run >= 0)) {
+ retval = FALSE;
+ break;
+ }
+
+ /* Use the previously constructed set of possible scripts, if any.
+ * */
+ if (intersection) {
+ search_in = intersection;
+ search_in_len = intersection_len;
+ }
+ else {
+ search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
+ search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
+ }
+
+ for (i = 0; i < search_in_len; i++) {
+ if (search_in[i] == script_of_char) {
+ script_of_run = script_of_char;
+ goto scripts_match;
+ }
+ }
+
+ retval = FALSE;
+ break;
+ }
+ else if (LIKELY(script_of_run >= 0)) {
+ /* script of character could be one of several, but run is a single
+ * script */
+ const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
+ const PERL_UINT_FAST8_T search_in_len
+ = SCX_AUX_TABLE_lengths[-script_of_char];
+ PERL_UINT_FAST8_T i;
+
+ for (i = 0; i < search_in_len; i++) {
+ if (search_in[i] == script_of_run) {
+ script_of_char = script_of_run;
+ goto scripts_match;
+ }
+ }
+
+ retval = FALSE;
+ break;
+ }
+ else {
+ /* Both run and char could be in one of several scripts. If the
+ * intersection is empty, then this character isn't in this script
+ * run. Otherwise, we need to calculate the intersection to use
+ * for future iterations of the loop, unless we are already at the
+ * final character */
+ const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
+ const PERL_UINT_FAST8_T char_len
+ = SCX_AUX_TABLE_lengths[-script_of_char];
+ const SCX_enum * search_run;
+ PERL_UINT_FAST8_T run_len;
+
+ SCX_enum * new_overlap = NULL;
+ PERL_UINT_FAST8_T i, j;
+
+ if (intersection) {
+ search_run = intersection;
+ run_len = intersection_len;
+ }
+ else {
+ search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
+ run_len = SCX_AUX_TABLE_lengths[-script_of_run];
+ }
+
+ intersection_len = 0;
+
+ for (i = 0; i < run_len; i++) {
+ for (j = 0; j < char_len; j++) {
+ if (search_run[i] == search_char[j]) {
+
+ /* Here, the script at i,j matches. That means this
+ * character is in the run. But continue on to find
+ * the complete intersection, for the next loop
+ * iteration, and for the digit check after it.
+ *
+ * On the first found common script, we malloc space
+ * for the intersection list for the worst case of the
+ * intersection, which is the minimum of the number of
+ * scripts remaining in each set. */
+ if (intersection_len == 0) {
+ Newx(new_overlap,
+ MIN(run_len - i, char_len - j),
+ SCX_enum);
+ }
+ new_overlap[intersection_len++] = search_run[i];
+ }
+ }
+ }
+
+ /* Here we've looked through everything. If they have no scripts
+ * in common, not a run */
+ if (intersection_len == 0) {
+ retval = FALSE;
+ break;
+ }
+
+ /* If there is only a single script in common, set to that.
+ * Otherwise, use the intersection going forward */
+ Safefree(intersection);
+ if (intersection_len == 1) {
+ script_of_run = script_of_char = new_overlap[0];
+ Safefree(new_overlap);
+ }
+ else {
+ intersection = new_overlap;
+ }
+ }
+