return FALSE; /* Things like CNTRL are always below 256 */
}
+STATIC char *
+S_find_next_ascii(char * s, const char * send, const bool utf8_target)
+{
+ /* Returns the position of the first ASCII byte in the sequence between 's'
+ * and 'send-1' inclusive; returns 'send' if none found */
+
+ PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
+
+#ifndef EBCDIC
+
+ if ((STRLEN) (send - s) >= PERL_WORDSIZE
+
+ /* This term is wordsize if subword; 0 if not */
+ + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
+
+ /* 'offset' */
+ - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
+ {
+
+ /* Process per-byte until reach word boundary. XXX This loop could be
+ * eliminated if we knew that this platform had fast unaligned reads */
+ while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
+ if (isASCII(*s)) {
+ return s;
+ }
+ s++; /* khw didn't bother creating a separate loop for
+ utf8_target */
+ }
+
+ /* Here, we know we have at least one full word to process. Process
+ * per-word as long as we have at least a full word left */
+ do {
+ if ((* (PERL_UINTMAX_T *) s) & ~ PERL_VARIANTS_WORD_MASK) {
+ break;
+ }
+ s += PERL_WORDSIZE;
+ } while (s + PERL_WORDSIZE <= send);
+ }
+
+#endif
+
+ /* Process per-character */
+ if (utf8_target) {
+ while (s < send) {
+ if (isASCII(*s)) {
+ return s;
+ }
+ s += UTF8SKIP(s);
+ }
+ }
+ else {
+ while (s < send) {
+ if (isASCII(*s)) {
+ return s;
+ }
+ s++;
+ }
+ }
+
+ return s;
+}
+
+STATIC char *
+S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
+{
+ /* Returns the position of the first non-ASCII byte in the sequence between
+ * 's' and 'send-1' inclusive; returns 'send' if none found */
+
+#ifdef EBCDIC
+
+ PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
+
+ if (utf8_target) {
+ while (s < send) {
+ if ( ! isASCII(*s)) {
+ return s;
+ }
+ s += UTF8SKIP(s);
+ }
+ }
+ else {
+ while (s < send) {
+ if ( ! isASCII(*s)) {
+ return s;
+ }
+ s++;
+ }
+ }
+
+ return s;
+
+#else
+
+ const U8 * next_non_ascii = NULL;
+
+ PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
+ PERL_UNUSED_ARG(utf8_target);
+
+ /* On ASCII platforms invariants and ASCII are identical, so if the string
+ * is entirely invariants, there is no non-ASCII character */
+ return (is_utf8_invariant_string_loc((U8 *) s,
+ (STRLEN) (send - s),
+ &next_non_ascii))
+ ? (char *) send
+ : (char *) next_non_ascii;
+
+#endif
+
+}
+
/*
* pregexec and friends
*/
);
break;
+ case ASCII:
+ s = find_next_ascii(s, strend, utf8_target);
+ if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) {
+ goto got_it;
+ }
+
+ break;
+
+ case NASCII:
+ s = find_next_non_ascii(s, strend, utf8_target);
+ if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) {
+ goto got_it;
+ }
+
+ break;
+
/* The argument to all the POSIX node types is the class number to pass to
* _generic_isCC() to build a mask for searching in PL_charclass[] */
bool is_utf8_pat = reginfo->is_utf8_pat;
bool match = FALSE;
I32 orig_savestack_ix = PL_savestack_ix;
- char * script_run_begin = NULL;
+ U8 * script_run_begin = NULL;
/* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
#if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
}
break;
+ case ASCII:
+ if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) {
+ sayNO;
+ }
+
+ locinput++; /* ASCII is always single byte */
+ break;
+
+ case NASCII:
+ if (NEXTCHR_IS_EOS || isASCII(UCHARAT(locinput))) {
+ sayNO;
+ }
+
+ goto increment_locinput;
+ break;
+
/* The argument (FLAGS) to all the POSIX node types is the class number
* */
break;
case SROPEN: /* (*SCRIPT_RUN: */
- script_run_begin = locinput;
+ script_run_begin = (U8 *) locinput;
break;
/* XXX really need to log other places start/end are set too */
break;
case SRCLOSE: /* (*SCRIPT_RUN: ... ) */
- {
- /* Checks that every character in the sequence started by SROPEN
- * and ending here is one of three scripts: Common, Inherited, and
- * possibly one other. Additionally all decimal digits must come
- * from the same consecutive sequence of 10.
- *
- * Basically, it looks at each character in the sequence to see if
- * the above conditions are met; if not it fails. It uses an
- * inversion map to find the enum corresponding to the script of
- * each character. But this is complicated by the fact that a few
- * code points can be in any of several scripts. The data has been
- * constructed so that there are additional enum values (all
- * negative) for these situations. The absolute value of those is
- * an index into another table which contains pointers to auxiliary
- * tables for each such situation. Each aux array lists all the
- * scripts for the given situation. There is another, parallel,
- * table that gives the number of entries in each aux table. These
- * are all defined in charclass_invlists.h */
-
- /* XXX Here are the additional things UTS 39 says could be done:
- * Mark Chinese strings as “mixed script” if they contain both
- * simplified (S) and traditional (T) Chinese characters, using the
- * Unihan data in the Unicode Character Database [UCD]. The
- * criterion can only be applied if the language of the string is
- * known to be Chinese. So, for example, the string
- * “写真だけの結婚式 ” is Japanese, and should not be marked as
- * mixed script because of a mixture of S and T characters.
- * Testing for whether a character is S or T needs to be based not
- * on whether the character has a S or T variant , but whether the
- * character is an S or T variant. khw notes that the sample
- * contains a Hiragana character, and it is unclear if absence of
- * any foreign script marks the script as "Chinese"
- *
- * Forbid sequences of the same nonspacing mark
- *
- * Check to see that all the characters are in the sets of exemplar
- * characters for at least one language in the Unicode Common
- * Locale Data Repository [CLDR]. */
-
-
- /* Things that match /\d/u */
- SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
- UV * decimals_array = invlist_array(decimals_invlist);
-
- /* What code point is the digit '0' of the script run? */
- UV zero_of_run = 0;
- SCX_enum script_of_run = SCX_INVALID; /* Illegal value */
- SCX_enum script_of_char = SCX_INVALID;
-
- /* If the script remains not fully determined from iteration to
- * iteration, this is the current intersection of the possiblities.
- * */
- SCX_enum * intersection = NULL;
- PERL_UINT_FAST8_T intersection_len = 0;
-
- const char * s = script_run_begin;
- const char * strend = locinput;
-
- assert(s);
-
- /* Look at each character in the sequence */
- while (s < strend) {
- UV cp;
-
- /* The code allows all scripts to use the ASCII digits. This
- * is because they are used in commerce even in scripts that
- * have their own set. Hence any ASCII ones found are ok,
- * unless a digit from another set has already been
- * encountered. (The other digit ranges in Common are not
- * similarly blessed */
- if (UNLIKELY(isDIGIT(*s))) {
- if (zero_of_run > 0) {
- if (zero_of_run != '0') {
- Safefree(intersection);
- sayNO;
- }
- }
- else {
- zero_of_run = '0';
- }
- s++;
- continue;
- }
-
- /* Here, isn't an ASCII digit. Find the code point of the
- * character */
- if (utf8_target && ! UTF8_IS_INVARIANT(*s)) {
- Size_t len;
- cp = valid_utf8_to_uvchr((U8 *) s, &len);
- s += len;
- }
- else {
- cp = *(s++);
- }
-
- /* If is within the range [+0 .. +9] of the script's zero, it
- * also is a digit in that script. We can skip the rest of
- * this code for this character. */
- if (UNLIKELY( zero_of_run > 0
- && cp >= zero_of_run
- && cp - zero_of_run <= 9))
- {
- continue;
- }
-
- /* Find the character's script. The correct values are
- * hard-coded here for small-enough code points. */
- if (cp < 0x2B9) { /* From inspection of Unicode db; extremely
- unlikely to change */
- if ( cp > 255
- || ( isALPHA_L1(cp)
- && LIKELY(cp != MICRO_SIGN_NATIVE)))
- {
- script_of_char = SCX_Latin;
- }
- else {
- script_of_char = SCX_Common;
- }
- }
- else {
- script_of_char = _Perl_SCX_invmap[
- _invlist_search(PL_SCX_invlist, cp)];
- }
-
- /* We arbitrarily accept a single unassigned character, but not
- * in combination with anything else, and not a run of them. */
- if ( UNLIKELY(script_of_run == SCX_Unknown)
- || UNLIKELY( script_of_run != SCX_INVALID
- && script_of_char == SCX_Unknown))
- {
- Safefree(intersection);
- sayNO;
- }
-
- if (UNLIKELY(script_of_char == SCX_Unknown)) {
- script_of_run = SCX_Unknown;
- continue;
- }
-
- /* We accept 'inherited' script characters currently even at
- * the beginning. (We know that no characters in Inherited
- * are digits, or we'd have to check for that) */
- if (UNLIKELY(script_of_char == SCX_Inherited)) {
- continue;
- }
-
- /* If unknown, the run's script is set to the char's */
- if (UNLIKELY(script_of_run == SCX_INVALID)) {
- script_of_run = script_of_char;
- }
-
- /* All decimal digits must be from the same sequence of 10.
- * Above, we handled any ASCII digits without descending to
- * here. We also handled the case where we already knew what
- * digit sequence is the one to use, and the character is in
- * that sequence. Now that we know the script, we can use
- * script_zeros[] to directly find which sequence the script
- * uses, except in a few cases it returns 0 */
- if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) {
- zero_of_run = script_zeros[script_of_char];
- }
-
- /* Now we can see if the script of the character is the same as
- * that of the run, or 'Common' which is considered to be in
- * every script */
- if (LIKELY( script_of_char == script_of_run
- || script_of_char == SCX_Common))
- { /* By far the most common case */
- goto scripts_match;
- }
-
-#ifndef HAS_SCX_AUX_TABLES
- /* Too early a Unicode version to have a code point belonging
- * to more than one script, so, if the scripts don't exactly
- * match, fail */
- Safefree(intersection);
+ if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
+ {
sayNO;
+ }
-#else
-
- /* Here there is no exact match between the character's script
- * and the run's. Negative script numbers signify that the
- * value may be any of several scripts, and we need to look at
- * auxiliary information to make our deterimination. But if
- * both are non-negative, we can fail now */
- if (LIKELY(script_of_char >= 0)) {
- const SCX_enum * search_in;
- PERL_UINT_FAST8_T search_in_len;
- PERL_UINT_FAST8_T i;
-
- if (LIKELY(script_of_run >= 0)) {
- Safefree(intersection);
- sayNO;
- }
-
- /* Use any previously constructed set of possible scripts.
- * */
- if (intersection) {
- search_in = intersection;
- search_in_len = intersection_len;
- }
- else {
- search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
- search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
- }
-
- for (i = 0; i < search_in_len; i++) {
- if (search_in[i] == script_of_char) {
- script_of_run = script_of_char;
- goto scripts_match;
- }
- }
-
- Safefree(intersection);
- sayNO;
- }
- else if (LIKELY(script_of_run >= 0)) {
- /* script of character could be one of several, but run is
- * a single script */
- const SCX_enum * search_in
- = SCX_AUX_TABLE_ptrs[-script_of_char];
- const PERL_UINT_FAST8_T search_in_len
- = SCX_AUX_TABLE_lengths[-script_of_char];
- PERL_UINT_FAST8_T i;
-
- for (i = 0; i < search_in_len; i++) {
- if (search_in[i] == script_of_run) {
- script_of_char = script_of_run;
- goto scripts_match;
- }
- }
-
- Safefree(intersection);
- sayNO;
- }
- else {
- /* Both run and char could be in one of several scripts.
- * If the intersection is empty, then this character isn't
- * in this script run. Otherwise, we need to calculate the
- * intersection to use for future iterations of the loop,
- * unless we are already at the final character */
- const SCX_enum * search_char
- = SCX_AUX_TABLE_ptrs[-script_of_char];
- const PERL_UINT_FAST8_T char_len
- = SCX_AUX_TABLE_lengths[-script_of_char];
- const SCX_enum * search_run;
- PERL_UINT_FAST8_T run_len;
-
- SCX_enum * new_overlap = NULL;
- PERL_UINT_FAST8_T i, j;
-
- if (intersection) {
- search_run = intersection;
- run_len = intersection_len;
- }
- else {
- search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
- run_len = SCX_AUX_TABLE_lengths[-script_of_run];
- }
-
- intersection_len = 0;
-
- for (i = 0; i < run_len; i++) {
- for (j = 0; j < char_len; j++) {
- if (search_run[i] == search_char[j]) {
-
- /* Here, the script at i,j matches. That means
- * this character is in the run. But continue
- * on to find the complete intersection, for
- * the next loop iteration, and for the digit
- * check after it.
- *
- * On the first found common script, we malloc
- * space for the intersection list for the worst
- * case of the intersection, which is the
- * minimum of the number of scripts remaining
- * in each set. */
- if (intersection_len == 0) {
- Newx(new_overlap,
- MIN(run_len - i, char_len - j),
- SCX_enum);
- }
- new_overlap[intersection_len++] = search_run[i];
- }
- }
- }
-
- /* Here we've looked through everything. If they have no
- * scripts in common, not a run */
- if (intersection_len == 0) {
- Safefree(intersection);
- sayNO;
- }
-
-
- /* If there is only a single script in common, set to that.
- * Otherwise, use the intersection going forward */
- Safefree(intersection);
- if (intersection_len == 1) {
- script_of_run = script_of_char = new_overlap[0];
- Safefree(new_overlap);
- }
- else {
- intersection = new_overlap;
- }
- }
-
-#endif
-
- scripts_match: ;
-
- /* Here, the script of the character is compatible with that of
- * the run. Either they match exactly, or one or both can be
- * any of several scripts, and the intersection is not empty.
- * If the character is not a decimal digit, we are done with
- * it. Otherwise, it could still fail if it is from a
- * different set of 10 than seen already (or we may not have
- * seen any, and we need to set the sequence). If we have
- * determined a single script and that script only has one set
- * of digits (almost all scripts are like that), then this
- * isn't a problem, as any digit must come from the same
- * sequence. The only scripts that have multiple sequences
- * have been constructed to be 0 in 'script_zeros[]'.
- *
- * Here we check if it is a digit. */
- if ( cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
- && ( ( zero_of_run == 0
- || ( ( script_of_char >= 0
- && script_zeros[script_of_char] == 0)
- || intersection))))
- {
- SSize_t range_zero_index;
- range_zero_index = _invlist_search(decimals_invlist, cp);
- if ( LIKELY(range_zero_index >= 0)
- && ELEMENT_RANGE_MATCHES_INVLIST(range_zero_index))
- {
- UV range_zero = decimals_array[range_zero_index];
- if (zero_of_run) {
- if (zero_of_run != range_zero) {
- Safefree(intersection);
- sayNO;
- }
- }
- else {
- zero_of_run = range_zero;
- }
- }
- }
- } /* end of looping through CLOSESR text */
-
- Safefree(intersection);
break;
- }
+
case ACCEPT: /* (*ACCEPT) */
if (scan->flags)
}
else { /* Not utf8_target */
if (ST.c1 == ST.c2) {
- while (locinput <= ST.maxpos &&
- UCHARAT(locinput) != ST.c1)
- locinput++;
+ locinput = (char *) memchr(locinput,
+ ST.c1,
+ ST.maxpos + 1 - locinput);
+ if (! locinput) {
+ locinput = ST.maxpos + 1;
+ }
}
else {
U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
}
break;
+ case ASCII:
+ if (utf8_target && loceol - scan > max) {
+
+ /* We didn't adjust <loceol> at the beginning of this routine
+ * because is UTF-8, but it is actually ok to do so, since here, to
+ * match, 1 char == 1 byte. */
+ loceol = scan + max;
+ }
+
+ scan = find_next_non_ascii(scan, loceol, utf8_target);
+ break;
+
+ case NASCII:
+ if (utf8_target) {
+ while ( hardcount < max
+ && scan < loceol
+ && ! isASCII_utf8_safe(scan, loceol))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ else {
+ scan = find_next_ascii(scan, loceol, utf8_target);
+ }
+ break;
+
/* The argument (FLAGS) to all the POSIX node types is the class number */
case NPOSIXL:
#endif
+bool
+Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
+{
+ /* Checks that every character in the sequence from 's' to 'send' is one of
+ * three scripts: Common, Inherited, and possibly one other. Additionally
+ * all decimal digits must come from the same consecutive sequence of 10.
+ * 'utf8_target' is TRUE iff the sequence is encoded in UTF-8.
+ *
+ * Basically, it looks at each character in the sequence to see if the
+ * above conditions are met; if not it fails. It uses an inversion map to
+ * find the enum corresponding to the script of each character. But this
+ * is complicated by the fact that a few code points can be in any of
+ * several scripts. The data has been constructed so that there are
+ * additional enum values (all negative) for these situations. The
+ * absolute value of those is an index into another table which contains
+ * pointers to auxiliary tables for each such situation. Each aux array
+ * lists all the scripts for the given situation. There is another,
+ * parallel, table that gives the number of entries in each aux table.
+ * These are all defined in charclass_invlists.h */
+
+ /* XXX Here are the additional things UTS 39 says could be done:
+ * Mark Chinese strings as “mixed script” if they contain both simplified
+ * (S) and traditional (T) Chinese characters, using the Unihan data in the
+ * Unicode Character Database [UCD]. The criterion can only be applied if
+ * the language of the string is known to be Chinese. So, for example, the
+ * string “写真だけの結婚式 ” is Japanese, and should not be marked as
+ * mixed script because of a mixture of S and T characters. Testing for
+ * whether a character is S or T needs to be based not on whether the
+ * character has a S or T variant , but whether the character is an S or T
+ * variant. khw notes that the sample contains a Hiragana character, and it
+ * is unclear if absence of any foreign script marks the script as
+ * "Chinese"
+ *
+ * Forbid sequences of the same nonspacing mark
+ *
+ * Check to see that all the characters are in the sets of exemplar
+ * characters for at least one language in the Unicode Common Locale Data
+ * Repository [CLDR]. */
+
+
+ /* Things that match /\d/u */
+ SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
+ UV * decimals_array = invlist_array(decimals_invlist);
+
+ /* What code point is the digit '0' of the script run? */
+ UV zero_of_run = 0;
+ SCX_enum script_of_run = SCX_INVALID; /* Illegal value */
+ SCX_enum script_of_char = SCX_INVALID;
+
+ /* If the script remains not fully determined from iteration to iteration,
+ * this is the current intersection of the possiblities. */
+ SCX_enum * intersection = NULL;
+ PERL_UINT_FAST8_T intersection_len = 0;
+
+ bool retval = TRUE;
+
+ assert(send > s);
+
+ PERL_ARGS_ASSERT_ISSCRIPT_RUN;
+
+ /* Look at each character in the sequence */
+ while (s < send) {
+ UV cp;
+
+ /* The code allows all scripts to use the ASCII digits. This is
+ * because they are used in commerce even in scripts that have their
+ * own set. Hence any ASCII ones found are ok, unless a digit from
+ * another set has already been encountered. (The other digit ranges
+ * in Common are not similarly blessed */
+ if (UNLIKELY(isDIGIT(*s))) {
+ if (zero_of_run > 0) {
+ if (zero_of_run != '0') {
+ retval = FALSE;
+ break;
+ }
+ }
+ else {
+ zero_of_run = '0';
+ }
+ s++;
+ continue;
+ }
+
+ /* Here, isn't an ASCII digit. Find the code point of the character */
+ if (utf8_target && ! UTF8_IS_INVARIANT(*s)) {
+ Size_t len;
+ cp = valid_utf8_to_uvchr((U8 *) s, &len);
+ s += len;
+ }
+ else {
+ cp = *(s++);
+ }
+
+ /* If is within the range [+0 .. +9] of the script's zero, it also is a
+ * digit in that script. We can skip the rest of this code for this
+ * character. */
+ if (UNLIKELY( zero_of_run > 0
+ && cp >= zero_of_run
+ && cp - zero_of_run <= 9))
+ {
+ continue;
+ }
+
+ /* Find the character's script. The correct values are hard-coded here
+ * for small-enough code points. */
+ if (cp < 0x2B9) { /* From inspection of Unicode db; extremely
+ unlikely to change */
+ if ( cp > 255
+ || ( isALPHA_L1(cp)
+ && LIKELY(cp != MICRO_SIGN_NATIVE)))
+ {
+ script_of_char = SCX_Latin;
+ }
+ else {
+ script_of_char = SCX_Common;
+ }
+ }
+ else {
+ script_of_char = _Perl_SCX_invmap[
+ _invlist_search(PL_SCX_invlist, cp)];
+ }
+
+ /* We arbitrarily accept a single unassigned character, but not in
+ * combination with anything else, and not a run of them. */
+ if ( UNLIKELY(script_of_run == SCX_Unknown)
+ || UNLIKELY( script_of_run != SCX_INVALID
+ && script_of_char == SCX_Unknown))
+ {
+ retval = FALSE;
+ break;
+ }
+
+ if (UNLIKELY(script_of_char == SCX_Unknown)) {
+ script_of_run = SCX_Unknown;
+ continue;
+ }
+
+ /* We accept 'inherited' script characters currently even at the
+ * beginning. (We know that no characters in Inherited are digits, or
+ * we'd have to check for that) */
+ if (UNLIKELY(script_of_char == SCX_Inherited)) {
+ continue;
+ }
+
+ /* If unknown, the run's script is set to the char's */
+ if (UNLIKELY(script_of_run == SCX_INVALID)) {
+ script_of_run = script_of_char;
+ }
+
+ /* All decimal digits must be from the same sequence of 10. Above, we
+ * handled any ASCII digits without descending to here. We also
+ * handled the case where we already knew what digit sequence is the
+ * one to use, and the character is in that sequence. Now that we know
+ * the script, we can use script_zeros[] to directly find which
+ * sequence the script uses, except in a few cases it returns 0 */
+ if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) {
+ zero_of_run = script_zeros[script_of_char];
+ }
+
+ /* Now we can see if the script of the character is the same as that of
+ * the run, or 'Common' which is considered to be in every script */
+ if (LIKELY( script_of_char == script_of_run
+ || script_of_char == SCX_Common))
+ { /* By far the most common case */
+ goto scripts_match;
+ }
+
+#ifndef HAS_SCX_AUX_TABLES
+
+ /* Too early a Unicode version to have a code point belonging to more
+ * than one script, so, if the scripts don't exactly match, fail */
+ retval = FALSE;
+ break;
+
+#else
+
+ /* Here there is no exact match between the character's script and the
+ * run's. Negative script numbers signify that the value may be any of
+ * several scripts, and we need to look at auxiliary information to
+ * make our deterimination. But if both are non-negative, we can fail
+ * now */
+ if (LIKELY(script_of_char >= 0)) {
+ const SCX_enum * search_in;
+ PERL_UINT_FAST8_T search_in_len;
+ PERL_UINT_FAST8_T i;
+
+ if (LIKELY(script_of_run >= 0)) {
+ retval = FALSE;
+ break;
+ }
+
+ /* Use any previously constructed set of possible scripts.
+ * */
+ if (intersection) {
+ search_in = intersection;
+ search_in_len = intersection_len;
+ }
+ else {
+ search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
+ search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
+ }
+
+ for (i = 0; i < search_in_len; i++) {
+ if (search_in[i] == script_of_char) {
+ script_of_run = script_of_char;
+ goto scripts_match;
+ }
+ }
+
+ retval = FALSE;
+ break;
+ }
+ else if (LIKELY(script_of_run >= 0)) {
+ /* script of character could be one of several, but run is a single
+ * script */
+ const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
+ const PERL_UINT_FAST8_T search_in_len
+ = SCX_AUX_TABLE_lengths[-script_of_char];
+ PERL_UINT_FAST8_T i;
+
+ for (i = 0; i < search_in_len; i++) {
+ if (search_in[i] == script_of_run) {
+ script_of_char = script_of_run;
+ goto scripts_match;
+ }
+ }
+
+ retval = FALSE;
+ break;
+ }
+ else {
+ /* Both run and char could be in one of several scripts. If the
+ * intersection is empty, then this character isn't in this script
+ * run. Otherwise, we need to calculate the intersection to use
+ * for future iterations of the loop, unless we are already at the
+ * final character */
+ const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
+ const PERL_UINT_FAST8_T char_len
+ = SCX_AUX_TABLE_lengths[-script_of_char];
+ const SCX_enum * search_run;
+ PERL_UINT_FAST8_T run_len;
+
+ SCX_enum * new_overlap = NULL;
+ PERL_UINT_FAST8_T i, j;
+
+ if (intersection) {
+ search_run = intersection;
+ run_len = intersection_len;
+ }
+ else {
+ search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
+ run_len = SCX_AUX_TABLE_lengths[-script_of_run];
+ }
+
+ intersection_len = 0;
+
+ for (i = 0; i < run_len; i++) {
+ for (j = 0; j < char_len; j++) {
+ if (search_run[i] == search_char[j]) {
+
+ /* Here, the script at i,j matches. That means this
+ * character is in the run. But continue on to find
+ * the complete intersection, for the next loop
+ * iteration, and for the digit check after it.
+ *
+ * On the first found common script, we malloc space
+ * for the intersection list for the worst case of the
+ * intersection, which is the minimum of the number of
+ * scripts remaining in each set. */
+ if (intersection_len == 0) {
+ Newx(new_overlap,
+ MIN(run_len - i, char_len - j),
+ SCX_enum);
+ }
+ new_overlap[intersection_len++] = search_run[i];
+ }
+ }
+ }
+
+ /* Here we've looked through everything. If they have no scripts
+ * in common, not a run */
+ if (intersection_len == 0) {
+ retval = FALSE;
+ break;
+ }
+
+ /* If there is only a single script in common, set to that.
+ * Otherwise, use the intersection going forward */
+ Safefree(intersection);
+ if (intersection_len == 1) {
+ script_of_run = script_of_char = new_overlap[0];
+ Safefree(new_overlap);
+ }
+ else {
+ intersection = new_overlap;
+ }
+ }
+
+#endif
+
+ scripts_match: ;
+
+ /* Here, the script of the character is compatible with that of the
+ * run. Either they match exactly, or one or both can be any of
+ * several scripts, and the intersection is not empty. If the
+ * character is not a decimal digit, we are done with it. Otherwise,
+ * it could still fail if it is from a different set of 10 than seen
+ * already (or we may not have seen any, and we need to set the
+ * sequence). If we have determined a single script and that script
+ * only has one set of digits (almost all scripts are like that), then
+ * this isn't a problem, as any digit must come from the same sequence.
+ * The only scripts that have multiple sequences have been constructed
+ * to be 0 in 'script_zeros[]'.
+ *
+ * Here we check if it is a digit. */
+ if ( cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
+ && ( ( zero_of_run == 0
+ || ( ( script_of_char >= 0
+ && script_zeros[script_of_char] == 0)
+ || intersection))))
+ {
+ SSize_t range_zero_index;
+ range_zero_index = _invlist_search(decimals_invlist, cp);
+ if ( LIKELY(range_zero_index >= 0)
+ && ELEMENT_RANGE_MATCHES_INVLIST(range_zero_index))
+ {
+ UV range_zero = decimals_array[range_zero_index];
+ if (zero_of_run) {
+ if (zero_of_run != range_zero) {
+ retval = FALSE;
+ break;
+ }
+ }
+ else {
+ zero_of_run = range_zero;
+ }
+ }
+ }
+ } /* end of looping through CLOSESR text */
+
+ Safefree(intersection);
+ return retval;
+}
/*