#define PERL_IN_UTF8_C
#include "perl.h"
#include "invlist_inline.h"
-#include "uni_keywords.h"
static const char malformed_text[] = "Malformed UTF-8 character";
static const char unees[] =
*/
UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
- STRLEN curlen,
- STRLEN *retlen,
- const U32 flags)
+Perl_utf8n_to_uvchr(const U8 *s,
+ STRLEN curlen,
+ STRLEN *retlen,
+ const U32 flags)
{
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
*/
UV
-Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
+Perl_utf8n_to_uvchr_error(const U8 *s,
STRLEN curlen,
STRLEN *retlen,
const U32 flags,
*/
UV
-Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
+Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
STRLEN curlen,
STRLEN *retlen,
const U32 flags,
U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
routine; see [perl #130921] */
UV uv_so_far;
- UV state = 0;
-
- PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
-
- /* Measurements show that this dfa is somewhat faster than the regular code
- * below, so use it first, dropping down for the non-normal cases. */
-
-#define PERL_UTF8_DECODE_REJECT 1
-
- while (s < send && LIKELY(state != PERL_UTF8_DECODE_REJECT)) {
- UV type = strict_utf8_dfa_tab[*s];
+ dTHX;
- uv = (state == 0)
- ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
- : UTF8_ACCUMULATE(uv, *s);
- state = strict_utf8_dfa_tab[256 + state + type];
-
- if (state == 0) {
- if (retlen) {
- *retlen = s - s0 + 1;
- }
- if (errors) {
- *errors = 0;
- }
- if (msgs) {
- *msgs = NULL;
- }
-
- return uv;
- }
-
- s++;
- }
+ PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
/* Here, is one of: a) malformed; b) a problematic code point (surrogate,
* non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
return _to_upper_title_latin1((U8) c, p, lenp, 'S');
}
- uvchr_to_utf8(p, c);
- return CALL_UPPER_CASE(c, p, p, lenp);
+ return CALL_UPPER_CASE(c, NULL, p, lenp);
}
UV
return _to_upper_title_latin1((U8) c, p, lenp, 's');
}
- uvchr_to_utf8(p, c);
- return CALL_TITLE_CASE(c, p, p, lenp);
+ return CALL_TITLE_CASE(c, NULL, p, lenp);
}
STATIC U8
return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
}
- uvchr_to_utf8(p, c);
- return CALL_LOWER_CASE(c, p, p, lenp);
+ return CALL_LOWER_CASE(c, NULL, p, lenp);
}
UV
/* Here, above 255. If no special needs, just use the macro */
if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
- uvchr_to_utf8(p, c);
- return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
+ return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
}
else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
the special flags. */
if (*p == '_')
return TRUE;
- return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
+
+ return is_utf8_common(p, NULL,
+ "This is buggy if this gets used",
+ PL_utf8_xidstart);
}
bool
{
PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
- return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
+ return is_utf8_common(p, NULL,
+ "This is buggy if this gets used",
+ PL_utf8_idcont);
}
bool
{
PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
- return is_utf8_common(p, &PL_utf8_xidcont, "XIdContinue", NULL);
+ return is_utf8_common(p, NULL,
+ "This is buggy if this gets used",
+ PL_utf8_xidcont);
}
bool
{
PERL_ARGS_ASSERT__IS_UTF8_MARK;
- return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
+ return is_utf8_common(p, NULL, "IsM", PL_utf8_mark);
}
STATIC UV
unsigned int i;
const unsigned int * cp_list;
U8 * d;
+
+ /* 'index' is guaranteed to be non-negative, as this is an inversion
+ * map that covers all possible inputs. See [perl #133365] */
SSize_t index = _invlist_search(invlist, uv1);
IV base = invmap[index];
/* Here, there was no mapping defined, which means that the code point maps
* to itself. Return the inputs */
cases_to_self:
- len = UTF8SKIP(p);
- if (p != ustrp) { /* Don't copy onto itself */
- Copy(p, ustrp, len, U8);
+ if (p) {
+ len = UTF8SKIP(p);
+ if (p != ustrp) { /* Don't copy onto itself */
+ Copy(p, ustrp, len, U8);
+ }
+ *lenp = len;
+ }
+ else {
+ *lenp = uvchr_to_utf8(ustrp, uv1) - ustrp;
}
-
- if (lenp)
- *lenp = len;
return uv1;
* the return can point to them, but single code points aren't, so would
* need to be constructed if we didn't employ something like this API */
+ /* 'index' is guaranteed to be non-negative, as this is an inversion map
+ * that covers all possible inputs. See [perl #133365] */
SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
int base = _Perl_IVCF_invmap[index];
return uvoffuni_to_utf8_flags(d, uv, flags);
}
-void
-Perl_init_uniprops(pTHX)
-{
- /* Set up the inversion list global variables */
-
- PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_ASCII]);
- PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXALNUM]);
- PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXALPHA]);
- PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXBLANK]);
- PL_XPosix_ptrs[_CC_CASED] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_CASED]);
- PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXCNTRL]);
- PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXDIGIT]);
- PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXGRAPH]);
- PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXLOWER]);
- PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXPRINT]);
- PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXPUNCT]);
- PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXSPACE]);
- PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXUPPER]);
- PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_VERTSPACE]);
- PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXWORD]);
- PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXXDIGIT]);
-
- PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_ASCII]);
- PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXALNUM]);
- PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXALPHA]);
- PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXBLANK]);
- PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_CASED]);
- PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXCNTRL]);
- PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXDIGIT]);
- PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXGRAPH]);
- PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXLOWER]);
- PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXPRINT]);
- PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXPUNCT]);
- PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXSPACE]);
- PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXUPPER]);
- PL_Posix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_VERTSPACE]);
- PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXWORD]);
- PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXXDIGIT]);
-
- PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
- PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
- PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
- PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
- PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
-
- PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
- PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
- PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
-
- PL_Assigned_invlist = _new_invlist_C_array(PL_uni_prop_ptrs[PL_ASSIGNED]);
-
- PL_utf8_perl_idstart = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_IDSTART]);
- PL_utf8_perl_idcont = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_IDCONT]);
-
- PL_utf8_charname_begin = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_CHARNAME_BEGIN]);
- PL_utf8_charname_continue = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_CHARNAME_CONTINUE]);
-
- PL_utf8_foldable = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_ANY_FOLDS]);
- PL_HasMultiCharFold = _new_invlist_C_array(PL_uni_prop_ptrs[
- PL__PERL_FOLDS_TO_MULTI_CHAR]);
- PL_NonL1NonFinalFold = _new_invlist_C_array(
- NonL1_Perl_Non_Final_Folds_invlist);
-
- PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
- PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
- PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
- PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
- PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
- PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
-}
-
-SV *
-Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t len, const bool to_fold, bool * invert)
-{
- /* Parse the interior meat of \p{} passed to this in 'name' with length 'len',
- * and return an inversion list if a property with 'name' is found, or NULL
- * if not. 'name' point to the input with leading and trailing space trimmed.
- * 'to_fold' indicates if /i is in effect.
- *
- * When the return is an inversion list, '*invert' will be set to a boolean
- * indicating if it should be inverted or not
- *
- * This currently doesn't handle all cases. A NULL return indicates the
- * caller should try a different approach
- */
-
- char* lookup_name;
- bool stricter = FALSE;
- bool is_nv_type = FALSE; /* nv= or numeric_value=, or possibly one
- of the cjk numeric properties (though
- it requires extra effort to compile
- them) */
- unsigned int i;
- unsigned int j = 0, lookup_len;
- int equals_pos = -1; /* Where the '=' is found, or negative if none */
- int slash_pos = -1; /* Where the '/' is found, or negative if none */
- int table_index = 0;
- bool starts_with_In_or_Is = FALSE;
- Size_t lookup_offset = 0;
-
- PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
-
- /* The input will be modified into 'lookup_name' */
- Newx(lookup_name, len, char);
- SAVEFREEPV(lookup_name);
-
- /* Parse the input. */
- for (i = 0; i < len; i++) {
- char cur = name[i];
-
- /* These characters can be freely ignored in most situations. Later it
- * may turn out we shouldn't have ignored them, and we have to reparse,
- * but we don't have enough information yet to make that decision */
- if (cur == '-' || cur == '_' || isSPACE(cur)) {
- continue;
- }
-
- /* Case differences are also ignored. Our lookup routine assumes
- * everything is lowercase */
- if (isUPPER(cur)) {
- lookup_name[j++] = toLOWER(cur);
- continue;
- }
-
- /* A double colon is either an error, or a package qualifier to a
- * subroutine user-defined property; neither of which do we currently
- * handle
- *
- * But a single colon is a synonym for '=' */
- if (cur == ':') {
- if (i < len - 1 && name[i+1] == ':') {
- return NULL;
- }
- cur = '=';
- }
-
- /* Otherwise, this character is part of the name. */
- lookup_name[j++] = cur;
-
- /* Only the equals sign needs further processing */
- if (cur == '=') {
- equals_pos = j; /* Note where it occurred in the input */
- break;
- }
- }
-
- /* Here, we are either done with the whole property name, if it was simple;
- * or are positioned just after the '=' if it is compound. */
-
- if (equals_pos >= 0) {
- assert(! stricter); /* We shouldn't have set this yet */
-
- /* Space immediately after the '=' is ignored */
- i++;
- for (; i < len; i++) {
- if (! isSPACE(name[i])) {
- break;
- }
- }
-
- /* Certain properties need special handling. They may optionally be
- * prefixed by 'is'. Ignore that prefix for the purposes of checking
- * if this is one of those properties */
- if (memBEGINPs(lookup_name, len, "is")) {
- lookup_offset = 2;
- }
-
- /* Then check if it is one of these properties. This is hard-coded
- * because easier this way, and the list is unlikely to change. There
- * are several properties like this in the Unihan DB, which is unlikely
- * to be compiled, and they all end with 'numeric'. The interiors
- * aren't checked for the precise property. This would stop working if
- * a cjk property were to be created that ended with 'numeric' and
- * wasn't a numeric type */
- is_nv_type = memEQs(lookup_name + lookup_offset,
- j - 1 - lookup_offset, "numericvalue")
- || memEQs(lookup_name + lookup_offset,
- j - 1 - lookup_offset, "nv")
- || ( memENDPs(lookup_name + lookup_offset,
- j - 1 - lookup_offset, "numeric")
- && ( memBEGINPs(lookup_name + lookup_offset,
- j - 1 - lookup_offset, "cjk")
- || memBEGINPs(lookup_name + lookup_offset,
- j - 1 - lookup_offset, "k")));
- if ( is_nv_type
- || memEQs(lookup_name + lookup_offset,
- j - 1 - lookup_offset, "canonicalcombiningclass")
- || memEQs(lookup_name + lookup_offset,
- j - 1 - lookup_offset, "ccc")
- || memEQs(lookup_name + lookup_offset,
- j - 1 - lookup_offset, "age")
- || memEQs(lookup_name + lookup_offset,
- j - 1 - lookup_offset, "in")
- || memEQs(lookup_name + lookup_offset,
- j - 1 - lookup_offset, "presentin"))
- {
- unsigned int k;
-
- /* What makes these properties special is that the stuff after the
- * '=' is a number. Therefore, we can't throw away '-'
- * willy-nilly, as those could be a minus sign. Other stricter
- * rules also apply. However, these properties all can have the
- * rhs not be a number, in which case they contain at least one
- * alphabetic. In those cases, the stricter rules don't apply.
- * But the numeric type properties can have the alphas [Ee] to
- * signify an exponent, and it is still a number with stricter
- * rules. So look for an alpha that signifys not-strict */
- stricter = TRUE;
- for (k = i; k < len; k++) {
- if ( isALPHA(name[k])
- && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
- {
- stricter = FALSE;
- break;
- }
- }
- }
-
- if (stricter) {
-
- /* A number may have a leading '+' or '-'. The latter is retained
- * */
- if (name[i] == '+') {
- i++;
- }
- else if (name[i] == '-') {
- lookup_name[j++] = '-';
- i++;
- }
-
- /* Skip leading zeros including single underscores separating the
- * zeros, or between the final leading zero and the first other
- * digit */
- for (; i < len - 1; i++) {
- if ( name[i] != '0'
- && (name[i] != '_' || ! isDIGIT(name[i+1])))
- {
- break;
- }
- }
- }
- }
- else { /* No '=' */
-
- /* We are now in a position to determine if this property should have
- * been parsed using stricter rules. Only a few are like that, and
- * unlikely to change. */
- if ( memBEGINPs(lookup_name, j, "perl")
- && memNEs(lookup_name + 4, j - 4, "space")
- && memNEs(lookup_name + 4, j - 4, "word"))
- {
- stricter = TRUE;
-
- /* We set the inputs back to 0 and the code below will reparse,
- * using strict */
- i = j = 0;
- }
- }
-
- /* Here, we have either finished the property, or are positioned to parse
- * the remainder, and we know if stricter rules apply. Finish out, if not
- * already done */
- for (; i < len; i++) {
- char cur = name[i];
-
- /* In all instances, case differences are ignored, and we normalize to
- * lowercase */
- if (isUPPER(cur)) {
- lookup_name[j++] = toLOWER(cur);
- continue;
- }
-
- /* An underscore is skipped, but not under strict rules unless it
- * separates two digits */
- if (cur == '_') {
- if ( stricter
- && ( i == 0 || (int) i == equals_pos || i == len- 1
- || ! isDIGIT(name[i-1]) || ! isDIGIT(name[i+1])))
- {
- lookup_name[j++] = '_';
- }
- continue;
- }
-
- /* Hyphens are skipped except under strict */
- if (cur == '-' && ! stricter) {
- continue;
- }
-
- /* XXX Bug in documentation. It says white space skipped adjacent to
- * non-word char. Maybe we should, but shouldn't skip it next to a dot
- * in a number */
- if (isSPACE(cur) && ! stricter) {
- continue;
- }
-
- lookup_name[j++] = cur;
-
- /* Unless this is a non-trailing slash, we are done with it */
- if (i >= len - 1 || cur != '/') {
- continue;
- }
-
- slash_pos = j;
-
- /* A slash in the 'numeric value' property indicates that what follows
- * is a denominator. It can have a leading '+' and '0's that should be
- * skipped. But we have never allowed a negative denominator, so treat
- * a minus like every other character. (No need to rule out a second
- * '/', as that won't match anything anyway */
- if (is_nv_type) {
- i++;
- if (i < len && name[i] == '+') {
- i++;
- }
-
- /* Skip leading zeros including underscores separating digits */
- for (; i < len - 1; i++) {
- if ( name[i] != '0'
- && (name[i] != '_' || ! isDIGIT(name[i+1])))
- {
- break;
- }
- }
-
- /* Store the first real character in the denominator */
- lookup_name[j++] = name[i];
- }
- }
-
- /* Here are completely done parsing the input 'name', and 'lookup_name'
- * contains a copy, normalized.
- *
- * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
- * different from without the underscores. */
- if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
- || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
- && UNLIKELY(name[len-1] == '_'))
- {
- lookup_name[j++] = '&';
- }
- else if (len > 2 && name[0] == 'I' && ( name[1] == 'n' || name[1] == 's'))
- {
-
- /* Also, if the original input began with 'In' or 'Is', it could be a
- * subroutine call instead of a property names, which currently isn't
- * handled by this function. Subroutine calls can't happen if there is
- * an '=' in the name */
- if (equals_pos < 0 && get_cvn_flags(name, len, GV_NOTQUAL) != NULL) {
- return NULL;
- }
-
- starts_with_In_or_Is = TRUE;
- }
-
- lookup_len = j; /* Use a more mnemonic name starting here */
-
- /* Get the index into our pointer table of the inversion list corresponding
- * to the property */
- table_index = match_uniprop((U8 *) lookup_name, lookup_len);
-
- /* If it didn't find the property */
- if (table_index == 0) {
-
- /* If didn't find the property, we try again stripping off any initial
- * 'In' or 'Is' */
- if (starts_with_In_or_Is) {
- lookup_name += 2;
- lookup_len -= 2;
- equals_pos -= 2;
- slash_pos -= 2;
-
- table_index = match_uniprop((U8 *) lookup_name, lookup_len);
- }
-
- if (table_index == 0) {
- char * canonical;
-
- /* If not found, and not a numeric type property, isn't a legal
- * property */
- if (! is_nv_type) {
- return NULL;
- }
-
- /* But the numeric type properties need more work to decide. What
- * we do is make sure we have the number in canonical form and look
- * that up. */
-
- if (slash_pos < 0) { /* No slash */
-
- /* When it isn't a rational, take the input, convert it to a
- * NV, then create a canonical string representation of that
- * NV. */
-
- NV value;
-
- /* Get the value */
- if (my_atof3(lookup_name + equals_pos, &value,
- lookup_len - equals_pos)
- != lookup_name + lookup_len)
- {
- return NULL;
- }
-
- /* If the value is an integer, the canonical value is integral */
- if (Perl_ceil(value) == value) {
- canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
- equals_pos, lookup_name, value);
- }
- else { /* Otherwise, it is %e with a known precision */
- canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
- equals_pos, lookup_name,
- PL_E_FORMAT_PRECISION, value);
- }
- }
- else { /* Has a slash. Create a rational in canonical form */
- UV numerator, denominator, gcd, trial;
- const char * end_ptr;
- const char * sign = "";
-
- /* We can't just find the numerator, denominator, and do the
- * division, then use the method above, because that is
- * inexact. And the input could be a rational that is within
- * epsilon (given our precision) of a valid rational, and would
- * then incorrectly compare valid.
- *
- * We're only interested in the part after the '=' */
- const char * this_lookup_name = lookup_name + equals_pos;
- lookup_len -= equals_pos;
- slash_pos -= equals_pos;
-
- /* Handle any leading minus */
- if (this_lookup_name[0] == '-') {
- sign = "-";
- this_lookup_name++;
- lookup_len--;
- slash_pos--;
- }
-
- /* Convert the numerator to numeric */
- end_ptr = this_lookup_name + slash_pos;
- if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
- return NULL;
- }
-
- /* It better have included all characters before the slash */
- if (*end_ptr != '/') {
- return NULL;
- }
-
- /* Set to look at just the denominator */
- this_lookup_name += slash_pos;
- lookup_len -= slash_pos;
- end_ptr = this_lookup_name + lookup_len;
-
- /* Convert the denominator to numeric */
- if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
- return NULL;
- }
-
- /* It better be the rest of the characters, and don't divide by
- * 0 */
- if ( end_ptr != this_lookup_name + lookup_len
- || denominator == 0)
- {
- return NULL;
- }
-
- /* Get the greatest common denominator using
- http://en.wikipedia.org/wiki/Euclidean_algorithm */
- gcd = numerator;
- trial = denominator;
- while (trial != 0) {
- UV temp = trial;
- trial = gcd % trial;
- gcd = temp;
- }
-
- /* If already in lowest possible terms, we have already tried
- * looking this up */
- if (gcd == 1) {
- return NULL;
- }
-
- /* Reduce the rational, which should put it in canonical form.
- * Then look it up */
- numerator /= gcd;
- denominator /= gcd;
-
- canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
- equals_pos, lookup_name, sign, numerator, denominator);
- }
-
- /* Here, we have the number in canonical form. Try that */
- table_index = match_uniprop((U8 *) canonical, strlen(canonical));
- if (table_index == 0) {
- return NULL;
- }
- }
- }
-
- /* The return is an index into a table of ptrs. A negative return
- * signifies that the real index is the absolute value, but the result
- * needs to be inverted */
- if (table_index < 0) {
- *invert = TRUE;
- table_index = -table_index;
- }
- else {
- *invert = FALSE;
- }
-
- /* Out-of band indices indicate a deprecated property. The proper index is
- * modulo it with the table size. And dividing by the table size yields
- * an offset into a table constructed to contain the corresponding warning
- * message */
- if (table_index > MAX_UNI_KEYWORD_INDEX) {
- Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
- table_index %= MAX_UNI_KEYWORD_INDEX;
- Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
- "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
- (int) len, name, deprecated_property_msgs[warning_offset]);
- }
-
- /* In a few properties, a different property is used under /i. These are
- * unlikely to change, so are hard-coded here. */
- if (to_fold) {
- if ( table_index == PL_XPOSIXUPPER
- || table_index == PL_XPOSIXLOWER
- || table_index == PL_TITLE)
- {
- table_index = PL_CASED;
- }
- else if ( table_index == PL_UPPERCASELETTER
- || table_index == PL_LOWERCASELETTER
-#ifdef PL_TITLECASELETTER /* Missing from early Unicodes */
- || table_index == PL_TITLECASELETTER
-#endif
- ) {
- table_index = PL_CASEDLETTER;
- }
- else if ( table_index == PL_POSIXUPPER
- || table_index == PL_POSIXLOWER)
- {
- table_index = PL_POSIXALPHA;
- }
- }
-
- /* Create and return the inversion list */
- return _new_invlist_C_array(PL_uni_prop_ptrs[table_index]);
-}
-
/*
=for apidoc utf8_to_uvchr
{
PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
- return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen);
+ /* This function is unsafe if malformed UTF-8 input is given it, which is
+ * why the function is deprecated. If the first byte of the input
+ * indicates that there are more bytes remaining in the sequence that forms
+ * the character than there are in the input buffer, it can read past the
+ * end. But we can make it safe if the input string happens to be
+ * NUL-terminated, as many strings in Perl are, by refusing to read past a
+ * NUL. A NUL indicates the start of the next character anyway. If the
+ * input isn't NUL-terminated, the function remains unsafe, as it always
+ * has been.
+ *
+ * An initial NUL has to be handled separately, but all ASCIIs can be
+ * handled the same way, speeding up this common case */
+
+ if (UTF8_IS_INVARIANT(*s)) { /* Assumes 's' contains at least 1 byte */
+ return (UV) *s;
+ }
+
+ return utf8_to_uvchr_buf(s,
+ s + my_strnlen((char *) s, UTF8_MAXBYTES),
+ retlen);
}
/*