#define UTF8_WARN_EMPTY 1
#define UTF8_WARN_CONTINUATION 2
#define UTF8_WARN_NON_CONTINUATION 3
-#define UTF8_WARN_SHORT 5
-#define UTF8_WARN_OVERFLOW 6
-#define UTF8_WARN_LONG 8
+#define UTF8_WARN_SHORT 4
+#define UTF8_WARN_OVERFLOW 5
+#define UTF8_WARN_LONG 6
if (curlen == 0 &&
!(flags & UTF8_ALLOW_EMPTY)) {
#else
if (uv == 0xfe || uv == 0xff) {
if (flags & (UTF8_WARN_SUPER|UTF8_WARN_FE_FF)) {
- sv = sv_2mortal(newSVpvf_nocontext("Code point beginning with byte 0x%02"UVXf" is not Unicode, and not portable", uv));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point beginning with byte 0x%02"UVXf" is not Unicode, and not portable", uv));
flags &= ~UTF8_WARN_SUPER; /* Only warn once on this problem */
}
if (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_FE_FF)) {
} else if (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_WARN_ILLEGAL_INTERCHANGE)) {
if (UNICODE_IS_SURROGATE(uv)) {
if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE) {
- sv = sv_2mortal(newSVpvf_nocontext("UTF-16 surrogate U+%04"UVXf"", uv));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
}
if (flags & UTF8_DISALLOW_SURROGATE) {
goto disallowed;
}
else if (UNICODE_IS_NONCHAR(uv)) {
if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR ) {
- sv = sv_2mortal(newSVpvf_nocontext("Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
}
if (flags & UTF8_DISALLOW_NONCHAR) {
goto disallowed;
}
else if ((uv > PERL_UNICODE_MAX)) {
if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER) {
- sv = sv_2mortal(newSVpvf_nocontext("Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
}
if (flags & UTF8_DISALLOW_SUPER) {
goto disallowed;
=cut
*/
+
UV
Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
{
PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
- ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
/*
/* Call the low level routine asking for checks */
return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
- ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
/*
}
HV*
-Perl__swash_inversion_hash(pTHX_ SV* swash)
+Perl__swash_inversion_hash(pTHX_ SV* const swash)
{
/* Subject to change or removal. For use only in one place in regexec.c
char* key_end = (char *) uvuni_to_utf8((U8*) key, val);
STRLEN key_len = key_end - key;
- /* And the value is what the forward mapping is from. */
- char utf8_inverse[UTF8_MAXBYTES+1];
- char *utf8_inverse_end = (char *) uvuni_to_utf8((U8*) utf8_inverse, inverse);
-
/* Get the list for the map */
if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
list = (AV*) *listp;
Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
}
entry = *entryp;
- if (SvCUR(entry) != key_len) {
- continue;
- }
- if (memEQ(key, SvPVX(entry), key_len)) {
+ if (SvUV(entry) == val) {
found_key = TRUE;
break;
}
}
+
+ /* Make sure there is a mapping to itself on the list */
if (! found_key) {
- element = newSVpvn_flags(key, key_len, SVf_UTF8);
+ element = newSVuv(val);
av_push(list, element);
}
/* Simply add the value to the list */
- element = newSVpvn_flags(utf8_inverse, utf8_inverse_end - utf8_inverse, SVf_UTF8);
+ element = newSVuv(inverse);
av_push(list, element);
/* swash_get() increments the value of val for each element in the
return ret;
}
+HV*
+Perl__swash_to_invlist(pTHX_ SV* const swash)
+{
+
+ /* Subject to change or removal. For use only in one place in regcomp.c */
+
+ U8 *l, *lend;
+ char *loc;
+ STRLEN lcur;
+ HV *const hv = MUTABLE_HV(SvRV(swash));
+ UV elements = 0; /* Number of elements in the inversion list */
+ U8 empty[] = "";
+
+ /* The string containing the main body of the table */
+ SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
+ SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
+ SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+
+ const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
+ const STRLEN bits = SvUV(*bitssvp);
+ const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
+
+ HV* invlist;
+
+ PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
+
+ /* read $swash->{LIST} */
+ if (SvPOK(*listsvp)) {
+ l = (U8*)SvPV(*listsvp, lcur);
+ }
+ else {
+ /* LIST legitimately doesn't contain a string during compilation phases
+ * of Perl itself, before the Unicode tables are generated. In this
+ * case, just fake things up by creating an empty list */
+ l = empty;
+ lcur = 0;
+ }
+ loc = (char *) l;
+ lend = l + lcur;
+
+ /* Scan the input to count the number of lines to preallocate array size
+ * based on worst possible case, which is each line in the input creates 2
+ * elements in the inversion list: 1) the beginning of a range in the list;
+ * 2) the beginning of a range not in the list. */
+ while ((loc = (strchr(loc, '\n'))) != NULL) {
+ elements += 2;
+ loc++;
+ }
+
+ /* If the ending is somehow corrupt and isn't a new line, add another
+ * element for the final range that isn't in the inversion list */
+ if (! (*lend == '\n' || (*lend == '\0' && *(lend - 1) == '\n'))) {
+ elements++;
+ }
+
+ invlist = _new_invlist(elements);
+
+ /* Now go through the input again, adding each range to the list */
+ while (l < lend) {
+ UV start, end;
+ UV val; /* Not used by this function */
+
+ l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
+ cBOOL(octets), typestr);
+
+ if (l > lend) {
+ break;
+ }
+
+ _append_range_to_invlist(invlist, start, end);
+ }
+
+ return invlist;
+}
+
/*
=for apidoc uvchr_to_utf8