This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add isFOO_utf8_safe() macros
authorKarl Williamson <khw@cpan.org>
Thu, 15 Dec 2016 23:30:27 +0000 (16:30 -0700)
committerKarl Williamson <khw@cpan.org>
Fri, 23 Dec 2016 23:48:34 +0000 (16:48 -0700)
The original API does not check that we aren't reading beyond the end of
a buffer, apparently assuming that we could keep malformed UTF-8 out by
use of gatekeepers, but that is currently impossible.  This commit adds
"safe" macros for determining if a UTF-8 sequence represents
an alphabetic, a digit, etc.  Each new macro has an extra parameter
pointing to the end of the sequence, so that looking beyond the input
string can be avoided.

The macros aren't currently completely safe, as they don't test that
there is at least a single valid byte in the input, except by an
assertion in DEBUGGING builds.  This is because typically they are
called in code that makes that assumption, and frequently tests the
current byte for one thing or another.

embed.fnc
embed.h
ext/XS-APItest/APItest.xs
ext/XS-APItest/t/handy.t
handy.h
pod/perldelta.pod
proto.h
utf8.c
utf8.h

index 4743524..db19d55 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -813,6 +813,8 @@ AnidR       |bool   |is_utf8_valid_partial_char_flags                           \
                |NN const U8 * const s|NN const U8 * const e|const U32 flags
 AMpR   |bool   |_is_uni_FOO|const U8 classnum|const UV c
 AMpR   |bool   |_is_utf8_FOO|const U8 classnum|NN const U8 *p
+AMpR   |bool   |_is_utf8_FOO_with_len|const U8 classnum|NN const U8 *p     \
+               |NN const U8 * const e
 ADMpR  |bool   |is_utf8_alnum  |NN const U8 *p
 ADMpR  |bool   |is_utf8_alnumc |NN const U8 *p
 ADMpR  |bool   |is_utf8_idfirst|NN const U8 *p
@@ -823,6 +825,10 @@ AMpR       |bool   |_is_utf8_xidcont|NN const U8 *p
 AMpR   |bool   |_is_utf8_xidstart|NN const U8 *p
 AMpR   |bool   |_is_utf8_perl_idcont|NN const U8 *p
 AMpR   |bool   |_is_utf8_perl_idstart|NN const U8 *p
+AMpR   |bool   |_is_utf8_perl_idcont_with_len|NN const U8 *p               \
+               |NN const U8 * const e
+AMpR   |bool   |_is_utf8_perl_idstart_with_len|NN const U8 *p              \
+               |NN const U8 * const e
 ADMpR  |bool   |is_utf8_idcont |NN const U8 *p
 ADMpR  |bool   |is_utf8_xidcont        |NN const U8 *p
 ADMpR  |bool   |is_utf8_alpha  |NN const U8 *p
@@ -2727,6 +2733,11 @@ sRM      |UV     |check_locale_boundary_crossing                             \
                |NN U8* const ustrp                                         \
                |NN STRLEN *lenp
 iR     |bool   |is_utf8_common |NN const U8 *const p|NN SV **swash|NN const char * const swashname|NULLOK SV* const invlist
+iR     |bool   |is_utf8_common_with_len|NN const U8 *const p               \
+                                          |NN const U8 *const e            \
+                                   |NN SV **swash                          \
+                                   |NN const char * const swashname        \
+                                   |NULLOK SV* const invlist
 sR     |SV*    |swatch_get     |NN SV* swash|UV start|UV span
 sRM    |U8*    |swash_scan_list_line|NN U8* l|NN U8* const lend|NN UV* min \
                |NN UV* max|NN UV* val|const bool wants_value               \
diff --git a/embed.h b/embed.h
index 66fe0cc..7d81ef7 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define _is_uni_perl_idcont(a) Perl__is_uni_perl_idcont(aTHX_ a)
 #define _is_uni_perl_idstart(a)        Perl__is_uni_perl_idstart(aTHX_ a)
 #define _is_utf8_FOO(a,b)      Perl__is_utf8_FOO(aTHX_ a,b)
+#define _is_utf8_FOO_with_len(a,b,c)   Perl__is_utf8_FOO_with_len(aTHX_ a,b,c)
 #define _is_utf8_idcont(a)     Perl__is_utf8_idcont(aTHX_ a)
 #define _is_utf8_idstart(a)    Perl__is_utf8_idstart(aTHX_ a)
 #define _is_utf8_mark(a)       Perl__is_utf8_mark(aTHX_ a)
 #define _is_utf8_perl_idcont(a)        Perl__is_utf8_perl_idcont(aTHX_ a)
+#define _is_utf8_perl_idcont_with_len(a,b)     Perl__is_utf8_perl_idcont_with_len(aTHX_ a,b)
 #define _is_utf8_perl_idstart(a)       Perl__is_utf8_perl_idstart(aTHX_ a)
+#define _is_utf8_perl_idstart_with_len(a,b)    Perl__is_utf8_perl_idstart_with_len(aTHX_ a,b)
 #define _is_utf8_xidcont(a)    Perl__is_utf8_xidcont(aTHX_ a)
 #define _is_utf8_xidstart(a)   Perl__is_utf8_xidstart(aTHX_ a)
 #define _to_uni_fold_flags(a,b,c,d)    Perl__to_uni_fold_flags(aTHX_ a,b,c,d)
 #define does_utf8_overflow     S_does_utf8_overflow
 #define isFF_OVERLONG          S_isFF_OVERLONG
 #define is_utf8_common(a,b,c,d)        S_is_utf8_common(aTHX_ a,b,c,d)
+#define is_utf8_common_with_len(a,b,c,d,e)     S_is_utf8_common_with_len(aTHX_ a,b,c,d,e)
 #define is_utf8_cp_above_31_bits       S_is_utf8_cp_above_31_bits
 #define is_utf8_overlong_given_start_byte_ok   S_is_utf8_overlong_given_start_byte_ok
 #define swash_scan_list_line(a,b,c,d,e,f,g)    S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g)
index 8b4e638..e9d28c8 100644 (file)
@@ -4414,16 +4414,36 @@ test_isBLANK_LC(UV ord)
         RETVAL
 
 bool
-test_isBLANK_utf8(unsigned char * p)
+test_isBLANK_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isBLANK_utf8(p);
+
+        /* In this function and those that follow, the boolean 'type'
+         * indicates if to pass a malformed UTF-8 string to the tested macro
+         * (malformed by making it too short) */
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isBLANK_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isBLANK_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isBLANK_LC_utf8(unsigned char * p)
+test_isBLANK_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isBLANK_LC_utf8(p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isBLANK_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isBLANK_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -4442,9 +4462,17 @@ test_isVERTWS_uvchr(UV ord)
         RETVAL
 
 bool
-test_isVERTWS_utf8(unsigned char * p)
+test_isVERTWS_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isVERTWS_utf8(p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isVERTWS_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isVERTWS_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -4498,16 +4526,32 @@ test_isUPPER_LC(UV ord)
         RETVAL
 
 bool
-test_isUPPER_utf8(unsigned char * p)
+test_isUPPER_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isUPPER_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isUPPER_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isUPPER_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isUPPER_LC_utf8(unsigned char * p)
+test_isUPPER_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isUPPER_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isUPPER_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isUPPER_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -4561,16 +4605,32 @@ test_isLOWER_LC(UV ord)
         RETVAL
 
 bool
-test_isLOWER_utf8(unsigned char * p)
+test_isLOWER_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isLOWER_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isLOWER_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isLOWER_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isLOWER_LC_utf8(unsigned char * p)
+test_isLOWER_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isLOWER_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isLOWER_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isLOWER_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -4624,16 +4684,32 @@ test_isALPHA_LC(UV ord)
         RETVAL
 
 bool
-test_isALPHA_utf8(unsigned char * p)
+test_isALPHA_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isALPHA_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isALPHA_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isALPHA_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isALPHA_LC_utf8(unsigned char * p)
+test_isALPHA_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isALPHA_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isALPHA_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isALPHA_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -4687,16 +4763,32 @@ test_isWORDCHAR_LC(UV ord)
         RETVAL
 
 bool
-test_isWORDCHAR_utf8(unsigned char * p)
+test_isWORDCHAR_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isWORDCHAR_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isWORDCHAR_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isWORDCHAR_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isWORDCHAR_LC_utf8(unsigned char * p)
+test_isWORDCHAR_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isWORDCHAR_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isWORDCHAR_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isWORDCHAR_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -4750,16 +4842,32 @@ test_isALPHANUMERIC_LC(UV ord)
         RETVAL
 
 bool
-test_isALPHANUMERIC_utf8(unsigned char * p)
+test_isALPHANUMERIC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isALPHANUMERIC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isALPHANUMERIC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isALPHANUMERIC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isALPHANUMERIC_LC_utf8(unsigned char * p)
+test_isALPHANUMERIC_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isALPHANUMERIC_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isALPHANUMERIC_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isALPHANUMERIC_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -4792,16 +4900,32 @@ test_isALNUM_LC(UV ord)
         RETVAL
 
 bool
-test_isALNUM_utf8(unsigned char * p)
+test_isALNUM_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isALNUM_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isWORDCHAR_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isWORDCHAR_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isALNUM_LC_utf8(unsigned char * p)
+test_isALNUM_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isALNUM_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isWORDCHAR_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isWORDCHAR_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -4827,16 +4951,32 @@ test_isDIGIT_LC_uvchr(UV ord)
         RETVAL
 
 bool
-test_isDIGIT_utf8(unsigned char * p)
+test_isDIGIT_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isDIGIT_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isDIGIT_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isDIGIT_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isDIGIT_LC_utf8(unsigned char * p)
+test_isDIGIT_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isDIGIT_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isDIGIT_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isDIGIT_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -4939,16 +5079,32 @@ test_isIDFIRST_LC(UV ord)
         RETVAL
 
 bool
-test_isIDFIRST_utf8(unsigned char * p)
+test_isIDFIRST_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isIDFIRST_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isIDFIRST_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isIDFIRST_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isIDFIRST_LC_utf8(unsigned char * p)
+test_isIDFIRST_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isIDFIRST_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isIDFIRST_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isIDFIRST_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -5002,16 +5158,32 @@ test_isIDCONT_LC(UV ord)
         RETVAL
 
 bool
-test_isIDCONT_utf8(unsigned char * p)
+test_isIDCONT_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isIDCONT_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isIDCONT_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isIDCONT_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isIDCONT_LC_utf8(unsigned char * p)
+test_isIDCONT_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isIDCONT_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isIDCONT_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isIDCONT_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -5065,16 +5237,32 @@ test_isSPACE_LC(UV ord)
         RETVAL
 
 bool
-test_isSPACE_utf8(unsigned char * p)
+test_isSPACE_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isSPACE_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isSPACE_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isSPACE_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isSPACE_LC_utf8(unsigned char * p)
+test_isSPACE_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isSPACE_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isSPACE_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isSPACE_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -5128,16 +5316,32 @@ test_isASCII_LC(UV ord)
         RETVAL
 
 bool
-test_isASCII_utf8(unsigned char * p)
+test_isASCII_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isASCII_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isASCII_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isASCII_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isASCII_LC_utf8(unsigned char * p)
+test_isASCII_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isASCII_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isASCII_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isASCII_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -5191,16 +5395,32 @@ test_isCNTRL_LC(UV ord)
         RETVAL
 
 bool
-test_isCNTRL_utf8(unsigned char * p)
+test_isCNTRL_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isCNTRL_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isCNTRL_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isCNTRL_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isCNTRL_LC_utf8(unsigned char * p)
+test_isCNTRL_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isCNTRL_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isCNTRL_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isCNTRL_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -5254,16 +5474,32 @@ test_isPRINT_LC(UV ord)
         RETVAL
 
 bool
-test_isPRINT_utf8(unsigned char * p)
+test_isPRINT_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isPRINT_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isPRINT_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isPRINT_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isPRINT_LC_utf8(unsigned char * p)
+test_isPRINT_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isPRINT_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isPRINT_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isPRINT_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -5317,16 +5553,32 @@ test_isGRAPH_LC(UV ord)
         RETVAL
 
 bool
-test_isGRAPH_utf8(unsigned char * p)
+test_isGRAPH_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isGRAPH_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isGRAPH_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isGRAPH_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isGRAPH_LC_utf8(unsigned char * p)
+test_isGRAPH_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isGRAPH_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isGRAPH_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isGRAPH_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -5380,16 +5632,32 @@ test_isPUNCT_LC(UV ord)
         RETVAL
 
 bool
-test_isPUNCT_utf8(unsigned char * p)
+test_isPUNCT_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isPUNCT_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isPUNCT_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isPUNCT_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isPUNCT_LC_utf8(unsigned char * p)
+test_isPUNCT_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isPUNCT_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isPUNCT_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isPUNCT_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -5443,16 +5711,32 @@ test_isXDIGIT_LC(UV ord)
         RETVAL
 
 bool
-test_isXDIGIT_utf8(unsigned char * p)
+test_isXDIGIT_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isXDIGIT_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isXDIGIT_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isXDIGIT_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isXDIGIT_LC_utf8(unsigned char * p)
+test_isXDIGIT_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isXDIGIT_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isXDIGIT_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isXDIGIT_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
@@ -5506,16 +5790,32 @@ test_isPSXSPC_LC(UV ord)
         RETVAL
 
 bool
-test_isPSXSPC_utf8(unsigned char * p)
+test_isPSXSPC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isPSXSPC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isPSXSPC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isPSXSPC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
 bool
-test_isPSXSPC_LC_utf8(unsigned char * p)
+test_isPSXSPC_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
     CODE:
-        RETVAL = isPSXSPC_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isPSXSPC_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isPSXSPC_LC_utf8(p);
+        }
     OUTPUT:
         RETVAL
 
index b08e814..036b9c1 100644 (file)
@@ -104,6 +104,31 @@ sub get_display_locale_or_skip($$) {
     return (" ($locale)", 1);
 }
 
+sub try_malforming($$$)
+{
+    # Determines if the tests for malformed UTF-8 should be done.  When done,
+    # the .xs code creates malformations by pretending the length is shorter
+    # than it actually is.  Some things can't be malformed, and sometimes this
+    # test knows that the current code doesn't look for a malformation under
+    # various circumstances.
+
+    my ($i, $function, $using_locale) = @_;
+
+    # Single bytes can't be malformed
+    return 0 if $i < ((ord "A" == 65) ? 128 : 160);
+
+    # ASCII doesn't need to ever look beyond the first byte.
+    return 0 if $function eq "ASCII";
+
+    # No controls above 255, so the code doesn't look at those
+    return 0 if $i > 255 && $function eq "CNTRL";
+
+    # No non-ASCII digits below 256, except if using locales.
+    return 0 if $i < 256 && ! $using_locale && $function =~ /X?DIGIT/;
+
+    return 1;
+}
+
 my %properties = (
                    # name => Lookup-property name
                    alnum => 'Word',
@@ -131,6 +156,11 @@ my %properties = (
 my @warnings;
 local $SIG{__WARN__} = sub { push @warnings, @_ };
 
+my %utf8_param_code = (
+                        "_safe"                 =>  0,
+                        "_safe, malformed"      =>  1,
+                        "unsafe"                => -1,
+                      );
 
 foreach my $name (sort keys %properties, 'octal') {
     my @invlist;
@@ -282,13 +312,42 @@ foreach my $name (sort keys %properties, 'octal') {
                         $truth = $matches;
                     }
 
-                        my $display_call = "is${function}$suffix("
-                                         . " $display_name )$display_locale";
-                        $ret = truth eval "test_is${function}$suffix('$char')";
-                        if (is ($@, "", "$display_call didn't give error")) {
+                    foreach my $utf8_param("_safe",
+                                           "_safe, malformed",
+                                           "unsafe"
+                                          )
+                    {
+                        my $utf8_param_code = $utf8_param_code{$utf8_param};
+                        my $expect_error = $utf8_param_code > 0;
+                        next if      $expect_error
+                                && ! try_malforming($i, $function, $suffix =~ /LC/);
+
+                        my $display_call = "is${function}$suffix( $display_name"
+                                         . ", $utf8_param )$display_locale";
+                        $ret = truth eval "test_is${function}$suffix('$char',"
+                                        . " $utf8_param_code)";
+                        if ($expect_error) {
+                            isnt ($@, "",
+                                    "expected and got error in $display_call");
+                            like($@, qr/Malformed UTF-8 character/,
+                                "${tab}And got expected message");
+                            if (is (@warnings, 1,
+                                           "${tab}Got a single warning besides"))
+                            {
+                                like($warnings[0],
+                                     qr/Malformed UTF-8 character.*short/,
+                                     "${tab}Got expected warning");
+                            }
+                            else {
+                                diag("@warnings");
+                            }
+                            undef @warnings;
+                        }
+                        elsif (is ($@, "", "$display_call didn't give error")) {
                             is ($ret, $truth,
                                 "${tab}And correctly returned $truth");
                         }
+                    }
                 }
             }
         }
diff --git a/handy.h b/handy.h
index 848050f..625343b 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -565,10 +565,24 @@ to determine if it is in the character class.  For example,
 C<isWORDCHAR_uvchr(0x100)> returns TRUE, since 0x100 is LATIN CAPITAL LETTER A
 WITH MACRON in Unicode, and is a word character.
 
-Variant C<isFOO_utf8> is like C<isFOO_uvchr>, but the input is a pointer to a
-(known to be well-formed) UTF-8 encoded string (C<U8*> or C<char*>, and
-possibly containing embedded C<NUL> characters).  The classification of just
-the first (possibly multi-byte) character in the string is tested.
+Variant C<isFOO_utf8_safe> is like C<isFOO_uvchr>, but is used for UTF-8
+encoded strings.  Each call classifies one character, even if the string
+contains many.  This variant takes two parameters.  The first, C<p>, is a
+pointer to the first byte of the character to be classified.  (Recall that it
+may take more than one byte to represent a character in UTF-8 strings.)  The
+second parameter, C<e>, points to anywhere in the string beyond the first
+character, up to one byte past the end of the entire string.  The suffix
+C<_safe> in the function's name indicates that it will not attempt to read
+beyond S<C<e - 1>>, provided that the constraint S<C<s E<lt> e>> is true (this
+is asserted for in C<-DDEBUGGING> builds).  If the UTF-8 for the input
+character is malformed in some way, the program may croak, or the function may
+return FALSE, at the discretion of the implementation, and subject to change in
+future releases.
+
+Variant C<isFOO_utf8> is like C<isFOO_utf8_safe>, but takes just a single
+parameter, C<p>, which has the same meaning as the corresponding parameter does
+in C<isFOO_utf8_safe>.  The function therefore can't check if it is reading
+beyond the end of the string.
 
 Variant C<isFOO_LC> is like the C<isFOO_A> and C<isFOO_L1> variants, but the
 result is based on the current locale, which is what C<LC> in the name stands
@@ -584,18 +598,32 @@ Variant C<isFOO_LC_uvchr> is like C<isFOO_LC>, but is defined on any UV.  It
 returns the same as C<isFOO_LC> for input code points less than 256, and
 returns the hard-coded, not-affected-by-locale, Unicode results for larger ones.
 
-Variant C<isFOO_LC_utf8> is like C<isFOO_LC_uvchr>, but the input is a pointer
-to a (known to be well-formed) UTF-8 encoded string (C<U8*> or C<char*>, and
-possibly containing embedded C<NUL> characters).  The classification of just
-the first (possibly multi-byte) character in the string is tested.
+Variant C<isFOO_LC_utf8_safe> is like C<isFOO_LC_uvchr>, but is used for UTF-8
+encoded strings.  Each call classifies one character, even if the string
+contains many.  This variant takes two parameters.  The first, C<p>, is a
+pointer to the first byte of the character to be classified.  (Recall that it
+may take more than one byte to represent a character in UTF-8 strings.) The
+second parameter, C<e>, points to anywhere in the string beyond the first
+character, up to one byte past the end of the entire string.  The suffix
+C<_safe> in the function's name indicates that it will not attempt to read
+beyond S<C<e - 1>>, provided that the constraint S<C<s E<lt> e>> is true (this
+is asserted for in C<-DDEBUGGING> builds).  If the UTF-8 for the input
+character is malformed in some way, the program may croak, or the function may
+return FALSE, at the discretion of the implementation, and subject to change in
+future releases.
+
+Variant C<isFOO_LC_utf8> is like C<isFOO_LC_utf8_safe>, but takes just a single
+parameter, C<p>, which has the same meaning as the corresponding parameter does
+in C<isFOO_LC_utf8_safe>.  The function therefore can't check if it is reading
+beyond the end of the string.
 
 =for apidoc Am|bool|isALPHA|char ch
 Returns a boolean indicating whether the specified character is an
 alphabetic character, analogous to C<m/[[:alpha:]]/>.
 See the L<top of this section|/Character classification> for an explanation of
 variants
-C<isALPHA_A>, C<isALPHA_L1>, C<isALPHA_uvchr>, C<isALPHA_utf8>, C<isALPHA_LC>,
-C<isALPHA_LC_uvchr>, and C<isALPHA_LC_utf8>.
+C<isALPHA_A>, C<isALPHA_L1>, C<isALPHA_uvchr>, C<isALPHA_utf8_safe>,
+C<isALPHA_LC>, C<isALPHA_LC_uvchr>, and C<isALPHA_LC_utf8_safe>.
 
 =for apidoc Am|bool|isALPHANUMERIC|char ch
 Returns a boolean indicating whether the specified character is a either an
@@ -603,8 +631,8 @@ alphabetic character or decimal digit, analogous to C<m/[[:alnum:]]/>.
 See the L<top of this section|/Character classification> for an explanation of
 variants
 C<isALPHANUMERIC_A>, C<isALPHANUMERIC_L1>, C<isALPHANUMERIC_uvchr>,
-C<isALPHANUMERIC_utf8>, C<isALPHANUMERIC_LC>, C<isALPHANUMERIC_LC_uvchr>, and
-C<isALPHANUMERIC_LC_utf8>.
+C<isALPHANUMERIC_utf8_safe>, C<isALPHANUMERIC_LC>, C<isALPHANUMERIC_LC_uvchr>,
+and C<isALPHANUMERIC_LC_utf8_safe>.
 
 =for apidoc Am|bool|isASCII|char ch
 Returns a boolean indicating whether the specified character is one of the 128
@@ -614,36 +642,36 @@ character corresponds to an ASCII character.  Variants C<isASCII_A()> and
 C<isASCII_L1()> are identical to C<isASCII()>.
 See the L<top of this section|/Character classification> for an explanation of
 variants
-C<isASCII_uvchr>, C<isASCII_utf8>, C<isASCII_LC>, C<isASCII_LC_uvchr>, and
-C<isASCII_LC_utf8>.  Note, however, that some platforms do not have the C
+C<isASCII_uvchr>, C<isASCII_utf8_safe>, C<isASCII_LC>, C<isASCII_LC_uvchr>, and
+C<isASCII_LC_utf8_safe>.  Note, however, that some platforms do not have the C
 library routine C<isascii()>.  In these cases, the variants whose names contain
 C<LC> are the same as the corresponding ones without.
 
 Also note, that because all ASCII characters are UTF-8 invariant (meaning they
 have the exact same representation (always a single byte) whether encoded in
 UTF-8 or not), C<isASCII> will give the correct results when called with any
-byte in any string encoded or not in UTF-8.  And similarly C<isASCII_utf8> will
-work properly on any string encoded or not in UTF-8.
+byte in any string encoded or not in UTF-8.  And similarly C<isASCII_utf8_safe>
+will work properly on any string encoded or not in UTF-8.
 
 =for apidoc Am|bool|isBLANK|char ch
 Returns a boolean indicating whether the specified character is a
 character considered to be a blank, analogous to C<m/[[:blank:]]/>.
 See the L<top of this section|/Character classification> for an explanation of
 variants
-C<isBLANK_A>, C<isBLANK_L1>, C<isBLANK_uvchr>, C<isBLANK_utf8>, C<isBLANK_LC>,
-C<isBLANK_LC_uvchr>, and C<isBLANK_LC_utf8>.  Note, however, that some
-platforms do not have the C library routine C<isblank()>.  In these cases, the
-variants whose names contain C<LC> are the same as the corresponding ones
-without.
+C<isBLANK_A>, C<isBLANK_L1>, C<isBLANK_uvchr>, C<isBLANK_utf8_safe>,
+C<isBLANK_LC>, C<isBLANK_LC_uvchr>, and C<isBLANK_LC_utf8_safe>.  Note,
+however, that some platforms do not have the C library routine
+C<isblank()>.  In these cases, the variants whose names contain C<LC> are
+the same as the corresponding ones without.
 
 =for apidoc Am|bool|isCNTRL|char ch
 Returns a boolean indicating whether the specified character is a
 control character, analogous to C<m/[[:cntrl:]]/>.
 See the L<top of this section|/Character classification> for an explanation of
 variants
-C<isCNTRL_A>, C<isCNTRL_L1>, C<isCNTRL_uvchr>, C<isCNTRL_utf8>, C<isCNTRL_LC>,
-C<isCNTRL_LC_uvchr>, and C<isCNTRL_LC_utf8>
-On EBCDIC platforms, you almost always want to use the C<isCNTRL_L1> variant.
+C<isCNTRL_A>, C<isCNTRL_L1>, C<isCNTRL_uvchr>, C<isCNTRL_utf8_safe>,
+C<isCNTRL_LC>, C<isCNTRL_LC_uvchr>, and C<isCNTRL_LC_utf8_safe> On EBCDIC
+platforms, you almost always want to use the C<isCNTRL_L1> variant.
 
 =for apidoc Am|bool|isDIGIT|char ch
 Returns a boolean indicating whether the specified character is a
@@ -651,24 +679,23 @@ digit, analogous to C<m/[[:digit:]]/>.
 Variants C<isDIGIT_A> and C<isDIGIT_L1> are identical to C<isDIGIT>.
 See the L<top of this section|/Character classification> for an explanation of
 variants
-C<isDIGIT_uvchr>, C<isDIGIT_utf8>, C<isDIGIT_LC>, C<isDIGIT_LC_uvchr>, and
-C<isDIGIT_LC_utf8>.
+C<isDIGIT_uvchr>, C<isDIGIT_utf8_safe>, C<isDIGIT_LC>, C<isDIGIT_LC_uvchr>, and
+C<isDIGIT_LC_utf8_safe>.
 
 =for apidoc Am|bool|isGRAPH|char ch
 Returns a boolean indicating whether the specified character is a
 graphic character, analogous to C<m/[[:graph:]]/>.
 See the L<top of this section|/Character classification> for an explanation of
-variants
-C<isGRAPH_A>, C<isGRAPH_L1>, C<isGRAPH_uvchr>, C<isGRAPH_utf8>, C<isGRAPH_LC>,
-C<isGRAPH_LC_uvchr>, and C<isGRAPH_LC_utf8>.
+variants C<isGRAPH_A>, C<isGRAPH_L1>, C<isGRAPH_uvchr>, C<isGRAPH_utf8_safe>,
+C<isGRAPH_LC>, C<isGRAPH_LC_uvchr>, and C<isGRAPH_LC_utf8_safe>.
 
 =for apidoc Am|bool|isLOWER|char ch
 Returns a boolean indicating whether the specified character is a
 lowercase character, analogous to C<m/[[:lower:]]/>.
 See the L<top of this section|/Character classification> for an explanation of
 variants
-C<isLOWER_A>, C<isLOWER_L1>, C<isLOWER_uvchr>, C<isLOWER_utf8>, C<isLOWER_LC>,
-C<isLOWER_LC_uvchr>, and C<isLOWER_LC_utf8>.
+C<isLOWER_A>, C<isLOWER_L1>, C<isLOWER_uvchr>, C<isLOWER_utf8_safe>,
+C<isLOWER_LC>, C<isLOWER_LC_uvchr>, and C<isLOWER_LC_utf8_safe>.
 
 =for apidoc Am|bool|isOCTAL|char ch
 Returns a boolean indicating whether the specified character is an
@@ -683,9 +710,8 @@ Note that the definition of what is punctuation isn't as
 straightforward as one might desire.  See L<perlrecharclass/POSIX Character
 Classes> for details.
 See the L<top of this section|/Character classification> for an explanation of
-variants
-C<isPUNCT_A>, C<isPUNCT_L1>, C<isPUNCT_uvchr>, C<isPUNCT_utf8>, C<isPUNCT_LC>,
-C<isPUNCT_LC_uvchr>, and C<isPUNCT_LC_utf8>.
+variants C<isPUNCT_A>, C<isPUNCT_L1>, C<isPUNCT_uvchr>, C<isPUNCT_utf8_safe>,
+C<isPUNCT_LC>, C<isPUNCT_LC_uvchr>, and C<isPUNCT_LC_utf8_safe>.
 
 =for apidoc Am|bool|isSPACE|char ch
 Returns a boolean indicating whether the specified character is a
@@ -698,8 +724,8 @@ in the non-locale variants, was that C<isSPACE()> did not match a vertical tab.
 (See L</isPSXSPC> for a macro that matches a vertical tab in all releases.)
 See the L<top of this section|/Character classification> for an explanation of
 variants
-C<isSPACE_A>, C<isSPACE_L1>, C<isSPACE_uvchr>, C<isSPACE_utf8>, C<isSPACE_LC>,
-C<isSPACE_LC_uvchr>, and C<isSPACE_LC_utf8>.
+C<isSPACE_A>, C<isSPACE_L1>, C<isSPACE_uvchr>, C<isSPACE_utf8_safe>,
+C<isSPACE_LC>, C<isSPACE_LC_uvchr>, and C<isSPACE_LC_utf8_safe>.
 
 =for apidoc Am|bool|isPSXSPC|char ch
 (short for Posix Space)
@@ -712,24 +738,23 @@ C<isSPACE()> forms don't match a Vertical Tab, and the C<isPSXSPC()> forms do.
 Otherwise they are identical.  Thus this macro is analogous to what
 C<m/[[:space:]]/> matches in a regular expression.
 See the L<top of this section|/Character classification> for an explanation of
-variants C<isPSXSPC_A>, C<isPSXSPC_L1>, C<isPSXSPC_uvchr>, C<isPSXSPC_utf8>,
-C<isPSXSPC_LC>, C<isPSXSPC_LC_uvchr>, and C<isPSXSPC_LC_utf8>.
+variants C<isPSXSPC_A>, C<isPSXSPC_L1>, C<isPSXSPC_uvchr>, C<isPSXSPC_utf8_safe>,
+C<isPSXSPC_LC>, C<isPSXSPC_LC_uvchr>, and C<isPSXSPC_LC_utf8_safe>.
 
 =for apidoc Am|bool|isUPPER|char ch
 Returns a boolean indicating whether the specified character is an
 uppercase character, analogous to C<m/[[:upper:]]/>.
 See the L<top of this section|/Character classification> for an explanation of
-variants
-C<isUPPER_A>, C<isUPPER_L1>, C<isUPPER_uvchr>, C<isUPPER_utf8>, C<isUPPER_LC>,
-C<isUPPER_LC_uvchr>, and C<isUPPER_LC_utf8>.
+variants C<isUPPER_A>, C<isUPPER_L1>, C<isUPPER_uvchr>, C<isUPPER_utf8_safe>,
+C<isUPPER_LC>, C<isUPPER_LC_uvchr>, and C<isUPPER_LC_utf8_safe>.
 
 =for apidoc Am|bool|isPRINT|char ch
 Returns a boolean indicating whether the specified character is a
 printable character, analogous to C<m/[[:print:]]/>.
 See the L<top of this section|/Character classification> for an explanation of
 variants
-C<isPRINT_A>, C<isPRINT_L1>, C<isPRINT_uvchr>, C<isPRINT_utf8>, C<isPRINT_LC>,
-C<isPRINT_LC_uvchr>, and C<isPRINT_LC_utf8>.
+C<isPRINT_A>, C<isPRINT_L1>, C<isPRINT_uvchr>, C<isPRINT_utf8_safe>,
+C<isPRINT_LC>, C<isPRINT_LC_uvchr>, and C<isPRINT_LC_utf8_safe>.
 
 =for apidoc Am|bool|isWORDCHAR|char ch
 Returns a boolean indicating whether the specified character is a character
@@ -741,10 +766,10 @@ C<isALNUM()> is a synonym provided for backward compatibility, even though a
 word character includes more than the standard C language meaning of
 alphanumeric.
 See the L<top of this section|/Character classification> for an explanation of
-variants
-C<isWORDCHAR_A>, C<isWORDCHAR_L1>, C<isWORDCHAR_uvchr>, and C<isWORDCHAR_utf8>.
-C<isWORDCHAR_LC>, C<isWORDCHAR_LC_uvchr>, and C<isWORDCHAR_LC_utf8> are also as
-described there, but additionally include the platform's native underscore.
+variants C<isWORDCHAR_A>, C<isWORDCHAR_L1>, C<isWORDCHAR_uvchr>, and
+C<isWORDCHAR_utf8_safe>.  C<isWORDCHAR_LC>, C<isWORDCHAR_LC_uvchr>, and
+C<isWORDCHAR_LC_utf8_safe> are also as described there, but additionally
+include the platform's native underscore.
 
 =for apidoc Am|bool|isXDIGIT|char ch
 Returns a boolean indicating whether the specified character is a hexadecimal
@@ -752,8 +777,8 @@ digit.  In the ASCII range these are C<[0-9A-Fa-f]>.  Variants C<isXDIGIT_A()>
 and C<isXDIGIT_L1()> are identical to C<isXDIGIT()>.
 See the L<top of this section|/Character classification> for an explanation of
 variants
-C<isXDIGIT_uvchr>, C<isXDIGIT_utf8>, C<isXDIGIT_LC>, C<isXDIGIT_LC_uvchr>, and
-C<isXDIGIT_LC_utf8>.
+C<isXDIGIT_uvchr>, C<isXDIGIT_utf8_safe>, C<isXDIGIT_LC>, C<isXDIGIT_LC_uvchr>,
+and C<isXDIGIT_LC_utf8_safe>.
 
 =for apidoc Am|bool|isIDFIRST|char ch
 Returns a boolean indicating whether the specified character can be the first
@@ -762,8 +787,8 @@ the official Unicode property C<XID_Start>.  The difference is that this
 returns true only if the input character also matches L</isWORDCHAR>.
 See the L<top of this section|/Character classification> for an explanation of
 variants
-C<isIDFIRST_A>, C<isIDFIRST_L1>, C<isIDFIRST_uvchr>, C<isIDFIRST_utf8>,
-C<isIDFIRST_LC>, C<isIDFIRST_LC_uvchr>, and C<isIDFIRST_LC_utf8>.
+C<isIDFIRST_A>, C<isIDFIRST_L1>, C<isIDFIRST_uvchr>, C<isIDFIRST_utf8_safe>,
+C<isIDFIRST_LC>, C<isIDFIRST_LC_uvchr>, and C<isIDFIRST_LC_utf8_safe>.
 
 =for apidoc Am|bool|isIDCONT|char ch
 Returns a boolean indicating whether the specified character can be the
@@ -773,8 +798,8 @@ difference is that this returns true only if the input character also matches
 L</isWORDCHAR>.  See the L<top of this section|/Character classification> for
 an
 explanation of variants C<isIDCONT_A>, C<isIDCONT_L1>, C<isIDCONT_uvchr>,
-C<isIDCONT_utf8>, C<isIDCONT_LC>, C<isIDCONT_LC_uvchr>, and
-C<isIDCONT_LC_utf8>.
+C<isIDCONT_utf8_safe>, C<isIDCONT_LC>, C<isIDCONT_LC_uvchr>, and
+C<isIDCONT_LC_utf8_safe>.
 
 =head1 Miscellaneous Functions
 
@@ -1684,14 +1709,59 @@ END_EXTERN_C
                                                                    *((p)+1 )), \
                                                 classnum)                      \
                                            : utf8)
+
+/* The "_safe" macros make sure that we don't attempt to read beyond 'e', but
+ * they don't otherwise go out of their way to look for malformed UTF-8.  If
+ * they can return accurate results without knowing if the input is otherwise
+ * malformed, they do so.  For example isASCII is accurate in spite of any
+ * non-length malformations because it looks only at a single byte. Likewise
+ * isDIGIT looks just at the first byte for code points 0-255, as all UTF-8
+ * variant ones return FALSE.  But, if the input has to be well-formed in order
+ * for the results to be accurate, the macros will test and if malformed will
+ * call a routine to die
+ *
+ * Except for toke.c, the macros do assume that e > p, asserting that on
+ * DEBUGGING builds.  Much code that calls these depends on this being true,
+ * for other reasons.  toke.c is treated specially as using the regular
+ * assertion breaks it in many ways.  All strings that these operate on there
+ * are supposed to have an extra NUL character at the end,  so that *e = \0. A
+ * bunch of code in toke.c assumes that this is true, so the assertion allows
+ * for that */
+#ifdef PERL_IN_TOKE_C
+#  define _utf8_safe_assert(p,e) ((e) > (p) || ((e) == (p) && *(p) == '\0'))
+#else
+#  define _utf8_safe_assert(p,e) ((e) > (p))
+#endif
+
+#define _generic_utf8_safe(classnum, p, e, above_latin1)                    \
+         (__ASSERT_(_utf8_safe_assert(p, e))                                \
+         (UTF8_IS_INVARIANT(*(p)))                                          \
+          ? _generic_isCC(*(p), classnum)                                   \
+          : (UTF8_IS_DOWNGRADEABLE_START(*(p))                              \
+             ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1))))   \
+                ? _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )),  \
+                                classnum)                                   \
+                : (_force_out_malformed_utf8_message(                       \
+                                        (U8 *) (p), (U8 *) (e), 0, 1), 0))  \
+             : above_latin1))
 /* Like the above, but calls 'above_latin1(p)' to get the utf8 value.
  * 'above_latin1' can be a macro */
 #define _generic_func_utf8(classnum, above_latin1, p)  \
                                     _generic_utf8(classnum, p, above_latin1(p))
+#define _generic_func_utf8_safe(classnum, above_latin1, p, e)               \
+                    _generic_utf8_safe(classnum, p, e, above_latin1(p, e))
+#define _generic_non_swash_utf8_safe(classnum, above_latin1, p, e)          \
+          _generic_utf8_safe(classnum, p, e,                                \
+                             (UNLIKELY((e) - (p) < UTF8SKIP(p))             \
+                              ? (_force_out_malformed_utf8_message(         \
+                                      (U8 *) (p), (U8 *) (e), 0, 1), 0)     \
+                              : above_latin1(p)))
 /* Like the above, but passes classnum to _isFOO_utf8(), instead of having an
  * 'above_latin1' parameter */
 #define _generic_swash_utf8(classnum, p)  \
                       _generic_utf8(classnum, p, _is_utf8_FOO(classnum, p))
+#define _generic_swash_utf8_safe(classnum, p, e)                            \
+_generic_utf8_safe(classnum, p, e, _is_utf8_FOO_with_len(classnum, p, e))
 
 /* Like the above, but should be used only when it is known that there are no
  * characters in the upper-Latin1 range (128-255 on ASCII platforms) which the
@@ -1704,6 +1774,14 @@ END_EXTERN_C
                                            ? above_latin1                      \
                                            : 0)
 
+#define _generic_utf8_safe_no_upper_latin1(classnum, p, e, above_latin1)    \
+         (__ASSERT_(_utf8_safe_assert(p, e))                                \
+         (UTF8_IS_INVARIANT(*(p)))                                          \
+          ? _generic_isCC(*(p), classnum)                                   \
+          : (UTF8_IS_DOWNGRADEABLE_START(*(p)))                             \
+             ? 0 /* Note that doesn't check validity for latin1 */          \
+             : above_latin1)
+
 /* NOTE that some of these macros have very similar ones in regcharclass.h.
  * For example, there is (at the time of this writing) an 'is_SPACE_utf8()'
  * there, differing in name only by an underscore from the one here
@@ -1719,12 +1797,25 @@ END_EXTERN_C
                                              */
 #define isBLANK_utf8(p)        _generic_func_utf8(_CC_BLANK, is_HORIZWS_high, p)
 
+#define isALPHA_utf8_safe(p, e)  _generic_swash_utf8_safe(_CC_ALPHA, p, e)
+#define isALPHANUMERIC_utf8_safe(p, e)                                      \
+                        _generic_swash_utf8_safe(_CC_ALPHANUMERIC, p, e)
+#define isASCII_utf8_safe(p, e)                                             \
+    /* Because ASCII is invariant under utf8, the non-utf8 macro            \
+    * works */                                                              \
+    (__ASSERT_(_utf8_safe_assert(p, e)) isASCII(*(p)))
+#define isBLANK_utf8_safe(p, e)                                             \
+        _generic_non_swash_utf8_safe(_CC_BLANK, is_HORIZWS_high, p, e)
+
 #ifdef EBCDIC
     /* Because all controls are UTF-8 invariants in EBCDIC, we can use this
      * more efficient macro instead of the more general one */
 #   define isCNTRL_utf8(p)      isCNTRL_L1(*(p))
+#   define isCNTRL_utf8_safe(p, e)                                          \
+                    (__ASSERT_(_utf8_safe_assert(p, e)) isCNTRL_L1(*(p))
 #else
-#   define isCNTRL_utf8(p)      _generic_utf8(_CC_CNTRL, p, 0)
+#   define isCNTRL_utf8(p)          _generic_utf8(_CC_CNTRL, p, 0)
+#   define isCNTRL_utf8_safe(p, e)  _generic_utf8_safe(_CC_CNTRL, p, e, 0)
 #endif
 
 #define isDIGIT_utf8(p)         _generic_utf8_no_upper_latin1(_CC_DIGIT, p,   \
@@ -1732,6 +1823,12 @@ END_EXTERN_C
 #define isGRAPH_utf8(p)         _generic_swash_utf8(_CC_GRAPH, p)
 #define isIDCONT_utf8(p)        _generic_func_utf8(_CC_WORDCHAR,              \
                                                   _is_utf8_perl_idcont, p)
+#define isDIGIT_utf8_safe(p, e)                                             \
+            _generic_utf8_safe_no_upper_latin1(_CC_DIGIT, p, e,             \
+                                    _is_utf8_FOO_with_len(_CC_DIGIT, p, e))
+#define isGRAPH_utf8_safe(p, e)    _generic_swash_utf8_safe(_CC_GRAPH, p, e)
+#define isIDCONT_utf8_safe(p, e)   _generic_func_utf8_safe(_CC_WORDCHAR,    \
+                                     _is_utf8_perl_idcont_with_len, p, e)
 
 /* To prevent S_scan_word in toke.c from hanging, we have to make sure that
  * IDFIRST is an alnum.  See
@@ -1752,6 +1849,27 @@ END_EXTERN_C
 #define isWORDCHAR_utf8(p)  _generic_swash_utf8(_CC_WORDCHAR, p)
 #define isXDIGIT_utf8(p)    _generic_utf8_no_upper_latin1(_CC_XDIGIT, p,     \
                                                           is_XDIGIT_high(p))
+#define isIDFIRST_utf8_safe(p, e)                                           \
+    _generic_func_utf8_safe(_CC_IDFIRST,                                    \
+                    _is_utf8_perl_idstart_with_len, (U8 *) (p), (U8 *) (e))
+
+#define isLOWER_utf8_safe(p, e)     _generic_swash_utf8_safe(_CC_LOWER, p, e)
+#define isPRINT_utf8_safe(p, e)     _generic_swash_utf8_safe(_CC_PRINT, p, e)
+#define isPSXSPC_utf8_safe(p, e)     isSPACE_utf8_safe(p, e)
+#define isPUNCT_utf8_safe(p, e)     _generic_swash_utf8_safe(_CC_PUNCT, p, e)
+#define isSPACE_utf8_safe(p, e)                                             \
+    _generic_non_swash_utf8_safe(_CC_SPACE, is_XPERLSPACE_high, p, e)
+#define isUPPER_utf8_safe(p, e)  _generic_swash_utf8_safe(_CC_UPPER, p, e)
+#define isVERTWS_utf8_safe(p, e)                                            \
+        _generic_non_swash_utf8_safe(_CC_VERTSPACE, is_VERTWS_high, p, e)
+#define isWORDCHAR_utf8_safe(p, e)                                          \
+                             _generic_swash_utf8_safe(_CC_WORDCHAR, p, e)
+#define isXDIGIT_utf8_safe(p, e)                                            \
+                   _generic_utf8_safe_no_upper_latin1(_CC_XDIGIT, p, e,     \
+                             (UNLIKELY((e) - (p) < UTF8SKIP(p))             \
+                              ? (_force_out_malformed_utf8_message(         \
+                                      (U8 *) (p), (U8 *) (e), 0, 1), 0)     \
+                              : is_XDIGIT_high(p)))
 
 #define toFOLD_utf8(p,s,l)     to_utf8_fold(p,s,l)
 #define toLOWER_utf8(p,s,l)    to_utf8_lower(p,s,l)
@@ -1798,6 +1916,71 @@ END_EXTERN_C
                                                             _CC_WORDCHAR, p)
 #define isXDIGIT_LC_utf8(p)   _generic_LC_func_utf8(isXDIGIT_LC,              \
                                                             is_XDIGIT_high, p)
+/* For internal core Perl use only: the base macros for defining macros like
+ * isALPHA_LC_utf8_safe.  These are like _generic_utf8, but if the first code
+ * point in 'p' is within the 0-255 range, it uses locale rules from the
+ * passed-in 'macro' parameter */
+#define _generic_LC_utf8_safe(macro, p, e, above_latin1)                    \
+         (__ASSERT_(_utf8_safe_assert(p, e))                                \
+         (UTF8_IS_INVARIANT(*(p)))                                          \
+          ? macro(*(p))                                                     \
+          : (UTF8_IS_DOWNGRADEABLE_START(*(p))                              \
+             ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1))))   \
+                ? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1)))           \
+                : (_force_out_malformed_utf8_message(                       \
+                                        (U8 *) (p), (U8 *) (e), 0, 1), 0))  \
+              : above_latin1))
+
+#define _generic_LC_swash_utf8_safe(macro, classnum, p, e)                  \
+            _generic_LC_utf8_safe(macro, p, e,                              \
+                               _is_utf8_FOO_with_len(classnum, p, e))
+
+#define _generic_LC_func_utf8_safe(macro, above_latin1, p, e)               \
+            _generic_LC_utf8_safe(macro, p, e, above_latin1(p, e))
+
+#define _generic_LC_non_swash_utf8_safe(classnum, above_latin1, p, e)       \
+          _generic_LC_utf8_safe(classnum, p, e,                             \
+                             (UNLIKELY((e) - (p) < UTF8SKIP(p))             \
+                              ? (_force_out_malformed_utf8_message(         \
+                                      (U8 *) (p), (U8 *) (e), 0, 1), 0)     \
+                              : above_latin1(p)))
+
+#define isALPHANUMERIC_LC_utf8_safe(p, e)                                   \
+            _generic_LC_swash_utf8_safe(isALPHANUMERIC_LC,                  \
+                                        _CC_ALPHANUMERIC, p, e)
+#define isALPHA_LC_utf8_safe(p, e)                                          \
+            _generic_LC_swash_utf8_safe(isALPHA_LC, _CC_ALPHA, p, e)
+#define isASCII_LC_utf8_safe(p, e)                                          \
+                    (__ASSERT_(_utf8_safe_assert(p, e)) isASCII_LC(*(p)))
+#define isBLANK_LC_utf8_safe(p, e)                                          \
+        _generic_LC_non_swash_utf8_safe(isBLANK_LC, is_HORIZWS_high, p, e)
+#define isCNTRL_LC_utf8_safe(p, e)                                          \
+            _generic_LC_utf8_safe(isCNTRL_LC, p, e, 0)
+#define isDIGIT_LC_utf8_safe(p, e)                                          \
+            _generic_LC_swash_utf8_safe(isDIGIT_LC, _CC_DIGIT, p, e)
+#define isGRAPH_LC_utf8_safe(p, e)                                          \
+            _generic_LC_swash_utf8_safe(isGRAPH_LC, _CC_GRAPH, p, e)
+#define isIDCONT_LC_utf8_safe(p, e)                                         \
+            _generic_LC_func_utf8_safe(isIDCONT_LC,                         \
+                                _is_utf8_perl_idcont_with_len, p, e)
+#define isIDFIRST_LC_utf8_safe(p, e)                                        \
+            _generic_LC_func_utf8_safe(isIDFIRST_LC,                        \
+                                _is_utf8_perl_idstart_with_len, p, e)
+#define isLOWER_LC_utf8_safe(p, e)                                          \
+            _generic_LC_swash_utf8_safe(isLOWER_LC, _CC_LOWER, p, e)
+#define isPRINT_LC_utf8_safe(p, e)                                          \
+            _generic_LC_swash_utf8_safe(isPRINT_LC, _CC_PRINT, p, e)
+#define isPSXSPC_LC_utf8_safe(p, e)    isSPACE_LC_utf8_safe(p, e)
+#define isPUNCT_LC_utf8_safe(p, e)                                          \
+            _generic_LC_swash_utf8_safe(isPUNCT_LC, _CC_PUNCT, p, e)
+#define isSPACE_LC_utf8_safe(p, e)                                          \
+    _generic_LC_non_swash_utf8_safe(isSPACE_LC, is_XPERLSPACE_high, p, e)
+#define isUPPER_LC_utf8_safe(p, e)                                          \
+            _generic_LC_swash_utf8_safe(isUPPER_LC, _CC_UPPER, p, e)
+#define isWORDCHAR_LC_utf8_safe(p, e)                                       \
+            _generic_LC_swash_utf8_safe(isWORDCHAR_LC, _CC_WORDCHAR, p, e)
+#define isXDIGIT_LC_utf8_safe(p, e)                                         \
+        _generic_LC_non_swash_utf8_safe(isXDIGIT_LC, is_XDIGIT_high, p, e)
 
 /* Macros for backwards compatibility and for completeness when the ASCII and
  * Latin1 values are identical */
index b6feb46..2569c69 100644 (file)
@@ -327,6 +327,15 @@ well.
 
 =item *
 
+New versions of macros like C<isALPHA_utf8> have been added, each with the
+suffix C<_safe>, like C<isSPACE_utf8_safe>.  These take an extra
+parameter, giving an upper limit of how far into the string it is safe
+to read.  Using the old versions could cause attempts to read beyond the
+end of the input buffer if the UTF-8 is not well-formed.  Details are at
+L<perlapi/Character classification>.
+
+=item *
+
 Calling macros like C<isALPHA_utf8> on malformed UTF-8 have issued a
 deprecation warning since Perl v5.18.  They now die.
 
diff --git a/proto.h b/proto.h
index c7065cd..a7dc7d7 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -59,6 +59,11 @@ PERL_CALLCONV bool   Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
 #define PERL_ARGS_ASSERT__IS_UTF8_FOO  \
        assert(p)
 
+PERL_CALLCONV bool     Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN \
+       assert(p); assert(e)
+
 PERL_CALLCONV bool     Perl__is_utf8_idcont(pTHX_ const U8 *p)
                        __attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT__IS_UTF8_IDCONT       \
@@ -79,11 +84,21 @@ PERL_CALLCONV bool  Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
 #define PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT  \
        assert(p)
 
+PERL_CALLCONV bool     Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN \
+       assert(p); assert(e)
+
 PERL_CALLCONV bool     Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
                        __attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART \
        assert(p)
 
+PERL_CALLCONV bool     Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN        \
+       assert(p); assert(e)
+
 PERL_CALLCONV bool     Perl__is_utf8_xidcont(pTHX_ const U8 *p)
                        __attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT__IS_UTF8_XIDCONT      \
@@ -5624,6 +5639,11 @@ PERL_STATIC_INLINE bool  S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, co
 #define PERL_ARGS_ASSERT_IS_UTF8_COMMON        \
        assert(p); assert(swash); assert(swashname)
 
+PERL_STATIC_INLINE bool        S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 *const e, SV **swash, const char * const swashname, SV* const invlist)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN       \
+       assert(p); assert(e); assert(swash); assert(swashname)
+
 PERL_STATIC_INLINE bool        S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
                        __attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS      \
diff --git a/utf8.c b/utf8.c
index 6b2c128..44aada5 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -2116,7 +2116,7 @@ Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
 {
     U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_FOO(classnum, tmpbuf);
+    return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf));
 }
 
 /* Internal function so we can deprecate the external one, and call
@@ -2137,7 +2137,7 @@ Perl__is_uni_perl_idcont(pTHX_ UV c)
 {
     U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_perl_idcont(tmpbuf);
+    return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
 }
 
 bool
@@ -2145,7 +2145,7 @@ Perl__is_uni_perl_idstart(pTHX_ UV c)
 {
     U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_perl_idstart(tmpbuf);
+    return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
 }
 
 UV
@@ -2445,6 +2445,40 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
     return swash_fetch(*swash, p, TRUE) != 0;
 }
 
+PERL_STATIC_INLINE bool
+S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e, SV **swash,
+                         const char *const swashname, SV* const invlist)
+{
+    /* returns a boolean giving whether or not the UTF8-encoded character that
+     * starts at <p>, and extending no further than <e - 1> is in the swash
+     * indicated by <swashname>.  <swash> contains a pointer to where the swash
+     * indicated by <swashname> is to be stored; which this routine will do, so
+     * that future calls will look at <*swash> and only generate a swash if it
+     * is not null.  <invlist> is NULL or an inversion list that defines the
+     * swash.  If not null, it saves time during initialization of the swash.
+     */
+
+    PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
+
+    if (! isUTF8_CHAR(p, e)) {
+        _force_out_malformed_utf8_message(p, e, 0, 1);
+        NOT_REACHED; /* NOTREACHED */
+    }
+
+    if (!*swash) {
+        U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+        *swash = _core_swash_init("utf8",
+
+                                  /* Only use the name if there is no inversion
+                                   * list; otherwise will go out to disk */
+                                  (invlist) ? "" : swashname,
+
+                                  &PL_sv_undef, 1, 0, invlist, &flags);
+    }
+
+    return swash_fetch(*swash, p, TRUE) != 0;
+}
+
 bool
 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
 {
@@ -2459,6 +2493,21 @@ Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
 }
 
 bool
+Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
+                                                            const U8 * const e)
+{
+    PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
+
+    assert(classnum < _FIRST_NON_SWASH_CC);
+
+    return is_utf8_common_with_len(p,
+                                   e,
+                                   &PL_utf8_swash_ptrs[classnum],
+                                   swash_property_names[classnum],
+                                   PL_XPosix_ptrs[classnum]);
+}
+
+bool
 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
 {
     SV* invlist = NULL;
@@ -2472,6 +2521,20 @@ Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
 }
 
 bool
+Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
+{
+    SV* invlist = NULL;
+
+    PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
+
+    if (! PL_utf8_perl_idstart) {
+        invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
+    }
+    return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart,
+                                      "_Perl_IDStart", invlist);
+}
+
+bool
 Perl__is_utf8_xidstart(pTHX_ const U8 *p)
 {
     PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
@@ -2495,6 +2558,20 @@ Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
 }
 
 bool
+Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
+{
+    SV* invlist = NULL;
+
+    PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
+
+    if (! PL_utf8_perl_idcont) {
+        invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
+    }
+    return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont,
+                                   "_Perl_IDCont", invlist);
+}
+
+bool
 Perl__is_utf8_idcont(pTHX_ const U8 *p)
 {
     PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
diff --git a/utf8.h b/utf8.h
index f6d9d54..5568039 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -680,6 +680,17 @@ with a ptr argument.
                                 : isWORDCHAR_utf8((const U8*)p))
 #define isALNUM_lazy_if(p,UTF)   isWORDCHAR_lazy_if(p,UTF)
 
+#define isIDFIRST_lazy_if_safe(p, e, UTF)                                   \
+                   ((IN_BYTES || !UTF)                                      \
+                     ? isIDFIRST(*(p))                                      \
+                     : isIDFIRST_utf8_safe(p, e))
+
+#define isWORDCHAR_lazy_if_safe(p, e, UTF)                                  \
+                   ((IN_BYTES || !UTF)                                      \
+                     ? isWORDCHAR(*(p))                                     \
+                     : isWORDCHAR_utf8_safe((U8 *) p, (U8 *) e))
+
+
 #define UTF8_MAXLEN UTF8_MAXBYTES
 
 /* A Unicode character can fold to up to 3 characters */