Add isFOO_utf8_safe() macros

author Karl Williamson <khw@cpan.org>

Thu, 15 Dec 2016 23:30:27 +0000 (16:30 -0700)

committer Karl Williamson <khw@cpan.org>

Fri, 23 Dec 2016 23:48:34 +0000 (16:48 -0700)
author Karl Williamson <khw@cpan.org>
Thu, 15 Dec 2016 23:30:27 +0000 (16:30 -0700)
committer Karl Williamson <khw@cpan.org>
Fri, 23 Dec 2016 23:48:34 +0000 (16:48 -0700)
diff --git a/embed.fnc b/embed.fnc

index 4743524..db19d55 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -813,6 +813,8 @@ AnidR       |bool   |is_utf8_valid_partial_char_flags                           \
                 |NN const U8 * const s|NN const U8 * const e|const U32 flags
  AMpR   |bool   |_is_uni_FOO|const U8 classnum|const UV c
  AMpR   |bool   |_is_utf8_FOO|const U8 classnum|NN const U8 *p
+AMpR   |bool   |_is_utf8_FOO_with_len|const U8 classnum|NN const U8 *p     \
+               |NN const U8 * const e
  ADMpR  |bool   |is_utf8_alnum  |NN const U8 *p
  ADMpR  |bool   |is_utf8_alnumc |NN const U8 *p
  ADMpR  |bool   |is_utf8_idfirst|NN const U8 *p
@@ -823,6 +825,10 @@ AMpR       |bool   |_is_utf8_xidcont|NN const U8 *p
  AMpR   |bool   |_is_utf8_xidstart|NN const U8 *p
  AMpR   |bool   |_is_utf8_perl_idcont|NN const U8 *p
  AMpR   |bool   |_is_utf8_perl_idstart|NN const U8 *p
+AMpR   |bool   |_is_utf8_perl_idcont_with_len|NN const U8 *p               \
+               |NN const U8 * const e
+AMpR   |bool   |_is_utf8_perl_idstart_with_len|NN const U8 *p              \
+               |NN const U8 * const e
  ADMpR  |bool   |is_utf8_idcont |NN const U8 *p
  ADMpR  |bool   |is_utf8_xidcont        |NN const U8 *p
  ADMpR  |bool   |is_utf8_alpha  |NN const U8 *p
@@ -2727,6 +2733,11 @@ sRM      |UV     |check_locale_boundary_crossing                             \
                 |NN U8* const ustrp                                         \
                 |NN STRLEN *lenp
  iR     |bool   |is_utf8_common |NN const U8 *const p|NN SV **swash|NN const char * const swashname|NULLOK SV* const invlist
+iR     |bool   |is_utf8_common_with_len|NN const U8 *const p               \
+                                          |NN const U8 *const e            \
+                                   |NN SV **swash                          \
+                                   |NN const char * const swashname        \
+                                   |NULLOK SV* const invlist
  sR     |SV*    |swatch_get     |NN SV* swash|UV start|UV span
  sRM    |U8*    |swash_scan_list_line|NN U8* l|NN U8* const lend|NN UV* min \
                 |NN UV* max|NN UV* val|const bool wants_value               \
diff --git a/embed.h b/embed.h

index 66fe0cc..7d81ef7 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -33,11 +33,14 @@
  #define _is_uni_perl_idcont(a) Perl__is_uni_perl_idcont(aTHX_ a)
  #define _is_uni_perl_idstart(a)        Perl__is_uni_perl_idstart(aTHX_ a)
  #define _is_utf8_FOO(a,b)      Perl__is_utf8_FOO(aTHX_ a,b)
+#define _is_utf8_FOO_with_len(a,b,c)   Perl__is_utf8_FOO_with_len(aTHX_ a,b,c)
  #define _is_utf8_idcont(a)     Perl__is_utf8_idcont(aTHX_ a)
  #define _is_utf8_idstart(a)    Perl__is_utf8_idstart(aTHX_ a)
  #define _is_utf8_mark(a)       Perl__is_utf8_mark(aTHX_ a)
  #define _is_utf8_perl_idcont(a)        Perl__is_utf8_perl_idcont(aTHX_ a)
+#define _is_utf8_perl_idcont_with_len(a,b)     Perl__is_utf8_perl_idcont_with_len(aTHX_ a,b)
  #define _is_utf8_perl_idstart(a)       Perl__is_utf8_perl_idstart(aTHX_ a)
+#define _is_utf8_perl_idstart_with_len(a,b)    Perl__is_utf8_perl_idstart_with_len(aTHX_ a,b)
  #define _is_utf8_xidcont(a)    Perl__is_utf8_xidcont(aTHX_ a)
  #define _is_utf8_xidstart(a)   Perl__is_utf8_xidstart(aTHX_ a)
  #define _to_uni_fold_flags(a,b,c,d)    Perl__to_uni_fold_flags(aTHX_ a,b,c,d)
@@ -1835,6 +1838,7 @@
  #define does_utf8_overflow     S_does_utf8_overflow
  #define isFF_OVERLONG          S_isFF_OVERLONG
  #define is_utf8_common(a,b,c,d)        S_is_utf8_common(aTHX_ a,b,c,d)
+#define is_utf8_common_with_len(a,b,c,d,e)     S_is_utf8_common_with_len(aTHX_ a,b,c,d,e)
  #define is_utf8_cp_above_31_bits       S_is_utf8_cp_above_31_bits
  #define is_utf8_overlong_given_start_byte_ok   S_is_utf8_overlong_given_start_byte_ok
  #define swash_scan_list_line(a,b,c,d,e,f,g)    S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g)
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs

index 8b4e638..e9d28c8 100644 (file)
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -4414,16 +4414,36 @@ test_isBLANK_LC(UV ord)
          RETVAL
  
  bool
-test_isBLANK_utf8(unsigned char * p)
+test_isBLANK_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isBLANK_utf8(p);
+
+        /* In this function and those that follow, 'type' selects what is
+         * tested: negative calls the plain macro; 0 calls the _safe macro
+         * with the full length; positive makes 'e' that many bytes short */
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isBLANK_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isBLANK_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isBLANK_LC_utf8(unsigned char * p)
+test_isBLANK_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isBLANK_LC_utf8(p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isBLANK_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isBLANK_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -4442,9 +4462,17 @@ test_isVERTWS_uvchr(UV ord)
          RETVAL
  
  bool
-test_isVERTWS_utf8(unsigned char * p)
+test_isVERTWS_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isVERTWS_utf8(p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isVERTWS_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isVERTWS_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -4498,16 +4526,32 @@ test_isUPPER_LC(UV ord)
          RETVAL
  
  bool
-test_isUPPER_utf8(unsigned char * p)
+test_isUPPER_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isUPPER_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isUPPER_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isUPPER_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isUPPER_LC_utf8(unsigned char * p)
+test_isUPPER_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isUPPER_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isUPPER_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isUPPER_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -4561,16 +4605,32 @@ test_isLOWER_LC(UV ord)
          RETVAL
  
  bool
-test_isLOWER_utf8(unsigned char * p)
+test_isLOWER_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isLOWER_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isLOWER_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isLOWER_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isLOWER_LC_utf8(unsigned char * p)
+test_isLOWER_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isLOWER_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isLOWER_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isLOWER_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -4624,16 +4684,32 @@ test_isALPHA_LC(UV ord)
          RETVAL
  
  bool
-test_isALPHA_utf8(unsigned char * p)
+test_isALPHA_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isALPHA_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isALPHA_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isALPHA_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isALPHA_LC_utf8(unsigned char * p)
+test_isALPHA_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isALPHA_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isALPHA_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isALPHA_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -4687,16 +4763,32 @@ test_isWORDCHAR_LC(UV ord)
          RETVAL
  
  bool
-test_isWORDCHAR_utf8(unsigned char * p)
+test_isWORDCHAR_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isWORDCHAR_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isWORDCHAR_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isWORDCHAR_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isWORDCHAR_LC_utf8(unsigned char * p)
+test_isWORDCHAR_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isWORDCHAR_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isWORDCHAR_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isWORDCHAR_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -4750,16 +4842,32 @@ test_isALPHANUMERIC_LC(UV ord)
          RETVAL
  
  bool
-test_isALPHANUMERIC_utf8(unsigned char * p)
+test_isALPHANUMERIC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isALPHANUMERIC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isALPHANUMERIC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isALPHANUMERIC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isALPHANUMERIC_LC_utf8(unsigned char * p)
+test_isALPHANUMERIC_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isALPHANUMERIC_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isALPHANUMERIC_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isALPHANUMERIC_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -4792,16 +4900,32 @@ test_isALNUM_LC(UV ord)
          RETVAL
  
  bool
-test_isALNUM_utf8(unsigned char * p)
+test_isALNUM_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isALNUM_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isWORDCHAR_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isWORDCHAR_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isALNUM_LC_utf8(unsigned char * p)
+test_isALNUM_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isALNUM_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isWORDCHAR_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isWORDCHAR_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -4827,16 +4951,32 @@ test_isDIGIT_LC_uvchr(UV ord)
          RETVAL
  
  bool
-test_isDIGIT_utf8(unsigned char * p)
+test_isDIGIT_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isDIGIT_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isDIGIT_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isDIGIT_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isDIGIT_LC_utf8(unsigned char * p)
+test_isDIGIT_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isDIGIT_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isDIGIT_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isDIGIT_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -4939,16 +5079,32 @@ test_isIDFIRST_LC(UV ord)
          RETVAL
  
  bool
-test_isIDFIRST_utf8(unsigned char * p)
+test_isIDFIRST_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isIDFIRST_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isIDFIRST_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isIDFIRST_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isIDFIRST_LC_utf8(unsigned char * p)
+test_isIDFIRST_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isIDFIRST_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isIDFIRST_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isIDFIRST_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -5002,16 +5158,32 @@ test_isIDCONT_LC(UV ord)
          RETVAL
  
  bool
-test_isIDCONT_utf8(unsigned char * p)
+test_isIDCONT_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isIDCONT_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isIDCONT_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isIDCONT_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isIDCONT_LC_utf8(unsigned char * p)
+test_isIDCONT_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isIDCONT_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isIDCONT_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isIDCONT_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -5065,16 +5237,32 @@ test_isSPACE_LC(UV ord)
          RETVAL
  
  bool
-test_isSPACE_utf8(unsigned char * p)
+test_isSPACE_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isSPACE_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isSPACE_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isSPACE_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isSPACE_LC_utf8(unsigned char * p)
+test_isSPACE_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isSPACE_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isSPACE_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isSPACE_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -5128,16 +5316,32 @@ test_isASCII_LC(UV ord)
          RETVAL
  
  bool
-test_isASCII_utf8(unsigned char * p)
+test_isASCII_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isASCII_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isASCII_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isASCII_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isASCII_LC_utf8(unsigned char * p)
+test_isASCII_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isASCII_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isASCII_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isASCII_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -5191,16 +5395,32 @@ test_isCNTRL_LC(UV ord)
          RETVAL
  
  bool
-test_isCNTRL_utf8(unsigned char * p)
+test_isCNTRL_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isCNTRL_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isCNTRL_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isCNTRL_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isCNTRL_LC_utf8(unsigned char * p)
+test_isCNTRL_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isCNTRL_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isCNTRL_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isCNTRL_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -5254,16 +5474,32 @@ test_isPRINT_LC(UV ord)
          RETVAL
  
  bool
-test_isPRINT_utf8(unsigned char * p)
+test_isPRINT_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isPRINT_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isPRINT_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isPRINT_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isPRINT_LC_utf8(unsigned char * p)
+test_isPRINT_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isPRINT_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isPRINT_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isPRINT_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -5317,16 +5553,32 @@ test_isGRAPH_LC(UV ord)
          RETVAL
  
  bool
-test_isGRAPH_utf8(unsigned char * p)
+test_isGRAPH_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isGRAPH_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isGRAPH_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isGRAPH_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isGRAPH_LC_utf8(unsigned char * p)
+test_isGRAPH_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isGRAPH_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isGRAPH_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isGRAPH_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -5380,16 +5632,32 @@ test_isPUNCT_LC(UV ord)
          RETVAL
  
  bool
-test_isPUNCT_utf8(unsigned char * p)
+test_isPUNCT_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isPUNCT_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isPUNCT_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isPUNCT_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isPUNCT_LC_utf8(unsigned char * p)
+test_isPUNCT_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isPUNCT_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isPUNCT_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isPUNCT_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -5443,16 +5711,32 @@ test_isXDIGIT_LC(UV ord)
          RETVAL
  
  bool
-test_isXDIGIT_utf8(unsigned char * p)
+test_isXDIGIT_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isXDIGIT_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isXDIGIT_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isXDIGIT_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isXDIGIT_LC_utf8(unsigned char * p)
+test_isXDIGIT_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isXDIGIT_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isXDIGIT_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isXDIGIT_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
@@ -5506,16 +5790,32 @@ test_isPSXSPC_LC(UV ord)
          RETVAL
  
  bool
-test_isPSXSPC_utf8(unsigned char * p)
+test_isPSXSPC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isPSXSPC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isPSXSPC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isPSXSPC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
  bool
-test_isPSXSPC_LC_utf8(unsigned char * p)
+test_isPSXSPC_LC_utf8(unsigned char * p, int type)
+    PREINIT:
+       const unsigned char * e;
      CODE:
-        RETVAL = isPSXSPC_LC_utf8( p);
+        if (type >= 0) {
+            e = p + UTF8SKIP(p) - type;
+            RETVAL = isPSXSPC_LC_utf8_safe(p, e);
+        }
+        else {
+            RETVAL = isPSXSPC_LC_utf8(p);
+        }
      OUTPUT:
          RETVAL
  
diff --git a/ext/XS-APItest/t/handy.t b/ext/XS-APItest/t/handy.t

index b08e814..036b9c1 100644 (file)
--- a/ext/XS-APItest/t/handy.t
+++ b/ext/XS-APItest/t/handy.t
@@ -104,6 +104,31 @@ sub get_display_locale_or_skip($$) {
      return (" ($locale)", 1);
  }
  
+sub try_malforming($$$)
+{
+    # Determines if the tests for malformed UTF-8 should be done.  When done,
+    # the .xs code creates malformations by pretending the length is shorter
+    # than it actually is.  Some things can't be malformed, and sometimes this
+    # test knows that the current code doesn't look for a malformation under
+    # various circumstances.
+
+    my ($i, $function, $using_locale) = @_;
+
+    # Single bytes can't be malformed
+    return 0 if $i < ((ord "A" == 65) ? 128 : 160);
+
+    # ASCII doesn't need to ever look beyond the first byte.
+    return 0 if $function eq "ASCII";
+
+    # No controls above 255, so the code doesn't look at those
+    return 0 if $i > 255 && $function eq "CNTRL";
+
+    # No non-ASCII digits below 256, except if using locales.
+    return 0 if $i < 256 && ! $using_locale && $function =~ /X?DIGIT/;
+
+    return 1;
+}
+
  my %properties = (
                     # name => Lookup-property name
                     alnum => 'Word',
@@ -131,6 +156,11 @@ my %properties = (
  my @warnings;
  local $SIG{__WARN__} = sub { push @warnings, @_ };
  
+my %utf8_param_code = (
+                        "_safe"                 =>  0,
+                        "_safe, malformed"      =>  1,
+                        "unsafe"                => -1,
+                      );
  
  foreach my $name (sort keys %properties, 'octal') {
      my @invlist;
@@ -282,13 +312,42 @@ foreach my $name (sort keys %properties, 'octal') {
                          $truth = $matches;
                      }
  
-                        my $display_call = "is${function}$suffix("
-                                         . " $display_name )$display_locale";
-                        $ret = truth eval "test_is${function}$suffix('$char')";
-                        if (is ($@, "", "$display_call didn't give error")) {
+                    foreach my $utf8_param("_safe",
+                                           "_safe, malformed",
+                                           "unsafe"
+                                          )
+                    {
+                        my $utf8_param_code = $utf8_param_code{$utf8_param};
+                        my $expect_error = $utf8_param_code > 0;
+                        next if      $expect_error
+                                && ! try_malforming($i, $function, $suffix =~ /LC/);
+
+                        my $display_call = "is${function}$suffix( $display_name"
+                                         . ", $utf8_param )$display_locale";
+                        $ret = truth eval "test_is${function}$suffix('$char',"
+                                        . " $utf8_param_code)";
+                        if ($expect_error) {
+                            isnt ($@, "",
+                                    "expected and got error in $display_call");
+                            like($@, qr/Malformed UTF-8 character/,
+                                "${tab}And got expected message");
+                            if (is (@warnings, 1,
+                                           "${tab}Got a single warning besides"))
+                            {
+                                like($warnings[0],
+                                     qr/Malformed UTF-8 character.*short/,
+                                     "${tab}Got expected warning");
+                            }
+                            else {
+                                diag("@warnings");
+                            }
+                            undef @warnings;
+                        }
+                        elsif (is ($@, "", "$display_call didn't give error")) {
                              is ($ret, $truth,
                                  "${tab}And correctly returned $truth");
                          }
+                    }
                  }
              }
          }
diff --git a/handy.h b/handy.h

index 848050f..625343b 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -565,10 +565,24 @@ to determine if it is in the character class.  For example,
  C<isWORDCHAR_uvchr(0x100)> returns TRUE, since 0x100 is LATIN CAPITAL LETTER A
  WITH MACRON in Unicode, and is a word character.
  
-Variant C<isFOO_utf8> is like C<isFOO_uvchr>, but the input is a pointer to a
-(known to be well-formed) UTF-8 encoded string (C<U8*> or C<char*>, and
-possibly containing embedded C<NUL> characters).  The classification of just
-the first (possibly multi-byte) character in the string is tested.
+Variant C<isFOO_utf8_safe> is like C<isFOO_uvchr>, but is used for UTF-8
+encoded strings.  Each call classifies one character, even if the string
+contains many.  This variant takes two parameters.  The first, C<p>, is a
+pointer to the first byte of the character to be classified.  (Recall that it
+may take more than one byte to represent a character in UTF-8 strings.)  The
+second parameter, C<e>, points to anywhere in the string beyond the first
+character, up to one byte past the end of the entire string.  The suffix
+C<_safe> in the function's name indicates that it will not attempt to read
+beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is true (this
+is asserted for in C<-DDEBUGGING> builds).  If the UTF-8 for the input
+character is malformed in some way, the program may croak, or the function may
+return FALSE, at the discretion of the implementation, and subject to change in
+future releases.
+
+Variant C<isFOO_utf8> is like C<isFOO_utf8_safe>, but takes just a single
+parameter, C<p>, which has the same meaning as the corresponding parameter does
+in C<isFOO_utf8_safe>.  The function therefore can't check if it is reading
+beyond the end of the string.
  
  Variant C<isFOO_LC> is like the C<isFOO_A> and C<isFOO_L1> variants, but the
  result is based on the current locale, which is what C<LC> in the name stands
@@ -584,18 +598,32 @@ Variant C<isFOO_LC_uvchr> is like C<isFOO_LC>, but is defined on any UV.  It
  returns the same as C<isFOO_LC> for input code points less than 256, and
  returns the hard-coded, not-affected-by-locale, Unicode results for larger ones.
  
-Variant C<isFOO_LC_utf8> is like C<isFOO_LC_uvchr>, but the input is a pointer
-to a (known to be well-formed) UTF-8 encoded string (C<U8*> or C<char*>, and
-possibly containing embedded C<NUL> characters).  The classification of just
-the first (possibly multi-byte) character in the string is tested.
+Variant C<isFOO_LC_utf8_safe> is like C<isFOO_LC_uvchr>, but is used for UTF-8
+encoded strings.  Each call classifies one character, even if the string
+contains many.  This variant takes two parameters.  The first, C<p>, is a
+pointer to the first byte of the character to be classified.  (Recall that it
+may take more than one byte to represent a character in UTF-8 strings.) The
+second parameter, C<e>, points to anywhere in the string beyond the first
+character, up to one byte past the end of the entire string.  The suffix
+C<_safe> in the function's name indicates that it will not attempt to read
+beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is true (this
+is asserted for in C<-DDEBUGGING> builds).  If the UTF-8 for the input
+character is malformed in some way, the program may croak, or the function may
+return FALSE, at the discretion of the implementation, and subject to change in
+future releases.
+
+Variant C<isFOO_LC_utf8> is like C<isFOO_LC_utf8_safe>, but takes just a single
+parameter, C<p>, which has the same meaning as the corresponding parameter does
+in C<isFOO_LC_utf8_safe>.  The function therefore can't check if it is reading
+beyond the end of the string.
  
  =for apidoc Am|bool|isALPHA|char ch
  Returns a boolean indicating whether the specified character is an
  alphabetic character, analogous to C<m/[[:alpha:]]/>.
  See the L<top of this section|/Character classification> for an explanation of
  variants
-C<isALPHA_A>, C<isALPHA_L1>, C<isALPHA_uvchr>, C<isALPHA_utf8>, C<isALPHA_LC>,
-C<isALPHA_LC_uvchr>, and C<isALPHA_LC_utf8>.
+C<isALPHA_A>, C<isALPHA_L1>, C<isALPHA_uvchr>, C<isALPHA_utf8_safe>,
+C<isALPHA_LC>, C<isALPHA_LC_uvchr>, and C<isALPHA_LC_utf8_safe>.
  
  =for apidoc Am|bool|isALPHANUMERIC|char ch
  Returns a boolean indicating whether the specified character is either an
@@ -603,8 +631,8 @@ alphabetic character or decimal digit, analogous to C<m/[[:alnum:]]/>.
  See the L<top of this section|/Character classification> for an explanation of
  variants
  C<isALPHANUMERIC_A>, C<isALPHANUMERIC_L1>, C<isALPHANUMERIC_uvchr>,
-C<isALPHANUMERIC_utf8>, C<isALPHANUMERIC_LC>, C<isALPHANUMERIC_LC_uvchr>, and
-C<isALPHANUMERIC_LC_utf8>.
+C<isALPHANUMERIC_utf8_safe>, C<isALPHANUMERIC_LC>, C<isALPHANUMERIC_LC_uvchr>,
+and C<isALPHANUMERIC_LC_utf8_safe>.
  
  =for apidoc Am|bool|isASCII|char ch
  Returns a boolean indicating whether the specified character is one of the 128
@@ -614,36 +642,36 @@ character corresponds to an ASCII character.  Variants C<isASCII_A()> and
  C<isASCII_L1()> are identical to C<isASCII()>.
  See the L<top of this section|/Character classification> for an explanation of
  variants
-C<isASCII_uvchr>, C<isASCII_utf8>, C<isASCII_LC>, C<isASCII_LC_uvchr>, and
-C<isASCII_LC_utf8>.  Note, however, that some platforms do not have the C
+C<isASCII_uvchr>, C<isASCII_utf8_safe>, C<isASCII_LC>, C<isASCII_LC_uvchr>, and
+C<isASCII_LC_utf8_safe>.  Note, however, that some platforms do not have the C
  library routine C<isascii()>.  In these cases, the variants whose names contain
  C<LC> are the same as the corresponding ones without.
  
  Also note, that because all ASCII characters are UTF-8 invariant (meaning they
  have the exact same representation (always a single byte) whether encoded in
  UTF-8 or not), C<isASCII> will give the correct results when called with any
-byte in any string encoded or not in UTF-8.  And similarly C<isASCII_utf8> will
-work properly on any string encoded or not in UTF-8.
+byte in any string encoded or not in UTF-8.  And similarly C<isASCII_utf8_safe>
+will work properly on any string encoded or not in UTF-8.
  
  =for apidoc Am|bool|isBLANK|char ch
  Returns a boolean indicating whether the specified character is a
  character considered to be a blank, analogous to C<m/[[:blank:]]/>.
  See the L<top of this section|/Character classification> for an explanation of
  variants
-C<isBLANK_A>, C<isBLANK_L1>, C<isBLANK_uvchr>, C<isBLANK_utf8>, C<isBLANK_LC>,
-C<isBLANK_LC_uvchr>, and C<isBLANK_LC_utf8>.  Note, however, that some
-platforms do not have the C library routine C<isblank()>.  In these cases, the
-variants whose names contain C<LC> are the same as the corresponding ones
-without.
+C<isBLANK_A>, C<isBLANK_L1>, C<isBLANK_uvchr>, C<isBLANK_utf8_safe>,
+C<isBLANK_LC>, C<isBLANK_LC_uvchr>, and C<isBLANK_LC_utf8_safe>.  Note,
+however, that some platforms do not have the C library routine
+C<isblank()>.  In these cases, the variants whose names contain C<LC> are
+the same as the corresponding ones without.
  
  =for apidoc Am|bool|isCNTRL|char ch
  Returns a boolean indicating whether the specified character is a
  control character, analogous to C<m/[[:cntrl:]]/>.
  See the L<top of this section|/Character classification> for an explanation of
  variants
-C<isCNTRL_A>, C<isCNTRL_L1>, C<isCNTRL_uvchr>, C<isCNTRL_utf8>, C<isCNTRL_LC>,
-C<isCNTRL_LC_uvchr>, and C<isCNTRL_LC_utf8>
-On EBCDIC platforms, you almost always want to use the C<isCNTRL_L1> variant.
+C<isCNTRL_A>, C<isCNTRL_L1>, C<isCNTRL_uvchr>, C<isCNTRL_utf8_safe>,
+C<isCNTRL_LC>, C<isCNTRL_LC_uvchr>, and C<isCNTRL_LC_utf8_safe>.  On EBCDIC
+platforms, you almost always want to use the C<isCNTRL_L1> variant.
  
  =for apidoc Am|bool|isDIGIT|char ch
  Returns a boolean indicating whether the specified character is a
@@ -651,24 +679,23 @@ digit, analogous to C<m/[[:digit:]]/>.
  Variants C<isDIGIT_A> and C<isDIGIT_L1> are identical to C<isDIGIT>.
  See the L<top of this section|/Character classification> for an explanation of
  variants
-C<isDIGIT_uvchr>, C<isDIGIT_utf8>, C<isDIGIT_LC>, C<isDIGIT_LC_uvchr>, and
-C<isDIGIT_LC_utf8>.
+C<isDIGIT_uvchr>, C<isDIGIT_utf8_safe>, C<isDIGIT_LC>, C<isDIGIT_LC_uvchr>, and
+C<isDIGIT_LC_utf8_safe>.
  
  =for apidoc Am|bool|isGRAPH|char ch
  Returns a boolean indicating whether the specified character is a
  graphic character, analogous to C<m/[[:graph:]]/>.
  See the L<top of this section|/Character classification> for an explanation of
-variants
-C<isGRAPH_A>, C<isGRAPH_L1>, C<isGRAPH_uvchr>, C<isGRAPH_utf8>, C<isGRAPH_LC>,
-C<isGRAPH_LC_uvchr>, and C<isGRAPH_LC_utf8>.
+variants C<isGRAPH_A>, C<isGRAPH_L1>, C<isGRAPH_uvchr>, C<isGRAPH_utf8_safe>,
+C<isGRAPH_LC>, C<isGRAPH_LC_uvchr>, and C<isGRAPH_LC_utf8_safe>.
  
  =for apidoc Am|bool|isLOWER|char ch
  Returns a boolean indicating whether the specified character is a
  lowercase character, analogous to C<m/[[:lower:]]/>.
  See the L<top of this section|/Character classification> for an explanation of
  variants
-C<isLOWER_A>, C<isLOWER_L1>, C<isLOWER_uvchr>, C<isLOWER_utf8>, C<isLOWER_LC>,
-C<isLOWER_LC_uvchr>, and C<isLOWER_LC_utf8>.
+C<isLOWER_A>, C<isLOWER_L1>, C<isLOWER_uvchr>, C<isLOWER_utf8_safe>,
+C<isLOWER_LC>, C<isLOWER_LC_uvchr>, and C<isLOWER_LC_utf8_safe>.
  
  =for apidoc Am|bool|isOCTAL|char ch
  Returns a boolean indicating whether the specified character is an
@@ -683,9 +710,8 @@ Note that the definition of what is punctuation isn't as
  straightforward as one might desire.  See L<perlrecharclass/POSIX Character
  Classes> for details.
  See the L<top of this section|/Character classification> for an explanation of
-variants
-C<isPUNCT_A>, C<isPUNCT_L1>, C<isPUNCT_uvchr>, C<isPUNCT_utf8>, C<isPUNCT_LC>,
-C<isPUNCT_LC_uvchr>, and C<isPUNCT_LC_utf8>.
+variants C<isPUNCT_A>, C<isPUNCT_L1>, C<isPUNCT_uvchr>, C<isPUNCT_utf8_safe>,
+C<isPUNCT_LC>, C<isPUNCT_LC_uvchr>, and C<isPUNCT_LC_utf8_safe>.
  
  =for apidoc Am|bool|isSPACE|char ch
  Returns a boolean indicating whether the specified character is a
@@ -698,8 +724,8 @@ in the non-locale variants, was that C<isSPACE()> did not match a vertical tab.
  (See L</isPSXSPC> for a macro that matches a vertical tab in all releases.)
  See the L<top of this section|/Character classification> for an explanation of
  variants
-C<isSPACE_A>, C<isSPACE_L1>, C<isSPACE_uvchr>, C<isSPACE_utf8>, C<isSPACE_LC>,
-C<isSPACE_LC_uvchr>, and C<isSPACE_LC_utf8>.
+C<isSPACE_A>, C<isSPACE_L1>, C<isSPACE_uvchr>, C<isSPACE_utf8_safe>,
+C<isSPACE_LC>, C<isSPACE_LC_uvchr>, and C<isSPACE_LC_utf8_safe>.
  
  =for apidoc Am|bool|isPSXSPC|char ch
  (short for Posix Space)
@@ -712,24 +738,23 @@ C<isSPACE()> forms don't match a Vertical Tab, and the C<isPSXSPC()> forms do.
  Otherwise they are identical.  Thus this macro is analogous to what
  C<m/[[:space:]]/> matches in a regular expression.
  See the L<top of this section|/Character classification> for an explanation of
-variants C<isPSXSPC_A>, C<isPSXSPC_L1>, C<isPSXSPC_uvchr>, C<isPSXSPC_utf8>,
-C<isPSXSPC_LC>, C<isPSXSPC_LC_uvchr>, and C<isPSXSPC_LC_utf8>.
+variants C<isPSXSPC_A>, C<isPSXSPC_L1>, C<isPSXSPC_uvchr>, C<isPSXSPC_utf8_safe>,
+C<isPSXSPC_LC>, C<isPSXSPC_LC_uvchr>, and C<isPSXSPC_LC_utf8_safe>.
  
  =for apidoc Am|bool|isUPPER|char ch
  Returns a boolean indicating whether the specified character is an
  uppercase character, analogous to C<m/[[:upper:]]/>.
  See the L<top of this section|/Character classification> for an explanation of
-variants
-C<isUPPER_A>, C<isUPPER_L1>, C<isUPPER_uvchr>, C<isUPPER_utf8>, C<isUPPER_LC>,
-C<isUPPER_LC_uvchr>, and C<isUPPER_LC_utf8>.
+variants C<isUPPER_A>, C<isUPPER_L1>, C<isUPPER_uvchr>, C<isUPPER_utf8_safe>,
+C<isUPPER_LC>, C<isUPPER_LC_uvchr>, and C<isUPPER_LC_utf8_safe>.
  
  =for apidoc Am|bool|isPRINT|char ch
  Returns a boolean indicating whether the specified character is a
  printable character, analogous to C<m/[[:print:]]/>.
  See the L<top of this section|/Character classification> for an explanation of
  variants
-C<isPRINT_A>, C<isPRINT_L1>, C<isPRINT_uvchr>, C<isPRINT_utf8>, C<isPRINT_LC>,
-C<isPRINT_LC_uvchr>, and C<isPRINT_LC_utf8>.
+C<isPRINT_A>, C<isPRINT_L1>, C<isPRINT_uvchr>, C<isPRINT_utf8_safe>,
+C<isPRINT_LC>, C<isPRINT_LC_uvchr>, and C<isPRINT_LC_utf8_safe>.
  
  =for apidoc Am|bool|isWORDCHAR|char ch
  Returns a boolean indicating whether the specified character is a character
@@ -741,10 +766,10 @@ C<isALNUM()> is a synonym provided for backward compatibility, even though a
  word character includes more than the standard C language meaning of
  alphanumeric.
  See the L<top of this section|/Character classification> for an explanation of
-variants
-C<isWORDCHAR_A>, C<isWORDCHAR_L1>, C<isWORDCHAR_uvchr>, and C<isWORDCHAR_utf8>.
-C<isWORDCHAR_LC>, C<isWORDCHAR_LC_uvchr>, and C<isWORDCHAR_LC_utf8> are also as
-described there, but additionally include the platform's native underscore.
+variants C<isWORDCHAR_A>, C<isWORDCHAR_L1>, C<isWORDCHAR_uvchr>, and
+C<isWORDCHAR_utf8_safe>.  C<isWORDCHAR_LC>, C<isWORDCHAR_LC_uvchr>, and
+C<isWORDCHAR_LC_utf8_safe> are also as described there, but additionally
+include the platform's native underscore.
  
  =for apidoc Am|bool|isXDIGIT|char ch
  Returns a boolean indicating whether the specified character is a hexadecimal
@@ -752,8 +777,8 @@ digit.  In the ASCII range these are C<[0-9A-Fa-f]>.  Variants C<isXDIGIT_A()>
  and C<isXDIGIT_L1()> are identical to C<isXDIGIT()>.
  See the L<top of this section|/Character classification> for an explanation of
  variants
-C<isXDIGIT_uvchr>, C<isXDIGIT_utf8>, C<isXDIGIT_LC>, C<isXDIGIT_LC_uvchr>, and
-C<isXDIGIT_LC_utf8>.
+C<isXDIGIT_uvchr>, C<isXDIGIT_utf8_safe>, C<isXDIGIT_LC>, C<isXDIGIT_LC_uvchr>,
+and C<isXDIGIT_LC_utf8_safe>.
  
  =for apidoc Am|bool|isIDFIRST|char ch
  Returns a boolean indicating whether the specified character can be the first
@@ -762,8 +787,8 @@ the official Unicode property C<XID_Start>.  The difference is that this
  returns true only if the input character also matches L</isWORDCHAR>.
  See the L<top of this section|/Character classification> for an explanation of
  variants
-C<isIDFIRST_A>, C<isIDFIRST_L1>, C<isIDFIRST_uvchr>, C<isIDFIRST_utf8>,
-C<isIDFIRST_LC>, C<isIDFIRST_LC_uvchr>, and C<isIDFIRST_LC_utf8>.
+C<isIDFIRST_A>, C<isIDFIRST_L1>, C<isIDFIRST_uvchr>, C<isIDFIRST_utf8_safe>,
+C<isIDFIRST_LC>, C<isIDFIRST_LC_uvchr>, and C<isIDFIRST_LC_utf8_safe>.
  
  =for apidoc Am|bool|isIDCONT|char ch
  Returns a boolean indicating whether the specified character can be the
@@ -773,8 +798,8 @@ difference is that this returns true only if the input character also matches
  L</isWORDCHAR>.  See the L<top of this section|/Character classification> for
  an
  explanation of variants C<isIDCONT_A>, C<isIDCONT_L1>, C<isIDCONT_uvchr>,
-C<isIDCONT_utf8>, C<isIDCONT_LC>, C<isIDCONT_LC_uvchr>, and
-C<isIDCONT_LC_utf8>.
+C<isIDCONT_utf8_safe>, C<isIDCONT_LC>, C<isIDCONT_LC_uvchr>, and
+C<isIDCONT_LC_utf8_safe>.
  
  =head1 Miscellaneous Functions
  
@@ -1684,14 +1709,59 @@ END_EXTERN_C
                                                                     *((p)+1 )), \
                                                  classnum)                      \
                                             : utf8)
+
+/* The "_safe" macros make sure that we don't attempt to read beyond 'e', but
+ * they don't otherwise go out of their way to look for malformed UTF-8.  If
+ * they can return accurate results without knowing if the input is otherwise
+ * malformed, they do so.  For example isASCII is accurate in spite of any
+ * non-length malformations because it looks only at a single byte. Likewise
+ * isDIGIT looks just at the first byte for code points 0-255, as all UTF-8
+ * variant ones return FALSE.  But, if the input has to be well-formed in order
+ * for the results to be accurate, the macros will test and if malformed will
+ * call a routine to die.
+ *
+ * Except for toke.c, the macros do assume that e > p, asserting that on
+ * DEBUGGING builds.  Much code that calls these depends on this being true,
+ * for other reasons.  toke.c is treated specially as using the regular
+ * assertion breaks it in many ways.  All strings that these operate on there
+ * are supposed to have an extra NUL character at the end, so that *e == '\0'.
+ * A bunch of code in toke.c assumes that this is true, so the assertion
+ * allows for that. */
+#ifdef PERL_IN_TOKE_C
+#  define _utf8_safe_assert(p,e) ((e) > (p) || ((e) == (p) && *(p) == '\0'))
+#else
+#  define _utf8_safe_assert(p,e) ((e) > (p))
+#endif
+
+#define _generic_utf8_safe(classnum, p, e, above_latin1)                    \
+         (__ASSERT_(_utf8_safe_assert(p, e))                                \
+         (UTF8_IS_INVARIANT(*(p)))                                          \
+          ? _generic_isCC(*(p), classnum)                                   \
+          : (UTF8_IS_DOWNGRADEABLE_START(*(p))                              \
+             ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1))))   \
+                ? _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )),  \
+                                classnum)                                   \
+                : (_force_out_malformed_utf8_message(                       \
+                                        (U8 *) (p), (U8 *) (e), 0, 1), 0))  \
+             : above_latin1))
  /* Like the above, but calls 'above_latin1(p)' to get the utf8 value.
   * 'above_latin1' can be a macro */
  #define _generic_func_utf8(classnum, above_latin1, p)  \
                                      _generic_utf8(classnum, p, above_latin1(p))
+#define _generic_func_utf8_safe(classnum, above_latin1, p, e)               \
+                    _generic_utf8_safe(classnum, p, e, above_latin1(p, e))
+#define _generic_non_swash_utf8_safe(classnum, above_latin1, p, e)          \
+          _generic_utf8_safe(classnum, p, e,                                \
+                             (UNLIKELY((e) - (p) < UTF8SKIP(p))             \
+                              ? (_force_out_malformed_utf8_message(         \
+                                      (U8 *) (p), (U8 *) (e), 0, 1), 0)     \
+                              : above_latin1(p)))
  /* Like the above, but passes classnum to _isFOO_utf8(), instead of having an
   * 'above_latin1' parameter */
  #define _generic_swash_utf8(classnum, p)  \
                        _generic_utf8(classnum, p, _is_utf8_FOO(classnum, p))
+#define _generic_swash_utf8_safe(classnum, p, e)                            \
+_generic_utf8_safe(classnum, p, e, _is_utf8_FOO_with_len(classnum, p, e))
  
  /* Like the above, but should be used only when it is known that there are no
   * characters in the upper-Latin1 range (128-255 on ASCII platforms) which the
@@ -1704,6 +1774,14 @@ END_EXTERN_C
                                             ? above_latin1                      \
                                             : 0)
  
+#define _generic_utf8_safe_no_upper_latin1(classnum, p, e, above_latin1)    \
+         (__ASSERT_(_utf8_safe_assert(p, e))                                \
+         (UTF8_IS_INVARIANT(*(p)))                                          \
+          ? _generic_isCC(*(p), classnum)                                   \
+          : (UTF8_IS_DOWNGRADEABLE_START(*(p)))                             \
+             ? 0 /* Note that doesn't check validity for latin1 */          \
+             : above_latin1)
+
  /* NOTE that some of these macros have very similar ones in regcharclass.h.
   * For example, there is (at the time of this writing) an 'is_SPACE_utf8()'
   * there, differing in name only by an underscore from the one here
@@ -1719,12 +1797,25 @@ END_EXTERN_C
                                               */
  #define isBLANK_utf8(p)        _generic_func_utf8(_CC_BLANK, is_HORIZWS_high, p)
  
+#define isALPHA_utf8_safe(p, e)  _generic_swash_utf8_safe(_CC_ALPHA, p, e)
+#define isALPHANUMERIC_utf8_safe(p, e)                                      \
+                        _generic_swash_utf8_safe(_CC_ALPHANUMERIC, p, e)
+#define isASCII_utf8_safe(p, e)                                             \
+    /* Because ASCII is invariant under utf8, the non-utf8 macro            \
+    * works */                                                              \
+    (__ASSERT_(_utf8_safe_assert(p, e)) isASCII(*(p)))
+#define isBLANK_utf8_safe(p, e)                                             \
+        _generic_non_swash_utf8_safe(_CC_BLANK, is_HORIZWS_high, p, e)
+
  #ifdef EBCDIC
      /* Because all controls are UTF-8 invariants in EBCDIC, we can use this
       * more efficient macro instead of the more general one */
  #   define isCNTRL_utf8(p)      isCNTRL_L1(*(p))
+#   define isCNTRL_utf8_safe(p, e)                                          \
+                    (__ASSERT_(_utf8_safe_assert(p, e)) isCNTRL_L1(*(p)))
  #else
-#   define isCNTRL_utf8(p)      _generic_utf8(_CC_CNTRL, p, 0)
+#   define isCNTRL_utf8(p)          _generic_utf8(_CC_CNTRL, p, 0)
+#   define isCNTRL_utf8_safe(p, e)  _generic_utf8_safe(_CC_CNTRL, p, e, 0)
  #endif
  
  #define isDIGIT_utf8(p)         _generic_utf8_no_upper_latin1(_CC_DIGIT, p,   \
@@ -1732,6 +1823,12 @@ END_EXTERN_C
  #define isGRAPH_utf8(p)         _generic_swash_utf8(_CC_GRAPH, p)
  #define isIDCONT_utf8(p)        _generic_func_utf8(_CC_WORDCHAR,              \
                                                    _is_utf8_perl_idcont, p)
+#define isDIGIT_utf8_safe(p, e)                                             \
+            _generic_utf8_safe_no_upper_latin1(_CC_DIGIT, p, e,             \
+                                    _is_utf8_FOO_with_len(_CC_DIGIT, p, e))
+#define isGRAPH_utf8_safe(p, e)    _generic_swash_utf8_safe(_CC_GRAPH, p, e)
+#define isIDCONT_utf8_safe(p, e)   _generic_func_utf8_safe(_CC_WORDCHAR,    \
+                                     _is_utf8_perl_idcont_with_len, p, e)
  
  /* To prevent S_scan_word in toke.c from hanging, we have to make sure that
   * IDFIRST is an alnum.  See
@@ -1752,6 +1849,27 @@ END_EXTERN_C
  #define isWORDCHAR_utf8(p)  _generic_swash_utf8(_CC_WORDCHAR, p)
  #define isXDIGIT_utf8(p)    _generic_utf8_no_upper_latin1(_CC_XDIGIT, p,     \
                                                            is_XDIGIT_high(p))
+#define isIDFIRST_utf8_safe(p, e)                                           \
+    _generic_func_utf8_safe(_CC_IDFIRST,                                    \
+                    _is_utf8_perl_idstart_with_len, (U8 *) (p), (U8 *) (e))
+
+#define isLOWER_utf8_safe(p, e)     _generic_swash_utf8_safe(_CC_LOWER, p, e)
+#define isPRINT_utf8_safe(p, e)     _generic_swash_utf8_safe(_CC_PRINT, p, e)
+#define isPSXSPC_utf8_safe(p, e)     isSPACE_utf8_safe(p, e)
+#define isPUNCT_utf8_safe(p, e)     _generic_swash_utf8_safe(_CC_PUNCT, p, e)
+#define isSPACE_utf8_safe(p, e)                                             \
+    _generic_non_swash_utf8_safe(_CC_SPACE, is_XPERLSPACE_high, p, e)
+#define isUPPER_utf8_safe(p, e)  _generic_swash_utf8_safe(_CC_UPPER, p, e)
+#define isVERTWS_utf8_safe(p, e)                                            \
+        _generic_non_swash_utf8_safe(_CC_VERTSPACE, is_VERTWS_high, p, e)
+#define isWORDCHAR_utf8_safe(p, e)                                          \
+                             _generic_swash_utf8_safe(_CC_WORDCHAR, p, e)
+#define isXDIGIT_utf8_safe(p, e)                                            \
+                   _generic_utf8_safe_no_upper_latin1(_CC_XDIGIT, p, e,     \
+                             (UNLIKELY((e) - (p) < UTF8SKIP(p))             \
+                              ? (_force_out_malformed_utf8_message(         \
+                                      (U8 *) (p), (U8 *) (e), 0, 1), 0)     \
+                              : is_XDIGIT_high(p)))
  
  #define toFOLD_utf8(p,s,l)     to_utf8_fold(p,s,l)
  #define toLOWER_utf8(p,s,l)    to_utf8_lower(p,s,l)
@@ -1798,6 +1916,71 @@ END_EXTERN_C
                                                              _CC_WORDCHAR, p)
  #define isXDIGIT_LC_utf8(p)   _generic_LC_func_utf8(isXDIGIT_LC,              \
                                                              is_XDIGIT_high, p)
+/* For internal core Perl use only: the base macros for defining macros like
+ * isALPHA_LC_utf8_safe.  These are like _generic_utf8_safe, but if the first
+ * code point in 'p' is within the 0-255 range, they use locale rules from the
+ * passed-in 'macro' parameter */
+#define _generic_LC_utf8_safe(macro, p, e, above_latin1)                    \
+         (__ASSERT_(_utf8_safe_assert(p, e))                                \
+         (UTF8_IS_INVARIANT(*(p)))                                          \
+          ? macro(*(p))                                                     \
+          : (UTF8_IS_DOWNGRADEABLE_START(*(p))                              \
+             ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1))))   \
+                ? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1)))           \
+                : (_force_out_malformed_utf8_message(                       \
+                                        (U8 *) (p), (U8 *) (e), 0, 1), 0))  \
+              : above_latin1))
+
+#define _generic_LC_swash_utf8_safe(macro, classnum, p, e)                  \
+            _generic_LC_utf8_safe(macro, p, e,                              \
+                               _is_utf8_FOO_with_len(classnum, p, e))
+
+#define _generic_LC_func_utf8_safe(macro, above_latin1, p, e)               \
+            _generic_LC_utf8_safe(macro, p, e, above_latin1(p, e))
+
+#define _generic_LC_non_swash_utf8_safe(macro, above_latin1, p, e)          \
+          _generic_LC_utf8_safe(macro, p, e,                                \
+                             (UNLIKELY((e) - (p) < UTF8SKIP(p))             \
+                              ? (_force_out_malformed_utf8_message(         \
+                                      (U8 *) (p), (U8 *) (e), 0, 1), 0)     \
+                              : above_latin1(p)))
+
+#define isALPHANUMERIC_LC_utf8_safe(p, e)                                   \
+            _generic_LC_swash_utf8_safe(isALPHANUMERIC_LC,                  \
+                                        _CC_ALPHANUMERIC, p, e)
+#define isALPHA_LC_utf8_safe(p, e)                                          \
+            _generic_LC_swash_utf8_safe(isALPHA_LC, _CC_ALPHA, p, e)
+#define isASCII_LC_utf8_safe(p, e)                                          \
+                    (__ASSERT_(_utf8_safe_assert(p, e)) isASCII_LC(*(p)))
+#define isBLANK_LC_utf8_safe(p, e)                                          \
+        _generic_LC_non_swash_utf8_safe(isBLANK_LC, is_HORIZWS_high, p, e)
+#define isCNTRL_LC_utf8_safe(p, e)                                          \
+            _generic_LC_utf8_safe(isCNTRL_LC, p, e, 0)
+#define isDIGIT_LC_utf8_safe(p, e)                                          \
+            _generic_LC_swash_utf8_safe(isDIGIT_LC, _CC_DIGIT, p, e)
+#define isGRAPH_LC_utf8_safe(p, e)                                          \
+            _generic_LC_swash_utf8_safe(isGRAPH_LC, _CC_GRAPH, p, e)
+#define isIDCONT_LC_utf8_safe(p, e)                                         \
+            _generic_LC_func_utf8_safe(isIDCONT_LC,                         \
+                                _is_utf8_perl_idcont_with_len, p, e)
+#define isIDFIRST_LC_utf8_safe(p, e)                                        \
+            _generic_LC_func_utf8_safe(isIDFIRST_LC,                        \
+                                _is_utf8_perl_idstart_with_len, p, e)
+#define isLOWER_LC_utf8_safe(p, e)                                          \
+            _generic_LC_swash_utf8_safe(isLOWER_LC, _CC_LOWER, p, e)
+#define isPRINT_LC_utf8_safe(p, e)                                          \
+            _generic_LC_swash_utf8_safe(isPRINT_LC, _CC_PRINT, p, e)
+#define isPSXSPC_LC_utf8_safe(p, e)    isSPACE_LC_utf8_safe(p, e)
+#define isPUNCT_LC_utf8_safe(p, e)                                          \
+            _generic_LC_swash_utf8_safe(isPUNCT_LC, _CC_PUNCT, p, e)
+#define isSPACE_LC_utf8_safe(p, e)                                          \
+    _generic_LC_non_swash_utf8_safe(isSPACE_LC, is_XPERLSPACE_high, p, e)
+#define isUPPER_LC_utf8_safe(p, e)                                          \
+            _generic_LC_swash_utf8_safe(isUPPER_LC, _CC_UPPER, p, e)
+#define isWORDCHAR_LC_utf8_safe(p, e)                                       \
+            _generic_LC_swash_utf8_safe(isWORDCHAR_LC, _CC_WORDCHAR, p, e)
+#define isXDIGIT_LC_utf8_safe(p, e)                                         \
+        _generic_LC_non_swash_utf8_safe(isXDIGIT_LC, is_XDIGIT_high, p, e)
  
  /* Macros for backwards compatibility and for completeness when the ASCII and
   * Latin1 values are identical */
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index b6feb46..2569c69 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -327,6 +327,15 @@ well.
  
  =item *
  
+New versions of macros like C<isALPHA_utf8> have been added, each with the
+suffix C<_safe>, like C<isSPACE_utf8_safe>.  These take an extra
+parameter, giving an upper limit of how far into the string it is safe
+to read.  Using the old versions could cause attempts to read beyond the
+end of the input buffer if the UTF-8 is not well-formed.  Details are at
+L<perlapi/Character classification>.
+
+=item *
+
  Calling macros like C<isALPHA_utf8> on malformed UTF-8 have issued a
  deprecation warning since Perl v5.18.  They now die.
  
diff --git a/proto.h b/proto.h

index c7065cd..a7dc7d7 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -59,6 +59,11 @@ PERL_CALLCONV bool   Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
  #define PERL_ARGS_ASSERT__IS_UTF8_FOO  \
         assert(p)
  
+PERL_CALLCONV bool     Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN \
+       assert(p); assert(e)
+
  PERL_CALLCONV bool     Perl__is_utf8_idcont(pTHX_ const U8 *p)
                         __attribute__warn_unused_result__;
  #define PERL_ARGS_ASSERT__IS_UTF8_IDCONT       \
@@ -79,11 +84,21 @@ PERL_CALLCONV bool  Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
  #define PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT  \
         assert(p)
  
+PERL_CALLCONV bool     Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN \
+       assert(p); assert(e)
+
  PERL_CALLCONV bool     Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
                         __attribute__warn_unused_result__;
  #define PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART \
         assert(p)
  
+PERL_CALLCONV bool     Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN        \
+       assert(p); assert(e)
+
  PERL_CALLCONV bool     Perl__is_utf8_xidcont(pTHX_ const U8 *p)
                         __attribute__warn_unused_result__;
  #define PERL_ARGS_ASSERT__IS_UTF8_XIDCONT      \
@@ -5624,6 +5639,11 @@ PERL_STATIC_INLINE bool  S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, co
  #define PERL_ARGS_ASSERT_IS_UTF8_COMMON        \
         assert(p); assert(swash); assert(swashname)
  
+PERL_STATIC_INLINE bool        S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 *const e, SV **swash, const char * const swashname, SV* const invlist)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN       \
+       assert(p); assert(e); assert(swash); assert(swashname)
+
  PERL_STATIC_INLINE bool        S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
                         __attribute__warn_unused_result__;
  #define PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS      \
diff --git a/utf8.c b/utf8.c

index 6b2c128..44aada5 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -2116,7 +2116,7 @@ Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
  {
      U8 tmpbuf[UTF8_MAXBYTES+1];
      uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_FOO(classnum, tmpbuf);
+    return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf));
  }
  
  /* Internal function so we can deprecate the external one, and call
@@ -2137,7 +2137,7 @@ Perl__is_uni_perl_idcont(pTHX_ UV c)
  {
      U8 tmpbuf[UTF8_MAXBYTES+1];
      uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_perl_idcont(tmpbuf);
+    return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
  }
  
  bool
@@ -2145,7 +2145,7 @@ Perl__is_uni_perl_idstart(pTHX_ UV c)
  {
      U8 tmpbuf[UTF8_MAXBYTES+1];
      uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_perl_idstart(tmpbuf);
+    return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
  }
  
  UV
@@ -2445,6 +2445,40 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
      return swash_fetch(*swash, p, TRUE) != 0;
  }
  
+PERL_STATIC_INLINE bool
+S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e, SV **swash,
+                         const char *const swashname, SV* const invlist)
+{
+    /* Returns a boolean giving whether or not the UTF8-encoded character that
+     * starts at <p>, and extends no further than <e - 1>, is in the swash
+     * indicated by <swashname>.  <swash> points to where that swash is (to
+     * be) stored: if <*swash> is NULL, this routine generates the swash and
+     * saves it there, so that later calls reuse it.  <invlist> is NULL or an
+     * inversion list that defines the swash; if not NULL, it saves time
+     * during initialization of the swash.  Input that fails isUTF8_CHAR() is
+     * handed to _force_out_malformed_utf8_message(), not expected to return */
+
+    PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
+
+    if (! isUTF8_CHAR(p, e)) {
+        _force_out_malformed_utf8_message(p, e, 0, 1);
+        NOT_REACHED; /* NOTREACHED */
+    }
+
+    if (!*swash) {
+        U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+        *swash = _core_swash_init("utf8",
+
+                                  /* Only use the name if there is no inversion
+                                   * list; otherwise will go out to disk */
+                                  (invlist) ? "" : swashname,
+
+                                  &PL_sv_undef, 1, 0, invlist, &flags);
+    }
+
+    return swash_fetch(*swash, p, TRUE) != 0;
+}
+
  bool
  Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
  {
@@ -2459,6 +2493,21 @@ Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
  }
  
  bool
+Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
+                                                            const U8 * const e)
+{
+    PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
+
+    assert(classnum < _FIRST_NON_SWASH_CC);
+
+    return is_utf8_common_with_len(p,
+                                   e,
+                                   &PL_utf8_swash_ptrs[classnum],
+                                   swash_property_names[classnum],
+                                   PL_XPosix_ptrs[classnum]);
+}
+
+bool
  Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
  {
      SV* invlist = NULL;
@@ -2472,6 +2521,20 @@ Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
  }
  
  bool
+Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
+{
+    SV* invlist = NULL;
+
+    PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
+
+    if (! PL_utf8_perl_idstart) {
+        invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
+    }
+    return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart,
+                                      "_Perl_IDStart", invlist);
+}
+
+bool
  Perl__is_utf8_xidstart(pTHX_ const U8 *p)
  {
      PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
@@ -2495,6 +2558,20 @@ Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
  }
  
  bool
+Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
+{
+    SV* invlist = NULL;
+
+    PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
+
+    if (! PL_utf8_perl_idcont) {
+        invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
+    }
+    return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont,
+                                   "_Perl_IDCont", invlist);
+}
+
+bool
  Perl__is_utf8_idcont(pTHX_ const U8 *p)
  {
      PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
diff --git a/utf8.h b/utf8.h

index f6d9d54..5568039 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -680,6 +680,17 @@ with a ptr argument.
                                  : isWORDCHAR_utf8((const U8*)p))
  #define isALNUM_lazy_if(p,UTF)   isWORDCHAR_lazy_if(p,UTF)
  
+#define isIDFIRST_lazy_if_safe(p, e, UTF)                                   \
+                   ((IN_BYTES || !(UTF))                                    \
+                     ? isIDFIRST(*(p))                                      \
+                     : isIDFIRST_utf8_safe(p, e))
+
+#define isWORDCHAR_lazy_if_safe(p, e, UTF)                                  \
+                   ((IN_BYTES || !(UTF))                                    \
+                     ? isWORDCHAR(*(p))                                     \
+                     : isWORDCHAR_utf8_safe((U8 *) (p), (U8 *) (e)))
+
+
  #define UTF8_MAXLEN UTF8_MAXBYTES
  
  /* A Unicode character can fold to up to 3 characters */
author	Karl Williamson <khw@cpan.org>
	Thu, 15 Dec 2016 23:30:27 +0000 (16:30 -0700)
committer	Karl Williamson <khw@cpan.org>
	Fri, 23 Dec 2016 23:48:34 +0000 (16:48 -0700)
embed.fnc		patch \| blob \| blame \| history
embed.h		patch \| blob \| blame \| history
ext/XS-APItest/APItest.xs		patch \| blob \| blame \| history
ext/XS-APItest/t/handy.t		patch \| blob \| blame \| history
handy.h		patch \| blob \| blame \| history
pod/perldelta.pod		patch \| blob \| blame \| history
proto.h		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history
utf8.h		patch \| blob \| blame \| history