Fix uninitialized error in my_atof3()

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index e66c328..345d810 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -214,7 +214,7 @@ Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly
  
  This function is for code that wants any warning and/or error messages to be
  returned to the caller rather than be displayed.  All messages that would have
-been displayed if all lexcial warnings are enabled will be returned.
+been displayed if all lexical warnings are enabled will be returned.
  
  It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
  placed after all the others, C<msgs>.  If this parameter is 0, this function
@@ -387,8 +387,8 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs)
         STRLEN len  = OFFUNISKIP(uv);
         U8 *p = d+len-1;
         while (p > d) {
-           *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
-           uv >>= UTF_ACCUMULATION_SHIFT;
+           *p-- = I8_TO_NATIVE_UTF8((uv & MASK) | MARK);
+           uv >>= SHIFT;
         }
         *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
         return d+len;
@@ -1275,112 +1275,16 @@ Also implemented as a macro in utf8.h
  */
  
  UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
-                          STRLEN curlen,
-                          STRLEN *retlen,
-                          const U32 flags)
+Perl_utf8n_to_uvchr(const U8 *s,
+                    STRLEN curlen,
+                    STRLEN *retlen,
+                    const U32 flags)
  {
      PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
  
      return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
  }
  
-/* The tables below come from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/,
- * which requires this copyright notice */
-
-/* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#if 0
-static U8 utf8d_C9[] = {
-  /* The first part of the table maps bytes to character classes that
-   * to reduce the size of the transition table and create bitmasks. */
-   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/
-   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/
-   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/
-   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/
-   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/
-   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/
-   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/
-  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, /*-FF*/
-
-  /* The second part is a transition table that maps a combination
-   * of a state of the automaton and a character class to a state. */
-   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
-  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
-  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
-  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
-  12,36,12,12,12,12,12,12,12,12,12,12
-};
-
-#endif
-
-#ifndef EBCDIC
-
-/* This is a version of the above table customized for Perl that doesn't
- * exclude surrogates and accepts start bytes up through F7 (representing
- * 2**21 - 1). */
-static U8 dfa_tab_for_perl[] = {
-    /* The first part of the table maps bytes to character classes to reduce
-     * the size of the transition table and create bitmasks. */
-   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/
-   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/
-   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/
-   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/
-   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/
-   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/
-   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/
-  10,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 11,4,4,4,4,4,4,4,8,8,8,8,8,8,8,8, /*-FF*/
-
-  /* The second part is a transition table that maps a combination
-   * of a state of the automaton and a character class to a state. */
-   0,12,24,36,96,12,12,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,/*23*/
-  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,/*47*/
-  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,/*71*/
-  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,/*95*/
-  12,36,12,12,12,12,12,36,12,36,12,12 /* 96- 107 */
-
- /* The customization was to repurpose the surrogates type '4' to instead be
-  * for start bytes F1-F7.  Types 5 and 6 are now unused, and their entries in
-  * the transition part of the table are set to 12, so are illegal.
-  *
-  * To do higher code points would require expansion and some rearrangement of
-  * the table.  The type '1' entries for continuation bytes 80-8f would have to
-  * be split into several types, because they aren't treated uniformly for
-  * higher start bytes, since overlongs for F8 are 80-87; FC: 80-83; and FE:
-  * 80-81.  We start needing to worry about overflow if FE is included.
-  * Ignoring, FE and FF, we could use type 5 for F9-FB, and 6 for FD (remember
-  * from the web site that these are used to right shift).  FE would
-  * necessarily be type 7; and FF, type 8.  And new states would have to be
-  * created for F8 and FC (and FE and FF if used), so quite a bit of work would
-  * be involved.
-  *
-  * XXX Better would be to customize the table so that the noncharacters are
-  * excluded.  This again is non trivial, but doing so would simplify the code
-  * that uses this, and might make it small enough to make it inlinable */
-};
-
-#endif
-
  /*
  
  =for apidoc utf8n_to_uvchr_error
@@ -1500,7 +1404,7 @@ Also implemented as a macro in utf8.h
  */
  
  UV
-Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
+Perl_utf8n_to_uvchr_error(const U8 *s,
                            STRLEN curlen,
                            STRLEN *retlen,
                            const U32 flags,
@@ -1564,7 +1468,7 @@ The caller, of course, is responsible for freeing any returned AV.
  */
  
  UV
-Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
+Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
                                 STRLEN curlen,
                                 STRLEN *retlen,
                                 const U32 flags,
@@ -1573,27 +1477,70 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
  {
      const U8 * const s0 = s;
      const U8 * send = s0 + curlen;
-    U32 possible_problems = 0;  /* A bit is set here for each potential problem
-                                   found as we go along */
-    UV uv = (UV) -1;
-    STRLEN expectlen   = 0;     /* How long should this sequence be?
-                                   (initialized to silence compilers' wrong
-                                   warning) */
-    STRLEN avail_len   = 0;     /* When input is too short, gives what that is */
-    U32 discard_errors = 0;     /* Used to save branches when 'errors' is NULL;
-                                   this gets set and discarded */
+    U32 possible_problems;  /* A bit is set here for each potential problem
+                               found as we go along */
+    UV uv;
+    STRLEN expectlen;     /* How long should this sequence be? */
+    STRLEN avail_len;     /* When input is too short, gives what that is */
+    U32 discard_errors;   /* Used to save branches when 'errors' is NULL; this
+                             gets set and discarded */
  
      /* The below are used only if there is both an overlong malformation and a
       * too short one.  Otherwise the first two are set to 's0' and 'send', and
       * the third not used at all */
-    U8 * adjusted_s0 = (U8 *) s0;
+    U8 * adjusted_s0;
      U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
                                              routine; see [perl #130921] */
-    UV uv_so_far = 0;   /* (Initialized to silence compilers' wrong warning) */
+    UV uv_so_far;
+    dTHX;
+
+    PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
+
+    /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
+     * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
+     * syllables that the dfa doesn't properly handle.  Quickly dispose of the
+     * final case. */
+
+#ifndef EBCDIC
+
+    /* Each of the affected Hanguls starts with \xED */
+
+    if (is_HANGUL_ED_utf8_safe(s0, send)) {
+        if (retlen) {
+            *retlen = 3;
+        }
+        if (errors) {
+            *errors = 0;
+        }
+        if (msgs) {
+            *msgs = NULL;
+        }
+
+        return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
+             | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
+             |  (s0[2] & UTF_CONTINUATION_MASK);
+    }
  
-    UV state = 0;
+#endif
  
-    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
+    /* In conjunction with the exhaustive tests that can be enabled in
+     * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
+     * what it is intended to do, and that no flaws in it are masked by
+     * dropping down and executing the code below
+    assert(! isUTF8_CHAR(s0, send)
+          || UTF8_IS_SURROGATE(s0, send)
+          || UTF8_IS_SUPER(s0, send)
+          || UTF8_IS_NONCHAR(s0,send));
+    */
+
+    s = s0;
+    uv = *s0;
+    possible_problems = 0;
+    expectlen = 0;
+    avail_len = 0;
+    discard_errors = 0;
+    adjusted_s0 = (U8 *) s0;
+    uv_so_far = 0;
  
      if (errors) {
          *errors = 0;
@@ -1646,55 +1593,6 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
         *retlen = expectlen;
      }
  
-    /* An invariant is trivially well-formed */
-    if (UTF8_IS_INVARIANT(*s0)) {
-       return *s0;
-    }
-
-#ifndef EBCDIC
-
-    /* Measurements show that this dfa is somewhat faster than the regular code
-     * below, so use it first, dropping down for the non-normal cases. */
-
-#  define PERL_UTF8_DECODE_REJECT 12
-
-    while (s < send && LIKELY(state != PERL_UTF8_DECODE_REJECT)) {
-        UV type = dfa_tab_for_perl[*s];
-
-        if (state != 0) {
-            uv = (*s & 0x3fu) | (uv << UTF_ACCUMULATION_SHIFT);
-            state = dfa_tab_for_perl[256 + state + type];
-        }
-        else {
-            uv = (0xff >> type) & (*s);
-            state = dfa_tab_for_perl[256 + type];
-        }
-
-        if (state == 0) {
-
-            /* If this could be a code point that the flags don't allow (the first
-            * surrogate is the first such possible one), delve further, but we already
-            * have calculated 'uv' */
-            if (  (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
-                           |UTF8_WARN_ILLEGAL_INTERCHANGE))
-                && uv >= UNICODE_SURROGATE_FIRST)
-            {
-                curlen = s + 1 - s0;
-                goto got_uv;
-            }
-
-            return uv;
-        }
-
-        s++;
-    }
-
-    /* Here, is some sort of failure.  Use the full mechanism */
-
-    uv = *s0;
-
-#endif
-
      /* A continuation character can't start a valid sequence */
      if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
         possible_problems |= UTF8_GOT_CONTINUATION;
@@ -1814,8 +1712,6 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
          }
      }
  
-  got_uv:
-
      /* Here, we have found all the possible problems, except for when the input
       * is for a problematic code point not allowed by the input parameters. */
  
@@ -3030,8 +2926,7 @@ Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
         return _to_upper_title_latin1((U8) c, p, lenp, 'S');
      }
  
-    uvchr_to_utf8(p, c);
-    return CALL_UPPER_CASE(c, p, p, lenp);
+    return CALL_UPPER_CASE(c, NULL, p, lenp);
  }
  
  UV
@@ -3043,8 +2938,7 @@ Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
         return _to_upper_title_latin1((U8) c, p, lenp, 's');
      }
  
-    uvchr_to_utf8(p, c);
-    return CALL_TITLE_CASE(c, p, p, lenp);
+    return CALL_TITLE_CASE(c, NULL, p, lenp);
  }
  
  STATIC U8
@@ -3083,8 +2977,7 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
         return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
      }
  
-    uvchr_to_utf8(p, c);
-    return CALL_LOWER_CASE(c, p, p, lenp);
+    return CALL_LOWER_CASE(c, NULL, p, lenp);
  }
  
  UV
@@ -3180,8 +3073,7 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
  
      /* Here, above 255.  If no special needs, just use the macro */
      if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
-       uvchr_to_utf8(p, c);
-       return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
+       return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
      }
      else {  /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
                the special flags. */
@@ -3475,7 +3367,7 @@ Perl__is_utf8_mark(pTHX_ const U8 *p)
  {
      PERL_ARGS_ASSERT__IS_UTF8_MARK;
  
-    return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
+    return is_utf8_common(p, NULL, "IsM", PL_utf8_mark);
  }
  
  STATIC UV
@@ -3636,13 +3528,16 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
      /* Here, there was no mapping defined, which means that the code point maps
       * to itself.  Return the inputs */
    cases_to_self:
-    len = UTF8SKIP(p);
-    if (p != ustrp) {   /* Don't copy onto itself */
-        Copy(p, ustrp, len, U8);
+    if (p) {
+        len = UTF8SKIP(p);
+        if (p != ustrp) {   /* Don't copy onto itself */
+            Copy(p, ustrp, len, U8);
+        }
+        *lenp = len;
+    }
+    else {
+       *lenp = uvchr_to_utf8(ustrp, uv1) - ustrp;
      }
-
-    if (lenp)
-        *lenp = len;
  
      return uv1;
  
@@ -5201,6 +5096,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
              /* Get the 0th element, which is needed to setup the inversion list
               * */
              while (isSPACE(*l)) l++;
+            after_atou = (char *) lend;
              if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
                  Perl_croak(aTHX_ "panic: Expecting a valid 0th element for"
                                   " inversion list");
@@ -5217,6 +5113,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
                                       " elements than available", elements);
                  }
                  while (isSPACE(*l)) l++;
+                after_atou = (char *) lend;
                  if (!grok_atoUV((const char *)l, other_elements_ptr++,
                                   &after_atou))
                  {
@@ -5868,46 +5765,71 @@ Perl_init_uniprops(pTHX)
  {
      /* Set up the inversion list global variables */
  
-    PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(PL_ASCII_invlist);
-    PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(PL_XPOSIXALNUM_invlist);
-    PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PL_XPOSIXALPHA_invlist);
-    PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(PL_XPOSIXBLANK_invlist);
-    PL_XPosix_ptrs[_CC_CASED] =  _new_invlist_C_array(PL_CASED_invlist);
-    PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PL_XPOSIXCNTRL_invlist);
-    PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PL_XPOSIXDIGIT_invlist);
-    PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PL_XPOSIXGRAPH_invlist);
-    PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(PL_XPOSIXLOWER_invlist);
-    PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(PL_XPOSIXPRINT_invlist);
-    PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PL_XPOSIXPUNCT_invlist);
-    PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(PL_XPOSIXSPACE_invlist);
-    PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(PL_XPOSIXUPPER_invlist);
-    PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(PL_VERTSPACE_invlist);
-    PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PL_XPOSIXWORD_invlist);
-    PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PL_XPOSIXXDIGIT_invlist);
+    PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_ASCII]);
+    PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXALNUM]);
+    PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXALPHA]);
+    PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXBLANK]);
+    PL_XPosix_ptrs[_CC_CASED] =  _new_invlist_C_array(PL_uni_prop_ptrs[PL_CASED]);
+    PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXCNTRL]);
+    PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXDIGIT]);
+    PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXGRAPH]);
+    PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXLOWER]);
+    PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXPRINT]);
+    PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXPUNCT]);
+    PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXSPACE]);
+    PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXUPPER]);
+    PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_VERTSPACE]);
+    PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXWORD]);
+    PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXXDIGIT]);
+
+    PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_ASCII]);
+    PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXALNUM]);
+    PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXALPHA]);
+    PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXBLANK]);
+    PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PL_uni_prop_ptrs[PL_CASED]);
+    PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXCNTRL]);
+    PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXDIGIT]);
+    PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXGRAPH]);
+    PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXLOWER]);
+    PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXPRINT]);
+    PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXPUNCT]);
+    PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXSPACE]);
+    PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXUPPER]);
+    PL_Posix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_VERTSPACE]);
+    PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXWORD]);
+    PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXXDIGIT]);
+
      PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
      PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
      PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
      PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
-    PL_Assigned_invlist = _new_invlist_C_array(PL_ASSIGNED_invlist);
      PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
-    PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
-    PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
-    PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
-    PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
-    PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
-    PL_utf8_perl_idstart = _new_invlist_C_array(PL__PERL_IDSTART_invlist);
-    PL_utf8_perl_idcont = _new_invlist_C_array(PL__PERL_IDCONT_invlist);
+
      PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
      PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
      PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
-    PL_utf8_foldable = _new_invlist_C_array(PL__PERL_ANY_FOLDS_invlist);
-    PL_HasMultiCharFold = _new_invlist_C_array(
-                                         PL__PERL_FOLDS_TO_MULTI_CHAR_invlist);
+
+    PL_Assigned_invlist = _new_invlist_C_array(PL_uni_prop_ptrs[PL_ASSIGNED]);
+
+    PL_utf8_perl_idstart = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_IDSTART]);
+    PL_utf8_perl_idcont = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_IDCONT]);
+
+    PL_utf8_charname_begin = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_CHARNAME_BEGIN]);
+    PL_utf8_charname_continue = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_CHARNAME_CONTINUE]);
+
+    PL_utf8_foldable = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_ANY_FOLDS]);
+    PL_HasMultiCharFold = _new_invlist_C_array(PL_uni_prop_ptrs[
+                                            PL__PERL_FOLDS_TO_MULTI_CHAR]);
      PL_NonL1NonFinalFold = _new_invlist_C_array(
                                              NonL1_Perl_Non_Final_Folds_invlist);
-    PL_utf8_charname_begin = _new_invlist_C_array(PL__PERL_CHARNAME_BEGIN_invlist);
-    PL_utf8_charname_continue = _new_invlist_C_array(PL__PERL_CHARNAME_CONTINUE_invlist);
+
+    PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
+    PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
+    PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
+    PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
+    PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
      PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
+    PL_utf8_mark = _new_invlist_C_array(PL_uni_prop_ptrs[PL_M]);
  }
  
  SV *
@@ -5927,9 +5849,14 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t len, const
  
      char* lookup_name;
      bool stricter = FALSE;
+    bool is_nv_type = FALSE;         /* nv= or numeric_value=, or possibly one
+                                        of the cjk numeric properties (though
+                                        it requires extra effort to compile
+                                        them) */
      unsigned int i;
-    unsigned int j = 0;
+    unsigned int j = 0, lookup_len;
      int equals_pos = -1;        /* Where the '=' is found, or negative if none */
+    int slash_pos = -1;        /* Where the '/' is found, or negative if none */
      int table_index = 0;
      bool starts_with_In_or_Is = FALSE;
      Size_t lookup_offset = 0;
@@ -6002,16 +5929,28 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t len, const
          }
  
          /* Then check if it is one of these properties.  This is hard-coded
-         * because easier this way, and the list is unlikely to change */
-        if (   memEQs(lookup_name + lookup_offset,
+         * because easier this way, and the list is unlikely to change.  There
+         * are several properties like this in the Unihan DB, which is unlikely
+         * to be compiled, and they all end with 'numeric'.  The interiors
+         * aren't checked for the precise property.  This would stop working if
+         * a cjk property were to be created that ended with 'numeric' and
+         * wasn't a numeric type */
+        is_nv_type = memEQs(lookup_name + lookup_offset,
+                       j - 1 - lookup_offset, "numericvalue")
+                  || memEQs(lookup_name + lookup_offset,
+                      j - 1 - lookup_offset, "nv")
+                  || (   memENDPs(lookup_name + lookup_offset,
+                            j - 1 - lookup_offset, "numeric")
+                      && (   memBEGINPs(lookup_name + lookup_offset,
+                                      j - 1 - lookup_offset, "cjk")
+                          || memBEGINPs(lookup_name + lookup_offset,
+                                      j - 1 - lookup_offset, "k")));
+        if (   is_nv_type
+            || memEQs(lookup_name + lookup_offset,
                        j - 1 - lookup_offset, "canonicalcombiningclass")
              || memEQs(lookup_name + lookup_offset,
                        j - 1 - lookup_offset, "ccc")
              || memEQs(lookup_name + lookup_offset,
-                      j - 1 - lookup_offset, "numericvalue")
-            || memEQs(lookup_name + lookup_offset,
-                      j - 1 - lookup_offset, "nv")
-            || memEQs(lookup_name + lookup_offset,
                        j - 1 - lookup_offset, "age")
              || memEQs(lookup_name + lookup_offset,
                        j - 1 - lookup_offset, "in")
@@ -6025,11 +5964,15 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t len, const
               * willy-nilly, as those could be a minus sign.  Other stricter
               * rules also apply.  However, these properties all can have the
               * rhs not be a number, in which case they contain at least one
-             * alphabetic.  In those cases, the stricter rules don't apply.  We
-             * first parse to look for alphas */
+             * alphabetic.  In those cases, the stricter rules don't apply.
+             * But the numeric type properties can have the alphas [Ee] to
+             * signify an exponent, and it is still a number with stricter
+             * rules.  So look for an alpha that signifys not-strict */
              stricter = TRUE;
              for (k = i; k < len; k++) {
-                if (isALPHA(name[k])) {
+                if (   isALPHA(name[k])
+                    && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
+                {
                      stricter = FALSE;
                      break;
                  }
@@ -6065,11 +6008,9 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t len, const
         /* We are now in a position to determine if this property should have
          * been parsed using stricter rules.  Only a few are like that, and
          * unlikely to change. */
-        if (   (   memBEGINPs(lookup_name, j, "perl")
-                && memNEs(lookup_name + 4, j - 4, "space")
-                && memNEs(lookup_name + 4, j - 4, "word"))
-            || memEQs(lookup_name, j, "canondcij")
-            || memEQs(lookup_name, j, "combabove"))
+        if (   memBEGINPs(lookup_name, j, "perl")
+            && memNEs(lookup_name + 4, j - 4, "space")
+            && memNEs(lookup_name + 4, j - 4, "word"))
          {
              stricter = TRUE;
  
@@ -6118,21 +6059,19 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t len, const
  
          lookup_name[j++] = cur;
  
-        /* Unless this is a slash, we are done with it */
-        if (cur != '/') {
+        /* Unless this is a non-trailing slash, we are done with it */
+        if (i >= len - 1 || cur != '/') {
              continue;
          }
  
+        slash_pos = j;
+
          /* A slash in the 'numeric value' property indicates that what follows
           * is a denominator.  It can have a leading '+' and '0's that should be
           * skipped.  But we have never allowed a negative denominator, so treat
           * a minus like every other character.  (No need to rule out a second
           * '/', as that won't match anything anyway */
-        if (   memEQs(lookup_name + lookup_offset, equals_pos - lookup_offset,
-                      "nv=")
-            || memEQs(lookup_name + lookup_offset, equals_pos - lookup_offset,
-                      "numericvalue="))
-        {
+        if (is_nv_type) {
              i++;
              if (i < len && name[i] == '+') {
                  i++;
@@ -6174,29 +6113,152 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t len, const
              return NULL;
          }
  
-        starts_with_In_or_Is = true;
+        starts_with_In_or_Is = TRUE;
      }
  
+    lookup_len = j;     /* Use a more mnemonic name starting here */
+
      /* Get the index into our pointer table of the inversion list corresponding
       * to the property */
-    table_index = match_uniprop((U8 *) lookup_name, j);
+    table_index = match_uniprop((U8 *) lookup_name, lookup_len);
  
      /* If it didn't find the property */
      if (table_index == 0) {
  
          /* If didn't find the property, we try again stripping off any initial
           * 'In' or 'Is' */
-        if (! starts_with_In_or_Is) {
-            return NULL;
-        }
+        if (starts_with_In_or_Is) {
+            lookup_name += 2;
+            lookup_len -= 2;
+            equals_pos -= 2;
+            slash_pos -= 2;
  
-        lookup_name += 2;
-        j -= 2;
+            table_index = match_uniprop((U8 *) lookup_name, lookup_len);
+        }
  
-        /* If still didn't find it, give up */
-        table_index = match_uniprop((U8 *) lookup_name, j);
          if (table_index == 0) {
-            return NULL;
+            char * canonical;
+
+            /* If not found, and not a numeric type property, isn't a legal
+             * property */
+            if (! is_nv_type) {
+                return NULL;
+            }
+
+            /* But the numeric type properties need more work to decide.  What
+             * we do is make sure we have the number in canonical form and look
+             * that up. */
+
+            if (slash_pos < 0) {    /* No slash */
+
+                /* When it isn't a rational, take the input, convert it to a
+                 * NV, then create a canonical string representation of that
+                 * NV. */
+
+                NV value;
+
+                /* Get the value */
+                if (my_atof3(lookup_name + equals_pos, &value,
+                             lookup_len - equals_pos)
+                          != lookup_name + lookup_len)
+                {
+                    return NULL;
+                }
+
+                /* If the value is an integer, the canonical value is integral */
+                if (Perl_ceil(value) == value) {
+                    canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
+                                                equals_pos, lookup_name, value);
+                }
+                else {  /* Otherwise, it is %e with a known precision */
+                    canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
+                                                equals_pos, lookup_name,
+                                                PL_E_FORMAT_PRECISION, value);
+                }
+            }
+            else {  /* Has a slash.  Create a rational in canonical form  */
+                UV numerator, denominator, gcd, trial;
+                const char * end_ptr;
+                const char * sign = "";
+
+                /* We can't just find the numerator, denominator, and do the
+                 * division, then use the method above, because that is
+                 * inexact.  And the input could be a rational that is within
+                 * epsilon (given our precision) of a valid rational, and would
+                 * then incorrectly compare valid.
+                 *
+                 * We're only interested in the part after the '=' */
+                const char * this_lookup_name = lookup_name + equals_pos;
+                lookup_len -= equals_pos;
+                slash_pos -= equals_pos;
+
+                /* Handle any leading minus */
+                if (this_lookup_name[0] == '-') {
+                    sign = "-";
+                    this_lookup_name++;
+                    lookup_len--;
+                    slash_pos--;
+                }
+
+                /* Convert the numerator to numeric */
+                end_ptr = this_lookup_name + slash_pos;
+                if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
+                    return NULL;
+                }
+
+                /* It better have included all characters before the slash */
+                if (*end_ptr != '/') {
+                    return NULL;
+                }
+
+                /* Set to look at just the denominator */
+                this_lookup_name += slash_pos;
+                lookup_len -= slash_pos;
+                end_ptr = this_lookup_name + lookup_len;
+
+                /* Convert the denominator to numeric */
+                if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
+                    return NULL;
+                }
+
+                /* It better be the rest of the characters, and don't divide by
+                 * 0 */
+                if (   end_ptr != this_lookup_name + lookup_len
+                    || denominator == 0)
+                {
+                    return NULL;
+                }
+
+                /* Get the greatest common denominator using
+                   http://en.wikipedia.org/wiki/Euclidean_algorithm */
+                gcd = numerator;
+                trial = denominator;
+                while (trial != 0) {
+                    UV temp = trial;
+                    trial = gcd % trial;
+                    gcd = temp;
+                }
+
+                /* If already in lowest possible terms, we have already tried
+                 * looking this up */
+                if (gcd == 1) {
+                    return NULL;
+                }
+
+                /* Reduce the rational, which should put it in canonical form.
+                 * Then look it up */
+                numerator /= gcd;
+                denominator /= gcd;
+
+                canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
+                        equals_pos, lookup_name, sign, numerator, denominator);
+            }
+
+            /* Here, we have the number in canonical form.  Try that */
+            table_index = match_uniprop((U8 *) canonical, strlen(canonical));
+            if (table_index == 0) {
+                return NULL;
+            }
          }
      }
  
@@ -6228,18 +6290,20 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t len, const
      if (to_fold) {
          if (   table_index == PL_XPOSIXUPPER
              || table_index == PL_XPOSIXLOWER
-            || table_index == PL_LT)
+            || table_index == PL_TITLE)
          {
              table_index = PL_CASED;
          }
-        else if (   table_index == PL_LU
-                 || table_index == PL_LL
-                 || table_index == PL_LT)
-        {
-            table_index = PL_L_AMP_;
+        else if (   table_index == PL_UPPERCASELETTER
+                 || table_index == PL_LOWERCASELETTER
+#ifdef PL_TITLECASELETTER   /* Missing from early Unicodes */
+                 || table_index == PL_TITLECASELETTER
+#endif
+        ) {
+            table_index = PL_CASEDLETTER;
          }
-        else if (   table_index == PL_POSIXUPPER
-                 || table_index == PL_POSIXLOWER)
+        else if (  table_index == PL_POSIXUPPER
+                || table_index == PL_POSIXLOWER)
          {
              table_index = PL_POSIXALPHA;
          }