This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Prepare for Unicode 6.2
authorKarl Williamson <public@khwilliamson.com>
Sun, 26 Aug 2012 17:25:13 +0000 (11:25 -0600)
committerKarl Williamson <public@khwilliamson.com>
Sun, 26 Aug 2012 18:28:28 +0000 (12:28 -0600)
This changes code to be able to handle Unicode 6.2, while continuing to
handle all prevrious releases.

The major change was a new definition of \X, which adds a property to
its calculation.  Unfortunately \X is hard-coded into regexec.c, and so
has to revised whenever there is a change of this magnitude in Unicode,
which fortunately isn't all that often.  I refactored the code in
mktables to make it easier next time there is a change like this one.

embed.fnc
embed.h
embedvar.h
intrpvar.h
lib/unicore/mktables
proto.h
regexec.c
sv.c
utf8.c

index dac6182..e92dc0c 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -663,8 +663,9 @@ ApR |bool   |is_utf8_mark   |NN const U8 *p
 EXpR   |bool   |is_utf8_X_begin        |NN const U8 *p
 EXpR   |bool   |is_utf8_X_extend       |NN const U8 *p
 EXpR   |bool   |is_utf8_X_prepend      |NN const U8 *p
-EXpR   |bool   |is_utf8_X_non_hangul   |NN const U8 *p
+EXpR   |bool   |is_utf8_X_special_begin|NN const U8 *p
 EXpR   |bool   |is_utf8_X_L            |NN const U8 *p
+EXpR   |bool   |is_utf8_X_RI           |NN const U8 *p
 :not currently used EXpR       |bool   |is_utf8_X_LV           |NN const U8 *p
 EXpR   |bool   |is_utf8_X_LVT          |NN const U8 *p
 EXpR   |bool   |is_utf8_X_LV_LVT_V     |NN const U8 *p
diff --git a/embed.h b/embed.h
index 118d733..51f45d3 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define is_utf8_X_L(a)         Perl_is_utf8_X_L(aTHX_ a)
 #define is_utf8_X_LVT(a)       Perl_is_utf8_X_LVT(aTHX_ a)
 #define is_utf8_X_LV_LVT_V(a)  Perl_is_utf8_X_LV_LVT_V(aTHX_ a)
+#define is_utf8_X_RI(a)                Perl_is_utf8_X_RI(aTHX_ a)
 #define is_utf8_X_T(a)         Perl_is_utf8_X_T(aTHX_ a)
 #define is_utf8_X_V(a)         Perl_is_utf8_X_V(aTHX_ a)
 #define is_utf8_X_begin(a)     Perl_is_utf8_X_begin(aTHX_ a)
 #define is_utf8_X_extend(a)    Perl_is_utf8_X_extend(aTHX_ a)
-#define is_utf8_X_non_hangul(a)        Perl_is_utf8_X_non_hangul(aTHX_ a)
 #define is_utf8_X_prepend(a)   Perl_is_utf8_X_prepend(aTHX_ a)
+#define is_utf8_X_special_begin(a)     Perl_is_utf8_X_special_begin(aTHX_ a)
 #define op_clear(a)            Perl_op_clear(aTHX_ a)
 #define qerror(a)              Perl_qerror(aTHX_ a)
 #define reg_named_buff(a,b,c,d)        Perl_reg_named_buff(aTHX_ a,b,c,d)
index 77109e7..136e4e1 100644 (file)
 #define PL_utf8_X_L            (vTHX->Iutf8_X_L)
 #define PL_utf8_X_LVT          (vTHX->Iutf8_X_LVT)
 #define PL_utf8_X_LV_LVT_V     (vTHX->Iutf8_X_LV_LVT_V)
+#define PL_utf8_X_RI           (vTHX->Iutf8_X_RI)
 #define PL_utf8_X_T            (vTHX->Iutf8_X_T)
 #define PL_utf8_X_V            (vTHX->Iutf8_X_V)
 #define PL_utf8_X_begin                (vTHX->Iutf8_X_begin)
 #define PL_utf8_X_extend       (vTHX->Iutf8_X_extend)
-#define PL_utf8_X_non_hangul   (vTHX->Iutf8_X_non_hangul)
 #define PL_utf8_X_prepend      (vTHX->Iutf8_X_prepend)
+#define PL_utf8_X_special_begin        (vTHX->Iutf8_X_special_begin)
 #define PL_utf8_alnum          (vTHX->Iutf8_alnum)
 #define PL_utf8_alpha          (vTHX->Iutf8_alpha)
 #define PL_utf8_blank          (vTHX->Iutf8_blank)
index 218679a..94253a2 100644 (file)
@@ -629,9 +629,10 @@ PERLVAR(I, utf8_mark,      SV *)
 PERLVAR(I, utf8_X_begin, SV *)
 PERLVAR(I, utf8_X_extend, SV *)
 PERLVAR(I, utf8_X_prepend, SV *)
-PERLVAR(I, utf8_X_non_hangul, SV *)
+PERLVAR(I, utf8_X_special_begin, SV *)
 PERLVAR(I, utf8_X_L,   SV *)
 PERLVAR(I, utf8_X_LVT, SV *)
+PERLVAR(I, utf8_X_RI,  SV *)
 PERLVAR(I, utf8_X_T,   SV *)
 PERLVAR(I, utf8_X_V,   SV *)
 PERLVAR(I, utf8_X_LV_LVT_V, SV *)
index 89945f6..d3cc95a 100644 (file)
@@ -814,6 +814,8 @@ push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
                                                     if $v_version ge v6.0.0;
 push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
                                                     if $v_version ge v6.1.0;
+push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
+                                                    if $v_version ge v6.2.0;
 
 # The lists below are hashes, so the key is the item in the list, and the
 # value is the reason why it is in the list.  This makes generation of
@@ -13509,7 +13511,8 @@ sub compile_perl() {
     #   CR-LF
     #   | Prepend* Begin Extend*
     #   | .
-    # Begin is:           ( Hangul-syllable | ! Control )
+    # Begin is:           ( Special_Begin | ! Control )
+    # Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
     # Extend is:          ( Grapheme_Extend | Spacing_Mark )
     # Control is:         [ GCB_Control CR LF ]
     # Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
@@ -13529,11 +13532,6 @@ sub compile_perl() {
             push @tables_that_may_be_empty, $perl_table->complete_name;
         }
     }
-    $perl->add_match_table('_X_HST_Not_Applicable',
-                            Perl_Extension => 1,
-                            Fate => $INTERNAL_ONLY,
-                            Initialize => property_ref('HST')->table('NA'),
-                            );
 
     # More GCB.  Populate a combined hangul syllables table
     my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
@@ -13542,20 +13540,41 @@ sub compile_perl() {
     $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V');
     $lv_lvt_v->add_comment('For use in \X; matches: gcb=LV | gcb=LVT | gcb=V');
 
+    my $ri = $perl->add_match_table('_X_RI', Perl_Extension => 1,
+                                    Fate => $INTERNAL_ONLY);
+    $ri += $gcb->table('RI') if $v_version ge v6.2;
+
+    my $specials_begin = $perl->add_match_table('_X_Special_Begin',
+                                       Perl_Extension => 1,
+                                       Fate => $INTERNAL_ONLY,
+                                       Initialize => $lv_lvt_v
+                                                   + $gcb->table('L')
+                                                   + $gcb->table('T')
+                                                   + $ri
+                                      );
+    $specials_begin->add_comment(join_lines( <<END
+For use in \\X; matches first character of potential multi-character
+sequences that can begin an extended grapheme cluster.  They need special
+handling because of their complicated nature.
+END
+    ));
     my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
-                                       Fate => $INTERNAL_ONLY);
-    my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
-                                        Fate => $INTERNAL_ONLY);
-
-    # In the line below, two negatives means: yes hangul
-    $begin += ~ property_ref('Hangul_Syllable_Type')
-                                                ->table('Not_Applicable')
-                + ~ ($gcb->table('Control')
-                + $gcb->table('CR')
-                + $gcb->table('LF'));
-    $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
+                                       Fate => $INTERNAL_ONLY,
+                                       Initialize => $specials_begin
+                                                   + ~ $gcb->table('Control')
+                                                   - $gcb->table('CR')
+                                                   - $gcb->table('LF')
+                                      );
+    $begin->add_comment(join_lines( <<END
+For use in \\X; matches first character of anything that can begin an extended
+grapheme cluster.
+END
+    ));
 
-    $extend += $gcb->table('Extend');
+    my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
+                                        Fate => $INTERNAL_ONLY,
+                                        Initialize => $gcb->table('Extend')
+                                       );
     if (defined (my $sm = $gcb->table('SpacingMark'))) {
         $extend += $sm;
     }
@@ -17351,6 +17370,13 @@ my @input_file_objects = (
                     ),
     Input_file->new('BidiMirroring.txt', v3.0.1,
                     Property => 'Bidi_Mirroring_Glyph',
+                    Has_Missings_Defaults => ($v_version lt v6.2.0)
+                                              ? $NO_DEFAULTS
+                                              # Is <none> which doesn't mean
+                                              # anything to us, we will use the
+                                              # null string
+                                              : $IGNORED,
+
                     ),
     Input_file->new("NormTest.txt", v3.0.0,
                      Handler => \&process_NormalizationsTest,
diff --git a/proto.h b/proto.h
index c88c22f..5f3076a 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -1782,6 +1782,12 @@ PERL_CALLCONV bool       Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
 #define PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V    \
        assert(p)
 
+PERL_CALLCONV bool     Perl_is_utf8_X_RI(pTHX_ const U8 *p)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_X_RI  \
+       assert(p)
+
 PERL_CALLCONV bool     Perl_is_utf8_X_T(pTHX_ const U8 *p)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_1);
@@ -1806,16 +1812,16 @@ PERL_CALLCONV bool      Perl_is_utf8_X_extend(pTHX_ const U8 *p)
 #define PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND      \
        assert(p)
 
-PERL_CALLCONV bool     Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
+PERL_CALLCONV bool     Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL  \
+#define PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND     \
        assert(p)
 
-PERL_CALLCONV bool     Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
+PERL_CALLCONV bool     Perl_is_utf8_X_special_begin(pTHX_ const U8 *p)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND     \
+#define PERL_ARGS_ASSERT_IS_UTF8_X_SPECIAL_BEGIN       \
        assert(p)
 
 PERL_CALLCONV bool     Perl_is_utf8_alnum(pTHX_ const U8 *p)
index 39bc226..e87e365 100644 (file)
--- a/regexec.c
+++ b/regexec.c
 
 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
        LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
-       LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
+       LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin);                             \
        /* These are utf8 constants, and not utf-ebcdic constants, so the   \
            * assert should likely and hopefully fail on an EBCDIC machine */ \
        LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
        LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
        LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
        LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
+       LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI);                                 \
        LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */  
 
 #define PLACEHOLDER    /* Something for the preprocessor to grab onto */
@@ -3924,9 +3925,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
               | Prepend* Begin Extend*
               | .
 
-              Begin is (Hangul-syllable | ! Control)
-              Extend is (Grapheme_Extend | Spacing_Mark)
-              Control is [ GCB_Control CR LF ]
+               Begin is:           ( Special_Begin | ! Control )
+               Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
+               Extend is:          ( Grapheme_Extend | Spacing_Mark )
+               Control is:         [ GCB_Control  CR  LF ]
+               Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
 
               The discussion below shows how the code for CLUMP is derived
               from this regex.  Note that most of these concepts are from
@@ -4033,21 +4036,32 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                    } else {
 
                        /* Here is the beginning of a character that can have
-                        * an extender.  It is either a hangul syllable, or a
-                        * non-control */
-                       if (swash_fetch(PL_utf8_X_non_hangul,
+                         * an extender.  It is either a special begin character
+                         * that requires complicated handling, or a non-control
+                         * */
+                       if (! swash_fetch(PL_utf8_X_special_begin,
                                        (U8*)locinput, utf8_target))
                        {
 
-                           /* Here not a Hangul syllable, must be a
+                           /* Here not a special begin, must be a
                             * ('!  * Control') */
                            locinput += UTF8SKIP(locinput);
                        } else {
 
-                           /* Here is a Hangul syllable.  It can be composed
-                            * of several individual characters.  One
-                            * possibility is T+ */
-                           if (swash_fetch(PL_utf8_X_T,
+                           /* Here is a special begin.  It can be composed
+                             * of several individual characters.  One
+                             * possibility is RI+ */
+                           if (swash_fetch(PL_utf8_X_RI,
+                                           (U8*)locinput, utf8_target))
+                           {
+                               while (locinput < PL_regeol
+                                       && swash_fetch(PL_utf8_X_RI,
+                                                       (U8*)locinput, utf8_target))
+                               {
+                                   locinput += UTF8SKIP(locinput);
+                               }
+                           } else /* Another possibility is T+ */
+                                   if (swash_fetch(PL_utf8_X_T,
                                            (U8*)locinput, utf8_target))
                            {
                                while (locinput < PL_regeol
@@ -4058,9 +4072,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                                }
                            } else {
 
-                               /* Here, not T+, but is a Hangul.  That means
-                                * it is one of the others: L, LV, LVT or V,
-                                * and matches:
+                                /* Here, neither RI+ nor T+; must be some other
+                                 * Hangul.  That means it is one of the others:
+                                 * L, LV, LVT or V, and matches:
                                 * L* (L | LVT T* | V  V* T* | LV  V* T*) */
 
                                /* Match L*           */
diff --git a/sv.c b/sv.c
index b936c6e..a3ff695 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -13367,10 +13367,11 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
     PL_utf8_X_begin    = sv_dup_inc(proto_perl->Iutf8_X_begin, param);
     PL_utf8_X_extend   = sv_dup_inc(proto_perl->Iutf8_X_extend, param);
     PL_utf8_X_prepend  = sv_dup_inc(proto_perl->Iutf8_X_prepend, param);
-    PL_utf8_X_non_hangul       = sv_dup_inc(proto_perl->Iutf8_X_non_hangul, param);
+    PL_utf8_X_special_begin    = sv_dup_inc(proto_perl->Iutf8_X_special_begin, param);
     PL_utf8_X_L        = sv_dup_inc(proto_perl->Iutf8_X_L, param);
     /*not currently used: PL_utf8_X_LV = sv_dup_inc(proto_perl->Iutf8_X_LV, param);*/
     PL_utf8_X_LVT      = sv_dup_inc(proto_perl->Iutf8_X_LVT, param);
+    PL_utf8_X_RI       = sv_dup_inc(proto_perl->Iutf8_X_RI, param);
     PL_utf8_X_T        = sv_dup_inc(proto_perl->Iutf8_X_T, param);
     PL_utf8_X_V        = sv_dup_inc(proto_perl->Iutf8_X_V, param);
     PL_utf8_X_LV_LVT_V = sv_dup_inc(proto_perl->Iutf8_X_LV_LVT_V, param);
diff --git a/utf8.c b/utf8.c
index 8cc05c3..4ba29d8 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -2270,13 +2270,13 @@ Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
 }
 
 bool
-Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
+Perl_is_utf8_X_special_begin(pTHX_ const U8 *p)
 {
     dVAR;
 
-    PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
+    PERL_ARGS_ASSERT_IS_UTF8_X_SPECIAL_BEGIN;
 
-    return is_utf8_common(p, &PL_utf8_X_non_hangul, "_X_HST_Not_Applicable");
+    return is_utf8_common(p, &PL_utf8_X_special_begin, "_X_Special_Begin");
 }
 
 bool
@@ -2289,6 +2289,16 @@ Perl_is_utf8_X_L(pTHX_ const U8 *p)
     return is_utf8_common(p, &PL_utf8_X_L, "_X_GCB_L");
 }
 
+bool
+Perl_is_utf8_X_RI(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_RI;
+
+    return is_utf8_common(p, &PL_utf8_X_RI, "_X_RI");
+}
+
 /* These constants are for finding GCB=LV and GCB=LVT.  These are for the
  * pre-composed Hangul syllables, which are all in a contiguous block and
  * arranged there in such a way so as to facilitate alorithmic determination of