This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
(perl #128095) check pack_sockaddr_un()'s return value
[perl5.git] / regexec.c
index f2e0164..b86cb1b 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -458,7 +458,7 @@ S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
         case _CC_ENUM_ASCII:     return isASCII_LC(character);
         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
-        case _CC_ENUM_CASED:     return isLOWER_LC(character)
+        case _CC_ENUM_CASED:     return    isLOWER_LC(character)
                                         || isUPPER_LC(character);
         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
@@ -1051,6 +1051,8 @@ Perl_re_intuit_start(pTHX_
             char *from = s;
             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
 
+            if (to > strend)
+                to = strend;
             if (from > to) {
                 s = NULL;
                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
@@ -2118,7 +2120,11 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     while (s < strend) {
                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
                                                         (U8*) reginfo->strend);
-                        if (   (to_complement ^ isGCB(before, after))
+                        if (   (to_complement ^ isGCB(before,
+                                                      after,
+                                                      (U8*) reginfo->strbeg,
+                                                      (U8*) s,
+                                                      utf8_target))
                             && (reginfo->intuit || regtry(reginfo, &s)))
                         {
                             goto got_it;
@@ -3640,7 +3646,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
    messages are inline with the regop output that created them.
 */
 #define REPORT_CODE_OFF 29
-#define INDENT_CHARS(depth) ((depth) % 20)
+#define INDENT_CHARS(depth) ((int)(depth) % 20)
 #ifdef DEBUGGING
 int
 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
@@ -3650,7 +3656,7 @@ Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
     PerlIO *f= Perl_debug_log;
     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
     va_start(ap, depth);
-    PerlIO_printf(f, "%*s|%4d| %*s", REPORT_CODE_OFF, "", depth, INDENT_CHARS(depth), "" );
+    PerlIO_printf(f, "%*s|%4"UVuf"| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
     result = PerlIO_vprintf(f, fmt, ap);
     va_end(ap);
     return result;
@@ -4289,13 +4295,108 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
     return TRUE;
 }
 
-PERL_STATIC_INLINE bool
-S_isGCB(const GCB_enum before, const GCB_enum after)
+STATIC bool
+S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
 {
     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
-     * between the inputs.  See http://www.unicode.org/reports/tr29/ */
+     * between the inputs.  See http://www.unicode.org/reports/tr29/. */
+
+    PERL_ARGS_ASSERT_ISGCB;
+
+    switch (GCB_table[before][after]) {
+        case GCB_BREAKABLE:
+            return TRUE;
+
+        case GCB_NOBREAK:
+            return FALSE;
+
+        case GCB_RI_then_RI:
+            {
+                int RI_count = 1;
+                U8 * temp_pos = (U8 *) curpos;
+
+                /* Do not break within emoji flag sequences. That is, do not
+                 * break between regional indicator (RI) symbols if there is an
+                 * odd number of RI characters before the break point.
+                 *  GB12     ^ (RI RI)* RI × RI
+                 *  GB13 [^RI] (RI RI)* RI × RI */
+
+                while (backup_one_GCB(strbeg,
+                                    &temp_pos,
+                                    utf8_target) == GCB_Regional_Indicator)
+                {
+                    RI_count++;
+                }
+
+                return RI_count % 2 != 1;
+            }
 
-    return GCB_table[before][after];
+        case GCB_EX_then_EM:
+
+            /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
+            {
+                U8 * temp_pos = (U8 *) curpos;
+                GCB_enum prev;
+
+                do {
+                    prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
+                }
+                while (prev == GCB_Extend);
+
+                return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
+            }
+
+        default:
+            break;
+    }
+
+#ifdef DEBUGGING
+    Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
+                                  before, after, GCB_table[before][after]);
+    assert(0);
+#endif
+    return TRUE;
+}
+
+STATIC GCB_enum
+S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
+{
+    GCB_enum gcb;
+
+    PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
+
+    if (*curpos < strbeg) {
+        return GCB_EDGE;
+    }
+
+    if (utf8_target) {
+        U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
+        U8 * prev_prev_char_pos;
+
+        if (! prev_char_pos) {
+            return GCB_EDGE;
+        }
+
+        if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
+            gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
+            *curpos = prev_char_pos;
+            prev_char_pos = prev_prev_char_pos;
+        }
+        else {
+            *curpos = (U8 *) strbeg;
+            return GCB_EDGE;
+        }
+    }
+    else {
+        if (*curpos - 2 < strbeg) {
+            *curpos = (U8 *) strbeg;
+            return GCB_EDGE;
+        }
+        (*curpos)--;
+        gcb = getGCB_VAL_CP(*(*curpos - 1));
+    }
+
+    return gcb;
 }
 
 /* Combining marks attach to most classes that precede them, but this defines
@@ -4326,7 +4427,7 @@ S_isLB(pTHX_ LB_enum before,
 
     PERL_ARGS_ASSERT_ISLB;
 
-    /* Rule numbers in the comments below are as of Unicode 8.0 */
+    /* Rule numbers in the comments below are as of Unicode 9.0 */
 
   redo:
     before = prev;
@@ -4420,14 +4521,14 @@ S_isLB(pTHX_ LB_enum before,
              * that is overriden */
             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
 
-        case LB_CM_foo:
+        case LB_CM_ZWJ_foo:
 
             /* We don't know how to treat the CM except by looking at the first
-             * non-CM character preceding it */
+             * non-CM character preceding it.  ZWJ is treated as CM */
             do {
                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
             }
-            while (prev == LB_Combining_Mark);
+            while (prev == LB_Combining_Mark || prev == LB_ZWJ);
 
             /* Here, 'prev' is that first earlier non-CM character.  If the CM
              * attatches to it, then it inherits the behavior of 'prev'.  If it
@@ -4500,6 +4601,28 @@ S_isLB(pTHX_ LB_enum before,
             return LB_various_then_PO_or_PR;
         }
 
+        case LB_RI_then_RI + LB_NOBREAK:
+        case LB_RI_then_RI + LB_BREAKABLE:
+            {
+                int RI_count = 1;
+
+                /* LB30a Break between two regional indicator symbols if and
+                 * only if there are an even number of regional indicators
+                 * preceding the position of the break.
+                 *
+                 *  sot (RI RI)* RI × RI
+                 *  [^RI] (RI RI)* RI × RI */
+
+                while (backup_one_LB(strbeg,
+                                     &temp_pos,
+                                     utf8_target) == LB_Regional_Indicator)
+                {
+                    RI_count++;
+                }
+
+                return RI_count % 2 == 0;
+            }
+
         default:
             break;
     }
@@ -4884,7 +5007,7 @@ S_isWB(pTHX_ WB_enum previous,
 
     PERL_ARGS_ASSERT_ISWB;
 
-    /* Rule numbers in the comments below are as of Unicode 8.0 */
+    /* Rule numbers in the comments below are as of Unicode 9.0 */
 
   redo:
     before = prev;
@@ -4910,11 +5033,11 @@ S_isWB(pTHX_ WB_enum previous,
          * the beginning of a region of text', the rule is to break before
          * them, just like any other character.  Therefore, the default rule
          * applies and we don't have to look in more depth.  Should this ever
-         * change, we would have to have 2 'case' statements, like in the
-         * rules below, and backup a single character (not spacing over the
-         * extend ones) and then see if that is one of the region-end
-         * characters and go from there */
-        case WB_Ex_or_FO_then_foo:
+         * change, we would have to have 2 'case' statements, like in the rules
+         * below, and backup a single character (not spacing over the extend
+         * ones) and then see if that is one of the region-end characters and
+         * go from there */
+        case WB_Ex_or_FO_or_ZWJ_then_foo:
             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
             goto redo;
 
@@ -5007,6 +5130,30 @@ S_isWB(pTHX_ WB_enum previous,
             return WB_table[before][after]
                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
 
+        case WB_RI_then_RI + WB_NOBREAK:
+        case WB_RI_then_RI + WB_BREAKABLE:
+            {
+                int RI_count = 1;
+
+                /* Do not break within emoji flag sequences. That is, do not
+                 * break between regional indicator (RI) symbols if there is an
+                 * odd number of RI characters before the potential break
+                 * point.
+                 *
+                 * WB15     ^ (RI RI)* RI × RI
+                 * WB16 [^RI] (RI RI)* RI × RI */
+
+                while (backup_one_WB(&previous,
+                                     strbeg,
+                                     &before_pos,
+                                     utf8_target) == WB_Regional_Indicator)
+                {
+                    RI_count++;
+                }
+
+                return RI_count % 2 != 1;
+            }
+
         default:
             break;
     }
@@ -5087,8 +5234,8 @@ S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos,
             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
         }
 
-        /* And we always back up over these two types */
-        if (wb != WB_Extend && wb != WB_Format) {
+        /* And we always back up over these three types */
+        if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
             return wb;
         }
     }
@@ -5119,7 +5266,7 @@ S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos,
                 *curpos = (U8 *) strbeg;
                 return WB_EDGE;
             }
-        } while (wb == WB_Extend || wb == WB_Format);
+        } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
     }
     else {
         do {
@@ -5232,6 +5379,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
     bool is_utf8_pat = reginfo->is_utf8_pat;
     bool match = FALSE;
 
+/* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
+#if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
+#  define SOLARIS_BAD_OPTIMIZER
+    const U32 *pl_charclass_dup = PL_charclass;
+#  define PL_charclass pl_charclass_dup
+#endif
+
 #ifdef DEBUGGING
     GET_RE_DEBUG_FLAGS_DECL;
 #endif
@@ -5973,7 +6127,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                         b1 = (locinput == reginfo->strbeg)
                              ? 0 /* isWORDCHAR_L1('\n') */
                              : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
-                                                                (U8*)(reginfo->strbeg)));
+                                                       (U8*)(reginfo->strbeg)));
                         b2 = (NEXTCHR_IS_EOS)
                             ? 0 /* isWORDCHAR_L1('\n') */
                             : isWORDCHAR_utf8((U8*)locinput);
@@ -5994,7 +6148,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                                                         (U8*)(reginfo->strbeg)),
                                                 (U8*) reginfo->strend),
                                           getGCB_VAL_UTF8((U8*) locinput,
-                                                        (U8*) reginfo->strend));
+                                                        (U8*) reginfo->strend),
+                                          (U8*) reginfo->strbeg,
+                                          (U8*) locinput,
+                                          utf8_target);
                         }
                         break;
 
@@ -6184,23 +6341,26 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
                     sayNO;
                 }
+
+                locinput++;
+                break;
             }
-            else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
-                if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
-                                               EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
-                                               *(locinput + 1))))))
-                {
-                    sayNO;
-                }
-            }
-            else { /* Here, must be an above Latin-1 code point */
+
+            if (! UTF8_IS_DOWNGRADEABLE_START(nextchr)) { /* An above Latin-1 code point */
                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
                 goto utf8_posix_above_latin1;
             }
 
-            /* Here, must be utf8 */
-            locinput += UTF8SKIP(locinput);
-            break;
+            /* Here is a UTF-8 variant code point below 256 and the target is
+             * UTF-8 */
+            if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
+                                            EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
+                                            *(locinput + 1))))))
+            {
+                sayNO;
+            }
+
+            goto increment_locinput;
 
         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
             to_complement = 1;
@@ -6376,7 +6536,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 while (locinput < reginfo->strend) {
                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
                                                          (U8*) reginfo->strend);
-                    if (isGCB(prev_gcb, cur_gcb)) {
+                    if (isGCB(prev_gcb, cur_gcb,
+                              (U8*) reginfo->strbeg, (U8*) locinput,
+                              utf8_target))
+                    {
                         break;
                     }
 
@@ -6883,6 +7046,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
        }
 
        case EVAL_AB: /* cleanup after a successful (??{A})B */
+            /* note: this is called twice; first after popping B, then A */
             DEBUG_STACK_r({
                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
                     depth, cur_eval, ST.prev_eval);
@@ -8315,6 +8479,9 @@ NULL
             /* NOTREACHED */
        }
     }
+#ifdef SOLARIS_BAD_OPTIMIZER
+#  undef PL_charclass
+#endif
 
     /*
     * We get here only if there's trouble -- normally "case END" is
@@ -8415,6 +8582,7 @@ NULL
            yes_state = st->u.yes.prev_yes_state;
 
        state_num = st->resume_state + 1; /* failure = success + 1 */
+        PERL_ASYNC_CHECK();
        goto reenter_switch;
     }
     result = 0;
@@ -9251,7 +9419,7 @@ S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
  * char pos */
 
 STATIC U8 *
-S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
+S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
 {
     PERL_ARGS_ASSERT_REGHOPMAYBE3;