regcomp.c: Comments, white space

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 0f25df7..3c1c06d 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -11113,7 +11113,8 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me)
  
      if (value == '[' && RExC_parse + 1 < RExC_end &&
         /* I smell either [: or [= or [. -- POSIX has been here, right? */
  
      if (value == '[' && RExC_parse + 1 < RExC_end &&
         /* I smell either [: or [= or [. -- POSIX has been here, right? */
-       POSIXCC(UCHARAT(RExC_parse))) {
+       POSIXCC(UCHARAT(RExC_parse)))
+    {
         const char c = UCHARAT(RExC_parse);
         char* const s = RExC_parse++;
  
         const char c = UCHARAT(RExC_parse);
         char* const s = RExC_parse++;
  
@@ -11137,7 +11138,9 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me)
                     /* Initially switch on the length of the name.  */
                     switch (skip) {
                     case 4:
                     /* Initially switch on the length of the name.  */
                     switch (skip) {
                     case 4:
-                       if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
+                        if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
+                                                          this is the Perl \w
+                                                        */
                             namedclass = ANYOF_WORDCHAR;
                         break;
                     case 5:
                             namedclass = ANYOF_WORDCHAR;
                         break;
                     case 5:
@@ -11239,12 +11242,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                   const bool stop_at_1, bool allow_multi_folds,
                   const bool silence_non_portable)
  {
                   const bool stop_at_1, bool allow_multi_folds,
                   const bool silence_non_portable)
  {
-    /* parse a bracketed class specification.  Most of these will produce an ANYOF node;
-     * but something like [a] will produce an EXACT node; [aA], an EXACTFish
-     * node; [[:ascii:]], a POSIXA node; etc.  It is more complex under /i with
-     * multi-character folds: it will be rewritten following the paradigm of
-     * this example, where the <multi-fold>s are characters which fold to
-     * multiple character sequences:
+    /* parse a bracketed class specification.  Most of these will produce an
+     * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
+     * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
+     * under /i with multi-character folds: it will be rewritten following the
+     * paradigm of this example, where the <multi-fold>s are characters which
+     * fold to multiple character sequences:
       *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
       * gets effectively rewritten as:
       *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
       *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
       * gets effectively rewritten as:
       *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
@@ -11377,7 +11380,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
          stop_ptr = RExC_parse + 1;
      }
  
          stop_ptr = RExC_parse + 1;
      }
  
-    /* allow 1st char to be ] (allowing it to be - is dealt with later) */
+    /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
      if (UCHARAT(RExC_parse) == ']')
         goto charclassloop;
  
      if (UCHARAT(RExC_parse) == ']')
         goto charclassloop;
  
@@ -11409,7 +11412,7 @@ parseit:
          {
              namedclass = regpposixcc(pRExC_state, value, listsv);
          }
          {
              namedclass = regpposixcc(pRExC_state, value, listsv);
          }
-       else if (value == '\\') {
+        else if (value == '\\') {
             if (UTF) {
                 value = utf8n_to_uvchr((U8*)RExC_parse,
                                    RExC_end - RExC_parse,
             if (UTF) {
                 value = utf8n_to_uvchr((U8*)RExC_parse,
                                    RExC_end - RExC_parse,
@@ -11568,7 +11571,8 @@ parseit:
                     Safefree(name);
                 }
                 RExC_parse = e + 1;
                     Safefree(name);
                 }
                 RExC_parse = e + 1;
-               namedclass = ANYOF_UNIPROP;  /* no official name, but it's named */
+                namedclass = ANYOF_UNIPROP;  /* no official name, but it's
+                                                named */
  
                 /* \p means they want Unicode semantics */
                 RExC_uni_semantics = 1;
  
                 /* \p means they want Unicode semantics */
                 RExC_uni_semantics = 1;
@@ -11588,7 +11592,8 @@ parseit:
                     bool valid = grok_bslash_o(&RExC_parse,
                                                &value,
                                                &error_msg,
                     bool valid = grok_bslash_o(&RExC_parse,
                                                &value,
                                                &error_msg,
-                                              SIZE_ONLY,
+                                               SIZE_ONLY,   /* warnings in pass
+                                                               1 only */
                                                 FALSE, /* Not strict */
                                                 silence_non_portable,
                                                 UTF);
                                                 FALSE, /* Not strict */
                                                 silence_non_portable,
                                                 UTF);
@@ -11611,7 +11616,7 @@ parseit:
                                                 FALSE, /* Not strict */
                                                 silence_non_portable,
                                                 UTF);
                                                 FALSE, /* Not strict */
                                                 silence_non_portable,
                                                 UTF);
-                   if (! valid) {
+                    if (! valid) {
                         vFAIL(error_msg);
                     }
                 }
                         vFAIL(error_msg);
                     }
                 }
@@ -11627,7 +11632,7 @@ parseit:
                     /* Take 1-3 octal digits */
                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
                     numlen = 3;
                     /* Take 1-3 octal digits */
                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
                     numlen = 3;
-                   value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
+                    value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
                     RExC_parse += numlen;
                     if (PL_encoding && value < 0x100)
                         goto recode_encoding;
                     RExC_parse += numlen;
                     if (PL_encoding && value < 0x100)
                         goto recode_encoding;
@@ -11645,8 +11650,8 @@ parseit:
             default:
                 /* Allow \_ to not give an error */
                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
             default:
                 /* Allow \_ to not give an error */
                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
-                   SAVEFREESV(RExC_rx_sv);
                     SAVEFREESV(listsv);
                     SAVEFREESV(listsv);
+                   SAVEFREESV(RExC_rx_sv);
                     ckWARN2reg(RExC_parse,
                                "Unrecognized escape \\%c in character class passed through",
                                (int)value);
                     ckWARN2reg(RExC_parse,
                                "Unrecognized escape \\%c in character class passed through",
                                (int)value);
@@ -11654,13 +11659,15 @@ parseit:
                     SvREFCNT_inc_simple_void_NN(listsv);
                 }
                 break;
                     SvREFCNT_inc_simple_void_NN(listsv);
                 }
                 break;
-           }
+           }   /* End of switch on char following backslash */
         } /* end of handling backslash escape sequences */
  #ifdef EBCDIC
         } /* end of handling backslash escape sequences */
  #ifdef EBCDIC
-       else
-           literal_endpoint++;
+        else
+            literal_endpoint++;
  #endif
  
  #endif
  
+        /* Here, we have the current token in 'value' */
+
          /* What matches in a locale is not known until runtime.  This includes
           * what the Posix classes (like \w, [:space:]) match.  Room must be
           * reserved (one time per class) to store such classes, either if Perl
          /* What matches in a locale is not known until runtime.  This includes
           * what the Posix classes (like \w, [:space:]) match.  Room must be
           * reserved (one time per class) to store such classes, either if Perl
@@ -11695,8 +11702,8 @@ parseit:
                     const int w =
                         RExC_parse >= rangebegin ?
                         RExC_parse - rangebegin : 0;
                     const int w =
                         RExC_parse >= rangebegin ?
                         RExC_parse - rangebegin : 0;
+                   SAVEFREESV(listsv); /* in case of fatal warnings */
                     SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
                     SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
-                   SAVEFREESV(listsv);
                     ckWARN4reg(RExC_parse,
                                "False [] range \"%*.*s\"",
                                w, w, rangebegin);
                     ckWARN4reg(RExC_parse,
                                "False [] range \"%*.*s\"",
                                w, w, rangebegin);
@@ -11963,6 +11970,13 @@ parseit:
             }
         } /* end of namedclass \blah */
  
             }
         } /* end of namedclass \blah */
  
+        /* Here, we have a single value.  If 'range' is set, it is the ending
+         * of a range--check its validity.  Later, we will handle each
+         * individual code point in the range.  If 'range' isn't set, this
+         * could be the beginning of a range, so check for that by looking
+         * ahead to see if the next character to be processed is the range
+         * indicator--the minus sign */
+
         if (range) {
             if (prevvalue > value) /* b-a */ {
                 const int w = RExC_parse - rangebegin;
         if (range) {
             if (prevvalue > value) /* b-a */ {
                 const int w = RExC_parse - rangebegin;
@@ -11978,6 +11992,9 @@ parseit:
             {
                 RExC_parse++;
  
             {
                 RExC_parse++;
  
+                /* If the '-' is at the end of the class (just before the ']'),
+                 * it is a literal minus; otherwise it is a range */
+
                 /* a bad range like \w-, [:word:]- ? */
                 if (namedclass > OOB_NAMEDCLASS) {
                     if (ckWARN(WARN_REGEXP)) {
                 /* a bad range like \w-, [:word:]- ? */
                 if (namedclass > OOB_NAMEDCLASS) {
                     if (ckWARN(WARN_REGEXP)) {
@@ -12264,9 +12281,9 @@ parseit:
                      }
                      /* FALLTHROUGH */
  
                      }
                      /* FALLTHROUGH */
  
-                /* The rest have more possibilities depending on the charset.  We
-                 * take advantage of the enum ordering of the charset modifiers to
-                 * get the exact node type, */
+                /* The rest have more possibilities depending on the charset.
+                 * We take advantage of the enum ordering of the charset
+                 * modifiers to get the exact node type, */
                  default:
                      op = POSIXD + get_regex_charset(RExC_flags);
                      if (op > POSIXA) { /* /aa is same as /a */
                  default:
                      op = POSIXD + get_regex_charset(RExC_flags);
                      if (op > POSIXA) { /* /aa is same as /a */
@@ -12385,7 +12402,8 @@ parseit:
           * indicators, which are weeded out below using the
           * IS_IN_SOME_FOLD_L1() macro */
          if (invlist_highest(cp_list) < 256) {
           * indicators, which are weeded out below using the
           * IS_IN_SOME_FOLD_L1() macro */
          if (invlist_highest(cp_list) < 256) {
-            _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list, &fold_intersection);
+            _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
+                                                           &fold_intersection);
          }
          else {
  
          }
          else {