feel the the baÃ\9f (encoding problems in the regex engine)

author Yves Orton <demerphq@gmail.com>

Tue, 20 Mar 2007 01:40:34 +0000 (02:40 +0100)

committer Rafael Garcia-Suarez <rgarciasuarez@gmail.com>

Tue, 20 Mar 2007 09:01:05 +0000 (09:01 +0000)
author Yves Orton <demerphq@gmail.com>
Tue, 20 Mar 2007 01:40:34 +0000 (02:40 +0100)
committer Rafael Garcia-Suarez <rgarciasuarez@gmail.com>
Tue, 20 Mar 2007 09:01:05 +0000 (09:01 +0000)
diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod

index 3ba0da0..d119dfe 100644 (file)
--- a/pod/perlreguts.pod
+++ b/pod/perlreguts.pod
@@ -775,7 +775,7 @@ must be able to correctly build a regexp structure.
  
      typedef struct regexp {
              /* what engine created this regexp? */
-            const struct regexp_engine* engine; 
+            const struct regexp_engine* engine;
              
              /* Information about the match that the perl core uses to manage things */
              U32 extflags;           /* Flags used both externally and internally */
@@ -829,10 +829,10 @@ to the subroutines that are to be used for performing a match. It
  is the compiling routine's responsibility to populate this field before
  returning the regexp object.
  
-=item C<precomp> C<prelen> 
+=item C<precomp> C<prelen>
  
  Used for debugging purposes. C<precomp> holds a copy of the pattern
-that was compiled. 
+that was compiled.
  
  =item C<extflags>
  
@@ -841,22 +841,22 @@ contains a \G or a ^ or $ symbol.
  
  =item C<minlen> C<minlenret>
  
-C<minlen> is the minimum string length required for the pattern to match. 
-This is used to prune the search space by not bothering to match any 
-closer to the end of a string than would allow a match. For instance 
-there is no point in even starting the regex engine if the minlen is 
-10 but the string is only 5 characters long. There is no way that the 
+C<minlen> is the minimum string length required for the pattern to match.
+This is used to prune the search space by not bothering to match any
+closer to the end of a string than would allow a match. For instance
+there is no point in even starting the regex engine if the minlen is
+10 but the string is only 5 characters long. There is no way that the
  pattern can match.
  
  C<minlenret> is the minimum length of the string that would be found
-in $& after a match. 
+in $& after a match.
  
  The difference between C<minlen> and C<minlenret> can be seen in the
  following pattern:
  
    /ns(?=\d)/
  
-where the C<minlen> would be 3 but the minlen ret would only be 2 as 
+where the C<minlen> would be 3 but the minlen ret would only be 2 as
  the \d is required to match but is not actually included in the matched
  content. This distinction is particularly important as the substitution
  logic uses the C<minlenret> to tell whether it can do in-place substition
@@ -889,7 +889,7 @@ occur at a floating offset from the start of the pattern. Used to do
  Fast-Boyer-Moore searches on the string to find out if its worth using
  the regex engine at all, and if so where in the string to search.
  
-=item C<startp>, C<endp>, 
+=item C<startp>, C<endp>
  
  These fields store arrays that are used to hold the offsets of the begining
  and end of each capture group that has matched. -1 is used to indicate no match.
@@ -903,8 +903,8 @@ patterns.
  
  =item C<seen_evals>
  
-This stores the number of eval groups in the pattern. This is used 
-for security purposes when embedding compiled regexes into larger 
+This stores the number of eval groups in the pattern. This is used
+for security purposes when embedding compiled regexes into larger
  patterns.
  
  =back
@@ -1028,6 +1028,17 @@ Compile the pattern between exp and xend using the flags contained in
  pm and return a pointer to a prepared regexp structure that can perform
  the match.
  
+The utf8'ness of the string can be found by testing
+
+   pm->op_pmdynflags & PMdf_CMP_UTF8
+
+Additional various flags reflecting the modifiers used are contained in
+
+   pm->op_pmflags
+
+some of these have exact equivelents in re->extflags. See regcomp.h and op.h
+for details of these values.   
+
  =item exec
  
      I32 exec(regexp* prog,
@@ -1046,7 +1057,7 @@ Execute a regexp.
  Find the start position where a regex match should be attempted,
  or possibly whether the regex engine should not be run because the
  pattern can't match. This is called as appropriate by the core
-depending on the values of the extflags member of the regexp 
+depending on the values of the extflags member of the regexp
  structure.
  
  =item checkstr
diff --git a/regcomp.c b/regcomp.c

index 429b493..7c08840 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -124,7 +124,10 @@ typedef struct RExC_state_t {
      regnode    **open_parens;          /* pointers to open parens */
      regnode    **close_parens;         /* pointers to close parens */
      regnode    *opend;                 /* END node in program */
-    I32                utf8;
+    I32                utf8;           /* whether the pattern is utf8 or not */
+    I32                orig_utf8;      /* whether the pattern was originally in utf8 */
+                               /* XXX use this for future optimisation of case
+                                * where pattern must be upgraded to utf8. */
      HV         *charnames;             /* cache of named sequences */
      HV         *paren_names;           /* Paren names */
      
@@ -168,6 +171,7 @@ typedef struct RExC_state_t {
  #define RExC_seen_zerolen      (pRExC_state->seen_zerolen)
  #define RExC_seen_evals        (pRExC_state->seen_evals)
  #define RExC_utf8      (pRExC_state->utf8)
+#define RExC_orig_utf8 (pRExC_state->orig_utf8)
  #define RExC_charnames  (pRExC_state->charnames)
  #define RExC_open_parens       (pRExC_state->open_parens)
  #define RExC_close_parens      (pRExC_state->close_parens)
@@ -1375,16 +1379,17 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
          U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
          const U8 *scan = (U8*)NULL;
          U32 wordlen      = 0;         /* required init */
-        STRLEN chars=0;
+        STRLEN chars = 0;
+        bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
  
          if (OP(noper) == NOTHING) {
              trie->minlen= 0;
              continue;
          }
-        if (trie->bitmap) {
-            TRIE_BITMAP_SET(trie,*uc);
-            if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);            
-        }
+        if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
+            TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
+                                          regardless of encoding */
+
          for ( ; uc < e ; uc += len ) {
              TRIE_CHARCOUNT(trie)++;
              TRIE_READ_CHAR;
@@ -1396,6 +1401,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                          trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
                      TRIE_STORE_REVCHAR;
                  }
+                if ( set_bit ) {
+                    /* store the codepoint in the bitmap, and if its ascii
+                       also store its folded equivelent. */
+                    TRIE_BITMAP_SET(trie,uvc);
+                    if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
+                    set_bit = 0; /* We've done our bit :-) */
+                }
              } else {
                  SV** svpp;
                  if ( !widecharmap )
@@ -4052,16 +4064,18 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm)
      if (exp == NULL)
         FAIL("NULL regexp argument");
  
-    RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
+    RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
  
-    RExC_precomp = exp;
      DEBUG_COMPILE_r({
          SV *dsv= sv_newmortal();
          RE_PV_QUOTED_DECL(s, RExC_utf8,
-            dsv, RExC_precomp, (xend - exp), 60);
+            dsv, exp, (xend - exp), 60);
          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
                        PL_colors[4],PL_colors[5],s);
      });
+
+redo_first_pass:
+    RExC_precomp = exp;
      RExC_flags = pm->op_pmflags;
      RExC_sawback = 0;
  
@@ -4100,6 +4114,25 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm)
         RExC_precomp = NULL;
         return(NULL);
      }
+    if (RExC_utf8 && !RExC_orig_utf8) {
+        /* It's possible to write a regexp in ascii that represents unicode
+        codepoints outside of the byte range, such as via \x{100}. If we
+        detect such a sequence we have to convert the entire pattern to utf8
+        and then recompile, as our sizing calculation will have been based
+        on 1 byte == 1 character, but we will need to use utf8 to encode
+        at least some part of the pattern, and therefore must convert the whole
+        thing.
+        XXX: somehow figure out how to make this less expensive...
+        -- dmq */
+        STRLEN len = xend-exp;
+        DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+           "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
+        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
+        xend = exp + len;
+        RExC_orig_utf8 = RExC_utf8;
+        SAVEFREEPV(exp);
+        goto redo_first_pass;
+    }
      DEBUG_PARSE_r({
          PerlIO_printf(Perl_debug_log, 
              "Required size %"IVdf" nodes\n"
@@ -4956,7 +4989,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
      GET_RE_DEBUG_FLAGS_DECL;
      DEBUG_PARSE("reg ");
  
-
      *flagp = 0;                                /* Tentatively. */
  
  
@@ -5796,6 +5828,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
      I32 flags = 0, c = 0;
      GET_RE_DEBUG_FLAGS_DECL;
      DEBUG_PARSE("brnc");
+
      if (first)
         ret = NULL;
      else {
diff --git a/regexec.c b/regexec.c

index dc0cd9b..c9efaae 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1997,7 +1997,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
             SV * const prop = sv_newmortal();
             regprop(prog, prop, c);
             {
-               RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
+               RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
                     s,strend-s,60);
                 PerlIO_printf(Perl_debug_log,
                     "Matching stclass %.*s against %s (%d chars)\n",
diff --git a/t/op/pat.t b/t/op/pat.t

index 5bc68d7..423822a 100755 (executable)
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -4316,6 +4316,16 @@ sub kt
          "Check that (?&..) to a buffer inside a (?|...) goes to the leftmost");
  }
  
+{
+    use warnings;
+    local $Message = "ASCII pattern that really is utf8";
+    my @w;
+    local $SIG{__WARN__}=sub{push @w,"@_"};
+    my $c=qq(\x{DF}); 
+    ok($c=~/${c}|\x{100}/);
+    ok(@w==0);
+}    
+
  # Test counter is at bottom of file. Put new tests above here.
  #-------------------------------------------------------------------
  # Keep the following tests last -- they may crash perl
@@ -4385,7 +4395,7 @@ ok($@=~/\QSequence \k... not terminated in regex;\E/);
  iseq(0+$::test,$::TestCount,"Got the right number of tests!");
  # Don't forget to update this!
  BEGIN {
-    $::TestCount = 1650;
+    $::TestCount = 1652;
      print "1..$::TestCount\n";
  }
author	Yves Orton <demerphq@gmail.com>
	Tue, 20 Mar 2007 01:40:34 +0000 (02:40 +0100)
committer	Rafael Garcia-Suarez <rgarciasuarez@gmail.com>
	Tue, 20 Mar 2007 09:01:05 +0000 (09:01 +0000)
pod/perlreguts.pod		patch \| blob \| blame \| history
regcomp.c		patch \| blob \| blame \| history
regexec.c		patch \| blob \| blame \| history
t/op/pat.t		patch \| blob \| blame \| history