[perl #131211] fixup File::Glob degenerate matching
authorYves Orton <demerphq@gmail.com>
Tue, 25 Apr 2017 13:17:06 +0000 (15:17 +0200)
committerYves Orton <demerphq@gmail.com>
Thu, 27 Apr 2017 18:19:09 +0000 (20:19 +0200)
The old code would go quadratic with recursion and backtracking
when doing patterns like "a*a*a*a*a*a*a*x" on a file like
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".

This patch changes the code to not recurse, and to not backtrack,
as per this article from Russ Cox: https://research.swtch.com/glob

(Thanks to Avar and Russ Cox for helping with this patch.)

MANIFEST
ext/File-Glob/bsd_glob.c
ext/File-Glob/t/rt131211.t [new file with mode: 0644]

index 197c704..39fd95f 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -3948,6 +3948,7 @@ ext/File-Glob/t/basic.t           See if File::Glob works
 ext/File-Glob/t/case.t         See if File::Glob works
 ext/File-Glob/t/global.t       See if File::Glob works
 ext/File-Glob/t/rt114984.t     See if File::Glob works
+ext/File-Glob/t/rt131211.t     See if File::Glob works
 ext/File-Glob/t/taint.t                See if File::Glob works
 ext/File-Glob/t/threads.t      See if File::Glob + threads works
 ext/File-Glob/TODO             File::Glob extension todo list
index 821ef20..d577d01 100644 (file)
@@ -911,33 +911,43 @@ globextend(const Char *path, glob_t *pglob, size_t *limitp)
 /*
  * pattern matching function for filenames.  Each occurrence of the *
  * pattern causes a recursion level.
+ *
+ * Note, this function differs from the original as per the discussion
+ * here: https://research.swtch.com/glob
+ *
+ * Basically we removed the recursion and made it use the algorithm
+ * from Russ Cox to not go quadratic on cases like a file called ("a" x 100) . "x"
+ * matched against a pattern like "a*a*a*a*a*a*a*y".
+ *
  */
 static int
 match(Char *name, Char *pat, Char *patend, int nocase)
 {
        int ok, negate_range;
        Char c, k;
+       Char *nextp = NULL;
+       Char *nextn = NULL;
 
+    loop:
        while (pat < patend) {
                c = *pat++;
                switch (c & M_MASK) {
                case M_ALL:
                        if (pat == patend)
                                return(1);
-                       do
-                           if (match(name, pat, patend, nocase))
-                                   return(1);
-                       while (*name++ != BG_EOS)
-                               ;
-                       return(0);
+                       if (*name == BG_EOS)
+                               return 0;
+                       nextn = name + 1;
+                       nextp = pat - 1;
+                       break;
                case M_ONE:
                        if (*name++ == BG_EOS)
-                               return(0);
+                               goto fail;
                        break;
                case M_SET:
                        ok = 0;
                        if ((k = *name++) == BG_EOS)
-                               return(0);
+                               goto fail;
                        if ((negate_range = ((*pat & M_MASK) == M_NOT)) != BG_EOS)
                                ++pat;
                        while (((c = *pat++) & M_MASK) != M_END)
@@ -953,16 +963,25 @@ match(Char *name, Char *pat, Char *patend, int nocase)
                                } else if (nocase ? (tolower(c) == tolower(k)) : (c == k))
                                        ok = 1;
                        if (ok == negate_range)
-                               return(0);
+                               goto fail;
                        break;
                default:
                        k = *name++;
                        if (nocase ? (tolower(k) != tolower(c)) : (k != c))
-                               return(0);
+                               goto fail;
                        break;
                }
        }
-       return(*name == BG_EOS);
+       if (*name == BG_EOS)
+               return 1;
+
+    fail:
+       if (nextn) {
+               pat = nextp;
+               name = nextn;
+               goto loop;
+       }
+       return 0;
 }
 
 /* Free allocated data belonging to a glob_t structure. */
diff --git a/ext/File-Glob/t/rt131211.t b/ext/File-Glob/t/rt131211.t
new file mode 100644 (file)
index 0000000..c1bcbe0
--- /dev/null
@@ -0,0 +1,94 @@
+use strict;
+use warnings;
+use v5.16.0;
+use File::Temp 'tempdir';
+use File::Spec::Functions;
+use Test::More;
+use Time::HiRes qw(time);
+
+plan tests => 13;
+
+my $path = tempdir uc cleanup => 1;
+my @files= (
+    "x".("a" x 50)."b", # 0
+    "abbbbbbbbbbbbc",   # 1
+    "abbbbbbbbbbbbd",   # 2
+    "aaabaaaabaaaabc",  # 3
+    "pq",               # 4
+    "r",                # 5
+    "rttiiiiiii",       # 6
+    "wewewewewewe",     # 7
+    "weeeweeeweee",     # 8
+    "weewweewweew",     # 9
+    "wewewewewewewewewewewewewewewewewq", # 10
+    "wtttttttetttttttwr", # 11
+);
+
+
+foreach (@files) {
+    open(my $f, ">", catfile $path, $_);
+}
+
+my $elapsed_fail= 0;
+my $elapsed_match= 0;
+my @got_files;
+my @no_files;
+my $count = 0;
+
+while (++$count < 10) {
+    $elapsed_match -= time;
+    @got_files= glob catfile $path, "x".("a*" x $count) . "b";
+    $elapsed_match += time;
+
+    $elapsed_fail -= time;
+    @no_files= glob catfile $path, "x".("a*" x $count) . "c";
+    $elapsed_fail += time;
+    last if $elapsed_fail > $elapsed_match * 100;
+}
+
+is $count,10,
+    "tried all the patterns without bailing out";
+
+cmp_ok $elapsed_fail/$elapsed_match,"<",2,
+    "time to fail less than twice the time to match";
+is "@got_files", catfile($path, $files[0]),
+    "only got the expected file for xa*..b";
+is "@no_files", "", "shouldnt have files for xa*..c";
+
+
+@got_files= glob catfile $path, "a*b*b*b*bc";
+is "@got_files", catfile($path, $files[1]),
+    "only got the expected file for a*b*b*b*bc";
+
+@got_files= sort glob catfile $path, "a*b*b*bc";
+is "@got_files", catfile($path, $files[3])." ".catfile($path,$files[1]),
+    "got the expected two files for a*b*b*bc";
+
+@got_files= sort glob catfile $path, "p*";
+is "@got_files", catfile($path, $files[4]),
+    "p* matches pq";
+
+@got_files= sort glob catfile $path, "r*???????";
+is "@got_files", catfile($path, $files[6]),
+    "r*??????? works as expected";
+
+@got_files= sort glob catfile $path, "w*e*w??e";
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8)),
+    "w*e*w??e works as expected";
+
+@got_files= sort glob catfile $path, "w*e*we??";
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8,9,10)),
+    "w*e*we?? works as expected";
+
+@got_files= sort glob catfile $path, "w**e**w";
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (9)),
+    "w**e**w works as expected";
+
+@got_files= sort glob catfile $path, "*wee*";
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (8,9)),
+    "*wee* works as expected";
+
+@got_files= sort glob catfile $path, "we*";
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8,9,10)),
+    "we* works as expected";
+