This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl 1.0 patch 9: 3 portability problems
[perl5.git] / search.c
1 /* $Header: search.c,v 1.0.1.2 88/01/28 10:30:46 root Exp $
2  *
3  * $Log:        search.c,v $
4  * Revision 1.0.1.2  88/01/28  10:30:46  root
5  * patch8: uncommented free_compex for use with eval operator.
6  * 
7  * Revision 1.0.1.1  88/01/24  03:55:05  root
8  * patch 2: made depend on perl.h.
9  * 
10  * Revision 1.0  87/12/18  13:05:59  root
11  * Initial revision
12  * 
13  */
14
15 /* string search routines */
16  
17 #include "EXTERN.h"
18 #include "handy.h"
19 #include "util.h"
20 #include "INTERN.h"
21 #include "search.h"
22 #include "EXTERN.h"
23 #include "perl.h"
24
25 #define VERBOSE
26 #define FLUSH
27 #define MEM_SIZE int
28
29 #ifndef BITSPERBYTE
30 #define BITSPERBYTE 8
31 #endif
32
33 #define BMAPSIZ (127 / BITSPERBYTE + 1)
34
35 #define CHAR    0               /*      a normal character */
36 #define ANY     1               /* .    matches anything except newline */
37 #define CCL     2               /* [..] character class */
38 #define NCCL    3               /* [^..]negated character class */
39 #define BEG     4               /* ^    beginning of a line */
40 #define END     5               /* $    end of a line */
41 #define LPAR    6               /* (    begin sub-match */
42 #define RPAR    7               /* )    end sub-match */
43 #define REF     8               /* \N   backreference to the Nth submatch */
44 #define WORD    9               /* \w   matches alphanumeric character */
45 #define NWORD   10              /* \W   matches non-alphanumeric character */
46 #define WBOUND  11              /* \b   matches word boundary */
47 #define NWBOUND 12              /* \B   matches non-boundary  */
48 #define FINIS   13              /*      the end of the pattern */
49  
50 #define CODEMASK 15
51
52 /* Quantifiers: */
53
54 #define MINZERO 16              /* minimum is 0, not 1 */
55 #define MAXINF  32              /* maximum is infinity, not 1 */
56  
57 #define ASCSIZ 0200
58 typedef char    TRANSTABLE[ASCSIZ];
59
60 static  TRANSTABLE trans = {
61 0000,0001,0002,0003,0004,0005,0006,0007,
62 0010,0011,0012,0013,0014,0015,0016,0017,
63 0020,0021,0022,0023,0024,0025,0026,0027,
64 0030,0031,0032,0033,0034,0035,0036,0037,
65 0040,0041,0042,0043,0044,0045,0046,0047,
66 0050,0051,0052,0053,0054,0055,0056,0057,
67 0060,0061,0062,0063,0064,0065,0066,0067,
68 0070,0071,0072,0073,0074,0075,0076,0077,
69 0100,0101,0102,0103,0104,0105,0106,0107,
70 0110,0111,0112,0113,0114,0115,0116,0117,
71 0120,0121,0122,0123,0124,0125,0126,0127,
72 0130,0131,0132,0133,0134,0135,0136,0137,
73 0140,0141,0142,0143,0144,0145,0146,0147,
74 0150,0151,0152,0153,0154,0155,0156,0157,
75 0160,0161,0162,0163,0164,0165,0166,0167,
76 0170,0171,0172,0173,0174,0175,0176,0177,
77 };
78 static bool folding = FALSE;
79
80 static int err;
81 #define NOERR 0
82 #define BEGFAIL 1
83 #define FATAL 2
84
85 static char *FirstCharacter;
86 static char *matchend;
87 static char *matchtill;
88
89 void
90 search_init()
91 {
92 #ifdef UNDEF
93     register int    i;
94     
95     for (i = 0; i < ASCSIZ; i++)
96         trans[i] = i;
97 #else
98     ;
99 #endif
100 }
101
102 void
103 init_compex(compex)
104 register COMPEX *compex;
105 {
106     /* the following must start off zeroed */
107
108     compex->precomp = Nullch;
109     compex->complen = 0;
110     compex->subbase = Nullch;
111 }
112
113 void
114 free_compex(compex)
115 register COMPEX *compex;
116 {
117     if (compex->complen) {
118         safefree(compex->compbuf);
119         compex->complen = 0;
120     }
121     if (compex->subbase) {
122         safefree(compex->subbase);
123         compex->subbase = Nullch;
124     }
125 }
126
127 static char *gbr_str = Nullch;
128 static int gbr_siz = 0;
129
130 char *
131 getparen(compex,n)
132 register COMPEX *compex;
133 int n;
134 {
135     int length = compex->subend[n] - compex->subbeg[n];
136
137     if (!n &&
138         (!compex->numsubs || n > compex->numsubs || !compex->subend[n] || length<0))
139         return "";
140     growstr(&gbr_str, &gbr_siz, length+1);
141     safecpy(gbr_str, compex->subbeg[n], length+1);
142     return gbr_str;
143 }
144
145 void
146 case_fold(which)
147 int which;
148 {
149     register int i;
150
151     if (which != folding) {
152         if (which) {
153             for (i = 'A'; i <= 'Z'; i++)
154                 trans[i] = tolower(i);
155         }
156         else {
157             for (i = 'A'; i <= 'Z'; i++)
158                 trans[i] = i;
159         }
160         folding = which;
161     }
162 }
163
164 /* Compile the regular expression into internal form */
165
166 char *
167 compile(compex, sp, regex, fold)
168 register COMPEX *compex;
169 register char   *sp;
170 int regex;
171 int fold;
172 {
173     register int c;
174     register char  *cp;
175     char   *lastcp;
176     char    paren[MAXSUB],
177            *parenp;
178     char **alt = compex->alternatives;
179     char *retmes = "Badly formed search string";
180  
181     case_fold(compex->do_folding = fold);
182     if (compex->precomp)
183         safefree(compex->precomp);
184     compex->precomp = savestr(sp);
185     if (!compex->complen) {
186         compex->compbuf = safemalloc(84);
187         compex->complen = 80;
188     }
189     cp = compex->compbuf;               /* point at compiled buffer */
190     *alt++ = cp;                        /* first alternative starts here */
191     parenp = paren;                     /* first paren goes here */
192     if (*sp == 0) {                     /* nothing to compile? */
193 #ifdef NOTDEF
194         if (*cp == 0)                   /* nothing there yet? */
195             return "Null search string";
196 #endif
197         if (*cp)
198             return Nullch;                      /* just keep old expression */
199     }
200     compex->numsubs = 0;                        /* no parens yet */
201     lastcp = 0;
202     for (;;) {
203         if (cp - compex->compbuf >= compex->complen) {
204             char *ocompbuf = compex->compbuf;
205
206             grow_comp(compex);
207             if (ocompbuf != compex->compbuf) {  /* adjust pointers? */
208                 char **tmpalt;
209
210                 cp = compex->compbuf + (cp - ocompbuf);
211                 if (lastcp)
212                     lastcp = compex->compbuf + (lastcp - ocompbuf);
213                 for (tmpalt = compex->alternatives; tmpalt < alt; tmpalt++)
214                     if (*tmpalt)
215                         *tmpalt = compex->compbuf + (*tmpalt - ocompbuf);
216             }
217         }
218         c = *sp++;                      /* get next char of pattern */
219         if (c == 0) {                   /* end of pattern? */
220             if (parenp != paren) {      /* balanced parentheses? */
221 #ifdef VERBOSE
222                 retmes = "Missing right parenthesis";
223 #endif
224                 goto badcomp;
225             }
226             *cp++ = FINIS;              /* append a stopper */
227             *alt++ = 0;                 /* terminate alternative list */
228             /*
229             compex->complen = cp - compex->compbuf + 1;
230             compex->compbuf = saferealloc(compex->compbuf,compex->complen+4); */
231             return Nullch;              /* return success */
232         }
233         if (c != '*' && c != '?' && c != '+')
234             lastcp = cp;
235         if (!regex) {                   /* just a normal search string? */
236             *cp++ = CHAR;               /* everything is a normal char */
237             *cp++ = trans[c];
238         }
239         else                            /* it is a regular expression */
240             switch (c) {
241  
242                 default:
243                   normal_char:
244                     *cp++ = CHAR;
245                     *cp++ = trans[c];
246                     continue;
247
248                 case '.':
249                     *cp++ = ANY;
250                     continue;
251  
252                 case '[': {             /* character class */
253                     register int i;
254                     
255                     if (cp - compex->compbuf >= compex->complen - BMAPSIZ) {
256                         char *ocompbuf = compex->compbuf;
257
258                         grow_comp(compex);      /* reserve bitmap */
259                         if (ocompbuf != compex->compbuf) {/* adjust pointers? */
260                             char **tmpalt;
261
262                             cp = compex->compbuf + (cp - ocompbuf);
263                             if (lastcp)
264                                 lastcp = compex->compbuf + (lastcp - ocompbuf);
265                             for (tmpalt = compex->alternatives; tmpalt < alt;
266                               tmpalt++)
267                                 if (*tmpalt)
268                                     *tmpalt =
269                                         compex->compbuf + (*tmpalt - ocompbuf);
270                         }
271                     }
272                     for (i = BMAPSIZ; i; --i)
273                         cp[i] = 0;
274                     
275                     if ((c = *sp++) == '^') {
276                         c = *sp++;
277                         *cp++ = NCCL;   /* negated */
278                     }
279                     else
280                         *cp++ = CCL;    /* normal */
281                     
282                     i = 0;              /* remember oldchar */
283                     do {
284                         if (c == '\0') {
285 #ifdef VERBOSE
286                             retmes = "Missing ]";
287 #endif
288                             goto badcomp;
289                         }
290                         if (c == '\\' && *sp) {
291                             switch (*sp) {
292                             default:
293                                 c = *sp++;
294                                 break;
295                             case '0': case '1': case '2': case '3':
296                             case '4': case '5': case '6': case '7':
297                                 c = *sp++ - '0';
298                                 if (index("01234567",*sp)) {
299                                     c <<= 3;
300                                     c += *sp++ - '0';
301                                 }
302                                 if (index("01234567",*sp)) {
303                                     c <<= 3;
304                                     c += *sp++ - '0';
305                                 }
306                                 break;
307                             case 'b':
308                                 c = '\b';
309                                 sp++;
310                                 break;
311                             case 'n':
312                                 c = '\n';
313                                 sp++;
314                                 break;
315                             case 'r':
316                                 c = '\r';
317                                 sp++;
318                                 break;
319                             case 'f':
320                                 c = '\f';
321                                 sp++;
322                                 break;
323                             case 't':
324                                 c = '\t';
325                                 sp++;
326                                 break;
327                             }
328                         }
329                         if (*sp == '-' && *(++sp))
330                             i = *sp++;
331                         else
332                             i = c;
333                         while (c <= i) {
334                             cp[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE);
335                             if (fold && isalpha(c))
336                                 cp[(c ^ 32) / BITSPERBYTE] |=
337                                     1 << ((c ^ 32) % BITSPERBYTE);
338                                         /* set the other bit too */
339                             c++;
340                         }
341                     } while ((c = *sp++) != ']');
342                     if (cp[-1] == NCCL)
343                         cp[0] |= 1;
344                     cp += BMAPSIZ;
345                     continue;
346                 }
347  
348                 case '^':
349                     if (cp != compex->compbuf && cp[-1] != FINIS)
350                         goto normal_char;
351                     *cp++ = BEG;
352                     continue;
353  
354                 case '$':
355                     if (isdigit(*sp)) {
356                         *cp++ = REF;
357                         *cp++ = *sp - '0';
358                         break;
359                     }
360                     if (*sp && *sp != '|')
361                         goto normal_char;
362                     *cp++ = END;
363                     continue;
364  
365                 case '*': case '?': case '+':
366                     if (lastcp == 0 ||
367                         (*lastcp & (MINZERO|MAXINF)) ||
368                         *lastcp == LPAR ||
369                         *lastcp == RPAR ||
370                         *lastcp == BEG ||
371                         *lastcp == END ||
372                         *lastcp == WBOUND ||
373                         *lastcp == NWBOUND )
374                         goto normal_char;
375                     if (c != '+')
376                         *lastcp |= MINZERO;
377                     if (c != '?')
378                         *lastcp |= MAXINF;
379                     continue;
380  
381                 case '(':
382                     if (compex->numsubs >= MAXSUB) {
383 #ifdef VERBOSE
384                         retmes = "Too many parens";
385 #endif
386                         goto badcomp;
387                     }
388                     *parenp++ = ++compex->numsubs;
389                     *cp++ = LPAR;
390                     *cp++ = compex->numsubs;
391                     break;
392                 case ')':
393                     if (parenp <= paren) {
394 #ifdef VERBOSE
395                         retmes = "Unmatched right paren";
396 #endif
397                         goto badcomp;
398                     }
399                     *cp++ = RPAR;
400                     *cp++ = *--parenp;
401                     break;
402                 case '|':
403                     if (parenp>paren) {
404 #ifdef VERBOSE
405                         retmes = "No | in subpattern";  /* Sigh! */
406 #endif
407                         goto badcomp;
408                     }
409                     *cp++ = FINIS;
410                     if (alt - compex->alternatives >= MAXALT) {
411 #ifdef VERBOSE
412                         retmes = "Too many alternatives";
413 #endif
414                         goto badcomp;
415                     }
416                     *alt++ = cp;
417                     break;
418                 case '\\':              /* backslashed thingie */
419                     switch (c = *sp++) {
420                     case '0': case '1': case '2': case '3': case '4':
421                     case '5': case '6': case '7': case '8': case '9':
422                         *cp++ = REF;
423                         *cp++ = c - '0';
424                         break;
425                     case 'w':
426                         *cp++ = WORD;
427                         break;
428                     case 'W':
429                         *cp++ = NWORD;
430                         break;
431                     case 'b':
432                         *cp++ = WBOUND;
433                         break;
434                     case 'B':
435                         *cp++ = NWBOUND;
436                         break;
437                     default:
438                         *cp++ = CHAR;
439                         if (c == '\0')
440                             goto badcomp;
441                         switch (c) {
442                         case 'n':
443                             c = '\n';
444                             break;
445                         case 'r':
446                             c = '\r';
447                             break;
448                         case 'f':
449                             c = '\f';
450                             break;
451                         case 't':
452                             c = '\t';
453                             break;
454                         }
455                         *cp++ = c;
456                         break;
457                     }
458                     break;
459             }
460     }
461 badcomp:
462     compex->compbuf[0] = 0;
463     compex->numsubs = 0;
464     return retmes;
465 }
466
467 void
468 grow_comp(compex)
469 register COMPEX *compex;
470 {
471     compex->complen += 80;
472     compex->compbuf = saferealloc(compex->compbuf, (MEM_SIZE)compex->complen + 4);
473 }
474
475 char *
476 execute(compex, addr, beginning, minend)
477 register COMPEX *compex;
478 char *addr;
479 bool beginning;
480 int minend;
481 {
482     register char *p1 = addr;
483     register char *trt = trans;
484     register int c;
485     register int scr;
486     register int c2;
487  
488     if (addr == Nullch)
489         return Nullch;
490     if (compex->numsubs) {                      /* any submatches? */
491         for (c = 0; c <= compex->numsubs; c++)
492             compex->subbeg[c] = compex->subend[c] = Nullch;
493     }
494     case_fold(compex->do_folding);      /* make sure table is correct */
495     if (beginning)
496         FirstCharacter = p1;            /* for ^ tests */
497     else {
498         if (multiline || compex->alternatives[1] || compex->compbuf[0] != BEG)
499             FirstCharacter = Nullch;
500         else
501             return Nullch;              /* can't match */
502     }
503     matchend = Nullch;
504     matchtill = addr + minend;
505     err = 0;
506     if (compex->compbuf[0] == CHAR && !compex->alternatives[1]) {
507         if (compex->do_folding) {
508             c = compex->compbuf[1];     /* fast check for first character */
509             do {
510                 if (trt[*p1] == c && try(compex, p1, compex->compbuf))
511                     goto got_it;
512             } while (*p1++ && !err);
513         }
514         else {
515             c = compex->compbuf[1];     /* faster check for first character */
516             if (compex->compbuf[2] == CHAR)
517                 c2 = compex->compbuf[3];
518             else
519                 c2 = 0;
520             do {
521               false_alarm:
522                 while (scr = *p1++, scr && scr != c) ;
523                 if (!scr)
524                     break;
525                 if (c2 && *p1 != c2)    /* and maybe even second character */
526                     goto false_alarm;
527                 if (try(compex, p1, compex->compbuf+2)) {
528                     p1--;
529                     goto got_it;
530                 }
531             } while (!err);
532         }
533         return Nullch;
534     }
535     else {                      /* normal algorithm */
536         do {
537             register char **alt = compex->alternatives;
538             while (*alt) {
539                 if (try(compex, p1, *alt++))
540                     goto got_it;
541             }
542         } while (*p1++ && err < FATAL);
543         return Nullch;
544     }
545
546 got_it:
547     if (compex->numsubs) {                      /* any parens? */
548         trt = savestr(addr);            /* in case addr is not static */
549         if (compex->subbase)
550             safefree(compex->subbase);  /* (may be freeing addr!) */
551         compex->subbase = trt;
552         scr = compex->subbase - addr;
553         p1 += scr;
554         matchend += scr;
555         for (c = 0; c <= compex->numsubs; c++) {
556             if (compex->subend[c]) {
557                 compex->subbeg[c] += scr;
558                 compex->subend[c] += scr;
559             }
560         }
561     }
562     compex->subend[0] = matchend;
563     compex->subbeg[0] = p1;
564     return p1;
565 }
566  
567 bool
568 try(compex, sp, cp)
569 COMPEX *compex;
570 register char *cp;
571 register char *sp;
572 {
573     register char *basesp;
574     register char *trt = trans;
575     register int i;
576     register int backlen;
577     register int code;
578  
579     while (*sp || (*cp & MAXINF) || *cp == BEG || *cp == RPAR ||
580         *cp == WBOUND || *cp == NWBOUND) {
581         switch ((code = *cp++) & CODEMASK) {
582  
583             case CHAR:
584                 basesp = sp;
585                 i = *cp++;
586                 if (code & MAXINF)
587                     while (*sp && trt[*sp] == i) sp++;
588                 else
589                     if (*sp && trt[*sp] == i) sp++;
590                 backlen = 1;
591                 goto backoff;
592  
593           backoff:
594                 while (sp > basesp) {
595                     if (try(compex, sp, cp))
596                         goto right;
597                     sp -= backlen;
598                 }
599                 if (code & MINZERO)
600                     continue;
601                 goto wrong;
602  
603             case ANY:
604                 basesp = sp;
605                 if (code & MAXINF)
606                     while (*sp && *sp != '\n') sp++;
607                 else
608                     if (*sp && *sp != '\n') sp++;
609                 backlen = 1;
610                 goto backoff;
611
612             case CCL:
613                 basesp = sp;
614                 if (code & MAXINF)
615                     while (*sp && cclass(cp, *sp, 1)) sp++;
616                 else
617                     if (*sp && cclass(cp, *sp, 1)) sp++;
618                 cp += BMAPSIZ;
619                 backlen = 1;
620                 goto backoff;
621  
622             case NCCL:
623                 basesp = sp;
624                 if (code & MAXINF)
625                     while (*sp && cclass(cp, *sp, 0)) sp++;
626                 else
627                     if (*sp && cclass(cp, *sp, 0)) sp++;
628                 cp += BMAPSIZ;
629                 backlen = 1;
630                 goto backoff;
631  
632             case END:
633                 if (!*sp || *sp == '\n') {
634                     matchtill--;
635                     continue;
636                 }
637                 goto wrong;
638  
639             case BEG:
640                 if (sp == FirstCharacter || (
641                     *sp && sp[-1] == '\n') ) {
642                     matchtill--;
643                     continue;
644                 }
645                 if (!multiline)         /* no point in advancing more */
646                     err = BEGFAIL;
647                 goto wrong;
648  
649             case WORD:
650                 basesp = sp;
651                 if (code & MAXINF)
652                     while (*sp && isalnum(*sp)) sp++;
653                 else
654                     if (*sp && isalnum(*sp)) sp++;
655                 backlen = 1;
656                 goto backoff;
657  
658             case NWORD:
659                 basesp = sp;
660                 if (code & MAXINF)
661                     while (*sp && !isalnum(*sp)) sp++;
662                 else
663                     if (*sp && !isalnum(*sp)) sp++;
664                 backlen = 1;
665                 goto backoff;
666  
667             case WBOUND:
668                 if ((sp == FirstCharacter || !isalnum(sp[-1])) !=
669                         (!*sp || !isalnum(*sp)) )
670                     continue;
671                 goto wrong;
672  
673             case NWBOUND:
674                 if ((sp == FirstCharacter || !isalnum(sp[-1])) ==
675                         (!*sp || !isalnum(*sp)))
676                     continue;
677                 goto wrong;
678  
679             case FINIS:
680                 goto right;
681  
682             case LPAR:
683                 compex->subbeg[*cp++] = sp;
684                 continue;
685  
686             case RPAR:
687                 i = *cp++;
688                 compex->subend[i] = sp;
689                 compex->lastparen = i;
690                 continue;
691  
692             case REF:
693                 if (compex->subend[i = *cp++] == 0) {
694                     fputs("Bad subpattern reference\n",stdout) FLUSH;
695                     err = FATAL;
696                     goto wrong;
697                 }
698                 basesp = sp;
699                 backlen = compex->subend[i] - compex->subbeg[i];
700                 if (code & MAXINF)
701                     while (*sp && subpat(compex, i, sp)) sp += backlen;
702                 else
703                     if (*sp && subpat(compex, i, sp)) sp += backlen;
704                 goto backoff;
705  
706             default:
707                 fputs("Botched pattern compilation\n",stdout) FLUSH;
708                 err = FATAL;
709                 return -1;
710         }
711     }
712     if (*cp == FINIS || *cp == END) {
713 right:
714         if (matchend == Nullch || sp > matchend)
715             matchend = sp;
716         return matchend >= matchtill;
717     }
718 wrong:
719     matchend = Nullch;
720     return FALSE;
721 }
722  
723 bool
724 subpat(compex, i, sp)
725 register COMPEX *compex;
726 register int i;
727 register char *sp;
728 {
729     register char *bp;
730  
731     bp = compex->subbeg[i];
732     while (*sp && *bp == *sp) {
733         bp++;
734         sp++;
735         if (bp >= compex->subend[i])
736             return TRUE;
737     }
738     return FALSE;
739 }
740
741 bool
742 cclass(set, c, af)
743 register char  *set;
744 register int c;
745 {
746     c &= 0177;
747 #if BITSPERBYTE == 8
748     if (set[c >> 3] & 1 << (c & 7))
749 #else
750     if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE))
751 #endif
752         return af;
753     return !af;
754 }