This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl 1.0 patch 14: a2p incorrectly translates 'for (a in b)' construct.
[perl5.git] / search.c
CommitLineData
a559c259 1/* $Header: search.c,v 1.0.1.2 88/01/28 10:30:46 root Exp $
8d063cd8
LW
2 *
3 * $Log: search.c,v $
a559c259
LW
4 * Revision 1.0.1.2 88/01/28 10:30:46 root
5 * patch8: uncommented free_compex for use with eval operator.
6 *
135863df
AB
7 * Revision 1.0.1.1 88/01/24 03:55:05 root
8 * patch 2: made depend on perl.h.
9 *
8d063cd8
LW
10 * Revision 1.0 87/12/18 13:05:59 root
11 * Initial revision
12 *
13 */
14
15/* string search routines */
16
8d063cd8
LW
17#include "EXTERN.h"
18#include "handy.h"
19#include "util.h"
20#include "INTERN.h"
21#include "search.h"
135863df
AB
22#include "EXTERN.h"
23#include "perl.h"
8d063cd8
LW
24
25#define VERBOSE
26#define FLUSH
27#define MEM_SIZE int
28
29#ifndef BITSPERBYTE
30#define BITSPERBYTE 8
31#endif
32
33#define BMAPSIZ (127 / BITSPERBYTE + 1)
34
35#define CHAR 0 /* a normal character */
36#define ANY 1 /* . matches anything except newline */
37#define CCL 2 /* [..] character class */
38#define NCCL 3 /* [^..]negated character class */
39#define BEG 4 /* ^ beginning of a line */
40#define END 5 /* $ end of a line */
41#define LPAR 6 /* ( begin sub-match */
42#define RPAR 7 /* ) end sub-match */
43#define REF 8 /* \N backreference to the Nth submatch */
44#define WORD 9 /* \w matches alphanumeric character */
45#define NWORD 10 /* \W matches non-alphanumeric character */
46#define WBOUND 11 /* \b matches word boundary */
47#define NWBOUND 12 /* \B matches non-boundary */
48#define FINIS 13 /* the end of the pattern */
49
50#define CODEMASK 15
51
52/* Quantifiers: */
53
54#define MINZERO 16 /* minimum is 0, not 1 */
55#define MAXINF 32 /* maximum is infinity, not 1 */
56
57#define ASCSIZ 0200
58typedef char TRANSTABLE[ASCSIZ];
59
60static TRANSTABLE trans = {
610000,0001,0002,0003,0004,0005,0006,0007,
620010,0011,0012,0013,0014,0015,0016,0017,
630020,0021,0022,0023,0024,0025,0026,0027,
640030,0031,0032,0033,0034,0035,0036,0037,
650040,0041,0042,0043,0044,0045,0046,0047,
660050,0051,0052,0053,0054,0055,0056,0057,
670060,0061,0062,0063,0064,0065,0066,0067,
680070,0071,0072,0073,0074,0075,0076,0077,
690100,0101,0102,0103,0104,0105,0106,0107,
700110,0111,0112,0113,0114,0115,0116,0117,
710120,0121,0122,0123,0124,0125,0126,0127,
720130,0131,0132,0133,0134,0135,0136,0137,
730140,0141,0142,0143,0144,0145,0146,0147,
740150,0151,0152,0153,0154,0155,0156,0157,
750160,0161,0162,0163,0164,0165,0166,0167,
760170,0171,0172,0173,0174,0175,0176,0177,
77};
78static bool folding = FALSE;
79
80static int err;
81#define NOERR 0
82#define BEGFAIL 1
83#define FATAL 2
84
85static char *FirstCharacter;
86static char *matchend;
87static char *matchtill;
88
89void
90search_init()
91{
92#ifdef UNDEF
93 register int i;
94
95 for (i = 0; i < ASCSIZ; i++)
96 trans[i] = i;
97#else
98 ;
99#endif
100}
101
102void
103init_compex(compex)
104register COMPEX *compex;
105{
106 /* the following must start off zeroed */
107
108 compex->precomp = Nullch;
109 compex->complen = 0;
110 compex->subbase = Nullch;
111}
112
8d063cd8
LW
113void
114free_compex(compex)
115register COMPEX *compex;
116{
117 if (compex->complen) {
118 safefree(compex->compbuf);
119 compex->complen = 0;
120 }
121 if (compex->subbase) {
122 safefree(compex->subbase);
123 compex->subbase = Nullch;
124 }
125}
8d063cd8
LW
126
127static char *gbr_str = Nullch;
128static int gbr_siz = 0;
129
130char *
131getparen(compex,n)
132register COMPEX *compex;
133int n;
134{
135 int length = compex->subend[n] - compex->subbeg[n];
136
137 if (!n &&
138 (!compex->numsubs || n > compex->numsubs || !compex->subend[n] || length<0))
139 return "";
140 growstr(&gbr_str, &gbr_siz, length+1);
141 safecpy(gbr_str, compex->subbeg[n], length+1);
142 return gbr_str;
143}
144
145void
146case_fold(which)
147int which;
148{
149 register int i;
150
151 if (which != folding) {
152 if (which) {
153 for (i = 'A'; i <= 'Z'; i++)
154 trans[i] = tolower(i);
155 }
156 else {
157 for (i = 'A'; i <= 'Z'; i++)
158 trans[i] = i;
159 }
160 folding = which;
161 }
162}
163
164/* Compile the regular expression into internal form */
165
166char *
167compile(compex, sp, regex, fold)
168register COMPEX *compex;
169register char *sp;
170int regex;
171int fold;
172{
173 register int c;
174 register char *cp;
175 char *lastcp;
176 char paren[MAXSUB],
177 *parenp;
178 char **alt = compex->alternatives;
179 char *retmes = "Badly formed search string";
180
181 case_fold(compex->do_folding = fold);
182 if (compex->precomp)
183 safefree(compex->precomp);
184 compex->precomp = savestr(sp);
185 if (!compex->complen) {
186 compex->compbuf = safemalloc(84);
187 compex->complen = 80;
188 }
189 cp = compex->compbuf; /* point at compiled buffer */
190 *alt++ = cp; /* first alternative starts here */
191 parenp = paren; /* first paren goes here */
192 if (*sp == 0) { /* nothing to compile? */
193#ifdef NOTDEF
194 if (*cp == 0) /* nothing there yet? */
195 return "Null search string";
196#endif
197 if (*cp)
198 return Nullch; /* just keep old expression */
199 }
200 compex->numsubs = 0; /* no parens yet */
201 lastcp = 0;
202 for (;;) {
203 if (cp - compex->compbuf >= compex->complen) {
204 char *ocompbuf = compex->compbuf;
205
206 grow_comp(compex);
207 if (ocompbuf != compex->compbuf) { /* adjust pointers? */
208 char **tmpalt;
209
210 cp = compex->compbuf + (cp - ocompbuf);
211 if (lastcp)
212 lastcp = compex->compbuf + (lastcp - ocompbuf);
213 for (tmpalt = compex->alternatives; tmpalt < alt; tmpalt++)
214 if (*tmpalt)
215 *tmpalt = compex->compbuf + (*tmpalt - ocompbuf);
216 }
217 }
218 c = *sp++; /* get next char of pattern */
219 if (c == 0) { /* end of pattern? */
220 if (parenp != paren) { /* balanced parentheses? */
221#ifdef VERBOSE
222 retmes = "Missing right parenthesis";
223#endif
224 goto badcomp;
225 }
226 *cp++ = FINIS; /* append a stopper */
227 *alt++ = 0; /* terminate alternative list */
228 /*
229 compex->complen = cp - compex->compbuf + 1;
230 compex->compbuf = saferealloc(compex->compbuf,compex->complen+4); */
231 return Nullch; /* return success */
232 }
233 if (c != '*' && c != '?' && c != '+')
234 lastcp = cp;
235 if (!regex) { /* just a normal search string? */
236 *cp++ = CHAR; /* everything is a normal char */
237 *cp++ = trans[c];
238 }
239 else /* it is a regular expression */
240 switch (c) {
241
242 default:
243 normal_char:
244 *cp++ = CHAR;
245 *cp++ = trans[c];
246 continue;
247
248 case '.':
249 *cp++ = ANY;
250 continue;
251
252 case '[': { /* character class */
253 register int i;
254
255 if (cp - compex->compbuf >= compex->complen - BMAPSIZ) {
256 char *ocompbuf = compex->compbuf;
257
258 grow_comp(compex); /* reserve bitmap */
259 if (ocompbuf != compex->compbuf) {/* adjust pointers? */
260 char **tmpalt;
261
262 cp = compex->compbuf + (cp - ocompbuf);
263 if (lastcp)
264 lastcp = compex->compbuf + (lastcp - ocompbuf);
265 for (tmpalt = compex->alternatives; tmpalt < alt;
266 tmpalt++)
267 if (*tmpalt)
268 *tmpalt =
269 compex->compbuf + (*tmpalt - ocompbuf);
270 }
271 }
272 for (i = BMAPSIZ; i; --i)
273 cp[i] = 0;
274
275 if ((c = *sp++) == '^') {
276 c = *sp++;
277 *cp++ = NCCL; /* negated */
278 }
279 else
280 *cp++ = CCL; /* normal */
281
282 i = 0; /* remember oldchar */
283 do {
284 if (c == '\0') {
285#ifdef VERBOSE
286 retmes = "Missing ]";
287#endif
288 goto badcomp;
289 }
290 if (c == '\\' && *sp) {
291 switch (*sp) {
292 default:
293 c = *sp++;
294 break;
295 case '0': case '1': case '2': case '3':
296 case '4': case '5': case '6': case '7':
297 c = *sp++ - '0';
298 if (index("01234567",*sp)) {
299 c <<= 3;
300 c += *sp++ - '0';
301 }
302 if (index("01234567",*sp)) {
303 c <<= 3;
304 c += *sp++ - '0';
305 }
306 break;
307 case 'b':
308 c = '\b';
309 sp++;
310 break;
311 case 'n':
312 c = '\n';
313 sp++;
314 break;
315 case 'r':
316 c = '\r';
317 sp++;
318 break;
319 case 'f':
320 c = '\f';
321 sp++;
322 break;
323 case 't':
324 c = '\t';
325 sp++;
326 break;
327 }
328 }
329 if (*sp == '-' && *(++sp))
330 i = *sp++;
331 else
332 i = c;
333 while (c <= i) {
334 cp[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE);
335 if (fold && isalpha(c))
336 cp[(c ^ 32) / BITSPERBYTE] |=
337 1 << ((c ^ 32) % BITSPERBYTE);
338 /* set the other bit too */
339 c++;
340 }
341 } while ((c = *sp++) != ']');
342 if (cp[-1] == NCCL)
343 cp[0] |= 1;
344 cp += BMAPSIZ;
345 continue;
346 }
347
348 case '^':
349 if (cp != compex->compbuf && cp[-1] != FINIS)
350 goto normal_char;
351 *cp++ = BEG;
352 continue;
353
354 case '$':
355 if (isdigit(*sp)) {
356 *cp++ = REF;
357 *cp++ = *sp - '0';
358 break;
359 }
360 if (*sp && *sp != '|')
361 goto normal_char;
362 *cp++ = END;
363 continue;
364
365 case '*': case '?': case '+':
366 if (lastcp == 0 ||
367 (*lastcp & (MINZERO|MAXINF)) ||
368 *lastcp == LPAR ||
369 *lastcp == RPAR ||
370 *lastcp == BEG ||
371 *lastcp == END ||
372 *lastcp == WBOUND ||
373 *lastcp == NWBOUND )
374 goto normal_char;
375 if (c != '+')
376 *lastcp |= MINZERO;
377 if (c != '?')
378 *lastcp |= MAXINF;
379 continue;
380
381 case '(':
382 if (compex->numsubs >= MAXSUB) {
383#ifdef VERBOSE
384 retmes = "Too many parens";
385#endif
386 goto badcomp;
387 }
388 *parenp++ = ++compex->numsubs;
389 *cp++ = LPAR;
390 *cp++ = compex->numsubs;
391 break;
392 case ')':
393 if (parenp <= paren) {
394#ifdef VERBOSE
395 retmes = "Unmatched right paren";
396#endif
397 goto badcomp;
398 }
399 *cp++ = RPAR;
400 *cp++ = *--parenp;
401 break;
402 case '|':
403 if (parenp>paren) {
404#ifdef VERBOSE
405 retmes = "No | in subpattern"; /* Sigh! */
406#endif
407 goto badcomp;
408 }
409 *cp++ = FINIS;
410 if (alt - compex->alternatives >= MAXALT) {
411#ifdef VERBOSE
412 retmes = "Too many alternatives";
413#endif
414 goto badcomp;
415 }
416 *alt++ = cp;
417 break;
418 case '\\': /* backslashed thingie */
419 switch (c = *sp++) {
420 case '0': case '1': case '2': case '3': case '4':
421 case '5': case '6': case '7': case '8': case '9':
422 *cp++ = REF;
423 *cp++ = c - '0';
424 break;
425 case 'w':
426 *cp++ = WORD;
427 break;
428 case 'W':
429 *cp++ = NWORD;
430 break;
431 case 'b':
432 *cp++ = WBOUND;
433 break;
434 case 'B':
435 *cp++ = NWBOUND;
436 break;
437 default:
438 *cp++ = CHAR;
439 if (c == '\0')
440 goto badcomp;
441 switch (c) {
442 case 'n':
443 c = '\n';
444 break;
445 case 'r':
446 c = '\r';
447 break;
448 case 'f':
449 c = '\f';
450 break;
451 case 't':
452 c = '\t';
453 break;
454 }
455 *cp++ = c;
456 break;
457 }
458 break;
459 }
460 }
461badcomp:
462 compex->compbuf[0] = 0;
463 compex->numsubs = 0;
464 return retmes;
465}
466
467void
468grow_comp(compex)
469register COMPEX *compex;
470{
471 compex->complen += 80;
472 compex->compbuf = saferealloc(compex->compbuf, (MEM_SIZE)compex->complen + 4);
473}
474
475char *
476execute(compex, addr, beginning, minend)
477register COMPEX *compex;
478char *addr;
479bool beginning;
480int minend;
481{
482 register char *p1 = addr;
483 register char *trt = trans;
484 register int c;
485 register int scr;
486 register int c2;
487
488 if (addr == Nullch)
489 return Nullch;
490 if (compex->numsubs) { /* any submatches? */
491 for (c = 0; c <= compex->numsubs; c++)
492 compex->subbeg[c] = compex->subend[c] = Nullch;
493 }
494 case_fold(compex->do_folding); /* make sure table is correct */
495 if (beginning)
496 FirstCharacter = p1; /* for ^ tests */
497 else {
498 if (multiline || compex->alternatives[1] || compex->compbuf[0] != BEG)
499 FirstCharacter = Nullch;
500 else
501 return Nullch; /* can't match */
502 }
503 matchend = Nullch;
504 matchtill = addr + minend;
505 err = 0;
506 if (compex->compbuf[0] == CHAR && !compex->alternatives[1]) {
507 if (compex->do_folding) {
508 c = compex->compbuf[1]; /* fast check for first character */
509 do {
510 if (trt[*p1] == c && try(compex, p1, compex->compbuf))
511 goto got_it;
512 } while (*p1++ && !err);
513 }
514 else {
515 c = compex->compbuf[1]; /* faster check for first character */
516 if (compex->compbuf[2] == CHAR)
517 c2 = compex->compbuf[3];
518 else
519 c2 = 0;
520 do {
521 false_alarm:
522 while (scr = *p1++, scr && scr != c) ;
523 if (!scr)
524 break;
525 if (c2 && *p1 != c2) /* and maybe even second character */
526 goto false_alarm;
527 if (try(compex, p1, compex->compbuf+2)) {
528 p1--;
529 goto got_it;
530 }
531 } while (!err);
532 }
533 return Nullch;
534 }
535 else { /* normal algorithm */
536 do {
537 register char **alt = compex->alternatives;
538 while (*alt) {
539 if (try(compex, p1, *alt++))
540 goto got_it;
541 }
542 } while (*p1++ && err < FATAL);
543 return Nullch;
544 }
545
546got_it:
547 if (compex->numsubs) { /* any parens? */
548 trt = savestr(addr); /* in case addr is not static */
549 if (compex->subbase)
550 safefree(compex->subbase); /* (may be freeing addr!) */
551 compex->subbase = trt;
552 scr = compex->subbase - addr;
553 p1 += scr;
554 matchend += scr;
555 for (c = 0; c <= compex->numsubs; c++) {
556 if (compex->subend[c]) {
557 compex->subbeg[c] += scr;
558 compex->subend[c] += scr;
559 }
560 }
561 }
562 compex->subend[0] = matchend;
563 compex->subbeg[0] = p1;
564 return p1;
565}
566
567bool
568try(compex, sp, cp)
569COMPEX *compex;
570register char *cp;
571register char *sp;
572{
573 register char *basesp;
574 register char *trt = trans;
575 register int i;
576 register int backlen;
577 register int code;
578
579 while (*sp || (*cp & MAXINF) || *cp == BEG || *cp == RPAR ||
580 *cp == WBOUND || *cp == NWBOUND) {
581 switch ((code = *cp++) & CODEMASK) {
582
583 case CHAR:
584 basesp = sp;
585 i = *cp++;
586 if (code & MAXINF)
587 while (*sp && trt[*sp] == i) sp++;
588 else
589 if (*sp && trt[*sp] == i) sp++;
590 backlen = 1;
591 goto backoff;
592
593 backoff:
594 while (sp > basesp) {
595 if (try(compex, sp, cp))
596 goto right;
597 sp -= backlen;
598 }
599 if (code & MINZERO)
600 continue;
601 goto wrong;
602
603 case ANY:
604 basesp = sp;
605 if (code & MAXINF)
606 while (*sp && *sp != '\n') sp++;
607 else
608 if (*sp && *sp != '\n') sp++;
609 backlen = 1;
610 goto backoff;
611
612 case CCL:
613 basesp = sp;
614 if (code & MAXINF)
615 while (*sp && cclass(cp, *sp, 1)) sp++;
616 else
617 if (*sp && cclass(cp, *sp, 1)) sp++;
618 cp += BMAPSIZ;
619 backlen = 1;
620 goto backoff;
621
622 case NCCL:
623 basesp = sp;
624 if (code & MAXINF)
625 while (*sp && cclass(cp, *sp, 0)) sp++;
626 else
627 if (*sp && cclass(cp, *sp, 0)) sp++;
628 cp += BMAPSIZ;
629 backlen = 1;
630 goto backoff;
631
632 case END:
633 if (!*sp || *sp == '\n') {
634 matchtill--;
635 continue;
636 }
637 goto wrong;
638
639 case BEG:
640 if (sp == FirstCharacter || (
641 *sp && sp[-1] == '\n') ) {
642 matchtill--;
643 continue;
644 }
645 if (!multiline) /* no point in advancing more */
646 err = BEGFAIL;
647 goto wrong;
648
649 case WORD:
650 basesp = sp;
651 if (code & MAXINF)
652 while (*sp && isalnum(*sp)) sp++;
653 else
654 if (*sp && isalnum(*sp)) sp++;
655 backlen = 1;
656 goto backoff;
657
658 case NWORD:
659 basesp = sp;
660 if (code & MAXINF)
661 while (*sp && !isalnum(*sp)) sp++;
662 else
663 if (*sp && !isalnum(*sp)) sp++;
664 backlen = 1;
665 goto backoff;
666
667 case WBOUND:
668 if ((sp == FirstCharacter || !isalnum(sp[-1])) !=
669 (!*sp || !isalnum(*sp)) )
670 continue;
671 goto wrong;
672
673 case NWBOUND:
674 if ((sp == FirstCharacter || !isalnum(sp[-1])) ==
675 (!*sp || !isalnum(*sp)))
676 continue;
677 goto wrong;
678
679 case FINIS:
680 goto right;
681
682 case LPAR:
683 compex->subbeg[*cp++] = sp;
684 continue;
685
686 case RPAR:
687 i = *cp++;
688 compex->subend[i] = sp;
689 compex->lastparen = i;
690 continue;
691
692 case REF:
693 if (compex->subend[i = *cp++] == 0) {
694 fputs("Bad subpattern reference\n",stdout) FLUSH;
695 err = FATAL;
696 goto wrong;
697 }
698 basesp = sp;
699 backlen = compex->subend[i] - compex->subbeg[i];
700 if (code & MAXINF)
701 while (*sp && subpat(compex, i, sp)) sp += backlen;
702 else
703 if (*sp && subpat(compex, i, sp)) sp += backlen;
704 goto backoff;
705
706 default:
707 fputs("Botched pattern compilation\n",stdout) FLUSH;
708 err = FATAL;
709 return -1;
710 }
711 }
712 if (*cp == FINIS || *cp == END) {
713right:
714 if (matchend == Nullch || sp > matchend)
715 matchend = sp;
716 return matchend >= matchtill;
717 }
718wrong:
719 matchend = Nullch;
720 return FALSE;
721}
722
723bool
724subpat(compex, i, sp)
725register COMPEX *compex;
726register int i;
727register char *sp;
728{
729 register char *bp;
730
731 bp = compex->subbeg[i];
732 while (*sp && *bp == *sp) {
733 bp++;
734 sp++;
735 if (bp >= compex->subend[i])
736 return TRUE;
737 }
738 return FALSE;
739}
740
741bool
742cclass(set, c, af)
743register char *set;
744register int c;
745{
746 c &= 0177;
747#if BITSPERBYTE == 8
748 if (set[c >> 3] & 1 << (c & 7))
749#else
750 if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE))
751#endif
752 return af;
753 return !af;
754}