This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: Namespace cleanup: Does SDBM need binary compatibility?
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
f0fcb552 22/*SUPPRESS 112*/
a687059c 23/*
e50aee73 24 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
25 *
26 * Copyright (c) 1986 by University of Toronto.
27 * Written by Henry Spencer. Not derived from licensed software.
28 *
29 * Permission is granted to anyone to use this software for any
30 * purpose on any computer system, and to redistribute it freely,
31 * subject to the following restrictions:
32 *
33 * 1. The author is not responsible for the consequences of use of
34 * this software, no matter how awful, even if they arise
35 * from defects in it.
36 *
37 * 2. The origin of this software must not be misrepresented, either
38 * by explicit claim or by omission.
39 *
40 * 3. Altered versions must be plainly marked as such, and must not
41 * be misrepresented as being the original software.
42 *
43 **** Alterations to Henry's code are...
44 ****
a0d0e21e 45 **** Copyright (c) 1991-1994, Larry Wall
a687059c 46 ****
9ef589d8
LW
47 **** You may distribute under the terms of either the GNU General Public
48 **** License or the Artistic License, as specified in the README file.
a687059c
LW
49 *
50 * Beware that some of this code is subtly aware of the way operator
51 * precedence is structured in regular expressions. Serious changes in
52 * regular-expression syntax might require a total rethink.
53 */
54#include "EXTERN.h"
55#include "perl.h"
56#include "regcomp.h"
57
58#ifndef STATIC
59#define STATIC static
60#endif
61
62#ifdef DEBUGGING
a0d0e21e
LW
63static I32 regnarrate = 0;
64static char* regprogram = 0;
a687059c
LW
65#endif
66
a0d0e21e
LW
67/* Current curly descriptor */
68typedef struct curcur CURCUR;
69struct curcur {
70 int parenfloor; /* how far back to strip paren data */
71 int cur; /* how many instances of scan we've matched */
72 int min; /* the minimal number of scans to match */
73 int max; /* the maximal number of scans to match */
74 int minmod; /* whether to work our way up or down */
75 char * scan; /* the thing to match */
76 char * next; /* what has to match after it */
77 char * lastloc; /* where we started matching this scan */
78 CURCUR * oldcc; /* current curly before we started this one */
79};
80
81static CURCUR* regcc;
82
83typedef I32 CHECKPOINT;
84
55497cff 85static CHECKPOINT regcppush _((I32 parenfloor));
86static char * regcppop _((void));
a0d0e21e 87
55497cff 88static CHECKPOINT
a0d0e21e
LW
89regcppush(parenfloor)
90I32 parenfloor;
91{
92 int retval = savestack_ix;
93 int i = (regsize - parenfloor) * 3;
94 int p;
95
96 SSCHECK(i + 5);
97 for (p = regsize; p > parenfloor; p--) {
98 SSPUSHPTR(regendp[p]);
99 SSPUSHPTR(regstartp[p]);
100 SSPUSHINT(p);
101 }
102 SSPUSHINT(regsize);
103 SSPUSHINT(*reglastparen);
104 SSPUSHPTR(reginput);
105 SSPUSHINT(i + 3);
106 SSPUSHINT(SAVEt_REGCONTEXT);
107 return retval;
108}
109
55497cff 110static char *
a0d0e21e
LW
111regcppop()
112{
113 I32 i = SSPOPINT;
114 U32 paren = 0;
115 char *input;
116 char *tmps;
117 assert(i == SAVEt_REGCONTEXT);
118 i = SSPOPINT;
119 input = (char *) SSPOPPTR;
120 *reglastparen = SSPOPINT;
121 regsize = SSPOPINT;
122 for (i -= 3; i > 0; i -= 3) {
123 paren = (U32)SSPOPINT;
124 regstartp[paren] = (char *) SSPOPPTR;
125 tmps = (char*)SSPOPPTR;
126 if (paren <= *reglastparen)
127 regendp[paren] = tmps;
128 }
129 for (paren = *reglastparen + 1; paren <= regnpar; paren++) {
130 if (paren > regsize)
131 regstartp[paren] = Nullch;
132 regendp[paren] = Nullch;
133 }
134 return input;
135}
136
137#define regcpblow(cp) leave_scope(cp)
138
a687059c 139/*
e50aee73 140 * pregexec and friends
a687059c
LW
141 */
142
143/*
a687059c
LW
144 * Forwards.
145 */
a0d0e21e
LW
146
147static I32 regmatch _((char *prog));
148static I32 regrepeat _((char *p, I32 max));
149static I32 regtry _((regexp *prog, char *startpos));
bbce6d69 150static bool reginclass _((char *p, I32 c));
151
152static bool regtainted; /* tainted information used? */
a687059c
LW
153
154/*
e50aee73 155 - pregexec - match a regexp against a string
a687059c 156 */
79072805 157I32
e50aee73 158pregexec(prog, stringarg, strend, strbeg, minend, screamer, safebase)
a687059c
LW
159register regexp *prog;
160char *stringarg;
161register char *strend; /* pointer to null at end of string */
162char *strbeg; /* real beginning of string */
79072805
LW
163I32 minend; /* end of match must be at least minend after stringarg */
164SV *screamer;
165I32 safebase; /* no need to remember string in subbase */
a687059c 166{
a0d0e21e 167 register char *s;
a0d0e21e
LW
168 register char *c;
169 register char *startpos = stringarg;
170 register I32 tmp;
171 I32 minlen = 0; /* must match at least this many chars */
172 I32 dontbother = 0; /* how many characters not to try at end */
173 CURCUR cc;
a687059c 174
a0d0e21e 175 cc.cur = 0;
4633a7c4 176 cc.oldcc = 0;
a0d0e21e
LW
177 regcc = &cc;
178
179#ifdef DEBUGGING
180 regnarrate = debug & 512;
181 regprogram = prog->program;
182#endif
183
184 /* Be paranoid... */
185 if (prog == NULL || startpos == NULL) {
186 croak("NULL regexp parameter");
187 return 0;
188 }
189
190 if (startpos == strbeg) /* is ^ valid at stringarg? */
191 regprev = '\n';
192 else {
193 regprev = stringarg[-1];
194 if (!multiline && regprev == '\n')
195 regprev = '\0'; /* force ^ to NOT match */
196 }
bbce6d69 197
a0d0e21e 198 regprecomp = prog->precomp;
a0d0e21e
LW
199 /* Check validity of program. */
200 if (UCHARAT(prog->program) != MAGIC) {
201 FAIL("corrupted regexp program");
202 }
203
bbce6d69 204 regnpar = prog->nparens;
205 regtainted = FALSE;
a0d0e21e
LW
206
207 /* If there is a "must appear" string, look for it. */
208 s = startpos;
209 if (prog->regmust != Nullsv &&
210 (!(prog->reganch & ROPT_ANCH)
211 || (multiline && prog->regback >= 0)) )
212 {
213 if (stringarg == strbeg && screamer) {
214 if (screamfirst[BmRARE(prog->regmust)] >= 0)
215 s = screaminstr(screamer,prog->regmust);
216 else
217 s = Nullch;
0a12ae7d 218 }
a0d0e21e
LW
219 else
220 s = fbm_instr((unsigned char*)s, (unsigned char*)strend,
221 prog->regmust);
222 if (!s) {
223 ++BmUSEFUL(prog->regmust); /* hooray */
224 goto phooey; /* not present */
a687059c 225 }
a0d0e21e
LW
226 else if (prog->regback >= 0) {
227 s -= prog->regback;
228 if (s < startpos)
229 s = startpos;
230 minlen = prog->regback + SvCUR(prog->regmust);
a687059c 231 }
a0d0e21e
LW
232 else if (!prog->naughty && --BmUSEFUL(prog->regmust) < 0) { /* boo */
233 SvREFCNT_dec(prog->regmust);
234 prog->regmust = Nullsv; /* disable regmust */
235 s = startpos;
236 }
237 else {
238 s = startpos;
239 minlen = SvCUR(prog->regmust);
a687059c 240 }
a0d0e21e 241 }
a687059c 242
a0d0e21e
LW
243 /* Mark beginning of line for ^ . */
244 regbol = startpos;
a687059c 245
a0d0e21e
LW
246 /* Mark end of line for $ (and such) */
247 regeol = strend;
a687059c 248
a0d0e21e
LW
249 /* see how far we have to get to not match where we matched before */
250 regtill = startpos+minend;
a687059c 251
a0d0e21e
LW
252 /* Simplest case: anchored match need be tried only once. */
253 /* [unless multiline is set] */
254 if (prog->reganch & ROPT_ANCH) {
255 if (regtry(prog, startpos))
256 goto got_it;
257 else if (multiline || (prog->reganch & ROPT_IMPLICIT)) {
258 if (minlen)
259 dontbother = minlen - 1;
260 strend -= dontbother;
261 /* for multiline we only have to try after newlines */
262 if (s > startpos)
263 s--;
264 while (s < strend) {
265 if (*s++ == '\n') {
266 if (s < strend && regtry(prog, s))
267 goto got_it;
268 }
35c8bce7 269 }
35c8bce7 270 }
a0d0e21e
LW
271 goto phooey;
272 }
35c8bce7 273
a0d0e21e
LW
274 /* Messy cases: unanchored match. */
275 if (prog->regstart) {
276 if (prog->reganch & ROPT_SKIP) { /* we have /x+whatever/ */
277 /* it must be a one character string */
bbce6d69 278 char ch = SvPVX(prog->regstart)[0];
a0d0e21e 279 while (s < strend) {
bbce6d69 280 if (*s == ch) {
a0d0e21e 281 if (regtry(prog, s))
a687059c 282 goto got_it;
a0d0e21e 283 s++;
bbce6d69 284 while (s < strend && *s == ch)
a0d0e21e 285 s++;
a687059c 286 }
a0d0e21e
LW
287 s++;
288 }
a687059c 289 }
a0d0e21e
LW
290 else if (SvPOK(prog->regstart) == 3) {
291 /* We know what string it must start with. */
292 while ((s = fbm_instr((unsigned char*)s,
293 (unsigned char*)strend, prog->regstart)) != NULL)
294 {
295 if (regtry(prog, s))
296 goto got_it;
297 s++;
298 }
299 }
300 else {
301 c = SvPVX(prog->regstart);
302 while ((s = ninstr(s, strend, c, c + SvCUR(prog->regstart))) != NULL)
303 {
304 if (regtry(prog, s))
305 goto got_it;
306 s++;
307 }
308 }
309 goto phooey;
310 }
311 /*SUPPRESS 560*/
312 if (c = prog->regstclass) {
313 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
a687059c 314
a0d0e21e
LW
315 if (minlen)
316 dontbother = minlen - 1;
317 strend -= dontbother; /* don't bother with what can't match */
318 tmp = 1;
319 /* We know what class it must start with. */
320 switch (OP(c)) {
321 case ANYOF:
322 c = OPERAND(c);
323 while (s < strend) {
bbce6d69 324 if (reginclass(c, *s)) {
a0d0e21e
LW
325 if (tmp && regtry(prog, s))
326 goto got_it;
327 else
328 tmp = doevery;
a687059c 329 }
a0d0e21e
LW
330 else
331 tmp = 1;
332 s++;
333 }
334 break;
bbce6d69 335 case BOUNDL:
336 regtainted = TRUE;
337 /* FALL THROUGH */
a0d0e21e
LW
338 case BOUND:
339 if (minlen)
340 dontbother++,strend--;
bbce6d69 341 tmp = (s != startpos) ? UCHARAT(s - 1) : regprev;
95bac841 342 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
a0d0e21e 343 while (s < strend) {
95bac841 344 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
a0d0e21e
LW
345 tmp = !tmp;
346 if (regtry(prog, s))
347 goto got_it;
a687059c 348 }
a0d0e21e
LW
349 s++;
350 }
351 if ((minlen || tmp) && regtry(prog,s))
352 goto got_it;
353 break;
bbce6d69 354 case NBOUNDL:
355 regtainted = TRUE;
356 /* FALL THROUGH */
a0d0e21e
LW
357 case NBOUND:
358 if (minlen)
359 dontbother++,strend--;
bbce6d69 360 tmp = (s != startpos) ? UCHARAT(s - 1) : regprev;
95bac841 361 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
a0d0e21e 362 while (s < strend) {
95bac841 363 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
a0d0e21e
LW
364 tmp = !tmp;
365 else if (regtry(prog, s))
366 goto got_it;
367 s++;
368 }
369 if ((minlen || !tmp) && regtry(prog,s))
370 goto got_it;
371 break;
372 case ALNUM:
373 while (s < strend) {
bbce6d69 374 if (isALNUM(*s)) {
375 if (tmp && regtry(prog, s))
376 goto got_it;
377 else
378 tmp = doevery;
379 }
380 else
381 tmp = 1;
382 s++;
383 }
384 break;
385 case ALNUML:
386 regtainted = TRUE;
387 while (s < strend) {
388 if (isALNUM_LC(*s)) {
a0d0e21e
LW
389 if (tmp && regtry(prog, s))
390 goto got_it;
a687059c 391 else
a0d0e21e
LW
392 tmp = doevery;
393 }
394 else
395 tmp = 1;
396 s++;
397 }
398 break;
399 case NALNUM:
400 while (s < strend) {
bbce6d69 401 if (!isALNUM(*s)) {
402 if (tmp && regtry(prog, s))
403 goto got_it;
404 else
405 tmp = doevery;
406 }
407 else
408 tmp = 1;
409 s++;
410 }
411 break;
412 case NALNUML:
413 regtainted = TRUE;
414 while (s < strend) {
415 if (!isALNUM_LC(*s)) {
a0d0e21e
LW
416 if (tmp && regtry(prog, s))
417 goto got_it;
a687059c 418 else
a0d0e21e 419 tmp = doevery;
a687059c 420 }
a0d0e21e
LW
421 else
422 tmp = 1;
423 s++;
424 }
425 break;
426 case SPACE:
427 while (s < strend) {
428 if (isSPACE(*s)) {
429 if (tmp && regtry(prog, s))
430 goto got_it;
431 else
432 tmp = doevery;
2304df62 433 }
a0d0e21e
LW
434 else
435 tmp = 1;
436 s++;
437 }
438 break;
bbce6d69 439 case SPACEL:
440 regtainted = TRUE;
441 while (s < strend) {
442 if (isSPACE_LC(*s)) {
443 if (tmp && regtry(prog, s))
444 goto got_it;
445 else
446 tmp = doevery;
447 }
448 else
449 tmp = 1;
450 s++;
451 }
452 break;
a0d0e21e
LW
453 case NSPACE:
454 while (s < strend) {
455 if (!isSPACE(*s)) {
456 if (tmp && regtry(prog, s))
457 goto got_it;
458 else
459 tmp = doevery;
a687059c 460 }
a0d0e21e
LW
461 else
462 tmp = 1;
463 s++;
464 }
465 break;
bbce6d69 466 case NSPACEL:
467 regtainted = TRUE;
468 while (s < strend) {
469 if (!isSPACE_LC(*s)) {
470 if (tmp && regtry(prog, s))
471 goto got_it;
472 else
473 tmp = doevery;
474 }
475 else
476 tmp = 1;
477 s++;
478 }
479 break;
a0d0e21e
LW
480 case DIGIT:
481 while (s < strend) {
482 if (isDIGIT(*s)) {
483 if (tmp && regtry(prog, s))
484 goto got_it;
485 else
486 tmp = doevery;
2b69d0c2 487 }
a0d0e21e
LW
488 else
489 tmp = 1;
490 s++;
491 }
492 break;
493 case NDIGIT:
494 while (s < strend) {
495 if (!isDIGIT(*s)) {
496 if (tmp && regtry(prog, s))
497 goto got_it;
498 else
499 tmp = doevery;
a687059c 500 }
a0d0e21e
LW
501 else
502 tmp = 1;
503 s++;
504 }
505 break;
a687059c 506 }
a0d0e21e
LW
507 }
508 else {
509 if (minlen)
510 dontbother = minlen - 1;
511 strend -= dontbother;
512 /* We don't know much -- general case. */
513 do {
514 if (regtry(prog, s))
515 goto got_it;
516 } while (s++ < strend);
517 }
518
519 /* Failure. */
520 goto phooey;
a687059c 521
a0d0e21e 522got_it:
420218e7 523 strend += dontbother; /* uncheat */
a0d0e21e
LW
524 prog->subbeg = strbeg;
525 prog->subend = strend;
bbce6d69 526 prog->exec_tainted = regtainted;
527 if (!safebase && (prog->nparens || sawampersand)) {
528 I32 i = strend - startpos + (stringarg - strbeg);
a0d0e21e
LW
529 if (safebase) { /* no need for $digit later */
530 s = strbeg;
531 prog->subend = s+i;
532 }
533 else if (strbeg != prog->subbase) {
534 s = savepvn(strbeg,i); /* so $digit will work later */
535 if (prog->subbase)
536 Safefree(prog->subbase);
537 prog->subbeg = prog->subbase = s;
538 prog->subend = s+i;
539 }
540 else {
541 prog->subbeg = s = prog->subbase;
542 prog->subend = s+i;
543 }
544 s += (stringarg - strbeg);
545 for (i = 0; i <= prog->nparens; i++) {
546 if (prog->endp[i]) {
547 prog->startp[i] = s + (prog->startp[i] - startpos);
548 prog->endp[i] = s + (prog->endp[i] - startpos);
549 }
550 }
a0d0e21e
LW
551 }
552 return 1;
553
554phooey:
a0d0e21e 555 return 0;
a687059c
LW
556}
557
558/*
559 - regtry - try match at specific point
560 */
79072805 561static I32 /* 0 failure, 1 success */
a0d0e21e 562regtry(prog, startpos)
a687059c 563regexp *prog;
a0d0e21e 564char *startpos;
a687059c 565{
a0d0e21e
LW
566 register I32 i;
567 register char **sp;
568 register char **ep;
569
570 reginput = startpos;
571 regstartp = prog->startp;
572 regendp = prog->endp;
573 reglastparen = &prog->lastparen;
574 prog->lastparen = 0;
575 regsize = 0;
576
577 sp = prog->startp;
578 ep = prog->endp;
579 if (prog->nparens) {
580 for (i = prog->nparens; i >= 0; i--) {
581 *sp++ = NULL;
582 *ep++ = NULL;
a687059c 583 }
a0d0e21e
LW
584 }
585 if (regmatch(prog->program + 1) && reginput >= regtill) {
586 prog->startp[0] = startpos;
587 prog->endp[0] = reginput;
588 return 1;
589 }
590 else
591 return 0;
a687059c
LW
592}
593
594/*
595 - regmatch - main matching routine
596 *
597 * Conceptually the strategy is simple: check to see whether the current
598 * node matches, call self recursively to see whether the rest matches,
599 * and then act accordingly. In practice we make some effort to avoid
600 * recursion, in particular by going through "ordinary" nodes (that don't
601 * need to know whether the rest of the match failed) by a loop instead of
602 * by recursion.
603 */
604/* [lwall] I've hoisted the register declarations to the outer block in order to
605 * maybe save a little bit of pushing and popping on the stack. It also takes
606 * advantage of machines that use a register save mask on subroutine entry.
607 */
79072805 608static I32 /* 0 failure, 1 success */
a687059c
LW
609regmatch(prog)
610char *prog;
611{
a0d0e21e
LW
612 register char *scan; /* Current node. */
613 char *next; /* Next node. */
614 register I32 nextchar;
615 register I32 n; /* no or next */
616 register I32 ln; /* len or last */
617 register char *s; /* operand or save */
618 register char *locinput = reginput;
bbce6d69 619 register I32 c1, c2; /* case fold search */
a0d0e21e 620 int minmod = 0;
4633a7c4
LW
621#ifdef DEBUGGING
622 static int regindent = 0;
623 regindent++;
624#endif
a0d0e21e 625
bbce6d69 626 nextchar = UCHARAT(locinput);
a0d0e21e
LW
627 scan = prog;
628 while (scan != NULL) {
a687059c 629#ifdef DEBUGGING
4633a7c4
LW
630#define sayYES goto yes
631#define sayNO goto no
632#define saySAME(x) if (x) goto yes; else goto no
633 if (regnarrate) {
760ac839 634 PerlIO_printf(Perl_debug_log, "%*s%2d%-8.8s\t<%.10s>\n", regindent*2, "",
a0d0e21e 635 scan - regprogram, regprop(scan), locinput);
4633a7c4
LW
636 }
637#else
638#define sayYES return 1
639#define sayNO return 0
640#define saySAME(x) return x
a687059c
LW
641#endif
642
643#ifdef REGALIGN
a0d0e21e
LW
644 next = scan + NEXT(scan);
645 if (next == scan)
646 next = NULL;
a687059c 647#else
a0d0e21e 648 next = regnext(scan);
a687059c
LW
649#endif
650
a0d0e21e
LW
651 switch (OP(scan)) {
652 case BOL:
653 if (locinput == regbol
654 ? regprev == '\n'
655 : ((nextchar || locinput < regeol) && locinput[-1] == '\n') )
656 {
657 /* regtill = regbol; */
658 break;
659 }
4633a7c4 660 sayNO;
a0d0e21e
LW
661 case MBOL:
662 if (locinput == regbol
663 ? regprev == '\n'
664 : ((nextchar || locinput < regeol) && locinput[-1] == '\n') )
665 {
666 break;
667 }
4633a7c4 668 sayNO;
a0d0e21e
LW
669 case SBOL:
670 if (locinput == regbol && regprev == '\n')
671 break;
4633a7c4 672 sayNO;
a0d0e21e
LW
673 case GBOL:
674 if (locinput == regbol)
675 break;
4633a7c4 676 sayNO;
a0d0e21e
LW
677 case EOL:
678 if (multiline)
679 goto meol;
680 else
681 goto seol;
682 case MEOL:
683 meol:
684 if ((nextchar || locinput < regeol) && nextchar != '\n')
4633a7c4 685 sayNO;
a0d0e21e
LW
686 break;
687 case SEOL:
688 seol:
689 if ((nextchar || locinput < regeol) && nextchar != '\n')
4633a7c4 690 sayNO;
a0d0e21e 691 if (regeol - locinput > 1)
4633a7c4 692 sayNO;
a0d0e21e
LW
693 break;
694 case SANY:
695 if (!nextchar && locinput >= regeol)
4633a7c4 696 sayNO;
bbce6d69 697 nextchar = UCHARAT(++locinput);
a0d0e21e
LW
698 break;
699 case ANY:
700 if (!nextchar && locinput >= regeol || nextchar == '\n')
4633a7c4 701 sayNO;
bbce6d69 702 nextchar = UCHARAT(++locinput);
a0d0e21e 703 break;
bbce6d69 704 case EXACT:
a0d0e21e
LW
705 s = OPERAND(scan);
706 ln = *s++;
707 /* Inline the first character, for speed. */
95bac841 708 if (UCHARAT(s) != nextchar)
4633a7c4 709 sayNO;
a0d0e21e 710 if (regeol - locinput < ln)
4633a7c4 711 sayNO;
a026971d 712 if (ln > 1 && memcmp(s, locinput, ln) != 0)
4633a7c4 713 sayNO;
a0d0e21e 714 locinput += ln;
bbce6d69 715 nextchar = UCHARAT(locinput);
716 break;
717 case EXACTFL:
718 regtainted = TRUE;
719 /* FALL THROUGH */
720 case EXACTF:
721 s = OPERAND(scan);
722 ln = *s++;
723 /* Inline the first character, for speed. */
724 if (UCHARAT(s) != nextchar &&
725 UCHARAT(s) != ((OP(scan) == EXACTF)
726 ? fold : fold_locale)[nextchar])
727 sayNO;
728 if (regeol - locinput < ln)
729 sayNO;
730 if (ln > 1 && ((OP(scan) == EXACTF)
731 ? ibcmp : ibcmp_locale)(s, locinput, ln) != 0)
732 sayNO;
733 locinput += ln;
734 nextchar = UCHARAT(locinput);
a0d0e21e
LW
735 break;
736 case ANYOF:
737 s = OPERAND(scan);
738 if (nextchar < 0)
739 nextchar = UCHARAT(locinput);
bbce6d69 740 if (!reginclass(s, nextchar))
4633a7c4 741 sayNO;
a0d0e21e 742 if (!nextchar && locinput >= regeol)
4633a7c4 743 sayNO;
bbce6d69 744 nextchar = UCHARAT(++locinput);
a0d0e21e 745 break;
bbce6d69 746 case ALNUML:
747 regtainted = TRUE;
748 /* FALL THROUGH */
a0d0e21e
LW
749 case ALNUM:
750 if (!nextchar)
4633a7c4 751 sayNO;
bbce6d69 752 if (!(OP(scan) == ALNUM
753 ? isALNUM(nextchar) : isALNUM_LC(nextchar)))
4633a7c4 754 sayNO;
bbce6d69 755 nextchar = UCHARAT(++locinput);
a0d0e21e 756 break;
bbce6d69 757 case NALNUML:
758 regtainted = TRUE;
759 /* FALL THROUGH */
a0d0e21e
LW
760 case NALNUM:
761 if (!nextchar && locinput >= regeol)
4633a7c4 762 sayNO;
bbce6d69 763 if (OP(scan) == NALNUM
764 ? isALNUM(nextchar) : isALNUM_LC(nextchar))
4633a7c4 765 sayNO;
bbce6d69 766 nextchar = UCHARAT(++locinput);
a0d0e21e 767 break;
bbce6d69 768 case BOUNDL:
769 case NBOUNDL:
770 regtainted = TRUE;
771 /* FALL THROUGH */
a0d0e21e 772 case BOUND:
bbce6d69 773 case NBOUND:
774 /* was last char in word? */
775 ln = (locinput != regbol) ? UCHARAT(locinput - 1) : regprev;
776 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
777 ln = isALNUM(ln);
778 n = isALNUM(nextchar);
779 }
780 else {
781 ln = isALNUM_LC(ln);
782 n = isALNUM_LC(nextchar);
783 }
95bac841 784 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
4633a7c4 785 sayNO;
a0d0e21e 786 break;
bbce6d69 787 case SPACEL:
788 regtainted = TRUE;
789 /* FALL THROUGH */
a0d0e21e
LW
790 case SPACE:
791 if (!nextchar && locinput >= regeol)
4633a7c4 792 sayNO;
bbce6d69 793 if (!(OP(scan) == SPACE
794 ? isSPACE(nextchar) : isSPACE_LC(nextchar)))
4633a7c4 795 sayNO;
bbce6d69 796 nextchar = UCHARAT(++locinput);
a0d0e21e 797 break;
bbce6d69 798 case NSPACEL:
799 regtainted = TRUE;
800 /* FALL THROUGH */
a0d0e21e
LW
801 case NSPACE:
802 if (!nextchar)
4633a7c4 803 sayNO;
bbce6d69 804 if (OP(scan) == SPACE
805 ? isSPACE(nextchar) : isSPACE_LC(nextchar))
4633a7c4 806 sayNO;
bbce6d69 807 nextchar = UCHARAT(++locinput);
a0d0e21e
LW
808 break;
809 case DIGIT:
810 if (!isDIGIT(nextchar))
4633a7c4 811 sayNO;
bbce6d69 812 nextchar = UCHARAT(++locinput);
a0d0e21e
LW
813 break;
814 case NDIGIT:
815 if (!nextchar && locinput >= regeol)
4633a7c4 816 sayNO;
a0d0e21e 817 if (isDIGIT(nextchar))
4633a7c4 818 sayNO;
bbce6d69 819 nextchar = UCHARAT(++locinput);
a0d0e21e
LW
820 break;
821 case REF:
822 n = ARG1(scan); /* which paren pair */
823 s = regstartp[n];
824 if (!s)
4633a7c4 825 sayNO;
a0d0e21e 826 if (!regendp[n])
4633a7c4 827 sayNO;
a0d0e21e
LW
828 if (s == regendp[n])
829 break;
830 /* Inline the first character, for speed. */
95bac841 831 if (UCHARAT(s) != nextchar)
4633a7c4 832 sayNO;
a0d0e21e
LW
833 ln = regendp[n] - s;
834 if (locinput + ln > regeol)
4633a7c4 835 sayNO;
a026971d 836 if (ln > 1 && memcmp(s, locinput, ln) != 0)
4633a7c4 837 sayNO;
a0d0e21e 838 locinput += ln;
bbce6d69 839 nextchar = UCHARAT(locinput);
a0d0e21e
LW
840 break;
841
842 case NOTHING:
843 break;
844 case BACK:
845 break;
846 case OPEN:
847 n = ARG1(scan); /* which paren pair */
848 regstartp[n] = locinput;
849 if (n > regsize)
850 regsize = n;
851 break;
852 case CLOSE:
853 n = ARG1(scan); /* which paren pair */
854 regendp[n] = locinput;
855 if (n > *reglastparen)
856 *reglastparen = n;
857 break;
858 case CURLYX: {
859 CURCUR cc;
860 CHECKPOINT cp = savestack_ix;
861 cc.oldcc = regcc;
862 regcc = &cc;
863 cc.parenfloor = *reglastparen;
864 cc.cur = -1;
865 cc.min = ARG1(scan);
866 cc.max = ARG2(scan);
867 cc.scan = NEXTOPER(scan) + 4;
868 cc.next = next;
869 cc.minmod = minmod;
870 cc.lastloc = 0;
871 reginput = locinput;
872 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
873 regcpblow(cp);
874 regcc = cc.oldcc;
4633a7c4 875 saySAME(n);
a0d0e21e
LW
876 }
877 /* NOT REACHED */
878 case WHILEM: {
879 /*
880 * This is really hard to understand, because after we match
881 * what we're trying to match, we must make sure the rest of
882 * the RE is going to match for sure, and to do that we have
883 * to go back UP the parse tree by recursing ever deeper. And
884 * if it fails, we have to reset our parent's current state
885 * that we can try again after backing off.
886 */
887
888 CURCUR* cc = regcc;
4633a7c4 889 n = cc->cur + 1; /* how many we know we matched */
a0d0e21e
LW
890 reginput = locinput;
891
4633a7c4
LW
892#ifdef DEBUGGING
893 if (regnarrate)
760ac839 894 PerlIO_printf(Perl_debug_log, "%*s %d %lx\n", regindent*2, "",
4633a7c4
LW
895 n, (long)cc);
896#endif
897
a0d0e21e
LW
898 /* If degenerate scan matches "", assume scan done. */
899
900 if (locinput == cc->lastloc) {
901 regcc = cc->oldcc;
902 ln = regcc->cur;
903 if (regmatch(cc->next))
4633a7c4 904 sayYES;
a0d0e21e
LW
905 regcc->cur = ln;
906 regcc = cc;
4633a7c4 907 sayNO;
a0d0e21e
LW
908 }
909
910 /* First just match a string of min scans. */
911
912 if (n < cc->min) {
913 cc->cur = n;
914 cc->lastloc = locinput;
4633a7c4
LW
915 if (regmatch(cc->scan))
916 sayYES;
917 cc->cur = n - 1;
918 sayNO;
a0d0e21e
LW
919 }
920
921 /* Prefer next over scan for minimal matching. */
922
923 if (cc->minmod) {
924 regcc = cc->oldcc;
925 ln = regcc->cur;
926 if (regmatch(cc->next))
4633a7c4 927 sayYES; /* All done. */
a0d0e21e
LW
928 regcc->cur = ln;
929 regcc = cc;
930
931 if (n >= cc->max) /* Maximum greed exceeded? */
4633a7c4 932 sayNO;
a687059c 933
a0d0e21e
LW
934 /* Try scanning more and see if it helps. */
935 reginput = locinput;
936 cc->cur = n;
937 cc->lastloc = locinput;
4633a7c4
LW
938 if (regmatch(cc->scan))
939 sayYES;
940 cc->cur = n - 1;
941 sayNO;
a0d0e21e
LW
942 }
943
944 /* Prefer scan over next for maximal matching. */
945
946 if (n < cc->max) { /* More greed allowed? */
947 regcppush(cc->parenfloor);
948 cc->cur = n;
949 cc->lastloc = locinput;
950 if (regmatch(cc->scan))
4633a7c4 951 sayYES;
a0d0e21e
LW
952 regcppop(); /* Restore some previous $<digit>s? */
953 reginput = locinput;
954 }
955
956 /* Failed deeper matches of scan, so see if this one works. */
957 regcc = cc->oldcc;
958 ln = regcc->cur;
959 if (regmatch(cc->next))
4633a7c4 960 sayYES;
a0d0e21e
LW
961 regcc->cur = ln;
962 regcc = cc;
4633a7c4
LW
963 cc->cur = n - 1;
964 sayNO;
a0d0e21e
LW
965 }
966 /* NOT REACHED */
967 case BRANCH: {
968 if (OP(next) != BRANCH) /* No choice. */
969 next = NEXTOPER(scan);/* Avoid recursion. */
970 else {
748a9306 971 int lastparen = *reglastparen;
a0d0e21e
LW
972 do {
973 reginput = locinput;
974 if (regmatch(NEXTOPER(scan)))
4633a7c4 975 sayYES;
748a9306
LW
976 for (n = *reglastparen; n > lastparen; n--)
977 regendp[n] = 0;
978 *reglastparen = n;
979
a687059c 980#ifdef REGALIGN
a0d0e21e
LW
981 /*SUPPRESS 560*/
982 if (n = NEXT(scan))
983 scan += n;
984 else
985 scan = NULL;
a687059c 986#else
a0d0e21e 987 scan = regnext(scan);
79072805 988#endif
a0d0e21e 989 } while (scan != NULL && OP(scan) == BRANCH);
4633a7c4 990 sayNO;
a0d0e21e 991 /* NOTREACHED */
a687059c 992 }
a0d0e21e
LW
993 }
994 break;
995 case MINMOD:
996 minmod = 1;
997 break;
998 case CURLY:
999 ln = ARG1(scan); /* min to match */
1000 n = ARG2(scan); /* max to match */
1001 scan = NEXTOPER(scan) + 4;
1002 goto repeat;
1003 case STAR:
1004 ln = 0;
1005 n = 32767;
1006 scan = NEXTOPER(scan);
1007 goto repeat;
1008 case PLUS:
1009 /*
1010 * Lookahead to avoid useless match attempts
1011 * when we know what character comes next.
1012 */
1013 ln = 1;
1014 n = 32767;
1015 scan = NEXTOPER(scan);
1016 repeat:
bbce6d69 1017 if (regkind[(U8)OP(next)] == EXACT) {
1018 c1 = UCHARAT(OPERAND(next) + 1);
1019 if (OP(next) == EXACTF)
1020 c2 = fold[c1];
1021 else if (OP(next) == EXACTFL)
1022 c2 = fold_locale[c1];
1023 else
1024 c2 = c1;
1025 }
a0d0e21e 1026 else
bbce6d69 1027 c1 = c2 = -1000;
a0d0e21e
LW
1028 reginput = locinput;
1029 if (minmod) {
1030 minmod = 0;
1031 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 1032 sayNO;
8e07c86e 1033 while (n >= ln || (n == 32767 && ln > 0)) { /* ln overflow ? */
a0d0e21e 1034 /* If it could work, try it. */
bbce6d69 1035 if (c1 == -1000 ||
1036 UCHARAT(reginput) == c1 ||
1037 UCHARAT(reginput) == c2)
1038 {
a0d0e21e 1039 if (regmatch(next))
4633a7c4 1040 sayYES;
bbce6d69 1041 }
a0d0e21e 1042 /* Couldn't or didn't -- back up. */
748a9306 1043 reginput = locinput + ln;
a0d0e21e
LW
1044 if (regrepeat(scan, 1)) {
1045 ln++;
1046 reginput = locinput + ln;
1047 }
1048 else
4633a7c4 1049 sayNO;
a0d0e21e
LW
1050 }
1051 }
1052 else {
1053 n = regrepeat(scan, n);
1054 if (ln < n && regkind[(U8)OP(next)] == EOL &&
1055 (!multiline || OP(next) == SEOL))
1056 ln = n; /* why back off? */
1057 while (n >= ln) {
1058 /* If it could work, try it. */
bbce6d69 1059 if (c1 == -1000 ||
1060 UCHARAT(reginput) == c1 ||
1061 UCHARAT(reginput) == c2)
1062 {
a0d0e21e 1063 if (regmatch(next))
4633a7c4 1064 sayYES;
bbce6d69 1065 }
a0d0e21e
LW
1066 /* Couldn't or didn't -- back up. */
1067 n--;
1068 reginput = locinput + n;
1069 }
1070 }
4633a7c4 1071 sayNO;
a0d0e21e
LW
1072 case SUCCEED:
1073 case END:
1074 reginput = locinput; /* put where regtry can find it */
4633a7c4 1075 sayYES; /* Success! */
a0d0e21e
LW
1076 case IFMATCH:
1077 reginput = locinput;
1078 scan = NEXTOPER(scan);
1079 if (!regmatch(scan))
4633a7c4 1080 sayNO;
a0d0e21e
LW
1081 break;
1082 case UNLESSM:
1083 reginput = locinput;
1084 scan = NEXTOPER(scan);
1085 if (regmatch(scan))
4633a7c4 1086 sayNO;
a0d0e21e
LW
1087 break;
1088 default:
760ac839 1089 PerlIO_printf(PerlIO_stderr(), "%x %d\n",(unsigned)scan,scan[1]);
a0d0e21e 1090 FAIL("regexp memory corruption");
a687059c 1091 }
a0d0e21e
LW
1092 scan = next;
1093 }
a687059c 1094
a0d0e21e
LW
1095 /*
1096 * We get here only if there's trouble -- normally "case END" is
1097 * the terminating point.
1098 */
1099 FAIL("corrupted regexp pointers");
1100 /*NOTREACHED*/
4633a7c4
LW
1101 sayNO;
1102
1103yes:
1104#ifdef DEBUGGING
1105 regindent--;
1106#endif
1107 return 1;
1108
1109no:
1110#ifdef DEBUGGING
1111 regindent--;
1112#endif
a0d0e21e 1113 return 0;
a687059c
LW
1114}
1115
1116/*
1117 - regrepeat - repeatedly match something simple, report how many
1118 */
1119/*
1120 * [This routine now assumes that it will only match on things of length 1.
1121 * That was true before, but now we assume scan - reginput is the count,
1122 * rather than incrementing count on every character.]
1123 */
79072805 1124static I32
00bf170e 1125regrepeat(p, max)
a687059c 1126char *p;
79072805 1127I32 max;
a687059c 1128{
a0d0e21e
LW
1129 register char *scan;
1130 register char *opnd;
1131 register I32 c;
1132 register char *loceol = regeol;
1133
1134 scan = reginput;
1135 if (max != 32767 && max < loceol - scan)
1136 loceol = scan + max;
1137 opnd = OPERAND(p);
1138 switch (OP(p)) {
1139 case ANY:
1140 while (scan < loceol && *scan != '\n')
1141 scan++;
1142 break;
1143 case SANY:
1144 scan = loceol;
1145 break;
bbce6d69 1146 case EXACT: /* length of string is 1 */
1147 c = UCHARAT(++opnd);
1148 while (scan < loceol && UCHARAT(scan) == c)
1149 scan++;
1150 break;
1151 case EXACTF: /* length of string is 1 */
1152 c = UCHARAT(++opnd);
1153 while (scan < loceol &&
1154 (UCHARAT(scan) == c || UCHARAT(scan) == fold[c]))
1155 scan++;
1156 break;
1157 case EXACTFL: /* length of string is 1 */
1158 regtainted = TRUE;
1159 c = UCHARAT(++opnd);
1160 while (scan < loceol &&
1161 (UCHARAT(scan) == c || UCHARAT(scan) == fold_locale[c]))
a0d0e21e
LW
1162 scan++;
1163 break;
1164 case ANYOF:
bbce6d69 1165 while (scan < loceol && reginclass(opnd, *scan))
a0d0e21e 1166 scan++;
a0d0e21e
LW
1167 break;
1168 case ALNUM:
1169 while (scan < loceol && isALNUM(*scan))
1170 scan++;
1171 break;
bbce6d69 1172 case ALNUML:
1173 regtainted = TRUE;
1174 while (scan < loceol && isALNUM_LC(*scan))
1175 scan++;
1176 break;
a0d0e21e
LW
1177 case NALNUM:
1178 while (scan < loceol && !isALNUM(*scan))
1179 scan++;
1180 break;
bbce6d69 1181 case NALNUML:
1182 regtainted = TRUE;
1183 while (scan < loceol && !isALNUM_LC(*scan))
1184 scan++;
1185 break;
a0d0e21e
LW
1186 case SPACE:
1187 while (scan < loceol && isSPACE(*scan))
1188 scan++;
1189 break;
bbce6d69 1190 case SPACEL:
1191 regtainted = TRUE;
1192 while (scan < loceol && isSPACE_LC(*scan))
1193 scan++;
1194 break;
a0d0e21e
LW
1195 case NSPACE:
1196 while (scan < loceol && !isSPACE(*scan))
1197 scan++;
1198 break;
bbce6d69 1199 case NSPACEL:
1200 regtainted = TRUE;
1201 while (scan < loceol && !isSPACE_LC(*scan))
1202 scan++;
1203 break;
a0d0e21e
LW
1204 case DIGIT:
1205 while (scan < loceol && isDIGIT(*scan))
1206 scan++;
1207 break;
1208 case NDIGIT:
1209 while (scan < loceol && !isDIGIT(*scan))
1210 scan++;
1211 break;
1212 default: /* Called on something of 0 width. */
1213 break; /* So match right here or not at all. */
1214 }
a687059c 1215
a0d0e21e
LW
1216 c = scan - reginput;
1217 reginput = scan;
a687059c 1218
a0d0e21e 1219 return(c);
a687059c
LW
1220}
1221
1222/*
bbce6d69 1223 - regclass - determine if a character falls into a character class
1224 */
1225
1226static bool
1227reginclass(p, c)
1228register char *p;
1229register I32 c;
1230{
1231 char flags = *p;
1232 bool match = FALSE;
1233
1234 c &= 0xFF;
1235 if (p[1 + (c >> 3)] & (1 << (c & 7)))
1236 match = TRUE;
1237 else if (flags & ANYOF_FOLD) {
1238 I32 cf;
1239 if (flags & ANYOF_LOCALE) {
1240 regtainted = TRUE;
1241 cf = fold_locale[c];
1242 }
1243 else
1244 cf = fold[c];
1245 if (p[1 + (cf >> 3)] & (1 << (cf & 7)))
1246 match = TRUE;
1247 }
1248
1249 if (!match && (flags & ANYOF_ISA)) {
1250 regtainted = TRUE;
1251
1252 if (((flags & ANYOF_ALNUML) && isALNUM_LC(c)) ||
1253 ((flags & ANYOF_NALNUML) && !isALNUM_LC(c)) ||
1254 ((flags & ANYOF_SPACEL) && isSPACE_LC(c)) ||
1255 ((flags & ANYOF_NSPACEL) && !isSPACE_LC(c)))
1256 {
1257 match = TRUE;
1258 }
1259 }
1260
1261 return match ^ ((flags & ANYOF_INVERT) != 0);
1262}
1263
1264/*
a687059c
LW
1265 - regnext - dig the "next" pointer out of a node
1266 *
1267 * [Note, when REGALIGN is defined there are two places in regmatch()
1268 * that bypass this code for speed.]
1269 */
1270char *
1271regnext(p)
1272register char *p;
1273{
a0d0e21e 1274 register I32 offset;
a687059c 1275
a0d0e21e
LW
1276 if (p == &regdummy)
1277 return(NULL);
a687059c 1278
a0d0e21e
LW
1279 offset = NEXT(p);
1280 if (offset == 0)
1281 return(NULL);
a687059c
LW
1282
1283#ifdef REGALIGN
a0d0e21e 1284 return(p+offset);
a687059c 1285#else
a0d0e21e
LW
1286 if (OP(p) == BACK)
1287 return(p-offset);
1288 else
1289 return(p+offset);
a687059c
LW
1290#endif
1291}