This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
win32: additional default libraries
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
f0fcb552 22/*SUPPRESS 112*/
a687059c 23/*
e50aee73 24 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
25 *
26 * Copyright (c) 1986 by University of Toronto.
27 * Written by Henry Spencer. Not derived from licensed software.
28 *
29 * Permission is granted to anyone to use this software for any
30 * purpose on any computer system, and to redistribute it freely,
31 * subject to the following restrictions:
32 *
33 * 1. The author is not responsible for the consequences of use of
34 * this software, no matter how awful, even if they arise
35 * from defects in it.
36 *
37 * 2. The origin of this software must not be misrepresented, either
38 * by explicit claim or by omission.
39 *
40 * 3. Altered versions must be plainly marked as such, and must not
41 * be misrepresented as being the original software.
42 *
43 **** Alterations to Henry's code are...
44 ****
9607fc9c 45 **** Copyright (c) 1991-1997, Larry Wall
a687059c 46 ****
9ef589d8
LW
47 **** You may distribute under the terms of either the GNU General Public
48 **** License or the Artistic License, as specified in the README file.
a687059c
LW
49 *
50 * Beware that some of this code is subtly aware of the way operator
51 * precedence is structured in regular expressions. Serious changes in
52 * regular-expression syntax might require a total rethink.
53 */
54#include "EXTERN.h"
55#include "perl.h"
56#include "regcomp.h"
57
58#ifndef STATIC
59#define STATIC static
60#endif
61
62#ifdef DEBUGGING
a0d0e21e
LW
63static I32 regnarrate = 0;
64static char* regprogram = 0;
a687059c
LW
65#endif
66
a0d0e21e
LW
67/* Current curly descriptor */
68typedef struct curcur CURCUR;
69struct curcur {
70 int parenfloor; /* how far back to strip paren data */
71 int cur; /* how many instances of scan we've matched */
72 int min; /* the minimal number of scans to match */
73 int max; /* the maximal number of scans to match */
74 int minmod; /* whether to work our way up or down */
75 char * scan; /* the thing to match */
76 char * next; /* what has to match after it */
77 char * lastloc; /* where we started matching this scan */
78 CURCUR * oldcc; /* current curly before we started this one */
79};
80
81static CURCUR* regcc;
82
83typedef I32 CHECKPOINT;
84
55497cff 85static CHECKPOINT regcppush _((I32 parenfloor));
86static char * regcppop _((void));
a0d0e21e 87
55497cff 88static CHECKPOINT
a0d0e21e
LW
89regcppush(parenfloor)
90I32 parenfloor;
91{
92 int retval = savestack_ix;
93 int i = (regsize - parenfloor) * 3;
94 int p;
95
96 SSCHECK(i + 5);
97 for (p = regsize; p > parenfloor; p--) {
98 SSPUSHPTR(regendp[p]);
99 SSPUSHPTR(regstartp[p]);
100 SSPUSHINT(p);
101 }
102 SSPUSHINT(regsize);
103 SSPUSHINT(*reglastparen);
104 SSPUSHPTR(reginput);
105 SSPUSHINT(i + 3);
106 SSPUSHINT(SAVEt_REGCONTEXT);
107 return retval;
108}
109
55497cff 110static char *
a0d0e21e
LW
111regcppop()
112{
113 I32 i = SSPOPINT;
114 U32 paren = 0;
115 char *input;
116 char *tmps;
117 assert(i == SAVEt_REGCONTEXT);
118 i = SSPOPINT;
119 input = (char *) SSPOPPTR;
120 *reglastparen = SSPOPINT;
121 regsize = SSPOPINT;
122 for (i -= 3; i > 0; i -= 3) {
123 paren = (U32)SSPOPINT;
124 regstartp[paren] = (char *) SSPOPPTR;
125 tmps = (char*)SSPOPPTR;
126 if (paren <= *reglastparen)
127 regendp[paren] = tmps;
128 }
129 for (paren = *reglastparen + 1; paren <= regnpar; paren++) {
130 if (paren > regsize)
131 regstartp[paren] = Nullch;
132 regendp[paren] = Nullch;
133 }
134 return input;
135}
136
137#define regcpblow(cp) leave_scope(cp)
138
a687059c 139/*
e50aee73 140 * pregexec and friends
a687059c
LW
141 */
142
143/*
a687059c
LW
144 * Forwards.
145 */
a0d0e21e
LW
146
147static I32 regmatch _((char *prog));
148static I32 regrepeat _((char *p, I32 max));
149static I32 regtry _((regexp *prog, char *startpos));
bbce6d69 150static bool reginclass _((char *p, I32 c));
151
152static bool regtainted; /* tainted information used? */
a687059c
LW
153
154/*
e50aee73 155 - pregexec - match a regexp against a string
a687059c 156 */
79072805 157I32
e50aee73 158pregexec(prog, stringarg, strend, strbeg, minend, screamer, safebase)
a687059c
LW
159register regexp *prog;
160char *stringarg;
161register char *strend; /* pointer to null at end of string */
162char *strbeg; /* real beginning of string */
79072805
LW
163I32 minend; /* end of match must be at least minend after stringarg */
164SV *screamer;
165I32 safebase; /* no need to remember string in subbase */
a687059c 166{
a0d0e21e 167 register char *s;
a0d0e21e
LW
168 register char *c;
169 register char *startpos = stringarg;
170 register I32 tmp;
171 I32 minlen = 0; /* must match at least this many chars */
172 I32 dontbother = 0; /* how many characters not to try at end */
173 CURCUR cc;
a687059c 174
a0d0e21e 175 cc.cur = 0;
4633a7c4 176 cc.oldcc = 0;
a0d0e21e
LW
177 regcc = &cc;
178
179#ifdef DEBUGGING
180 regnarrate = debug & 512;
181 regprogram = prog->program;
182#endif
183
184 /* Be paranoid... */
185 if (prog == NULL || startpos == NULL) {
186 croak("NULL regexp parameter");
187 return 0;
188 }
189
190 if (startpos == strbeg) /* is ^ valid at stringarg? */
191 regprev = '\n';
192 else {
193 regprev = stringarg[-1];
194 if (!multiline && regprev == '\n')
195 regprev = '\0'; /* force ^ to NOT match */
196 }
bbce6d69 197
a0d0e21e 198 regprecomp = prog->precomp;
a0d0e21e
LW
199 /* Check validity of program. */
200 if (UCHARAT(prog->program) != MAGIC) {
201 FAIL("corrupted regexp program");
202 }
203
bbce6d69 204 regnpar = prog->nparens;
205 regtainted = FALSE;
a0d0e21e
LW
206
207 /* If there is a "must appear" string, look for it. */
208 s = startpos;
209 if (prog->regmust != Nullsv &&
774d564b 210 !(prog->reganch & ROPT_ANCH_GPOS) &&
211 (!(prog->reganch & ROPT_ANCH_BOL)
a0d0e21e
LW
212 || (multiline && prog->regback >= 0)) )
213 {
214 if (stringarg == strbeg && screamer) {
215 if (screamfirst[BmRARE(prog->regmust)] >= 0)
216 s = screaminstr(screamer,prog->regmust);
217 else
218 s = Nullch;
0a12ae7d 219 }
a0d0e21e
LW
220 else
221 s = fbm_instr((unsigned char*)s, (unsigned char*)strend,
222 prog->regmust);
223 if (!s) {
224 ++BmUSEFUL(prog->regmust); /* hooray */
225 goto phooey; /* not present */
a687059c 226 }
a0d0e21e
LW
227 else if (prog->regback >= 0) {
228 s -= prog->regback;
229 if (s < startpos)
230 s = startpos;
231 minlen = prog->regback + SvCUR(prog->regmust);
a687059c 232 }
a0d0e21e
LW
233 else if (!prog->naughty && --BmUSEFUL(prog->regmust) < 0) { /* boo */
234 SvREFCNT_dec(prog->regmust);
235 prog->regmust = Nullsv; /* disable regmust */
236 s = startpos;
237 }
238 else {
239 s = startpos;
240 minlen = SvCUR(prog->regmust);
a687059c 241 }
a0d0e21e 242 }
a687059c 243
a0d0e21e
LW
244 /* Mark beginning of line for ^ . */
245 regbol = startpos;
a687059c 246
a0d0e21e
LW
247 /* Mark end of line for $ (and such) */
248 regeol = strend;
a687059c 249
a0d0e21e
LW
250 /* see how far we have to get to not match where we matched before */
251 regtill = startpos+minend;
a687059c 252
a0d0e21e 253 /* Simplest case: anchored match need be tried only once. */
774d564b 254 /* [unless only anchor is BOL and multiline is set] */
a0d0e21e
LW
255 if (prog->reganch & ROPT_ANCH) {
256 if (regtry(prog, startpos))
257 goto got_it;
774d564b 258 else if (!(prog->reganch & ROPT_ANCH_GPOS) &&
259 (multiline || (prog->reganch & ROPT_IMPLICIT)))
260 {
a0d0e21e
LW
261 if (minlen)
262 dontbother = minlen - 1;
263 strend -= dontbother;
264 /* for multiline we only have to try after newlines */
265 if (s > startpos)
266 s--;
267 while (s < strend) {
268 if (*s++ == '\n') {
269 if (s < strend && regtry(prog, s))
270 goto got_it;
271 }
35c8bce7 272 }
35c8bce7 273 }
a0d0e21e
LW
274 goto phooey;
275 }
35c8bce7 276
a0d0e21e
LW
277 /* Messy cases: unanchored match. */
278 if (prog->regstart) {
279 if (prog->reganch & ROPT_SKIP) { /* we have /x+whatever/ */
280 /* it must be a one character string */
bbce6d69 281 char ch = SvPVX(prog->regstart)[0];
a0d0e21e 282 while (s < strend) {
bbce6d69 283 if (*s == ch) {
a0d0e21e 284 if (regtry(prog, s))
a687059c 285 goto got_it;
a0d0e21e 286 s++;
bbce6d69 287 while (s < strend && *s == ch)
a0d0e21e 288 s++;
a687059c 289 }
a0d0e21e
LW
290 s++;
291 }
a687059c 292 }
47109bfb 293 else if (SvTYPE(prog->regstart) == SVt_PVBM) {
a0d0e21e
LW
294 /* We know what string it must start with. */
295 while ((s = fbm_instr((unsigned char*)s,
296 (unsigned char*)strend, prog->regstart)) != NULL)
297 {
298 if (regtry(prog, s))
299 goto got_it;
300 s++;
301 }
302 }
47109bfb 303 else { /* Optimized fbm_instr: */
a0d0e21e
LW
304 c = SvPVX(prog->regstart);
305 while ((s = ninstr(s, strend, c, c + SvCUR(prog->regstart))) != NULL)
306 {
307 if (regtry(prog, s))
308 goto got_it;
309 s++;
310 }
311 }
312 goto phooey;
313 }
314 /*SUPPRESS 560*/
315 if (c = prog->regstclass) {
316 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
a687059c 317
a0d0e21e
LW
318 if (minlen)
319 dontbother = minlen - 1;
320 strend -= dontbother; /* don't bother with what can't match */
321 tmp = 1;
322 /* We know what class it must start with. */
323 switch (OP(c)) {
324 case ANYOF:
325 c = OPERAND(c);
326 while (s < strend) {
bbce6d69 327 if (reginclass(c, *s)) {
a0d0e21e
LW
328 if (tmp && regtry(prog, s))
329 goto got_it;
330 else
331 tmp = doevery;
a687059c 332 }
a0d0e21e
LW
333 else
334 tmp = 1;
335 s++;
336 }
337 break;
bbce6d69 338 case BOUNDL:
339 regtainted = TRUE;
340 /* FALL THROUGH */
a0d0e21e
LW
341 case BOUND:
342 if (minlen)
343 dontbother++,strend--;
bbce6d69 344 tmp = (s != startpos) ? UCHARAT(s - 1) : regprev;
95bac841 345 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
a0d0e21e 346 while (s < strend) {
95bac841 347 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
a0d0e21e
LW
348 tmp = !tmp;
349 if (regtry(prog, s))
350 goto got_it;
a687059c 351 }
a0d0e21e
LW
352 s++;
353 }
354 if ((minlen || tmp) && regtry(prog,s))
355 goto got_it;
356 break;
bbce6d69 357 case NBOUNDL:
358 regtainted = TRUE;
359 /* FALL THROUGH */
a0d0e21e
LW
360 case NBOUND:
361 if (minlen)
362 dontbother++,strend--;
bbce6d69 363 tmp = (s != startpos) ? UCHARAT(s - 1) : regprev;
95bac841 364 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
a0d0e21e 365 while (s < strend) {
95bac841 366 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
a0d0e21e
LW
367 tmp = !tmp;
368 else if (regtry(prog, s))
369 goto got_it;
370 s++;
371 }
372 if ((minlen || !tmp) && regtry(prog,s))
373 goto got_it;
374 break;
375 case ALNUM:
376 while (s < strend) {
bbce6d69 377 if (isALNUM(*s)) {
378 if (tmp && regtry(prog, s))
379 goto got_it;
380 else
381 tmp = doevery;
382 }
383 else
384 tmp = 1;
385 s++;
386 }
387 break;
388 case ALNUML:
389 regtainted = TRUE;
390 while (s < strend) {
391 if (isALNUM_LC(*s)) {
a0d0e21e
LW
392 if (tmp && regtry(prog, s))
393 goto got_it;
a687059c 394 else
a0d0e21e
LW
395 tmp = doevery;
396 }
397 else
398 tmp = 1;
399 s++;
400 }
401 break;
402 case NALNUM:
403 while (s < strend) {
bbce6d69 404 if (!isALNUM(*s)) {
405 if (tmp && regtry(prog, s))
406 goto got_it;
407 else
408 tmp = doevery;
409 }
410 else
411 tmp = 1;
412 s++;
413 }
414 break;
415 case NALNUML:
416 regtainted = TRUE;
417 while (s < strend) {
418 if (!isALNUM_LC(*s)) {
a0d0e21e
LW
419 if (tmp && regtry(prog, s))
420 goto got_it;
a687059c 421 else
a0d0e21e 422 tmp = doevery;
a687059c 423 }
a0d0e21e
LW
424 else
425 tmp = 1;
426 s++;
427 }
428 break;
429 case SPACE:
430 while (s < strend) {
431 if (isSPACE(*s)) {
432 if (tmp && regtry(prog, s))
433 goto got_it;
434 else
435 tmp = doevery;
2304df62 436 }
a0d0e21e
LW
437 else
438 tmp = 1;
439 s++;
440 }
441 break;
bbce6d69 442 case SPACEL:
443 regtainted = TRUE;
444 while (s < strend) {
445 if (isSPACE_LC(*s)) {
446 if (tmp && regtry(prog, s))
447 goto got_it;
448 else
449 tmp = doevery;
450 }
451 else
452 tmp = 1;
453 s++;
454 }
455 break;
a0d0e21e
LW
456 case NSPACE:
457 while (s < strend) {
458 if (!isSPACE(*s)) {
459 if (tmp && regtry(prog, s))
460 goto got_it;
461 else
462 tmp = doevery;
a687059c 463 }
a0d0e21e
LW
464 else
465 tmp = 1;
466 s++;
467 }
468 break;
bbce6d69 469 case NSPACEL:
470 regtainted = TRUE;
471 while (s < strend) {
472 if (!isSPACE_LC(*s)) {
473 if (tmp && regtry(prog, s))
474 goto got_it;
475 else
476 tmp = doevery;
477 }
478 else
479 tmp = 1;
480 s++;
481 }
482 break;
a0d0e21e
LW
483 case DIGIT:
484 while (s < strend) {
485 if (isDIGIT(*s)) {
486 if (tmp && regtry(prog, s))
487 goto got_it;
488 else
489 tmp = doevery;
2b69d0c2 490 }
a0d0e21e
LW
491 else
492 tmp = 1;
493 s++;
494 }
495 break;
496 case NDIGIT:
497 while (s < strend) {
498 if (!isDIGIT(*s)) {
499 if (tmp && regtry(prog, s))
500 goto got_it;
501 else
502 tmp = doevery;
a687059c 503 }
a0d0e21e
LW
504 else
505 tmp = 1;
506 s++;
507 }
508 break;
a687059c 509 }
a0d0e21e
LW
510 }
511 else {
512 if (minlen)
513 dontbother = minlen - 1;
514 strend -= dontbother;
515 /* We don't know much -- general case. */
516 do {
517 if (regtry(prog, s))
518 goto got_it;
519 } while (s++ < strend);
520 }
521
522 /* Failure. */
523 goto phooey;
a687059c 524
a0d0e21e 525got_it:
420218e7 526 strend += dontbother; /* uncheat */
a0d0e21e
LW
527 prog->subbeg = strbeg;
528 prog->subend = strend;
bbce6d69 529 prog->exec_tainted = regtainted;
5f05dabc 530
531 /* make sure $`, $&, $', and $digit will work later */
137443ea 532 if (strbeg != prog->subbase) {
533 if (safebase) {
534 if (prog->subbase) {
535 Safefree(prog->subbase);
536 prog->subbase = Nullch;
537 }
538 }
539 else {
540 I32 i = strend - startpos + (stringarg - strbeg);
541 s = savepvn(strbeg, i);
542 Safefree(prog->subbase);
543 prog->subbase = s;
544 prog->subbeg = prog->subbase;
545 prog->subend = prog->subbase + i;
546 s = prog->subbase + (stringarg - strbeg);
547 for (i = 0; i <= prog->nparens; i++) {
548 if (prog->endp[i]) {
549 prog->startp[i] = s + (prog->startp[i] - startpos);
550 prog->endp[i] = s + (prog->endp[i] - startpos);
551 }
a0d0e21e
LW
552 }
553 }
a0d0e21e
LW
554 }
555 return 1;
556
557phooey:
a0d0e21e 558 return 0;
a687059c
LW
559}
560
561/*
562 - regtry - try match at specific point
563 */
79072805 564static I32 /* 0 failure, 1 success */
a0d0e21e 565regtry(prog, startpos)
a687059c 566regexp *prog;
a0d0e21e 567char *startpos;
a687059c 568{
a0d0e21e
LW
569 register I32 i;
570 register char **sp;
571 register char **ep;
572
573 reginput = startpos;
574 regstartp = prog->startp;
575 regendp = prog->endp;
576 reglastparen = &prog->lastparen;
577 prog->lastparen = 0;
578 regsize = 0;
579
580 sp = prog->startp;
581 ep = prog->endp;
582 if (prog->nparens) {
583 for (i = prog->nparens; i >= 0; i--) {
584 *sp++ = NULL;
585 *ep++ = NULL;
a687059c 586 }
a0d0e21e
LW
587 }
588 if (regmatch(prog->program + 1) && reginput >= regtill) {
589 prog->startp[0] = startpos;
590 prog->endp[0] = reginput;
591 return 1;
592 }
593 else
594 return 0;
a687059c
LW
595}
596
597/*
598 - regmatch - main matching routine
599 *
600 * Conceptually the strategy is simple: check to see whether the current
601 * node matches, call self recursively to see whether the rest matches,
602 * and then act accordingly. In practice we make some effort to avoid
603 * recursion, in particular by going through "ordinary" nodes (that don't
604 * need to know whether the rest of the match failed) by a loop instead of
605 * by recursion.
606 */
607/* [lwall] I've hoisted the register declarations to the outer block in order to
608 * maybe save a little bit of pushing and popping on the stack. It also takes
609 * advantage of machines that use a register save mask on subroutine entry.
610 */
79072805 611static I32 /* 0 failure, 1 success */
a687059c
LW
612regmatch(prog)
613char *prog;
614{
a0d0e21e
LW
615 register char *scan; /* Current node. */
616 char *next; /* Next node. */
617 register I32 nextchar;
618 register I32 n; /* no or next */
619 register I32 ln; /* len or last */
620 register char *s; /* operand or save */
621 register char *locinput = reginput;
bbce6d69 622 register I32 c1, c2; /* case fold search */
a0d0e21e 623 int minmod = 0;
4633a7c4
LW
624#ifdef DEBUGGING
625 static int regindent = 0;
626 regindent++;
627#endif
a0d0e21e 628
bbce6d69 629 nextchar = UCHARAT(locinput);
a0d0e21e
LW
630 scan = prog;
631 while (scan != NULL) {
a687059c 632#ifdef DEBUGGING
4633a7c4
LW
633#define sayYES goto yes
634#define sayNO goto no
635#define saySAME(x) if (x) goto yes; else goto no
636 if (regnarrate) {
46fc3d4c 637 SV *prop = sv_newmortal();
638 regprop(prop, scan);
639 PerlIO_printf(Perl_debug_log, "%*s%2d%-8.8s\t<%.10s>\n",
640 regindent*2, "", scan - regprogram,
641 SvPVX(prop), locinput);
4633a7c4
LW
642 }
643#else
644#define sayYES return 1
645#define sayNO return 0
646#define saySAME(x) return x
a687059c
LW
647#endif
648
649#ifdef REGALIGN
a0d0e21e
LW
650 next = scan + NEXT(scan);
651 if (next == scan)
652 next = NULL;
a687059c 653#else
a0d0e21e 654 next = regnext(scan);
a687059c
LW
655#endif
656
a0d0e21e
LW
657 switch (OP(scan)) {
658 case BOL:
659 if (locinput == regbol
660 ? regprev == '\n'
661 : ((nextchar || locinput < regeol) && locinput[-1] == '\n') )
662 {
663 /* regtill = regbol; */
664 break;
665 }
4633a7c4 666 sayNO;
a0d0e21e
LW
667 case MBOL:
668 if (locinput == regbol
669 ? regprev == '\n'
670 : ((nextchar || locinput < regeol) && locinput[-1] == '\n') )
671 {
672 break;
673 }
4633a7c4 674 sayNO;
a0d0e21e
LW
675 case SBOL:
676 if (locinput == regbol && regprev == '\n')
677 break;
4633a7c4 678 sayNO;
774d564b 679 case GPOS:
a0d0e21e
LW
680 if (locinput == regbol)
681 break;
4633a7c4 682 sayNO;
a0d0e21e
LW
683 case EOL:
684 if (multiline)
685 goto meol;
686 else
687 goto seol;
688 case MEOL:
689 meol:
690 if ((nextchar || locinput < regeol) && nextchar != '\n')
4633a7c4 691 sayNO;
a0d0e21e
LW
692 break;
693 case SEOL:
694 seol:
695 if ((nextchar || locinput < regeol) && nextchar != '\n')
4633a7c4 696 sayNO;
a0d0e21e 697 if (regeol - locinput > 1)
4633a7c4 698 sayNO;
a0d0e21e
LW
699 break;
700 case SANY:
701 if (!nextchar && locinput >= regeol)
4633a7c4 702 sayNO;
bbce6d69 703 nextchar = UCHARAT(++locinput);
a0d0e21e
LW
704 break;
705 case ANY:
706 if (!nextchar && locinput >= regeol || nextchar == '\n')
4633a7c4 707 sayNO;
bbce6d69 708 nextchar = UCHARAT(++locinput);
a0d0e21e 709 break;
bbce6d69 710 case EXACT:
a0d0e21e
LW
711 s = OPERAND(scan);
712 ln = *s++;
713 /* Inline the first character, for speed. */
95bac841 714 if (UCHARAT(s) != nextchar)
4633a7c4 715 sayNO;
a0d0e21e 716 if (regeol - locinput < ln)
4633a7c4 717 sayNO;
36477c24 718 if (ln > 1 && memNE(s, locinput, ln))
4633a7c4 719 sayNO;
a0d0e21e 720 locinput += ln;
bbce6d69 721 nextchar = UCHARAT(locinput);
722 break;
723 case EXACTFL:
724 regtainted = TRUE;
725 /* FALL THROUGH */
726 case EXACTF:
727 s = OPERAND(scan);
728 ln = *s++;
729 /* Inline the first character, for speed. */
730 if (UCHARAT(s) != nextchar &&
731 UCHARAT(s) != ((OP(scan) == EXACTF)
732 ? fold : fold_locale)[nextchar])
733 sayNO;
734 if (regeol - locinput < ln)
735 sayNO;
5f05dabc 736 if (ln > 1 && (OP(scan) == EXACTF
737 ? ibcmp(s, locinput, ln)
738 : ibcmp_locale(s, locinput, ln)))
bbce6d69 739 sayNO;
740 locinput += ln;
741 nextchar = UCHARAT(locinput);
a0d0e21e
LW
742 break;
743 case ANYOF:
744 s = OPERAND(scan);
745 if (nextchar < 0)
746 nextchar = UCHARAT(locinput);
bbce6d69 747 if (!reginclass(s, nextchar))
4633a7c4 748 sayNO;
a0d0e21e 749 if (!nextchar && locinput >= regeol)
4633a7c4 750 sayNO;
bbce6d69 751 nextchar = UCHARAT(++locinput);
a0d0e21e 752 break;
bbce6d69 753 case ALNUML:
754 regtainted = TRUE;
755 /* FALL THROUGH */
a0d0e21e
LW
756 case ALNUM:
757 if (!nextchar)
4633a7c4 758 sayNO;
bbce6d69 759 if (!(OP(scan) == ALNUM
760 ? isALNUM(nextchar) : isALNUM_LC(nextchar)))
4633a7c4 761 sayNO;
bbce6d69 762 nextchar = UCHARAT(++locinput);
a0d0e21e 763 break;
bbce6d69 764 case NALNUML:
765 regtainted = TRUE;
766 /* FALL THROUGH */
a0d0e21e
LW
767 case NALNUM:
768 if (!nextchar && locinput >= regeol)
4633a7c4 769 sayNO;
bbce6d69 770 if (OP(scan) == NALNUM
771 ? isALNUM(nextchar) : isALNUM_LC(nextchar))
4633a7c4 772 sayNO;
bbce6d69 773 nextchar = UCHARAT(++locinput);
a0d0e21e 774 break;
bbce6d69 775 case BOUNDL:
776 case NBOUNDL:
777 regtainted = TRUE;
778 /* FALL THROUGH */
a0d0e21e 779 case BOUND:
bbce6d69 780 case NBOUND:
781 /* was last char in word? */
782 ln = (locinput != regbol) ? UCHARAT(locinput - 1) : regprev;
783 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
784 ln = isALNUM(ln);
785 n = isALNUM(nextchar);
786 }
787 else {
788 ln = isALNUM_LC(ln);
789 n = isALNUM_LC(nextchar);
790 }
95bac841 791 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
4633a7c4 792 sayNO;
a0d0e21e 793 break;
bbce6d69 794 case SPACEL:
795 regtainted = TRUE;
796 /* FALL THROUGH */
a0d0e21e
LW
797 case SPACE:
798 if (!nextchar && locinput >= regeol)
4633a7c4 799 sayNO;
bbce6d69 800 if (!(OP(scan) == SPACE
801 ? isSPACE(nextchar) : isSPACE_LC(nextchar)))
4633a7c4 802 sayNO;
bbce6d69 803 nextchar = UCHARAT(++locinput);
a0d0e21e 804 break;
bbce6d69 805 case NSPACEL:
806 regtainted = TRUE;
807 /* FALL THROUGH */
a0d0e21e
LW
808 case NSPACE:
809 if (!nextchar)
4633a7c4 810 sayNO;
bbce6d69 811 if (OP(scan) == SPACE
812 ? isSPACE(nextchar) : isSPACE_LC(nextchar))
4633a7c4 813 sayNO;
bbce6d69 814 nextchar = UCHARAT(++locinput);
a0d0e21e
LW
815 break;
816 case DIGIT:
817 if (!isDIGIT(nextchar))
4633a7c4 818 sayNO;
bbce6d69 819 nextchar = UCHARAT(++locinput);
a0d0e21e
LW
820 break;
821 case NDIGIT:
822 if (!nextchar && locinput >= regeol)
4633a7c4 823 sayNO;
a0d0e21e 824 if (isDIGIT(nextchar))
4633a7c4 825 sayNO;
bbce6d69 826 nextchar = UCHARAT(++locinput);
a0d0e21e 827 break;
c8756f30
AK
828 case REFFL:
829 regtainted = TRUE;
830 /* FALL THROUGH */
a0d0e21e 831 case REF:
c8756f30 832 case REFF:
a0d0e21e
LW
833 n = ARG1(scan); /* which paren pair */
834 s = regstartp[n];
835 if (!s)
4633a7c4 836 sayNO;
a0d0e21e 837 if (!regendp[n])
4633a7c4 838 sayNO;
a0d0e21e
LW
839 if (s == regendp[n])
840 break;
841 /* Inline the first character, for speed. */
c8756f30
AK
842 if (UCHARAT(s) != nextchar &&
843 (OP(scan) == REF ||
844 (UCHARAT(s) != ((OP(scan) == REFF
845 ? fold : fold_locale)[nextchar]))))
4633a7c4 846 sayNO;
a0d0e21e
LW
847 ln = regendp[n] - s;
848 if (locinput + ln > regeol)
4633a7c4 849 sayNO;
c8756f30
AK
850 if (ln > 1 && (OP(scan) == REF
851 ? memNE(s, locinput, ln)
852 : (OP(scan) == REFF
853 ? ibcmp(s, locinput, ln)
854 : ibcmp_locale(s, locinput, ln))))
4633a7c4 855 sayNO;
a0d0e21e 856 locinput += ln;
bbce6d69 857 nextchar = UCHARAT(locinput);
a0d0e21e
LW
858 break;
859
860 case NOTHING:
861 break;
862 case BACK:
863 break;
864 case OPEN:
865 n = ARG1(scan); /* which paren pair */
866 regstartp[n] = locinput;
867 if (n > regsize)
868 regsize = n;
869 break;
870 case CLOSE:
871 n = ARG1(scan); /* which paren pair */
872 regendp[n] = locinput;
873 if (n > *reglastparen)
874 *reglastparen = n;
875 break;
876 case CURLYX: {
877 CURCUR cc;
878 CHECKPOINT cp = savestack_ix;
879 cc.oldcc = regcc;
880 regcc = &cc;
881 cc.parenfloor = *reglastparen;
882 cc.cur = -1;
883 cc.min = ARG1(scan);
884 cc.max = ARG2(scan);
885 cc.scan = NEXTOPER(scan) + 4;
886 cc.next = next;
887 cc.minmod = minmod;
888 cc.lastloc = 0;
889 reginput = locinput;
890 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
891 regcpblow(cp);
892 regcc = cc.oldcc;
4633a7c4 893 saySAME(n);
a0d0e21e
LW
894 }
895 /* NOT REACHED */
896 case WHILEM: {
897 /*
898 * This is really hard to understand, because after we match
899 * what we're trying to match, we must make sure the rest of
900 * the RE is going to match for sure, and to do that we have
901 * to go back UP the parse tree by recursing ever deeper. And
902 * if it fails, we have to reset our parent's current state
903 * that we can try again after backing off.
904 */
905
5f05dabc 906 CHECKPOINT cp;
a0d0e21e 907 CURCUR* cc = regcc;
4633a7c4 908 n = cc->cur + 1; /* how many we know we matched */
a0d0e21e
LW
909 reginput = locinput;
910
4633a7c4
LW
911#ifdef DEBUGGING
912 if (regnarrate)
68dc0745 913 PerlIO_printf(Perl_debug_log, "%*s %ld %lx\n", regindent*2, "",
914 (long)n, (long)cc);
4633a7c4
LW
915#endif
916
a0d0e21e
LW
917 /* If degenerate scan matches "", assume scan done. */
918
579cf2c3 919 if (locinput == cc->lastloc && n >= cc->min) {
a0d0e21e
LW
920 regcc = cc->oldcc;
921 ln = regcc->cur;
922 if (regmatch(cc->next))
4633a7c4 923 sayYES;
a0d0e21e
LW
924 regcc->cur = ln;
925 regcc = cc;
4633a7c4 926 sayNO;
a0d0e21e
LW
927 }
928
929 /* First just match a string of min scans. */
930
931 if (n < cc->min) {
932 cc->cur = n;
933 cc->lastloc = locinput;
4633a7c4
LW
934 if (regmatch(cc->scan))
935 sayYES;
936 cc->cur = n - 1;
937 sayNO;
a0d0e21e
LW
938 }
939
940 /* Prefer next over scan for minimal matching. */
941
942 if (cc->minmod) {
943 regcc = cc->oldcc;
944 ln = regcc->cur;
5f05dabc 945 cp = regcppush(cc->parenfloor);
946 if (regmatch(cc->next)) {
947 regcpblow(cp);
4633a7c4 948 sayYES; /* All done. */
5f05dabc 949 }
950 regcppop();
a0d0e21e
LW
951 regcc->cur = ln;
952 regcc = cc;
953
954 if (n >= cc->max) /* Maximum greed exceeded? */
4633a7c4 955 sayNO;
a687059c 956
a0d0e21e
LW
957 /* Try scanning more and see if it helps. */
958 reginput = locinput;
959 cc->cur = n;
960 cc->lastloc = locinput;
5f05dabc 961 cp = regcppush(cc->parenfloor);
962 if (regmatch(cc->scan)) {
963 regcpblow(cp);
4633a7c4 964 sayYES;
5f05dabc 965 }
966 regcppop();
4633a7c4
LW
967 cc->cur = n - 1;
968 sayNO;
a0d0e21e
LW
969 }
970
971 /* Prefer scan over next for maximal matching. */
972
973 if (n < cc->max) { /* More greed allowed? */
5f05dabc 974 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
975 cc->cur = n;
976 cc->lastloc = locinput;
5f05dabc 977 if (regmatch(cc->scan)) {
978 regcpblow(cp);
4633a7c4 979 sayYES;
5f05dabc 980 }
a0d0e21e
LW
981 regcppop(); /* Restore some previous $<digit>s? */
982 reginput = locinput;
983 }
984
985 /* Failed deeper matches of scan, so see if this one works. */
986 regcc = cc->oldcc;
987 ln = regcc->cur;
988 if (regmatch(cc->next))
4633a7c4 989 sayYES;
a0d0e21e
LW
990 regcc->cur = ln;
991 regcc = cc;
4633a7c4
LW
992 cc->cur = n - 1;
993 sayNO;
a0d0e21e
LW
994 }
995 /* NOT REACHED */
996 case BRANCH: {
997 if (OP(next) != BRANCH) /* No choice. */
998 next = NEXTOPER(scan);/* Avoid recursion. */
999 else {
748a9306 1000 int lastparen = *reglastparen;
a0d0e21e
LW
1001 do {
1002 reginput = locinput;
1003 if (regmatch(NEXTOPER(scan)))
4633a7c4 1004 sayYES;
748a9306
LW
1005 for (n = *reglastparen; n > lastparen; n--)
1006 regendp[n] = 0;
1007 *reglastparen = n;
1008
a687059c 1009#ifdef REGALIGN
a0d0e21e
LW
1010 /*SUPPRESS 560*/
1011 if (n = NEXT(scan))
1012 scan += n;
1013 else
1014 scan = NULL;
a687059c 1015#else
a0d0e21e 1016 scan = regnext(scan);
79072805 1017#endif
a0d0e21e 1018 } while (scan != NULL && OP(scan) == BRANCH);
4633a7c4 1019 sayNO;
a0d0e21e 1020 /* NOTREACHED */
a687059c 1021 }
a0d0e21e
LW
1022 }
1023 break;
1024 case MINMOD:
1025 minmod = 1;
1026 break;
1027 case CURLY:
1028 ln = ARG1(scan); /* min to match */
1029 n = ARG2(scan); /* max to match */
1030 scan = NEXTOPER(scan) + 4;
1031 goto repeat;
1032 case STAR:
1033 ln = 0;
1034 n = 32767;
1035 scan = NEXTOPER(scan);
1036 goto repeat;
1037 case PLUS:
1038 /*
1039 * Lookahead to avoid useless match attempts
1040 * when we know what character comes next.
1041 */
1042 ln = 1;
1043 n = 32767;
1044 scan = NEXTOPER(scan);
1045 repeat:
bbce6d69 1046 if (regkind[(U8)OP(next)] == EXACT) {
1047 c1 = UCHARAT(OPERAND(next) + 1);
1048 if (OP(next) == EXACTF)
1049 c2 = fold[c1];
1050 else if (OP(next) == EXACTFL)
1051 c2 = fold_locale[c1];
1052 else
1053 c2 = c1;
1054 }
a0d0e21e 1055 else
bbce6d69 1056 c1 = c2 = -1000;
a0d0e21e
LW
1057 reginput = locinput;
1058 if (minmod) {
1059 minmod = 0;
1060 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 1061 sayNO;
8e07c86e 1062 while (n >= ln || (n == 32767 && ln > 0)) { /* ln overflow ? */
a0d0e21e 1063 /* If it could work, try it. */
bbce6d69 1064 if (c1 == -1000 ||
1065 UCHARAT(reginput) == c1 ||
1066 UCHARAT(reginput) == c2)
1067 {
a0d0e21e 1068 if (regmatch(next))
4633a7c4 1069 sayYES;
bbce6d69 1070 }
a0d0e21e 1071 /* Couldn't or didn't -- back up. */
748a9306 1072 reginput = locinput + ln;
a0d0e21e
LW
1073 if (regrepeat(scan, 1)) {
1074 ln++;
1075 reginput = locinput + ln;
1076 }
1077 else
4633a7c4 1078 sayNO;
a0d0e21e
LW
1079 }
1080 }
1081 else {
1082 n = regrepeat(scan, n);
1083 if (ln < n && regkind[(U8)OP(next)] == EOL &&
1084 (!multiline || OP(next) == SEOL))
1085 ln = n; /* why back off? */
1086 while (n >= ln) {
1087 /* If it could work, try it. */
bbce6d69 1088 if (c1 == -1000 ||
1089 UCHARAT(reginput) == c1 ||
1090 UCHARAT(reginput) == c2)
1091 {
a0d0e21e 1092 if (regmatch(next))
4633a7c4 1093 sayYES;
bbce6d69 1094 }
a0d0e21e
LW
1095 /* Couldn't or didn't -- back up. */
1096 n--;
1097 reginput = locinput + n;
1098 }
1099 }
4633a7c4 1100 sayNO;
a0d0e21e
LW
1101 case SUCCEED:
1102 case END:
1103 reginput = locinput; /* put where regtry can find it */
4633a7c4 1104 sayYES; /* Success! */
a0d0e21e
LW
1105 case IFMATCH:
1106 reginput = locinput;
1107 scan = NEXTOPER(scan);
1108 if (!regmatch(scan))
4633a7c4 1109 sayNO;
a0d0e21e
LW
1110 break;
1111 case UNLESSM:
1112 reginput = locinput;
1113 scan = NEXTOPER(scan);
1114 if (regmatch(scan))
4633a7c4 1115 sayNO;
a0d0e21e
LW
1116 break;
1117 default:
c030ccd9
CS
1118 PerlIO_printf(PerlIO_stderr(), "%lx %d\n",
1119 (unsigned long)scan, scan[1]);
a0d0e21e 1120 FAIL("regexp memory corruption");
a687059c 1121 }
a0d0e21e
LW
1122 scan = next;
1123 }
a687059c 1124
a0d0e21e
LW
1125 /*
1126 * We get here only if there's trouble -- normally "case END" is
1127 * the terminating point.
1128 */
1129 FAIL("corrupted regexp pointers");
1130 /*NOTREACHED*/
4633a7c4
LW
1131 sayNO;
1132
1133yes:
1134#ifdef DEBUGGING
1135 regindent--;
1136#endif
1137 return 1;
1138
1139no:
1140#ifdef DEBUGGING
1141 regindent--;
1142#endif
a0d0e21e 1143 return 0;
a687059c
LW
1144}
1145
1146/*
1147 - regrepeat - repeatedly match something simple, report how many
1148 */
1149/*
1150 * [This routine now assumes that it will only match on things of length 1.
1151 * That was true before, but now we assume scan - reginput is the count,
1152 * rather than incrementing count on every character.]
1153 */
79072805 1154static I32
00bf170e 1155regrepeat(p, max)
a687059c 1156char *p;
79072805 1157I32 max;
a687059c 1158{
a0d0e21e
LW
1159 register char *scan;
1160 register char *opnd;
1161 register I32 c;
1162 register char *loceol = regeol;
1163
1164 scan = reginput;
1165 if (max != 32767 && max < loceol - scan)
1166 loceol = scan + max;
1167 opnd = OPERAND(p);
1168 switch (OP(p)) {
1169 case ANY:
1170 while (scan < loceol && *scan != '\n')
1171 scan++;
1172 break;
1173 case SANY:
1174 scan = loceol;
1175 break;
bbce6d69 1176 case EXACT: /* length of string is 1 */
1177 c = UCHARAT(++opnd);
1178 while (scan < loceol && UCHARAT(scan) == c)
1179 scan++;
1180 break;
1181 case EXACTF: /* length of string is 1 */
1182 c = UCHARAT(++opnd);
1183 while (scan < loceol &&
1184 (UCHARAT(scan) == c || UCHARAT(scan) == fold[c]))
1185 scan++;
1186 break;
1187 case EXACTFL: /* length of string is 1 */
1188 regtainted = TRUE;
1189 c = UCHARAT(++opnd);
1190 while (scan < loceol &&
1191 (UCHARAT(scan) == c || UCHARAT(scan) == fold_locale[c]))
a0d0e21e
LW
1192 scan++;
1193 break;
1194 case ANYOF:
bbce6d69 1195 while (scan < loceol && reginclass(opnd, *scan))
a0d0e21e 1196 scan++;
a0d0e21e
LW
1197 break;
1198 case ALNUM:
1199 while (scan < loceol && isALNUM(*scan))
1200 scan++;
1201 break;
bbce6d69 1202 case ALNUML:
1203 regtainted = TRUE;
1204 while (scan < loceol && isALNUM_LC(*scan))
1205 scan++;
1206 break;
a0d0e21e
LW
1207 case NALNUM:
1208 while (scan < loceol && !isALNUM(*scan))
1209 scan++;
1210 break;
bbce6d69 1211 case NALNUML:
1212 regtainted = TRUE;
1213 while (scan < loceol && !isALNUM_LC(*scan))
1214 scan++;
1215 break;
a0d0e21e
LW
1216 case SPACE:
1217 while (scan < loceol && isSPACE(*scan))
1218 scan++;
1219 break;
bbce6d69 1220 case SPACEL:
1221 regtainted = TRUE;
1222 while (scan < loceol && isSPACE_LC(*scan))
1223 scan++;
1224 break;
a0d0e21e
LW
1225 case NSPACE:
1226 while (scan < loceol && !isSPACE(*scan))
1227 scan++;
1228 break;
bbce6d69 1229 case NSPACEL:
1230 regtainted = TRUE;
1231 while (scan < loceol && !isSPACE_LC(*scan))
1232 scan++;
1233 break;
a0d0e21e
LW
1234 case DIGIT:
1235 while (scan < loceol && isDIGIT(*scan))
1236 scan++;
1237 break;
1238 case NDIGIT:
1239 while (scan < loceol && !isDIGIT(*scan))
1240 scan++;
1241 break;
1242 default: /* Called on something of 0 width. */
1243 break; /* So match right here or not at all. */
1244 }
a687059c 1245
a0d0e21e
LW
1246 c = scan - reginput;
1247 reginput = scan;
a687059c 1248
a0d0e21e 1249 return(c);
a687059c
LW
1250}
1251
1252/*
bbce6d69 1253 - regclass - determine if a character falls into a character class
1254 */
1255
1256static bool
1257reginclass(p, c)
1258register char *p;
1259register I32 c;
1260{
1261 char flags = *p;
1262 bool match = FALSE;
1263
1264 c &= 0xFF;
1265 if (p[1 + (c >> 3)] & (1 << (c & 7)))
1266 match = TRUE;
1267 else if (flags & ANYOF_FOLD) {
1268 I32 cf;
1269 if (flags & ANYOF_LOCALE) {
1270 regtainted = TRUE;
1271 cf = fold_locale[c];
1272 }
1273 else
1274 cf = fold[c];
1275 if (p[1 + (cf >> 3)] & (1 << (cf & 7)))
1276 match = TRUE;
1277 }
1278
1279 if (!match && (flags & ANYOF_ISA)) {
1280 regtainted = TRUE;
1281
1282 if (((flags & ANYOF_ALNUML) && isALNUM_LC(c)) ||
1283 ((flags & ANYOF_NALNUML) && !isALNUM_LC(c)) ||
1284 ((flags & ANYOF_SPACEL) && isSPACE_LC(c)) ||
1285 ((flags & ANYOF_NSPACEL) && !isSPACE_LC(c)))
1286 {
1287 match = TRUE;
1288 }
1289 }
1290
1291 return match ^ ((flags & ANYOF_INVERT) != 0);
1292}
1293
1294/*
a687059c
LW
1295 - regnext - dig the "next" pointer out of a node
1296 *
1297 * [Note, when REGALIGN is defined there are two places in regmatch()
1298 * that bypass this code for speed.]
1299 */
1300char *
1301regnext(p)
1302register char *p;
1303{
a0d0e21e 1304 register I32 offset;
a687059c 1305
a0d0e21e
LW
1306 if (p == &regdummy)
1307 return(NULL);
a687059c 1308
a0d0e21e
LW
1309 offset = NEXT(p);
1310 if (offset == 0)
1311 return(NULL);
a687059c
LW
1312
1313#ifdef REGALIGN
a0d0e21e 1314 return(p+offset);
a687059c 1315#else
a0d0e21e
LW
1316 if (OP(p) == BACK)
1317 return(p-offset);
1318 else
1319 return(p+offset);
a687059c
LW
1320#endif
1321}