This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
The sv_catsv() fix, take two.
[perl5.git] / regexec.c
... / ...
CommitLineData
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
34/* We *really* need to overwrite these symbols: */
35# define Perl_regexec_flags my_regexec
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
38# define Perl_re_intuit_start my_re_intuit_start
39/* *These* symbols are masked to allow static link. */
40# define Perl_pregexec my_pregexec
41# define Perl_reginitcolors my_reginitcolors
42# define Perl_regclass_swash my_regclass_swash
43
44# define PERL_NO_GET_CONTEXT
45#endif
46
47/*SUPPRESS 112*/
48/*
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
70 **** Copyright (c) 1991-2000, Larry Wall
71 ****
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
74 *
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
78 */
79#include "EXTERN.h"
80#define PERL_IN_REGEXEC_C
81#include "perl.h"
82
83#ifdef PERL_IN_XSUB_RE
84# if defined(PERL_CAPI) || defined(PERL_OBJECT)
85# include "XSUB.h"
86# endif
87#endif
88
89#include "regcomp.h"
90
91#define RF_tainted 1 /* tainted information used? */
92#define RF_warned 2 /* warned about big count? */
93#define RF_evaled 4 /* Did an EVAL with setting? */
94#define RF_utf8 8 /* String contains multibyte chars? */
95
96#define UTF (PL_reg_flags & RF_utf8)
97
98#define RS_init 1 /* eval environment created */
99#define RS_set 2 /* replsv value is set */
100
101#ifndef STATIC
102#define STATIC static
103#endif
104
105/*
106 * Forwards.
107 */
108
109#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
110#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
111
112#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
113#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
114#define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
115#define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
116#define HOPc(pos,off) ((char*)HOP(pos,off))
117#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
118
119static void restore_pos(pTHXo_ void *arg);
120
121
122STATIC CHECKPOINT
123S_regcppush(pTHX_ I32 parenfloor)
124{
125 int retval = PL_savestack_ix;
126 int i = (PL_regsize - parenfloor) * 4;
127 int p;
128
129 SSCHECK(i + 5);
130 for (p = PL_regsize; p > parenfloor; p--) {
131 SSPUSHINT(PL_regendp[p]);
132 SSPUSHINT(PL_regstartp[p]);
133 SSPUSHPTR(PL_reg_start_tmp[p]);
134 SSPUSHINT(p);
135 }
136 SSPUSHINT(PL_regsize);
137 SSPUSHINT(*PL_reglastparen);
138 SSPUSHPTR(PL_reginput);
139 SSPUSHINT(i + 3);
140 SSPUSHINT(SAVEt_REGCONTEXT);
141 return retval;
142}
143
144/* These are needed since we do not localize EVAL nodes: */
145# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
146 " Setting an EVAL scope, savestack=%"IVdf"\n", \
147 (IV)PL_savestack_ix)); cp = PL_savestack_ix
148
149# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
150 PerlIO_printf(Perl_debug_log, \
151 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
152 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
153
154STATIC char *
155S_regcppop(pTHX)
156{
157 I32 i = SSPOPINT;
158 U32 paren = 0;
159 char *input;
160 I32 tmps;
161 assert(i == SAVEt_REGCONTEXT);
162 i = SSPOPINT;
163 input = (char *) SSPOPPTR;
164 *PL_reglastparen = SSPOPINT;
165 PL_regsize = SSPOPINT;
166 for (i -= 3; i > 0; i -= 4) {
167 paren = (U32)SSPOPINT;
168 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
169 PL_regstartp[paren] = SSPOPINT;
170 tmps = SSPOPINT;
171 if (paren <= *PL_reglastparen)
172 PL_regendp[paren] = tmps;
173 DEBUG_r(
174 PerlIO_printf(Perl_debug_log,
175 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
176 (UV)paren, (IV)PL_regstartp[paren],
177 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
178 (IV)PL_regendp[paren],
179 (paren > *PL_reglastparen ? "(no)" : ""));
180 );
181 }
182 DEBUG_r(
183 if (*PL_reglastparen + 1 <= PL_regnpar) {
184 PerlIO_printf(Perl_debug_log,
185 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
186 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
187 }
188 );
189#if 1
190 /* It would seem that the similar code in regtry()
191 * already takes care of this, and in fact it is in
192 * a better location to since this code can #if 0-ed out
193 * but the code in regtry() is needed or otherwise tests
194 * requiring null fields (pat.t#187 and split.t#{13,14}
195 * (as of patchlevel 7877) will fail. Then again,
196 * this code seems to be necessary or otherwise
197 * building DynaLoader will fail:
198 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
199 * --jhi */
200 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
201 if (paren > PL_regsize)
202 PL_regstartp[paren] = -1;
203 PL_regendp[paren] = -1;
204 }
205#endif
206 return input;
207}
208
209STATIC char *
210S_regcp_set_to(pTHX_ I32 ss)
211{
212 I32 tmp = PL_savestack_ix;
213
214 PL_savestack_ix = ss;
215 regcppop();
216 PL_savestack_ix = tmp;
217 return Nullch;
218}
219
220typedef struct re_cc_state
221{
222 I32 ss;
223 regnode *node;
224 struct re_cc_state *prev;
225 CURCUR *cc;
226 regexp *re;
227} re_cc_state;
228
229#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
230
231#define TRYPAREN(paren, n, input) { \
232 if (paren) { \
233 if (n) { \
234 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
235 PL_regendp[paren] = input - PL_bostr; \
236 } \
237 else \
238 PL_regendp[paren] = -1; \
239 } \
240 if (regmatch(next)) \
241 sayYES; \
242 if (paren && n) \
243 PL_regendp[paren] = -1; \
244}
245
246
247/*
248 * pregexec and friends
249 */
250
251/*
252 - pregexec - match a regexp against a string
253 */
254I32
255Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
256 char *strbeg, I32 minend, SV *screamer, U32 nosave)
257/* strend: pointer to null at end of string */
258/* strbeg: real beginning of string */
259/* minend: end of match must be >=minend after stringarg. */
260/* nosave: For optimizations. */
261{
262 return
263 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
264 nosave ? 0 : REXEC_COPY_STR);
265}
266
267STATIC void
268S_cache_re(pTHX_ regexp *prog)
269{
270 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
271#ifdef DEBUGGING
272 PL_regprogram = prog->program;
273#endif
274 PL_regnpar = prog->nparens;
275 PL_regdata = prog->data;
276 PL_reg_re = prog;
277}
278
279/*
280 * Need to implement the following flags for reg_anch:
281 *
282 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
283 * USE_INTUIT_ML
284 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
285 * INTUIT_AUTORITATIVE_ML
286 * INTUIT_ONCE_NOML - Intuit can match in one location only.
287 * INTUIT_ONCE_ML
288 *
289 * Another flag for this function: SECOND_TIME (so that float substrs
290 * with giant delta may be not rechecked).
291 */
292
293/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
294
295/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
296 Otherwise, only SvCUR(sv) is used to get strbeg. */
297
298/* XXXX We assume that strpos is strbeg unless sv. */
299
300/* XXXX Some places assume that there is a fixed substring.
301 An update may be needed if optimizer marks as "INTUITable"
302 RExen without fixed substrings. Similarly, it is assumed that
303 lengths of all the strings are no more than minlen, thus they
304 cannot come from lookahead.
305 (Or minlen should take into account lookahead.) */
306
307/* A failure to find a constant substring means that there is no need to make
308 an expensive call to REx engine, thus we celebrate a failure. Similarly,
309 finding a substring too deep into the string means that less calls to
310 regtry() should be needed.
311
312 REx compiler's optimizer found 4 possible hints:
313 a) Anchored substring;
314 b) Fixed substring;
315 c) Whether we are anchored (beginning-of-line or \G);
316 d) First node (of those at offset 0) which may distingush positions;
317 We use a)b)d) and multiline-part of c), and try to find a position in the
318 string which does not contradict any of them.
319 */
320
321/* Most of decisions we do here should have been done at compile time.
322 The nodes of the REx which we used for the search should have been
323 deleted from the finite automaton. */
324
325char *
326Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
327 char *strend, U32 flags, re_scream_pos_data *data)
328{
329 register I32 start_shift;
330 /* Should be nonnegative! */
331 register I32 end_shift;
332 register char *s;
333 register SV *check;
334 char *strbeg;
335 char *t;
336 I32 ml_anch;
337 char *tmp;
338 register char *other_last = Nullch; /* other substr checked before this */
339 char *check_at; /* check substr found at this pos */
340#ifdef DEBUGGING
341 char *i_strpos = strpos;
342#endif
343
344 DEBUG_r( if (!PL_colorset) reginitcolors() );
345 DEBUG_r(PerlIO_printf(Perl_debug_log,
346 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
347 PL_colors[4],PL_colors[5],PL_colors[0],
348 prog->precomp,
349 PL_colors[1],
350 (strlen(prog->precomp) > 60 ? "..." : ""),
351 PL_colors[0],
352 (int)(strend - strpos > 60 ? 60 : strend - strpos),
353 strpos, PL_colors[1],
354 (strend - strpos > 60 ? "..." : ""))
355 );
356
357 if (prog->minlen > strend - strpos) {
358 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
359 goto fail;
360 }
361 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
362 check = prog->check_substr;
363 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
364 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
365 || ( (prog->reganch & ROPT_ANCH_BOL)
366 && !PL_multiline ) ); /* Check after \n? */
367
368 if (!ml_anch) {
369 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
370 /* SvCUR is not set on references: SvRV and SvPVX overlap */
371 && sv && !SvROK(sv)
372 && (strpos != strbeg)) {
373 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
374 goto fail;
375 }
376 if (prog->check_offset_min == prog->check_offset_max) {
377 /* Substring at constant offset from beg-of-str... */
378 I32 slen;
379
380 PL_regeol = strend; /* Used in HOP() */
381 s = HOPc(strpos, prog->check_offset_min);
382 if (SvTAIL(check)) {
383 slen = SvCUR(check); /* >= 1 */
384
385 if ( strend - s > slen || strend - s < slen - 1
386 || (strend - s == slen && strend[-1] != '\n')) {
387 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
388 goto fail_finish;
389 }
390 /* Now should match s[0..slen-2] */
391 slen--;
392 if (slen && (*SvPVX(check) != *s
393 || (slen > 1
394 && memNE(SvPVX(check), s, slen)))) {
395 report_neq:
396 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
397 goto fail_finish;
398 }
399 }
400 else if (*SvPVX(check) != *s
401 || ((slen = SvCUR(check)) > 1
402 && memNE(SvPVX(check), s, slen)))
403 goto report_neq;
404 goto success_at_start;
405 }
406 }
407 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
408 s = strpos;
409 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
410 end_shift = prog->minlen - start_shift -
411 CHR_SVLEN(check) + (SvTAIL(check) != 0);
412 if (!ml_anch) {
413 I32 end = prog->check_offset_max + CHR_SVLEN(check)
414 - (SvTAIL(check) != 0);
415 I32 eshift = strend - s - end;
416
417 if (end_shift < eshift)
418 end_shift = eshift;
419 }
420 }
421 else { /* Can match at random position */
422 ml_anch = 0;
423 s = strpos;
424 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
425 /* Should be nonnegative! */
426 end_shift = prog->minlen - start_shift -
427 CHR_SVLEN(check) + (SvTAIL(check) != 0);
428 }
429
430#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
431 if (end_shift < 0)
432 Perl_croak(aTHX_ "panic: end_shift");
433#endif
434
435 restart:
436 /* Find a possible match in the region s..strend by looking for
437 the "check" substring in the region corrected by start/end_shift. */
438 if (flags & REXEC_SCREAM) {
439 I32 p = -1; /* Internal iterator of scream. */
440 I32 *pp = data ? data->scream_pos : &p;
441
442 if (PL_screamfirst[BmRARE(check)] >= 0
443 || ( BmRARE(check) == '\n'
444 && (BmPREVIOUS(check) == SvCUR(check) - 1)
445 && SvTAIL(check) ))
446 s = screaminstr(sv, check,
447 start_shift + (s - strbeg), end_shift, pp, 0);
448 else
449 goto fail_finish;
450 if (data)
451 *data->scream_olds = s;
452 }
453 else
454 s = fbm_instr((unsigned char*)s + start_shift,
455 (unsigned char*)strend - end_shift,
456 check, PL_multiline ? FBMrf_MULTILINE : 0);
457
458 /* Update the count-of-usability, remove useless subpatterns,
459 unshift s. */
460
461 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
462 (s ? "Found" : "Did not find"),
463 ((check == prog->anchored_substr) ? "anchored" : "floating"),
464 PL_colors[0],
465 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
466 SvPVX(check),
467 PL_colors[1], (SvTAIL(check) ? "$" : ""),
468 (s ? " at offset " : "...\n") ) );
469
470 if (!s)
471 goto fail_finish;
472
473 check_at = s;
474
475 /* Finish the diagnostic message */
476 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
477
478 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
479 Start with the other substr.
480 XXXX no SCREAM optimization yet - and a very coarse implementation
481 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
482 *always* match. Probably should be marked during compile...
483 Probably it is right to do no SCREAM here...
484 */
485
486 if (prog->float_substr && prog->anchored_substr) {
487 /* Take into account the "other" substring. */
488 /* XXXX May be hopelessly wrong for UTF... */
489 if (!other_last)
490 other_last = strpos;
491 if (check == prog->float_substr) {
492 do_other_anchored:
493 {
494 char *last = s - start_shift, *last1, *last2;
495 char *s1 = s;
496
497 tmp = PL_bostr;
498 t = s - prog->check_offset_max;
499 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
500 && (!(prog->reganch & ROPT_UTF8)
501 || (PL_bostr = strpos, /* Used in regcopmaybe() */
502 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
503 && t > strpos)))
504 /* EMPTY */;
505 else
506 t = strpos;
507 t += prog->anchored_offset;
508 if (t < other_last) /* These positions already checked */
509 t = other_last;
510 PL_bostr = tmp;
511 last2 = last1 = strend - prog->minlen;
512 if (last < last1)
513 last1 = last;
514 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
515 /* On end-of-str: see comment below. */
516 s = fbm_instr((unsigned char*)t,
517 (unsigned char*)last1 + prog->anchored_offset
518 + SvCUR(prog->anchored_substr)
519 - (SvTAIL(prog->anchored_substr)!=0),
520 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
521 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
522 (s ? "Found" : "Contradicts"),
523 PL_colors[0],
524 (int)(SvCUR(prog->anchored_substr)
525 - (SvTAIL(prog->anchored_substr)!=0)),
526 SvPVX(prog->anchored_substr),
527 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
528 if (!s) {
529 if (last1 >= last2) {
530 DEBUG_r(PerlIO_printf(Perl_debug_log,
531 ", giving up...\n"));
532 goto fail_finish;
533 }
534 DEBUG_r(PerlIO_printf(Perl_debug_log,
535 ", trying floating at offset %ld...\n",
536 (long)(s1 + 1 - i_strpos)));
537 PL_regeol = strend; /* Used in HOP() */
538 other_last = last1 + prog->anchored_offset + 1;
539 s = HOPc(last, 1);
540 goto restart;
541 }
542 else {
543 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
544 (long)(s - i_strpos)));
545 t = s - prog->anchored_offset;
546 other_last = s + 1;
547 s = s1;
548 if (t == strpos)
549 goto try_at_start;
550 goto try_at_offset;
551 }
552 }
553 }
554 else { /* Take into account the floating substring. */
555 char *last, *last1;
556 char *s1 = s;
557
558 t = s - start_shift;
559 last1 = last = strend - prog->minlen + prog->float_min_offset;
560 if (last - t > prog->float_max_offset)
561 last = t + prog->float_max_offset;
562 s = t + prog->float_min_offset;
563 if (s < other_last)
564 s = other_last;
565 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
566 /* fbm_instr() takes into account exact value of end-of-str
567 if the check is SvTAIL(ed). Since false positives are OK,
568 and end-of-str is not later than strend we are OK. */
569 s = fbm_instr((unsigned char*)s,
570 (unsigned char*)last + SvCUR(prog->float_substr)
571 - (SvTAIL(prog->float_substr)!=0),
572 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
573 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
574 (s ? "Found" : "Contradicts"),
575 PL_colors[0],
576 (int)(SvCUR(prog->float_substr)
577 - (SvTAIL(prog->float_substr)!=0)),
578 SvPVX(prog->float_substr),
579 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
580 if (!s) {
581 if (last1 == last) {
582 DEBUG_r(PerlIO_printf(Perl_debug_log,
583 ", giving up...\n"));
584 goto fail_finish;
585 }
586 DEBUG_r(PerlIO_printf(Perl_debug_log,
587 ", trying anchored starting at offset %ld...\n",
588 (long)(s1 + 1 - i_strpos)));
589 other_last = last + 1;
590 PL_regeol = strend; /* Used in HOP() */
591 s = HOPc(t, 1);
592 goto restart;
593 }
594 else {
595 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
596 (long)(s - i_strpos)));
597 other_last = s + 1;
598 s = s1;
599 if (t == strpos)
600 goto try_at_start;
601 goto try_at_offset;
602 }
603 }
604 }
605
606 t = s - prog->check_offset_max;
607 tmp = PL_bostr;
608 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
609 && (!(prog->reganch & ROPT_UTF8)
610 || (PL_bostr = strpos, /* Used in regcopmaybe() */
611 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
612 && t > strpos)))) {
613 PL_bostr = tmp;
614 /* Fixed substring is found far enough so that the match
615 cannot start at strpos. */
616 try_at_offset:
617 if (ml_anch && t[-1] != '\n') {
618 /* Eventually fbm_*() should handle this, but often
619 anchored_offset is not 0, so this check will not be wasted. */
620 /* XXXX In the code below we prefer to look for "^" even in
621 presence of anchored substrings. And we search even
622 beyond the found float position. These pessimizations
623 are historical artefacts only. */
624 find_anchor:
625 while (t < strend - prog->minlen) {
626 if (*t == '\n') {
627 if (t < check_at - prog->check_offset_min) {
628 if (prog->anchored_substr) {
629 /* Since we moved from the found position,
630 we definitely contradict the found anchored
631 substr. Due to the above check we do not
632 contradict "check" substr.
633 Thus we can arrive here only if check substr
634 is float. Redo checking for "other"=="fixed".
635 */
636 strpos = t + 1;
637 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
638 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
639 goto do_other_anchored;
640 }
641 /* We don't contradict the found floating substring. */
642 /* XXXX Why not check for STCLASS? */
643 s = t + 1;
644 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
645 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
646 goto set_useful;
647 }
648 /* Position contradicts check-string */
649 /* XXXX probably better to look for check-string
650 than for "\n", so one should lower the limit for t? */
651 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
652 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
653 other_last = strpos = s = t + 1;
654 goto restart;
655 }
656 t++;
657 }
658 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
659 PL_colors[0],PL_colors[1]));
660 goto fail_finish;
661 }
662 else {
663 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
664 PL_colors[0],PL_colors[1]));
665 }
666 s = t;
667 set_useful:
668 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
669 }
670 else {
671 PL_bostr = tmp;
672 /* The found string does not prohibit matching at strpos,
673 - no optimization of calling REx engine can be performed,
674 unless it was an MBOL and we are not after MBOL,
675 or a future STCLASS check will fail this. */
676 try_at_start:
677 /* Even in this situation we may use MBOL flag if strpos is offset
678 wrt the start of the string. */
679 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
680 && (strpos != strbeg) && strpos[-1] != '\n'
681 /* May be due to an implicit anchor of m{.*foo} */
682 && !(prog->reganch & ROPT_IMPLICIT))
683 {
684 t = strpos;
685 goto find_anchor;
686 }
687 DEBUG_r( if (ml_anch)
688 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
689 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
690 );
691 success_at_start:
692 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
693 && prog->check_substr /* Could be deleted already */
694 && --BmUSEFUL(prog->check_substr) < 0
695 && prog->check_substr == prog->float_substr)
696 {
697 /* If flags & SOMETHING - do not do it many times on the same match */
698 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
699 SvREFCNT_dec(prog->check_substr);
700 prog->check_substr = Nullsv; /* disable */
701 prog->float_substr = Nullsv; /* clear */
702 check = Nullsv; /* abort */
703 s = strpos;
704 /* XXXX This is a remnant of the old implementation. It
705 looks wasteful, since now INTUIT can use many
706 other heuristics. */
707 prog->reganch &= ~RE_USE_INTUIT;
708 }
709 else
710 s = strpos;
711 }
712
713 /* Last resort... */
714 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
715 if (prog->regstclass) {
716 /* minlen == 0 is possible if regstclass is \b or \B,
717 and the fixed substr is ''$.
718 Since minlen is already taken into account, s+1 is before strend;
719 accidentally, minlen >= 1 guaranties no false positives at s + 1
720 even for \b or \B. But (minlen? 1 : 0) below assumes that
721 regstclass does not come from lookahead... */
722 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
723 This leaves EXACTF only, which is dealt with in find_byclass(). */
724 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
725 ? STR_LEN(prog->regstclass)
726 : 1);
727 char *endpos = (prog->anchored_substr || ml_anch)
728 ? s + (prog->minlen? cl_l : 0)
729 : (prog->float_substr ? check_at - start_shift + cl_l
730 : strend) ;
731 char *startpos = strbeg;
732
733 t = s;
734 if (prog->reganch & ROPT_UTF8) {
735 PL_regdata = prog->data;
736 PL_bostr = startpos;
737 }
738 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
739 if (!s) {
740#ifdef DEBUGGING
741 char *what;
742#endif
743 if (endpos == strend) {
744 DEBUG_r( PerlIO_printf(Perl_debug_log,
745 "Could not match STCLASS...\n") );
746 goto fail;
747 }
748 DEBUG_r( PerlIO_printf(Perl_debug_log,
749 "This position contradicts STCLASS...\n") );
750 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
751 goto fail;
752 /* Contradict one of substrings */
753 if (prog->anchored_substr) {
754 if (prog->anchored_substr == check) {
755 DEBUG_r( what = "anchored" );
756 hop_and_restart:
757 PL_regeol = strend; /* Used in HOP() */
758 s = HOPc(t, 1);
759 if (s + start_shift + end_shift > strend) {
760 /* XXXX Should be taken into account earlier? */
761 DEBUG_r( PerlIO_printf(Perl_debug_log,
762 "Could not match STCLASS...\n") );
763 goto fail;
764 }
765 if (!check)
766 goto giveup;
767 DEBUG_r( PerlIO_printf(Perl_debug_log,
768 "Looking for %s substr starting at offset %ld...\n",
769 what, (long)(s + start_shift - i_strpos)) );
770 goto restart;
771 }
772 /* Have both, check_string is floating */
773 if (t + start_shift >= check_at) /* Contradicts floating=check */
774 goto retry_floating_check;
775 /* Recheck anchored substring, but not floating... */
776 s = check_at;
777 if (!check)
778 goto giveup;
779 DEBUG_r( PerlIO_printf(Perl_debug_log,
780 "Looking for anchored substr starting at offset %ld...\n",
781 (long)(other_last - i_strpos)) );
782 goto do_other_anchored;
783 }
784 /* Another way we could have checked stclass at the
785 current position only: */
786 if (ml_anch) {
787 s = t = t + 1;
788 if (!check)
789 goto giveup;
790 DEBUG_r( PerlIO_printf(Perl_debug_log,
791 "Looking for /%s^%s/m starting at offset %ld...\n",
792 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
793 goto try_at_offset;
794 }
795 if (!prog->float_substr) /* Could have been deleted */
796 goto fail;
797 /* Check is floating subtring. */
798 retry_floating_check:
799 t = check_at - start_shift;
800 DEBUG_r( what = "floating" );
801 goto hop_and_restart;
802 }
803 DEBUG_r( if (t != s)
804 PerlIO_printf(Perl_debug_log,
805 "By STCLASS: moving %ld --> %ld\n",
806 (long)(t - i_strpos), (long)(s - i_strpos));
807 else
808 PerlIO_printf(Perl_debug_log,
809 "Does not contradict STCLASS...\n") );
810 }
811 giveup:
812 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
813 PL_colors[4], (check ? "Guessed" : "Giving up"),
814 PL_colors[5], (long)(s - i_strpos)) );
815 return s;
816
817 fail_finish: /* Substring not found */
818 if (prog->check_substr) /* could be removed already */
819 BmUSEFUL(prog->check_substr) += 5; /* hooray */
820 fail:
821 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
822 PL_colors[4],PL_colors[5]));
823 return Nullch;
824}
825
826/* We know what class REx starts with. Try to find this position... */
827STATIC char *
828S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
829{
830 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
831 char *m;
832 STRLEN ln;
833 unsigned int c1;
834 unsigned int c2;
835 char *e;
836 register I32 tmp = 1; /* Scratch variable? */
837 register bool do_utf8 = DO_UTF8(PL_reg_sv);
838
839 /* We know what class it must start with. */
840 switch (OP(c)) {
841 case ANYOF:
842 while (s < strend) {
843 if (reginclass(c, (U8*)s, do_utf8)) {
844 if (tmp && (norun || regtry(prog, s)))
845 goto got_it;
846 else
847 tmp = doevery;
848 }
849 else
850 tmp = 1;
851 s += do_utf8 ? UTF8SKIP(s) : 1;
852 }
853 break;
854 case EXACTF:
855 m = STRING(c);
856 ln = STR_LEN(c);
857 c1 = *(U8*)m;
858 c2 = PL_fold[c1];
859 goto do_exactf;
860 case EXACTFL:
861 m = STRING(c);
862 ln = STR_LEN(c);
863 c1 = *(U8*)m;
864 c2 = PL_fold_locale[c1];
865 do_exactf:
866 e = strend - ln;
867
868 if (norun && e < s)
869 e = s; /* Due to minlen logic of intuit() */
870 /* Here it is NOT UTF! */
871 if (c1 == c2) {
872 while (s <= e) {
873 if ( *(U8*)s == c1
874 && (ln == 1 || !(OP(c) == EXACTF
875 ? ibcmp(s, m, ln)
876 : ibcmp_locale(s, m, ln)))
877 && (norun || regtry(prog, s)) )
878 goto got_it;
879 s++;
880 }
881 } else {
882 while (s <= e) {
883 if ( (*(U8*)s == c1 || *(U8*)s == c2)
884 && (ln == 1 || !(OP(c) == EXACTF
885 ? ibcmp(s, m, ln)
886 : ibcmp_locale(s, m, ln)))
887 && (norun || regtry(prog, s)) )
888 goto got_it;
889 s++;
890 }
891 }
892 break;
893 case BOUNDL:
894 PL_reg_flags |= RF_tainted;
895 /* FALL THROUGH */
896 case BOUND:
897 if (do_utf8) {
898 if (s == startpos)
899 tmp = '\n';
900 else {
901 U8 *r = reghop((U8*)s, -1);
902
903 tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
904 }
905 tmp = ((OP(c) == BOUND ?
906 isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
907 while (s < strend) {
908 if (tmp == !(OP(c) == BOUND ?
909 swash_fetch(PL_utf8_alnum, (U8*)s) :
910 isALNUM_LC_utf8((U8*)s)))
911 {
912 tmp = !tmp;
913 if ((norun || regtry(prog, s)))
914 goto got_it;
915 }
916 s += UTF8SKIP(s);
917 }
918 }
919 else {
920 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
921 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
922 while (s < strend) {
923 if (tmp ==
924 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
925 tmp = !tmp;
926 if ((norun || regtry(prog, s)))
927 goto got_it;
928 }
929 s++;
930 }
931 }
932 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
933 goto got_it;
934 break;
935 case NBOUNDL:
936 PL_reg_flags |= RF_tainted;
937 /* FALL THROUGH */
938 case NBOUND:
939 if (do_utf8) {
940 if (s == startpos)
941 tmp = '\n';
942 else {
943 U8 *r = reghop((U8*)s, -1);
944
945 tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
946 }
947 tmp = ((OP(c) == NBOUND ?
948 isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
949 while (s < strend) {
950 if (tmp == !(OP(c) == NBOUND ?
951 swash_fetch(PL_utf8_alnum, (U8*)s) :
952 isALNUM_LC_utf8((U8*)s)))
953 tmp = !tmp;
954 else if ((norun || regtry(prog, s)))
955 goto got_it;
956 s += UTF8SKIP(s);
957 }
958 }
959 else {
960 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
961 tmp = ((OP(c) == NBOUND ?
962 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
963 while (s < strend) {
964 if (tmp ==
965 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
966 tmp = !tmp;
967 else if ((norun || regtry(prog, s)))
968 goto got_it;
969 s++;
970 }
971 }
972 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
973 goto got_it;
974 break;
975 case ALNUM:
976 if (do_utf8) {
977 while (s < strend) {
978 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
979 if (tmp && (norun || regtry(prog, s)))
980 goto got_it;
981 else
982 tmp = doevery;
983 }
984 else
985 tmp = 1;
986 s += UTF8SKIP(s);
987 }
988 }
989 else {
990 while (s < strend) {
991 if (isALNUM(*s)) {
992 if (tmp && (norun || regtry(prog, s)))
993 goto got_it;
994 else
995 tmp = doevery;
996 }
997 else
998 tmp = 1;
999 s++;
1000 }
1001 }
1002 break;
1003 case ALNUML:
1004 PL_reg_flags |= RF_tainted;
1005 if (do_utf8) {
1006 while (s < strend) {
1007 if (isALNUM_LC_utf8((U8*)s)) {
1008 if (tmp && (norun || regtry(prog, s)))
1009 goto got_it;
1010 else
1011 tmp = doevery;
1012 }
1013 else
1014 tmp = 1;
1015 s += UTF8SKIP(s);
1016 }
1017 }
1018 else {
1019 while (s < strend) {
1020 if (isALNUM_LC(*s)) {
1021 if (tmp && (norun || regtry(prog, s)))
1022 goto got_it;
1023 else
1024 tmp = doevery;
1025 }
1026 else
1027 tmp = 1;
1028 s++;
1029 }
1030 }
1031 break;
1032 case NALNUM:
1033 if (do_utf8) {
1034 while (s < strend) {
1035 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1036 if (tmp && (norun || regtry(prog, s)))
1037 goto got_it;
1038 else
1039 tmp = doevery;
1040 }
1041 else
1042 tmp = 1;
1043 s += UTF8SKIP(s);
1044 }
1045 }
1046 else {
1047 while (s < strend) {
1048 if (!isALNUM(*s)) {
1049 if (tmp && (norun || regtry(prog, s)))
1050 goto got_it;
1051 else
1052 tmp = doevery;
1053 }
1054 else
1055 tmp = 1;
1056 s++;
1057 }
1058 }
1059 break;
1060 case NALNUML:
1061 PL_reg_flags |= RF_tainted;
1062 if (do_utf8) {
1063 while (s < strend) {
1064 if (!isALNUM_LC_utf8((U8*)s)) {
1065 if (tmp && (norun || regtry(prog, s)))
1066 goto got_it;
1067 else
1068 tmp = doevery;
1069 }
1070 else
1071 tmp = 1;
1072 s += UTF8SKIP(s);
1073 }
1074 }
1075 else {
1076 while (s < strend) {
1077 if (!isALNUM_LC(*s)) {
1078 if (tmp && (norun || regtry(prog, s)))
1079 goto got_it;
1080 else
1081 tmp = doevery;
1082 }
1083 else
1084 tmp = 1;
1085 s++;
1086 }
1087 }
1088 break;
1089 case SPACE:
1090 if (do_utf8) {
1091 while (s < strend) {
1092 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1093 if (tmp && (norun || regtry(prog, s)))
1094 goto got_it;
1095 else
1096 tmp = doevery;
1097 }
1098 else
1099 tmp = 1;
1100 s += UTF8SKIP(s);
1101 }
1102 }
1103 else {
1104 while (s < strend) {
1105 if (isSPACE(*s)) {
1106 if (tmp && (norun || regtry(prog, s)))
1107 goto got_it;
1108 else
1109 tmp = doevery;
1110 }
1111 else
1112 tmp = 1;
1113 s++;
1114 }
1115 }
1116 break;
1117 case SPACEL:
1118 PL_reg_flags |= RF_tainted;
1119 if (do_utf8) {
1120 while (s < strend) {
1121 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1122 if (tmp && (norun || regtry(prog, s)))
1123 goto got_it;
1124 else
1125 tmp = doevery;
1126 }
1127 else
1128 tmp = 1;
1129 s += UTF8SKIP(s);
1130 }
1131 }
1132 else {
1133 while (s < strend) {
1134 if (isSPACE_LC(*s)) {
1135 if (tmp && (norun || regtry(prog, s)))
1136 goto got_it;
1137 else
1138 tmp = doevery;
1139 }
1140 else
1141 tmp = 1;
1142 s++;
1143 }
1144 }
1145 break;
1146 case NSPACE:
1147 if (do_utf8) {
1148 while (s < strend) {
1149 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1150 if (tmp && (norun || regtry(prog, s)))
1151 goto got_it;
1152 else
1153 tmp = doevery;
1154 }
1155 else
1156 tmp = 1;
1157 s += UTF8SKIP(s);
1158 }
1159 }
1160 else {
1161 while (s < strend) {
1162 if (!isSPACE(*s)) {
1163 if (tmp && (norun || regtry(prog, s)))
1164 goto got_it;
1165 else
1166 tmp = doevery;
1167 }
1168 else
1169 tmp = 1;
1170 s++;
1171 }
1172 }
1173 break;
1174 case NSPACEL:
1175 PL_reg_flags |= RF_tainted;
1176 if (do_utf8) {
1177 while (s < strend) {
1178 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1179 if (tmp && (norun || regtry(prog, s)))
1180 goto got_it;
1181 else
1182 tmp = doevery;
1183 }
1184 else
1185 tmp = 1;
1186 s += UTF8SKIP(s);
1187 }
1188 }
1189 else {
1190 while (s < strend) {
1191 if (!isSPACE_LC(*s)) {
1192 if (tmp && (norun || regtry(prog, s)))
1193 goto got_it;
1194 else
1195 tmp = doevery;
1196 }
1197 else
1198 tmp = 1;
1199 s++;
1200 }
1201 }
1202 break;
1203 case DIGIT:
1204 if (do_utf8) {
1205 while (s < strend) {
1206 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1207 if (tmp && (norun || regtry(prog, s)))
1208 goto got_it;
1209 else
1210 tmp = doevery;
1211 }
1212 else
1213 tmp = 1;
1214 s += UTF8SKIP(s);
1215 }
1216 }
1217 else {
1218 while (s < strend) {
1219 if (isDIGIT(*s)) {
1220 if (tmp && (norun || regtry(prog, s)))
1221 goto got_it;
1222 else
1223 tmp = doevery;
1224 }
1225 else
1226 tmp = 1;
1227 s++;
1228 }
1229 }
1230 break;
1231 case DIGITL:
1232 PL_reg_flags |= RF_tainted;
1233 if (do_utf8) {
1234 while (s < strend) {
1235 if (isDIGIT_LC_utf8((U8*)s)) {
1236 if (tmp && (norun || regtry(prog, s)))
1237 goto got_it;
1238 else
1239 tmp = doevery;
1240 }
1241 else
1242 tmp = 1;
1243 s += UTF8SKIP(s);
1244 }
1245 }
1246 else {
1247 while (s < strend) {
1248 if (isDIGIT_LC(*s)) {
1249 if (tmp && (norun || regtry(prog, s)))
1250 goto got_it;
1251 else
1252 tmp = doevery;
1253 }
1254 else
1255 tmp = 1;
1256 s++;
1257 }
1258 }
1259 break;
1260 case NDIGIT:
1261 if (do_utf8) {
1262 while (s < strend) {
1263 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1264 if (tmp && (norun || regtry(prog, s)))
1265 goto got_it;
1266 else
1267 tmp = doevery;
1268 }
1269 else
1270 tmp = 1;
1271 s += UTF8SKIP(s);
1272 }
1273 }
1274 else {
1275 while (s < strend) {
1276 if (!isDIGIT(*s)) {
1277 if (tmp && (norun || regtry(prog, s)))
1278 goto got_it;
1279 else
1280 tmp = doevery;
1281 }
1282 else
1283 tmp = 1;
1284 s++;
1285 }
1286 }
1287 break;
1288 case NDIGITL:
1289 PL_reg_flags |= RF_tainted;
1290 if (do_utf8) {
1291 while (s < strend) {
1292 if (!isDIGIT_LC_utf8((U8*)s)) {
1293 if (tmp && (norun || regtry(prog, s)))
1294 goto got_it;
1295 else
1296 tmp = doevery;
1297 }
1298 else
1299 tmp = 1;
1300 s += UTF8SKIP(s);
1301 }
1302 }
1303 else {
1304 while (s < strend) {
1305 if (!isDIGIT_LC(*s)) {
1306 if (tmp && (norun || regtry(prog, s)))
1307 goto got_it;
1308 else
1309 tmp = doevery;
1310 }
1311 else
1312 tmp = 1;
1313 s++;
1314 }
1315 }
1316 break;
1317 default:
1318 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1319 break;
1320 }
1321 return 0;
1322 got_it:
1323 return s;
1324}
1325
1326/*
1327 - regexec_flags - match a regexp against a string
1328 */
1329I32
1330Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1331 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1332/* strend: pointer to null at end of string */
1333/* strbeg: real beginning of string */
1334/* minend: end of match must be >=minend after stringarg. */
1335/* data: May be used for some additional optimizations. */
1336/* nosave: For optimizations. */
1337{
1338 register char *s;
1339 register regnode *c;
1340 register char *startpos = stringarg;
1341 I32 minlen; /* must match at least this many chars */
1342 I32 dontbother = 0; /* how many characters not to try at end */
1343 /* I32 start_shift = 0; */ /* Offset of the start to find
1344 constant substr. */ /* CC */
1345 I32 end_shift = 0; /* Same for the end. */ /* CC */
1346 I32 scream_pos = -1; /* Internal iterator of scream. */
1347 char *scream_olds;
1348 SV* oreplsv = GvSV(PL_replgv);
1349
1350 PL_regcc = 0;
1351
1352 cache_re(prog);
1353#ifdef DEBUGGING
1354 PL_regnarrate = PL_debug & 512;
1355#endif
1356
1357 /* Be paranoid... */
1358 if (prog == NULL || startpos == NULL) {
1359 Perl_croak(aTHX_ "NULL regexp parameter");
1360 return 0;
1361 }
1362
1363 minlen = prog->minlen;
1364 if (strend - startpos < minlen) goto phooey;
1365
1366 if (startpos == strbeg) /* is ^ valid at stringarg? */
1367 PL_regprev = '\n';
1368 else {
1369 PL_regprev = (U32)stringarg[-1];
1370 if (!PL_multiline && PL_regprev == '\n')
1371 PL_regprev = '\0'; /* force ^ to NOT match */
1372 }
1373
1374 /* Check validity of program. */
1375 if (UCHARAT(prog->program) != REG_MAGIC) {
1376 Perl_croak(aTHX_ "corrupted regexp program");
1377 }
1378
1379 PL_reg_flags = 0;
1380 PL_reg_eval_set = 0;
1381 PL_reg_maxiter = 0;
1382
1383 if (prog->reganch & ROPT_UTF8)
1384 PL_reg_flags |= RF_utf8;
1385
1386 /* Mark beginning of line for ^ and lookbehind. */
1387 PL_regbol = startpos;
1388 PL_bostr = strbeg;
1389 PL_reg_sv = sv;
1390
1391 /* Mark end of line for $ (and such) */
1392 PL_regeol = strend;
1393
1394 /* see how far we have to get to not match where we matched before */
1395 PL_regtill = startpos+minend;
1396
1397 /* We start without call_cc context. */
1398 PL_reg_call_cc = 0;
1399
1400 /* If there is a "must appear" string, look for it. */
1401 s = startpos;
1402
1403 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1404 MAGIC *mg;
1405
1406 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1407 PL_reg_ganch = startpos;
1408 else if (sv && SvTYPE(sv) >= SVt_PVMG
1409 && SvMAGIC(sv)
1410 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1411 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1412 if (prog->reganch & ROPT_ANCH_GPOS) {
1413 if (s > PL_reg_ganch)
1414 goto phooey;
1415 s = PL_reg_ganch;
1416 }
1417 }
1418 else /* pos() not defined */
1419 PL_reg_ganch = strbeg;
1420 }
1421
1422 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1423 re_scream_pos_data d;
1424
1425 d.scream_olds = &scream_olds;
1426 d.scream_pos = &scream_pos;
1427 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1428 if (!s)
1429 goto phooey; /* not present */
1430 }
1431
1432 DEBUG_r( if (!PL_colorset) reginitcolors() );
1433 DEBUG_r(PerlIO_printf(Perl_debug_log,
1434 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1435 PL_colors[4],PL_colors[5],PL_colors[0],
1436 prog->precomp,
1437 PL_colors[1],
1438 (strlen(prog->precomp) > 60 ? "..." : ""),
1439 PL_colors[0],
1440 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1441 startpos, PL_colors[1],
1442 (strend - startpos > 60 ? "..." : ""))
1443 );
1444
1445 /* Simplest case: anchored match need be tried only once. */
1446 /* [unless only anchor is BOL and multiline is set] */
1447 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1448 if (s == startpos && regtry(prog, startpos))
1449 goto got_it;
1450 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1451 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1452 {
1453 char *end;
1454
1455 if (minlen)
1456 dontbother = minlen - 1;
1457 end = HOPc(strend, -dontbother) - 1;
1458 /* for multiline we only have to try after newlines */
1459 if (prog->check_substr) {
1460 if (s == startpos)
1461 goto after_try;
1462 while (1) {
1463 if (regtry(prog, s))
1464 goto got_it;
1465 after_try:
1466 if (s >= end)
1467 goto phooey;
1468 if (prog->reganch & RE_USE_INTUIT) {
1469 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1470 if (!s)
1471 goto phooey;
1472 }
1473 else
1474 s++;
1475 }
1476 } else {
1477 if (s > startpos)
1478 s--;
1479 while (s < end) {
1480 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1481 if (regtry(prog, s))
1482 goto got_it;
1483 }
1484 }
1485 }
1486 }
1487 goto phooey;
1488 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1489 if (regtry(prog, PL_reg_ganch))
1490 goto got_it;
1491 goto phooey;
1492 }
1493
1494 /* Messy cases: unanchored match. */
1495 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1496 /* we have /x+whatever/ */
1497 /* it must be a one character string (XXXX Except UTF?) */
1498 char ch = SvPVX(prog->anchored_substr)[0];
1499#ifdef DEBUGGING
1500 int did_match = 0;
1501#endif
1502
1503 if (UTF) {
1504 while (s < strend) {
1505 if (*s == ch) {
1506 DEBUG_r( did_match = 1 );
1507 if (regtry(prog, s)) goto got_it;
1508 s += UTF8SKIP(s);
1509 while (s < strend && *s == ch)
1510 s += UTF8SKIP(s);
1511 }
1512 s += UTF8SKIP(s);
1513 }
1514 }
1515 else {
1516 while (s < strend) {
1517 if (*s == ch) {
1518 DEBUG_r( did_match = 1 );
1519 if (regtry(prog, s)) goto got_it;
1520 s++;
1521 while (s < strend && *s == ch)
1522 s++;
1523 }
1524 s++;
1525 }
1526 }
1527 DEBUG_r(did_match ||
1528 PerlIO_printf(Perl_debug_log,
1529 "Did not find anchored character...\n"));
1530 }
1531 /*SUPPRESS 560*/
1532 else if (prog->anchored_substr != Nullsv
1533 || (prog->float_substr != Nullsv
1534 && prog->float_max_offset < strend - s)) {
1535 SV *must = prog->anchored_substr
1536 ? prog->anchored_substr : prog->float_substr;
1537 I32 back_max =
1538 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1539 I32 back_min =
1540 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1541 char *last = HOPc(strend, /* Cannot start after this */
1542 -(I32)(CHR_SVLEN(must)
1543 - (SvTAIL(must) != 0) + back_min));
1544 char *last1; /* Last position checked before */
1545#ifdef DEBUGGING
1546 int did_match = 0;
1547#endif
1548
1549 if (s > PL_bostr)
1550 last1 = HOPc(s, -1);
1551 else
1552 last1 = s - 1; /* bogus */
1553
1554 /* XXXX check_substr already used to find `s', can optimize if
1555 check_substr==must. */
1556 scream_pos = -1;
1557 dontbother = end_shift;
1558 strend = HOPc(strend, -dontbother);
1559 while ( (s <= last) &&
1560 ((flags & REXEC_SCREAM)
1561 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
1562 end_shift, &scream_pos, 0))
1563 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
1564 (unsigned char*)strend, must,
1565 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1566 DEBUG_r( did_match = 1 );
1567 if (HOPc(s, -back_max) > last1) {
1568 last1 = HOPc(s, -back_min);
1569 s = HOPc(s, -back_max);
1570 }
1571 else {
1572 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1573
1574 last1 = HOPc(s, -back_min);
1575 s = t;
1576 }
1577 if (UTF) {
1578 while (s <= last1) {
1579 if (regtry(prog, s))
1580 goto got_it;
1581 s += UTF8SKIP(s);
1582 }
1583 }
1584 else {
1585 while (s <= last1) {
1586 if (regtry(prog, s))
1587 goto got_it;
1588 s++;
1589 }
1590 }
1591 }
1592 DEBUG_r(did_match ||
1593 PerlIO_printf(Perl_debug_log, "Did not find %s substr `%s%.*s%s'%s...\n",
1594 ((must == prog->anchored_substr)
1595 ? "anchored" : "floating"),
1596 PL_colors[0],
1597 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1598 SvPVX(must),
1599 PL_colors[1], (SvTAIL(must) ? "$" : "")));
1600 goto phooey;
1601 }
1602 else if ((c = prog->regstclass)) {
1603 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1604 /* don't bother with what can't match */
1605 strend = HOPc(strend, -(minlen - 1));
1606 DEBUG_r({
1607 SV *prop = sv_newmortal();
1608 regprop(prop, c);
1609 PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
1610 });
1611 if (find_byclass(prog, c, s, strend, startpos, 0))
1612 goto got_it;
1613 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1614 }
1615 else {
1616 dontbother = 0;
1617 if (prog->float_substr != Nullsv) { /* Trim the end. */
1618 char *last;
1619
1620 if (flags & REXEC_SCREAM) {
1621 last = screaminstr(sv, prog->float_substr, s - strbeg,
1622 end_shift, &scream_pos, 1); /* last one */
1623 if (!last)
1624 last = scream_olds; /* Only one occurrence. */
1625 }
1626 else {
1627 STRLEN len;
1628 char *little = SvPV(prog->float_substr, len);
1629
1630 if (SvTAIL(prog->float_substr)) {
1631 if (memEQ(strend - len + 1, little, len - 1))
1632 last = strend - len + 1;
1633 else if (!PL_multiline)
1634 last = memEQ(strend - len, little, len)
1635 ? strend - len : Nullch;
1636 else
1637 goto find_last;
1638 } else {
1639 find_last:
1640 if (len)
1641 last = rninstr(s, strend, little, little + len);
1642 else
1643 last = strend; /* matching `$' */
1644 }
1645 }
1646 if (last == NULL) {
1647 DEBUG_r(PerlIO_printf(Perl_debug_log,
1648 "%sCan't trim the tail, match fails (should not happen)%s\n",
1649 PL_colors[4],PL_colors[5]));
1650 goto phooey; /* Should not happen! */
1651 }
1652 dontbother = strend - last + prog->float_min_offset;
1653 }
1654 if (minlen && (dontbother < minlen))
1655 dontbother = minlen - 1;
1656 strend -= dontbother; /* this one's always in bytes! */
1657 /* We don't know much -- general case. */
1658 if (UTF) {
1659 for (;;) {
1660 if (regtry(prog, s))
1661 goto got_it;
1662 if (s >= strend)
1663 break;
1664 s += UTF8SKIP(s);
1665 };
1666 }
1667 else {
1668 do {
1669 if (regtry(prog, s))
1670 goto got_it;
1671 } while (s++ < strend);
1672 }
1673 }
1674
1675 /* Failure. */
1676 goto phooey;
1677
1678got_it:
1679 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1680
1681 if (PL_reg_eval_set) {
1682 /* Preserve the current value of $^R */
1683 if (oreplsv != GvSV(PL_replgv))
1684 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1685 restored, the value remains
1686 the same. */
1687 restore_pos(aTHXo_ 0);
1688 }
1689
1690 /* make sure $`, $&, $', and $digit will work later */
1691 if ( !(flags & REXEC_NOT_FIRST) ) {
1692 if (RX_MATCH_COPIED(prog)) {
1693 Safefree(prog->subbeg);
1694 RX_MATCH_COPIED_off(prog);
1695 }
1696 if (flags & REXEC_COPY_STR) {
1697 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1698
1699 s = savepvn(strbeg, i);
1700 prog->subbeg = s;
1701 prog->sublen = i;
1702 RX_MATCH_COPIED_on(prog);
1703 }
1704 else {
1705 prog->subbeg = strbeg;
1706 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1707 }
1708 }
1709
1710 return 1;
1711
1712phooey:
1713 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1714 PL_colors[4],PL_colors[5]));
1715 if (PL_reg_eval_set)
1716 restore_pos(aTHXo_ 0);
1717 return 0;
1718}
1719
1720/*
1721 - regtry - try match at specific point
1722 */
1723STATIC I32 /* 0 failure, 1 success */
1724S_regtry(pTHX_ regexp *prog, char *startpos)
1725{
1726 register I32 i;
1727 register I32 *sp;
1728 register I32 *ep;
1729 CHECKPOINT lastcp;
1730
1731#ifdef DEBUGGING
1732 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1733#endif
1734 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1735 MAGIC *mg;
1736
1737 PL_reg_eval_set = RS_init;
1738 DEBUG_r(DEBUG_s(
1739 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1740 (IV)(PL_stack_sp - PL_stack_base));
1741 ));
1742 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1743 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1744 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1745 SAVETMPS;
1746 /* Apparently this is not needed, judging by wantarray. */
1747 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1748 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1749
1750 if (PL_reg_sv) {
1751 /* Make $_ available to executed code. */
1752 if (PL_reg_sv != DEFSV) {
1753 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1754 SAVESPTR(DEFSV);
1755 DEFSV = PL_reg_sv;
1756 }
1757
1758 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1759 && (mg = mg_find(PL_reg_sv, 'g')))) {
1760 /* prepare for quick setting of pos */
1761 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1762 mg = mg_find(PL_reg_sv, 'g');
1763 mg->mg_len = -1;
1764 }
1765 PL_reg_magic = mg;
1766 PL_reg_oldpos = mg->mg_len;
1767 SAVEDESTRUCTOR_X(restore_pos, 0);
1768 }
1769 if (!PL_reg_curpm)
1770 Newz(22,PL_reg_curpm, 1, PMOP);
1771 PL_reg_curpm->op_pmregexp = prog;
1772 PL_reg_oldcurpm = PL_curpm;
1773 PL_curpm = PL_reg_curpm;
1774 if (RX_MATCH_COPIED(prog)) {
1775 /* Here is a serious problem: we cannot rewrite subbeg,
1776 since it may be needed if this match fails. Thus
1777 $` inside (?{}) could fail... */
1778 PL_reg_oldsaved = prog->subbeg;
1779 PL_reg_oldsavedlen = prog->sublen;
1780 RX_MATCH_COPIED_off(prog);
1781 }
1782 else
1783 PL_reg_oldsaved = Nullch;
1784 prog->subbeg = PL_bostr;
1785 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1786 }
1787 prog->startp[0] = startpos - PL_bostr;
1788 PL_reginput = startpos;
1789 PL_regstartp = prog->startp;
1790 PL_regendp = prog->endp;
1791 PL_reglastparen = &prog->lastparen;
1792 prog->lastparen = 0;
1793 PL_regsize = 0;
1794 DEBUG_r(PL_reg_starttry = startpos);
1795 if (PL_reg_start_tmpl <= prog->nparens) {
1796 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1797 if(PL_reg_start_tmp)
1798 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1799 else
1800 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1801 }
1802
1803 /* XXXX What this code is doing here?!!! There should be no need
1804 to do this again and again, PL_reglastparen should take care of
1805 this! --ilya*/
1806
1807 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
1808 * Actually, the code in regcppop() (which Ilya may be meaning by
1809 * PL_reglastparen), is not needed at all by the test suite
1810 * (op/regexp, op/pat, op/split), but that code is needed, oddly
1811 * enough, for building DynaLoader, or otherwise this
1812 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
1813 * will happen. Meanwhile, this code *is* needed for the
1814 * above-mentioned test suite tests to succeed. The common theme
1815 * on those tests seems to be returning null fields from matches.
1816 * --jhi */
1817#if 1
1818 sp = prog->startp;
1819 ep = prog->endp;
1820 if (prog->nparens) {
1821 for (i = prog->nparens; i > *PL_reglastparen; i--) {
1822 *++sp = -1;
1823 *++ep = -1;
1824 }
1825 }
1826#endif
1827 REGCP_SET(lastcp);
1828 if (regmatch(prog->program + 1)) {
1829 prog->endp[0] = PL_reginput - PL_bostr;
1830 return 1;
1831 }
1832 REGCP_UNWIND(lastcp);
1833 return 0;
1834}
1835
1836#define RE_UNWIND_BRANCH 1
1837#define RE_UNWIND_BRANCHJ 2
1838
1839union re_unwind_t;
1840
1841typedef struct { /* XX: makes sense to enlarge it... */
1842 I32 type;
1843 I32 prev;
1844 CHECKPOINT lastcp;
1845} re_unwind_generic_t;
1846
1847typedef struct {
1848 I32 type;
1849 I32 prev;
1850 CHECKPOINT lastcp;
1851 I32 lastparen;
1852 regnode *next;
1853 char *locinput;
1854 I32 nextchr;
1855#ifdef DEBUGGING
1856 int regindent;
1857#endif
1858} re_unwind_branch_t;
1859
1860typedef union re_unwind_t {
1861 I32 type;
1862 re_unwind_generic_t generic;
1863 re_unwind_branch_t branch;
1864} re_unwind_t;
1865
1866/*
1867 - regmatch - main matching routine
1868 *
1869 * Conceptually the strategy is simple: check to see whether the current
1870 * node matches, call self recursively to see whether the rest matches,
1871 * and then act accordingly. In practice we make some effort to avoid
1872 * recursion, in particular by going through "ordinary" nodes (that don't
1873 * need to know whether the rest of the match failed) by a loop instead of
1874 * by recursion.
1875 */
1876/* [lwall] I've hoisted the register declarations to the outer block in order to
1877 * maybe save a little bit of pushing and popping on the stack. It also takes
1878 * advantage of machines that use a register save mask on subroutine entry.
1879 */
1880STATIC I32 /* 0 failure, 1 success */
1881S_regmatch(pTHX_ regnode *prog)
1882{
1883 register regnode *scan; /* Current node. */
1884 regnode *next; /* Next node. */
1885 regnode *inner; /* Next node in internal branch. */
1886 register I32 nextchr; /* renamed nextchr - nextchar colides with
1887 function of same name */
1888 register I32 n; /* no or next */
1889 register I32 ln; /* len or last */
1890 register char *s; /* operand or save */
1891 register char *locinput = PL_reginput;
1892 register I32 c1, c2, paren; /* case fold search, parenth */
1893 int minmod = 0, sw = 0, logical = 0;
1894 I32 unwind = 0;
1895 I32 firstcp = PL_savestack_ix;
1896 register bool do_utf8 = DO_UTF8(PL_reg_sv);
1897
1898#ifdef DEBUGGING
1899 PL_regindent++;
1900#endif
1901
1902 /* Note that nextchr is a byte even in UTF */
1903 nextchr = UCHARAT(locinput);
1904 scan = prog;
1905 while (scan != NULL) {
1906#define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1907#if 1
1908# define sayYES goto yes
1909# define sayNO goto no
1910# define sayYES_FINAL goto yes_final
1911# define sayYES_LOUD goto yes_loud
1912# define sayNO_FINAL goto no_final
1913# define sayNO_SILENT goto do_no
1914# define saySAME(x) if (x) goto yes; else goto no
1915# define REPORT_CODE_OFF 24
1916#else
1917# define sayYES return 1
1918# define sayNO return 0
1919# define sayYES_FINAL return 1
1920# define sayYES_LOUD return 1
1921# define sayNO_FINAL return 0
1922# define sayNO_SILENT return 0
1923# define saySAME(x) return x
1924#endif
1925 DEBUG_r( {
1926 SV *prop = sv_newmortal();
1927 int docolor = *PL_colors[0];
1928 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1929 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1930 /* The part of the string before starttry has one color
1931 (pref0_len chars), between starttry and current
1932 position another one (pref_len - pref0_len chars),
1933 after the current position the third one.
1934 We assume that pref0_len <= pref_len, otherwise we
1935 decrease pref0_len. */
1936 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1937 ? (5 + taill) - l : locinput - PL_bostr);
1938 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1939
1940 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1941 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1942 ? (5 + taill) - pref_len : PL_regeol - locinput);
1943 if (pref0_len < 0)
1944 pref0_len = 0;
1945 if (pref0_len > pref_len)
1946 pref0_len = pref_len;
1947 regprop(prop, scan);
1948 PerlIO_printf(Perl_debug_log,
1949 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
1950 (IV)(locinput - PL_bostr),
1951 PL_colors[4], pref0_len,
1952 locinput - pref_len, PL_colors[5],
1953 PL_colors[2], pref_len - pref0_len,
1954 locinput - pref_len + pref0_len, PL_colors[3],
1955 (docolor ? "" : "> <"),
1956 PL_colors[0], l, locinput, PL_colors[1],
1957 15 - l - pref_len + 1,
1958 "",
1959 (IV)(scan - PL_regprogram), PL_regindent*2, "",
1960 SvPVX(prop));
1961 } );
1962
1963 next = scan + NEXT_OFF(scan);
1964 if (next == scan)
1965 next = NULL;
1966
1967 switch (OP(scan)) {
1968 case BOL:
1969 if (locinput == PL_bostr
1970 ? PL_regprev == '\n'
1971 : (PL_multiline &&
1972 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1973 {
1974 /* regtill = regbol; */
1975 break;
1976 }
1977 sayNO;
1978 case MBOL:
1979 if (locinput == PL_bostr
1980 ? PL_regprev == '\n'
1981 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1982 {
1983 break;
1984 }
1985 sayNO;
1986 case SBOL:
1987 if (locinput == PL_bostr)
1988 break;
1989 sayNO;
1990 case GPOS:
1991 if (locinput == PL_reg_ganch)
1992 break;
1993 sayNO;
1994 case EOL:
1995 if (PL_multiline)
1996 goto meol;
1997 else
1998 goto seol;
1999 case MEOL:
2000 meol:
2001 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2002 sayNO;
2003 break;
2004 case SEOL:
2005 seol:
2006 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2007 sayNO;
2008 if (PL_regeol - locinput > 1)
2009 sayNO;
2010 break;
2011 case EOS:
2012 if (PL_regeol != locinput)
2013 sayNO;
2014 break;
2015 case SANY:
2016 if (DO_UTF8(PL_reg_sv)) {
2017 locinput += PL_utf8skip[nextchr];
2018 if (locinput > PL_regeol)
2019 sayNO;
2020 nextchr = UCHARAT(locinput);
2021 break;
2022 }
2023 if (!nextchr && locinput >= PL_regeol)
2024 sayNO;
2025 nextchr = UCHARAT(++locinput);
2026 break;
2027 case REG_ANY:
2028 if (DO_UTF8(PL_reg_sv)) {
2029 locinput += PL_utf8skip[nextchr];
2030 if (locinput > PL_regeol)
2031 sayNO;
2032 nextchr = UCHARAT(locinput);
2033 break;
2034 }
2035 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2036 sayNO;
2037 nextchr = UCHARAT(++locinput);
2038 break;
2039 case EXACT:
2040 s = STRING(scan);
2041 ln = STR_LEN(scan);
2042 /* Inline the first character, for speed. */
2043 if (UCHARAT(s) != nextchr)
2044 sayNO;
2045 if (PL_regeol - locinput < ln)
2046 sayNO;
2047 if (ln > 1 && memNE(s, locinput, ln))
2048 sayNO;
2049 locinput += ln;
2050 nextchr = UCHARAT(locinput);
2051 break;
2052 case EXACTFL:
2053 PL_reg_flags |= RF_tainted;
2054 /* FALL THROUGH */
2055 case EXACTF:
2056 s = STRING(scan);
2057 ln = STR_LEN(scan);
2058
2059 if (UTF) {
2060 char *l = locinput;
2061 char *e = s + ln;
2062 c1 = OP(scan) == EXACTF;
2063 while (s < e) {
2064 if (l >= PL_regeol)
2065 sayNO;
2066 if (utf8_to_uv((U8*)s, e - s, 0, 0) !=
2067 (c1 ?
2068 toLOWER_utf8((U8*)l) :
2069 toLOWER_LC_utf8((U8*)l)))
2070 {
2071 sayNO;
2072 }
2073 s += UTF8SKIP(s);
2074 l += UTF8SKIP(l);
2075 }
2076 locinput = l;
2077 nextchr = UCHARAT(locinput);
2078 break;
2079 }
2080
2081 /* Inline the first character, for speed. */
2082 if (UCHARAT(s) != nextchr &&
2083 UCHARAT(s) != ((OP(scan) == EXACTF)
2084 ? PL_fold : PL_fold_locale)[nextchr])
2085 sayNO;
2086 if (PL_regeol - locinput < ln)
2087 sayNO;
2088 if (ln > 1 && (OP(scan) == EXACTF
2089 ? ibcmp(s, locinput, ln)
2090 : ibcmp_locale(s, locinput, ln)))
2091 sayNO;
2092 locinput += ln;
2093 nextchr = UCHARAT(locinput);
2094 break;
2095 case ANYOF:
2096 if (do_utf8) {
2097 if (!reginclass(scan, (U8*)locinput, do_utf8))
2098 sayNO;
2099 if (locinput >= PL_regeol)
2100 sayNO;
2101 locinput += PL_utf8skip[nextchr];
2102 nextchr = UCHARAT(locinput);
2103 }
2104 else {
2105 if (nextchr < 0)
2106 nextchr = UCHARAT(locinput);
2107 if (!reginclass(scan, (U8*)locinput, do_utf8))
2108 sayNO;
2109 if (!nextchr && locinput >= PL_regeol)
2110 sayNO;
2111 nextchr = UCHARAT(++locinput);
2112 }
2113 break;
2114 case ALNUML:
2115 PL_reg_flags |= RF_tainted;
2116 /* FALL THROUGH */
2117 case ALNUM:
2118 if (!nextchr)
2119 sayNO;
2120 if (do_utf8) {
2121 if (!(OP(scan) == ALNUM
2122 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2123 : isALNUM_LC_utf8((U8*)locinput)))
2124 {
2125 sayNO;
2126 }
2127 locinput += PL_utf8skip[nextchr];
2128 nextchr = UCHARAT(locinput);
2129 break;
2130 }
2131 if (!(OP(scan) == ALNUM
2132 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2133 sayNO;
2134 nextchr = UCHARAT(++locinput);
2135 break;
2136 case NALNUML:
2137 PL_reg_flags |= RF_tainted;
2138 /* FALL THROUGH */
2139 case NALNUM:
2140 if (!nextchr && locinput >= PL_regeol)
2141 sayNO;
2142 if (do_utf8) {
2143 if (OP(scan) == NALNUM
2144 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2145 : isALNUM_LC_utf8((U8*)locinput))
2146 {
2147 sayNO;
2148 }
2149 locinput += PL_utf8skip[nextchr];
2150 nextchr = UCHARAT(locinput);
2151 break;
2152 }
2153 if (OP(scan) == NALNUM
2154 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2155 sayNO;
2156 nextchr = UCHARAT(++locinput);
2157 break;
2158 case BOUNDL:
2159 case NBOUNDL:
2160 PL_reg_flags |= RF_tainted;
2161 /* FALL THROUGH */
2162 case BOUND:
2163 case NBOUND:
2164 /* was last char in word? */
2165 if (do_utf8) {
2166 if (locinput == PL_regbol)
2167 ln = PL_regprev;
2168 else {
2169 U8 *r = reghop((U8*)locinput, -1);
2170
2171 ln = utf8_to_uv(r, s - (char*)r, 0, 0);
2172 }
2173 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2174 ln = isALNUM_uni(ln);
2175 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2176 }
2177 else {
2178 ln = isALNUM_LC_uni(ln);
2179 n = isALNUM_LC_utf8((U8*)locinput);
2180 }
2181 }
2182 else {
2183 ln = (locinput != PL_regbol) ?
2184 UCHARAT(locinput - 1) : PL_regprev;
2185 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2186 ln = isALNUM(ln);
2187 n = isALNUM(nextchr);
2188 }
2189 else {
2190 ln = isALNUM_LC(ln);
2191 n = isALNUM_LC(nextchr);
2192 }
2193 }
2194 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2195 OP(scan) == BOUNDL))
2196 sayNO;
2197 break;
2198 case SPACEL:
2199 PL_reg_flags |= RF_tainted;
2200 /* FALL THROUGH */
2201 case SPACE:
2202 if (!nextchr)
2203 sayNO;
2204 if (DO_UTF8(PL_reg_sv)) {
2205 if (nextchr & 0x80) {
2206 if (!(OP(scan) == SPACE
2207 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2208 : isSPACE_LC_utf8((U8*)locinput)))
2209 {
2210 sayNO;
2211 }
2212 locinput += PL_utf8skip[nextchr];
2213 nextchr = UCHARAT(locinput);
2214 break;
2215 }
2216 if (!(OP(scan) == SPACE
2217 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2218 sayNO;
2219 nextchr = UCHARAT(++locinput);
2220 }
2221 else {
2222 if (!(OP(scan) == SPACE
2223 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2224 sayNO;
2225 nextchr = UCHARAT(++locinput);
2226 }
2227 break;
2228 case NSPACEL:
2229 PL_reg_flags |= RF_tainted;
2230 /* FALL THROUGH */
2231 case NSPACE:
2232 if (!nextchr && locinput >= PL_regeol)
2233 sayNO;
2234 if (DO_UTF8(PL_reg_sv)) {
2235 if (OP(scan) == NSPACE
2236 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2237 : isSPACE_LC_utf8((U8*)locinput))
2238 {
2239 sayNO;
2240 }
2241 locinput += PL_utf8skip[nextchr];
2242 nextchr = UCHARAT(locinput);
2243 break;
2244 }
2245 if (OP(scan) == NSPACE
2246 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2247 sayNO;
2248 nextchr = UCHARAT(++locinput);
2249 break;
2250 case DIGITL:
2251 PL_reg_flags |= RF_tainted;
2252 /* FALL THROUGH */
2253 case DIGIT:
2254 if (!nextchr)
2255 sayNO;
2256 if (DO_UTF8(PL_reg_sv)) {
2257 if (!(OP(scan) == DIGIT
2258 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2259 : isDIGIT_LC_utf8((U8*)locinput)))
2260 {
2261 sayNO;
2262 }
2263 locinput += PL_utf8skip[nextchr];
2264 nextchr = UCHARAT(locinput);
2265 break;
2266 }
2267 if (!(OP(scan) == DIGIT
2268 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2269 sayNO;
2270 nextchr = UCHARAT(++locinput);
2271 break;
2272 case NDIGITL:
2273 PL_reg_flags |= RF_tainted;
2274 /* FALL THROUGH */
2275 case NDIGIT:
2276 if (!nextchr && locinput >= PL_regeol)
2277 sayNO;
2278 if (DO_UTF8(PL_reg_sv)) {
2279 if (OP(scan) == NDIGIT
2280 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2281 : isDIGIT_LC_utf8((U8*)locinput))
2282 {
2283 sayNO;
2284 }
2285 locinput += PL_utf8skip[nextchr];
2286 nextchr = UCHARAT(locinput);
2287 break;
2288 }
2289 if (OP(scan) == NDIGIT
2290 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2291 sayNO;
2292 nextchr = UCHARAT(++locinput);
2293 break;
2294 case CLUMP:
2295 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2296 sayNO;
2297 locinput += PL_utf8skip[nextchr];
2298 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2299 locinput += UTF8SKIP(locinput);
2300 if (locinput > PL_regeol)
2301 sayNO;
2302 nextchr = UCHARAT(locinput);
2303 break;
2304 case REFFL:
2305 PL_reg_flags |= RF_tainted;
2306 /* FALL THROUGH */
2307 case REF:
2308 case REFF:
2309 n = ARG(scan); /* which paren pair */
2310 ln = PL_regstartp[n];
2311 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2312 if (*PL_reglastparen < n || ln == -1)
2313 sayNO; /* Do not match unless seen CLOSEn. */
2314 if (ln == PL_regendp[n])
2315 break;
2316
2317 s = PL_bostr + ln;
2318 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2319 char *l = locinput;
2320 char *e = PL_bostr + PL_regendp[n];
2321 /*
2322 * Note that we can't do the "other character" lookup trick as
2323 * in the 8-bit case (no pun intended) because in Unicode we
2324 * have to map both upper and title case to lower case.
2325 */
2326 if (OP(scan) == REFF) {
2327 while (s < e) {
2328 if (l >= PL_regeol)
2329 sayNO;
2330 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2331 sayNO;
2332 s += UTF8SKIP(s);
2333 l += UTF8SKIP(l);
2334 }
2335 }
2336 else {
2337 while (s < e) {
2338 if (l >= PL_regeol)
2339 sayNO;
2340 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2341 sayNO;
2342 s += UTF8SKIP(s);
2343 l += UTF8SKIP(l);
2344 }
2345 }
2346 locinput = l;
2347 nextchr = UCHARAT(locinput);
2348 break;
2349 }
2350
2351 /* Inline the first character, for speed. */
2352 if (UCHARAT(s) != nextchr &&
2353 (OP(scan) == REF ||
2354 (UCHARAT(s) != ((OP(scan) == REFF
2355 ? PL_fold : PL_fold_locale)[nextchr]))))
2356 sayNO;
2357 ln = PL_regendp[n] - ln;
2358 if (locinput + ln > PL_regeol)
2359 sayNO;
2360 if (ln > 1 && (OP(scan) == REF
2361 ? memNE(s, locinput, ln)
2362 : (OP(scan) == REFF
2363 ? ibcmp(s, locinput, ln)
2364 : ibcmp_locale(s, locinput, ln))))
2365 sayNO;
2366 locinput += ln;
2367 nextchr = UCHARAT(locinput);
2368 break;
2369
2370 case NOTHING:
2371 case TAIL:
2372 break;
2373 case BACK:
2374 break;
2375 case EVAL:
2376 {
2377 dSP;
2378 OP_4tree *oop = PL_op;
2379 COP *ocurcop = PL_curcop;
2380 SV **ocurpad = PL_curpad;
2381 SV *ret;
2382
2383 n = ARG(scan);
2384 PL_op = (OP_4tree*)PL_regdata->data[n];
2385 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2386 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2387 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2388
2389 CALLRUNOPS(aTHX); /* Scalar context. */
2390 SPAGAIN;
2391 ret = POPs;
2392 PUTBACK;
2393
2394 PL_op = oop;
2395 PL_curpad = ocurpad;
2396 PL_curcop = ocurcop;
2397 if (logical) {
2398 if (logical == 2) { /* Postponed subexpression. */
2399 regexp *re;
2400 MAGIC *mg = Null(MAGIC*);
2401 re_cc_state state;
2402 CHECKPOINT cp, lastcp;
2403
2404 if(SvROK(ret) || SvRMAGICAL(ret)) {
2405 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2406
2407 if(SvMAGICAL(sv))
2408 mg = mg_find(sv, 'r');
2409 }
2410 if (mg) {
2411 re = (regexp *)mg->mg_obj;
2412 (void)ReREFCNT_inc(re);
2413 }
2414 else {
2415 STRLEN len;
2416 char *t = SvPV(ret, len);
2417 PMOP pm;
2418 char *oprecomp = PL_regprecomp;
2419 I32 osize = PL_regsize;
2420 I32 onpar = PL_regnpar;
2421
2422 pm.op_pmflags = 0;
2423 pm.op_pmdynflags = (UTF||DO_UTF8(ret) ? PMdf_UTF8 : 0);
2424 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2425 if (!(SvFLAGS(ret)
2426 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2427 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2428 PL_regprecomp = oprecomp;
2429 PL_regsize = osize;
2430 PL_regnpar = onpar;
2431 }
2432 DEBUG_r(
2433 PerlIO_printf(Perl_debug_log,
2434 "Entering embedded `%s%.60s%s%s'\n",
2435 PL_colors[0],
2436 re->precomp,
2437 PL_colors[1],
2438 (strlen(re->precomp) > 60 ? "..." : ""))
2439 );
2440 state.node = next;
2441 state.prev = PL_reg_call_cc;
2442 state.cc = PL_regcc;
2443 state.re = PL_reg_re;
2444
2445 PL_regcc = 0;
2446
2447 cp = regcppush(0); /* Save *all* the positions. */
2448 REGCP_SET(lastcp);
2449 cache_re(re);
2450 state.ss = PL_savestack_ix;
2451 *PL_reglastparen = 0;
2452 PL_reg_call_cc = &state;
2453 PL_reginput = locinput;
2454
2455 /* XXXX This is too dramatic a measure... */
2456 PL_reg_maxiter = 0;
2457
2458 if (regmatch(re->program + 1)) {
2459 /* Even though we succeeded, we need to restore
2460 global variables, since we may be wrapped inside
2461 SUSPEND, thus the match may be not finished yet. */
2462
2463 /* XXXX Do this only if SUSPENDed? */
2464 PL_reg_call_cc = state.prev;
2465 PL_regcc = state.cc;
2466 PL_reg_re = state.re;
2467 cache_re(PL_reg_re);
2468
2469 /* XXXX This is too dramatic a measure... */
2470 PL_reg_maxiter = 0;
2471
2472 /* These are needed even if not SUSPEND. */
2473 ReREFCNT_dec(re);
2474 regcpblow(cp);
2475 sayYES;
2476 }
2477 ReREFCNT_dec(re);
2478 REGCP_UNWIND(lastcp);
2479 regcppop();
2480 PL_reg_call_cc = state.prev;
2481 PL_regcc = state.cc;
2482 PL_reg_re = state.re;
2483 cache_re(PL_reg_re);
2484
2485 /* XXXX This is too dramatic a measure... */
2486 PL_reg_maxiter = 0;
2487
2488 sayNO;
2489 }
2490 sw = SvTRUE(ret);
2491 logical = 0;
2492 }
2493 else
2494 sv_setsv(save_scalar(PL_replgv), ret);
2495 break;
2496 }
2497 case OPEN:
2498 n = ARG(scan); /* which paren pair */
2499 PL_reg_start_tmp[n] = locinput;
2500 if (n > PL_regsize)
2501 PL_regsize = n;
2502 break;
2503 case CLOSE:
2504 n = ARG(scan); /* which paren pair */
2505 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2506 PL_regendp[n] = locinput - PL_bostr;
2507 if (n > *PL_reglastparen)
2508 *PL_reglastparen = n;
2509 break;
2510 case GROUPP:
2511 n = ARG(scan); /* which paren pair */
2512 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2513 break;
2514 case IFTHEN:
2515 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2516 if (sw)
2517 next = NEXTOPER(NEXTOPER(scan));
2518 else {
2519 next = scan + ARG(scan);
2520 if (OP(next) == IFTHEN) /* Fake one. */
2521 next = NEXTOPER(NEXTOPER(next));
2522 }
2523 break;
2524 case LOGICAL:
2525 logical = scan->flags;
2526 break;
2527/*******************************************************************
2528 PL_regcc contains infoblock about the innermost (...)* loop, and
2529 a pointer to the next outer infoblock.
2530
2531 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2532
2533 1) After matching X, regnode for CURLYX is processed;
2534
2535 2) This regnode creates infoblock on the stack, and calls
2536 regmatch() recursively with the starting point at WHILEM node;
2537
2538 3) Each hit of WHILEM node tries to match A and Z (in the order
2539 depending on the current iteration, min/max of {min,max} and
2540 greediness). The information about where are nodes for "A"
2541 and "Z" is read from the infoblock, as is info on how many times "A"
2542 was already matched, and greediness.
2543
2544 4) After A matches, the same WHILEM node is hit again.
2545
2546 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2547 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2548 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2549 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2550 of the external loop.
2551
2552 Currently present infoblocks form a tree with a stem formed by PL_curcc
2553 and whatever it mentions via ->next, and additional attached trees
2554 corresponding to temporarily unset infoblocks as in "5" above.
2555
2556 In the following picture infoblocks for outer loop of
2557 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2558 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2559 infoblocks are drawn below the "reset" infoblock.
2560
2561 In fact in the picture below we do not show failed matches for Z and T
2562 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2563 more obvious *why* one needs to *temporary* unset infoblocks.]
2564
2565 Matched REx position InfoBlocks Comment
2566 (Y(A)*?Z)*?T x
2567 Y(A)*?Z)*?T x <- O
2568 Y (A)*?Z)*?T x <- O
2569 Y A)*?Z)*?T x <- O <- I
2570 YA )*?Z)*?T x <- O <- I
2571 YA A)*?Z)*?T x <- O <- I
2572 YAA )*?Z)*?T x <- O <- I
2573 YAA Z)*?T x <- O # Temporary unset I
2574 I
2575
2576 YAAZ Y(A)*?Z)*?T x <- O
2577 I
2578
2579 YAAZY (A)*?Z)*?T x <- O
2580 I
2581
2582 YAAZY A)*?Z)*?T x <- O <- I
2583 I
2584
2585 YAAZYA )*?Z)*?T x <- O <- I
2586 I
2587
2588 YAAZYA Z)*?T x <- O # Temporary unset I
2589 I,I
2590
2591 YAAZYAZ )*?T x <- O
2592 I,I
2593
2594 YAAZYAZ T x # Temporary unset O
2595 O
2596 I,I
2597
2598 YAAZYAZT x
2599 O
2600 I,I
2601 *******************************************************************/
2602 case CURLYX: {
2603 CURCUR cc;
2604 CHECKPOINT cp = PL_savestack_ix;
2605 /* No need to save/restore up to this paren */
2606 I32 parenfloor = scan->flags;
2607
2608 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2609 next += ARG(next);
2610 cc.oldcc = PL_regcc;
2611 PL_regcc = &cc;
2612 /* XXXX Probably it is better to teach regpush to support
2613 parenfloor > PL_regsize... */
2614 if (parenfloor > *PL_reglastparen)
2615 parenfloor = *PL_reglastparen; /* Pessimization... */
2616 cc.parenfloor = parenfloor;
2617 cc.cur = -1;
2618 cc.min = ARG1(scan);
2619 cc.max = ARG2(scan);
2620 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2621 cc.next = next;
2622 cc.minmod = minmod;
2623 cc.lastloc = 0;
2624 PL_reginput = locinput;
2625 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2626 regcpblow(cp);
2627 PL_regcc = cc.oldcc;
2628 saySAME(n);
2629 }
2630 /* NOT REACHED */
2631 case WHILEM: {
2632 /*
2633 * This is really hard to understand, because after we match
2634 * what we're trying to match, we must make sure the rest of
2635 * the REx is going to match for sure, and to do that we have
2636 * to go back UP the parse tree by recursing ever deeper. And
2637 * if it fails, we have to reset our parent's current state
2638 * that we can try again after backing off.
2639 */
2640
2641 CHECKPOINT cp, lastcp;
2642 CURCUR* cc = PL_regcc;
2643 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2644
2645 n = cc->cur + 1; /* how many we know we matched */
2646 PL_reginput = locinput;
2647
2648 DEBUG_r(
2649 PerlIO_printf(Perl_debug_log,
2650 "%*s %ld out of %ld..%ld cc=%lx\n",
2651 REPORT_CODE_OFF+PL_regindent*2, "",
2652 (long)n, (long)cc->min,
2653 (long)cc->max, (long)cc)
2654 );
2655
2656 /* If degenerate scan matches "", assume scan done. */
2657
2658 if (locinput == cc->lastloc && n >= cc->min) {
2659 PL_regcc = cc->oldcc;
2660 if (PL_regcc)
2661 ln = PL_regcc->cur;
2662 DEBUG_r(
2663 PerlIO_printf(Perl_debug_log,
2664 "%*s empty match detected, try continuation...\n",
2665 REPORT_CODE_OFF+PL_regindent*2, "")
2666 );
2667 if (regmatch(cc->next))
2668 sayYES;
2669 if (PL_regcc)
2670 PL_regcc->cur = ln;
2671 PL_regcc = cc;
2672 sayNO;
2673 }
2674
2675 /* First just match a string of min scans. */
2676
2677 if (n < cc->min) {
2678 cc->cur = n;
2679 cc->lastloc = locinput;
2680 if (regmatch(cc->scan))
2681 sayYES;
2682 cc->cur = n - 1;
2683 cc->lastloc = lastloc;
2684 sayNO;
2685 }
2686
2687 if (scan->flags) {
2688 /* Check whether we already were at this position.
2689 Postpone detection until we know the match is not
2690 *that* much linear. */
2691 if (!PL_reg_maxiter) {
2692 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2693 PL_reg_leftiter = PL_reg_maxiter;
2694 }
2695 if (PL_reg_leftiter-- == 0) {
2696 I32 size = (PL_reg_maxiter + 7)/8;
2697 if (PL_reg_poscache) {
2698 if (PL_reg_poscache_size < size) {
2699 Renew(PL_reg_poscache, size, char);
2700 PL_reg_poscache_size = size;
2701 }
2702 Zero(PL_reg_poscache, size, char);
2703 }
2704 else {
2705 PL_reg_poscache_size = size;
2706 Newz(29, PL_reg_poscache, size, char);
2707 }
2708 DEBUG_r(
2709 PerlIO_printf(Perl_debug_log,
2710 "%sDetected a super-linear match, switching on caching%s...\n",
2711 PL_colors[4], PL_colors[5])
2712 );
2713 }
2714 if (PL_reg_leftiter < 0) {
2715 I32 o = locinput - PL_bostr, b;
2716
2717 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2718 b = o % 8;
2719 o /= 8;
2720 if (PL_reg_poscache[o] & (1<<b)) {
2721 DEBUG_r(
2722 PerlIO_printf(Perl_debug_log,
2723 "%*s already tried at this position...\n",
2724 REPORT_CODE_OFF+PL_regindent*2, "")
2725 );
2726 sayNO_SILENT;
2727 }
2728 PL_reg_poscache[o] |= (1<<b);
2729 }
2730 }
2731
2732 /* Prefer next over scan for minimal matching. */
2733
2734 if (cc->minmod) {
2735 PL_regcc = cc->oldcc;
2736 if (PL_regcc)
2737 ln = PL_regcc->cur;
2738 cp = regcppush(cc->parenfloor);
2739 REGCP_SET(lastcp);
2740 if (regmatch(cc->next)) {
2741 regcpblow(cp);
2742 sayYES; /* All done. */
2743 }
2744 REGCP_UNWIND(lastcp);
2745 regcppop();
2746 if (PL_regcc)
2747 PL_regcc->cur = ln;
2748 PL_regcc = cc;
2749
2750 if (n >= cc->max) { /* Maximum greed exceeded? */
2751 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2752 && !(PL_reg_flags & RF_warned)) {
2753 PL_reg_flags |= RF_warned;
2754 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2755 "Complex regular subexpression recursion",
2756 REG_INFTY - 1);
2757 }
2758 sayNO;
2759 }
2760
2761 DEBUG_r(
2762 PerlIO_printf(Perl_debug_log,
2763 "%*s trying longer...\n",
2764 REPORT_CODE_OFF+PL_regindent*2, "")
2765 );
2766 /* Try scanning more and see if it helps. */
2767 PL_reginput = locinput;
2768 cc->cur = n;
2769 cc->lastloc = locinput;
2770 cp = regcppush(cc->parenfloor);
2771 REGCP_SET(lastcp);
2772 if (regmatch(cc->scan)) {
2773 regcpblow(cp);
2774 sayYES;
2775 }
2776 REGCP_UNWIND(lastcp);
2777 regcppop();
2778 cc->cur = n - 1;
2779 cc->lastloc = lastloc;
2780 sayNO;
2781 }
2782
2783 /* Prefer scan over next for maximal matching. */
2784
2785 if (n < cc->max) { /* More greed allowed? */
2786 cp = regcppush(cc->parenfloor);
2787 cc->cur = n;
2788 cc->lastloc = locinput;
2789 REGCP_SET(lastcp);
2790 if (regmatch(cc->scan)) {
2791 regcpblow(cp);
2792 sayYES;
2793 }
2794 REGCP_UNWIND(lastcp);
2795 regcppop(); /* Restore some previous $<digit>s? */
2796 PL_reginput = locinput;
2797 DEBUG_r(
2798 PerlIO_printf(Perl_debug_log,
2799 "%*s failed, try continuation...\n",
2800 REPORT_CODE_OFF+PL_regindent*2, "")
2801 );
2802 }
2803 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2804 && !(PL_reg_flags & RF_warned)) {
2805 PL_reg_flags |= RF_warned;
2806 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2807 "Complex regular subexpression recursion",
2808 REG_INFTY - 1);
2809 }
2810
2811 /* Failed deeper matches of scan, so see if this one works. */
2812 PL_regcc = cc->oldcc;
2813 if (PL_regcc)
2814 ln = PL_regcc->cur;
2815 if (regmatch(cc->next))
2816 sayYES;
2817 if (PL_regcc)
2818 PL_regcc->cur = ln;
2819 PL_regcc = cc;
2820 cc->cur = n - 1;
2821 cc->lastloc = lastloc;
2822 sayNO;
2823 }
2824 /* NOT REACHED */
2825 case BRANCHJ:
2826 next = scan + ARG(scan);
2827 if (next == scan)
2828 next = NULL;
2829 inner = NEXTOPER(NEXTOPER(scan));
2830 goto do_branch;
2831 case BRANCH:
2832 inner = NEXTOPER(scan);
2833 do_branch:
2834 {
2835 CHECKPOINT lastcp;
2836 c1 = OP(scan);
2837 if (OP(next) != c1) /* No choice. */
2838 next = inner; /* Avoid recursion. */
2839 else {
2840 I32 lastparen = *PL_reglastparen;
2841 I32 unwind1;
2842 re_unwind_branch_t *uw;
2843
2844 /* Put unwinding data on stack */
2845 unwind1 = SSNEWt(1,re_unwind_branch_t);
2846 uw = SSPTRt(unwind1,re_unwind_branch_t);
2847 uw->prev = unwind;
2848 unwind = unwind1;
2849 uw->type = ((c1 == BRANCH)
2850 ? RE_UNWIND_BRANCH
2851 : RE_UNWIND_BRANCHJ);
2852 uw->lastparen = lastparen;
2853 uw->next = next;
2854 uw->locinput = locinput;
2855 uw->nextchr = nextchr;
2856#ifdef DEBUGGING
2857 uw->regindent = ++PL_regindent;
2858#endif
2859
2860 REGCP_SET(uw->lastcp);
2861
2862 /* Now go into the first branch */
2863 next = inner;
2864 }
2865 }
2866 break;
2867 case MINMOD:
2868 minmod = 1;
2869 break;
2870 case CURLYM:
2871 {
2872 I32 l = 0;
2873 CHECKPOINT lastcp;
2874
2875 /* We suppose that the next guy does not need
2876 backtracking: in particular, it is of constant length,
2877 and has no parenths to influence future backrefs. */
2878 ln = ARG1(scan); /* min to match */
2879 n = ARG2(scan); /* max to match */
2880 paren = scan->flags;
2881 if (paren) {
2882 if (paren > PL_regsize)
2883 PL_regsize = paren;
2884 if (paren > *PL_reglastparen)
2885 *PL_reglastparen = paren;
2886 }
2887 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2888 if (paren)
2889 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2890 PL_reginput = locinput;
2891 if (minmod) {
2892 minmod = 0;
2893 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2894 sayNO;
2895 if (ln && l == 0 && n >= ln
2896 /* In fact, this is tricky. If paren, then the
2897 fact that we did/didnot match may influence
2898 future execution. */
2899 && !(paren && ln == 0))
2900 ln = n;
2901 locinput = PL_reginput;
2902 if (PL_regkind[(U8)OP(next)] == EXACT) {
2903 c1 = (U8)*STRING(next);
2904 if (OP(next) == EXACTF)
2905 c2 = PL_fold[c1];
2906 else if (OP(next) == EXACTFL)
2907 c2 = PL_fold_locale[c1];
2908 else
2909 c2 = c1;
2910 }
2911 else
2912 c1 = c2 = -1000;
2913 REGCP_SET(lastcp);
2914 /* This may be improved if l == 0. */
2915 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2916 /* If it could work, try it. */
2917 if (c1 == -1000 ||
2918 UCHARAT(PL_reginput) == c1 ||
2919 UCHARAT(PL_reginput) == c2)
2920 {
2921 if (paren) {
2922 if (n) {
2923 PL_regstartp[paren] =
2924 HOPc(PL_reginput, -l) - PL_bostr;
2925 PL_regendp[paren] = PL_reginput - PL_bostr;
2926 }
2927 else
2928 PL_regendp[paren] = -1;
2929 }
2930 if (regmatch(next))
2931 sayYES;
2932 REGCP_UNWIND(lastcp);
2933 }
2934 /* Couldn't or didn't -- move forward. */
2935 PL_reginput = locinput;
2936 if (regrepeat_hard(scan, 1, &l)) {
2937 ln++;
2938 locinput = PL_reginput;
2939 }
2940 else
2941 sayNO;
2942 }
2943 }
2944 else {
2945 n = regrepeat_hard(scan, n, &l);
2946 if (n != 0 && l == 0
2947 /* In fact, this is tricky. If paren, then the
2948 fact that we did/didnot match may influence
2949 future execution. */
2950 && !(paren && ln == 0))
2951 ln = n;
2952 locinput = PL_reginput;
2953 DEBUG_r(
2954 PerlIO_printf(Perl_debug_log,
2955 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
2956 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
2957 (IV) n, (IV)l)
2958 );
2959 if (n >= ln) {
2960 if (PL_regkind[(U8)OP(next)] == EXACT) {
2961 c1 = (U8)*STRING(next);
2962 if (OP(next) == EXACTF)
2963 c2 = PL_fold[c1];
2964 else if (OP(next) == EXACTFL)
2965 c2 = PL_fold_locale[c1];
2966 else
2967 c2 = c1;
2968 }
2969 else
2970 c1 = c2 = -1000;
2971 }
2972 REGCP_SET(lastcp);
2973 while (n >= ln) {
2974 /* If it could work, try it. */
2975 if (c1 == -1000 ||
2976 UCHARAT(PL_reginput) == c1 ||
2977 UCHARAT(PL_reginput) == c2)
2978 {
2979 DEBUG_r(
2980 PerlIO_printf(Perl_debug_log,
2981 "%*s trying tail with n=%"IVdf"...\n",
2982 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
2983 );
2984 if (paren) {
2985 if (n) {
2986 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2987 PL_regendp[paren] = PL_reginput - PL_bostr;
2988 }
2989 else
2990 PL_regendp[paren] = -1;
2991 }
2992 if (regmatch(next))
2993 sayYES;
2994 REGCP_UNWIND(lastcp);
2995 }
2996 /* Couldn't or didn't -- back up. */
2997 n--;
2998 locinput = HOPc(locinput, -l);
2999 PL_reginput = locinput;
3000 }
3001 }
3002 sayNO;
3003 break;
3004 }
3005 case CURLYN:
3006 paren = scan->flags; /* Which paren to set */
3007 if (paren > PL_regsize)
3008 PL_regsize = paren;
3009 if (paren > *PL_reglastparen)
3010 *PL_reglastparen = paren;
3011 ln = ARG1(scan); /* min to match */
3012 n = ARG2(scan); /* max to match */
3013 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3014 goto repeat;
3015 case CURLY:
3016 paren = 0;
3017 ln = ARG1(scan); /* min to match */
3018 n = ARG2(scan); /* max to match */
3019 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3020 goto repeat;
3021 case STAR:
3022 ln = 0;
3023 n = REG_INFTY;
3024 scan = NEXTOPER(scan);
3025 paren = 0;
3026 goto repeat;
3027 case PLUS:
3028 ln = 1;
3029 n = REG_INFTY;
3030 scan = NEXTOPER(scan);
3031 paren = 0;
3032 repeat:
3033 /*
3034 * Lookahead to avoid useless match attempts
3035 * when we know what character comes next.
3036 */
3037 if (PL_regkind[(U8)OP(next)] == EXACT) {
3038 c1 = (U8)*STRING(next);
3039 if (OP(next) == EXACTF)
3040 c2 = PL_fold[c1];
3041 else if (OP(next) == EXACTFL)
3042 c2 = PL_fold_locale[c1];
3043 else
3044 c2 = c1;
3045 }
3046 else
3047 c1 = c2 = -1000;
3048 PL_reginput = locinput;
3049 if (minmod) {
3050 CHECKPOINT lastcp;
3051 minmod = 0;
3052 if (ln && regrepeat(scan, ln) < ln)
3053 sayNO;
3054 locinput = PL_reginput;
3055 REGCP_SET(lastcp);
3056 if (c1 != -1000) {
3057 char *e = locinput + n - ln; /* Should not check after this */
3058 char *old = locinput;
3059
3060 if (e >= PL_regeol || (n == REG_INFTY))
3061 e = PL_regeol - 1;
3062 while (1) {
3063 /* Find place 'next' could work */
3064 if (c1 == c2) {
3065 while (locinput <= e && *locinput != c1)
3066 locinput++;
3067 } else {
3068 while (locinput <= e
3069 && *locinput != c1
3070 && *locinput != c2)
3071 locinput++;
3072 }
3073 if (locinput > e)
3074 sayNO;
3075 /* PL_reginput == old now */
3076 if (locinput != old) {
3077 ln = 1; /* Did some */
3078 if (regrepeat(scan, locinput - old) <
3079 locinput - old)
3080 sayNO;
3081 }
3082 /* PL_reginput == locinput now */
3083 TRYPAREN(paren, ln, locinput);
3084 PL_reginput = locinput; /* Could be reset... */
3085 REGCP_UNWIND(lastcp);
3086 /* Couldn't or didn't -- move forward. */
3087 old = locinput++;
3088 }
3089 }
3090 else
3091 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3092 /* If it could work, try it. */
3093 if (c1 == -1000 ||
3094 UCHARAT(PL_reginput) == c1 ||
3095 UCHARAT(PL_reginput) == c2)
3096 {
3097 TRYPAREN(paren, n, PL_reginput);
3098 REGCP_UNWIND(lastcp);
3099 }
3100 /* Couldn't or didn't -- move forward. */
3101 PL_reginput = locinput;
3102 if (regrepeat(scan, 1)) {
3103 ln++;
3104 locinput = PL_reginput;
3105 }
3106 else
3107 sayNO;
3108 }
3109 }
3110 else {
3111 CHECKPOINT lastcp;
3112 n = regrepeat(scan, n);
3113 locinput = PL_reginput;
3114 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3115 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3116 ln = n; /* why back off? */
3117 /* ...because $ and \Z can match before *and* after
3118 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3119 We should back off by one in this case. */
3120 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3121 ln--;
3122 }
3123 REGCP_SET(lastcp);
3124 if (paren) {
3125 while (n >= ln) {
3126 /* If it could work, try it. */
3127 if (c1 == -1000 ||
3128 UCHARAT(PL_reginput) == c1 ||
3129 UCHARAT(PL_reginput) == c2)
3130 {
3131 TRYPAREN(paren, n, PL_reginput);
3132 REGCP_UNWIND(lastcp);
3133 }
3134 /* Couldn't or didn't -- back up. */
3135 n--;
3136 PL_reginput = locinput = HOPc(locinput, -1);
3137 }
3138 }
3139 else {
3140 while (n >= ln) {
3141 /* If it could work, try it. */
3142 if (c1 == -1000 ||
3143 UCHARAT(PL_reginput) == c1 ||
3144 UCHARAT(PL_reginput) == c2)
3145 {
3146 TRYPAREN(paren, n, PL_reginput);
3147 REGCP_UNWIND(lastcp);
3148 }
3149 /* Couldn't or didn't -- back up. */
3150 n--;
3151 PL_reginput = locinput = HOPc(locinput, -1);
3152 }
3153 }
3154 }
3155 sayNO;
3156 break;
3157 case END:
3158 if (PL_reg_call_cc) {
3159 re_cc_state *cur_call_cc = PL_reg_call_cc;
3160 CURCUR *cctmp = PL_regcc;
3161 regexp *re = PL_reg_re;
3162 CHECKPOINT cp, lastcp;
3163
3164 cp = regcppush(0); /* Save *all* the positions. */
3165 REGCP_SET(lastcp);
3166 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3167 the caller. */
3168 PL_reginput = locinput; /* Make position available to
3169 the callcc. */
3170 cache_re(PL_reg_call_cc->re);
3171 PL_regcc = PL_reg_call_cc->cc;
3172 PL_reg_call_cc = PL_reg_call_cc->prev;
3173 if (regmatch(cur_call_cc->node)) {
3174 PL_reg_call_cc = cur_call_cc;
3175 regcpblow(cp);
3176 sayYES;
3177 }
3178 REGCP_UNWIND(lastcp);
3179 regcppop();
3180 PL_reg_call_cc = cur_call_cc;
3181 PL_regcc = cctmp;
3182 PL_reg_re = re;
3183 cache_re(re);
3184
3185 DEBUG_r(
3186 PerlIO_printf(Perl_debug_log,
3187 "%*s continuation failed...\n",
3188 REPORT_CODE_OFF+PL_regindent*2, "")
3189 );
3190 sayNO_SILENT;
3191 }
3192 if (locinput < PL_regtill) {
3193 DEBUG_r(PerlIO_printf(Perl_debug_log,
3194 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3195 PL_colors[4],
3196 (long)(locinput - PL_reg_starttry),
3197 (long)(PL_regtill - PL_reg_starttry),
3198 PL_colors[5]));
3199 sayNO_FINAL; /* Cannot match: too short. */
3200 }
3201 PL_reginput = locinput; /* put where regtry can find it */
3202 sayYES_FINAL; /* Success! */
3203 case SUCCEED:
3204 PL_reginput = locinput; /* put where regtry can find it */
3205 sayYES_LOUD; /* Success! */
3206 case SUSPEND:
3207 n = 1;
3208 PL_reginput = locinput;
3209 goto do_ifmatch;
3210 case UNLESSM:
3211 n = 0;
3212 if (scan->flags) {
3213 if (UTF) { /* XXXX This is absolutely
3214 broken, we read before
3215 start of string. */
3216 s = HOPMAYBEc(locinput, -scan->flags);
3217 if (!s)
3218 goto say_yes;
3219 PL_reginput = s;
3220 }
3221 else {
3222 if (locinput < PL_bostr + scan->flags)
3223 goto say_yes;
3224 PL_reginput = locinput - scan->flags;
3225 goto do_ifmatch;
3226 }
3227 }
3228 else
3229 PL_reginput = locinput;
3230 goto do_ifmatch;
3231 case IFMATCH:
3232 n = 1;
3233 if (scan->flags) {
3234 if (UTF) { /* XXXX This is absolutely
3235 broken, we read before
3236 start of string. */
3237 s = HOPMAYBEc(locinput, -scan->flags);
3238 if (!s || s < PL_bostr)
3239 goto say_no;
3240 PL_reginput = s;
3241 }
3242 else {
3243 if (locinput < PL_bostr + scan->flags)
3244 goto say_no;
3245 PL_reginput = locinput - scan->flags;
3246 goto do_ifmatch;
3247 }
3248 }
3249 else
3250 PL_reginput = locinput;
3251
3252 do_ifmatch:
3253 inner = NEXTOPER(NEXTOPER(scan));
3254 if (regmatch(inner) != n) {
3255 say_no:
3256 if (logical) {
3257 logical = 0;
3258 sw = 0;
3259 goto do_longjump;
3260 }
3261 else
3262 sayNO;
3263 }
3264 say_yes:
3265 if (logical) {
3266 logical = 0;
3267 sw = 1;
3268 }
3269 if (OP(scan) == SUSPEND) {
3270 locinput = PL_reginput;
3271 nextchr = UCHARAT(locinput);
3272 }
3273 /* FALL THROUGH. */
3274 case LONGJMP:
3275 do_longjump:
3276 next = scan + ARG(scan);
3277 if (next == scan)
3278 next = NULL;
3279 break;
3280 default:
3281 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3282 PTR2UV(scan), OP(scan));
3283 Perl_croak(aTHX_ "regexp memory corruption");
3284 }
3285 reenter:
3286 scan = next;
3287 }
3288
3289 /*
3290 * We get here only if there's trouble -- normally "case END" is
3291 * the terminating point.
3292 */
3293 Perl_croak(aTHX_ "corrupted regexp pointers");
3294 /*NOTREACHED*/
3295 sayNO;
3296
3297yes_loud:
3298 DEBUG_r(
3299 PerlIO_printf(Perl_debug_log,
3300 "%*s %scould match...%s\n",
3301 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3302 );
3303 goto yes;
3304yes_final:
3305 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3306 PL_colors[4],PL_colors[5]));
3307yes:
3308#ifdef DEBUGGING
3309 PL_regindent--;
3310#endif
3311
3312#if 0 /* Breaks $^R */
3313 if (unwind)
3314 regcpblow(firstcp);
3315#endif
3316 return 1;
3317
3318no:
3319 DEBUG_r(
3320 PerlIO_printf(Perl_debug_log,
3321 "%*s %sfailed...%s\n",
3322 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3323 );
3324 goto do_no;
3325no_final:
3326do_no:
3327 if (unwind) {
3328 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3329
3330 switch (uw->type) {
3331 case RE_UNWIND_BRANCH:
3332 case RE_UNWIND_BRANCHJ:
3333 {
3334 re_unwind_branch_t *uwb = &(uw->branch);
3335 I32 lastparen = uwb->lastparen;
3336
3337 REGCP_UNWIND(uwb->lastcp);
3338 for (n = *PL_reglastparen; n > lastparen; n--)
3339 PL_regendp[n] = -1;
3340 *PL_reglastparen = n;
3341 scan = next = uwb->next;
3342 if ( !scan ||
3343 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3344 ? BRANCH : BRANCHJ) ) { /* Failure */
3345 unwind = uwb->prev;
3346#ifdef DEBUGGING
3347 PL_regindent--;
3348#endif
3349 goto do_no;
3350 }
3351 /* Have more choice yet. Reuse the same uwb. */
3352 /*SUPPRESS 560*/
3353 if ((n = (uwb->type == RE_UNWIND_BRANCH
3354 ? NEXT_OFF(next) : ARG(next))))
3355 next += n;
3356 else
3357 next = NULL; /* XXXX Needn't unwinding in this case... */
3358 uwb->next = next;
3359 next = NEXTOPER(scan);
3360 if (uwb->type == RE_UNWIND_BRANCHJ)
3361 next = NEXTOPER(next);
3362 locinput = uwb->locinput;
3363 nextchr = uwb->nextchr;
3364#ifdef DEBUGGING
3365 PL_regindent = uwb->regindent;
3366#endif
3367
3368 goto reenter;
3369 }
3370 /* NOT REACHED */
3371 default:
3372 Perl_croak(aTHX_ "regexp unwind memory corruption");
3373 }
3374 /* NOT REACHED */
3375 }
3376#ifdef DEBUGGING
3377 PL_regindent--;
3378#endif
3379 return 0;
3380}
3381
3382/*
3383 - regrepeat - repeatedly match something simple, report how many
3384 */
3385/*
3386 * [This routine now assumes that it will only match on things of length 1.
3387 * That was true before, but now we assume scan - reginput is the count,
3388 * rather than incrementing count on every character. [Er, except utf8.]]
3389 */
3390STATIC I32
3391S_regrepeat(pTHX_ regnode *p, I32 max)
3392{
3393 register char *scan;
3394 register I32 c;
3395 register char *loceol = PL_regeol;
3396 register I32 hardcount = 0;
3397 register bool do_utf8 = DO_UTF8(PL_reg_sv);
3398
3399 scan = PL_reginput;
3400 if (max != REG_INFTY && max < loceol - scan)
3401 loceol = scan + max;
3402 switch (OP(p)) {
3403 case REG_ANY:
3404 if (DO_UTF8(PL_reg_sv)) {
3405 loceol = PL_regeol;
3406 while (scan < loceol && *scan != '\n') {
3407 scan += UTF8SKIP(scan);
3408 hardcount++;
3409 }
3410 } else {
3411 while (scan < loceol && *scan != '\n')
3412 scan++;
3413 }
3414 break;
3415 case SANY:
3416 if (DO_UTF8(PL_reg_sv)) {
3417 loceol = PL_regeol;
3418 while (scan < loceol) {
3419 scan += UTF8SKIP(scan);
3420 hardcount++;
3421 }
3422 } else {
3423 scan = loceol;
3424 }
3425 break;
3426 case EXACT: /* length of string is 1 */
3427 c = (U8)*STRING(p);
3428 while (scan < loceol && UCHARAT(scan) == c)
3429 scan++;
3430 break;
3431 case EXACTF: /* length of string is 1 */
3432 c = (U8)*STRING(p);
3433 while (scan < loceol &&
3434 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3435 scan++;
3436 break;
3437 case EXACTFL: /* length of string is 1 */
3438 PL_reg_flags |= RF_tainted;
3439 c = (U8)*STRING(p);
3440 while (scan < loceol &&
3441 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3442 scan++;
3443 break;
3444 case ANYOF:
3445 if (do_utf8) {
3446 loceol = PL_regeol;
3447 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8)) {
3448 scan += UTF8SKIP(scan);
3449 hardcount++;
3450 }
3451 } else {
3452 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
3453 scan++;
3454 }
3455 break;
3456 case ALNUM:
3457 if (DO_UTF8(PL_reg_sv)) {
3458 loceol = PL_regeol;
3459 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3460 scan += UTF8SKIP(scan);
3461 hardcount++;
3462 }
3463 } else {
3464 while (scan < loceol && isALNUM(*scan))
3465 scan++;
3466 }
3467 break;
3468 case ALNUML:
3469 PL_reg_flags |= RF_tainted;
3470 if (DO_UTF8(PL_reg_sv)) {
3471 loceol = PL_regeol;
3472 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3473 scan += UTF8SKIP(scan);
3474 hardcount++;
3475 }
3476 } else {
3477 while (scan < loceol && isALNUM_LC(*scan))
3478 scan++;
3479 }
3480 break;
3481 case NALNUM:
3482 if (DO_UTF8(PL_reg_sv)) {
3483 loceol = PL_regeol;
3484 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3485 scan += UTF8SKIP(scan);
3486 hardcount++;
3487 }
3488 } else {
3489 while (scan < loceol && !isALNUM(*scan))
3490 scan++;
3491 }
3492 break;
3493 case NALNUML:
3494 PL_reg_flags |= RF_tainted;
3495 if (DO_UTF8(PL_reg_sv)) {
3496 loceol = PL_regeol;
3497 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3498 scan += UTF8SKIP(scan);
3499 hardcount++;
3500 }
3501 } else {
3502 while (scan < loceol && !isALNUM_LC(*scan))
3503 scan++;
3504 }
3505 break;
3506 case SPACE:
3507 if (DO_UTF8(PL_reg_sv)) {
3508 loceol = PL_regeol;
3509 while (scan < loceol &&
3510 (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3511 scan += UTF8SKIP(scan);
3512 hardcount++;
3513 }
3514 } else {
3515 while (scan < loceol && isSPACE(*scan))
3516 scan++;
3517 }
3518 break;
3519 case SPACEL:
3520 PL_reg_flags |= RF_tainted;
3521 if (DO_UTF8(PL_reg_sv)) {
3522 loceol = PL_regeol;
3523 while (scan < loceol &&
3524 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3525 scan += UTF8SKIP(scan);
3526 hardcount++;
3527 }
3528 } else {
3529 while (scan < loceol && isSPACE_LC(*scan))
3530 scan++;
3531 }
3532 break;
3533 case NSPACE:
3534 if (DO_UTF8(PL_reg_sv)) {
3535 loceol = PL_regeol;
3536 while (scan < loceol &&
3537 !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3538 scan += UTF8SKIP(scan);
3539 hardcount++;
3540 }
3541 } else {
3542 while (scan < loceol && !isSPACE(*scan))
3543 scan++;
3544 break;
3545 }
3546 case NSPACEL:
3547 PL_reg_flags |= RF_tainted;
3548 if (DO_UTF8(PL_reg_sv)) {
3549 loceol = PL_regeol;
3550 while (scan < loceol &&
3551 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3552 scan += UTF8SKIP(scan);
3553 hardcount++;
3554 }
3555 } else {
3556 while (scan < loceol && !isSPACE_LC(*scan))
3557 scan++;
3558 }
3559 break;
3560 case DIGIT:
3561 if (DO_UTF8(PL_reg_sv)) {
3562 loceol = PL_regeol;
3563 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3564 scan += UTF8SKIP(scan);
3565 hardcount++;
3566 }
3567 } else {
3568 while (scan < loceol && isDIGIT(*scan))
3569 scan++;
3570 }
3571 break;
3572 case NDIGIT:
3573 if (DO_UTF8(PL_reg_sv)) {
3574 loceol = PL_regeol;
3575 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3576 scan += UTF8SKIP(scan);
3577 hardcount++;
3578 }
3579 } else {
3580 while (scan < loceol && !isDIGIT(*scan))
3581 scan++;
3582 }
3583 break;
3584 default: /* Called on something of 0 width. */
3585 break; /* So match right here or not at all. */
3586 }
3587
3588 if (hardcount)
3589 c = hardcount;
3590 else
3591 c = scan - PL_reginput;
3592 PL_reginput = scan;
3593
3594 DEBUG_r(
3595 {
3596 SV *prop = sv_newmortal();
3597
3598 regprop(prop, p);
3599 PerlIO_printf(Perl_debug_log,
3600 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3601 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3602 });
3603
3604 return(c);
3605}
3606
3607/*
3608 - regrepeat_hard - repeatedly match something, report total lenth and length
3609 *
3610 * The repeater is supposed to have constant length.
3611 */
3612
3613STATIC I32
3614S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3615{
3616 register char *scan;
3617 register char *start;
3618 register char *loceol = PL_regeol;
3619 I32 l = 0;
3620 I32 count = 0, res = 1;
3621
3622 if (!max)
3623 return 0;
3624
3625 start = PL_reginput;
3626 if (UTF) {
3627 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3628 if (!count++) {
3629 l = 0;
3630 while (start < PL_reginput) {
3631 l++;
3632 start += UTF8SKIP(start);
3633 }
3634 *lp = l;
3635 if (l == 0)
3636 return max;
3637 }
3638 if (count == max)
3639 return count;
3640 }
3641 }
3642 else {
3643 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3644 if (!count++) {
3645 *lp = l = PL_reginput - start;
3646 if (max != REG_INFTY && l*max < loceol - scan)
3647 loceol = scan + l*max;
3648 if (l == 0)
3649 return max;
3650 }
3651 }
3652 }
3653 if (!res)
3654 PL_reginput = scan;
3655
3656 return count;
3657}
3658
3659/*
3660- regclass_swash - prepare the utf8 swash
3661*/
3662
3663SV *
3664Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
3665{
3666 SV *sw = NULL;
3667 SV *si = NULL;
3668
3669 if (PL_regdata && PL_regdata->count) {
3670 U32 n = ARG(node);
3671
3672 if (PL_regdata->what[n] == 's') {
3673 SV *rv = (SV*)PL_regdata->data[n];
3674 AV *av = (AV*)SvRV((SV*)rv);
3675 SV **a;
3676
3677 si = *av_fetch(av, 0, FALSE);
3678 a = av_fetch(av, 1, FALSE);
3679
3680 if (a)
3681 sw = *a;
3682 else if (si && doinit) {
3683 sw = swash_init("utf8", "", si, 1, 0);
3684 (void)av_store(av, 1, sw);
3685 }
3686 }
3687 }
3688
3689 if (initsvp)
3690 *initsvp = si;
3691
3692 return sw;
3693}
3694
3695/*
3696 - reginclass - determine if a character falls into a character class
3697 */
3698
3699STATIC bool
3700S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
3701{
3702 char flags = ANYOF_FLAGS(n);
3703 bool match = FALSE;
3704
3705 if (do_utf8 || (flags & ANYOF_UNICODE)) {
3706 if (do_utf8 && !ANYOF_RUNTIME(n)) {
3707 STRLEN len;
3708 UV c = utf8_to_uv_simple(p, &len);
3709
3710 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
3711 match = TRUE;
3712 }
3713
3714 if (!match) {
3715 SV *sw = regclass_swash(n, TRUE, 0);
3716
3717 if (sw) {
3718 if (swash_fetch(sw, p))
3719 match = TRUE;
3720 else if (flags & ANYOF_FOLD) {
3721 U8 tmpbuf[UTF8_MAXLEN+1];
3722
3723 if (flags & ANYOF_LOCALE) {
3724 PL_reg_flags |= RF_tainted;
3725 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3726 }
3727 else
3728 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3729 if (swash_fetch(sw, tmpbuf))
3730 match = TRUE;
3731 }
3732 }
3733 }
3734 }
3735 else {
3736 U8 c = *p;
3737
3738 if (ANYOF_BITMAP_TEST(n, c))
3739 match = TRUE;
3740 else if (flags & ANYOF_FOLD) {
3741 I32 f;
3742
3743 if (flags & ANYOF_LOCALE) {
3744 PL_reg_flags |= RF_tainted;
3745 f = PL_fold_locale[c];
3746 }
3747 else
3748 f = PL_fold[c];
3749 if (f != c && ANYOF_BITMAP_TEST(n, f))
3750 match = TRUE;
3751 }
3752
3753 if (!match && (flags & ANYOF_CLASS)) {
3754 PL_reg_flags |= RF_tainted;
3755 if (
3756 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3757 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3758 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
3759 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3760 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3761 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3762 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3763 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3764 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3765 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3766 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
3767 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
3768 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3769 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3770 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3771 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3772 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
3773 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3774 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
3775 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3776 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3777 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3778 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
3779 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3780 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3781 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
3782 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
3783 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
3784 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
3785 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
3786 ) /* How's that for a conditional? */
3787 {
3788 match = TRUE;
3789 }
3790 }
3791 }
3792
3793 return (flags & ANYOF_INVERT) ? !match : match;
3794}
3795
3796STATIC U8 *
3797S_reghop(pTHX_ U8 *s, I32 off)
3798{
3799 if (off >= 0) {
3800 while (off-- && s < (U8*)PL_regeol) {
3801 /* XXX could check well-formedness here */
3802 s += UTF8SKIP(s);
3803 }
3804 }
3805 else {
3806 while (off++) {
3807 if (s > (U8*)PL_bostr) {
3808 s--;
3809 if (UTF8_IS_CONTINUED(*s)) {
3810 while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s))
3811 s--;
3812 }
3813 /* XXX could check well-formedness here */
3814 }
3815 }
3816 }
3817 return s;
3818}
3819
3820STATIC U8 *
3821S_reghopmaybe(pTHX_ U8* s, I32 off)
3822{
3823 if (off >= 0) {
3824 while (off-- && s < (U8*)PL_regeol) {
3825 /* XXX could check well-formedness here */
3826 s += UTF8SKIP(s);
3827 }
3828 if (off >= 0)
3829 return 0;
3830 }
3831 else {
3832 while (off++) {
3833 if (s > (U8*)PL_bostr) {
3834 s--;
3835 if (UTF8_IS_CONTINUED(*s)) {
3836 while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s))
3837 s--;
3838 }
3839 /* XXX could check well-formedness here */
3840 }
3841 else
3842 break;
3843 }
3844 if (off <= 0)
3845 return 0;
3846 }
3847 return s;
3848}
3849
3850#ifdef PERL_OBJECT
3851#include "XSUB.h"
3852#endif
3853
3854static void
3855restore_pos(pTHXo_ void *arg)
3856{
3857 if (PL_reg_eval_set) {
3858 if (PL_reg_oldsaved) {
3859 PL_reg_re->subbeg = PL_reg_oldsaved;
3860 PL_reg_re->sublen = PL_reg_oldsavedlen;
3861 RX_MATCH_COPIED_on(PL_reg_re);
3862 }
3863 PL_reg_magic->mg_len = PL_reg_oldpos;
3864 PL_reg_eval_set = 0;
3865 PL_curpm = PL_reg_oldcurpm;
3866 }
3867}