This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Unused variable.
[perl5.git] / regexec.c
... / ...
CommitLineData
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
34/* We *really* need to overwrite these symbols: */
35# define Perl_regexec_flags my_regexec
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
38# define Perl_re_intuit_start my_re_intuit_start
39/* *These* symbols are masked to allow static link. */
40# define Perl_pregexec my_pregexec
41# define Perl_reginitcolors my_reginitcolors
42# define Perl_regclass_swash my_regclass_swash
43
44# define PERL_NO_GET_CONTEXT
45#endif
46
47/*SUPPRESS 112*/
48/*
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
70 **** Copyright (c) 1991-2001, Larry Wall
71 ****
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
74 *
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
78 */
79#include "EXTERN.h"
80#define PERL_IN_REGEXEC_C
81#include "perl.h"
82
83#include "regcomp.h"
84
85#define RF_tainted 1 /* tainted information used? */
86#define RF_warned 2 /* warned about big count? */
87#define RF_evaled 4 /* Did an EVAL with setting? */
88#define RF_utf8 8 /* String contains multibyte chars? */
89
90#define UTF (PL_reg_flags & RF_utf8)
91
92#define RS_init 1 /* eval environment created */
93#define RS_set 2 /* replsv value is set */
94
95#ifndef STATIC
96#define STATIC static
97#endif
98
99/*
100 * Forwards.
101 */
102
103#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
104#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
105
106#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
107#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
108#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
109#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
110#define HOPc(pos,off) ((char*)HOP(pos,off))
111#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
112
113#define HOPBACK(pos, off) ( \
114 (UTF && PL_reg_match_utf8) \
115 ? reghopmaybe((U8*)pos, -off) \
116 : (pos - off >= PL_bostr) \
117 ? (U8*)(pos - off) \
118 : (U8*)NULL \
119)
120#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
121
122#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
123#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
124#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
125#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
126#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
127#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
128
129#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END
130
131/* for use after a quantifier and before an EXACT-like node -- japhy */
132#define JUMPABLE(rn) ( \
133 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
134 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
135 OP(rn) == PLUS || OP(rn) == MINMOD || \
136 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
137)
138
139#define HAS_TEXT(rn) ( \
140 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
141)
142
143#define FIND_NEXT_IMPT(rn) STMT_START { \
144 while (JUMPABLE(rn)) \
145 if (OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
146 PL_regkind[(U8)OP(rn)] == CURLY) \
147 rn = NEXTOPER(NEXTOPER(rn)); \
148 else if (OP(rn) == PLUS) \
149 rn = NEXTOPER(rn); \
150 else rn += NEXT_OFF(rn); \
151} STMT_END
152
153static void restore_pos(pTHX_ void *arg);
154
155STATIC CHECKPOINT
156S_regcppush(pTHX_ I32 parenfloor)
157{
158 int retval = PL_savestack_ix;
159#define REGCP_PAREN_ELEMS 4
160 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
161 int p;
162
163 if (paren_elems_to_push < 0)
164 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
165
166#define REGCP_OTHER_ELEMS 6
167 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
168 for (p = PL_regsize; p > parenfloor; p--) {
169/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
170 SSPUSHINT(PL_regendp[p]);
171 SSPUSHINT(PL_regstartp[p]);
172 SSPUSHPTR(PL_reg_start_tmp[p]);
173 SSPUSHINT(p);
174 }
175/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
176 SSPUSHINT(PL_regsize);
177 SSPUSHINT(*PL_reglastparen);
178 SSPUSHINT(*PL_reglastcloseparen);
179 SSPUSHPTR(PL_reginput);
180#define REGCP_FRAME_ELEMS 2
181/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
182 * are needed for the regexp context stack bookkeeping. */
183 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
184 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
185
186 return retval;
187}
188
189/* These are needed since we do not localize EVAL nodes: */
190# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
191 " Setting an EVAL scope, savestack=%"IVdf"\n", \
192 (IV)PL_savestack_ix)); cp = PL_savestack_ix
193
194# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
195 PerlIO_printf(Perl_debug_log, \
196 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
197 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
198
199STATIC char *
200S_regcppop(pTHX)
201{
202 I32 i;
203 U32 paren = 0;
204 char *input;
205 I32 tmps;
206
207 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
208 i = SSPOPINT;
209 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
210 i = SSPOPINT; /* Parentheses elements to pop. */
211 input = (char *) SSPOPPTR;
212 *PL_reglastcloseparen = SSPOPINT;
213 *PL_reglastparen = SSPOPINT;
214 PL_regsize = SSPOPINT;
215
216 /* Now restore the parentheses context. */
217 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
218 i > 0; i -= REGCP_PAREN_ELEMS) {
219 paren = (U32)SSPOPINT;
220 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
221 PL_regstartp[paren] = SSPOPINT;
222 tmps = SSPOPINT;
223 if (paren <= *PL_reglastparen)
224 PL_regendp[paren] = tmps;
225 DEBUG_r(
226 PerlIO_printf(Perl_debug_log,
227 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
228 (UV)paren, (IV)PL_regstartp[paren],
229 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
230 (IV)PL_regendp[paren],
231 (paren > *PL_reglastparen ? "(no)" : ""));
232 );
233 }
234 DEBUG_r(
235 if (*PL_reglastparen + 1 <= PL_regnpar) {
236 PerlIO_printf(Perl_debug_log,
237 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
238 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
239 }
240 );
241#if 1
242 /* It would seem that the similar code in regtry()
243 * already takes care of this, and in fact it is in
244 * a better location to since this code can #if 0-ed out
245 * but the code in regtry() is needed or otherwise tests
246 * requiring null fields (pat.t#187 and split.t#{13,14}
247 * (as of patchlevel 7877) will fail. Then again,
248 * this code seems to be necessary or otherwise
249 * building DynaLoader will fail:
250 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
251 * --jhi */
252 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
253 if (paren > PL_regsize)
254 PL_regstartp[paren] = -1;
255 PL_regendp[paren] = -1;
256 }
257#endif
258 return input;
259}
260
261STATIC char *
262S_regcp_set_to(pTHX_ I32 ss)
263{
264 I32 tmp = PL_savestack_ix;
265
266 PL_savestack_ix = ss;
267 regcppop();
268 PL_savestack_ix = tmp;
269 return Nullch;
270}
271
272typedef struct re_cc_state
273{
274 I32 ss;
275 regnode *node;
276 struct re_cc_state *prev;
277 CURCUR *cc;
278 regexp *re;
279} re_cc_state;
280
281#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
282
283#define TRYPAREN(paren, n, input) { \
284 if (paren) { \
285 if (n) { \
286 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
287 PL_regendp[paren] = input - PL_bostr; \
288 } \
289 else \
290 PL_regendp[paren] = -1; \
291 } \
292 if (regmatch(next)) \
293 sayYES; \
294 if (paren && n) \
295 PL_regendp[paren] = -1; \
296}
297
298
299/*
300 * pregexec and friends
301 */
302
303/*
304 - pregexec - match a regexp against a string
305 */
306I32
307Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
308 char *strbeg, I32 minend, SV *screamer, U32 nosave)
309/* strend: pointer to null at end of string */
310/* strbeg: real beginning of string */
311/* minend: end of match must be >=minend after stringarg. */
312/* nosave: For optimizations. */
313{
314 return
315 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
316 nosave ? 0 : REXEC_COPY_STR);
317}
318
319STATIC void
320S_cache_re(pTHX_ regexp *prog)
321{
322 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
323#ifdef DEBUGGING
324 PL_regprogram = prog->program;
325#endif
326 PL_regnpar = prog->nparens;
327 PL_regdata = prog->data;
328 PL_reg_re = prog;
329}
330
331/*
332 * Need to implement the following flags for reg_anch:
333 *
334 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
335 * USE_INTUIT_ML
336 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
337 * INTUIT_AUTORITATIVE_ML
338 * INTUIT_ONCE_NOML - Intuit can match in one location only.
339 * INTUIT_ONCE_ML
340 *
341 * Another flag for this function: SECOND_TIME (so that float substrs
342 * with giant delta may be not rechecked).
343 */
344
345/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
346
347/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
348 Otherwise, only SvCUR(sv) is used to get strbeg. */
349
350/* XXXX We assume that strpos is strbeg unless sv. */
351
352/* XXXX Some places assume that there is a fixed substring.
353 An update may be needed if optimizer marks as "INTUITable"
354 RExen without fixed substrings. Similarly, it is assumed that
355 lengths of all the strings are no more than minlen, thus they
356 cannot come from lookahead.
357 (Or minlen should take into account lookahead.) */
358
359/* A failure to find a constant substring means that there is no need to make
360 an expensive call to REx engine, thus we celebrate a failure. Similarly,
361 finding a substring too deep into the string means that less calls to
362 regtry() should be needed.
363
364 REx compiler's optimizer found 4 possible hints:
365 a) Anchored substring;
366 b) Fixed substring;
367 c) Whether we are anchored (beginning-of-line or \G);
368 d) First node (of those at offset 0) which may distingush positions;
369 We use a)b)d) and multiline-part of c), and try to find a position in the
370 string which does not contradict any of them.
371 */
372
373/* Most of decisions we do here should have been done at compile time.
374 The nodes of the REx which we used for the search should have been
375 deleted from the finite automaton. */
376
377char *
378Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
379 char *strend, U32 flags, re_scream_pos_data *data)
380{
381 register I32 start_shift = 0;
382 /* Should be nonnegative! */
383 register I32 end_shift = 0;
384 register char *s;
385 register SV *check;
386 char *strbeg;
387 char *t;
388 I32 ml_anch;
389 register char *other_last = Nullch; /* other substr checked before this */
390 char *check_at = Nullch; /* check substr found at this pos */
391#ifdef DEBUGGING
392 char *i_strpos = strpos;
393 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
394#endif
395
396 if (prog->reganch & ROPT_UTF8) {
397 DEBUG_r(PerlIO_printf(Perl_debug_log,
398 "UTF-8 regex...\n"));
399 PL_reg_flags |= RF_utf8;
400 }
401
402 DEBUG_r({
403 char *s = PL_reg_match_utf8 ?
404 sv_uni_display(dsv, sv, 60, 0) : strpos;
405 int len = PL_reg_match_utf8 ?
406 strlen(s) : strend - strpos;
407 if (!PL_colorset)
408 reginitcolors();
409 if (PL_reg_match_utf8)
410 DEBUG_r(PerlIO_printf(Perl_debug_log,
411 "UTF-8 target...\n"));
412 PerlIO_printf(Perl_debug_log,
413 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
414 PL_colors[4],PL_colors[5],PL_colors[0],
415 prog->precomp,
416 PL_colors[1],
417 (strlen(prog->precomp) > 60 ? "..." : ""),
418 PL_colors[0],
419 (int)(len > 60 ? 60 : len),
420 s, PL_colors[1],
421 (len > 60 ? "..." : "")
422 );
423 });
424
425 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
426 DEBUG_r(PerlIO_printf(Perl_debug_log,
427 "String too short... [re_intuit_start]\n"));
428 goto fail;
429 }
430 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
431 PL_regeol = strend;
432 check = prog->check_substr;
433 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
434 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
435 || ( (prog->reganch & ROPT_ANCH_BOL)
436 && !PL_multiline ) ); /* Check after \n? */
437
438 if (!ml_anch) {
439 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
440 | ROPT_IMPLICIT)) /* not a real BOL */
441 /* SvCUR is not set on references: SvRV and SvPVX overlap */
442 && sv && !SvROK(sv)
443 && (strpos != strbeg)) {
444 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
445 goto fail;
446 }
447 if (prog->check_offset_min == prog->check_offset_max &&
448 !(prog->reganch & ROPT_CANY_SEEN)) {
449 /* Substring at constant offset from beg-of-str... */
450 I32 slen;
451
452 s = HOP3c(strpos, prog->check_offset_min, strend);
453 if (SvTAIL(check)) {
454 slen = SvCUR(check); /* >= 1 */
455
456 if ( strend - s > slen || strend - s < slen - 1
457 || (strend - s == slen && strend[-1] != '\n')) {
458 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
459 goto fail_finish;
460 }
461 /* Now should match s[0..slen-2] */
462 slen--;
463 if (slen && (*SvPVX(check) != *s
464 || (slen > 1
465 && memNE(SvPVX(check), s, slen)))) {
466 report_neq:
467 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
468 goto fail_finish;
469 }
470 }
471 else if (*SvPVX(check) != *s
472 || ((slen = SvCUR(check)) > 1
473 && memNE(SvPVX(check), s, slen)))
474 goto report_neq;
475 goto success_at_start;
476 }
477 }
478 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
479 s = strpos;
480 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
481 end_shift = prog->minlen - start_shift -
482 CHR_SVLEN(check) + (SvTAIL(check) != 0);
483 if (!ml_anch) {
484 I32 end = prog->check_offset_max + CHR_SVLEN(check)
485 - (SvTAIL(check) != 0);
486 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
487
488 if (end_shift < eshift)
489 end_shift = eshift;
490 }
491 }
492 else { /* Can match at random position */
493 ml_anch = 0;
494 s = strpos;
495 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
496 /* Should be nonnegative! */
497 end_shift = prog->minlen - start_shift -
498 CHR_SVLEN(check) + (SvTAIL(check) != 0);
499 }
500
501#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
502 if (end_shift < 0)
503 Perl_croak(aTHX_ "panic: end_shift");
504#endif
505
506 restart:
507 /* Find a possible match in the region s..strend by looking for
508 the "check" substring in the region corrected by start/end_shift. */
509 if (flags & REXEC_SCREAM) {
510 I32 p = -1; /* Internal iterator of scream. */
511 I32 *pp = data ? data->scream_pos : &p;
512
513 if (PL_screamfirst[BmRARE(check)] >= 0
514 || ( BmRARE(check) == '\n'
515 && (BmPREVIOUS(check) == SvCUR(check) - 1)
516 && SvTAIL(check) ))
517 s = screaminstr(sv, check,
518 start_shift + (s - strbeg), end_shift, pp, 0);
519 else
520 goto fail_finish;
521 if (data)
522 *data->scream_olds = s;
523 }
524 else if (prog->reganch & ROPT_CANY_SEEN)
525 s = fbm_instr((U8*)(s + start_shift),
526 (U8*)(strend - end_shift),
527 check, PL_multiline ? FBMrf_MULTILINE : 0);
528 else
529 s = fbm_instr(HOP3(s, start_shift, strend),
530 HOP3(strend, -end_shift, strbeg),
531 check, PL_multiline ? FBMrf_MULTILINE : 0);
532
533 /* Update the count-of-usability, remove useless subpatterns,
534 unshift s. */
535
536 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
537 (s ? "Found" : "Did not find"),
538 ((check == prog->anchored_substr) ? "anchored" : "floating"),
539 PL_colors[0],
540 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
541 SvPVX(check),
542 PL_colors[1], (SvTAIL(check) ? "$" : ""),
543 (s ? " at offset " : "...\n") ) );
544
545 if (!s)
546 goto fail_finish;
547
548 check_at = s;
549
550 /* Finish the diagnostic message */
551 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
552
553 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
554 Start with the other substr.
555 XXXX no SCREAM optimization yet - and a very coarse implementation
556 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
557 *always* match. Probably should be marked during compile...
558 Probably it is right to do no SCREAM here...
559 */
560
561 if (prog->float_substr && prog->anchored_substr) {
562 /* Take into account the "other" substring. */
563 /* XXXX May be hopelessly wrong for UTF... */
564 if (!other_last)
565 other_last = strpos;
566 if (check == prog->float_substr) {
567 do_other_anchored:
568 {
569 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
570 char *s1 = s;
571
572 t = s - prog->check_offset_max;
573 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
574 && (!(prog->reganch & ROPT_UTF8)
575 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
576 && t > strpos)))
577 /* EMPTY */;
578 else
579 t = strpos;
580 t = HOP3c(t, prog->anchored_offset, strend);
581 if (t < other_last) /* These positions already checked */
582 t = other_last;
583 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
584 if (last < last1)
585 last1 = last;
586 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
587 /* On end-of-str: see comment below. */
588 s = fbm_instr((unsigned char*)t,
589 HOP3(HOP3(last1, prog->anchored_offset, strend)
590 + SvCUR(prog->anchored_substr),
591 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
592 prog->anchored_substr,
593 PL_multiline ? FBMrf_MULTILINE : 0);
594 DEBUG_r(PerlIO_printf(Perl_debug_log,
595 "%s anchored substr `%s%.*s%s'%s",
596 (s ? "Found" : "Contradicts"),
597 PL_colors[0],
598 (int)(SvCUR(prog->anchored_substr)
599 - (SvTAIL(prog->anchored_substr)!=0)),
600 SvPVX(prog->anchored_substr),
601 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
602 if (!s) {
603 if (last1 >= last2) {
604 DEBUG_r(PerlIO_printf(Perl_debug_log,
605 ", giving up...\n"));
606 goto fail_finish;
607 }
608 DEBUG_r(PerlIO_printf(Perl_debug_log,
609 ", trying floating at offset %ld...\n",
610 (long)(HOP3c(s1, 1, strend) - i_strpos)));
611 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
612 s = HOP3c(last, 1, strend);
613 goto restart;
614 }
615 else {
616 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
617 (long)(s - i_strpos)));
618 t = HOP3c(s, -prog->anchored_offset, strbeg);
619 other_last = HOP3c(s, 1, strend);
620 s = s1;
621 if (t == strpos)
622 goto try_at_start;
623 goto try_at_offset;
624 }
625 }
626 }
627 else { /* Take into account the floating substring. */
628 char *last, *last1;
629 char *s1 = s;
630
631 t = HOP3c(s, -start_shift, strbeg);
632 last1 = last =
633 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
634 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
635 last = HOP3c(t, prog->float_max_offset, strend);
636 s = HOP3c(t, prog->float_min_offset, strend);
637 if (s < other_last)
638 s = other_last;
639 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
640 /* fbm_instr() takes into account exact value of end-of-str
641 if the check is SvTAIL(ed). Since false positives are OK,
642 and end-of-str is not later than strend we are OK. */
643 s = fbm_instr((unsigned char*)s,
644 (unsigned char*)last + SvCUR(prog->float_substr)
645 - (SvTAIL(prog->float_substr)!=0),
646 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
647 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
648 (s ? "Found" : "Contradicts"),
649 PL_colors[0],
650 (int)(SvCUR(prog->float_substr)
651 - (SvTAIL(prog->float_substr)!=0)),
652 SvPVX(prog->float_substr),
653 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
654 if (!s) {
655 if (last1 == last) {
656 DEBUG_r(PerlIO_printf(Perl_debug_log,
657 ", giving up...\n"));
658 goto fail_finish;
659 }
660 DEBUG_r(PerlIO_printf(Perl_debug_log,
661 ", trying anchored starting at offset %ld...\n",
662 (long)(s1 + 1 - i_strpos)));
663 other_last = last;
664 s = HOP3c(t, 1, strend);
665 goto restart;
666 }
667 else {
668 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
669 (long)(s - i_strpos)));
670 other_last = s; /* Fix this later. --Hugo */
671 s = s1;
672 if (t == strpos)
673 goto try_at_start;
674 goto try_at_offset;
675 }
676 }
677 }
678
679 t = s - prog->check_offset_max;
680 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
681 && (!(prog->reganch & ROPT_UTF8)
682 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
683 && t > strpos))) {
684 /* Fixed substring is found far enough so that the match
685 cannot start at strpos. */
686 try_at_offset:
687 if (ml_anch && t[-1] != '\n') {
688 /* Eventually fbm_*() should handle this, but often
689 anchored_offset is not 0, so this check will not be wasted. */
690 /* XXXX In the code below we prefer to look for "^" even in
691 presence of anchored substrings. And we search even
692 beyond the found float position. These pessimizations
693 are historical artefacts only. */
694 find_anchor:
695 while (t < strend - prog->minlen) {
696 if (*t == '\n') {
697 if (t < check_at - prog->check_offset_min) {
698 if (prog->anchored_substr) {
699 /* Since we moved from the found position,
700 we definitely contradict the found anchored
701 substr. Due to the above check we do not
702 contradict "check" substr.
703 Thus we can arrive here only if check substr
704 is float. Redo checking for "other"=="fixed".
705 */
706 strpos = t + 1;
707 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
708 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
709 goto do_other_anchored;
710 }
711 /* We don't contradict the found floating substring. */
712 /* XXXX Why not check for STCLASS? */
713 s = t + 1;
714 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
715 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
716 goto set_useful;
717 }
718 /* Position contradicts check-string */
719 /* XXXX probably better to look for check-string
720 than for "\n", so one should lower the limit for t? */
721 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
722 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
723 other_last = strpos = s = t + 1;
724 goto restart;
725 }
726 t++;
727 }
728 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
729 PL_colors[0],PL_colors[1]));
730 goto fail_finish;
731 }
732 else {
733 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
734 PL_colors[0],PL_colors[1]));
735 }
736 s = t;
737 set_useful:
738 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
739 }
740 else {
741 /* The found string does not prohibit matching at strpos,
742 - no optimization of calling REx engine can be performed,
743 unless it was an MBOL and we are not after MBOL,
744 or a future STCLASS check will fail this. */
745 try_at_start:
746 /* Even in this situation we may use MBOL flag if strpos is offset
747 wrt the start of the string. */
748 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
749 && (strpos != strbeg) && strpos[-1] != '\n'
750 /* May be due to an implicit anchor of m{.*foo} */
751 && !(prog->reganch & ROPT_IMPLICIT))
752 {
753 t = strpos;
754 goto find_anchor;
755 }
756 DEBUG_r( if (ml_anch)
757 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
758 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
759 );
760 success_at_start:
761 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
762 && prog->check_substr /* Could be deleted already */
763 && --BmUSEFUL(prog->check_substr) < 0
764 && prog->check_substr == prog->float_substr)
765 {
766 /* If flags & SOMETHING - do not do it many times on the same match */
767 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
768 SvREFCNT_dec(prog->check_substr);
769 prog->check_substr = Nullsv; /* disable */
770 prog->float_substr = Nullsv; /* clear */
771 check = Nullsv; /* abort */
772 s = strpos;
773 /* XXXX This is a remnant of the old implementation. It
774 looks wasteful, since now INTUIT can use many
775 other heuristics. */
776 prog->reganch &= ~RE_USE_INTUIT;
777 }
778 else
779 s = strpos;
780 }
781
782 /* Last resort... */
783 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
784 if (prog->regstclass) {
785 /* minlen == 0 is possible if regstclass is \b or \B,
786 and the fixed substr is ''$.
787 Since minlen is already taken into account, s+1 is before strend;
788 accidentally, minlen >= 1 guaranties no false positives at s + 1
789 even for \b or \B. But (minlen? 1 : 0) below assumes that
790 regstclass does not come from lookahead... */
791 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
792 This leaves EXACTF only, which is dealt with in find_byclass(). */
793 U8* str = (U8*)STRING(prog->regstclass);
794 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
795 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
796 : 1);
797 char *endpos = (prog->anchored_substr || ml_anch)
798 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
799 : (prog->float_substr
800 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
801 cl_l, strend)
802 : strend);
803 char *startpos = strbeg;
804
805 t = s;
806 if (prog->reganch & ROPT_UTF8) {
807 PL_regdata = prog->data;
808 PL_bostr = startpos;
809 }
810 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
811 if (!s) {
812#ifdef DEBUGGING
813 char *what = 0;
814#endif
815 if (endpos == strend) {
816 DEBUG_r( PerlIO_printf(Perl_debug_log,
817 "Could not match STCLASS...\n") );
818 goto fail;
819 }
820 DEBUG_r( PerlIO_printf(Perl_debug_log,
821 "This position contradicts STCLASS...\n") );
822 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
823 goto fail;
824 /* Contradict one of substrings */
825 if (prog->anchored_substr) {
826 if (prog->anchored_substr == check) {
827 DEBUG_r( what = "anchored" );
828 hop_and_restart:
829 s = HOP3c(t, 1, strend);
830 if (s + start_shift + end_shift > strend) {
831 /* XXXX Should be taken into account earlier? */
832 DEBUG_r( PerlIO_printf(Perl_debug_log,
833 "Could not match STCLASS...\n") );
834 goto fail;
835 }
836 if (!check)
837 goto giveup;
838 DEBUG_r( PerlIO_printf(Perl_debug_log,
839 "Looking for %s substr starting at offset %ld...\n",
840 what, (long)(s + start_shift - i_strpos)) );
841 goto restart;
842 }
843 /* Have both, check_string is floating */
844 if (t + start_shift >= check_at) /* Contradicts floating=check */
845 goto retry_floating_check;
846 /* Recheck anchored substring, but not floating... */
847 s = check_at;
848 if (!check)
849 goto giveup;
850 DEBUG_r( PerlIO_printf(Perl_debug_log,
851 "Looking for anchored substr starting at offset %ld...\n",
852 (long)(other_last - i_strpos)) );
853 goto do_other_anchored;
854 }
855 /* Another way we could have checked stclass at the
856 current position only: */
857 if (ml_anch) {
858 s = t = t + 1;
859 if (!check)
860 goto giveup;
861 DEBUG_r( PerlIO_printf(Perl_debug_log,
862 "Looking for /%s^%s/m starting at offset %ld...\n",
863 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
864 goto try_at_offset;
865 }
866 if (!prog->float_substr) /* Could have been deleted */
867 goto fail;
868 /* Check is floating subtring. */
869 retry_floating_check:
870 t = check_at - start_shift;
871 DEBUG_r( what = "floating" );
872 goto hop_and_restart;
873 }
874 if (t != s) {
875 DEBUG_r(PerlIO_printf(Perl_debug_log,
876 "By STCLASS: moving %ld --> %ld\n",
877 (long)(t - i_strpos), (long)(s - i_strpos))
878 );
879 }
880 else {
881 DEBUG_r(PerlIO_printf(Perl_debug_log,
882 "Does not contradict STCLASS...\n");
883 );
884 }
885 }
886 giveup:
887 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
888 PL_colors[4], (check ? "Guessed" : "Giving up"),
889 PL_colors[5], (long)(s - i_strpos)) );
890 return s;
891
892 fail_finish: /* Substring not found */
893 if (prog->check_substr) /* could be removed already */
894 BmUSEFUL(prog->check_substr) += 5; /* hooray */
895 fail:
896 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
897 PL_colors[4],PL_colors[5]));
898 return Nullch;
899}
900
901/* We know what class REx starts with. Try to find this position... */
902STATIC char *
903S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
904{
905 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
906 char *m;
907 STRLEN ln;
908 unsigned int c1;
909 unsigned int c2;
910 char *e;
911 register I32 tmp = 1; /* Scratch variable? */
912 register bool do_utf8 = PL_reg_match_utf8;
913
914 /* We know what class it must start with. */
915 switch (OP(c)) {
916 case ANYOF:
917 while (s < strend) {
918 if (reginclass(c, (U8*)s, do_utf8)) {
919 if (tmp && (norun || regtry(prog, s)))
920 goto got_it;
921 else
922 tmp = doevery;
923 }
924 else
925 tmp = 1;
926 s += do_utf8 ? UTF8SKIP(s) : 1;
927 }
928 break;
929 case CANY:
930 while (s < strend) {
931 if (tmp && (norun || regtry(prog, s)))
932 goto got_it;
933 else
934 tmp = doevery;
935 s++;
936 }
937 break;
938 case EXACTF:
939 m = STRING(c);
940 ln = STR_LEN(c);
941 if (UTF) {
942 STRLEN ulen1, ulen2;
943 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
944 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
945
946 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
947 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
948
949 c1 = utf8_to_uvuni(tmpbuf1, 0);
950 c2 = utf8_to_uvuni(tmpbuf2, 0);
951 }
952 else {
953 c1 = *(U8*)m;
954 c2 = PL_fold[c1];
955 }
956 goto do_exactf;
957 case EXACTFL:
958 m = STRING(c);
959 ln = STR_LEN(c);
960 c1 = *(U8*)m;
961 c2 = PL_fold_locale[c1];
962 do_exactf:
963 e = do_utf8 ? s + ln : strend - ln;
964
965 if (norun && e < s)
966 e = s; /* Due to minlen logic of intuit() */
967
968 /* The idea in the EXACTF* cases is to first find the
969 * first character of the EXACTF* node and then, if
970 * necessary, case-insensitively compare the full
971 * text of the node. The c1 and c2 are the first
972 * characters (though in Unicode it gets a bit
973 * more complicated because there are more cases
974 * than just upper and lower: one is really supposed
975 * to use the so-called folding case for case-insensitive
976 * matching (called "loose matching" in Unicode). */
977
978 if (do_utf8) {
979 UV c, f;
980 U8 tmpbuf [UTF8_MAXLEN+1];
981 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
982 STRLEN len, foldlen;
983
984 if (c1 == c2) {
985 while (s <= e) {
986 c = utf8_to_uvchr((U8*)s, &len);
987 if ( c == c1
988 && (ln == len ||
989 ibcmp_utf8(s, (char **)0, 0, do_utf8,
990 m, (char **)0, ln, UTF))
991 && (norun || regtry(prog, s)) )
992 goto got_it;
993 else {
994 uvchr_to_utf8(tmpbuf, c);
995 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
996 if ( f != c
997 && (f == c1 || f == c2)
998 && (ln == foldlen ||
999 !ibcmp_utf8((char *) foldbuf,
1000 (char **)0, foldlen, do_utf8,
1001 m,
1002 (char **)0, ln, UTF))
1003 && (norun || regtry(prog, s)) )
1004 goto got_it;
1005 }
1006 s += len;
1007 }
1008 }
1009 else {
1010 while (s <= e) {
1011 c = utf8_to_uvchr((U8*)s, &len);
1012
1013 /* Handle some of the three Greek sigmas cases.
1014 * Note that not all the possible combinations
1015 * are handled here: some of them are handled
1016 * handled by the standard folding rules, and
1017 * some of them (the character class or ANYOF
1018 * cases) are handled during compiletime in
1019 * regexec.c:S_regclass(). */
1020 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1021 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1022 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1023
1024 if ( (c == c1 || c == c2)
1025 && (ln == len ||
1026 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1027 m, (char **)0, ln, UTF))
1028 && (norun || regtry(prog, s)) )
1029 goto got_it;
1030 else {
1031 uvchr_to_utf8(tmpbuf, c);
1032 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1033 if ( f != c
1034 && (f == c1 || f == c2)
1035 && (ln == foldlen ||
1036 !ibcmp_utf8((char *)foldbuf,
1037 (char **)0, foldlen, do_utf8,
1038 m,
1039 (char **)0, ln, UTF))
1040 && (norun || regtry(prog, s)) )
1041 goto got_it;
1042 }
1043 s += len;
1044 }
1045 }
1046 }
1047 else {
1048 if (c1 == c2)
1049 while (s <= e) {
1050 if ( *(U8*)s == c1
1051 && (ln == 1 || !(OP(c) == EXACTF
1052 ? ibcmp(s, m, ln)
1053 : ibcmp_locale(s, m, ln)))
1054 && (norun || regtry(prog, s)) )
1055 goto got_it;
1056 s++;
1057 }
1058 else
1059 while (s <= e) {
1060 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1061 && (ln == 1 || !(OP(c) == EXACTF
1062 ? ibcmp(s, m, ln)
1063 : ibcmp_locale(s, m, ln)))
1064 && (norun || regtry(prog, s)) )
1065 goto got_it;
1066 s++;
1067 }
1068 }
1069 break;
1070 case BOUNDL:
1071 PL_reg_flags |= RF_tainted;
1072 /* FALL THROUGH */
1073 case BOUND:
1074 if (do_utf8) {
1075 if (s == PL_bostr)
1076 tmp = '\n';
1077 else {
1078 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
1079
1080 if (s > (char*)r)
1081 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
1082 }
1083 tmp = ((OP(c) == BOUND ?
1084 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1085 LOAD_UTF8_CHARCLASS(alnum,"a");
1086 while (s < strend) {
1087 if (tmp == !(OP(c) == BOUND ?
1088 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1089 isALNUM_LC_utf8((U8*)s)))
1090 {
1091 tmp = !tmp;
1092 if ((norun || regtry(prog, s)))
1093 goto got_it;
1094 }
1095 s += UTF8SKIP(s);
1096 }
1097 }
1098 else {
1099 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1100 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1101 while (s < strend) {
1102 if (tmp ==
1103 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1104 tmp = !tmp;
1105 if ((norun || regtry(prog, s)))
1106 goto got_it;
1107 }
1108 s++;
1109 }
1110 }
1111 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1112 goto got_it;
1113 break;
1114 case NBOUNDL:
1115 PL_reg_flags |= RF_tainted;
1116 /* FALL THROUGH */
1117 case NBOUND:
1118 if (do_utf8) {
1119 if (s == PL_bostr)
1120 tmp = '\n';
1121 else {
1122 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
1123
1124 if (s > (char*)r)
1125 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
1126 }
1127 tmp = ((OP(c) == NBOUND ?
1128 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1129 LOAD_UTF8_CHARCLASS(alnum,"a");
1130 while (s < strend) {
1131 if (tmp == !(OP(c) == NBOUND ?
1132 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1133 isALNUM_LC_utf8((U8*)s)))
1134 tmp = !tmp;
1135 else if ((norun || regtry(prog, s)))
1136 goto got_it;
1137 s += UTF8SKIP(s);
1138 }
1139 }
1140 else {
1141 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1142 tmp = ((OP(c) == NBOUND ?
1143 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1144 while (s < strend) {
1145 if (tmp ==
1146 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1147 tmp = !tmp;
1148 else if ((norun || regtry(prog, s)))
1149 goto got_it;
1150 s++;
1151 }
1152 }
1153 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1154 goto got_it;
1155 break;
1156 case ALNUM:
1157 if (do_utf8) {
1158 LOAD_UTF8_CHARCLASS(alnum,"a");
1159 while (s < strend) {
1160 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1161 if (tmp && (norun || regtry(prog, s)))
1162 goto got_it;
1163 else
1164 tmp = doevery;
1165 }
1166 else
1167 tmp = 1;
1168 s += UTF8SKIP(s);
1169 }
1170 }
1171 else {
1172 while (s < strend) {
1173 if (isALNUM(*s)) {
1174 if (tmp && (norun || regtry(prog, s)))
1175 goto got_it;
1176 else
1177 tmp = doevery;
1178 }
1179 else
1180 tmp = 1;
1181 s++;
1182 }
1183 }
1184 break;
1185 case ALNUML:
1186 PL_reg_flags |= RF_tainted;
1187 if (do_utf8) {
1188 while (s < strend) {
1189 if (isALNUM_LC_utf8((U8*)s)) {
1190 if (tmp && (norun || regtry(prog, s)))
1191 goto got_it;
1192 else
1193 tmp = doevery;
1194 }
1195 else
1196 tmp = 1;
1197 s += UTF8SKIP(s);
1198 }
1199 }
1200 else {
1201 while (s < strend) {
1202 if (isALNUM_LC(*s)) {
1203 if (tmp && (norun || regtry(prog, s)))
1204 goto got_it;
1205 else
1206 tmp = doevery;
1207 }
1208 else
1209 tmp = 1;
1210 s++;
1211 }
1212 }
1213 break;
1214 case NALNUM:
1215 if (do_utf8) {
1216 LOAD_UTF8_CHARCLASS(alnum,"a");
1217 while (s < strend) {
1218 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1219 if (tmp && (norun || regtry(prog, s)))
1220 goto got_it;
1221 else
1222 tmp = doevery;
1223 }
1224 else
1225 tmp = 1;
1226 s += UTF8SKIP(s);
1227 }
1228 }
1229 else {
1230 while (s < strend) {
1231 if (!isALNUM(*s)) {
1232 if (tmp && (norun || regtry(prog, s)))
1233 goto got_it;
1234 else
1235 tmp = doevery;
1236 }
1237 else
1238 tmp = 1;
1239 s++;
1240 }
1241 }
1242 break;
1243 case NALNUML:
1244 PL_reg_flags |= RF_tainted;
1245 if (do_utf8) {
1246 while (s < strend) {
1247 if (!isALNUM_LC_utf8((U8*)s)) {
1248 if (tmp && (norun || regtry(prog, s)))
1249 goto got_it;
1250 else
1251 tmp = doevery;
1252 }
1253 else
1254 tmp = 1;
1255 s += UTF8SKIP(s);
1256 }
1257 }
1258 else {
1259 while (s < strend) {
1260 if (!isALNUM_LC(*s)) {
1261 if (tmp && (norun || regtry(prog, s)))
1262 goto got_it;
1263 else
1264 tmp = doevery;
1265 }
1266 else
1267 tmp = 1;
1268 s++;
1269 }
1270 }
1271 break;
1272 case SPACE:
1273 if (do_utf8) {
1274 LOAD_UTF8_CHARCLASS(space," ");
1275 while (s < strend) {
1276 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1277 if (tmp && (norun || regtry(prog, s)))
1278 goto got_it;
1279 else
1280 tmp = doevery;
1281 }
1282 else
1283 tmp = 1;
1284 s += UTF8SKIP(s);
1285 }
1286 }
1287 else {
1288 while (s < strend) {
1289 if (isSPACE(*s)) {
1290 if (tmp && (norun || regtry(prog, s)))
1291 goto got_it;
1292 else
1293 tmp = doevery;
1294 }
1295 else
1296 tmp = 1;
1297 s++;
1298 }
1299 }
1300 break;
1301 case SPACEL:
1302 PL_reg_flags |= RF_tainted;
1303 if (do_utf8) {
1304 while (s < strend) {
1305 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1306 if (tmp && (norun || regtry(prog, s)))
1307 goto got_it;
1308 else
1309 tmp = doevery;
1310 }
1311 else
1312 tmp = 1;
1313 s += UTF8SKIP(s);
1314 }
1315 }
1316 else {
1317 while (s < strend) {
1318 if (isSPACE_LC(*s)) {
1319 if (tmp && (norun || regtry(prog, s)))
1320 goto got_it;
1321 else
1322 tmp = doevery;
1323 }
1324 else
1325 tmp = 1;
1326 s++;
1327 }
1328 }
1329 break;
1330 case NSPACE:
1331 if (do_utf8) {
1332 LOAD_UTF8_CHARCLASS(space," ");
1333 while (s < strend) {
1334 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1335 if (tmp && (norun || regtry(prog, s)))
1336 goto got_it;
1337 else
1338 tmp = doevery;
1339 }
1340 else
1341 tmp = 1;
1342 s += UTF8SKIP(s);
1343 }
1344 }
1345 else {
1346 while (s < strend) {
1347 if (!isSPACE(*s)) {
1348 if (tmp && (norun || regtry(prog, s)))
1349 goto got_it;
1350 else
1351 tmp = doevery;
1352 }
1353 else
1354 tmp = 1;
1355 s++;
1356 }
1357 }
1358 break;
1359 case NSPACEL:
1360 PL_reg_flags |= RF_tainted;
1361 if (do_utf8) {
1362 while (s < strend) {
1363 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1364 if (tmp && (norun || regtry(prog, s)))
1365 goto got_it;
1366 else
1367 tmp = doevery;
1368 }
1369 else
1370 tmp = 1;
1371 s += UTF8SKIP(s);
1372 }
1373 }
1374 else {
1375 while (s < strend) {
1376 if (!isSPACE_LC(*s)) {
1377 if (tmp && (norun || regtry(prog, s)))
1378 goto got_it;
1379 else
1380 tmp = doevery;
1381 }
1382 else
1383 tmp = 1;
1384 s++;
1385 }
1386 }
1387 break;
1388 case DIGIT:
1389 if (do_utf8) {
1390 LOAD_UTF8_CHARCLASS(digit,"0");
1391 while (s < strend) {
1392 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1393 if (tmp && (norun || regtry(prog, s)))
1394 goto got_it;
1395 else
1396 tmp = doevery;
1397 }
1398 else
1399 tmp = 1;
1400 s += UTF8SKIP(s);
1401 }
1402 }
1403 else {
1404 while (s < strend) {
1405 if (isDIGIT(*s)) {
1406 if (tmp && (norun || regtry(prog, s)))
1407 goto got_it;
1408 else
1409 tmp = doevery;
1410 }
1411 else
1412 tmp = 1;
1413 s++;
1414 }
1415 }
1416 break;
1417 case DIGITL:
1418 PL_reg_flags |= RF_tainted;
1419 if (do_utf8) {
1420 while (s < strend) {
1421 if (isDIGIT_LC_utf8((U8*)s)) {
1422 if (tmp && (norun || regtry(prog, s)))
1423 goto got_it;
1424 else
1425 tmp = doevery;
1426 }
1427 else
1428 tmp = 1;
1429 s += UTF8SKIP(s);
1430 }
1431 }
1432 else {
1433 while (s < strend) {
1434 if (isDIGIT_LC(*s)) {
1435 if (tmp && (norun || regtry(prog, s)))
1436 goto got_it;
1437 else
1438 tmp = doevery;
1439 }
1440 else
1441 tmp = 1;
1442 s++;
1443 }
1444 }
1445 break;
1446 case NDIGIT:
1447 if (do_utf8) {
1448 LOAD_UTF8_CHARCLASS(digit,"0");
1449 while (s < strend) {
1450 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1451 if (tmp && (norun || regtry(prog, s)))
1452 goto got_it;
1453 else
1454 tmp = doevery;
1455 }
1456 else
1457 tmp = 1;
1458 s += UTF8SKIP(s);
1459 }
1460 }
1461 else {
1462 while (s < strend) {
1463 if (!isDIGIT(*s)) {
1464 if (tmp && (norun || regtry(prog, s)))
1465 goto got_it;
1466 else
1467 tmp = doevery;
1468 }
1469 else
1470 tmp = 1;
1471 s++;
1472 }
1473 }
1474 break;
1475 case NDIGITL:
1476 PL_reg_flags |= RF_tainted;
1477 if (do_utf8) {
1478 while (s < strend) {
1479 if (!isDIGIT_LC_utf8((U8*)s)) {
1480 if (tmp && (norun || regtry(prog, s)))
1481 goto got_it;
1482 else
1483 tmp = doevery;
1484 }
1485 else
1486 tmp = 1;
1487 s += UTF8SKIP(s);
1488 }
1489 }
1490 else {
1491 while (s < strend) {
1492 if (!isDIGIT_LC(*s)) {
1493 if (tmp && (norun || regtry(prog, s)))
1494 goto got_it;
1495 else
1496 tmp = doevery;
1497 }
1498 else
1499 tmp = 1;
1500 s++;
1501 }
1502 }
1503 break;
1504 default:
1505 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1506 break;
1507 }
1508 return 0;
1509 got_it:
1510 return s;
1511}
1512
1513/*
1514 - regexec_flags - match a regexp against a string
1515 */
1516I32
1517Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1518 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1519/* strend: pointer to null at end of string */
1520/* strbeg: real beginning of string */
1521/* minend: end of match must be >=minend after stringarg. */
1522/* data: May be used for some additional optimizations. */
1523/* nosave: For optimizations. */
1524{
1525 register char *s;
1526 register regnode *c;
1527 register char *startpos = stringarg;
1528 I32 minlen; /* must match at least this many chars */
1529 I32 dontbother = 0; /* how many characters not to try at end */
1530 /* I32 start_shift = 0; */ /* Offset of the start to find
1531 constant substr. */ /* CC */
1532 I32 end_shift = 0; /* Same for the end. */ /* CC */
1533 I32 scream_pos = -1; /* Internal iterator of scream. */
1534 char *scream_olds;
1535 SV* oreplsv = GvSV(PL_replgv);
1536 bool do_utf8 = DO_UTF8(sv);
1537#ifdef DEBUGGING
1538 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1539 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
1540#endif
1541
1542 PL_regcc = 0;
1543
1544 cache_re(prog);
1545#ifdef DEBUGGING
1546 PL_regnarrate = DEBUG_r_TEST;
1547#endif
1548
1549 /* Be paranoid... */
1550 if (prog == NULL || startpos == NULL) {
1551 Perl_croak(aTHX_ "NULL regexp parameter");
1552 return 0;
1553 }
1554
1555 minlen = prog->minlen;
1556 if (strend - startpos < minlen &&
1557 !PL_reg_match_utf8 /* ANYOFs can balloon to EXACTFs */
1558 ) {
1559 DEBUG_r(PerlIO_printf(Perl_debug_log,
1560 "String too short [regexec_flags]...\n"));
1561 goto phooey;
1562 }
1563
1564 /* Check validity of program. */
1565 if (UCHARAT(prog->program) != REG_MAGIC) {
1566 Perl_croak(aTHX_ "corrupted regexp program");
1567 }
1568
1569 PL_reg_flags = 0;
1570 PL_reg_eval_set = 0;
1571 PL_reg_maxiter = 0;
1572
1573 if (prog->reganch & ROPT_UTF8)
1574 PL_reg_flags |= RF_utf8;
1575
1576 /* Mark beginning of line for ^ and lookbehind. */
1577 PL_regbol = startpos;
1578 PL_bostr = strbeg;
1579 PL_reg_sv = sv;
1580
1581 /* Mark end of line for $ (and such) */
1582 PL_regeol = strend;
1583
1584 /* see how far we have to get to not match where we matched before */
1585 PL_regtill = startpos+minend;
1586
1587 /* We start without call_cc context. */
1588 PL_reg_call_cc = 0;
1589
1590 /* If there is a "must appear" string, look for it. */
1591 s = startpos;
1592
1593 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1594 MAGIC *mg;
1595
1596 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1597 PL_reg_ganch = startpos;
1598 else if (sv && SvTYPE(sv) >= SVt_PVMG
1599 && SvMAGIC(sv)
1600 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1601 && mg->mg_len >= 0) {
1602 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1603 if (prog->reganch & ROPT_ANCH_GPOS) {
1604 if (s > PL_reg_ganch)
1605 goto phooey;
1606 s = PL_reg_ganch;
1607 }
1608 }
1609 else /* pos() not defined */
1610 PL_reg_ganch = strbeg;
1611 }
1612
1613 if (do_utf8 == (UTF!=0) &&
1614 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1615 re_scream_pos_data d;
1616
1617 d.scream_olds = &scream_olds;
1618 d.scream_pos = &scream_pos;
1619 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1620 if (!s) {
1621 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1622 goto phooey; /* not present */
1623 }
1624 }
1625
1626 DEBUG_r({
1627 char *s0 = UTF ?
1628 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1629 UNI_DISPLAY_ISPRINT) :
1630 prog->precomp;
1631 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1632 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1633 UNI_DISPLAY_ISPRINT) : startpos;
1634 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
1635 if (!PL_colorset)
1636 reginitcolors();
1637 PerlIO_printf(Perl_debug_log,
1638 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
1639 PL_colors[4],PL_colors[5],PL_colors[0],
1640 len0, len0, s0,
1641 PL_colors[1],
1642 len0 > 60 ? "..." : "",
1643 PL_colors[0],
1644 (int)(len1 > 60 ? 60 : len1),
1645 s1, PL_colors[1],
1646 (len1 > 60 ? "..." : "")
1647 );
1648 });
1649
1650 /* Simplest case: anchored match need be tried only once. */
1651 /* [unless only anchor is BOL and multiline is set] */
1652 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1653 if (s == startpos && regtry(prog, startpos))
1654 goto got_it;
1655 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1656 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1657 {
1658 char *end;
1659
1660 if (minlen)
1661 dontbother = minlen - 1;
1662 end = HOP3c(strend, -dontbother, strbeg) - 1;
1663 /* for multiline we only have to try after newlines */
1664 if (prog->check_substr) {
1665 if (s == startpos)
1666 goto after_try;
1667 while (1) {
1668 if (regtry(prog, s))
1669 goto got_it;
1670 after_try:
1671 if (s >= end)
1672 goto phooey;
1673 if (prog->reganch & RE_USE_INTUIT) {
1674 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1675 if (!s)
1676 goto phooey;
1677 }
1678 else
1679 s++;
1680 }
1681 } else {
1682 if (s > startpos)
1683 s--;
1684 while (s < end) {
1685 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1686 if (regtry(prog, s))
1687 goto got_it;
1688 }
1689 }
1690 }
1691 }
1692 goto phooey;
1693 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1694 if (regtry(prog, PL_reg_ganch))
1695 goto got_it;
1696 goto phooey;
1697 }
1698
1699 /* Messy cases: unanchored match. */
1700 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1701 /* we have /x+whatever/ */
1702 /* it must be a one character string (XXXX Except UTF?) */
1703 char ch = SvPVX(prog->anchored_substr)[0];
1704#ifdef DEBUGGING
1705 int did_match = 0;
1706#endif
1707
1708 if (do_utf8) {
1709 while (s < strend) {
1710 if (*s == ch) {
1711 DEBUG_r( did_match = 1 );
1712 if (regtry(prog, s)) goto got_it;
1713 s += UTF8SKIP(s);
1714 while (s < strend && *s == ch)
1715 s += UTF8SKIP(s);
1716 }
1717 s += UTF8SKIP(s);
1718 }
1719 }
1720 else {
1721 while (s < strend) {
1722 if (*s == ch) {
1723 DEBUG_r( did_match = 1 );
1724 if (regtry(prog, s)) goto got_it;
1725 s++;
1726 while (s < strend && *s == ch)
1727 s++;
1728 }
1729 s++;
1730 }
1731 }
1732 DEBUG_r(if (!did_match)
1733 PerlIO_printf(Perl_debug_log,
1734 "Did not find anchored character...\n")
1735 );
1736 }
1737 /*SUPPRESS 560*/
1738 else if (do_utf8 == (UTF!=0) &&
1739 (prog->anchored_substr != Nullsv
1740 || (prog->float_substr != Nullsv
1741 && prog->float_max_offset < strend - s))) {
1742 SV *must = prog->anchored_substr
1743 ? prog->anchored_substr : prog->float_substr;
1744 I32 back_max =
1745 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1746 I32 back_min =
1747 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1748 char *last = HOP3c(strend, /* Cannot start after this */
1749 -(I32)(CHR_SVLEN(must)
1750 - (SvTAIL(must) != 0) + back_min), strbeg);
1751 char *last1; /* Last position checked before */
1752#ifdef DEBUGGING
1753 int did_match = 0;
1754#endif
1755
1756 if (s > PL_bostr)
1757 last1 = HOPc(s, -1);
1758 else
1759 last1 = s - 1; /* bogus */
1760
1761 /* XXXX check_substr already used to find `s', can optimize if
1762 check_substr==must. */
1763 scream_pos = -1;
1764 dontbother = end_shift;
1765 strend = HOPc(strend, -dontbother);
1766 while ( (s <= last) &&
1767 ((flags & REXEC_SCREAM)
1768 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1769 end_shift, &scream_pos, 0))
1770 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1771 (unsigned char*)strend, must,
1772 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1773 DEBUG_r( did_match = 1 );
1774 if (HOPc(s, -back_max) > last1) {
1775 last1 = HOPc(s, -back_min);
1776 s = HOPc(s, -back_max);
1777 }
1778 else {
1779 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1780
1781 last1 = HOPc(s, -back_min);
1782 s = t;
1783 }
1784 if (do_utf8) {
1785 while (s <= last1) {
1786 if (regtry(prog, s))
1787 goto got_it;
1788 s += UTF8SKIP(s);
1789 }
1790 }
1791 else {
1792 while (s <= last1) {
1793 if (regtry(prog, s))
1794 goto got_it;
1795 s++;
1796 }
1797 }
1798 }
1799 DEBUG_r(if (!did_match)
1800 PerlIO_printf(Perl_debug_log,
1801 "Did not find %s substr `%s%.*s%s'%s...\n",
1802 ((must == prog->anchored_substr)
1803 ? "anchored" : "floating"),
1804 PL_colors[0],
1805 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1806 SvPVX(must),
1807 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1808 );
1809 goto phooey;
1810 }
1811 else if ((c = prog->regstclass)) {
1812 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1813 /* don't bother with what can't match */
1814 strend = HOPc(strend, -(minlen - 1));
1815 DEBUG_r({
1816 SV *prop = sv_newmortal();
1817 char *s0;
1818 char *s1;
1819 int len0;
1820 int len1;
1821
1822 regprop(prop, c);
1823 s0 = UTF ?
1824 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
1825 UNI_DISPLAY_ISPRINT) :
1826 SvPVX(prop);
1827 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1828 s1 = UTF ?
1829 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_ISPRINT) : s;
1830 len1 = UTF ? SvCUR(dsv1) : strend - s;
1831 PerlIO_printf(Perl_debug_log,
1832 "Matching stclass `%*.*s' against `%*.*s'\n",
1833 len0, len0, s0,
1834 len1, len1, s1);
1835 });
1836 if (find_byclass(prog, c, s, strend, startpos, 0))
1837 goto got_it;
1838 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1839 }
1840 else {
1841 dontbother = 0;
1842 if (prog->float_substr != Nullsv) { /* Trim the end. */
1843 char *last;
1844
1845 if (flags & REXEC_SCREAM) {
1846 last = screaminstr(sv, prog->float_substr, s - strbeg,
1847 end_shift, &scream_pos, 1); /* last one */
1848 if (!last)
1849 last = scream_olds; /* Only one occurrence. */
1850 }
1851 else {
1852 STRLEN len;
1853 char *little = SvPV(prog->float_substr, len);
1854
1855 if (SvTAIL(prog->float_substr)) {
1856 if (memEQ(strend - len + 1, little, len - 1))
1857 last = strend - len + 1;
1858 else if (!PL_multiline)
1859 last = memEQ(strend - len, little, len)
1860 ? strend - len : Nullch;
1861 else
1862 goto find_last;
1863 } else {
1864 find_last:
1865 if (len)
1866 last = rninstr(s, strend, little, little + len);
1867 else
1868 last = strend; /* matching `$' */
1869 }
1870 }
1871 if (last == NULL) {
1872 DEBUG_r(PerlIO_printf(Perl_debug_log,
1873 "%sCan't trim the tail, match fails (should not happen)%s\n",
1874 PL_colors[4],PL_colors[5]));
1875 goto phooey; /* Should not happen! */
1876 }
1877 dontbother = strend - last + prog->float_min_offset;
1878 }
1879 if (minlen && (dontbother < minlen))
1880 dontbother = minlen - 1;
1881 strend -= dontbother; /* this one's always in bytes! */
1882 /* We don't know much -- general case. */
1883 if (do_utf8) {
1884 for (;;) {
1885 if (regtry(prog, s))
1886 goto got_it;
1887 if (s >= strend)
1888 break;
1889 s += UTF8SKIP(s);
1890 };
1891 }
1892 else {
1893 do {
1894 if (regtry(prog, s))
1895 goto got_it;
1896 } while (s++ < strend);
1897 }
1898 }
1899
1900 /* Failure. */
1901 goto phooey;
1902
1903got_it:
1904 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1905
1906 if (PL_reg_eval_set) {
1907 /* Preserve the current value of $^R */
1908 if (oreplsv != GvSV(PL_replgv))
1909 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1910 restored, the value remains
1911 the same. */
1912 restore_pos(aTHX_ 0);
1913 }
1914
1915 /* make sure $`, $&, $', and $digit will work later */
1916 if ( !(flags & REXEC_NOT_FIRST) ) {
1917 if (RX_MATCH_COPIED(prog)) {
1918 Safefree(prog->subbeg);
1919 RX_MATCH_COPIED_off(prog);
1920 }
1921 if (flags & REXEC_COPY_STR) {
1922 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1923
1924 s = savepvn(strbeg, i);
1925 prog->subbeg = s;
1926 prog->sublen = i;
1927 RX_MATCH_COPIED_on(prog);
1928 }
1929 else {
1930 prog->subbeg = strbeg;
1931 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1932 }
1933 }
1934
1935 return 1;
1936
1937phooey:
1938 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1939 PL_colors[4],PL_colors[5]));
1940 if (PL_reg_eval_set)
1941 restore_pos(aTHX_ 0);
1942 return 0;
1943}
1944
1945/*
1946 - regtry - try match at specific point
1947 */
1948STATIC I32 /* 0 failure, 1 success */
1949S_regtry(pTHX_ regexp *prog, char *startpos)
1950{
1951 register I32 i;
1952 register I32 *sp;
1953 register I32 *ep;
1954 CHECKPOINT lastcp;
1955
1956#ifdef DEBUGGING
1957 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1958#endif
1959 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1960 MAGIC *mg;
1961
1962 PL_reg_eval_set = RS_init;
1963 DEBUG_r(DEBUG_s(
1964 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1965 (IV)(PL_stack_sp - PL_stack_base));
1966 ));
1967 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1968 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1969 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1970 SAVETMPS;
1971 /* Apparently this is not needed, judging by wantarray. */
1972 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1973 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1974
1975 if (PL_reg_sv) {
1976 /* Make $_ available to executed code. */
1977 if (PL_reg_sv != DEFSV) {
1978 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
1979 SAVESPTR(DEFSV);
1980 DEFSV = PL_reg_sv;
1981 }
1982
1983 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1984 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
1985 /* prepare for quick setting of pos */
1986 sv_magic(PL_reg_sv, (SV*)0,
1987 PERL_MAGIC_regex_global, Nullch, 0);
1988 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
1989 mg->mg_len = -1;
1990 }
1991 PL_reg_magic = mg;
1992 PL_reg_oldpos = mg->mg_len;
1993 SAVEDESTRUCTOR_X(restore_pos, 0);
1994 }
1995 if (!PL_reg_curpm) {
1996 Newz(22,PL_reg_curpm, 1, PMOP);
1997#ifdef USE_ITHREADS
1998 {
1999 SV* repointer = newSViv(0);
2000 /* so we know which PL_regex_padav element is PL_reg_curpm */
2001 SvFLAGS(repointer) |= SVf_BREAK;
2002 av_push(PL_regex_padav,repointer);
2003 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2004 PL_regex_pad = AvARRAY(PL_regex_padav);
2005 }
2006#endif
2007 }
2008 PM_SETRE(PL_reg_curpm, prog);
2009 PL_reg_oldcurpm = PL_curpm;
2010 PL_curpm = PL_reg_curpm;
2011 if (RX_MATCH_COPIED(prog)) {
2012 /* Here is a serious problem: we cannot rewrite subbeg,
2013 since it may be needed if this match fails. Thus
2014 $` inside (?{}) could fail... */
2015 PL_reg_oldsaved = prog->subbeg;
2016 PL_reg_oldsavedlen = prog->sublen;
2017 RX_MATCH_COPIED_off(prog);
2018 }
2019 else
2020 PL_reg_oldsaved = Nullch;
2021 prog->subbeg = PL_bostr;
2022 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2023 }
2024 prog->startp[0] = startpos - PL_bostr;
2025 PL_reginput = startpos;
2026 PL_regstartp = prog->startp;
2027 PL_regendp = prog->endp;
2028 PL_reglastparen = &prog->lastparen;
2029 PL_reglastcloseparen = &prog->lastcloseparen;
2030 prog->lastparen = 0;
2031 PL_regsize = 0;
2032 DEBUG_r(PL_reg_starttry = startpos);
2033 if (PL_reg_start_tmpl <= prog->nparens) {
2034 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2035 if(PL_reg_start_tmp)
2036 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2037 else
2038 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2039 }
2040
2041#ifdef DEBUGGING
2042 sv_setpvn(PERL_DEBUG_PAD(0), "", 0);
2043 sv_setpvn(PERL_DEBUG_PAD(1), "", 0);
2044 sv_setpvn(PERL_DEBUG_PAD(2), "", 0);
2045#endif
2046
2047 /* XXXX What this code is doing here?!!! There should be no need
2048 to do this again and again, PL_reglastparen should take care of
2049 this! --ilya*/
2050
2051 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2052 * Actually, the code in regcppop() (which Ilya may be meaning by
2053 * PL_reglastparen), is not needed at all by the test suite
2054 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2055 * enough, for building DynaLoader, or otherwise this
2056 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2057 * will happen. Meanwhile, this code *is* needed for the
2058 * above-mentioned test suite tests to succeed. The common theme
2059 * on those tests seems to be returning null fields from matches.
2060 * --jhi */
2061#if 1
2062 sp = prog->startp;
2063 ep = prog->endp;
2064 if (prog->nparens) {
2065 for (i = prog->nparens; i > *PL_reglastparen; i--) {
2066 *++sp = -1;
2067 *++ep = -1;
2068 }
2069 }
2070#endif
2071 REGCP_SET(lastcp);
2072 if (regmatch(prog->program + 1)) {
2073 prog->endp[0] = PL_reginput - PL_bostr;
2074 return 1;
2075 }
2076 REGCP_UNWIND(lastcp);
2077 return 0;
2078}
2079
2080#define RE_UNWIND_BRANCH 1
2081#define RE_UNWIND_BRANCHJ 2
2082
2083union re_unwind_t;
2084
2085typedef struct { /* XX: makes sense to enlarge it... */
2086 I32 type;
2087 I32 prev;
2088 CHECKPOINT lastcp;
2089} re_unwind_generic_t;
2090
2091typedef struct {
2092 I32 type;
2093 I32 prev;
2094 CHECKPOINT lastcp;
2095 I32 lastparen;
2096 regnode *next;
2097 char *locinput;
2098 I32 nextchr;
2099#ifdef DEBUGGING
2100 int regindent;
2101#endif
2102} re_unwind_branch_t;
2103
2104typedef union re_unwind_t {
2105 I32 type;
2106 re_unwind_generic_t generic;
2107 re_unwind_branch_t branch;
2108} re_unwind_t;
2109
2110#define sayYES goto yes
2111#define sayNO goto no
2112#define sayYES_FINAL goto yes_final
2113#define sayYES_LOUD goto yes_loud
2114#define sayNO_FINAL goto no_final
2115#define sayNO_SILENT goto do_no
2116#define saySAME(x) if (x) goto yes; else goto no
2117
2118#define REPORT_CODE_OFF 24
2119
2120/*
2121 - regmatch - main matching routine
2122 *
2123 * Conceptually the strategy is simple: check to see whether the current
2124 * node matches, call self recursively to see whether the rest matches,
2125 * and then act accordingly. In practice we make some effort to avoid
2126 * recursion, in particular by going through "ordinary" nodes (that don't
2127 * need to know whether the rest of the match failed) by a loop instead of
2128 * by recursion.
2129 */
2130/* [lwall] I've hoisted the register declarations to the outer block in order to
2131 * maybe save a little bit of pushing and popping on the stack. It also takes
2132 * advantage of machines that use a register save mask on subroutine entry.
2133 */
2134STATIC I32 /* 0 failure, 1 success */
2135S_regmatch(pTHX_ regnode *prog)
2136{
2137 register regnode *scan; /* Current node. */
2138 regnode *next; /* Next node. */
2139 regnode *inner; /* Next node in internal branch. */
2140 register I32 nextchr; /* renamed nextchr - nextchar colides with
2141 function of same name */
2142 register I32 n; /* no or next */
2143 register I32 ln = 0; /* len or last */
2144 register char *s = Nullch; /* operand or save */
2145 register char *locinput = PL_reginput;
2146 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2147 int minmod = 0, sw = 0, logical = 0;
2148 I32 unwind = 0;
2149#if 0
2150 I32 firstcp = PL_savestack_ix;
2151#endif
2152 register bool do_utf8 = PL_reg_match_utf8;
2153#ifdef DEBUGGING
2154 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2155 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2156 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2157#endif
2158
2159#ifdef DEBUGGING
2160 PL_regindent++;
2161#endif
2162
2163 /* Note that nextchr is a byte even in UTF */
2164 nextchr = UCHARAT(locinput);
2165 scan = prog;
2166 while (scan != NULL) {
2167
2168 DEBUG_r( {
2169 SV *prop = sv_newmortal();
2170 int docolor = *PL_colors[0];
2171 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2172 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2173 /* The part of the string before starttry has one color
2174 (pref0_len chars), between starttry and current
2175 position another one (pref_len - pref0_len chars),
2176 after the current position the third one.
2177 We assume that pref0_len <= pref_len, otherwise we
2178 decrease pref0_len. */
2179 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2180 ? (5 + taill) - l : locinput - PL_bostr;
2181 int pref0_len;
2182
2183 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2184 pref_len++;
2185 pref0_len = pref_len - (locinput - PL_reg_starttry);
2186 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2187 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2188 ? (5 + taill) - pref_len : PL_regeol - locinput);
2189 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2190 l--;
2191 if (pref0_len < 0)
2192 pref0_len = 0;
2193 if (pref0_len > pref_len)
2194 pref0_len = pref_len;
2195 regprop(prop, scan);
2196 {
2197 char *s0 =
2198 do_utf8 ?
2199 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
2200 pref0_len, 60, 0) :
2201 locinput - pref_len;
2202 int len0 = do_utf8 ? strlen(s0) : pref0_len;
2203 char *s1 = do_utf8 ?
2204 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
2205 pref_len - pref0_len, 60, 0) :
2206 locinput - pref_len + pref0_len;
2207 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2208 char *s2 = do_utf8 ?
2209 pv_uni_display(dsv2, (U8*)locinput,
2210 PL_regeol - locinput, 60, 0) :
2211 locinput;
2212 int len2 = do_utf8 ? strlen(s2) : l;
2213 PerlIO_printf(Perl_debug_log,
2214 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2215 (IV)(locinput - PL_bostr),
2216 PL_colors[4],
2217 len0, s0,
2218 PL_colors[5],
2219 PL_colors[2],
2220 len1, s1,
2221 PL_colors[3],
2222 (docolor ? "" : "> <"),
2223 PL_colors[0],
2224 len2, s2,
2225 PL_colors[1],
2226 15 - l - pref_len + 1,
2227 "",
2228 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2229 SvPVX(prop));
2230 }
2231 });
2232
2233 next = scan + NEXT_OFF(scan);
2234 if (next == scan)
2235 next = NULL;
2236
2237 switch (OP(scan)) {
2238 case BOL:
2239 if (locinput == PL_bostr || (PL_multiline &&
2240 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2241 {
2242 /* regtill = regbol; */
2243 break;
2244 }
2245 sayNO;
2246 case MBOL:
2247 if (locinput == PL_bostr ||
2248 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2249 {
2250 break;
2251 }
2252 sayNO;
2253 case SBOL:
2254 if (locinput == PL_bostr)
2255 break;
2256 sayNO;
2257 case GPOS:
2258 if (locinput == PL_reg_ganch)
2259 break;
2260 sayNO;
2261 case EOL:
2262 if (PL_multiline)
2263 goto meol;
2264 else
2265 goto seol;
2266 case MEOL:
2267 meol:
2268 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2269 sayNO;
2270 break;
2271 case SEOL:
2272 seol:
2273 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2274 sayNO;
2275 if (PL_regeol - locinput > 1)
2276 sayNO;
2277 break;
2278 case EOS:
2279 if (PL_regeol != locinput)
2280 sayNO;
2281 break;
2282 case SANY:
2283 if (!nextchr && locinput >= PL_regeol)
2284 sayNO;
2285 if (do_utf8) {
2286 locinput += PL_utf8skip[nextchr];
2287 if (locinput > PL_regeol)
2288 sayNO;
2289 nextchr = UCHARAT(locinput);
2290 }
2291 else
2292 nextchr = UCHARAT(++locinput);
2293 break;
2294 case CANY:
2295 if (!nextchr && locinput >= PL_regeol)
2296 sayNO;
2297 nextchr = UCHARAT(++locinput);
2298 break;
2299 case REG_ANY:
2300 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2301 sayNO;
2302 if (do_utf8) {
2303 locinput += PL_utf8skip[nextchr];
2304 if (locinput > PL_regeol)
2305 sayNO;
2306 nextchr = UCHARAT(locinput);
2307 }
2308 else
2309 nextchr = UCHARAT(++locinput);
2310 break;
2311 case EXACT:
2312 s = STRING(scan);
2313 ln = STR_LEN(scan);
2314 if (do_utf8 != (UTF!=0)) {
2315 /* The target and the pattern have differing utf8ness. */
2316 char *l = locinput;
2317 char *e = s + ln;
2318 STRLEN ulen;
2319
2320 if (do_utf8) {
2321 /* The target is utf8, the pattern is not utf8. */
2322 while (s < e) {
2323 if (l >= PL_regeol)
2324 sayNO;
2325 if (NATIVE_TO_UNI(*(U8*)s) !=
2326 utf8_to_uvchr((U8*)l, &ulen))
2327 sayNO;
2328 l += ulen;
2329 s ++;
2330 }
2331 }
2332 else {
2333 /* The target is not utf8, the pattern is utf8. */
2334 while (s < e) {
2335 if (l >= PL_regeol)
2336 sayNO;
2337 if (NATIVE_TO_UNI(*((U8*)l)) !=
2338 utf8_to_uvchr((U8*)s, &ulen))
2339 sayNO;
2340 s += ulen;
2341 l ++;
2342 }
2343 }
2344 locinput = l;
2345 nextchr = UCHARAT(locinput);
2346 break;
2347 }
2348 /* The target and the pattern have the same utf8ness. */
2349 /* Inline the first character, for speed. */
2350 if (UCHARAT(s) != nextchr)
2351 sayNO;
2352 if (PL_regeol - locinput < ln)
2353 sayNO;
2354 if (ln > 1 && memNE(s, locinput, ln))
2355 sayNO;
2356 locinput += ln;
2357 nextchr = UCHARAT(locinput);
2358 break;
2359 case EXACTFL:
2360 PL_reg_flags |= RF_tainted;
2361 /* FALL THROUGH */
2362 case EXACTF:
2363 s = STRING(scan);
2364 ln = STR_LEN(scan);
2365
2366 if (do_utf8 || UTF) {
2367 /* Either target or the pattern are utf8. */
2368 char *l = locinput;
2369 char *e = PL_regeol;
2370
2371 if (ibcmp_utf8(s, 0, ln, do_utf8,
2372 l, &e, 0, UTF))
2373 sayNO;
2374 locinput = e;
2375 nextchr = UCHARAT(locinput);
2376 break;
2377 }
2378
2379 /* Neither the target and the pattern are utf8. */
2380
2381 /* Inline the first character, for speed. */
2382 if (UCHARAT(s) != nextchr &&
2383 UCHARAT(s) != ((OP(scan) == EXACTF)
2384 ? PL_fold : PL_fold_locale)[nextchr])
2385 sayNO;
2386 if (PL_regeol - locinput < ln)
2387 sayNO;
2388 if (ln > 1 && (OP(scan) == EXACTF
2389 ? ibcmp(s, locinput, ln)
2390 : ibcmp_locale(s, locinput, ln)))
2391 sayNO;
2392 locinput += ln;
2393 nextchr = UCHARAT(locinput);
2394 break;
2395 case ANYOF:
2396 if (do_utf8) {
2397 STRLEN inclasslen = PL_regeol - locinput;
2398
2399 if (!reginclasslen(scan, (U8*)locinput, &inclasslen, do_utf8))
2400 sayNO;
2401 if (locinput >= PL_regeol)
2402 sayNO;
2403 locinput += inclasslen;
2404 nextchr = UCHARAT(locinput);
2405 }
2406 else {
2407 if (nextchr < 0)
2408 nextchr = UCHARAT(locinput);
2409 if (!reginclass(scan, (U8*)locinput, do_utf8))
2410 sayNO;
2411 if (!nextchr && locinput >= PL_regeol)
2412 sayNO;
2413 nextchr = UCHARAT(++locinput);
2414 }
2415 break;
2416 case ALNUML:
2417 PL_reg_flags |= RF_tainted;
2418 /* FALL THROUGH */
2419 case ALNUM:
2420 if (!nextchr)
2421 sayNO;
2422 if (do_utf8) {
2423 LOAD_UTF8_CHARCLASS(alnum,"a");
2424 if (!(OP(scan) == ALNUM
2425 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2426 : isALNUM_LC_utf8((U8*)locinput)))
2427 {
2428 sayNO;
2429 }
2430 locinput += PL_utf8skip[nextchr];
2431 nextchr = UCHARAT(locinput);
2432 break;
2433 }
2434 if (!(OP(scan) == ALNUM
2435 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2436 sayNO;
2437 nextchr = UCHARAT(++locinput);
2438 break;
2439 case NALNUML:
2440 PL_reg_flags |= RF_tainted;
2441 /* FALL THROUGH */
2442 case NALNUM:
2443 if (!nextchr && locinput >= PL_regeol)
2444 sayNO;
2445 if (do_utf8) {
2446 LOAD_UTF8_CHARCLASS(alnum,"a");
2447 if (OP(scan) == NALNUM
2448 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2449 : isALNUM_LC_utf8((U8*)locinput))
2450 {
2451 sayNO;
2452 }
2453 locinput += PL_utf8skip[nextchr];
2454 nextchr = UCHARAT(locinput);
2455 break;
2456 }
2457 if (OP(scan) == NALNUM
2458 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2459 sayNO;
2460 nextchr = UCHARAT(++locinput);
2461 break;
2462 case BOUNDL:
2463 case NBOUNDL:
2464 PL_reg_flags |= RF_tainted;
2465 /* FALL THROUGH */
2466 case BOUND:
2467 case NBOUND:
2468 /* was last char in word? */
2469 if (do_utf8) {
2470 if (locinput == PL_bostr)
2471 ln = '\n';
2472 else {
2473 U8 *r = reghop((U8*)locinput, -1);
2474
2475 ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0);
2476 }
2477 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2478 ln = isALNUM_uni(ln);
2479 LOAD_UTF8_CHARCLASS(alnum,"a");
2480 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2481 }
2482 else {
2483 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2484 n = isALNUM_LC_utf8((U8*)locinput);
2485 }
2486 }
2487 else {
2488 ln = (locinput != PL_bostr) ?
2489 UCHARAT(locinput - 1) : '\n';
2490 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2491 ln = isALNUM(ln);
2492 n = isALNUM(nextchr);
2493 }
2494 else {
2495 ln = isALNUM_LC(ln);
2496 n = isALNUM_LC(nextchr);
2497 }
2498 }
2499 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2500 OP(scan) == BOUNDL))
2501 sayNO;
2502 break;
2503 case SPACEL:
2504 PL_reg_flags |= RF_tainted;
2505 /* FALL THROUGH */
2506 case SPACE:
2507 if (!nextchr)
2508 sayNO;
2509 if (do_utf8) {
2510 if (UTF8_IS_CONTINUED(nextchr)) {
2511 LOAD_UTF8_CHARCLASS(space," ");
2512 if (!(OP(scan) == SPACE
2513 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2514 : isSPACE_LC_utf8((U8*)locinput)))
2515 {
2516 sayNO;
2517 }
2518 locinput += PL_utf8skip[nextchr];
2519 nextchr = UCHARAT(locinput);
2520 break;
2521 }
2522 if (!(OP(scan) == SPACE
2523 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2524 sayNO;
2525 nextchr = UCHARAT(++locinput);
2526 }
2527 else {
2528 if (!(OP(scan) == SPACE
2529 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2530 sayNO;
2531 nextchr = UCHARAT(++locinput);
2532 }
2533 break;
2534 case NSPACEL:
2535 PL_reg_flags |= RF_tainted;
2536 /* FALL THROUGH */
2537 case NSPACE:
2538 if (!nextchr && locinput >= PL_regeol)
2539 sayNO;
2540 if (do_utf8) {
2541 LOAD_UTF8_CHARCLASS(space," ");
2542 if (OP(scan) == NSPACE
2543 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2544 : isSPACE_LC_utf8((U8*)locinput))
2545 {
2546 sayNO;
2547 }
2548 locinput += PL_utf8skip[nextchr];
2549 nextchr = UCHARAT(locinput);
2550 break;
2551 }
2552 if (OP(scan) == NSPACE
2553 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2554 sayNO;
2555 nextchr = UCHARAT(++locinput);
2556 break;
2557 case DIGITL:
2558 PL_reg_flags |= RF_tainted;
2559 /* FALL THROUGH */
2560 case DIGIT:
2561 if (!nextchr)
2562 sayNO;
2563 if (do_utf8) {
2564 LOAD_UTF8_CHARCLASS(digit,"0");
2565 if (!(OP(scan) == DIGIT
2566 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2567 : isDIGIT_LC_utf8((U8*)locinput)))
2568 {
2569 sayNO;
2570 }
2571 locinput += PL_utf8skip[nextchr];
2572 nextchr = UCHARAT(locinput);
2573 break;
2574 }
2575 if (!(OP(scan) == DIGIT
2576 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2577 sayNO;
2578 nextchr = UCHARAT(++locinput);
2579 break;
2580 case NDIGITL:
2581 PL_reg_flags |= RF_tainted;
2582 /* FALL THROUGH */
2583 case NDIGIT:
2584 if (!nextchr && locinput >= PL_regeol)
2585 sayNO;
2586 if (do_utf8) {
2587 LOAD_UTF8_CHARCLASS(digit,"0");
2588 if (OP(scan) == NDIGIT
2589 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2590 : isDIGIT_LC_utf8((U8*)locinput))
2591 {
2592 sayNO;
2593 }
2594 locinput += PL_utf8skip[nextchr];
2595 nextchr = UCHARAT(locinput);
2596 break;
2597 }
2598 if (OP(scan) == NDIGIT
2599 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2600 sayNO;
2601 nextchr = UCHARAT(++locinput);
2602 break;
2603 case CLUMP:
2604 if (locinput >= PL_regeol)
2605 sayNO;
2606 if (do_utf8) {
2607 LOAD_UTF8_CHARCLASS(mark,"~");
2608 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2609 sayNO;
2610 locinput += PL_utf8skip[nextchr];
2611 while (locinput < PL_regeol &&
2612 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2613 locinput += UTF8SKIP(locinput);
2614 if (locinput > PL_regeol)
2615 sayNO;
2616 }
2617 else
2618 locinput++;
2619 nextchr = UCHARAT(locinput);
2620 break;
2621 case REFFL:
2622 PL_reg_flags |= RF_tainted;
2623 /* FALL THROUGH */
2624 case REF:
2625 case REFF:
2626 n = ARG(scan); /* which paren pair */
2627 ln = PL_regstartp[n];
2628 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2629 if (*PL_reglastparen < n || ln == -1)
2630 sayNO; /* Do not match unless seen CLOSEn. */
2631 if (ln == PL_regendp[n])
2632 break;
2633
2634 s = PL_bostr + ln;
2635 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2636 char *l = locinput;
2637 char *e = PL_bostr + PL_regendp[n];
2638 /*
2639 * Note that we can't do the "other character" lookup trick as
2640 * in the 8-bit case (no pun intended) because in Unicode we
2641 * have to map both upper and title case to lower case.
2642 */
2643 if (OP(scan) == REFF) {
2644 STRLEN ulen1, ulen2;
2645 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2646 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
2647 while (s < e) {
2648 if (l >= PL_regeol)
2649 sayNO;
2650 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2651 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
2652 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
2653 sayNO;
2654 s += ulen1;
2655 l += ulen2;
2656 }
2657 }
2658 locinput = l;
2659 nextchr = UCHARAT(locinput);
2660 break;
2661 }
2662
2663 /* Inline the first character, for speed. */
2664 if (UCHARAT(s) != nextchr &&
2665 (OP(scan) == REF ||
2666 (UCHARAT(s) != ((OP(scan) == REFF
2667 ? PL_fold : PL_fold_locale)[nextchr]))))
2668 sayNO;
2669 ln = PL_regendp[n] - ln;
2670 if (locinput + ln > PL_regeol)
2671 sayNO;
2672 if (ln > 1 && (OP(scan) == REF
2673 ? memNE(s, locinput, ln)
2674 : (OP(scan) == REFF
2675 ? ibcmp(s, locinput, ln)
2676 : ibcmp_locale(s, locinput, ln))))
2677 sayNO;
2678 locinput += ln;
2679 nextchr = UCHARAT(locinput);
2680 break;
2681
2682 case NOTHING:
2683 case TAIL:
2684 break;
2685 case BACK:
2686 break;
2687 case EVAL:
2688 {
2689 dSP;
2690 OP_4tree *oop = PL_op;
2691 COP *ocurcop = PL_curcop;
2692 SV **ocurpad = PL_curpad;
2693 SV *ret;
2694
2695 n = ARG(scan);
2696 PL_op = (OP_4tree*)PL_regdata->data[n];
2697 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2698 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2699 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2700
2701 {
2702 SV **before = SP;
2703 CALLRUNOPS(aTHX); /* Scalar context. */
2704 SPAGAIN;
2705 if (SP == before)
2706 ret = Nullsv; /* protect against empty (?{}) blocks. */
2707 else {
2708 ret = POPs;
2709 PUTBACK;
2710 }
2711 }
2712
2713 PL_op = oop;
2714 PL_curpad = ocurpad;
2715 PL_curcop = ocurcop;
2716 if (logical) {
2717 if (logical == 2) { /* Postponed subexpression. */
2718 regexp *re;
2719 MAGIC *mg = Null(MAGIC*);
2720 re_cc_state state;
2721 CHECKPOINT cp, lastcp;
2722
2723 if(SvROK(ret) || SvRMAGICAL(ret)) {
2724 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2725
2726 if(SvMAGICAL(sv))
2727 mg = mg_find(sv, PERL_MAGIC_qr);
2728 }
2729 if (mg) {
2730 re = (regexp *)mg->mg_obj;
2731 (void)ReREFCNT_inc(re);
2732 }
2733 else {
2734 STRLEN len;
2735 char *t = SvPV(ret, len);
2736 PMOP pm;
2737 char *oprecomp = PL_regprecomp;
2738 I32 osize = PL_regsize;
2739 I32 onpar = PL_regnpar;
2740
2741 Zero(&pm, 1, PMOP);
2742 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2743 if (!(SvFLAGS(ret)
2744 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2745 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2746 PERL_MAGIC_qr,0,0);
2747 PL_regprecomp = oprecomp;
2748 PL_regsize = osize;
2749 PL_regnpar = onpar;
2750 }
2751 DEBUG_r(
2752 PerlIO_printf(Perl_debug_log,
2753 "Entering embedded `%s%.60s%s%s'\n",
2754 PL_colors[0],
2755 re->precomp,
2756 PL_colors[1],
2757 (strlen(re->precomp) > 60 ? "..." : ""))
2758 );
2759 state.node = next;
2760 state.prev = PL_reg_call_cc;
2761 state.cc = PL_regcc;
2762 state.re = PL_reg_re;
2763
2764 PL_regcc = 0;
2765
2766 cp = regcppush(0); /* Save *all* the positions. */
2767 REGCP_SET(lastcp);
2768 cache_re(re);
2769 state.ss = PL_savestack_ix;
2770 *PL_reglastparen = 0;
2771 *PL_reglastcloseparen = 0;
2772 PL_reg_call_cc = &state;
2773 PL_reginput = locinput;
2774
2775 /* XXXX This is too dramatic a measure... */
2776 PL_reg_maxiter = 0;
2777
2778 if (regmatch(re->program + 1)) {
2779 /* Even though we succeeded, we need to restore
2780 global variables, since we may be wrapped inside
2781 SUSPEND, thus the match may be not finished yet. */
2782
2783 /* XXXX Do this only if SUSPENDed? */
2784 PL_reg_call_cc = state.prev;
2785 PL_regcc = state.cc;
2786 PL_reg_re = state.re;
2787 cache_re(PL_reg_re);
2788
2789 /* XXXX This is too dramatic a measure... */
2790 PL_reg_maxiter = 0;
2791
2792 /* These are needed even if not SUSPEND. */
2793 ReREFCNT_dec(re);
2794 regcpblow(cp);
2795 sayYES;
2796 }
2797 ReREFCNT_dec(re);
2798 REGCP_UNWIND(lastcp);
2799 regcppop();
2800 PL_reg_call_cc = state.prev;
2801 PL_regcc = state.cc;
2802 PL_reg_re = state.re;
2803 cache_re(PL_reg_re);
2804
2805 /* XXXX This is too dramatic a measure... */
2806 PL_reg_maxiter = 0;
2807
2808 logical = 0;
2809 sayNO;
2810 }
2811 sw = SvTRUE(ret);
2812 logical = 0;
2813 }
2814 else
2815 sv_setsv(save_scalar(PL_replgv), ret);
2816 break;
2817 }
2818 case OPEN:
2819 n = ARG(scan); /* which paren pair */
2820 PL_reg_start_tmp[n] = locinput;
2821 if (n > PL_regsize)
2822 PL_regsize = n;
2823 break;
2824 case CLOSE:
2825 n = ARG(scan); /* which paren pair */
2826 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2827 PL_regendp[n] = locinput - PL_bostr;
2828 if (n > *PL_reglastparen)
2829 *PL_reglastparen = n;
2830 *PL_reglastcloseparen = n;
2831 break;
2832 case GROUPP:
2833 n = ARG(scan); /* which paren pair */
2834 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2835 break;
2836 case IFTHEN:
2837 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2838 if (sw)
2839 next = NEXTOPER(NEXTOPER(scan));
2840 else {
2841 next = scan + ARG(scan);
2842 if (OP(next) == IFTHEN) /* Fake one. */
2843 next = NEXTOPER(NEXTOPER(next));
2844 }
2845 break;
2846 case LOGICAL:
2847 logical = scan->flags;
2848 break;
2849/*******************************************************************
2850 PL_regcc contains infoblock about the innermost (...)* loop, and
2851 a pointer to the next outer infoblock.
2852
2853 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2854
2855 1) After matching X, regnode for CURLYX is processed;
2856
2857 2) This regnode creates infoblock on the stack, and calls
2858 regmatch() recursively with the starting point at WHILEM node;
2859
2860 3) Each hit of WHILEM node tries to match A and Z (in the order
2861 depending on the current iteration, min/max of {min,max} and
2862 greediness). The information about where are nodes for "A"
2863 and "Z" is read from the infoblock, as is info on how many times "A"
2864 was already matched, and greediness.
2865
2866 4) After A matches, the same WHILEM node is hit again.
2867
2868 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2869 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2870 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2871 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2872 of the external loop.
2873
2874 Currently present infoblocks form a tree with a stem formed by PL_curcc
2875 and whatever it mentions via ->next, and additional attached trees
2876 corresponding to temporarily unset infoblocks as in "5" above.
2877
2878 In the following picture infoblocks for outer loop of
2879 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2880 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2881 infoblocks are drawn below the "reset" infoblock.
2882
2883 In fact in the picture below we do not show failed matches for Z and T
2884 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2885 more obvious *why* one needs to *temporary* unset infoblocks.]
2886
2887 Matched REx position InfoBlocks Comment
2888 (Y(A)*?Z)*?T x
2889 Y(A)*?Z)*?T x <- O
2890 Y (A)*?Z)*?T x <- O
2891 Y A)*?Z)*?T x <- O <- I
2892 YA )*?Z)*?T x <- O <- I
2893 YA A)*?Z)*?T x <- O <- I
2894 YAA )*?Z)*?T x <- O <- I
2895 YAA Z)*?T x <- O # Temporary unset I
2896 I
2897
2898 YAAZ Y(A)*?Z)*?T x <- O
2899 I
2900
2901 YAAZY (A)*?Z)*?T x <- O
2902 I
2903
2904 YAAZY A)*?Z)*?T x <- O <- I
2905 I
2906
2907 YAAZYA )*?Z)*?T x <- O <- I
2908 I
2909
2910 YAAZYA Z)*?T x <- O # Temporary unset I
2911 I,I
2912
2913 YAAZYAZ )*?T x <- O
2914 I,I
2915
2916 YAAZYAZ T x # Temporary unset O
2917 O
2918 I,I
2919
2920 YAAZYAZT x
2921 O
2922 I,I
2923 *******************************************************************/
2924 case CURLYX: {
2925 CURCUR cc;
2926 CHECKPOINT cp = PL_savestack_ix;
2927 /* No need to save/restore up to this paren */
2928 I32 parenfloor = scan->flags;
2929
2930 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2931 next += ARG(next);
2932 cc.oldcc = PL_regcc;
2933 PL_regcc = &cc;
2934 /* XXXX Probably it is better to teach regpush to support
2935 parenfloor > PL_regsize... */
2936 if (parenfloor > *PL_reglastparen)
2937 parenfloor = *PL_reglastparen; /* Pessimization... */
2938 cc.parenfloor = parenfloor;
2939 cc.cur = -1;
2940 cc.min = ARG1(scan);
2941 cc.max = ARG2(scan);
2942 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2943 cc.next = next;
2944 cc.minmod = minmod;
2945 cc.lastloc = 0;
2946 PL_reginput = locinput;
2947 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2948 regcpblow(cp);
2949 PL_regcc = cc.oldcc;
2950 saySAME(n);
2951 }
2952 /* NOT REACHED */
2953 case WHILEM: {
2954 /*
2955 * This is really hard to understand, because after we match
2956 * what we're trying to match, we must make sure the rest of
2957 * the REx is going to match for sure, and to do that we have
2958 * to go back UP the parse tree by recursing ever deeper. And
2959 * if it fails, we have to reset our parent's current state
2960 * that we can try again after backing off.
2961 */
2962
2963 CHECKPOINT cp, lastcp;
2964 CURCUR* cc = PL_regcc;
2965 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2966
2967 n = cc->cur + 1; /* how many we know we matched */
2968 PL_reginput = locinput;
2969
2970 DEBUG_r(
2971 PerlIO_printf(Perl_debug_log,
2972 "%*s %ld out of %ld..%ld cc=%lx\n",
2973 REPORT_CODE_OFF+PL_regindent*2, "",
2974 (long)n, (long)cc->min,
2975 (long)cc->max, (long)cc)
2976 );
2977
2978 /* If degenerate scan matches "", assume scan done. */
2979
2980 if (locinput == cc->lastloc && n >= cc->min) {
2981 PL_regcc = cc->oldcc;
2982 if (PL_regcc)
2983 ln = PL_regcc->cur;
2984 DEBUG_r(
2985 PerlIO_printf(Perl_debug_log,
2986 "%*s empty match detected, try continuation...\n",
2987 REPORT_CODE_OFF+PL_regindent*2, "")
2988 );
2989 if (regmatch(cc->next))
2990 sayYES;
2991 if (PL_regcc)
2992 PL_regcc->cur = ln;
2993 PL_regcc = cc;
2994 sayNO;
2995 }
2996
2997 /* First just match a string of min scans. */
2998
2999 if (n < cc->min) {
3000 cc->cur = n;
3001 cc->lastloc = locinput;
3002 if (regmatch(cc->scan))
3003 sayYES;
3004 cc->cur = n - 1;
3005 cc->lastloc = lastloc;
3006 sayNO;
3007 }
3008
3009 if (scan->flags) {
3010 /* Check whether we already were at this position.
3011 Postpone detection until we know the match is not
3012 *that* much linear. */
3013 if (!PL_reg_maxiter) {
3014 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3015 PL_reg_leftiter = PL_reg_maxiter;
3016 }
3017 if (PL_reg_leftiter-- == 0) {
3018 I32 size = (PL_reg_maxiter + 7)/8;
3019 if (PL_reg_poscache) {
3020 if (PL_reg_poscache_size < size) {
3021 Renew(PL_reg_poscache, size, char);
3022 PL_reg_poscache_size = size;
3023 }
3024 Zero(PL_reg_poscache, size, char);
3025 }
3026 else {
3027 PL_reg_poscache_size = size;
3028 Newz(29, PL_reg_poscache, size, char);
3029 }
3030 DEBUG_r(
3031 PerlIO_printf(Perl_debug_log,
3032 "%sDetected a super-linear match, switching on caching%s...\n",
3033 PL_colors[4], PL_colors[5])
3034 );
3035 }
3036 if (PL_reg_leftiter < 0) {
3037 I32 o = locinput - PL_bostr, b;
3038
3039 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3040 b = o % 8;
3041 o /= 8;
3042 if (PL_reg_poscache[o] & (1<<b)) {
3043 DEBUG_r(
3044 PerlIO_printf(Perl_debug_log,
3045 "%*s already tried at this position...\n",
3046 REPORT_CODE_OFF+PL_regindent*2, "")
3047 );
3048 sayNO_SILENT;
3049 }
3050 PL_reg_poscache[o] |= (1<<b);
3051 }
3052 }
3053
3054 /* Prefer next over scan for minimal matching. */
3055
3056 if (cc->minmod) {
3057 PL_regcc = cc->oldcc;
3058 if (PL_regcc)
3059 ln = PL_regcc->cur;
3060 cp = regcppush(cc->parenfloor);
3061 REGCP_SET(lastcp);
3062 if (regmatch(cc->next)) {
3063 regcpblow(cp);
3064 sayYES; /* All done. */
3065 }
3066 REGCP_UNWIND(lastcp);
3067 regcppop();
3068 if (PL_regcc)
3069 PL_regcc->cur = ln;
3070 PL_regcc = cc;
3071
3072 if (n >= cc->max) { /* Maximum greed exceeded? */
3073 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3074 && !(PL_reg_flags & RF_warned)) {
3075 PL_reg_flags |= RF_warned;
3076 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
3077 "Complex regular subexpression recursion",
3078 REG_INFTY - 1);
3079 }
3080 sayNO;
3081 }
3082
3083 DEBUG_r(
3084 PerlIO_printf(Perl_debug_log,
3085 "%*s trying longer...\n",
3086 REPORT_CODE_OFF+PL_regindent*2, "")
3087 );
3088 /* Try scanning more and see if it helps. */
3089 PL_reginput = locinput;
3090 cc->cur = n;
3091 cc->lastloc = locinput;
3092 cp = regcppush(cc->parenfloor);
3093 REGCP_SET(lastcp);
3094 if (regmatch(cc->scan)) {
3095 regcpblow(cp);
3096 sayYES;
3097 }
3098 REGCP_UNWIND(lastcp);
3099 regcppop();
3100 cc->cur = n - 1;
3101 cc->lastloc = lastloc;
3102 sayNO;
3103 }
3104
3105 /* Prefer scan over next for maximal matching. */
3106
3107 if (n < cc->max) { /* More greed allowed? */
3108 cp = regcppush(cc->parenfloor);
3109 cc->cur = n;
3110 cc->lastloc = locinput;
3111 REGCP_SET(lastcp);
3112 if (regmatch(cc->scan)) {
3113 regcpblow(cp);
3114 sayYES;
3115 }
3116 REGCP_UNWIND(lastcp);
3117 regcppop(); /* Restore some previous $<digit>s? */
3118 PL_reginput = locinput;
3119 DEBUG_r(
3120 PerlIO_printf(Perl_debug_log,
3121 "%*s failed, try continuation...\n",
3122 REPORT_CODE_OFF+PL_regindent*2, "")
3123 );
3124 }
3125 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3126 && !(PL_reg_flags & RF_warned)) {
3127 PL_reg_flags |= RF_warned;
3128 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
3129 "Complex regular subexpression recursion",
3130 REG_INFTY - 1);
3131 }
3132
3133 /* Failed deeper matches of scan, so see if this one works. */
3134 PL_regcc = cc->oldcc;
3135 if (PL_regcc)
3136 ln = PL_regcc->cur;
3137 if (regmatch(cc->next))
3138 sayYES;
3139 if (PL_regcc)
3140 PL_regcc->cur = ln;
3141 PL_regcc = cc;
3142 cc->cur = n - 1;
3143 cc->lastloc = lastloc;
3144 sayNO;
3145 }
3146 /* NOT REACHED */
3147 case BRANCHJ:
3148 next = scan + ARG(scan);
3149 if (next == scan)
3150 next = NULL;
3151 inner = NEXTOPER(NEXTOPER(scan));
3152 goto do_branch;
3153 case BRANCH:
3154 inner = NEXTOPER(scan);
3155 do_branch:
3156 {
3157 c1 = OP(scan);
3158 if (OP(next) != c1) /* No choice. */
3159 next = inner; /* Avoid recursion. */
3160 else {
3161 I32 lastparen = *PL_reglastparen;
3162 I32 unwind1;
3163 re_unwind_branch_t *uw;
3164
3165 /* Put unwinding data on stack */
3166 unwind1 = SSNEWt(1,re_unwind_branch_t);
3167 uw = SSPTRt(unwind1,re_unwind_branch_t);
3168 uw->prev = unwind;
3169 unwind = unwind1;
3170 uw->type = ((c1 == BRANCH)
3171 ? RE_UNWIND_BRANCH
3172 : RE_UNWIND_BRANCHJ);
3173 uw->lastparen = lastparen;
3174 uw->next = next;
3175 uw->locinput = locinput;
3176 uw->nextchr = nextchr;
3177#ifdef DEBUGGING
3178 uw->regindent = ++PL_regindent;
3179#endif
3180
3181 REGCP_SET(uw->lastcp);
3182
3183 /* Now go into the first branch */
3184 next = inner;
3185 }
3186 }
3187 break;
3188 case MINMOD:
3189 minmod = 1;
3190 break;
3191 case CURLYM:
3192 {
3193 I32 l = 0;
3194 CHECKPOINT lastcp;
3195
3196 /* We suppose that the next guy does not need
3197 backtracking: in particular, it is of constant length,
3198 and has no parenths to influence future backrefs. */
3199 ln = ARG1(scan); /* min to match */
3200 n = ARG2(scan); /* max to match */
3201 paren = scan->flags;
3202 if (paren) {
3203 if (paren > PL_regsize)
3204 PL_regsize = paren;
3205 if (paren > *PL_reglastparen)
3206 *PL_reglastparen = paren;
3207 }
3208 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3209 if (paren)
3210 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3211 PL_reginput = locinput;
3212 if (minmod) {
3213 minmod = 0;
3214 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3215 sayNO;
3216 /* if we matched something zero-length we don't need to
3217 backtrack - capturing parens are already defined, so
3218 the caveat in the maximal case doesn't apply
3219
3220 XXXX if ln == 0, we can redo this check first time
3221 through the following loop
3222 */
3223 if (ln && l == 0)
3224 n = ln; /* don't backtrack */
3225 locinput = PL_reginput;
3226 if (HAS_TEXT(next) || JUMPABLE(next)) {
3227 regnode *text_node = next;
3228
3229 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3230
3231 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3232 else {
3233 if (PL_regkind[(U8)OP(text_node)] == REF) {
3234 I32 n, ln;
3235 n = ARG(text_node); /* which paren pair */
3236 ln = PL_regstartp[n];
3237 /* assume yes if we haven't seen CLOSEn */
3238 if (
3239 *PL_reglastparen < n ||
3240 ln == -1 ||
3241 ln == PL_regendp[n]
3242 ) {
3243 c1 = c2 = -1000;
3244 goto assume_ok_MM;
3245 }
3246 c1 = *(PL_bostr + ln);
3247 }
3248 else { c1 = (U8)*STRING(text_node); }
3249 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3250 c2 = PL_fold[c1];
3251 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3252 c2 = PL_fold_locale[c1];
3253 else
3254 c2 = c1;
3255 }
3256 }
3257 else
3258 c1 = c2 = -1000;
3259 assume_ok_MM:
3260 REGCP_SET(lastcp);
3261 /* This may be improved if l == 0. */
3262 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3263 /* If it could work, try it. */
3264 if (c1 == -1000 ||
3265 UCHARAT(PL_reginput) == c1 ||
3266 UCHARAT(PL_reginput) == c2)
3267 {
3268 if (paren) {
3269 if (ln) {
3270 PL_regstartp[paren] =
3271 HOPc(PL_reginput, -l) - PL_bostr;
3272 PL_regendp[paren] = PL_reginput - PL_bostr;
3273 }
3274 else
3275 PL_regendp[paren] = -1;
3276 }
3277 if (regmatch(next))
3278 sayYES;
3279 REGCP_UNWIND(lastcp);
3280 }
3281 /* Couldn't or didn't -- move forward. */
3282 PL_reginput = locinput;
3283 if (regrepeat_hard(scan, 1, &l)) {
3284 ln++;
3285 locinput = PL_reginput;
3286 }
3287 else
3288 sayNO;
3289 }
3290 }
3291 else {
3292 n = regrepeat_hard(scan, n, &l);
3293 /* if we matched something zero-length we don't need to
3294 backtrack, unless the minimum count is zero and we
3295 are capturing the result - in that case the capture
3296 being defined or not may affect later execution
3297 */
3298 if (n != 0 && l == 0 && !(paren && ln == 0))
3299 ln = n; /* don't backtrack */
3300 locinput = PL_reginput;
3301 DEBUG_r(
3302 PerlIO_printf(Perl_debug_log,
3303 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3304 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3305 (IV) n, (IV)l)
3306 );
3307 if (n >= ln) {
3308 if (HAS_TEXT(next) || JUMPABLE(next)) {
3309 regnode *text_node = next;
3310
3311 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3312
3313 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3314 else {
3315 if (PL_regkind[(U8)OP(text_node)] == REF) {
3316 I32 n, ln;
3317 n = ARG(text_node); /* which paren pair */
3318 ln = PL_regstartp[n];
3319 /* assume yes if we haven't seen CLOSEn */
3320 if (
3321 *PL_reglastparen < n ||
3322 ln == -1 ||
3323 ln == PL_regendp[n]
3324 ) {
3325 c1 = c2 = -1000;
3326 goto assume_ok_REG;
3327 }
3328 c1 = *(PL_bostr + ln);
3329 }
3330 else { c1 = (U8)*STRING(text_node); }
3331
3332 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3333 c2 = PL_fold[c1];
3334 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3335 c2 = PL_fold_locale[c1];
3336 else
3337 c2 = c1;
3338 }
3339 }
3340 else
3341 c1 = c2 = -1000;
3342 }
3343 assume_ok_REG:
3344 REGCP_SET(lastcp);
3345 while (n >= ln) {
3346 /* If it could work, try it. */
3347 if (c1 == -1000 ||
3348 UCHARAT(PL_reginput) == c1 ||
3349 UCHARAT(PL_reginput) == c2)
3350 {
3351 DEBUG_r(
3352 PerlIO_printf(Perl_debug_log,
3353 "%*s trying tail with n=%"IVdf"...\n",
3354 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3355 );
3356 if (paren) {
3357 if (n) {
3358 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3359 PL_regendp[paren] = PL_reginput - PL_bostr;
3360 }
3361 else
3362 PL_regendp[paren] = -1;
3363 }
3364 if (regmatch(next))
3365 sayYES;
3366 REGCP_UNWIND(lastcp);
3367 }
3368 /* Couldn't or didn't -- back up. */
3369 n--;
3370 locinput = HOPc(locinput, -l);
3371 PL_reginput = locinput;
3372 }
3373 }
3374 sayNO;
3375 break;
3376 }
3377 case CURLYN:
3378 paren = scan->flags; /* Which paren to set */
3379 if (paren > PL_regsize)
3380 PL_regsize = paren;
3381 if (paren > *PL_reglastparen)
3382 *PL_reglastparen = paren;
3383 ln = ARG1(scan); /* min to match */
3384 n = ARG2(scan); /* max to match */
3385 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3386 goto repeat;
3387 case CURLY:
3388 paren = 0;
3389 ln = ARG1(scan); /* min to match */
3390 n = ARG2(scan); /* max to match */
3391 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3392 goto repeat;
3393 case STAR:
3394 ln = 0;
3395 n = REG_INFTY;
3396 scan = NEXTOPER(scan);
3397 paren = 0;
3398 goto repeat;
3399 case PLUS:
3400 ln = 1;
3401 n = REG_INFTY;
3402 scan = NEXTOPER(scan);
3403 paren = 0;
3404 repeat:
3405 /*
3406 * Lookahead to avoid useless match attempts
3407 * when we know what character comes next.
3408 */
3409
3410 /*
3411 * Used to only do .*x and .*?x, but now it allows
3412 * for )'s, ('s and (?{ ... })'s to be in the way
3413 * of the quantifier and the EXACT-like node. -- japhy
3414 */
3415
3416 if (HAS_TEXT(next) || JUMPABLE(next)) {
3417 U8 *s;
3418 regnode *text_node = next;
3419
3420 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3421
3422 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3423 else {
3424 if (PL_regkind[(U8)OP(text_node)] == REF) {
3425 I32 n, ln;
3426 n = ARG(text_node); /* which paren pair */
3427 ln = PL_regstartp[n];
3428 /* assume yes if we haven't seen CLOSEn */
3429 if (
3430 *PL_reglastparen < n ||
3431 ln == -1 ||
3432 ln == PL_regendp[n]
3433 ) {
3434 c1 = c2 = -1000;
3435 goto assume_ok_easy;
3436 }
3437 s = (U8*)PL_bostr + ln;
3438 }
3439 else { s = (U8*)STRING(text_node); }
3440
3441 if (!UTF) {
3442 c2 = c1 = *s;
3443 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3444 c2 = PL_fold[c1];
3445 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3446 c2 = PL_fold_locale[c1];
3447 }
3448 else { /* UTF */
3449 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3450 STRLEN ulen1, ulen2;
3451 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3452 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
3453
3454 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3455 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3456
3457 c1 = utf8_to_uvuni(tmpbuf1, 0);
3458 c2 = utf8_to_uvuni(tmpbuf2, 0);
3459 }
3460 else {
3461 c2 = c1 = utf8_to_uvchr(s, NULL);
3462 }
3463 }
3464 }
3465 }
3466 else
3467 c1 = c2 = -1000;
3468 assume_ok_easy:
3469 PL_reginput = locinput;
3470 if (minmod) {
3471 CHECKPOINT lastcp;
3472 minmod = 0;
3473 if (ln && regrepeat(scan, ln) < ln)
3474 sayNO;
3475 locinput = PL_reginput;
3476 REGCP_SET(lastcp);
3477 if (c1 != -1000) {
3478 char *e; /* Should not check after this */
3479 char *old = locinput;
3480
3481 if (n == REG_INFTY) {
3482 e = PL_regeol - 1;
3483 if (do_utf8)
3484 while (UTF8_IS_CONTINUATION(*(U8*)e))
3485 e--;
3486 }
3487 else if (do_utf8) {
3488 int m = n - ln;
3489 for (e = locinput;
3490 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3491 e += UTF8SKIP(e);
3492 }
3493 else {
3494 e = locinput + n - ln;
3495 if (e >= PL_regeol)
3496 e = PL_regeol - 1;
3497 }
3498 while (1) {
3499 int count;
3500 /* Find place 'next' could work */
3501 if (!do_utf8) {
3502 if (c1 == c2) {
3503 while (locinput <= e &&
3504 UCHARAT(locinput) != c1)
3505 locinput++;
3506 } else {
3507 while (locinput <= e
3508 && UCHARAT(locinput) != c1
3509 && UCHARAT(locinput) != c2)
3510 locinput++;
3511 }
3512 count = locinput - old;
3513 }
3514 else {
3515 STRLEN len;
3516 if (c1 == c2) {
3517 for (count = 0;
3518 locinput <= e &&
3519 utf8_to_uvchr((U8*)locinput, &len) != c1;
3520 count++)
3521 locinput += len;
3522
3523 } else {
3524 for (count = 0; locinput <= e; count++) {
3525 UV c = utf8_to_uvchr((U8*)locinput, &len);
3526 if (c == c1 || c == c2)
3527 break;
3528 locinput += len;
3529 }
3530 }
3531 }
3532 if (locinput > e)
3533 sayNO;
3534 /* PL_reginput == old now */
3535 if (locinput != old) {
3536 ln = 1; /* Did some */
3537 if (regrepeat(scan, count) < count)
3538 sayNO;
3539 }
3540 /* PL_reginput == locinput now */
3541 TRYPAREN(paren, ln, locinput);
3542 PL_reginput = locinput; /* Could be reset... */
3543 REGCP_UNWIND(lastcp);
3544 /* Couldn't or didn't -- move forward. */
3545 old = locinput;
3546 if (do_utf8)
3547 locinput += UTF8SKIP(locinput);
3548 else
3549 locinput++;
3550 }
3551 }
3552 else
3553 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3554 UV c;
3555 if (c1 != -1000) {
3556 if (do_utf8)
3557 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3558 else
3559 c = UCHARAT(PL_reginput);
3560 /* If it could work, try it. */
3561 if (c == c1 || c == c2)
3562 {
3563 TRYPAREN(paren, n, PL_reginput);
3564 REGCP_UNWIND(lastcp);
3565 }
3566 }
3567 /* If it could work, try it. */
3568 else if (c1 == -1000)
3569 {
3570 TRYPAREN(paren, n, PL_reginput);
3571 REGCP_UNWIND(lastcp);
3572 }
3573 /* Couldn't or didn't -- move forward. */
3574 PL_reginput = locinput;
3575 if (regrepeat(scan, 1)) {
3576 ln++;
3577 locinput = PL_reginput;
3578 }
3579 else
3580 sayNO;
3581 }
3582 }
3583 else {
3584 CHECKPOINT lastcp;
3585 n = regrepeat(scan, n);
3586 locinput = PL_reginput;
3587 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3588 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3589 ln = n; /* why back off? */
3590 /* ...because $ and \Z can match before *and* after
3591 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3592 We should back off by one in this case. */
3593 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3594 ln--;
3595 }
3596 REGCP_SET(lastcp);
3597 if (paren) {
3598 UV c = 0;
3599 while (n >= ln) {
3600 if (c1 != -1000) {
3601 if (do_utf8)
3602 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3603 else
3604 c = UCHARAT(PL_reginput);
3605 }
3606 /* If it could work, try it. */
3607 if (c1 == -1000 || c == c1 || c == c2)
3608 {
3609 TRYPAREN(paren, n, PL_reginput);
3610 REGCP_UNWIND(lastcp);
3611 }
3612 /* Couldn't or didn't -- back up. */
3613 n--;
3614 PL_reginput = locinput = HOPc(locinput, -1);
3615 }
3616 }
3617 else {
3618 UV c = 0;
3619 while (n >= ln) {
3620 if (c1 != -1000) {
3621 if (do_utf8)
3622 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3623 else
3624 c = UCHARAT(PL_reginput);
3625 }
3626 /* If it could work, try it. */
3627 if (c1 == -1000 || c == c1 || c == c2)
3628 {
3629 TRYPAREN(paren, n, PL_reginput);
3630 REGCP_UNWIND(lastcp);
3631 }
3632 /* Couldn't or didn't -- back up. */
3633 n--;
3634 PL_reginput = locinput = HOPc(locinput, -1);
3635 }
3636 }
3637 }
3638 sayNO;
3639 break;
3640 case END:
3641 if (PL_reg_call_cc) {
3642 re_cc_state *cur_call_cc = PL_reg_call_cc;
3643 CURCUR *cctmp = PL_regcc;
3644 regexp *re = PL_reg_re;
3645 CHECKPOINT cp, lastcp;
3646
3647 cp = regcppush(0); /* Save *all* the positions. */
3648 REGCP_SET(lastcp);
3649 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3650 the caller. */
3651 PL_reginput = locinput; /* Make position available to
3652 the callcc. */
3653 cache_re(PL_reg_call_cc->re);
3654 PL_regcc = PL_reg_call_cc->cc;
3655 PL_reg_call_cc = PL_reg_call_cc->prev;
3656 if (regmatch(cur_call_cc->node)) {
3657 PL_reg_call_cc = cur_call_cc;
3658 regcpblow(cp);
3659 sayYES;
3660 }
3661 REGCP_UNWIND(lastcp);
3662 regcppop();
3663 PL_reg_call_cc = cur_call_cc;
3664 PL_regcc = cctmp;
3665 PL_reg_re = re;
3666 cache_re(re);
3667
3668 DEBUG_r(
3669 PerlIO_printf(Perl_debug_log,
3670 "%*s continuation failed...\n",
3671 REPORT_CODE_OFF+PL_regindent*2, "")
3672 );
3673 sayNO_SILENT;
3674 }
3675 if (locinput < PL_regtill) {
3676 DEBUG_r(PerlIO_printf(Perl_debug_log,
3677 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3678 PL_colors[4],
3679 (long)(locinput - PL_reg_starttry),
3680 (long)(PL_regtill - PL_reg_starttry),
3681 PL_colors[5]));
3682 sayNO_FINAL; /* Cannot match: too short. */
3683 }
3684 PL_reginput = locinput; /* put where regtry can find it */
3685 sayYES_FINAL; /* Success! */
3686 case SUCCEED:
3687 PL_reginput = locinput; /* put where regtry can find it */
3688 sayYES_LOUD; /* Success! */
3689 case SUSPEND:
3690 n = 1;
3691 PL_reginput = locinput;
3692 goto do_ifmatch;
3693 case UNLESSM:
3694 n = 0;
3695 if (scan->flags) {
3696 s = HOPBACKc(locinput, scan->flags);
3697 if (!s)
3698 goto say_yes;
3699 PL_reginput = s;
3700 }
3701 else
3702 PL_reginput = locinput;
3703 goto do_ifmatch;
3704 case IFMATCH:
3705 n = 1;
3706 if (scan->flags) {
3707 s = HOPBACKc(locinput, scan->flags);
3708 if (!s)
3709 goto say_no;
3710 PL_reginput = s;
3711 }
3712 else
3713 PL_reginput = locinput;
3714
3715 do_ifmatch:
3716 inner = NEXTOPER(NEXTOPER(scan));
3717 if (regmatch(inner) != n) {
3718 say_no:
3719 if (logical) {
3720 logical = 0;
3721 sw = 0;
3722 goto do_longjump;
3723 }
3724 else
3725 sayNO;
3726 }
3727 say_yes:
3728 if (logical) {
3729 logical = 0;
3730 sw = 1;
3731 }
3732 if (OP(scan) == SUSPEND) {
3733 locinput = PL_reginput;
3734 nextchr = UCHARAT(locinput);
3735 }
3736 /* FALL THROUGH. */
3737 case LONGJMP:
3738 do_longjump:
3739 next = scan + ARG(scan);
3740 if (next == scan)
3741 next = NULL;
3742 break;
3743 default:
3744 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3745 PTR2UV(scan), OP(scan));
3746 Perl_croak(aTHX_ "regexp memory corruption");
3747 }
3748 reenter:
3749 scan = next;
3750 }
3751
3752 /*
3753 * We get here only if there's trouble -- normally "case END" is
3754 * the terminating point.
3755 */
3756 Perl_croak(aTHX_ "corrupted regexp pointers");
3757 /*NOTREACHED*/
3758 sayNO;
3759
3760yes_loud:
3761 DEBUG_r(
3762 PerlIO_printf(Perl_debug_log,
3763 "%*s %scould match...%s\n",
3764 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3765 );
3766 goto yes;
3767yes_final:
3768 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3769 PL_colors[4],PL_colors[5]));
3770yes:
3771#ifdef DEBUGGING
3772 PL_regindent--;
3773#endif
3774
3775#if 0 /* Breaks $^R */
3776 if (unwind)
3777 regcpblow(firstcp);
3778#endif
3779 return 1;
3780
3781no:
3782 DEBUG_r(
3783 PerlIO_printf(Perl_debug_log,
3784 "%*s %sfailed...%s\n",
3785 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3786 );
3787 goto do_no;
3788no_final:
3789do_no:
3790 if (unwind) {
3791 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3792
3793 switch (uw->type) {
3794 case RE_UNWIND_BRANCH:
3795 case RE_UNWIND_BRANCHJ:
3796 {
3797 re_unwind_branch_t *uwb = &(uw->branch);
3798 I32 lastparen = uwb->lastparen;
3799
3800 REGCP_UNWIND(uwb->lastcp);
3801 for (n = *PL_reglastparen; n > lastparen; n--)
3802 PL_regendp[n] = -1;
3803 *PL_reglastparen = n;
3804 scan = next = uwb->next;
3805 if ( !scan ||
3806 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3807 ? BRANCH : BRANCHJ) ) { /* Failure */
3808 unwind = uwb->prev;
3809#ifdef DEBUGGING
3810 PL_regindent--;
3811#endif
3812 goto do_no;
3813 }
3814 /* Have more choice yet. Reuse the same uwb. */
3815 /*SUPPRESS 560*/
3816 if ((n = (uwb->type == RE_UNWIND_BRANCH
3817 ? NEXT_OFF(next) : ARG(next))))
3818 next += n;
3819 else
3820 next = NULL; /* XXXX Needn't unwinding in this case... */
3821 uwb->next = next;
3822 next = NEXTOPER(scan);
3823 if (uwb->type == RE_UNWIND_BRANCHJ)
3824 next = NEXTOPER(next);
3825 locinput = uwb->locinput;
3826 nextchr = uwb->nextchr;
3827#ifdef DEBUGGING
3828 PL_regindent = uwb->regindent;
3829#endif
3830
3831 goto reenter;
3832 }
3833 /* NOT REACHED */
3834 default:
3835 Perl_croak(aTHX_ "regexp unwind memory corruption");
3836 }
3837 /* NOT REACHED */
3838 }
3839#ifdef DEBUGGING
3840 PL_regindent--;
3841#endif
3842 return 0;
3843}
3844
3845/*
3846 - regrepeat - repeatedly match something simple, report how many
3847 */
3848/*
3849 * [This routine now assumes that it will only match on things of length 1.
3850 * That was true before, but now we assume scan - reginput is the count,
3851 * rather than incrementing count on every character. [Er, except utf8.]]
3852 */
3853STATIC I32
3854S_regrepeat(pTHX_ regnode *p, I32 max)
3855{
3856 register char *scan;
3857 register I32 c;
3858 register char *loceol = PL_regeol;
3859 register I32 hardcount = 0;
3860 register bool do_utf8 = PL_reg_match_utf8;
3861
3862 scan = PL_reginput;
3863 if (max != REG_INFTY && max < loceol - scan)
3864 loceol = scan + max;
3865 switch (OP(p)) {
3866 case REG_ANY:
3867 if (do_utf8) {
3868 loceol = PL_regeol;
3869 while (scan < loceol && hardcount < max && *scan != '\n') {
3870 scan += UTF8SKIP(scan);
3871 hardcount++;
3872 }
3873 } else {
3874 while (scan < loceol && *scan != '\n')
3875 scan++;
3876 }
3877 break;
3878 case SANY:
3879 scan = loceol;
3880 break;
3881 case CANY:
3882 scan = loceol;
3883 break;
3884 case EXACT: /* length of string is 1 */
3885 c = (U8)*STRING(p);
3886 while (scan < loceol && UCHARAT(scan) == c)
3887 scan++;
3888 break;
3889 case EXACTF: /* length of string is 1 */
3890 c = (U8)*STRING(p);
3891 while (scan < loceol &&
3892 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3893 scan++;
3894 break;
3895 case EXACTFL: /* length of string is 1 */
3896 PL_reg_flags |= RF_tainted;
3897 c = (U8)*STRING(p);
3898 while (scan < loceol &&
3899 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3900 scan++;
3901 break;
3902 case ANYOF:
3903 if (do_utf8) {
3904 loceol = PL_regeol;
3905 while (hardcount < max && scan < loceol &&
3906 reginclass(p, (U8*)scan, do_utf8)) {
3907 scan += UTF8SKIP(scan);
3908 hardcount++;
3909 }
3910 } else {
3911 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
3912 scan++;
3913 }
3914 break;
3915 case ALNUM:
3916 if (do_utf8) {
3917 loceol = PL_regeol;
3918 LOAD_UTF8_CHARCLASS(alnum,"a");
3919 while (hardcount < max && scan < loceol &&
3920 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3921 scan += UTF8SKIP(scan);
3922 hardcount++;
3923 }
3924 } else {
3925 while (scan < loceol && isALNUM(*scan))
3926 scan++;
3927 }
3928 break;
3929 case ALNUML:
3930 PL_reg_flags |= RF_tainted;
3931 if (do_utf8) {
3932 loceol = PL_regeol;
3933 while (hardcount < max && scan < loceol &&
3934 isALNUM_LC_utf8((U8*)scan)) {
3935 scan += UTF8SKIP(scan);
3936 hardcount++;
3937 }
3938 } else {
3939 while (scan < loceol && isALNUM_LC(*scan))
3940 scan++;
3941 }
3942 break;
3943 case NALNUM:
3944 if (do_utf8) {
3945 loceol = PL_regeol;
3946 LOAD_UTF8_CHARCLASS(alnum,"a");
3947 while (hardcount < max && scan < loceol &&
3948 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3949 scan += UTF8SKIP(scan);
3950 hardcount++;
3951 }
3952 } else {
3953 while (scan < loceol && !isALNUM(*scan))
3954 scan++;
3955 }
3956 break;
3957 case NALNUML:
3958 PL_reg_flags |= RF_tainted;
3959 if (do_utf8) {
3960 loceol = PL_regeol;
3961 while (hardcount < max && scan < loceol &&
3962 !isALNUM_LC_utf8((U8*)scan)) {
3963 scan += UTF8SKIP(scan);
3964 hardcount++;
3965 }
3966 } else {
3967 while (scan < loceol && !isALNUM_LC(*scan))
3968 scan++;
3969 }
3970 break;
3971 case SPACE:
3972 if (do_utf8) {
3973 loceol = PL_regeol;
3974 LOAD_UTF8_CHARCLASS(space," ");
3975 while (hardcount < max && scan < loceol &&
3976 (*scan == ' ' ||
3977 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3978 scan += UTF8SKIP(scan);
3979 hardcount++;
3980 }
3981 } else {
3982 while (scan < loceol && isSPACE(*scan))
3983 scan++;
3984 }
3985 break;
3986 case SPACEL:
3987 PL_reg_flags |= RF_tainted;
3988 if (do_utf8) {
3989 loceol = PL_regeol;
3990 while (hardcount < max && scan < loceol &&
3991 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3992 scan += UTF8SKIP(scan);
3993 hardcount++;
3994 }
3995 } else {
3996 while (scan < loceol && isSPACE_LC(*scan))
3997 scan++;
3998 }
3999 break;
4000 case NSPACE:
4001 if (do_utf8) {
4002 loceol = PL_regeol;
4003 LOAD_UTF8_CHARCLASS(space," ");
4004 while (hardcount < max && scan < loceol &&
4005 !(*scan == ' ' ||
4006 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4007 scan += UTF8SKIP(scan);
4008 hardcount++;
4009 }
4010 } else {
4011 while (scan < loceol && !isSPACE(*scan))
4012 scan++;
4013 break;
4014 }
4015 case NSPACEL:
4016 PL_reg_flags |= RF_tainted;
4017 if (do_utf8) {
4018 loceol = PL_regeol;
4019 while (hardcount < max && scan < loceol &&
4020 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4021 scan += UTF8SKIP(scan);
4022 hardcount++;
4023 }
4024 } else {
4025 while (scan < loceol && !isSPACE_LC(*scan))
4026 scan++;
4027 }
4028 break;
4029 case DIGIT:
4030 if (do_utf8) {
4031 loceol = PL_regeol;
4032 LOAD_UTF8_CHARCLASS(digit,"0");
4033 while (hardcount < max && scan < loceol &&
4034 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4035 scan += UTF8SKIP(scan);
4036 hardcount++;
4037 }
4038 } else {
4039 while (scan < loceol && isDIGIT(*scan))
4040 scan++;
4041 }
4042 break;
4043 case NDIGIT:
4044 if (do_utf8) {
4045 loceol = PL_regeol;
4046 LOAD_UTF8_CHARCLASS(digit,"0");
4047 while (hardcount < max && scan < loceol &&
4048 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4049 scan += UTF8SKIP(scan);
4050 hardcount++;
4051 }
4052 } else {
4053 while (scan < loceol && !isDIGIT(*scan))
4054 scan++;
4055 }
4056 break;
4057 default: /* Called on something of 0 width. */
4058 break; /* So match right here or not at all. */
4059 }
4060
4061 if (hardcount)
4062 c = hardcount;
4063 else
4064 c = scan - PL_reginput;
4065 PL_reginput = scan;
4066
4067 DEBUG_r(
4068 {
4069 SV *prop = sv_newmortal();
4070
4071 regprop(prop, p);
4072 PerlIO_printf(Perl_debug_log,
4073 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4074 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
4075 });
4076
4077 return(c);
4078}
4079
4080/*
4081 - regrepeat_hard - repeatedly match something, report total lenth and length
4082 *
4083 * The repeater is supposed to have constant length.
4084 */
4085
4086STATIC I32
4087S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
4088{
4089 register char *scan = Nullch;
4090 register char *start;
4091 register char *loceol = PL_regeol;
4092 I32 l = 0;
4093 I32 count = 0, res = 1;
4094
4095 if (!max)
4096 return 0;
4097
4098 start = PL_reginput;
4099 if (PL_reg_match_utf8) {
4100 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4101 if (!count++) {
4102 l = 0;
4103 while (start < PL_reginput) {
4104 l++;
4105 start += UTF8SKIP(start);
4106 }
4107 *lp = l;
4108 if (l == 0)
4109 return max;
4110 }
4111 if (count == max)
4112 return count;
4113 }
4114 }
4115 else {
4116 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4117 if (!count++) {
4118 *lp = l = PL_reginput - start;
4119 if (max != REG_INFTY && l*max < loceol - scan)
4120 loceol = scan + l*max;
4121 if (l == 0)
4122 return max;
4123 }
4124 }
4125 }
4126 if (!res)
4127 PL_reginput = scan;
4128
4129 return count;
4130}
4131
4132/*
4133- regclass_swash - prepare the utf8 swash
4134*/
4135
4136SV *
4137Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4138{
4139 SV *sw = NULL;
4140 SV *si = NULL;
4141 SV *alt = NULL;
4142
4143 if (PL_regdata && PL_regdata->count) {
4144 U32 n = ARG(node);
4145
4146 if (PL_regdata->what[n] == 's') {
4147 SV *rv = (SV*)PL_regdata->data[n];
4148 AV *av = (AV*)SvRV((SV*)rv);
4149 SV **a, **b;
4150
4151 /* See the end of regcomp.c:S_reglass() for
4152 * documentation of these array elements. */
4153
4154 si = *av_fetch(av, 0, FALSE);
4155 a = av_fetch(av, 1, FALSE);
4156 b = av_fetch(av, 2, FALSE);
4157
4158 if (a)
4159 sw = *a;
4160 else if (si && doinit) {
4161 sw = swash_init("utf8", "", si, 1, 0);
4162 (void)av_store(av, 1, sw);
4163 }
4164 if (b)
4165 alt = *b;
4166 }
4167 }
4168
4169 if (listsvp)
4170 *listsvp = si;
4171 if (altsvp)
4172 *altsvp = alt;
4173
4174 return sw;
4175}
4176
4177/*
4178 - reginclass - determine if a character falls into a character class
4179 */
4180
4181STATIC bool
4182S_reginclasslen(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
4183{
4184 char flags = ANYOF_FLAGS(n);
4185 bool match = FALSE;
4186 UV c;
4187 STRLEN len = 0;
4188 STRLEN plen;
4189
4190 c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
4191
4192 plen = lenp ? *lenp : UNISKIP(c);
4193 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4194 if (lenp)
4195 *lenp = 0;
4196 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4197 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4198 match = TRUE;
4199 }
4200 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
4201 match = TRUE;
4202 if (!match) {
4203 AV *av;
4204 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
4205
4206 if (sw) {
4207 if (swash_fetch(sw, p, do_utf8))
4208 match = TRUE;
4209 else if (flags & ANYOF_FOLD) {
4210 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4211 STRLEN tmplen;
4212
4213 if (!match && lenp && av) {
4214 I32 i;
4215
4216 for (i = 0; i <= av_len(av); i++) {
4217 SV* sv = *av_fetch(av, i, FALSE);
4218 STRLEN len;
4219 char *s = SvPV(sv, len);
4220
4221 if (len <= plen && memEQ(s, p, len)) {
4222 *lenp = len;
4223 match = TRUE;
4224 break;
4225 }
4226 }
4227 }
4228 if (!match) {
4229 to_utf8_fold(p, tmpbuf, &tmplen);
4230 if (swash_fetch(sw, tmpbuf, do_utf8))
4231 match = TRUE;
4232 }
4233 if (!match) {
4234 to_utf8_upper(p, tmpbuf, &tmplen);
4235 if (swash_fetch(sw, tmpbuf, do_utf8))
4236 match = TRUE;
4237 }
4238 }
4239 }
4240 }
4241 if (match && lenp && *lenp == 0)
4242 *lenp = UNISKIP(c);
4243 }
4244 if (!match && c < 256) {
4245 if (ANYOF_BITMAP_TEST(n, c))
4246 match = TRUE;
4247 else if (flags & ANYOF_FOLD) {
4248 I32 f;
4249
4250 if (flags & ANYOF_LOCALE) {
4251 PL_reg_flags |= RF_tainted;
4252 f = PL_fold_locale[c];
4253 }
4254 else
4255 f = PL_fold[c];
4256 if (f != c && ANYOF_BITMAP_TEST(n, f))
4257 match = TRUE;
4258 }
4259
4260 if (!match && (flags & ANYOF_CLASS)) {
4261 PL_reg_flags |= RF_tainted;
4262 if (
4263 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4264 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4265 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4266 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4267 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4268 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4269 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4270 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4271 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4272 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4273 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4274 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4275 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4276 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4277 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4278 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4279 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4280 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4281 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4282 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4283 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4284 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4285 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4286 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4287 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4288 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4289 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4290 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4291 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4292 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4293 ) /* How's that for a conditional? */
4294 {
4295 match = TRUE;
4296 }
4297 }
4298 }
4299
4300 return (flags & ANYOF_INVERT) ? !match : match;
4301}
4302
4303STATIC bool
4304S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
4305{
4306 return S_reginclasslen(aTHX_ n, p, 0, do_utf8);
4307}
4308
4309STATIC U8 *
4310S_reghop(pTHX_ U8 *s, I32 off)
4311{
4312 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4313}
4314
4315STATIC U8 *
4316S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
4317{
4318 if (off >= 0) {
4319 while (off-- && s < lim) {
4320 /* XXX could check well-formedness here */
4321 s += UTF8SKIP(s);
4322 }
4323 }
4324 else {
4325 while (off++) {
4326 if (s > lim) {
4327 s--;
4328 if (UTF8_IS_CONTINUED(*s)) {
4329 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4330 s--;
4331 }
4332 /* XXX could check well-formedness here */
4333 }
4334 }
4335 }
4336 return s;
4337}
4338
4339STATIC U8 *
4340S_reghopmaybe(pTHX_ U8 *s, I32 off)
4341{
4342 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4343}
4344
4345STATIC U8 *
4346S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4347{
4348 if (off >= 0) {
4349 while (off-- && s < lim) {
4350 /* XXX could check well-formedness here */
4351 s += UTF8SKIP(s);
4352 }
4353 if (off >= 0)
4354 return 0;
4355 }
4356 else {
4357 while (off++) {
4358 if (s > lim) {
4359 s--;
4360 if (UTF8_IS_CONTINUED(*s)) {
4361 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4362 s--;
4363 }
4364 /* XXX could check well-formedness here */
4365 }
4366 else
4367 break;
4368 }
4369 if (off <= 0)
4370 return 0;
4371 }
4372 return s;
4373}
4374
4375static void
4376restore_pos(pTHX_ void *arg)
4377{
4378 if (PL_reg_eval_set) {
4379 if (PL_reg_oldsaved) {
4380 PL_reg_re->subbeg = PL_reg_oldsaved;
4381 PL_reg_re->sublen = PL_reg_oldsavedlen;
4382 RX_MATCH_COPIED_on(PL_reg_re);
4383 }
4384 PL_reg_magic->mg_len = PL_reg_oldpos;
4385 PL_reg_eval_set = 0;
4386 PL_curpm = PL_reg_oldcurpm;
4387 }
4388}