This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Configure followed by make minitest
[perl5.git] / regcomp.c
... / ...
CommitLineData
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
34/* We *really* need to overwrite these symbols: */
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
38# define Perl_pregfree my_regfree
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
41# define Perl_regnext my_regnext
42# define Perl_save_re_context my_save_re_context
43# define Perl_reginitcolors my_reginitcolors
44
45# define PERL_NO_GET_CONTEXT
46#endif
47
48/*SUPPRESS 112*/
49/*
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
72 **** Copyright (c) 1991-2002, Larry Wall
73 ****
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
83#define PERL_IN_REGCOMP_C
84#include "perl.h"
85
86#ifndef PERL_IN_XSUB_RE
87# include "INTERN.h"
88#endif
89
90#define REG_COMP_C
91#include "regcomp.h"
92
93#ifdef op
94#undef op
95#endif /* op */
96
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
106#ifndef STATIC
107#define STATIC static
108#endif
109
110typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
128 I32 utf8;
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
135#define RExC_flags16 (pRExC_state->flags16)
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
138#define RExC_start (pRExC_state->start)
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
143#define RExC_emit (pRExC_state->emit)
144#define RExC_emit_start (pRExC_state->emit_start)
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
153#define RExC_utf8 (pRExC_state->utf8)
154
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
158
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
162/*
163 * Flags to be passed up and down.
164 */
165#define WORST 0 /* Worst case. */
166#define HASWIDTH 0x1 /* Known to match non-null strings. */
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
170
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
190 I32 *last_closep;
191 struct regnode_charclass_class *start_class;
192} scan_data_t;
193
194/*
195 * Forward declarations for pregcomp()'s friends.
196 */
197
198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 0, 0, 0, 0, 0, 0};
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
224#define SCF_DO_SUBSTR 0x400
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228#define SCF_WHILEM_VISITED_POS 0x2000
229
230#define UTF RExC_utf8
231#define LOC (RExC_flags16 & PMf_LOCALE)
232#define FOLD (RExC_flags16 & PMf_FOLD)
233
234#define OOB_UNICODE 12345678
235#define OOB_NAMEDCLASS -1
236
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
240
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
251
252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
259#define FAIL(msg) \
260 STMT_START { \
261 char *ellipses = ""; \
262 IV len = RExC_end - RExC_precomp; \
263 \
264 if (!SIZE_ONLY) \
265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
270 ellipses = "..."; \
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
273 msg, (int)len, RExC_precomp, ellipses); \
274 } STMT_END
275
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
281#define FAIL2(pat,msg) \
282 STMT_START { \
283 char *ellipses = ""; \
284 IV len = RExC_end - RExC_precomp; \
285 \
286 if (!SIZE_ONLY) \
287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
288 \
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
292 ellipses = "..."; \
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
295 msg, (int)len, RExC_precomp, ellipses); \
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
304 IV offset = RExC_parse - RExC_precomp; \
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
325 IV offset = RExC_parse - RExC_precomp; \
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
328 (int)offset, RExC_precomp, RExC_precomp + offset); \
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
350 (int)offset, RExC_precomp, RExC_precomp + offset); \
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
368 IV offset = RExC_parse - RExC_precomp; \
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371 (int)offset, RExC_precomp, RExC_precomp + offset); \
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
379 IV offset = RExC_parse - RExC_precomp; \
380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381 (int)offset, RExC_precomp, RExC_precomp + offset); \
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
390 } STMT_END \
391
392#define vWARNdep(loc,m) \
393 STMT_START { \
394 IV offset = loc - RExC_precomp; \
395 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
396 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
397 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
398 } STMT_END \
399
400
401#define vWARN2(loc, m, a1) \
402 STMT_START { \
403 IV offset = loc - RExC_precomp; \
404 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
405 a1, \
406 (int)offset, RExC_precomp, RExC_precomp + offset); \
407 } STMT_END
408
409#define vWARN3(loc, m, a1, a2) \
410 STMT_START { \
411 IV offset = loc - RExC_precomp; \
412 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
413 a1, a2, \
414 (int)offset, RExC_precomp, RExC_precomp + offset); \
415 } STMT_END
416
417#define vWARN4(loc, m, a1, a2, a3) \
418 STMT_START { \
419 IV offset = loc - RExC_precomp; \
420 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
421 a1, a2, a3, \
422 (int)offset, RExC_precomp, RExC_precomp + offset); \
423 } STMT_END
424
425/* used for the parse_flags section for (?c) -- japhy */
426#define vWARN5(loc, m, a1, a2, a3, a4) \
427 STMT_START { \
428 IV offset = loc - RExC_precomp; \
429 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
430 a1, a2, a3, a4, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432 } STMT_END
433
434
435/* Allow for side effects in s */
436#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
437
438/* Macros for recording node offsets. 20001227 mjd@plover.com
439 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
440 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
441 * Element 0 holds the number n.
442 */
443
444#define MJD_OFFSET_DEBUG(x)
445/* #define MJD_OFFSET_DEBUG(x) fprintf x */
446
447
448# define Set_Node_Offset_To_R(node,byte) \
449 STMT_START { \
450 if (! SIZE_ONLY) { \
451 if((node) < 0) { \
452 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 } else { \
454 RExC_offsets[2*(node)-1] = (byte); \
455 } \
456 } \
457 } STMT_END
458
459# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
460# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461
462# define Set_Node_Length_To_R(node,len) \
463 STMT_START { \
464 if (! SIZE_ONLY) { \
465 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 if((node) < 0) { \
467 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 } else { \
469 RExC_offsets[2*(node)] = (len); \
470 } \
471 } \
472 } STMT_END
473
474# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
475# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
476# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477
478/* Get offsets and lengths */
479#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
480#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481
482static void clear_re(pTHX_ void *r);
483
484/* Mark that we cannot extend a found fixed substring at this point.
485 Updata the longest found anchored substring and the longest found
486 floating substrings if needed. */
487
488STATIC void
489S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
490{
491 STRLEN l = CHR_SVLEN(data->last_found);
492 STRLEN old_l = CHR_SVLEN(*data->longest);
493
494 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
495 sv_setsv(*data->longest, data->last_found);
496 if (*data->longest == data->longest_fixed) {
497 data->offset_fixed = l ? data->last_start_min : data->pos_min;
498 if (data->flags & SF_BEFORE_EOL)
499 data->flags
500 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 else
502 data->flags &= ~SF_FIX_BEFORE_EOL;
503 }
504 else {
505 data->offset_float_min = l ? data->last_start_min : data->pos_min;
506 data->offset_float_max = (l
507 ? data->last_start_max
508 : data->pos_min + data->pos_delta);
509 if (data->flags & SF_BEFORE_EOL)
510 data->flags
511 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 else
513 data->flags &= ~SF_FL_BEFORE_EOL;
514 }
515 }
516 SvCUR_set(data->last_found, 0);
517 data->last_end = -1;
518 data->flags &= ~SF_BEFORE_EOL;
519}
520
521/* Can match anything (initialization) */
522STATIC void
523S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524{
525 ANYOF_CLASS_ZERO(cl);
526 ANYOF_BITMAP_SETALL(cl);
527 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528 if (LOC)
529 cl->flags |= ANYOF_LOCALE;
530}
531
532/* Can match anything (initialization) */
533STATIC int
534S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
535{
536 int value;
537
538 for (value = 0; value <= ANYOF_MAX; value += 2)
539 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 return 1;
541 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 return 0;
543 if (!ANYOF_BITMAP_TESTALLSET(cl))
544 return 0;
545 return 1;
546}
547
548/* Can match anything (initialization) */
549STATIC void
550S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551{
552 Zero(cl, 1, struct regnode_charclass_class);
553 cl->type = ANYOF;
554 cl_anything(pRExC_state, cl);
555}
556
557STATIC void
558S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559{
560 Zero(cl, 1, struct regnode_charclass_class);
561 cl->type = ANYOF;
562 cl_anything(pRExC_state, cl);
563 if (LOC)
564 cl->flags |= ANYOF_LOCALE;
565}
566
567/* 'And' a given class with another one. Can create false positives */
568/* We assume that cl is not inverted */
569STATIC void
570S_cl_and(pTHX_ struct regnode_charclass_class *cl,
571 struct regnode_charclass_class *and_with)
572{
573 if (!(and_with->flags & ANYOF_CLASS)
574 && !(cl->flags & ANYOF_CLASS)
575 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
576 && !(and_with->flags & ANYOF_FOLD)
577 && !(cl->flags & ANYOF_FOLD)) {
578 int i;
579
580 if (and_with->flags & ANYOF_INVERT)
581 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
582 cl->bitmap[i] &= ~and_with->bitmap[i];
583 else
584 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
585 cl->bitmap[i] &= and_with->bitmap[i];
586 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
587 if (!(and_with->flags & ANYOF_EOS))
588 cl->flags &= ~ANYOF_EOS;
589
590 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
595 if (!(and_with->flags & ANYOF_UNICODE_ALL))
596 cl->flags &= ~ANYOF_UNICODE_ALL;
597 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
598 cl->flags &= ~ANYOF_UNICODE;
599}
600
601/* 'OR' a given class with another one. Can create false positives */
602/* We assume that cl is not inverted */
603STATIC void
604S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
605{
606 if (or_with->flags & ANYOF_INVERT) {
607 /* We do not use
608 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
609 * <= (B1 | !B2) | (CL1 | !CL2)
610 * which is wasteful if CL2 is small, but we ignore CL2:
611 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
612 * XXXX Can we handle case-fold? Unclear:
613 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
614 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
615 */
616 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
617 && !(or_with->flags & ANYOF_FOLD)
618 && !(cl->flags & ANYOF_FOLD) ) {
619 int i;
620
621 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
622 cl->bitmap[i] |= ~or_with->bitmap[i];
623 } /* XXXX: logic is complicated otherwise */
624 else {
625 cl_anything(pRExC_state, cl);
626 }
627 } else {
628 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
629 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
630 && (!(or_with->flags & ANYOF_FOLD)
631 || (cl->flags & ANYOF_FOLD)) ) {
632 int i;
633
634 /* OR char bitmap and class bitmap separately */
635 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
636 cl->bitmap[i] |= or_with->bitmap[i];
637 if (or_with->flags & ANYOF_CLASS) {
638 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
639 cl->classflags[i] |= or_with->classflags[i];
640 cl->flags |= ANYOF_CLASS;
641 }
642 }
643 else { /* XXXX: logic is complicated, leave it along for a moment. */
644 cl_anything(pRExC_state, cl);
645 }
646 }
647 if (or_with->flags & ANYOF_EOS)
648 cl->flags |= ANYOF_EOS;
649
650 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
651 ARG(cl) != ARG(or_with)) {
652 cl->flags |= ANYOF_UNICODE_ALL;
653 cl->flags &= ~ANYOF_UNICODE;
654 }
655 if (or_with->flags & ANYOF_UNICODE_ALL) {
656 cl->flags |= ANYOF_UNICODE_ALL;
657 cl->flags &= ~ANYOF_UNICODE;
658 }
659}
660
661/*
662 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
663 * These need to be revisited when a newer toolchain becomes available.
664 */
665#if defined(__sparc64__) && defined(__GNUC__)
666# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
667# undef SPARC64_GCC_WORKAROUND
668# define SPARC64_GCC_WORKAROUND 1
669# endif
670#endif
671
672/* REx optimizer. Converts nodes into quickier variants "in place".
673 Finds fixed substrings. */
674
675/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
676 to the position after last scanned or to NULL. */
677
678STATIC I32
679S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
680 /* scanp: Start here (read-write). */
681 /* deltap: Write maxlen-minlen here. */
682 /* last: Stop before this one. */
683{
684 I32 min = 0, pars = 0, code;
685 regnode *scan = *scanp, *next;
686 I32 delta = 0;
687 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
688 int is_inf_internal = 0; /* The studied chunk is infinite */
689 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
690 scan_data_t data_fake;
691 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
692
693 while (scan && OP(scan) != END && scan < last) {
694 /* Peephole optimizer: */
695
696 if (PL_regkind[(U8)OP(scan)] == EXACT) {
697 /* Merge several consecutive EXACTish nodes into one. */
698 regnode *n = regnext(scan);
699 U32 stringok = 1;
700#ifdef DEBUGGING
701 regnode *stop = scan;
702#endif
703
704 next = scan + NODE_SZ_STR(scan);
705 /* Skip NOTHING, merge EXACT*. */
706 while (n &&
707 ( PL_regkind[(U8)OP(n)] == NOTHING ||
708 (stringok && (OP(n) == OP(scan))))
709 && NEXT_OFF(n)
710 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
711 if (OP(n) == TAIL || n > next)
712 stringok = 0;
713 if (PL_regkind[(U8)OP(n)] == NOTHING) {
714 NEXT_OFF(scan) += NEXT_OFF(n);
715 next = n + NODE_STEP_REGNODE;
716#ifdef DEBUGGING
717 if (stringok)
718 stop = n;
719#endif
720 n = regnext(n);
721 }
722 else if (stringok) {
723 int oldl = STR_LEN(scan);
724 regnode *nnext = regnext(n);
725
726 if (oldl + STR_LEN(n) > U8_MAX)
727 break;
728 NEXT_OFF(scan) += NEXT_OFF(n);
729 STR_LEN(scan) += STR_LEN(n);
730 next = n + NODE_SZ_STR(n);
731 /* Now we can overwrite *n : */
732 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
733#ifdef DEBUGGING
734 stop = next - 1;
735#endif
736 n = nnext;
737 }
738 }
739
740 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
741/*
742 Two problematic code points in Unicode casefolding of EXACT nodes:
743
744 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
745 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
746
747 which casefold to
748
749 Unicode UTF-8
750
751 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
752 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
753
754 This means that in case-insensitive matching (or "loose matching",
755 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
756 length of the above casefolded versions) can match a target string
757 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
758 This would rather mess up the minimum length computation.
759
760 What we'll do is to look for the tail four bytes, and then peek
761 at the preceding two bytes to see whether we need to decrease
762 the minimum length by four (six minus two).
763
764 Thanks to the design of UTF-8, there cannot be false matches:
765 A sequence of valid UTF-8 bytes cannot be a subsequence of
766 another valid sequence of UTF-8 bytes.
767
768*/
769 char *s0 = STRING(scan), *s, *t;
770 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
771 char *t0 = "\xcc\x88\xcc\x81";
772 char *t1 = t0 + 3;
773
774 for (s = s0 + 2;
775 s < s2 && (t = ninstr(s, s1, t0, t1));
776 s = t + 4) {
777 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
778 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
779 min -= 4;
780 }
781 }
782
783#ifdef DEBUGGING
784 /* Allow dumping */
785 n = scan + NODE_SZ_STR(scan);
786 while (n <= stop) {
787 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
788 OP(n) = OPTIMIZED;
789 NEXT_OFF(n) = 0;
790 }
791 n++;
792 }
793#endif
794 }
795 /* Follow the next-chain of the current node and optimize
796 away all the NOTHINGs from it. */
797 if (OP(scan) != CURLYX) {
798 int max = (reg_off_by_arg[OP(scan)]
799 ? I32_MAX
800 /* I32 may be smaller than U16 on CRAYs! */
801 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
802 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
803 int noff;
804 regnode *n = scan;
805
806 /* Skip NOTHING and LONGJMP. */
807 while ((n = regnext(n))
808 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
809 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
810 && off + noff < max)
811 off += noff;
812 if (reg_off_by_arg[OP(scan)])
813 ARG(scan) = off;
814 else
815 NEXT_OFF(scan) = off;
816 }
817 /* The principal pseudo-switch. Cannot be a switch, since we
818 look into several different things. */
819 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
820 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
821 next = regnext(scan);
822 code = OP(scan);
823
824 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
825 I32 max1 = 0, min1 = I32_MAX, num = 0;
826 struct regnode_charclass_class accum;
827
828 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
829 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
830 if (flags & SCF_DO_STCLASS)
831 cl_init_zero(pRExC_state, &accum);
832 while (OP(scan) == code) {
833 I32 deltanext, minnext, f = 0, fake;
834 struct regnode_charclass_class this_class;
835
836 num++;
837 data_fake.flags = 0;
838 if (data) {
839 data_fake.whilem_c = data->whilem_c;
840 data_fake.last_closep = data->last_closep;
841 }
842 else
843 data_fake.last_closep = &fake;
844 next = regnext(scan);
845 scan = NEXTOPER(scan);
846 if (code != BRANCH)
847 scan = NEXTOPER(scan);
848 if (flags & SCF_DO_STCLASS) {
849 cl_init(pRExC_state, &this_class);
850 data_fake.start_class = &this_class;
851 f = SCF_DO_STCLASS_AND;
852 }
853 if (flags & SCF_WHILEM_VISITED_POS)
854 f |= SCF_WHILEM_VISITED_POS;
855 /* we suppose the run is continuous, last=next...*/
856 minnext = study_chunk(pRExC_state, &scan, &deltanext,
857 next, &data_fake, f);
858 if (min1 > minnext)
859 min1 = minnext;
860 if (max1 < minnext + deltanext)
861 max1 = minnext + deltanext;
862 if (deltanext == I32_MAX)
863 is_inf = is_inf_internal = 1;
864 scan = next;
865 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
866 pars++;
867 if (data && (data_fake.flags & SF_HAS_EVAL))
868 data->flags |= SF_HAS_EVAL;
869 if (data)
870 data->whilem_c = data_fake.whilem_c;
871 if (flags & SCF_DO_STCLASS)
872 cl_or(pRExC_state, &accum, &this_class);
873 if (code == SUSPEND)
874 break;
875 }
876 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
877 min1 = 0;
878 if (flags & SCF_DO_SUBSTR) {
879 data->pos_min += min1;
880 data->pos_delta += max1 - min1;
881 if (max1 != min1 || is_inf)
882 data->longest = &(data->longest_float);
883 }
884 min += min1;
885 delta += max1 - min1;
886 if (flags & SCF_DO_STCLASS_OR) {
887 cl_or(pRExC_state, data->start_class, &accum);
888 if (min1) {
889 cl_and(data->start_class, &and_with);
890 flags &= ~SCF_DO_STCLASS;
891 }
892 }
893 else if (flags & SCF_DO_STCLASS_AND) {
894 if (min1) {
895 cl_and(data->start_class, &accum);
896 flags &= ~SCF_DO_STCLASS;
897 }
898 else {
899 /* Switch to OR mode: cache the old value of
900 * data->start_class */
901 StructCopy(data->start_class, &and_with,
902 struct regnode_charclass_class);
903 flags &= ~SCF_DO_STCLASS_AND;
904 StructCopy(&accum, data->start_class,
905 struct regnode_charclass_class);
906 flags |= SCF_DO_STCLASS_OR;
907 data->start_class->flags |= ANYOF_EOS;
908 }
909 }
910 }
911 else if (code == BRANCHJ) /* single branch is optimized. */
912 scan = NEXTOPER(NEXTOPER(scan));
913 else /* single branch is optimized. */
914 scan = NEXTOPER(scan);
915 continue;
916 }
917 else if (OP(scan) == EXACT) {
918 I32 l = STR_LEN(scan);
919 UV uc = *((U8*)STRING(scan));
920 if (UTF) {
921 U8 *s = (U8*)STRING(scan);
922 l = utf8_length(s, s + l);
923 uc = utf8_to_uvchr(s, NULL);
924 }
925 min += l;
926 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
927 /* The code below prefers earlier match for fixed
928 offset, later match for variable offset. */
929 if (data->last_end == -1) { /* Update the start info. */
930 data->last_start_min = data->pos_min;
931 data->last_start_max = is_inf
932 ? I32_MAX : data->pos_min + data->pos_delta;
933 }
934 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
935 data->last_end = data->pos_min + l;
936 data->pos_min += l; /* As in the first entry. */
937 data->flags &= ~SF_BEFORE_EOL;
938 }
939 if (flags & SCF_DO_STCLASS_AND) {
940 /* Check whether it is compatible with what we know already! */
941 int compat = 1;
942
943 if (uc >= 0x100 ||
944 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
945 && !ANYOF_BITMAP_TEST(data->start_class, uc)
946 && (!(data->start_class->flags & ANYOF_FOLD)
947 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
948 )
949 compat = 0;
950 ANYOF_CLASS_ZERO(data->start_class);
951 ANYOF_BITMAP_ZERO(data->start_class);
952 if (compat)
953 ANYOF_BITMAP_SET(data->start_class, uc);
954 data->start_class->flags &= ~ANYOF_EOS;
955 if (uc < 0x100)
956 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
957 }
958 else if (flags & SCF_DO_STCLASS_OR) {
959 /* false positive possible if the class is case-folded */
960 if (uc < 0x100)
961 ANYOF_BITMAP_SET(data->start_class, uc);
962 else
963 data->start_class->flags |= ANYOF_UNICODE_ALL;
964 data->start_class->flags &= ~ANYOF_EOS;
965 cl_and(data->start_class, &and_with);
966 }
967 flags &= ~SCF_DO_STCLASS;
968 }
969 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
970 I32 l = STR_LEN(scan);
971 UV uc = *((U8*)STRING(scan));
972
973 /* Search for fixed substrings supports EXACT only. */
974 if (flags & SCF_DO_SUBSTR)
975 scan_commit(pRExC_state, data);
976 if (UTF) {
977 U8 *s = (U8 *)STRING(scan);
978 l = utf8_length(s, s + l);
979 uc = utf8_to_uvchr(s, NULL);
980 }
981 min += l;
982 if (data && (flags & SCF_DO_SUBSTR))
983 data->pos_min += l;
984 if (flags & SCF_DO_STCLASS_AND) {
985 /* Check whether it is compatible with what we know already! */
986 int compat = 1;
987
988 if (uc >= 0x100 ||
989 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
990 && !ANYOF_BITMAP_TEST(data->start_class, uc)
991 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
992 compat = 0;
993 ANYOF_CLASS_ZERO(data->start_class);
994 ANYOF_BITMAP_ZERO(data->start_class);
995 if (compat) {
996 ANYOF_BITMAP_SET(data->start_class, uc);
997 data->start_class->flags &= ~ANYOF_EOS;
998 data->start_class->flags |= ANYOF_FOLD;
999 if (OP(scan) == EXACTFL)
1000 data->start_class->flags |= ANYOF_LOCALE;
1001 }
1002 }
1003 else if (flags & SCF_DO_STCLASS_OR) {
1004 if (data->start_class->flags & ANYOF_FOLD) {
1005 /* false positive possible if the class is case-folded.
1006 Assume that the locale settings are the same... */
1007 if (uc < 0x100)
1008 ANYOF_BITMAP_SET(data->start_class, uc);
1009 data->start_class->flags &= ~ANYOF_EOS;
1010 }
1011 cl_and(data->start_class, &and_with);
1012 }
1013 flags &= ~SCF_DO_STCLASS;
1014 }
1015 else if (strchr((char*)PL_varies,OP(scan))) {
1016 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1017 I32 f = flags, pos_before = 0;
1018 regnode *oscan = scan;
1019 struct regnode_charclass_class this_class;
1020 struct regnode_charclass_class *oclass = NULL;
1021 I32 next_is_eval = 0;
1022
1023 switch (PL_regkind[(U8)OP(scan)]) {
1024 case WHILEM: /* End of (?:...)* . */
1025 scan = NEXTOPER(scan);
1026 goto finish;
1027 case PLUS:
1028 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1029 next = NEXTOPER(scan);
1030 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1031 mincount = 1;
1032 maxcount = REG_INFTY;
1033 next = regnext(scan);
1034 scan = NEXTOPER(scan);
1035 goto do_curly;
1036 }
1037 }
1038 if (flags & SCF_DO_SUBSTR)
1039 data->pos_min++;
1040 min++;
1041 /* Fall through. */
1042 case STAR:
1043 if (flags & SCF_DO_STCLASS) {
1044 mincount = 0;
1045 maxcount = REG_INFTY;
1046 next = regnext(scan);
1047 scan = NEXTOPER(scan);
1048 goto do_curly;
1049 }
1050 is_inf = is_inf_internal = 1;
1051 scan = regnext(scan);
1052 if (flags & SCF_DO_SUBSTR) {
1053 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1054 data->longest = &(data->longest_float);
1055 }
1056 goto optimize_curly_tail;
1057 case CURLY:
1058 mincount = ARG1(scan);
1059 maxcount = ARG2(scan);
1060 next = regnext(scan);
1061 if (OP(scan) == CURLYX) {
1062 I32 lp = (data ? *(data->last_closep) : 0);
1063
1064 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1065 }
1066 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1067 next_is_eval = (OP(scan) == EVAL);
1068 do_curly:
1069 if (flags & SCF_DO_SUBSTR) {
1070 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1071 pos_before = data->pos_min;
1072 }
1073 if (data) {
1074 fl = data->flags;
1075 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1076 if (is_inf)
1077 data->flags |= SF_IS_INF;
1078 }
1079 if (flags & SCF_DO_STCLASS) {
1080 cl_init(pRExC_state, &this_class);
1081 oclass = data->start_class;
1082 data->start_class = &this_class;
1083 f |= SCF_DO_STCLASS_AND;
1084 f &= ~SCF_DO_STCLASS_OR;
1085 }
1086 /* These are the cases when once a subexpression
1087 fails at a particular position, it cannot succeed
1088 even after backtracking at the enclosing scope.
1089
1090 XXXX what if minimal match and we are at the
1091 initial run of {n,m}? */
1092 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1093 f &= ~SCF_WHILEM_VISITED_POS;
1094
1095 /* This will finish on WHILEM, setting scan, or on NULL: */
1096 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1097 mincount == 0
1098 ? (f & ~SCF_DO_SUBSTR) : f);
1099
1100 if (flags & SCF_DO_STCLASS)
1101 data->start_class = oclass;
1102 if (mincount == 0 || minnext == 0) {
1103 if (flags & SCF_DO_STCLASS_OR) {
1104 cl_or(pRExC_state, data->start_class, &this_class);
1105 }
1106 else if (flags & SCF_DO_STCLASS_AND) {
1107 /* Switch to OR mode: cache the old value of
1108 * data->start_class */
1109 StructCopy(data->start_class, &and_with,
1110 struct regnode_charclass_class);
1111 flags &= ~SCF_DO_STCLASS_AND;
1112 StructCopy(&this_class, data->start_class,
1113 struct regnode_charclass_class);
1114 flags |= SCF_DO_STCLASS_OR;
1115 data->start_class->flags |= ANYOF_EOS;
1116 }
1117 } else { /* Non-zero len */
1118 if (flags & SCF_DO_STCLASS_OR) {
1119 cl_or(pRExC_state, data->start_class, &this_class);
1120 cl_and(data->start_class, &and_with);
1121 }
1122 else if (flags & SCF_DO_STCLASS_AND)
1123 cl_and(data->start_class, &this_class);
1124 flags &= ~SCF_DO_STCLASS;
1125 }
1126 if (!scan) /* It was not CURLYX, but CURLY. */
1127 scan = next;
1128 if (ckWARN(WARN_REGEXP)
1129 /* ? quantifier ok, except for (?{ ... }) */
1130 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1131 && (minnext == 0) && (deltanext == 0)
1132 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1133 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1134 {
1135 vWARN(RExC_parse,
1136 "Quantifier unexpected on zero-length expression");
1137 }
1138
1139 min += minnext * mincount;
1140 is_inf_internal |= ((maxcount == REG_INFTY
1141 && (minnext + deltanext) > 0)
1142 || deltanext == I32_MAX);
1143 is_inf |= is_inf_internal;
1144 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1145
1146 /* Try powerful optimization CURLYX => CURLYN. */
1147 if ( OP(oscan) == CURLYX && data
1148 && data->flags & SF_IN_PAR
1149 && !(data->flags & SF_HAS_EVAL)
1150 && !deltanext && minnext == 1 ) {
1151 /* Try to optimize to CURLYN. */
1152 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1153 regnode *nxt1 = nxt;
1154#ifdef DEBUGGING
1155 regnode *nxt2;
1156#endif
1157
1158 /* Skip open. */
1159 nxt = regnext(nxt);
1160 if (!strchr((char*)PL_simple,OP(nxt))
1161 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1162 && STR_LEN(nxt) == 1))
1163 goto nogo;
1164#ifdef DEBUGGING
1165 nxt2 = nxt;
1166#endif
1167 nxt = regnext(nxt);
1168 if (OP(nxt) != CLOSE)
1169 goto nogo;
1170 /* Now we know that nxt2 is the only contents: */
1171 oscan->flags = ARG(nxt);
1172 OP(oscan) = CURLYN;
1173 OP(nxt1) = NOTHING; /* was OPEN. */
1174#ifdef DEBUGGING
1175 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1176 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1177 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1178 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1179 OP(nxt + 1) = OPTIMIZED; /* was count. */
1180 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1181#endif
1182 }
1183 nogo:
1184
1185 /* Try optimization CURLYX => CURLYM. */
1186 if ( OP(oscan) == CURLYX && data
1187 && !(data->flags & SF_HAS_PAR)
1188 && !(data->flags & SF_HAS_EVAL)
1189 && !deltanext ) {
1190 /* XXXX How to optimize if data == 0? */
1191 /* Optimize to a simpler form. */
1192 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1193 regnode *nxt2;
1194
1195 OP(oscan) = CURLYM;
1196 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1197 && (OP(nxt2) != WHILEM))
1198 nxt = nxt2;
1199 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1200 /* Need to optimize away parenths. */
1201 if (data->flags & SF_IN_PAR) {
1202 /* Set the parenth number. */
1203 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1204
1205 if (OP(nxt) != CLOSE)
1206 FAIL("Panic opt close");
1207 oscan->flags = ARG(nxt);
1208 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1209 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1210#ifdef DEBUGGING
1211 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1212 OP(nxt + 1) = OPTIMIZED; /* was count. */
1213 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1214 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1215#endif
1216#if 0
1217 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1218 regnode *nnxt = regnext(nxt1);
1219
1220 if (nnxt == nxt) {
1221 if (reg_off_by_arg[OP(nxt1)])
1222 ARG_SET(nxt1, nxt2 - nxt1);
1223 else if (nxt2 - nxt1 < U16_MAX)
1224 NEXT_OFF(nxt1) = nxt2 - nxt1;
1225 else
1226 OP(nxt) = NOTHING; /* Cannot beautify */
1227 }
1228 nxt1 = nnxt;
1229 }
1230#endif
1231 /* Optimize again: */
1232 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1233 NULL, 0);
1234 }
1235 else
1236 oscan->flags = 0;
1237 }
1238 else if ((OP(oscan) == CURLYX)
1239 && (flags & SCF_WHILEM_VISITED_POS)
1240 /* See the comment on a similar expression above.
1241 However, this time it not a subexpression
1242 we care about, but the expression itself. */
1243 && (maxcount == REG_INFTY)
1244 && data && ++data->whilem_c < 16) {
1245 /* This stays as CURLYX, we can put the count/of pair. */
1246 /* Find WHILEM (as in regexec.c) */
1247 regnode *nxt = oscan + NEXT_OFF(oscan);
1248
1249 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1250 nxt += ARG(nxt);
1251 PREVOPER(nxt)->flags = data->whilem_c
1252 | (RExC_whilem_seen << 4); /* On WHILEM */
1253 }
1254 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1255 pars++;
1256 if (flags & SCF_DO_SUBSTR) {
1257 SV *last_str = Nullsv;
1258 int counted = mincount != 0;
1259
1260 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1261#if defined(SPARC64_GCC_WORKAROUND)
1262 I32 b = 0;
1263 STRLEN l = 0;
1264 char *s = NULL;
1265 I32 old = 0;
1266
1267 if (pos_before >= data->last_start_min)
1268 b = pos_before;
1269 else
1270 b = data->last_start_min;
1271
1272 l = 0;
1273 s = SvPV(data->last_found, l);
1274 old = b - data->last_start_min;
1275
1276#else
1277 I32 b = pos_before >= data->last_start_min
1278 ? pos_before : data->last_start_min;
1279 STRLEN l;
1280 char *s = SvPV(data->last_found, l);
1281 I32 old = b - data->last_start_min;
1282#endif
1283
1284 if (UTF)
1285 old = utf8_hop((U8*)s, old) - (U8*)s;
1286
1287 l -= old;
1288 /* Get the added string: */
1289 last_str = newSVpvn(s + old, l);
1290 if (deltanext == 0 && pos_before == b) {
1291 /* What was added is a constant string */
1292 if (mincount > 1) {
1293 SvGROW(last_str, (mincount * l) + 1);
1294 repeatcpy(SvPVX(last_str) + l,
1295 SvPVX(last_str), l, mincount - 1);
1296 SvCUR(last_str) *= mincount;
1297 /* Add additional parts. */
1298 SvCUR_set(data->last_found,
1299 SvCUR(data->last_found) - l);
1300 sv_catsv(data->last_found, last_str);
1301 data->last_end += l * (mincount - 1);
1302 }
1303 } else {
1304 /* start offset must point into the last copy */
1305 data->last_start_min += minnext * (mincount - 1);
1306 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1307 * (minnext + data->pos_delta);
1308 }
1309 }
1310 /* It is counted once already... */
1311 data->pos_min += minnext * (mincount - counted);
1312 data->pos_delta += - counted * deltanext +
1313 (minnext + deltanext) * maxcount - minnext * mincount;
1314 if (mincount != maxcount) {
1315 /* Cannot extend fixed substrings found inside
1316 the group. */
1317 scan_commit(pRExC_state,data);
1318 if (mincount && last_str) {
1319 sv_setsv(data->last_found, last_str);
1320 data->last_end = data->pos_min;
1321 data->last_start_min =
1322 data->pos_min - CHR_SVLEN(last_str);
1323 data->last_start_max = is_inf
1324 ? I32_MAX
1325 : data->pos_min + data->pos_delta
1326 - CHR_SVLEN(last_str);
1327 }
1328 data->longest = &(data->longest_float);
1329 }
1330 SvREFCNT_dec(last_str);
1331 }
1332 if (data && (fl & SF_HAS_EVAL))
1333 data->flags |= SF_HAS_EVAL;
1334 optimize_curly_tail:
1335 if (OP(oscan) != CURLYX) {
1336 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1337 && NEXT_OFF(next))
1338 NEXT_OFF(oscan) += NEXT_OFF(next);
1339 }
1340 continue;
1341 default: /* REF and CLUMP only? */
1342 if (flags & SCF_DO_SUBSTR) {
1343 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1344 data->longest = &(data->longest_float);
1345 }
1346 is_inf = is_inf_internal = 1;
1347 if (flags & SCF_DO_STCLASS_OR)
1348 cl_anything(pRExC_state, data->start_class);
1349 flags &= ~SCF_DO_STCLASS;
1350 break;
1351 }
1352 }
1353 else if (strchr((char*)PL_simple,OP(scan))) {
1354 int value = 0;
1355
1356 if (flags & SCF_DO_SUBSTR) {
1357 scan_commit(pRExC_state,data);
1358 data->pos_min++;
1359 }
1360 min++;
1361 if (flags & SCF_DO_STCLASS) {
1362 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1363
1364 /* Some of the logic below assumes that switching
1365 locale on will only add false positives. */
1366 switch (PL_regkind[(U8)OP(scan)]) {
1367 case SANY:
1368 default:
1369 do_default:
1370 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1371 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1372 cl_anything(pRExC_state, data->start_class);
1373 break;
1374 case REG_ANY:
1375 if (OP(scan) == SANY)
1376 goto do_default;
1377 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1378 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1379 || (data->start_class->flags & ANYOF_CLASS));
1380 cl_anything(pRExC_state, data->start_class);
1381 }
1382 if (flags & SCF_DO_STCLASS_AND || !value)
1383 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1384 break;
1385 case ANYOF:
1386 if (flags & SCF_DO_STCLASS_AND)
1387 cl_and(data->start_class,
1388 (struct regnode_charclass_class*)scan);
1389 else
1390 cl_or(pRExC_state, data->start_class,
1391 (struct regnode_charclass_class*)scan);
1392 break;
1393 case ALNUM:
1394 if (flags & SCF_DO_STCLASS_AND) {
1395 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1396 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1397 for (value = 0; value < 256; value++)
1398 if (!isALNUM(value))
1399 ANYOF_BITMAP_CLEAR(data->start_class, value);
1400 }
1401 }
1402 else {
1403 if (data->start_class->flags & ANYOF_LOCALE)
1404 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1405 else {
1406 for (value = 0; value < 256; value++)
1407 if (isALNUM(value))
1408 ANYOF_BITMAP_SET(data->start_class, value);
1409 }
1410 }
1411 break;
1412 case ALNUML:
1413 if (flags & SCF_DO_STCLASS_AND) {
1414 if (data->start_class->flags & ANYOF_LOCALE)
1415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1416 }
1417 else {
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1419 data->start_class->flags |= ANYOF_LOCALE;
1420 }
1421 break;
1422 case NALNUM:
1423 if (flags & SCF_DO_STCLASS_AND) {
1424 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1425 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1426 for (value = 0; value < 256; value++)
1427 if (isALNUM(value))
1428 ANYOF_BITMAP_CLEAR(data->start_class, value);
1429 }
1430 }
1431 else {
1432 if (data->start_class->flags & ANYOF_LOCALE)
1433 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1434 else {
1435 for (value = 0; value < 256; value++)
1436 if (!isALNUM(value))
1437 ANYOF_BITMAP_SET(data->start_class, value);
1438 }
1439 }
1440 break;
1441 case NALNUML:
1442 if (flags & SCF_DO_STCLASS_AND) {
1443 if (data->start_class->flags & ANYOF_LOCALE)
1444 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1445 }
1446 else {
1447 data->start_class->flags |= ANYOF_LOCALE;
1448 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1449 }
1450 break;
1451 case SPACE:
1452 if (flags & SCF_DO_STCLASS_AND) {
1453 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1454 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1455 for (value = 0; value < 256; value++)
1456 if (!isSPACE(value))
1457 ANYOF_BITMAP_CLEAR(data->start_class, value);
1458 }
1459 }
1460 else {
1461 if (data->start_class->flags & ANYOF_LOCALE)
1462 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1463 else {
1464 for (value = 0; value < 256; value++)
1465 if (isSPACE(value))
1466 ANYOF_BITMAP_SET(data->start_class, value);
1467 }
1468 }
1469 break;
1470 case SPACEL:
1471 if (flags & SCF_DO_STCLASS_AND) {
1472 if (data->start_class->flags & ANYOF_LOCALE)
1473 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1474 }
1475 else {
1476 data->start_class->flags |= ANYOF_LOCALE;
1477 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1478 }
1479 break;
1480 case NSPACE:
1481 if (flags & SCF_DO_STCLASS_AND) {
1482 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1483 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1484 for (value = 0; value < 256; value++)
1485 if (isSPACE(value))
1486 ANYOF_BITMAP_CLEAR(data->start_class, value);
1487 }
1488 }
1489 else {
1490 if (data->start_class->flags & ANYOF_LOCALE)
1491 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1492 else {
1493 for (value = 0; value < 256; value++)
1494 if (!isSPACE(value))
1495 ANYOF_BITMAP_SET(data->start_class, value);
1496 }
1497 }
1498 break;
1499 case NSPACEL:
1500 if (flags & SCF_DO_STCLASS_AND) {
1501 if (data->start_class->flags & ANYOF_LOCALE) {
1502 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1503 for (value = 0; value < 256; value++)
1504 if (!isSPACE(value))
1505 ANYOF_BITMAP_CLEAR(data->start_class, value);
1506 }
1507 }
1508 else {
1509 data->start_class->flags |= ANYOF_LOCALE;
1510 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1511 }
1512 break;
1513 case DIGIT:
1514 if (flags & SCF_DO_STCLASS_AND) {
1515 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1516 for (value = 0; value < 256; value++)
1517 if (!isDIGIT(value))
1518 ANYOF_BITMAP_CLEAR(data->start_class, value);
1519 }
1520 else {
1521 if (data->start_class->flags & ANYOF_LOCALE)
1522 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1523 else {
1524 for (value = 0; value < 256; value++)
1525 if (isDIGIT(value))
1526 ANYOF_BITMAP_SET(data->start_class, value);
1527 }
1528 }
1529 break;
1530 case NDIGIT:
1531 if (flags & SCF_DO_STCLASS_AND) {
1532 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1533 for (value = 0; value < 256; value++)
1534 if (isDIGIT(value))
1535 ANYOF_BITMAP_CLEAR(data->start_class, value);
1536 }
1537 else {
1538 if (data->start_class->flags & ANYOF_LOCALE)
1539 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1540 else {
1541 for (value = 0; value < 256; value++)
1542 if (!isDIGIT(value))
1543 ANYOF_BITMAP_SET(data->start_class, value);
1544 }
1545 }
1546 break;
1547 }
1548 if (flags & SCF_DO_STCLASS_OR)
1549 cl_and(data->start_class, &and_with);
1550 flags &= ~SCF_DO_STCLASS;
1551 }
1552 }
1553 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1554 data->flags |= (OP(scan) == MEOL
1555 ? SF_BEFORE_MEOL
1556 : SF_BEFORE_SEOL);
1557 }
1558 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1559 /* Lookbehind, or need to calculate parens/evals/stclass: */
1560 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1561 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1562 /* Lookahead/lookbehind */
1563 I32 deltanext, minnext, fake = 0;
1564 regnode *nscan;
1565 struct regnode_charclass_class intrnl;
1566 int f = 0;
1567
1568 data_fake.flags = 0;
1569 if (data) {
1570 data_fake.whilem_c = data->whilem_c;
1571 data_fake.last_closep = data->last_closep;
1572 }
1573 else
1574 data_fake.last_closep = &fake;
1575 if ( flags & SCF_DO_STCLASS && !scan->flags
1576 && OP(scan) == IFMATCH ) { /* Lookahead */
1577 cl_init(pRExC_state, &intrnl);
1578 data_fake.start_class = &intrnl;
1579 f |= SCF_DO_STCLASS_AND;
1580 }
1581 if (flags & SCF_WHILEM_VISITED_POS)
1582 f |= SCF_WHILEM_VISITED_POS;
1583 next = regnext(scan);
1584 nscan = NEXTOPER(NEXTOPER(scan));
1585 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1586 if (scan->flags) {
1587 if (deltanext) {
1588 vFAIL("Variable length lookbehind not implemented");
1589 }
1590 else if (minnext > U8_MAX) {
1591 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1592 }
1593 scan->flags = minnext;
1594 }
1595 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1596 pars++;
1597 if (data && (data_fake.flags & SF_HAS_EVAL))
1598 data->flags |= SF_HAS_EVAL;
1599 if (data)
1600 data->whilem_c = data_fake.whilem_c;
1601 if (f & SCF_DO_STCLASS_AND) {
1602 int was = (data->start_class->flags & ANYOF_EOS);
1603
1604 cl_and(data->start_class, &intrnl);
1605 if (was)
1606 data->start_class->flags |= ANYOF_EOS;
1607 }
1608 }
1609 else if (OP(scan) == OPEN) {
1610 pars++;
1611 }
1612 else if (OP(scan) == CLOSE) {
1613 if (ARG(scan) == is_par) {
1614 next = regnext(scan);
1615
1616 if ( next && (OP(next) != WHILEM) && next < last)
1617 is_par = 0; /* Disable optimization */
1618 }
1619 if (data)
1620 *(data->last_closep) = ARG(scan);
1621 }
1622 else if (OP(scan) == EVAL) {
1623 if (data)
1624 data->flags |= SF_HAS_EVAL;
1625 }
1626 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1627 if (flags & SCF_DO_SUBSTR) {
1628 scan_commit(pRExC_state,data);
1629 data->longest = &(data->longest_float);
1630 }
1631 is_inf = is_inf_internal = 1;
1632 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1633 cl_anything(pRExC_state, data->start_class);
1634 flags &= ~SCF_DO_STCLASS;
1635 }
1636 /* Else: zero-length, ignore. */
1637 scan = regnext(scan);
1638 }
1639
1640 finish:
1641 *scanp = scan;
1642 *deltap = is_inf_internal ? I32_MAX : delta;
1643 if (flags & SCF_DO_SUBSTR && is_inf)
1644 data->pos_delta = I32_MAX - data->pos_min;
1645 if (is_par > U8_MAX)
1646 is_par = 0;
1647 if (is_par && pars==1 && data) {
1648 data->flags |= SF_IN_PAR;
1649 data->flags &= ~SF_HAS_PAR;
1650 }
1651 else if (pars && data) {
1652 data->flags |= SF_HAS_PAR;
1653 data->flags &= ~SF_IN_PAR;
1654 }
1655 if (flags & SCF_DO_STCLASS_OR)
1656 cl_and(data->start_class, &and_with);
1657 return min;
1658}
1659
1660STATIC I32
1661S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1662{
1663 if (RExC_rx->data) {
1664 Renewc(RExC_rx->data,
1665 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1666 char, struct reg_data);
1667 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1668 RExC_rx->data->count += n;
1669 }
1670 else {
1671 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1672 char, struct reg_data);
1673 New(1208, RExC_rx->data->what, n, U8);
1674 RExC_rx->data->count = n;
1675 }
1676 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1677 return RExC_rx->data->count - n;
1678}
1679
1680void
1681Perl_reginitcolors(pTHX)
1682{
1683 int i = 0;
1684 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1685
1686 if (s) {
1687 PL_colors[0] = s = savepv(s);
1688 while (++i < 6) {
1689 s = strchr(s, '\t');
1690 if (s) {
1691 *s = '\0';
1692 PL_colors[i] = ++s;
1693 }
1694 else
1695 PL_colors[i] = s = "";
1696 }
1697 } else {
1698 while (i < 6)
1699 PL_colors[i++] = "";
1700 }
1701 PL_colorset = 1;
1702}
1703
1704
1705/*
1706 - pregcomp - compile a regular expression into internal code
1707 *
1708 * We can't allocate space until we know how big the compiled form will be,
1709 * but we can't compile it (and thus know how big it is) until we've got a
1710 * place to put the code. So we cheat: we compile it twice, once with code
1711 * generation turned off and size counting turned on, and once "for real".
1712 * This also means that we don't allocate space until we are sure that the
1713 * thing really will compile successfully, and we never have to move the
1714 * code and thus invalidate pointers into it. (Note that it has to be in
1715 * one piece because free() must be able to free it all.) [NB: not true in perl]
1716 *
1717 * Beware that the optimization-preparation code in here knows about some
1718 * of the structure of the compiled regexp. [I'll say.]
1719 */
1720regexp *
1721Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1722{
1723 register regexp *r;
1724 regnode *scan;
1725 regnode *first;
1726 I32 flags;
1727 I32 minlen = 0;
1728 I32 sawplus = 0;
1729 I32 sawopen = 0;
1730 scan_data_t data;
1731 RExC_state_t RExC_state;
1732 RExC_state_t *pRExC_state = &RExC_state;
1733
1734 if (exp == NULL)
1735 FAIL("NULL regexp argument");
1736
1737 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1738
1739 RExC_precomp = exp;
1740 DEBUG_r({
1741 if (!PL_colorset) reginitcolors();
1742 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1743 PL_colors[4],PL_colors[5],PL_colors[0],
1744 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1745 });
1746 RExC_flags16 = pm->op_pmflags;
1747 RExC_sawback = 0;
1748
1749 RExC_seen = 0;
1750 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1751 RExC_seen_evals = 0;
1752 RExC_extralen = 0;
1753
1754 /* First pass: determine size, legality. */
1755 RExC_parse = exp;
1756 RExC_start = exp;
1757 RExC_end = xend;
1758 RExC_naughty = 0;
1759 RExC_npar = 1;
1760 RExC_size = 0L;
1761 RExC_emit = &PL_regdummy;
1762 RExC_whilem_seen = 0;
1763#if 0 /* REGC() is (currently) a NOP at the first pass.
1764 * Clever compilers notice this and complain. --jhi */
1765 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1766#endif
1767 if (reg(pRExC_state, 0, &flags) == NULL) {
1768 RExC_precomp = Nullch;
1769 return(NULL);
1770 }
1771 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1772
1773 /* Small enough for pointer-storage convention?
1774 If extralen==0, this means that we will not need long jumps. */
1775 if (RExC_size >= 0x10000L && RExC_extralen)
1776 RExC_size += RExC_extralen;
1777 else
1778 RExC_extralen = 0;
1779 if (RExC_whilem_seen > 15)
1780 RExC_whilem_seen = 15;
1781
1782 /* Allocate space and initialize. */
1783 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1784 char, regexp);
1785 if (r == NULL)
1786 FAIL("Regexp out of space");
1787
1788#ifdef DEBUGGING
1789 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1790 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1791#endif
1792 r->refcnt = 1;
1793 r->prelen = xend - exp;
1794 r->precomp = savepvn(RExC_precomp, r->prelen);
1795 r->subbeg = NULL;
1796 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1797 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1798
1799 r->substrs = 0; /* Useful during FAIL. */
1800 r->startp = 0; /* Useful during FAIL. */
1801 r->endp = 0; /* Useful during FAIL. */
1802
1803 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1804 if (r->offsets) {
1805 r->offsets[0] = RExC_size;
1806 }
1807 DEBUG_r(PerlIO_printf(Perl_debug_log,
1808 "%s %"UVuf" bytes for offset annotations.\n",
1809 r->offsets ? "Got" : "Couldn't get",
1810 (UV)((2*RExC_size+1) * sizeof(U32))));
1811
1812 RExC_rx = r;
1813
1814 /* Second pass: emit code. */
1815 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
1816 RExC_parse = exp;
1817 RExC_end = xend;
1818 RExC_naughty = 0;
1819 RExC_npar = 1;
1820 RExC_emit_start = r->program;
1821 RExC_emit = r->program;
1822 /* Store the count of eval-groups for security checks: */
1823 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1824 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1825 r->data = 0;
1826 if (reg(pRExC_state, 0, &flags) == NULL)
1827 return(NULL);
1828
1829 /* Dig out information for optimizations. */
1830 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1831 pm->op_pmflags = RExC_flags16;
1832 if (UTF)
1833 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1834 r->regstclass = NULL;
1835 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1836 r->reganch |= ROPT_NAUGHTY;
1837 scan = r->program + 1; /* First BRANCH. */
1838
1839 /* XXXX To minimize changes to RE engine we always allocate
1840 3-units-long substrs field. */
1841 Newz(1004, r->substrs, 1, struct reg_substr_data);
1842
1843 StructCopy(&zero_scan_data, &data, scan_data_t);
1844 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1845 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1846 I32 fake;
1847 STRLEN longest_float_length, longest_fixed_length;
1848 struct regnode_charclass_class ch_class;
1849 int stclass_flag;
1850 I32 last_close = 0;
1851
1852 first = scan;
1853 /* Skip introductions and multiplicators >= 1. */
1854 while ((OP(first) == OPEN && (sawopen = 1)) ||
1855 /* An OR of *one* alternative - should not happen now. */
1856 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1857 (OP(first) == PLUS) ||
1858 (OP(first) == MINMOD) ||
1859 /* An {n,m} with n>0 */
1860 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1861 if (OP(first) == PLUS)
1862 sawplus = 1;
1863 else
1864 first += regarglen[(U8)OP(first)];
1865 first = NEXTOPER(first);
1866 }
1867
1868 /* Starting-point info. */
1869 again:
1870 if (PL_regkind[(U8)OP(first)] == EXACT) {
1871 if (OP(first) == EXACT)
1872 ; /* Empty, get anchored substr later. */
1873 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1874 r->regstclass = first;
1875 }
1876 else if (strchr((char*)PL_simple,OP(first)))
1877 r->regstclass = first;
1878 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1879 PL_regkind[(U8)OP(first)] == NBOUND)
1880 r->regstclass = first;
1881 else if (PL_regkind[(U8)OP(first)] == BOL) {
1882 r->reganch |= (OP(first) == MBOL
1883 ? ROPT_ANCH_MBOL
1884 : (OP(first) == SBOL
1885 ? ROPT_ANCH_SBOL
1886 : ROPT_ANCH_BOL));
1887 first = NEXTOPER(first);
1888 goto again;
1889 }
1890 else if (OP(first) == GPOS) {
1891 r->reganch |= ROPT_ANCH_GPOS;
1892 first = NEXTOPER(first);
1893 goto again;
1894 }
1895 else if (!sawopen && (OP(first) == STAR &&
1896 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1897 !(r->reganch & ROPT_ANCH) )
1898 {
1899 /* turn .* into ^.* with an implied $*=1 */
1900 int type = OP(NEXTOPER(first));
1901
1902 if (type == REG_ANY)
1903 type = ROPT_ANCH_MBOL;
1904 else
1905 type = ROPT_ANCH_SBOL;
1906
1907 r->reganch |= type | ROPT_IMPLICIT;
1908 first = NEXTOPER(first);
1909 goto again;
1910 }
1911 if (sawplus && (!sawopen || !RExC_sawback)
1912 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1913 /* x+ must match at the 1st pos of run of x's */
1914 r->reganch |= ROPT_SKIP;
1915
1916 /* Scan is after the zeroth branch, first is atomic matcher. */
1917 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1918 (IV)(first - scan + 1)));
1919 /*
1920 * If there's something expensive in the r.e., find the
1921 * longest literal string that must appear and make it the
1922 * regmust. Resolve ties in favor of later strings, since
1923 * the regstart check works with the beginning of the r.e.
1924 * and avoiding duplication strengthens checking. Not a
1925 * strong reason, but sufficient in the absence of others.
1926 * [Now we resolve ties in favor of the earlier string if
1927 * it happens that c_offset_min has been invalidated, since the
1928 * earlier string may buy us something the later one won't.]
1929 */
1930 minlen = 0;
1931
1932 data.longest_fixed = newSVpvn("",0);
1933 data.longest_float = newSVpvn("",0);
1934 data.last_found = newSVpvn("",0);
1935 data.longest = &(data.longest_fixed);
1936 first = scan;
1937 if (!r->regstclass) {
1938 cl_init(pRExC_state, &ch_class);
1939 data.start_class = &ch_class;
1940 stclass_flag = SCF_DO_STCLASS_AND;
1941 } else /* XXXX Check for BOUND? */
1942 stclass_flag = 0;
1943 data.last_closep = &last_close;
1944
1945 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1946 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1947 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1948 && data.last_start_min == 0 && data.last_end > 0
1949 && !RExC_seen_zerolen
1950 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1951 r->reganch |= ROPT_CHECK_ALL;
1952 scan_commit(pRExC_state, &data);
1953 SvREFCNT_dec(data.last_found);
1954
1955 longest_float_length = CHR_SVLEN(data.longest_float);
1956 if (longest_float_length
1957 || (data.flags & SF_FL_BEFORE_EOL
1958 && (!(data.flags & SF_FL_BEFORE_MEOL)
1959 || (RExC_flags16 & PMf_MULTILINE)))) {
1960 int t;
1961
1962 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1963 && data.offset_fixed == data.offset_float_min
1964 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1965 goto remove_float; /* As in (a)+. */
1966
1967 r->float_substr = data.longest_float;
1968 r->float_min_offset = data.offset_float_min;
1969 r->float_max_offset = data.offset_float_max;
1970 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1971 && (!(data.flags & SF_FL_BEFORE_MEOL)
1972 || (RExC_flags16 & PMf_MULTILINE)));
1973 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
1974 }
1975 else {
1976 remove_float:
1977 r->float_substr = Nullsv;
1978 SvREFCNT_dec(data.longest_float);
1979 longest_float_length = 0;
1980 }
1981
1982 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1983 if (longest_fixed_length
1984 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1985 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1986 || (RExC_flags16 & PMf_MULTILINE)))) {
1987 int t;
1988
1989 r->anchored_substr = data.longest_fixed;
1990 r->anchored_offset = data.offset_fixed;
1991 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1992 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1993 || (RExC_flags16 & PMf_MULTILINE)));
1994 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
1995 }
1996 else {
1997 r->anchored_substr = Nullsv;
1998 SvREFCNT_dec(data.longest_fixed);
1999 longest_fixed_length = 0;
2000 }
2001 if (r->regstclass
2002 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2003 r->regstclass = NULL;
2004 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
2005 && !(data.start_class->flags & ANYOF_EOS)
2006 && !cl_is_anything(data.start_class)) {
2007 I32 n = add_data(pRExC_state, 1, "f");
2008
2009 New(1006, RExC_rx->data->data[n], 1,
2010 struct regnode_charclass_class);
2011 StructCopy(data.start_class,
2012 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2013 struct regnode_charclass_class);
2014 r->regstclass = (regnode*)RExC_rx->data->data[n];
2015 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2016 PL_regdata = r->data; /* for regprop() */
2017 DEBUG_r({ SV *sv = sv_newmortal();
2018 regprop(sv, (regnode*)data.start_class);
2019 PerlIO_printf(Perl_debug_log,
2020 "synthetic stclass `%s'.\n",
2021 SvPVX(sv));});
2022 }
2023
2024 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2025 if (longest_fixed_length > longest_float_length) {
2026 r->check_substr = r->anchored_substr;
2027 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2028 if (r->reganch & ROPT_ANCH_SINGLE)
2029 r->reganch |= ROPT_NOSCAN;
2030 }
2031 else {
2032 r->check_substr = r->float_substr;
2033 r->check_offset_min = data.offset_float_min;
2034 r->check_offset_max = data.offset_float_max;
2035 }
2036 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2037 This should be changed ASAP! */
2038 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
2039 r->reganch |= RE_USE_INTUIT;
2040 if (SvTAIL(r->check_substr))
2041 r->reganch |= RE_INTUIT_TAIL;
2042 }
2043 }
2044 else {
2045 /* Several toplevels. Best we can is to set minlen. */
2046 I32 fake;
2047 struct regnode_charclass_class ch_class;
2048 I32 last_close = 0;
2049
2050 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2051 scan = r->program + 1;
2052 cl_init(pRExC_state, &ch_class);
2053 data.start_class = &ch_class;
2054 data.last_closep = &last_close;
2055 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2056 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
2057 if (!(data.start_class->flags & ANYOF_EOS)
2058 && !cl_is_anything(data.start_class)) {
2059 I32 n = add_data(pRExC_state, 1, "f");
2060
2061 New(1006, RExC_rx->data->data[n], 1,
2062 struct regnode_charclass_class);
2063 StructCopy(data.start_class,
2064 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2065 struct regnode_charclass_class);
2066 r->regstclass = (regnode*)RExC_rx->data->data[n];
2067 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2068 DEBUG_r({ SV* sv = sv_newmortal();
2069 regprop(sv, (regnode*)data.start_class);
2070 PerlIO_printf(Perl_debug_log,
2071 "synthetic stclass `%s'.\n",
2072 SvPVX(sv));});
2073 }
2074 }
2075
2076 r->minlen = minlen;
2077 if (RExC_seen & REG_SEEN_GPOS)
2078 r->reganch |= ROPT_GPOS_SEEN;
2079 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2080 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2081 if (RExC_seen & REG_SEEN_EVAL)
2082 r->reganch |= ROPT_EVAL_SEEN;
2083 if (RExC_seen & REG_SEEN_CANY)
2084 r->reganch |= ROPT_CANY_SEEN;
2085 Newz(1002, r->startp, RExC_npar, I32);
2086 Newz(1002, r->endp, RExC_npar, I32);
2087 PL_regdata = r->data; /* for regprop() */
2088 DEBUG_r(regdump(r));
2089 return(r);
2090}
2091
2092/*
2093 - reg - regular expression, i.e. main body or parenthesized thing
2094 *
2095 * Caller must absorb opening parenthesis.
2096 *
2097 * Combining parenthesis handling with the base level of regular expression
2098 * is a trifle forced, but the need to tie the tails of the branches to what
2099 * follows makes it hard to avoid.
2100 */
2101STATIC regnode *
2102S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2103 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2104{
2105 register regnode *ret; /* Will be the head of the group. */
2106 register regnode *br;
2107 register regnode *lastbr;
2108 register regnode *ender = 0;
2109 register I32 parno = 0;
2110 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
2111
2112 /* for (?g), (?gc), and (?o) warnings; warning
2113 about (?c) will warn about (?g) -- japhy */
2114
2115 I32 wastedflags = 0x00,
2116 wasted_o = 0x01,
2117 wasted_g = 0x02,
2118 wasted_gc = 0x02 | 0x04,
2119 wasted_c = 0x04;
2120
2121 char * parse_start = RExC_parse; /* MJD */
2122 char *oregcomp_parse = RExC_parse;
2123 char c;
2124
2125 *flagp = 0; /* Tentatively. */
2126
2127
2128 /* Make an OPEN node, if parenthesized. */
2129 if (paren) {
2130 if (*RExC_parse == '?') { /* (?...) */
2131 U16 posflags = 0, negflags = 0;
2132 U16 *flagsp = &posflags;
2133 int logical = 0;
2134 char *seqstart = RExC_parse;
2135
2136 RExC_parse++;
2137 paren = *RExC_parse++;
2138 ret = NULL; /* For look-ahead/behind. */
2139 switch (paren) {
2140 case '<': /* (?<...) */
2141 RExC_seen |= REG_SEEN_LOOKBEHIND;
2142 if (*RExC_parse == '!')
2143 paren = ',';
2144 if (*RExC_parse != '=' && *RExC_parse != '!')
2145 goto unknown;
2146 RExC_parse++;
2147 case '=': /* (?=...) */
2148 case '!': /* (?!...) */
2149 RExC_seen_zerolen++;
2150 case ':': /* (?:...) */
2151 case '>': /* (?>...) */
2152 break;
2153 case '$': /* (?$...) */
2154 case '@': /* (?@...) */
2155 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2156 break;
2157 case '#': /* (?#...) */
2158 while (*RExC_parse && *RExC_parse != ')')
2159 RExC_parse++;
2160 if (*RExC_parse != ')')
2161 FAIL("Sequence (?#... not terminated");
2162 nextchar(pRExC_state);
2163 *flagp = TRYAGAIN;
2164 return NULL;
2165 case 'p': /* (?p...) */
2166 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2167 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2168 /* FALL THROUGH*/
2169 case '?': /* (??...) */
2170 logical = 1;
2171 if (*RExC_parse != '{')
2172 goto unknown;
2173 paren = *RExC_parse++;
2174 /* FALL THROUGH */
2175 case '{': /* (?{...}) */
2176 {
2177 I32 count = 1, n = 0;
2178 char c;
2179 char *s = RExC_parse;
2180 SV *sv;
2181 OP_4tree *sop, *rop;
2182
2183 RExC_seen_zerolen++;
2184 RExC_seen |= REG_SEEN_EVAL;
2185 while (count && (c = *RExC_parse)) {
2186 if (c == '\\' && RExC_parse[1])
2187 RExC_parse++;
2188 else if (c == '{')
2189 count++;
2190 else if (c == '}')
2191 count--;
2192 RExC_parse++;
2193 }
2194 if (*RExC_parse != ')')
2195 {
2196 RExC_parse = s;
2197 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2198 }
2199 if (!SIZE_ONLY) {
2200 AV *av;
2201
2202 if (RExC_parse - 1 - s)
2203 sv = newSVpvn(s, RExC_parse - 1 - s);
2204 else
2205 sv = newSVpvn("", 0);
2206
2207 ENTER;
2208 Perl_save_re_context(aTHX);
2209 rop = sv_compile_2op(sv, &sop, "re", &av);
2210 sop->op_private |= OPpREFCOUNTED;
2211 /* re_dup will OpREFCNT_inc */
2212 OpREFCNT_set(sop, 1);
2213 LEAVE;
2214
2215 n = add_data(pRExC_state, 3, "nop");
2216 RExC_rx->data->data[n] = (void*)rop;
2217 RExC_rx->data->data[n+1] = (void*)sop;
2218 RExC_rx->data->data[n+2] = (void*)av;
2219 SvREFCNT_dec(sv);
2220 }
2221 else { /* First pass */
2222 if (PL_reginterp_cnt < ++RExC_seen_evals
2223 && PL_curcop != &PL_compiling)
2224 /* No compiled RE interpolated, has runtime
2225 components ===> unsafe. */
2226 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2227 if (PL_tainting && PL_tainted)
2228 FAIL("Eval-group in insecure regular expression");
2229 }
2230
2231 nextchar(pRExC_state);
2232 if (logical) {
2233 ret = reg_node(pRExC_state, LOGICAL);
2234 if (!SIZE_ONLY)
2235 ret->flags = 2;
2236 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2237 /* deal with the length of this later - MJD */
2238 return ret;
2239 }
2240 return reganode(pRExC_state, EVAL, n);
2241 }
2242 case '(': /* (?(?{...})...) and (?(?=...)...) */
2243 {
2244 if (RExC_parse[0] == '?') { /* (?(?...)) */
2245 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2246 || RExC_parse[1] == '<'
2247 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2248 I32 flag;
2249
2250 ret = reg_node(pRExC_state, LOGICAL);
2251 if (!SIZE_ONLY)
2252 ret->flags = 1;
2253 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2254 goto insert_if;
2255 }
2256 }
2257 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2258 /* (?(1)...) */
2259 parno = atoi(RExC_parse++);
2260
2261 while (isDIGIT(*RExC_parse))
2262 RExC_parse++;
2263 ret = reganode(pRExC_state, GROUPP, parno);
2264
2265 if ((c = *nextchar(pRExC_state)) != ')')
2266 vFAIL("Switch condition not recognized");
2267 insert_if:
2268 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2269 br = regbranch(pRExC_state, &flags, 1);
2270 if (br == NULL)
2271 br = reganode(pRExC_state, LONGJMP, 0);
2272 else
2273 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2274 c = *nextchar(pRExC_state);
2275 if (flags&HASWIDTH)
2276 *flagp |= HASWIDTH;
2277 if (c == '|') {
2278 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2279 regbranch(pRExC_state, &flags, 1);
2280 regtail(pRExC_state, ret, lastbr);
2281 if (flags&HASWIDTH)
2282 *flagp |= HASWIDTH;
2283 c = *nextchar(pRExC_state);
2284 }
2285 else
2286 lastbr = NULL;
2287 if (c != ')')
2288 vFAIL("Switch (?(condition)... contains too many branches");
2289 ender = reg_node(pRExC_state, TAIL);
2290 regtail(pRExC_state, br, ender);
2291 if (lastbr) {
2292 regtail(pRExC_state, lastbr, ender);
2293 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2294 }
2295 else
2296 regtail(pRExC_state, ret, ender);
2297 return ret;
2298 }
2299 else {
2300 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2301 }
2302 }
2303 case 0:
2304 RExC_parse--; /* for vFAIL to print correctly */
2305 vFAIL("Sequence (? incomplete");
2306 break;
2307 default:
2308 --RExC_parse;
2309 parse_flags: /* (?i) */
2310 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2311 /* (?g), (?gc) and (?o) are useless here
2312 and must be globally applied -- japhy */
2313
2314 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2315 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2316 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2317 if (! (wastedflags & wflagbit) ) {
2318 wastedflags |= wflagbit;
2319 vWARN5(
2320 RExC_parse + 1,
2321 "Useless (%s%c) - %suse /%c modifier",
2322 flagsp == &negflags ? "?-" : "?",
2323 *RExC_parse,
2324 flagsp == &negflags ? "don't " : "",
2325 *RExC_parse
2326 );
2327 }
2328 }
2329 }
2330 else if (*RExC_parse == 'c') {
2331 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2332 if (! (wastedflags & wasted_c) ) {
2333 wastedflags |= wasted_gc;
2334 vWARN3(
2335 RExC_parse + 1,
2336 "Useless (%sc) - %suse /gc modifier",
2337 flagsp == &negflags ? "?-" : "?",
2338 flagsp == &negflags ? "don't " : ""
2339 );
2340 }
2341 }
2342 }
2343 else { pmflag(flagsp, *RExC_parse); }
2344
2345 ++RExC_parse;
2346 }
2347 if (*RExC_parse == '-') {
2348 flagsp = &negflags;
2349 wastedflags = 0; /* reset so (?g-c) warns twice */
2350 ++RExC_parse;
2351 goto parse_flags;
2352 }
2353 RExC_flags16 |= posflags;
2354 RExC_flags16 &= ~negflags;
2355 if (*RExC_parse == ':') {
2356 RExC_parse++;
2357 paren = ':';
2358 break;
2359 }
2360 unknown:
2361 if (*RExC_parse != ')') {
2362 RExC_parse++;
2363 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2364 }
2365 nextchar(pRExC_state);
2366 *flagp = TRYAGAIN;
2367 return NULL;
2368 }
2369 }
2370 else { /* (...) */
2371 parno = RExC_npar;
2372 RExC_npar++;
2373 ret = reganode(pRExC_state, OPEN, parno);
2374 Set_Node_Length(ret, 1); /* MJD */
2375 Set_Node_Offset(ret, RExC_parse); /* MJD */
2376 open = 1;
2377 }
2378 }
2379 else /* ! paren */
2380 ret = NULL;
2381
2382 /* Pick up the branches, linking them together. */
2383 parse_start = RExC_parse; /* MJD */
2384 br = regbranch(pRExC_state, &flags, 1);
2385 /* branch_len = (paren != 0); */
2386
2387 if (br == NULL)
2388 return(NULL);
2389 if (*RExC_parse == '|') {
2390 if (!SIZE_ONLY && RExC_extralen) {
2391 reginsert(pRExC_state, BRANCHJ, br);
2392 }
2393 else { /* MJD */
2394 reginsert(pRExC_state, BRANCH, br);
2395 Set_Node_Length(br, paren != 0);
2396 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2397 }
2398 have_branch = 1;
2399 if (SIZE_ONLY)
2400 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2401 }
2402 else if (paren == ':') {
2403 *flagp |= flags&SIMPLE;
2404 }
2405 if (open) { /* Starts with OPEN. */
2406 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2407 }
2408 else if (paren != '?') /* Not Conditional */
2409 ret = br;
2410 *flagp |= flags & (SPSTART | HASWIDTH);
2411 lastbr = br;
2412 while (*RExC_parse == '|') {
2413 if (!SIZE_ONLY && RExC_extralen) {
2414 ender = reganode(pRExC_state, LONGJMP,0);
2415 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2416 }
2417 if (SIZE_ONLY)
2418 RExC_extralen += 2; /* Account for LONGJMP. */
2419 nextchar(pRExC_state);
2420 br = regbranch(pRExC_state, &flags, 0);
2421
2422 if (br == NULL)
2423 return(NULL);
2424 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2425 lastbr = br;
2426 if (flags&HASWIDTH)
2427 *flagp |= HASWIDTH;
2428 *flagp |= flags&SPSTART;
2429 }
2430
2431 if (have_branch || paren != ':') {
2432 /* Make a closing node, and hook it on the end. */
2433 switch (paren) {
2434 case ':':
2435 ender = reg_node(pRExC_state, TAIL);
2436 break;
2437 case 1:
2438 ender = reganode(pRExC_state, CLOSE, parno);
2439 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2440 Set_Node_Length(ender,1); /* MJD */
2441 break;
2442 case '<':
2443 case ',':
2444 case '=':
2445 case '!':
2446 *flagp &= ~HASWIDTH;
2447 /* FALL THROUGH */
2448 case '>':
2449 ender = reg_node(pRExC_state, SUCCEED);
2450 break;
2451 case 0:
2452 ender = reg_node(pRExC_state, END);
2453 break;
2454 }
2455 regtail(pRExC_state, lastbr, ender);
2456
2457 if (have_branch) {
2458 /* Hook the tails of the branches to the closing node. */
2459 for (br = ret; br != NULL; br = regnext(br)) {
2460 regoptail(pRExC_state, br, ender);
2461 }
2462 }
2463 }
2464
2465 {
2466 char *p;
2467 static char parens[] = "=!<,>";
2468
2469 if (paren && (p = strchr(parens, paren))) {
2470 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2471 int flag = (p - parens) > 1;
2472
2473 if (paren == '>')
2474 node = SUSPEND, flag = 0;
2475 reginsert(pRExC_state, node,ret);
2476 ret->flags = flag;
2477 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2478 }
2479 }
2480
2481 /* Check for proper termination. */
2482 if (paren) {
2483 RExC_flags16 = oregflags;
2484 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2485 RExC_parse = oregcomp_parse;
2486 vFAIL("Unmatched (");
2487 }
2488 }
2489 else if (!paren && RExC_parse < RExC_end) {
2490 if (*RExC_parse == ')') {
2491 RExC_parse++;
2492 vFAIL("Unmatched )");
2493 }
2494 else
2495 FAIL("Junk on end of regexp"); /* "Can't happen". */
2496 /* NOTREACHED */
2497 }
2498
2499 return(ret);
2500}
2501
2502/*
2503 - regbranch - one alternative of an | operator
2504 *
2505 * Implements the concatenation operator.
2506 */
2507STATIC regnode *
2508S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2509{
2510 register regnode *ret;
2511 register regnode *chain = NULL;
2512 register regnode *latest;
2513 I32 flags = 0, c = 0;
2514
2515 if (first)
2516 ret = NULL;
2517 else {
2518 if (!SIZE_ONLY && RExC_extralen)
2519 ret = reganode(pRExC_state, BRANCHJ,0);
2520 else {
2521 ret = reg_node(pRExC_state, BRANCH);
2522 Set_Node_Length(ret, 1);
2523 }
2524 }
2525
2526 if (!first && SIZE_ONLY)
2527 RExC_extralen += 1; /* BRANCHJ */
2528
2529 *flagp = WORST; /* Tentatively. */
2530
2531 RExC_parse--;
2532 nextchar(pRExC_state);
2533 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2534 flags &= ~TRYAGAIN;
2535 latest = regpiece(pRExC_state, &flags);
2536 if (latest == NULL) {
2537 if (flags & TRYAGAIN)
2538 continue;
2539 return(NULL);
2540 }
2541 else if (ret == NULL)
2542 ret = latest;
2543 *flagp |= flags&HASWIDTH;
2544 if (chain == NULL) /* First piece. */
2545 *flagp |= flags&SPSTART;
2546 else {
2547 RExC_naughty++;
2548 regtail(pRExC_state, chain, latest);
2549 }
2550 chain = latest;
2551 c++;
2552 }
2553 if (chain == NULL) { /* Loop ran zero times. */
2554 chain = reg_node(pRExC_state, NOTHING);
2555 if (ret == NULL)
2556 ret = chain;
2557 }
2558 if (c == 1) {
2559 *flagp |= flags&SIMPLE;
2560 }
2561
2562 return(ret);
2563}
2564
2565/*
2566 - regpiece - something followed by possible [*+?]
2567 *
2568 * Note that the branching code sequences used for ? and the general cases
2569 * of * and + are somewhat optimized: they use the same NOTHING node as
2570 * both the endmarker for their branch list and the body of the last branch.
2571 * It might seem that this node could be dispensed with entirely, but the
2572 * endmarker role is not redundant.
2573 */
2574STATIC regnode *
2575S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2576{
2577 register regnode *ret;
2578 register char op;
2579 register char *next;
2580 I32 flags;
2581 char *origparse = RExC_parse;
2582 char *maxpos;
2583 I32 min;
2584 I32 max = REG_INFTY;
2585 char *parse_start;
2586
2587 ret = regatom(pRExC_state, &flags);
2588 if (ret == NULL) {
2589 if (flags & TRYAGAIN)
2590 *flagp |= TRYAGAIN;
2591 return(NULL);
2592 }
2593
2594 op = *RExC_parse;
2595
2596 if (op == '{' && regcurly(RExC_parse)) {
2597 parse_start = RExC_parse; /* MJD */
2598 next = RExC_parse + 1;
2599 maxpos = Nullch;
2600 while (isDIGIT(*next) || *next == ',') {
2601 if (*next == ',') {
2602 if (maxpos)
2603 break;
2604 else
2605 maxpos = next;
2606 }
2607 next++;
2608 }
2609 if (*next == '}') { /* got one */
2610 if (!maxpos)
2611 maxpos = next;
2612 RExC_parse++;
2613 min = atoi(RExC_parse);
2614 if (*maxpos == ',')
2615 maxpos++;
2616 else
2617 maxpos = RExC_parse;
2618 max = atoi(maxpos);
2619 if (!max && *maxpos != '0')
2620 max = REG_INFTY; /* meaning "infinity" */
2621 else if (max >= REG_INFTY)
2622 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2623 RExC_parse = next;
2624 nextchar(pRExC_state);
2625
2626 do_curly:
2627 if ((flags&SIMPLE)) {
2628 RExC_naughty += 2 + RExC_naughty / 2;
2629 reginsert(pRExC_state, CURLY, ret);
2630 Set_Node_Offset(ret, parse_start+1); /* MJD */
2631 Set_Node_Cur_Length(ret);
2632 }
2633 else {
2634 regnode *w = reg_node(pRExC_state, WHILEM);
2635
2636 w->flags = 0;
2637 regtail(pRExC_state, ret, w);
2638 if (!SIZE_ONLY && RExC_extralen) {
2639 reginsert(pRExC_state, LONGJMP,ret);
2640 reginsert(pRExC_state, NOTHING,ret);
2641 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2642 }
2643 reginsert(pRExC_state, CURLYX,ret);
2644 /* MJD hk */
2645 Set_Node_Offset(ret, parse_start+1);
2646 Set_Node_Length(ret,
2647 op == '{' ? (RExC_parse - parse_start) : 1);
2648
2649 if (!SIZE_ONLY && RExC_extralen)
2650 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2651 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2652 if (SIZE_ONLY)
2653 RExC_whilem_seen++, RExC_extralen += 3;
2654 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2655 }
2656 ret->flags = 0;
2657
2658 if (min > 0)
2659 *flagp = WORST;
2660 if (max > 0)
2661 *flagp |= HASWIDTH;
2662 if (max && max < min)
2663 vFAIL("Can't do {n,m} with n > m");
2664 if (!SIZE_ONLY) {
2665 ARG1_SET(ret, min);
2666 ARG2_SET(ret, max);
2667 }
2668
2669 goto nest_check;
2670 }
2671 }
2672
2673 if (!ISMULT1(op)) {
2674 *flagp = flags;
2675 return(ret);
2676 }
2677
2678#if 0 /* Now runtime fix should be reliable. */
2679
2680 /* if this is reinstated, don't forget to put this back into perldiag:
2681
2682 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2683
2684 (F) The part of the regexp subject to either the * or + quantifier
2685 could match an empty string. The {#} shows in the regular
2686 expression about where the problem was discovered.
2687
2688 */
2689
2690 if (!(flags&HASWIDTH) && op != '?')
2691 vFAIL("Regexp *+ operand could be empty");
2692#endif
2693
2694 parse_start = RExC_parse;
2695 nextchar(pRExC_state);
2696
2697 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2698
2699 if (op == '*' && (flags&SIMPLE)) {
2700 reginsert(pRExC_state, STAR, ret);
2701 ret->flags = 0;
2702 RExC_naughty += 4;
2703 }
2704 else if (op == '*') {
2705 min = 0;
2706 goto do_curly;
2707 }
2708 else if (op == '+' && (flags&SIMPLE)) {
2709 reginsert(pRExC_state, PLUS, ret);
2710 ret->flags = 0;
2711 RExC_naughty += 3;
2712 }
2713 else if (op == '+') {
2714 min = 1;
2715 goto do_curly;
2716 }
2717 else if (op == '?') {
2718 min = 0; max = 1;
2719 goto do_curly;
2720 }
2721 nest_check:
2722 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2723 vWARN3(RExC_parse,
2724 "%.*s matches null string many times",
2725 RExC_parse - origparse,
2726 origparse);
2727 }
2728
2729 if (*RExC_parse == '?') {
2730 nextchar(pRExC_state);
2731 reginsert(pRExC_state, MINMOD, ret);
2732 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2733 }
2734 if (ISMULT2(RExC_parse)) {
2735 RExC_parse++;
2736 vFAIL("Nested quantifiers");
2737 }
2738
2739 return(ret);
2740}
2741
2742/*
2743 - regatom - the lowest level
2744 *
2745 * Optimization: gobbles an entire sequence of ordinary characters so that
2746 * it can turn them into a single node, which is smaller to store and
2747 * faster to run. Backslashed characters are exceptions, each becoming a
2748 * separate node; the code is simpler that way and it's not worth fixing.
2749 *
2750 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2751STATIC regnode *
2752S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2753{
2754 register regnode *ret = 0;
2755 I32 flags;
2756 char *parse_start = 0;
2757
2758 *flagp = WORST; /* Tentatively. */
2759
2760tryagain:
2761 switch (*RExC_parse) {
2762 case '^':
2763 RExC_seen_zerolen++;
2764 nextchar(pRExC_state);
2765 if (RExC_flags16 & PMf_MULTILINE)
2766 ret = reg_node(pRExC_state, MBOL);
2767 else if (RExC_flags16 & PMf_SINGLELINE)
2768 ret = reg_node(pRExC_state, SBOL);
2769 else
2770 ret = reg_node(pRExC_state, BOL);
2771 Set_Node_Length(ret, 1); /* MJD */
2772 break;
2773 case '$':
2774 nextchar(pRExC_state);
2775 if (*RExC_parse)
2776 RExC_seen_zerolen++;
2777 if (RExC_flags16 & PMf_MULTILINE)
2778 ret = reg_node(pRExC_state, MEOL);
2779 else if (RExC_flags16 & PMf_SINGLELINE)
2780 ret = reg_node(pRExC_state, SEOL);
2781 else
2782 ret = reg_node(pRExC_state, EOL);
2783 Set_Node_Length(ret, 1); /* MJD */
2784 break;
2785 case '.':
2786 nextchar(pRExC_state);
2787 if (RExC_flags16 & PMf_SINGLELINE)
2788 ret = reg_node(pRExC_state, SANY);
2789 else
2790 ret = reg_node(pRExC_state, REG_ANY);
2791 *flagp |= HASWIDTH|SIMPLE;
2792 RExC_naughty++;
2793 Set_Node_Length(ret, 1); /* MJD */
2794 break;
2795 case '[':
2796 {
2797 char *oregcomp_parse = ++RExC_parse;
2798 ret = regclass(pRExC_state);
2799 if (*RExC_parse != ']') {
2800 RExC_parse = oregcomp_parse;
2801 vFAIL("Unmatched [");
2802 }
2803 nextchar(pRExC_state);
2804 *flagp |= HASWIDTH|SIMPLE;
2805 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2806 break;
2807 }
2808 case '(':
2809 nextchar(pRExC_state);
2810 ret = reg(pRExC_state, 1, &flags);
2811 if (ret == NULL) {
2812 if (flags & TRYAGAIN) {
2813 if (RExC_parse == RExC_end) {
2814 /* Make parent create an empty node if needed. */
2815 *flagp |= TRYAGAIN;
2816 return(NULL);
2817 }
2818 goto tryagain;
2819 }
2820 return(NULL);
2821 }
2822 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2823 break;
2824 case '|':
2825 case ')':
2826 if (flags & TRYAGAIN) {
2827 *flagp |= TRYAGAIN;
2828 return NULL;
2829 }
2830 vFAIL("Internal urp");
2831 /* Supposed to be caught earlier. */
2832 break;
2833 case '{':
2834 if (!regcurly(RExC_parse)) {
2835 RExC_parse++;
2836 goto defchar;
2837 }
2838 /* FALL THROUGH */
2839 case '?':
2840 case '+':
2841 case '*':
2842 RExC_parse++;
2843 vFAIL("Quantifier follows nothing");
2844 break;
2845 case '\\':
2846 switch (*++RExC_parse) {
2847 case 'A':
2848 RExC_seen_zerolen++;
2849 ret = reg_node(pRExC_state, SBOL);
2850 *flagp |= SIMPLE;
2851 nextchar(pRExC_state);
2852 Set_Node_Length(ret, 2); /* MJD */
2853 break;
2854 case 'G':
2855 ret = reg_node(pRExC_state, GPOS);
2856 RExC_seen |= REG_SEEN_GPOS;
2857 *flagp |= SIMPLE;
2858 nextchar(pRExC_state);
2859 Set_Node_Length(ret, 2); /* MJD */
2860 break;
2861 case 'Z':
2862 ret = reg_node(pRExC_state, SEOL);
2863 *flagp |= SIMPLE;
2864 RExC_seen_zerolen++; /* Do not optimize RE away */
2865 nextchar(pRExC_state);
2866 break;
2867 case 'z':
2868 ret = reg_node(pRExC_state, EOS);
2869 *flagp |= SIMPLE;
2870 RExC_seen_zerolen++; /* Do not optimize RE away */
2871 nextchar(pRExC_state);
2872 Set_Node_Length(ret, 2); /* MJD */
2873 break;
2874 case 'C':
2875 ret = reg_node(pRExC_state, CANY);
2876 RExC_seen |= REG_SEEN_CANY;
2877 *flagp |= HASWIDTH|SIMPLE;
2878 nextchar(pRExC_state);
2879 Set_Node_Length(ret, 2); /* MJD */
2880 break;
2881 case 'X':
2882 ret = reg_node(pRExC_state, CLUMP);
2883 *flagp |= HASWIDTH;
2884 nextchar(pRExC_state);
2885 Set_Node_Length(ret, 2); /* MJD */
2886 break;
2887 case 'w':
2888 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
2889 *flagp |= HASWIDTH|SIMPLE;
2890 nextchar(pRExC_state);
2891 Set_Node_Length(ret, 2); /* MJD */
2892 break;
2893 case 'W':
2894 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
2895 *flagp |= HASWIDTH|SIMPLE;
2896 nextchar(pRExC_state);
2897 Set_Node_Length(ret, 2); /* MJD */
2898 break;
2899 case 'b':
2900 RExC_seen_zerolen++;
2901 RExC_seen |= REG_SEEN_LOOKBEHIND;
2902 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
2903 *flagp |= SIMPLE;
2904 nextchar(pRExC_state);
2905 Set_Node_Length(ret, 2); /* MJD */
2906 break;
2907 case 'B':
2908 RExC_seen_zerolen++;
2909 RExC_seen |= REG_SEEN_LOOKBEHIND;
2910 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
2911 *flagp |= SIMPLE;
2912 nextchar(pRExC_state);
2913 Set_Node_Length(ret, 2); /* MJD */
2914 break;
2915 case 's':
2916 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
2917 *flagp |= HASWIDTH|SIMPLE;
2918 nextchar(pRExC_state);
2919 Set_Node_Length(ret, 2); /* MJD */
2920 break;
2921 case 'S':
2922 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
2923 *flagp |= HASWIDTH|SIMPLE;
2924 nextchar(pRExC_state);
2925 Set_Node_Length(ret, 2); /* MJD */
2926 break;
2927 case 'd':
2928 ret = reg_node(pRExC_state, DIGIT);
2929 *flagp |= HASWIDTH|SIMPLE;
2930 nextchar(pRExC_state);
2931 Set_Node_Length(ret, 2); /* MJD */
2932 break;
2933 case 'D':
2934 ret = reg_node(pRExC_state, NDIGIT);
2935 *flagp |= HASWIDTH|SIMPLE;
2936 nextchar(pRExC_state);
2937 Set_Node_Length(ret, 2); /* MJD */
2938 break;
2939 case 'p':
2940 case 'P':
2941 {
2942 char* oldregxend = RExC_end;
2943 char* parse_start = RExC_parse;
2944
2945 if (RExC_parse[1] == '{') {
2946 /* a lovely hack--pretend we saw [\pX] instead */
2947 RExC_end = strchr(RExC_parse, '}');
2948 if (!RExC_end) {
2949 U8 c = (U8)*RExC_parse;
2950 RExC_parse += 2;
2951 RExC_end = oldregxend;
2952 vFAIL2("Missing right brace on \\%c{}", c);
2953 }
2954 RExC_end++;
2955 }
2956 else
2957 RExC_end = RExC_parse + 2;
2958 RExC_parse--;
2959
2960 ret = regclass(pRExC_state);
2961
2962 RExC_end = oldregxend;
2963 RExC_parse--;
2964 Set_Node_Cur_Length(ret); /* MJD */
2965 nextchar(pRExC_state);
2966 *flagp |= HASWIDTH|SIMPLE;
2967 }
2968 break;
2969 case 'n':
2970 case 'r':
2971 case 't':
2972 case 'f':
2973 case 'e':
2974 case 'a':
2975 case 'x':
2976 case 'c':
2977 case '0':
2978 goto defchar;
2979 case '1': case '2': case '3': case '4':
2980 case '5': case '6': case '7': case '8': case '9':
2981 {
2982 I32 num = atoi(RExC_parse);
2983
2984 if (num > 9 && num >= RExC_npar)
2985 goto defchar;
2986 else {
2987 char * parse_start = RExC_parse - 1; /* MJD */
2988 while (isDIGIT(*RExC_parse))
2989 RExC_parse++;
2990
2991 if (!SIZE_ONLY && num > RExC_rx->nparens)
2992 vFAIL("Reference to nonexistent group");
2993 RExC_sawback = 1;
2994 ret = reganode(pRExC_state, FOLD
2995 ? (LOC ? REFFL : REFF)
2996 : REF, num);
2997 *flagp |= HASWIDTH;
2998
2999 /* override incorrect value set in reganode MJD */
3000 Set_Node_Offset(ret, parse_start+1);
3001 Set_Node_Cur_Length(ret); /* MJD */
3002 RExC_parse--;
3003 nextchar(pRExC_state);
3004 }
3005 }
3006 break;
3007 case '\0':
3008 if (RExC_parse >= RExC_end)
3009 FAIL("Trailing \\");
3010 /* FALL THROUGH */
3011 default:
3012 /* Do not generate `unrecognized' warnings here, we fall
3013 back into the quick-grab loop below */
3014 goto defchar;
3015 }
3016 break;
3017
3018 case '#':
3019 if (RExC_flags16 & PMf_EXTENDED) {
3020 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3021 if (RExC_parse < RExC_end)
3022 goto tryagain;
3023 }
3024 /* FALL THROUGH */
3025
3026 default: {
3027 register STRLEN len;
3028 register UV ender;
3029 register char *p;
3030 char *oldp, *s;
3031 STRLEN numlen;
3032 STRLEN foldlen;
3033 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3034
3035 parse_start = RExC_parse - 1;
3036
3037 RExC_parse++;
3038
3039 defchar:
3040 ret = reg_node(pRExC_state, FOLD
3041 ? (LOC ? EXACTFL : EXACTF)
3042 : EXACT);
3043 s = STRING(ret);
3044 for (len = 0, p = RExC_parse - 1;
3045 len < 127 && p < RExC_end;
3046 len++)
3047 {
3048 oldp = p;
3049
3050 if (RExC_flags16 & PMf_EXTENDED)
3051 p = regwhite(p, RExC_end);
3052 switch (*p) {
3053 case '^':
3054 case '$':
3055 case '.':
3056 case '[':
3057 case '(':
3058 case ')':
3059 case '|':
3060 goto loopdone;
3061 case '\\':
3062 switch (*++p) {
3063 case 'A':
3064 case 'C':
3065 case 'X':
3066 case 'G':
3067 case 'Z':
3068 case 'z':
3069 case 'w':
3070 case 'W':
3071 case 'b':
3072 case 'B':
3073 case 's':
3074 case 'S':
3075 case 'd':
3076 case 'D':
3077 case 'p':
3078 case 'P':
3079 --p;
3080 goto loopdone;
3081 case 'n':
3082 ender = '\n';
3083 p++;
3084 break;
3085 case 'r':
3086 ender = '\r';
3087 p++;
3088 break;
3089 case 't':
3090 ender = '\t';
3091 p++;
3092 break;
3093 case 'f':
3094 ender = '\f';
3095 p++;
3096 break;
3097 case 'e':
3098 ender = ASCII_TO_NATIVE('\033');
3099 p++;
3100 break;
3101 case 'a':
3102 ender = ASCII_TO_NATIVE('\007');
3103 p++;
3104 break;
3105 case 'x':
3106 if (*++p == '{') {
3107 char* e = strchr(p, '}');
3108
3109 if (!e) {
3110 RExC_parse = p + 1;
3111 vFAIL("Missing right brace on \\x{}");
3112 }
3113 else {
3114 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3115 | PERL_SCAN_DISALLOW_PREFIX;
3116 numlen = e - p - 1;
3117 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3118 if (ender > 0xff)
3119 RExC_utf8 = 1;
3120 /* numlen is generous */
3121 if (numlen + len >= 127) {
3122 p--;
3123 goto loopdone;
3124 }
3125 p = e + 1;
3126 }
3127 }
3128 else {
3129 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3130 numlen = 2;
3131 ender = grok_hex(p, &numlen, &flags, NULL);
3132 p += numlen;
3133 }
3134 break;
3135 case 'c':
3136 p++;
3137 ender = UCHARAT(p++);
3138 ender = toCTRL(ender);
3139 break;
3140 case '0': case '1': case '2': case '3':case '4':
3141 case '5': case '6': case '7': case '8':case '9':
3142 if (*p == '0' ||
3143 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3144 I32 flags = 0;
3145 numlen = 3;
3146 ender = grok_oct(p, &numlen, &flags, NULL);
3147 p += numlen;
3148 }
3149 else {
3150 --p;
3151 goto loopdone;
3152 }
3153 break;
3154 case '\0':
3155 if (p >= RExC_end)
3156 FAIL("Trailing \\");
3157 /* FALL THROUGH */
3158 default:
3159 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3160 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3161 goto normal_default;
3162 }
3163 break;
3164 default:
3165 normal_default:
3166 if (UTF8_IS_START(*p) && UTF) {
3167 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3168 &numlen, 0);
3169 p += numlen;
3170 }
3171 else
3172 ender = *p++;
3173 break;
3174 }
3175 if (RExC_flags16 & PMf_EXTENDED)
3176 p = regwhite(p, RExC_end);
3177 if (UTF && FOLD) {
3178 /* Prime the casefolded buffer. */
3179 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3180 }
3181 if (ISMULT2(p)) { /* Back off on ?+*. */
3182 if (len)
3183 p = oldp;
3184 else if (UTF) {
3185 if (FOLD) {
3186 /* Emit all the Unicode characters. */
3187 for (foldbuf = tmpbuf;
3188 foldlen;
3189 foldlen -= numlen) {
3190 ender = utf8_to_uvchr(foldbuf, &numlen);
3191 reguni(pRExC_state, ender, s, &numlen);
3192 s += numlen;
3193 len += numlen;
3194 foldbuf += numlen;
3195 }
3196 }
3197 else {
3198 reguni(pRExC_state, ender, s, &numlen);
3199 s += numlen;
3200 len += numlen;
3201 }
3202 }
3203 else {
3204 len++;
3205 REGC(ender, s++);
3206 }
3207 break;
3208 }
3209 if (UTF) {
3210 if (FOLD) {
3211 /* Emit all the Unicode characters. */
3212 for (foldbuf = tmpbuf;
3213 foldlen;
3214 foldlen -= numlen) {
3215 ender = utf8_to_uvchr(foldbuf, &numlen);
3216 reguni(pRExC_state, ender, s, &numlen);
3217 s += numlen;
3218 len += numlen;
3219 foldbuf += numlen;
3220 }
3221 }
3222 else {
3223 reguni(pRExC_state, ender, s, &numlen);
3224 s += numlen;
3225 len += numlen;
3226 }
3227 len--;
3228 }
3229 else
3230 REGC(ender, s++);
3231 }
3232 loopdone:
3233 RExC_parse = p - 1;
3234 Set_Node_Cur_Length(ret); /* MJD */
3235 nextchar(pRExC_state);
3236 {
3237 /* len is STRLEN which is unsigned, need to copy to signed */
3238 IV iv = len;
3239 if (iv < 0)
3240 vFAIL("Internal disaster");
3241 }
3242 if (len > 0)
3243 *flagp |= HASWIDTH;
3244 if (len == 1)
3245 *flagp |= SIMPLE;
3246 if (!SIZE_ONLY)
3247 STR_LEN(ret) = len;
3248 if (SIZE_ONLY)
3249 RExC_size += STR_SZ(len);
3250 else
3251 RExC_emit += STR_SZ(len);
3252 }
3253 break;
3254 }
3255
3256 /* If the encoding pragma is in effect recode the text of
3257 * any EXACT-kind nodes. */
3258 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3259 STRLEN oldlen = STR_LEN(ret);
3260 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3261
3262 if (RExC_utf8)
3263 SvUTF8_on(sv);
3264 if (sv_utf8_downgrade(sv, TRUE)) {
3265 char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
3266 STRLEN newlen = SvCUR(sv);
3267
3268 if (!SIZE_ONLY) {
3269 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3270 (int)oldlen, STRING(ret),
3271 (int)newlen, s));
3272 Copy(s, STRING(ret), newlen, char);
3273 STR_LEN(ret) += newlen - oldlen;
3274 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3275 } else
3276 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3277 }
3278 }
3279
3280 return(ret);
3281}
3282
3283STATIC char *
3284S_regwhite(pTHX_ char *p, char *e)
3285{
3286 while (p < e) {
3287 if (isSPACE(*p))
3288 ++p;
3289 else if (*p == '#') {
3290 do {
3291 p++;
3292 } while (p < e && *p != '\n');
3293 }
3294 else
3295 break;
3296 }
3297 return p;
3298}
3299
3300/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3301 Character classes ([:foo:]) can also be negated ([:^foo:]).
3302 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3303 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3304 but trigger failures because they are currently unimplemented. */
3305
3306#define POSIXCC_DONE(c) ((c) == ':')
3307#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3308#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3309
3310STATIC I32
3311S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3312{
3313 char *posixcc = 0;
3314 I32 namedclass = OOB_NAMEDCLASS;
3315
3316 if (value == '[' && RExC_parse + 1 < RExC_end &&
3317 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3318 POSIXCC(UCHARAT(RExC_parse))) {
3319 char c = UCHARAT(RExC_parse);
3320 char* s = RExC_parse++;
3321
3322 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3323 RExC_parse++;
3324 if (RExC_parse == RExC_end)
3325 /* Grandfather lone [:, [=, [. */
3326 RExC_parse = s;
3327 else {
3328 char* t = RExC_parse++; /* skip over the c */
3329
3330 if (UCHARAT(RExC_parse) == ']') {
3331 RExC_parse++; /* skip over the ending ] */
3332 posixcc = s + 1;
3333 if (*s == ':') {
3334 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3335 I32 skip = 5; /* the most common skip */
3336
3337 switch (*posixcc) {
3338 case 'a':
3339 if (strnEQ(posixcc, "alnum", 5))
3340 namedclass =
3341 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3342 else if (strnEQ(posixcc, "alpha", 5))
3343 namedclass =
3344 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3345 else if (strnEQ(posixcc, "ascii", 5))
3346 namedclass =
3347 complement ? ANYOF_NASCII : ANYOF_ASCII;
3348 break;
3349 case 'b':
3350 if (strnEQ(posixcc, "blank", 5))
3351 namedclass =
3352 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3353 break;
3354 case 'c':
3355 if (strnEQ(posixcc, "cntrl", 5))
3356 namedclass =
3357 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3358 break;
3359 case 'd':
3360 if (strnEQ(posixcc, "digit", 5))
3361 namedclass =
3362 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3363 break;
3364 case 'g':
3365 if (strnEQ(posixcc, "graph", 5))
3366 namedclass =
3367 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3368 break;
3369 case 'l':
3370 if (strnEQ(posixcc, "lower", 5))
3371 namedclass =
3372 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3373 break;
3374 case 'p':
3375 if (strnEQ(posixcc, "print", 5))
3376 namedclass =
3377 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3378 else if (strnEQ(posixcc, "punct", 5))
3379 namedclass =
3380 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3381 break;
3382 case 's':
3383 if (strnEQ(posixcc, "space", 5))
3384 namedclass =
3385 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3386 break;
3387 case 'u':
3388 if (strnEQ(posixcc, "upper", 5))
3389 namedclass =
3390 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3391 break;
3392 case 'w': /* this is not POSIX, this is the Perl \w */
3393 if (strnEQ(posixcc, "word", 4)) {
3394 namedclass =
3395 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3396 skip = 4;
3397 }
3398 break;
3399 case 'x':
3400 if (strnEQ(posixcc, "xdigit", 6)) {
3401 namedclass =
3402 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3403 skip = 6;
3404 }
3405 break;
3406 }
3407 if (namedclass == OOB_NAMEDCLASS ||
3408 posixcc[skip] != ':' ||
3409 posixcc[skip+1] != ']')
3410 {
3411 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3412 t - s - 1, s + 1);
3413 }
3414 } else if (!SIZE_ONLY) {
3415 /* [[=foo=]] and [[.foo.]] are still future. */
3416
3417 /* adjust RExC_parse so the warning shows after
3418 the class closes */
3419 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3420 RExC_parse++;
3421 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3422 }
3423 } else {
3424 /* Maternal grandfather:
3425 * "[:" ending in ":" but not in ":]" */
3426 RExC_parse = s;
3427 }
3428 }
3429 }
3430
3431 return namedclass;
3432}
3433
3434STATIC void
3435S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3436{
3437 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
3438 POSIXCC(UCHARAT(RExC_parse))) {
3439 char *s = RExC_parse;
3440 char c = *s++;
3441
3442 while(*s && isALNUM(*s))
3443 s++;
3444 if (*s && c == *s && s[1] == ']') {
3445 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3446
3447 /* [[=foo=]] and [[.foo.]] are still future. */
3448 if (POSIXCC_NOTYET(c)) {
3449 /* adjust RExC_parse so the error shows after
3450 the class closes */
3451 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3452 ;
3453 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3454 }
3455 }
3456 }
3457}
3458
3459STATIC regnode *
3460S_regclass(pTHX_ RExC_state_t *pRExC_state)
3461{
3462 register UV value;
3463 register UV nextvalue;
3464 register IV prevvalue = OOB_UNICODE;
3465 register IV range = 0;
3466 register regnode *ret;
3467 STRLEN numlen;
3468 IV namedclass;
3469 char *rangebegin = 0;
3470 bool need_class = 0;
3471 SV *listsv = Nullsv;
3472 register char *e;
3473 UV n;
3474 bool optimize_invert = TRUE;
3475 AV* unicode_alternate = 0;
3476
3477 ret = reganode(pRExC_state, ANYOF, 0);
3478
3479 if (!SIZE_ONLY)
3480 ANYOF_FLAGS(ret) = 0;
3481
3482 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3483 RExC_naughty++;
3484 RExC_parse++;
3485 if (!SIZE_ONLY)
3486 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3487 }
3488
3489 if (SIZE_ONLY)
3490 RExC_size += ANYOF_SKIP;
3491 else {
3492 RExC_emit += ANYOF_SKIP;
3493 if (FOLD)
3494 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3495 if (LOC)
3496 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3497 ANYOF_BITMAP_ZERO(ret);
3498 listsv = newSVpvn("# comment\n", 10);
3499 }
3500
3501 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3502
3503 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
3504 checkposixcc(pRExC_state);
3505
3506 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3507 if (UCHARAT(RExC_parse) == ']')
3508 goto charclassloop;
3509
3510 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3511
3512 charclassloop:
3513
3514 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3515
3516 if (!range)
3517 rangebegin = RExC_parse;
3518 if (UTF) {
3519 value = utf8n_to_uvchr((U8*)RExC_parse,
3520 RExC_end - RExC_parse,
3521 &numlen, 0);
3522 RExC_parse += numlen;
3523 }
3524 else
3525 value = UCHARAT(RExC_parse++);
3526 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3527 if (value == '[' && POSIXCC(nextvalue))
3528 namedclass = regpposixcc(pRExC_state, value);
3529 else if (value == '\\') {
3530 if (UTF) {
3531 value = utf8n_to_uvchr((U8*)RExC_parse,
3532 RExC_end - RExC_parse,
3533 &numlen, 0);
3534 RExC_parse += numlen;
3535 }
3536 else
3537 value = UCHARAT(RExC_parse++);
3538 /* Some compilers cannot handle switching on 64-bit integer
3539 * values, therefore value cannot be an UV. Yes, this will
3540 * be a problem later if we want switch on Unicode.
3541 * A similar issue a little bit later when switching on
3542 * namedclass. --jhi */
3543 switch ((I32)value) {
3544 case 'w': namedclass = ANYOF_ALNUM; break;
3545 case 'W': namedclass = ANYOF_NALNUM; break;
3546 case 's': namedclass = ANYOF_SPACE; break;
3547 case 'S': namedclass = ANYOF_NSPACE; break;
3548 case 'd': namedclass = ANYOF_DIGIT; break;
3549 case 'D': namedclass = ANYOF_NDIGIT; break;
3550 case 'p':
3551 case 'P':
3552 if (*RExC_parse == '{') {
3553 U8 c = (U8)value;
3554 e = strchr(RExC_parse++, '}');
3555 if (!e)
3556 vFAIL2("Missing right brace on \\%c{}", c);
3557 while (isSPACE(UCHARAT(RExC_parse)))
3558 RExC_parse++;
3559 if (e == RExC_parse)
3560 vFAIL2("Empty \\%c{}", c);
3561 n = e - RExC_parse;
3562 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3563 n--;
3564 }
3565 else {
3566 e = RExC_parse;
3567 n = 1;
3568 }
3569 if (!SIZE_ONLY) {
3570 if (UCHARAT(RExC_parse) == '^') {
3571 RExC_parse++;
3572 n--;
3573 value = value == 'p' ? 'P' : 'p'; /* toggle */
3574 while (isSPACE(UCHARAT(RExC_parse))) {
3575 RExC_parse++;
3576 n--;
3577 }
3578 }
3579 if (value == 'p')
3580 Perl_sv_catpvf(aTHX_ listsv,
3581 "+utf8::%.*s\n", (int)n, RExC_parse);
3582 else
3583 Perl_sv_catpvf(aTHX_ listsv,
3584 "!utf8::%.*s\n", (int)n, RExC_parse);
3585 }
3586 RExC_parse = e + 1;
3587 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3588 continue;
3589 case 'n': value = '\n'; break;
3590 case 'r': value = '\r'; break;
3591 case 't': value = '\t'; break;
3592 case 'f': value = '\f'; break;
3593 case 'b': value = '\b'; break;
3594 case 'e': value = ASCII_TO_NATIVE('\033');break;
3595 case 'a': value = ASCII_TO_NATIVE('\007');break;
3596 case 'x':
3597 if (*RExC_parse == '{') {
3598 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3599 | PERL_SCAN_DISALLOW_PREFIX;
3600 e = strchr(RExC_parse++, '}');
3601 if (!e)
3602 vFAIL("Missing right brace on \\x{}");
3603
3604 numlen = e - RExC_parse;
3605 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3606 RExC_parse = e + 1;
3607 }
3608 else {
3609 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3610 numlen = 2;
3611 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3612 RExC_parse += numlen;
3613 }
3614 break;
3615 case 'c':
3616 value = UCHARAT(RExC_parse++);
3617 value = toCTRL(value);
3618 break;
3619 case '0': case '1': case '2': case '3': case '4':
3620 case '5': case '6': case '7': case '8': case '9':
3621 {
3622 I32 flags = 0;
3623 numlen = 3;
3624 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3625 RExC_parse += numlen;
3626 break;
3627 }
3628 default:
3629 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3630 vWARN2(RExC_parse,
3631 "Unrecognized escape \\%c in character class passed through",
3632 (int)value);
3633 break;
3634 }
3635 } /* end of \blah */
3636
3637 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3638
3639 if (!SIZE_ONLY && !need_class)
3640 ANYOF_CLASS_ZERO(ret);
3641
3642 need_class = 1;
3643
3644 /* a bad range like a-\d, a-[:digit:] ? */
3645 if (range) {
3646 if (!SIZE_ONLY) {
3647 if (ckWARN(WARN_REGEXP))
3648 vWARN4(RExC_parse,
3649 "False [] range \"%*.*s\"",
3650 RExC_parse - rangebegin,
3651 RExC_parse - rangebegin,
3652 rangebegin);
3653 if (prevvalue < 256) {
3654 ANYOF_BITMAP_SET(ret, prevvalue);
3655 ANYOF_BITMAP_SET(ret, '-');
3656 }
3657 else {
3658 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3659 Perl_sv_catpvf(aTHX_ listsv,
3660 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3661 }
3662 }
3663
3664 range = 0; /* this was not a true range */
3665 }
3666
3667 if (!SIZE_ONLY) {
3668 if (namedclass > OOB_NAMEDCLASS)
3669 optimize_invert = FALSE;
3670 /* Possible truncation here but in some 64-bit environments
3671 * the compiler gets heartburn about switch on 64-bit values.
3672 * A similar issue a little earlier when switching on value.
3673 * --jhi */
3674 switch ((I32)namedclass) {
3675 case ANYOF_ALNUM:
3676 if (LOC)
3677 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3678 else {
3679 for (value = 0; value < 256; value++)
3680 if (isALNUM(value))
3681 ANYOF_BITMAP_SET(ret, value);
3682 }
3683 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3684 break;
3685 case ANYOF_NALNUM:
3686 if (LOC)
3687 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3688 else {
3689 for (value = 0; value < 256; value++)
3690 if (!isALNUM(value))
3691 ANYOF_BITMAP_SET(ret, value);
3692 }
3693 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3694 break;
3695 case ANYOF_ALNUMC:
3696 if (LOC)
3697 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3698 else {
3699 for (value = 0; value < 256; value++)
3700 if (isALNUMC(value))
3701 ANYOF_BITMAP_SET(ret, value);
3702 }
3703 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3704 break;
3705 case ANYOF_NALNUMC:
3706 if (LOC)
3707 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3708 else {
3709 for (value = 0; value < 256; value++)
3710 if (!isALNUMC(value))
3711 ANYOF_BITMAP_SET(ret, value);
3712 }
3713 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3714 break;
3715 case ANYOF_ALPHA:
3716 if (LOC)
3717 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3718 else {
3719 for (value = 0; value < 256; value++)
3720 if (isALPHA(value))
3721 ANYOF_BITMAP_SET(ret, value);
3722 }
3723 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3724 break;
3725 case ANYOF_NALPHA:
3726 if (LOC)
3727 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3728 else {
3729 for (value = 0; value < 256; value++)
3730 if (!isALPHA(value))
3731 ANYOF_BITMAP_SET(ret, value);
3732 }
3733 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3734 break;
3735 case ANYOF_ASCII:
3736 if (LOC)
3737 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3738 else {
3739#ifndef EBCDIC
3740 for (value = 0; value < 128; value++)
3741 ANYOF_BITMAP_SET(ret, value);
3742#else /* EBCDIC */
3743 for (value = 0; value < 256; value++) {
3744 if (isASCII(value))
3745 ANYOF_BITMAP_SET(ret, value);
3746 }
3747#endif /* EBCDIC */
3748 }
3749 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3750 break;
3751 case ANYOF_NASCII:
3752 if (LOC)
3753 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3754 else {
3755#ifndef EBCDIC
3756 for (value = 128; value < 256; value++)
3757 ANYOF_BITMAP_SET(ret, value);
3758#else /* EBCDIC */
3759 for (value = 0; value < 256; value++) {
3760 if (!isASCII(value))
3761 ANYOF_BITMAP_SET(ret, value);
3762 }
3763#endif /* EBCDIC */
3764 }
3765 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3766 break;
3767 case ANYOF_BLANK:
3768 if (LOC)
3769 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3770 else {
3771 for (value = 0; value < 256; value++)
3772 if (isBLANK(value))
3773 ANYOF_BITMAP_SET(ret, value);
3774 }
3775 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3776 break;
3777 case ANYOF_NBLANK:
3778 if (LOC)
3779 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3780 else {
3781 for (value = 0; value < 256; value++)
3782 if (!isBLANK(value))
3783 ANYOF_BITMAP_SET(ret, value);
3784 }
3785 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3786 break;
3787 case ANYOF_CNTRL:
3788 if (LOC)
3789 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3790 else {
3791 for (value = 0; value < 256; value++)
3792 if (isCNTRL(value))
3793 ANYOF_BITMAP_SET(ret, value);
3794 }
3795 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3796 break;
3797 case ANYOF_NCNTRL:
3798 if (LOC)
3799 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3800 else {
3801 for (value = 0; value < 256; value++)
3802 if (!isCNTRL(value))
3803 ANYOF_BITMAP_SET(ret, value);
3804 }
3805 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3806 break;
3807 case ANYOF_DIGIT:
3808 if (LOC)
3809 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3810 else {
3811 /* consecutive digits assumed */
3812 for (value = '0'; value <= '9'; value++)
3813 ANYOF_BITMAP_SET(ret, value);
3814 }
3815 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3816 break;
3817 case ANYOF_NDIGIT:
3818 if (LOC)
3819 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3820 else {
3821 /* consecutive digits assumed */
3822 for (value = 0; value < '0'; value++)
3823 ANYOF_BITMAP_SET(ret, value);
3824 for (value = '9' + 1; value < 256; value++)
3825 ANYOF_BITMAP_SET(ret, value);
3826 }
3827 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3828 break;
3829 case ANYOF_GRAPH:
3830 if (LOC)
3831 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3832 else {
3833 for (value = 0; value < 256; value++)
3834 if (isGRAPH(value))
3835 ANYOF_BITMAP_SET(ret, value);
3836 }
3837 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3838 break;
3839 case ANYOF_NGRAPH:
3840 if (LOC)
3841 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3842 else {
3843 for (value = 0; value < 256; value++)
3844 if (!isGRAPH(value))
3845 ANYOF_BITMAP_SET(ret, value);
3846 }
3847 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3848 break;
3849 case ANYOF_LOWER:
3850 if (LOC)
3851 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3852 else {
3853 for (value = 0; value < 256; value++)
3854 if (isLOWER(value))
3855 ANYOF_BITMAP_SET(ret, value);
3856 }
3857 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3858 break;
3859 case ANYOF_NLOWER:
3860 if (LOC)
3861 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3862 else {
3863 for (value = 0; value < 256; value++)
3864 if (!isLOWER(value))
3865 ANYOF_BITMAP_SET(ret, value);
3866 }
3867 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3868 break;
3869 case ANYOF_PRINT:
3870 if (LOC)
3871 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3872 else {
3873 for (value = 0; value < 256; value++)
3874 if (isPRINT(value))
3875 ANYOF_BITMAP_SET(ret, value);
3876 }
3877 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3878 break;
3879 case ANYOF_NPRINT:
3880 if (LOC)
3881 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3882 else {
3883 for (value = 0; value < 256; value++)
3884 if (!isPRINT(value))
3885 ANYOF_BITMAP_SET(ret, value);
3886 }
3887 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3888 break;
3889 case ANYOF_PSXSPC:
3890 if (LOC)
3891 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3892 else {
3893 for (value = 0; value < 256; value++)
3894 if (isPSXSPC(value))
3895 ANYOF_BITMAP_SET(ret, value);
3896 }
3897 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3898 break;
3899 case ANYOF_NPSXSPC:
3900 if (LOC)
3901 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3902 else {
3903 for (value = 0; value < 256; value++)
3904 if (!isPSXSPC(value))
3905 ANYOF_BITMAP_SET(ret, value);
3906 }
3907 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3908 break;
3909 case ANYOF_PUNCT:
3910 if (LOC)
3911 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3912 else {
3913 for (value = 0; value < 256; value++)
3914 if (isPUNCT(value))
3915 ANYOF_BITMAP_SET(ret, value);
3916 }
3917 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3918 break;
3919 case ANYOF_NPUNCT:
3920 if (LOC)
3921 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3922 else {
3923 for (value = 0; value < 256; value++)
3924 if (!isPUNCT(value))
3925 ANYOF_BITMAP_SET(ret, value);
3926 }
3927 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3928 break;
3929 case ANYOF_SPACE:
3930 if (LOC)
3931 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3932 else {
3933 for (value = 0; value < 256; value++)
3934 if (isSPACE(value))
3935 ANYOF_BITMAP_SET(ret, value);
3936 }
3937 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3938 break;
3939 case ANYOF_NSPACE:
3940 if (LOC)
3941 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3942 else {
3943 for (value = 0; value < 256; value++)
3944 if (!isSPACE(value))
3945 ANYOF_BITMAP_SET(ret, value);
3946 }
3947 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
3948 break;
3949 case ANYOF_UPPER:
3950 if (LOC)
3951 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
3952 else {
3953 for (value = 0; value < 256; value++)
3954 if (isUPPER(value))
3955 ANYOF_BITMAP_SET(ret, value);
3956 }
3957 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
3958 break;
3959 case ANYOF_NUPPER:
3960 if (LOC)
3961 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
3962 else {
3963 for (value = 0; value < 256; value++)
3964 if (!isUPPER(value))
3965 ANYOF_BITMAP_SET(ret, value);
3966 }
3967 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
3968 break;
3969 case ANYOF_XDIGIT:
3970 if (LOC)
3971 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
3972 else {
3973 for (value = 0; value < 256; value++)
3974 if (isXDIGIT(value))
3975 ANYOF_BITMAP_SET(ret, value);
3976 }
3977 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
3978 break;
3979 case ANYOF_NXDIGIT:
3980 if (LOC)
3981 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
3982 else {
3983 for (value = 0; value < 256; value++)
3984 if (!isXDIGIT(value))
3985 ANYOF_BITMAP_SET(ret, value);
3986 }
3987 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
3988 break;
3989 default:
3990 vFAIL("Invalid [::] class");
3991 break;
3992 }
3993 if (LOC)
3994 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
3995 continue;
3996 }
3997 } /* end of namedclass \blah */
3998
3999 if (range) {
4000 if (prevvalue > value) /* b-a */ {
4001 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4002 RExC_parse - rangebegin,
4003 RExC_parse - rangebegin,
4004 rangebegin);
4005 range = 0; /* not a valid range */
4006 }
4007 }
4008 else {
4009 prevvalue = value; /* save the beginning of the range */
4010 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4011 RExC_parse[1] != ']') {
4012 RExC_parse++;
4013
4014 /* a bad range like \w-, [:word:]- ? */
4015 if (namedclass > OOB_NAMEDCLASS) {
4016 if (ckWARN(WARN_REGEXP))
4017 vWARN4(RExC_parse,
4018 "False [] range \"%*.*s\"",
4019 RExC_parse - rangebegin,
4020 RExC_parse - rangebegin,
4021 rangebegin);
4022 if (!SIZE_ONLY)
4023 ANYOF_BITMAP_SET(ret, '-');
4024 } else
4025 range = 1; /* yeah, it's a range! */
4026 continue; /* but do it the next time */
4027 }
4028 }
4029
4030 /* now is the next time */
4031 if (!SIZE_ONLY) {
4032 IV i;
4033
4034 if (prevvalue < 256) {
4035 IV ceilvalue = value < 256 ? value : 255;
4036
4037#ifdef EBCDIC
4038 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4039 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
4040 {
4041 if (isLOWER(prevvalue)) {
4042 for (i = prevvalue; i <= ceilvalue; i++)
4043 if (isLOWER(i))
4044 ANYOF_BITMAP_SET(ret, i);
4045 } else {
4046 for (i = prevvalue; i <= ceilvalue; i++)
4047 if (isUPPER(i))
4048 ANYOF_BITMAP_SET(ret, i);
4049 }
4050 }
4051 else
4052#endif
4053 for (i = prevvalue; i <= ceilvalue; i++)
4054 ANYOF_BITMAP_SET(ret, i);
4055 }
4056 if (value > 255 || UTF) {
4057 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4058 UV natvalue = NATIVE_TO_UNI(value);
4059
4060 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4061 if (prevnatvalue < natvalue) { /* what about > ? */
4062 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4063 prevnatvalue, natvalue);
4064 }
4065 else if (prevnatvalue == natvalue) {
4066 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4067 if (FOLD) {
4068 U8 tmpbuf [UTF8_MAXLEN+1];
4069 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4070 STRLEN foldlen;
4071 UV f;
4072
4073 uvchr_to_utf8(tmpbuf, natvalue);
4074 to_utf8_fold(tmpbuf, foldbuf, &foldlen);
4075 f = UNI_TO_NATIVE(utf8_to_uvchr(foldbuf, 0));
4076
4077 /* If folding and foldable and a single
4078 * character, insert also the folded version
4079 * to the charclass. */
4080 if (f != value) {
4081 if (foldlen == UNISKIP(f))
4082 Perl_sv_catpvf(aTHX_ listsv,
4083 "%04"UVxf"\n", f);
4084 else {
4085 /* Any multicharacter foldings
4086 * require the following transform:
4087 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4088 * where E folds into "pq" and F folds
4089 * into "rst", all other characters
4090 * fold to single characters. We save
4091 * away these multicharacter foldings,
4092 * to be later saved as part of the
4093 * additional "s" data. */
4094 SV *sv;
4095
4096 if (!unicode_alternate)
4097 unicode_alternate = newAV();
4098 sv = newSVpvn((char*)foldbuf, foldlen);
4099 SvUTF8_on(sv);
4100 av_push(unicode_alternate, sv);
4101 }
4102 }
4103
4104 /* If folding and the value is one of the Greek
4105 * sigmas insert a few more sigmas to make the
4106 * folding rules of the sigmas to work right.
4107 * Note that not all the possible combinations
4108 * are handled here: some of them are handled
4109 * by the standard folding rules, and some of
4110 * them (literal or EXACTF cases) are handled
4111 * during runtime in regexec.c:S_find_byclass(). */
4112 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4113 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4114 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4115 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4116 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4117 }
4118 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4119 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4120 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4121 }
4122 }
4123 }
4124 }
4125
4126 range = 0; /* this range (if it was one) is done now */
4127 }
4128
4129 if (need_class) {
4130 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4131 if (SIZE_ONLY)
4132 RExC_size += ANYOF_CLASS_ADD_SKIP;
4133 else
4134 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4135 }
4136
4137 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4138 if (!SIZE_ONLY &&
4139 /* If the only flag is folding (plus possibly inversion). */
4140 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4141 ) {
4142 for (value = 0; value < 256; ++value) {
4143 if (ANYOF_BITMAP_TEST(ret, value)) {
4144 IV fold = PL_fold[value];
4145
4146 if (fold != value)
4147 ANYOF_BITMAP_SET(ret, fold);
4148 }
4149 }
4150 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4151 }
4152
4153 /* optimize inverted simple patterns (e.g. [^a-z]) */
4154 if (!SIZE_ONLY && optimize_invert &&
4155 /* If the only flag is inversion. */
4156 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4157 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4158 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4159 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4160 }
4161
4162 if (!SIZE_ONLY) {
4163 AV *av = newAV();
4164 SV *rv;
4165
4166 /* The 0th element stores the character class description
4167 * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4168 * to initialize the appropriate swash (which gets stored in
4169 * the 1st element), and also useful for dumping the regnode.
4170 * The 2nd element stores the multicharacter foldings,
4171 * used later (regexec.c:s_reginclasslen()). */
4172 av_store(av, 0, listsv);
4173 av_store(av, 1, NULL);
4174 av_store(av, 2, (SV*)unicode_alternate);
4175 rv = newRV_noinc((SV*)av);
4176 n = add_data(pRExC_state, 1, "s");
4177 RExC_rx->data->data[n] = (void*)rv;
4178 ARG_SET(ret, n);
4179 }
4180
4181 return ret;
4182}
4183
4184STATIC char*
4185S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4186{
4187 char* retval = RExC_parse++;
4188
4189 for (;;) {
4190 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4191 RExC_parse[2] == '#') {
4192 while (*RExC_parse && *RExC_parse != ')')
4193 RExC_parse++;
4194 RExC_parse++;
4195 continue;
4196 }
4197 if (RExC_flags16 & PMf_EXTENDED) {
4198 if (isSPACE(*RExC_parse)) {
4199 RExC_parse++;
4200 continue;
4201 }
4202 else if (*RExC_parse == '#') {
4203 while (*RExC_parse && *RExC_parse != '\n')
4204 RExC_parse++;
4205 RExC_parse++;
4206 continue;
4207 }
4208 }
4209 return retval;
4210 }
4211}
4212
4213/*
4214- reg_node - emit a node
4215*/
4216STATIC regnode * /* Location. */
4217S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4218{
4219 register regnode *ret;
4220 register regnode *ptr;
4221
4222 ret = RExC_emit;
4223 if (SIZE_ONLY) {
4224 SIZE_ALIGN(RExC_size);
4225 RExC_size += 1;
4226 return(ret);
4227 }
4228
4229 NODE_ALIGN_FILL(ret);
4230 ptr = ret;
4231 FILL_ADVANCE_NODE(ptr, op);
4232 if (RExC_offsets) { /* MJD */
4233 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4234 "reg_node", __LINE__,
4235 reg_name[op],
4236 RExC_emit - RExC_emit_start > RExC_offsets[0]
4237 ? "Overwriting end of array!\n" : "OK",
4238 RExC_emit - RExC_emit_start,
4239 RExC_parse - RExC_start,
4240 RExC_offsets[0]));
4241 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4242 }
4243
4244 RExC_emit = ptr;
4245
4246 return(ret);
4247}
4248
4249/*
4250- reganode - emit a node with an argument
4251*/
4252STATIC regnode * /* Location. */
4253S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4254{
4255 register regnode *ret;
4256 register regnode *ptr;
4257
4258 ret = RExC_emit;
4259 if (SIZE_ONLY) {
4260 SIZE_ALIGN(RExC_size);
4261 RExC_size += 2;
4262 return(ret);
4263 }
4264
4265 NODE_ALIGN_FILL(ret);
4266 ptr = ret;
4267 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4268 if (RExC_offsets) { /* MJD */
4269 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4270 "reganode",
4271 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4272 "Overwriting end of array!\n" : "OK",
4273 RExC_emit - RExC_emit_start,
4274 RExC_parse - RExC_start,
4275 RExC_offsets[0]));
4276 Set_Cur_Node_Offset;
4277 }
4278
4279 RExC_emit = ptr;
4280
4281 return(ret);
4282}
4283
4284/*
4285- reguni - emit (if appropriate) a Unicode character
4286*/
4287STATIC void
4288S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4289{
4290 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4291}
4292
4293/*
4294- reginsert - insert an operator in front of already-emitted operand
4295*
4296* Means relocating the operand.
4297*/
4298STATIC void
4299S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4300{
4301 register regnode *src;
4302 register regnode *dst;
4303 register regnode *place;
4304 register int offset = regarglen[(U8)op];
4305
4306/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4307
4308 if (SIZE_ONLY) {
4309 RExC_size += NODE_STEP_REGNODE + offset;
4310 return;
4311 }
4312
4313 src = RExC_emit;
4314 RExC_emit += NODE_STEP_REGNODE + offset;
4315 dst = RExC_emit;
4316 while (src > opnd) {
4317 StructCopy(--src, --dst, regnode);
4318 if (RExC_offsets) { /* MJD 20010112 */
4319 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4320 "reg_insert",
4321 dst - RExC_emit_start > RExC_offsets[0]
4322 ? "Overwriting end of array!\n" : "OK",
4323 src - RExC_emit_start,
4324 dst - RExC_emit_start,
4325 RExC_offsets[0]));
4326 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4327 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4328 }
4329 }
4330
4331
4332 place = opnd; /* Op node, where operand used to be. */
4333 if (RExC_offsets) { /* MJD */
4334 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4335 "reginsert",
4336 place - RExC_emit_start > RExC_offsets[0]
4337 ? "Overwriting end of array!\n" : "OK",
4338 place - RExC_emit_start,
4339 RExC_parse - RExC_start,
4340 RExC_offsets[0]));
4341 Set_Node_Offset(place, RExC_parse);
4342 }
4343 src = NEXTOPER(place);
4344 FILL_ADVANCE_NODE(place, op);
4345 Zero(src, offset, regnode);
4346}
4347
4348/*
4349- regtail - set the next-pointer at the end of a node chain of p to val.
4350*/
4351STATIC void
4352S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4353{
4354 register regnode *scan;
4355 register regnode *temp;
4356
4357 if (SIZE_ONLY)
4358 return;
4359
4360 /* Find last node. */
4361 scan = p;
4362 for (;;) {
4363 temp = regnext(scan);
4364 if (temp == NULL)
4365 break;
4366 scan = temp;
4367 }
4368
4369 if (reg_off_by_arg[OP(scan)]) {
4370 ARG_SET(scan, val - scan);
4371 }
4372 else {
4373 NEXT_OFF(scan) = val - scan;
4374 }
4375}
4376
4377/*
4378- regoptail - regtail on operand of first argument; nop if operandless
4379*/
4380STATIC void
4381S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4382{
4383 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4384 if (p == NULL || SIZE_ONLY)
4385 return;
4386 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4387 regtail(pRExC_state, NEXTOPER(p), val);
4388 }
4389 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4390 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4391 }
4392 else
4393 return;
4394}
4395
4396/*
4397 - regcurly - a little FSA that accepts {\d+,?\d*}
4398 */
4399STATIC I32
4400S_regcurly(pTHX_ register char *s)
4401{
4402 if (*s++ != '{')
4403 return FALSE;
4404 if (!isDIGIT(*s))
4405 return FALSE;
4406 while (isDIGIT(*s))
4407 s++;
4408 if (*s == ',')
4409 s++;
4410 while (isDIGIT(*s))
4411 s++;
4412 if (*s != '}')
4413 return FALSE;
4414 return TRUE;
4415}
4416
4417
4418#ifdef DEBUGGING
4419
4420STATIC regnode *
4421S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4422{
4423 register U8 op = EXACT; /* Arbitrary non-END op. */
4424 register regnode *next;
4425
4426 while (op != END && (!last || node < last)) {
4427 /* While that wasn't END last time... */
4428
4429 NODE_ALIGN(node);
4430 op = OP(node);
4431 if (op == CLOSE)
4432 l--;
4433 next = regnext(node);
4434 /* Where, what. */
4435 if (OP(node) == OPTIMIZED)
4436 goto after_print;
4437 regprop(sv, node);
4438 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4439 (int)(2*l + 1), "", SvPVX(sv));
4440 if (next == NULL) /* Next ptr. */
4441 PerlIO_printf(Perl_debug_log, "(0)");
4442 else
4443 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4444 (void)PerlIO_putc(Perl_debug_log, '\n');
4445 after_print:
4446 if (PL_regkind[(U8)op] == BRANCHJ) {
4447 register regnode *nnode = (OP(next) == LONGJMP
4448 ? regnext(next)
4449 : next);
4450 if (last && nnode > last)
4451 nnode = last;
4452 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4453 }
4454 else if (PL_regkind[(U8)op] == BRANCH) {
4455 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4456 }
4457 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4458 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4459 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4460 }
4461 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4462 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4463 next, sv, l + 1);
4464 }
4465 else if ( op == PLUS || op == STAR) {
4466 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4467 }
4468 else if (op == ANYOF) {
4469 /* arglen 1 + class block */
4470 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4471 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4472 node = NEXTOPER(node);
4473 }
4474 else if (PL_regkind[(U8)op] == EXACT) {
4475 /* Literal string, where present. */
4476 node += NODE_SZ_STR(node) - 1;
4477 node = NEXTOPER(node);
4478 }
4479 else {
4480 node = NEXTOPER(node);
4481 node += regarglen[(U8)op];
4482 }
4483 if (op == CURLYX || op == OPEN)
4484 l++;
4485 else if (op == WHILEM)
4486 l--;
4487 }
4488 return node;
4489}
4490
4491#endif /* DEBUGGING */
4492
4493/*
4494 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4495 */
4496void
4497Perl_regdump(pTHX_ regexp *r)
4498{
4499#ifdef DEBUGGING
4500 SV *sv = sv_newmortal();
4501
4502 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4503
4504 /* Header fields of interest. */
4505 if (r->anchored_substr)
4506 PerlIO_printf(Perl_debug_log,
4507 "anchored `%s%.*s%s'%s at %"IVdf" ",
4508 PL_colors[0],
4509 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4510 SvPVX(r->anchored_substr),
4511 PL_colors[1],
4512 SvTAIL(r->anchored_substr) ? "$" : "",
4513 (IV)r->anchored_offset);
4514 if (r->float_substr)
4515 PerlIO_printf(Perl_debug_log,
4516 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4517 PL_colors[0],
4518 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4519 SvPVX(r->float_substr),
4520 PL_colors[1],
4521 SvTAIL(r->float_substr) ? "$" : "",
4522 (IV)r->float_min_offset, (UV)r->float_max_offset);
4523 if (r->check_substr)
4524 PerlIO_printf(Perl_debug_log,
4525 r->check_substr == r->float_substr
4526 ? "(checking floating" : "(checking anchored");
4527 if (r->reganch & ROPT_NOSCAN)
4528 PerlIO_printf(Perl_debug_log, " noscan");
4529 if (r->reganch & ROPT_CHECK_ALL)
4530 PerlIO_printf(Perl_debug_log, " isall");
4531 if (r->check_substr)
4532 PerlIO_printf(Perl_debug_log, ") ");
4533
4534 if (r->regstclass) {
4535 regprop(sv, r->regstclass);
4536 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4537 }
4538 if (r->reganch & ROPT_ANCH) {
4539 PerlIO_printf(Perl_debug_log, "anchored");
4540 if (r->reganch & ROPT_ANCH_BOL)
4541 PerlIO_printf(Perl_debug_log, "(BOL)");
4542 if (r->reganch & ROPT_ANCH_MBOL)
4543 PerlIO_printf(Perl_debug_log, "(MBOL)");
4544 if (r->reganch & ROPT_ANCH_SBOL)
4545 PerlIO_printf(Perl_debug_log, "(SBOL)");
4546 if (r->reganch & ROPT_ANCH_GPOS)
4547 PerlIO_printf(Perl_debug_log, "(GPOS)");
4548 PerlIO_putc(Perl_debug_log, ' ');
4549 }
4550 if (r->reganch & ROPT_GPOS_SEEN)
4551 PerlIO_printf(Perl_debug_log, "GPOS ");
4552 if (r->reganch & ROPT_SKIP)
4553 PerlIO_printf(Perl_debug_log, "plus ");
4554 if (r->reganch & ROPT_IMPLICIT)
4555 PerlIO_printf(Perl_debug_log, "implicit ");
4556 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4557 if (r->reganch & ROPT_EVAL_SEEN)
4558 PerlIO_printf(Perl_debug_log, "with eval ");
4559 PerlIO_printf(Perl_debug_log, "\n");
4560 if (r->offsets) {
4561 U32 i;
4562 U32 len = r->offsets[0];
4563 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4564 for (i = 1; i <= len; i++)
4565 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4566 (UV)r->offsets[i*2-1],
4567 (UV)r->offsets[i*2]);
4568 PerlIO_printf(Perl_debug_log, "\n");
4569 }
4570#endif /* DEBUGGING */
4571}
4572
4573#ifdef DEBUGGING
4574
4575STATIC void
4576S_put_byte(pTHX_ SV *sv, int c)
4577{
4578 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4579 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4580 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4581 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4582 else
4583 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4584}
4585
4586#endif /* DEBUGGING */
4587
4588/*
4589- regprop - printable representation of opcode
4590*/
4591void
4592Perl_regprop(pTHX_ SV *sv, regnode *o)
4593{
4594#ifdef DEBUGGING
4595 register int k;
4596
4597 sv_setpvn(sv, "", 0);
4598 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4599 /* It would be nice to FAIL() here, but this may be called from
4600 regexec.c, and it would be hard to supply pRExC_state. */
4601 Perl_croak(aTHX_ "Corrupted regexp opcode");
4602 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4603
4604 k = PL_regkind[(U8)OP(o)];
4605
4606 if (k == EXACT) {
4607 SV *dsv = sv_2mortal(newSVpvn("", 0));
4608 /* Using is_utf8_string() is a crude hack but it may
4609 * be the best for now since we have no flag "this EXACTish
4610 * node was UTF-8" --jhi */
4611 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4612 char *s = do_utf8 ?
4613 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4614 UNI_DISPLAY_REGEX) :
4615 STRING(o);
4616 int len = do_utf8 ?
4617 strlen(s) :
4618 STR_LEN(o);
4619 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4620 PL_colors[0],
4621 len, s,
4622 PL_colors[1]);
4623 }
4624 else if (k == CURLY) {
4625 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4626 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4627 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4628 }
4629 else if (k == WHILEM && o->flags) /* Ordinal/of */
4630 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4631 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4632 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4633 else if (k == LOGICAL)
4634 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4635 else if (k == ANYOF) {
4636 int i, rangestart = -1;
4637 U8 flags = ANYOF_FLAGS(o);
4638 const char * const anyofs[] = { /* Should be syncronized with
4639 * ANYOF_ #xdefines in regcomp.h */
4640 "\\w",
4641 "\\W",
4642 "\\s",
4643 "\\S",
4644 "\\d",
4645 "\\D",
4646 "[:alnum:]",
4647 "[:^alnum:]",
4648 "[:alpha:]",
4649 "[:^alpha:]",
4650 "[:ascii:]",
4651 "[:^ascii:]",
4652 "[:ctrl:]",
4653 "[:^ctrl:]",
4654 "[:graph:]",
4655 "[:^graph:]",
4656 "[:lower:]",
4657 "[:^lower:]",
4658 "[:print:]",
4659 "[:^print:]",
4660 "[:punct:]",
4661 "[:^punct:]",
4662 "[:upper:]",
4663 "[:^upper:]",
4664 "[:xdigit:]",
4665 "[:^xdigit:]",
4666 "[:space:]",
4667 "[:^space:]",
4668 "[:blank:]",
4669 "[:^blank:]"
4670 };
4671
4672 if (flags & ANYOF_LOCALE)
4673 sv_catpv(sv, "{loc}");
4674 if (flags & ANYOF_FOLD)
4675 sv_catpv(sv, "{i}");
4676 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4677 if (flags & ANYOF_INVERT)
4678 sv_catpv(sv, "^");
4679 for (i = 0; i <= 256; i++) {
4680 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4681 if (rangestart == -1)
4682 rangestart = i;
4683 } else if (rangestart != -1) {
4684 if (i <= rangestart + 3)
4685 for (; rangestart < i; rangestart++)
4686 put_byte(sv, rangestart);
4687 else {
4688 put_byte(sv, rangestart);
4689 sv_catpv(sv, "-");
4690 put_byte(sv, i - 1);
4691 }
4692 rangestart = -1;
4693 }
4694 }
4695
4696 if (o->flags & ANYOF_CLASS)
4697 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4698 if (ANYOF_CLASS_TEST(o,i))
4699 sv_catpv(sv, anyofs[i]);
4700
4701 if (flags & ANYOF_UNICODE)
4702 sv_catpv(sv, "{unicode}");
4703 else if (flags & ANYOF_UNICODE_ALL)
4704 sv_catpv(sv, "{unicode_all}");
4705
4706 {
4707 SV *lv;
4708 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4709
4710 if (lv) {
4711 if (sw) {
4712 UV i;
4713 U8 s[UTF8_MAXLEN+1];
4714
4715 for (i = 0; i <= 256; i++) { /* just the first 256 */
4716 U8 *e = uvchr_to_utf8(s, i);
4717
4718 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4719 if (rangestart == -1)
4720 rangestart = i;
4721 } else if (rangestart != -1) {
4722 U8 *p;
4723
4724 if (i <= rangestart + 3)
4725 for (; rangestart < i; rangestart++) {
4726 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4727 put_byte(sv, *p);
4728 }
4729 else {
4730 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4731 put_byte(sv, *p);
4732 sv_catpv(sv, "-");
4733 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4734 put_byte(sv, *p);
4735 }
4736 rangestart = -1;
4737 }
4738 }
4739
4740 sv_catpv(sv, "..."); /* et cetera */
4741 }
4742
4743 {
4744 char *s = savepv(SvPVX(lv));
4745 char *origs = s;
4746
4747 while(*s && *s != '\n') s++;
4748
4749 if (*s == '\n') {
4750 char *t = ++s;
4751
4752 while (*s) {
4753 if (*s == '\n')
4754 *s = ' ';
4755 s++;
4756 }
4757 if (s[-1] == ' ')
4758 s[-1] = 0;
4759
4760 sv_catpv(sv, t);
4761 }
4762
4763 Safefree(origs);
4764 }
4765 }
4766 }
4767
4768 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4769 }
4770 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4771 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4772#endif /* DEBUGGING */
4773}
4774
4775SV *
4776Perl_re_intuit_string(pTHX_ regexp *prog)
4777{ /* Assume that RE_INTUIT is set */
4778 DEBUG_r(
4779 { STRLEN n_a;
4780 char *s = SvPV(prog->check_substr,n_a);
4781
4782 if (!PL_colorset) reginitcolors();
4783 PerlIO_printf(Perl_debug_log,
4784 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4785 PL_colors[4],PL_colors[5],PL_colors[0],
4786 s,
4787 PL_colors[1],
4788 (strlen(s) > 60 ? "..." : ""));
4789 } );
4790
4791 return prog->check_substr;
4792}
4793
4794void
4795Perl_pregfree(pTHX_ struct regexp *r)
4796{
4797#ifdef DEBUGGING
4798 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4799#endif
4800
4801 if (!r || (--r->refcnt > 0))
4802 return;
4803 DEBUG_r({
4804 char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
4805 UNI_DISPLAY_REGEX);
4806 int len = SvCUR(dsv);
4807 if (!PL_colorset)
4808 reginitcolors();
4809 PerlIO_printf(Perl_debug_log,
4810 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4811 PL_colors[4],PL_colors[5],PL_colors[0],
4812 len, len, s,
4813 PL_colors[1],
4814 len > 60 ? "..." : "");
4815 });
4816
4817 if (r->precomp)
4818 Safefree(r->precomp);
4819 if (r->offsets) /* 20010421 MJD */
4820 Safefree(r->offsets);
4821 if (RX_MATCH_COPIED(r))
4822 Safefree(r->subbeg);
4823 if (r->substrs) {
4824 if (r->anchored_substr)
4825 SvREFCNT_dec(r->anchored_substr);
4826 if (r->float_substr)
4827 SvREFCNT_dec(r->float_substr);
4828 Safefree(r->substrs);
4829 }
4830 if (r->data) {
4831 int n = r->data->count;
4832 AV* new_comppad = NULL;
4833 AV* old_comppad;
4834 SV** old_curpad;
4835
4836 while (--n >= 0) {
4837 /* If you add a ->what type here, update the comment in regcomp.h */
4838 switch (r->data->what[n]) {
4839 case 's':
4840 SvREFCNT_dec((SV*)r->data->data[n]);
4841 break;
4842 case 'f':
4843 Safefree(r->data->data[n]);
4844 break;
4845 case 'p':
4846 new_comppad = (AV*)r->data->data[n];
4847 break;
4848 case 'o':
4849 if (new_comppad == NULL)
4850 Perl_croak(aTHX_ "panic: pregfree comppad");
4851 old_comppad = PL_comppad;
4852 old_curpad = PL_curpad;
4853 /* Watch out for global destruction's random ordering. */
4854 if (SvTYPE(new_comppad) == SVt_PVAV) {
4855 PL_comppad = new_comppad;
4856 PL_curpad = AvARRAY(new_comppad);
4857 }
4858 else
4859 PL_curpad = NULL;
4860
4861 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4862 op_free((OP_4tree*)r->data->data[n]);
4863 }
4864
4865 PL_comppad = old_comppad;
4866 PL_curpad = old_curpad;
4867 SvREFCNT_dec((SV*)new_comppad);
4868 new_comppad = NULL;
4869 break;
4870 case 'n':
4871 break;
4872 default:
4873 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4874 }
4875 }
4876 Safefree(r->data->what);
4877 Safefree(r->data);
4878 }
4879 Safefree(r->startp);
4880 Safefree(r->endp);
4881 Safefree(r);
4882}
4883
4884/*
4885 - regnext - dig the "next" pointer out of a node
4886 *
4887 * [Note, when REGALIGN is defined there are two places in regmatch()
4888 * that bypass this code for speed.]
4889 */
4890regnode *
4891Perl_regnext(pTHX_ register regnode *p)
4892{
4893 register I32 offset;
4894
4895 if (p == &PL_regdummy)
4896 return(NULL);
4897
4898 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4899 if (offset == 0)
4900 return(NULL);
4901
4902 return(p+offset);
4903}
4904
4905STATIC void
4906S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4907{
4908 va_list args;
4909 STRLEN l1 = strlen(pat1);
4910 STRLEN l2 = strlen(pat2);
4911 char buf[512];
4912 SV *msv;
4913 char *message;
4914
4915 if (l1 > 510)
4916 l1 = 510;
4917 if (l1 + l2 > 510)
4918 l2 = 510 - l1;
4919 Copy(pat1, buf, l1 , char);
4920 Copy(pat2, buf + l1, l2 , char);
4921 buf[l1 + l2] = '\n';
4922 buf[l1 + l2 + 1] = '\0';
4923#ifdef I_STDARG
4924 /* ANSI variant takes additional second argument */
4925 va_start(args, pat2);
4926#else
4927 va_start(args);
4928#endif
4929 msv = vmess(buf, &args);
4930 va_end(args);
4931 message = SvPV(msv,l1);
4932 if (l1 > 512)
4933 l1 = 512;
4934 Copy(message, buf, l1 , char);
4935 buf[l1] = '\0'; /* Overwrite \n */
4936 Perl_croak(aTHX_ "%s", buf);
4937}
4938
4939/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4940
4941void
4942Perl_save_re_context(pTHX)
4943{
4944#if 0
4945 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4946 SAVEI32(RExC_npar); /* () count. */
4947 SAVEI32(RExC_size); /* Code size. */
4948 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4949 SAVEVPTR(RExC_rx); /* from regcomp.c */
4950 SAVEI32(RExC_seen); /* from regcomp.c */
4951 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4952 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4953 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4954 SAVEPPTR(RExC_end); /* End of input for compile */
4955 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4956#endif
4957
4958 SAVEI32(PL_reg_flags); /* from regexec.c */
4959 SAVEPPTR(PL_bostr);
4960 SAVEPPTR(PL_reginput); /* String-input pointer. */
4961 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4962 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
4963 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4964 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4965 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
4966 SAVEPPTR(PL_regtill); /* How far we are required to go. */
4967 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
4968 PL_reg_start_tmp = 0;
4969 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4970 PL_reg_start_tmpl = 0;
4971 SAVEVPTR(PL_regdata);
4972 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4973 SAVEI32(PL_regnarrate); /* from regexec.c */
4974 SAVEVPTR(PL_regprogram); /* from regexec.c */
4975 SAVEINT(PL_regindent); /* from regexec.c */
4976 SAVEVPTR(PL_regcc); /* from regexec.c */
4977 SAVEVPTR(PL_curcop);
4978 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4979 SAVEVPTR(PL_reg_re); /* from regexec.c */
4980 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4981 SAVESPTR(PL_reg_sv); /* from regexec.c */
4982 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
4983 SAVEVPTR(PL_reg_magic); /* from regexec.c */
4984 SAVEI32(PL_reg_oldpos); /* from regexec.c */
4985 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4986 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
4987 SAVEI32(PL_regnpar); /* () count. */
4988 SAVEI32(PL_regsize); /* from regexec.c */
4989#ifdef DEBUGGING
4990 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
4991#endif
4992}
4993
4994static void
4995clear_re(pTHX_ void *r)
4996{
4997 ReREFCNT_dec((regexp *)r);
4998}
4999