This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Passing read only values (such as string constants) to select should
[perl5.git] / regcomp.c
... / ...
CommitLineData
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16 */
17
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
44/* We *really* need to overwrite these symbols: */
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
48# define Perl_pregfree my_regfree
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
51# define Perl_regnext my_regnext
52# define Perl_save_re_context my_save_re_context
53# define Perl_reginitcolors my_reginitcolors
54
55# define PERL_NO_GET_CONTEXT
56#endif
57
58/*SUPPRESS 112*/
59/*
60 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 *
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
64 *
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
68 *
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
71 * from defects in it.
72 *
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
75 *
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
78 *
79 *
80 **** Alterations to Henry's code are...
81 ****
82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
84 ****
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
87
88 *
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
92 */
93#include "EXTERN.h"
94#define PERL_IN_REGCOMP_C
95#include "perl.h"
96
97#ifndef PERL_IN_XSUB_RE
98# include "INTERN.h"
99#endif
100
101#define REG_COMP_C
102#include "regcomp.h"
103
104#ifdef op
105#undef op
106#endif /* op */
107
108#ifdef MSDOS
109# if defined(BUGGY_MSC6)
110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111# pragma optimize("a",off)
112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113# pragma optimize("w",on )
114# endif /* BUGGY_MSC6 */
115#endif /* MSDOS */
116
117#ifndef STATIC
118#define STATIC static
119#endif
120
121typedef struct RExC_state_t {
122 U32 flags; /* are we folding, multilining? */
123 char *precomp; /* uncompiled string. */
124 regexp *rx;
125 char *start; /* Start of input for compile */
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
129 regnode *emit_start; /* Start of emitted-code area */
130 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
133 U32 seen;
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
136 I32 extralen;
137 I32 seen_zerolen;
138 I32 seen_evals;
139 I32 utf8;
140#if ADD_TO_REGEXEC
141 char *starttry; /* -Dr: where regtry was called. */
142#define RExC_starttry (pRExC_state->starttry)
143#endif
144} RExC_state_t;
145
146#define RExC_flags (pRExC_state->flags)
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
149#define RExC_start (pRExC_state->start)
150#define RExC_end (pRExC_state->end)
151#define RExC_parse (pRExC_state->parse)
152#define RExC_whilem_seen (pRExC_state->whilem_seen)
153#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
154#define RExC_emit (pRExC_state->emit)
155#define RExC_emit_start (pRExC_state->emit_start)
156#define RExC_naughty (pRExC_state->naughty)
157#define RExC_sawback (pRExC_state->sawback)
158#define RExC_seen (pRExC_state->seen)
159#define RExC_size (pRExC_state->size)
160#define RExC_npar (pRExC_state->npar)
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
164#define RExC_utf8 (pRExC_state->utf8)
165
166#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
169
170#ifdef SPSTART
171#undef SPSTART /* dratted cpp namespace... */
172#endif
173/*
174 * Flags to be passed up and down.
175 */
176#define WORST 0 /* Worst case. */
177#define HASWIDTH 0x1 /* Known to match non-null strings. */
178#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179#define SPSTART 0x4 /* Starts with * or +. */
180#define TRYAGAIN 0x8 /* Weeded out a declaration. */
181
182/* Length of a variant. */
183
184typedef struct scan_data_t {
185 I32 len_min;
186 I32 len_delta;
187 I32 pos_min;
188 I32 pos_delta;
189 SV *last_found;
190 I32 last_end; /* min value, <0 unless valid. */
191 I32 last_start_min;
192 I32 last_start_max;
193 SV **longest; /* Either &l_fixed, or &l_float. */
194 SV *longest_fixed;
195 I32 offset_fixed;
196 SV *longest_float;
197 I32 offset_float_min;
198 I32 offset_float_max;
199 I32 flags;
200 I32 whilem_c;
201 I32 *last_closep;
202 struct regnode_charclass_class *start_class;
203} scan_data_t;
204
205/*
206 * Forward declarations for pregcomp()'s friends.
207 */
208
209static const scan_data_t zero_scan_data =
210 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
211
212#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213#define SF_BEFORE_SEOL 0x1
214#define SF_BEFORE_MEOL 0x2
215#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
217
218#ifdef NO_UNARY_PLUS
219# define SF_FIX_SHIFT_EOL (0+2)
220# define SF_FL_SHIFT_EOL (0+4)
221#else
222# define SF_FIX_SHIFT_EOL (+2)
223# define SF_FL_SHIFT_EOL (+4)
224#endif
225
226#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228
229#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231#define SF_IS_INF 0x40
232#define SF_HAS_PAR 0x80
233#define SF_IN_PAR 0x100
234#define SF_HAS_EVAL 0x200
235#define SCF_DO_SUBSTR 0x400
236#define SCF_DO_STCLASS_AND 0x0800
237#define SCF_DO_STCLASS_OR 0x1000
238#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
239#define SCF_WHILEM_VISITED_POS 0x2000
240
241#define UTF (RExC_utf8 != 0)
242#define LOC ((RExC_flags & PMf_LOCALE) != 0)
243#define FOLD ((RExC_flags & PMf_FOLD) != 0)
244
245#define OOB_UNICODE 12345678
246#define OOB_NAMEDCLASS -1
247
248#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
250
251
252/* length of regex to show in messages that don't mark a position within */
253#define RegexLengthToShowInErrorMessages 127
254
255/*
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
259 */
260#define MARKER1 "<-- HERE" /* marker as it appears in the description */
261#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
262
263#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
264
265/*
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
268 * "...".
269 */
270#define FAIL(msg) STMT_START { \
271 const char *ellipses = ""; \
272 IV len = RExC_end - RExC_precomp; \
273 \
274 if (!SIZE_ONLY) \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
279 ellipses = "..."; \
280 } \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
283} STMT_END
284
285/*
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
288 * "...".
289 */
290#define FAIL2(pat,msg) STMT_START { \
291 const char *ellipses = ""; \
292 IV len = RExC_end - RExC_precomp; \
293 \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
299 ellipses = "..."; \
300 } \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
303} STMT_END
304
305
306/*
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308 */
309#define Simple_vFAIL(m) STMT_START { \
310 const IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
313} STMT_END
314
315/*
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317 */
318#define vFAIL(m) STMT_START { \
319 if (!SIZE_ONLY) \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 Simple_vFAIL(m); \
322} STMT_END
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
327#define Simple_vFAIL2(m,a1) STMT_START { \
328 const IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
331} STMT_END
332
333/*
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335 */
336#define vFAIL2(m,a1) STMT_START { \
337 if (!SIZE_ONLY) \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
340} STMT_END
341
342
343/*
344 * Like Simple_vFAIL(), but accepts three arguments.
345 */
346#define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 const IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
350} STMT_END
351
352/*
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 */
355#define vFAIL3(m,a1,a2) STMT_START { \
356 if (!SIZE_ONLY) \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
359} STMT_END
360
361/*
362 * Like Simple_vFAIL(), but accepts four arguments.
363 */
364#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 const IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
368} STMT_END
369
370#define vWARN(loc,m) STMT_START { \
371 const IV offset = loc - RExC_precomp; \
372 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
373 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
374} STMT_END
375
376#define vWARNdep(loc,m) STMT_START { \
377 const IV offset = loc - RExC_precomp; \
378 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
379 "%s" REPORT_LOCATION, \
380 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
381} STMT_END
382
383
384#define vWARN2(loc, m, a1) STMT_START { \
385 const IV offset = loc - RExC_precomp; \
386 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
387 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
388} STMT_END
389
390#define vWARN3(loc, m, a1, a2) STMT_START { \
391 const IV offset = loc - RExC_precomp; \
392 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
393 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
394} STMT_END
395
396#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
397 const IV offset = loc - RExC_precomp; \
398 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
399 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
400} STMT_END
401
402#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
403 const IV offset = loc - RExC_precomp; \
404 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
405 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
406} STMT_END
407
408
409/* Allow for side effects in s */
410#define REGC(c,s) STMT_START { \
411 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
412} STMT_END
413
414/* Macros for recording node offsets. 20001227 mjd@plover.com
415 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
416 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
417 * Element 0 holds the number n.
418 */
419
420#define MJD_OFFSET_DEBUG(x)
421/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
422
423
424#define Set_Node_Offset_To_R(node,byte) STMT_START { \
425 if (! SIZE_ONLY) { \
426 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
427 __LINE__, (node), (byte))); \
428 if((node) < 0) { \
429 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
430 } else { \
431 RExC_offsets[2*(node)-1] = (byte); \
432 } \
433 } \
434} STMT_END
435
436#define Set_Node_Offset(node,byte) \
437 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
438#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439
440#define Set_Node_Length_To_R(node,len) STMT_START { \
441 if (! SIZE_ONLY) { \
442 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
443 __LINE__, (node), (len))); \
444 if((node) < 0) { \
445 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
446 } else { \
447 RExC_offsets[2*(node)] = (len); \
448 } \
449 } \
450} STMT_END
451
452#define Set_Node_Length(node,len) \
453 Set_Node_Length_To_R((node)-RExC_emit_start, len)
454#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
455#define Set_Node_Cur_Length(node) \
456 Set_Node_Length(node, RExC_parse - parse_start)
457
458/* Get offsets and lengths */
459#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
460#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461
462static void clear_re(pTHX_ void *r);
463
464/* Mark that we cannot extend a found fixed substring at this point.
465 Updata the longest found anchored substring and the longest found
466 floating substrings if needed. */
467
468STATIC void
469S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
470{
471 const STRLEN l = CHR_SVLEN(data->last_found);
472 const STRLEN old_l = CHR_SVLEN(*data->longest);
473
474 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
475 SvSetMagicSV(*data->longest, data->last_found);
476 if (*data->longest == data->longest_fixed) {
477 data->offset_fixed = l ? data->last_start_min : data->pos_min;
478 if (data->flags & SF_BEFORE_EOL)
479 data->flags
480 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481 else
482 data->flags &= ~SF_FIX_BEFORE_EOL;
483 }
484 else {
485 data->offset_float_min = l ? data->last_start_min : data->pos_min;
486 data->offset_float_max = (l
487 ? data->last_start_max
488 : data->pos_min + data->pos_delta);
489 if ((U32)data->offset_float_max > (U32)I32_MAX)
490 data->offset_float_max = I32_MAX;
491 if (data->flags & SF_BEFORE_EOL)
492 data->flags
493 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494 else
495 data->flags &= ~SF_FL_BEFORE_EOL;
496 }
497 }
498 SvCUR_set(data->last_found, 0);
499 {
500 SV * const sv = data->last_found;
501 MAGIC * const mg =
502 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
503 if (mg && mg->mg_len > 0)
504 mg->mg_len = 0;
505 }
506 data->last_end = -1;
507 data->flags &= ~SF_BEFORE_EOL;
508}
509
510/* Can match anything (initialization) */
511STATIC void
512S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
513{
514 ANYOF_CLASS_ZERO(cl);
515 ANYOF_BITMAP_SETALL(cl);
516 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
517 if (LOC)
518 cl->flags |= ANYOF_LOCALE;
519}
520
521/* Can match anything (initialization) */
522STATIC int
523S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
524{
525 int value;
526
527 for (value = 0; value <= ANYOF_MAX; value += 2)
528 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529 return 1;
530 if (!(cl->flags & ANYOF_UNICODE_ALL))
531 return 0;
532 if (!ANYOF_BITMAP_TESTALLSET(cl))
533 return 0;
534 return 1;
535}
536
537/* Can match anything (initialization) */
538STATIC void
539S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
540{
541 Zero(cl, 1, struct regnode_charclass_class);
542 cl->type = ANYOF;
543 cl_anything(pRExC_state, cl);
544}
545
546STATIC void
547S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
548{
549 Zero(cl, 1, struct regnode_charclass_class);
550 cl->type = ANYOF;
551 cl_anything(pRExC_state, cl);
552 if (LOC)
553 cl->flags |= ANYOF_LOCALE;
554}
555
556/* 'And' a given class with another one. Can create false positives */
557/* We assume that cl is not inverted */
558STATIC void
559S_cl_and(pTHX_ struct regnode_charclass_class *cl,
560 const struct regnode_charclass_class *and_with)
561{
562 if (!(and_with->flags & ANYOF_CLASS)
563 && !(cl->flags & ANYOF_CLASS)
564 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
565 && !(and_with->flags & ANYOF_FOLD)
566 && !(cl->flags & ANYOF_FOLD)) {
567 int i;
568
569 if (and_with->flags & ANYOF_INVERT)
570 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
571 cl->bitmap[i] &= ~and_with->bitmap[i];
572 else
573 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
574 cl->bitmap[i] &= and_with->bitmap[i];
575 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
576 if (!(and_with->flags & ANYOF_EOS))
577 cl->flags &= ~ANYOF_EOS;
578
579 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
580 !(and_with->flags & ANYOF_INVERT)) {
581 cl->flags &= ~ANYOF_UNICODE_ALL;
582 cl->flags |= ANYOF_UNICODE;
583 ARG_SET(cl, ARG(and_with));
584 }
585 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
586 !(and_with->flags & ANYOF_INVERT))
587 cl->flags &= ~ANYOF_UNICODE_ALL;
588 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
589 !(and_with->flags & ANYOF_INVERT))
590 cl->flags &= ~ANYOF_UNICODE;
591}
592
593/* 'OR' a given class with another one. Can create false positives */
594/* We assume that cl is not inverted */
595STATIC void
596S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
597{
598 if (or_with->flags & ANYOF_INVERT) {
599 /* We do not use
600 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
601 * <= (B1 | !B2) | (CL1 | !CL2)
602 * which is wasteful if CL2 is small, but we ignore CL2:
603 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
604 * XXXX Can we handle case-fold? Unclear:
605 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
606 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
607 */
608 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
609 && !(or_with->flags & ANYOF_FOLD)
610 && !(cl->flags & ANYOF_FOLD) ) {
611 int i;
612
613 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
614 cl->bitmap[i] |= ~or_with->bitmap[i];
615 } /* XXXX: logic is complicated otherwise */
616 else {
617 cl_anything(pRExC_state, cl);
618 }
619 } else {
620 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
621 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
622 && (!(or_with->flags & ANYOF_FOLD)
623 || (cl->flags & ANYOF_FOLD)) ) {
624 int i;
625
626 /* OR char bitmap and class bitmap separately */
627 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
628 cl->bitmap[i] |= or_with->bitmap[i];
629 if (or_with->flags & ANYOF_CLASS) {
630 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
631 cl->classflags[i] |= or_with->classflags[i];
632 cl->flags |= ANYOF_CLASS;
633 }
634 }
635 else { /* XXXX: logic is complicated, leave it along for a moment. */
636 cl_anything(pRExC_state, cl);
637 }
638 }
639 if (or_with->flags & ANYOF_EOS)
640 cl->flags |= ANYOF_EOS;
641
642 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
643 ARG(cl) != ARG(or_with)) {
644 cl->flags |= ANYOF_UNICODE_ALL;
645 cl->flags &= ~ANYOF_UNICODE;
646 }
647 if (or_with->flags & ANYOF_UNICODE_ALL) {
648 cl->flags |= ANYOF_UNICODE_ALL;
649 cl->flags &= ~ANYOF_UNICODE;
650 }
651}
652
653/*
654
655 make_trie(startbranch,first,last,tail,flags)
656 startbranch: the first branch in the whole branch sequence
657 first : start branch of sequence of branch-exact nodes.
658 May be the same as startbranch
659 last : Thing following the last branch.
660 May be the same as tail.
661 tail : item following the branch sequence
662 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
663
664Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
665
666A trie is an N'ary tree where the branches are determined by digital
667decomposition of the key. IE, at the root node you look up the 1st character and
668follow that branch repeat until you find the end of the branches. Nodes can be
669marked as "accepting" meaning they represent a complete word. Eg:
670
671 /he|she|his|hers/
672
673would convert into the following structure. Numbers represent states, letters
674following numbers represent valid transitions on the letter from that state, if
675the number is in square brackets it represents an accepting state, otherwise it
676will be in parenthesis.
677
678 +-h->+-e->[3]-+-r->(8)-+-s->[9]
679 | |
680 | (2)
681 | |
682 (1) +-i->(6)-+-s->[7]
683 |
684 +-s->(3)-+-h->(4)-+-e->[5]
685
686 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
687
688This shows that when matching against the string 'hers' we will begin at state 1
689read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
690then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
691is also accepting. Thus we know that we can match both 'he' and 'hers' with a
692single traverse. We store a mapping from accepting to state to which word was
693matched, and then when we have multiple possibilities we try to complete the
694rest of the regex in the order in which they occured in the alternation.
695
696The only prior NFA like behaviour that would be changed by the TRIE support is
697the silent ignoring of duplicate alternations which are of the form:
698
699 / (DUPE|DUPE) X? (?{ ... }) Y /x
700
701Thus EVAL blocks follwing a trie may be called a different number of times with
702and without the optimisation. With the optimisations dupes will be silently
703ignored. This inconsistant behaviour of EVAL type nodes is well established as
704the following demonstrates:
705
706 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
707
708which prints out 'word' three times, but
709
710 'words'=~/(word|word|word)(?{ print $1 })S/
711
712which doesnt print it out at all. This is due to other optimisations kicking in.
713
714Example of what happens on a structural level:
715
716The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
717
718 1: CURLYM[1] {1,32767}(18)
719 5: BRANCH(8)
720 6: EXACT <ac>(16)
721 8: BRANCH(11)
722 9: EXACT <ad>(16)
723 11: BRANCH(14)
724 12: EXACT <ab>(16)
725 16: SUCCEED(0)
726 17: NOTHING(18)
727 18: END(0)
728
729This would be optimizable with startbranch=5, first=5, last=16, tail=16
730and should turn into:
731
732 1: CURLYM[1] {1,32767}(18)
733 5: TRIE(16)
734 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
735 <ac>
736 <ad>
737 <ab>
738 16: SUCCEED(0)
739 17: NOTHING(18)
740 18: END(0)
741
742Cases where tail != last would be like /(?foo|bar)baz/:
743
744 1: BRANCH(4)
745 2: EXACT <foo>(8)
746 4: BRANCH(7)
747 5: EXACT <bar>(8)
748 7: TAIL(8)
749 8: EXACT <baz>(10)
750 10: END(0)
751
752which would be optimizable with startbranch=1, first=1, last=7, tail=8
753and would end up looking like:
754
755 1: TRIE(8)
756 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
757 <foo>
758 <bar>
759 7: TAIL(8)
760 8: EXACT <baz>(10)
761 10: END(0)
762
763*/
764
765#define TRIE_DEBUG_CHAR \
766 DEBUG_TRIE_COMPILE_r({ \
767 SV *tmp; \
768 if ( UTF ) { \
769 tmp = newSVpv( "", 0 ); \
770 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
771 } else { \
772 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
773 } \
774 av_push( trie->revcharmap, tmp ); \
775 })
776
777#define TRIE_READ_CHAR STMT_START { \
778 if ( UTF ) { \
779 if ( folder ) { \
780 if ( foldlen > 0 ) { \
781 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
782 foldlen -= len; \
783 scan += len; \
784 len = 0; \
785 } else { \
786 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
787 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
788 foldlen -= UNISKIP( uvc ); \
789 scan = foldbuf + UNISKIP( uvc ); \
790 } \
791 } else { \
792 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
793 } \
794 } else { \
795 uvc = (U32)*uc; \
796 len = 1; \
797 } \
798} STMT_END
799
800
801#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
802#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
803#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
804#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
805
806#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
807 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
808 TRIE_LIST_LEN( state ) *= 2; \
809 Renew( trie->states[ state ].trans.list, \
810 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
811 } \
812 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
813 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
814 TRIE_LIST_CUR( state )++; \
815} STMT_END
816
817#define TRIE_LIST_NEW(state) STMT_START { \
818 Newz( 1023, trie->states[ state ].trans.list, \
819 4, reg_trie_trans_le ); \
820 TRIE_LIST_CUR( state ) = 1; \
821 TRIE_LIST_LEN( state ) = 4; \
822} STMT_END
823
824STATIC I32
825S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
826{
827 dVAR;
828 /* first pass, loop through and scan words */
829 reg_trie_data *trie;
830 regnode *cur;
831 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
832 STRLEN len = 0;
833 UV uvc = 0;
834 U16 curword = 0;
835 U32 next_alloc = 0;
836 /* we just use folder as a flag in utf8 */
837 const U8 * const folder = ( flags == EXACTF
838 ? PL_fold
839 : ( flags == EXACTFL
840 ? PL_fold_locale
841 : NULL
842 )
843 );
844
845 const U32 data_slot = add_data( pRExC_state, 1, "t" );
846 SV *re_trie_maxbuff;
847
848 GET_RE_DEBUG_FLAGS_DECL;
849
850 Newz( 848200, trie, 1, reg_trie_data );
851 trie->refcount = 1;
852 RExC_rx->data->data[ data_slot ] = (void*)trie;
853 Newz( 848201, trie->charmap, 256, U16 );
854 DEBUG_r({
855 trie->words = newAV();
856 trie->revcharmap = newAV();
857 });
858
859
860 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
861 if (!SvIOK(re_trie_maxbuff)) {
862 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
863 }
864
865 /* -- First loop and Setup --
866
867 We first traverse the branches and scan each word to determine if it
868 contains widechars, and how many unique chars there are, this is
869 important as we have to build a table with at least as many columns as we
870 have unique chars.
871
872 We use an array of integers to represent the character codes 0..255
873 (trie->charmap) and we use a an HV* to store unicode characters. We use the
874 native representation of the character value as the key and IV's for the
875 coded index.
876
877 *TODO* If we keep track of how many times each character is used we can
878 remap the columns so that the table compression later on is more
879 efficient in terms of memory by ensuring most common value is in the
880 middle and the least common are on the outside. IMO this would be better
881 than a most to least common mapping as theres a decent chance the most
882 common letter will share a node with the least common, meaning the node
883 will not be compressable. With a middle is most common approach the worst
884 case is when we have the least common nodes twice.
885
886 */
887
888
889 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
890 regnode *noper = NEXTOPER( cur );
891 const U8 *uc = (U8*)STRING( noper );
892 const U8 * const e = uc + STR_LEN( noper );
893 STRLEN foldlen = 0;
894 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
895 const U8 *scan = (U8*)NULL;
896
897 for ( ; uc < e ; uc += len ) {
898 trie->charcount++;
899 TRIE_READ_CHAR;
900 if ( uvc < 256 ) {
901 if ( !trie->charmap[ uvc ] ) {
902 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
903 if ( folder )
904 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
905 TRIE_DEBUG_CHAR;
906 }
907 } else {
908 SV** svpp;
909 if ( !trie->widecharmap )
910 trie->widecharmap = newHV();
911
912 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
913
914 if ( !svpp )
915 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
916
917 if ( !SvTRUE( *svpp ) ) {
918 sv_setiv( *svpp, ++trie->uniquecharcount );
919 TRIE_DEBUG_CHAR;
920 }
921 }
922 }
923 trie->wordcount++;
924 } /* end first pass */
925 DEBUG_TRIE_COMPILE_r(
926 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
927 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
928 (int)trie->charcount, trie->uniquecharcount )
929 );
930
931
932 /*
933 We now know what we are dealing with in terms of unique chars and
934 string sizes so we can calculate how much memory a naive
935 representation using a flat table will take. If it's over a reasonable
936 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
937 conservative but potentially much slower representation using an array
938 of lists.
939
940 At the end we convert both representations into the same compressed
941 form that will be used in regexec.c for matching with. The latter
942 is a form that cannot be used to construct with but has memory
943 properties similar to the list form and access properties similar
944 to the table form making it both suitable for fast searches and
945 small enough that its feasable to store for the duration of a program.
946
947 See the comment in the code where the compressed table is produced
948 inplace from the flat tabe representation for an explanation of how
949 the compression works.
950
951 */
952
953
954 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
955 /*
956 Second Pass -- Array Of Lists Representation
957
958 Each state will be represented by a list of charid:state records
959 (reg_trie_trans_le) the first such element holds the CUR and LEN
960 points of the allocated array. (See defines above).
961
962 We build the initial structure using the lists, and then convert
963 it into the compressed table form which allows faster lookups
964 (but cant be modified once converted).
965
966
967 */
968
969
970 STRLEN transcount = 1;
971
972 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
973 TRIE_LIST_NEW(1);
974 next_alloc = 2;
975
976 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
977
978 regnode *noper = NEXTOPER( cur );
979 U8 *uc = (U8*)STRING( noper );
980 const U8 * const e = uc + STR_LEN( noper );
981 U32 state = 1; /* required init */
982 U16 charid = 0; /* sanity init */
983 U8 *scan = (U8*)NULL; /* sanity init */
984 STRLEN foldlen = 0; /* required init */
985 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
986
987
988 for ( ; uc < e ; uc += len ) {
989
990 TRIE_READ_CHAR;
991
992 if ( uvc < 256 ) {
993 charid = trie->charmap[ uvc ];
994 } else {
995 SV** svpp=(SV**)NULL;
996 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
997 if ( !svpp ) {
998 charid = 0;
999 } else {
1000 charid=(U16)SvIV( *svpp );
1001 }
1002 }
1003 if ( charid ) {
1004
1005 U16 check;
1006 U32 newstate = 0;
1007
1008 charid--;
1009 if ( !trie->states[ state ].trans.list ) {
1010 TRIE_LIST_NEW( state );
1011 }
1012 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1013 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1014 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1015 break;
1016 }
1017 }
1018 if ( ! newstate ) {
1019 newstate = next_alloc++;
1020 TRIE_LIST_PUSH( state, charid, newstate );
1021 transcount++;
1022 }
1023 state = newstate;
1024 } else {
1025 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1026 }
1027 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1028 }
1029
1030 if ( !trie->states[ state ].wordnum ) {
1031 /* we havent inserted this word into the structure yet. */
1032 trie->states[ state ].wordnum = ++curword;
1033
1034 DEBUG_r({
1035 /* store the word for dumping */
1036 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1037 if ( UTF ) SvUTF8_on( tmp );
1038 av_push( trie->words, tmp );
1039 });
1040
1041 } else {
1042 /* Its a dupe. So ignore it. */
1043 }
1044
1045 } /* end second pass */
1046
1047 trie->laststate = next_alloc;
1048 Renew( trie->states, next_alloc, reg_trie_state );
1049
1050 DEBUG_TRIE_COMPILE_MORE_r({
1051 U32 state;
1052
1053 /* print out the table precompression. */
1054
1055 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1056 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1057
1058 for( state=1 ; state < next_alloc ; state ++ ) {
1059 U16 charid;
1060
1061 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
1062 if ( ! trie->states[ state ].wordnum ) {
1063 PerlIO_printf( Perl_debug_log, "%5s| ","");
1064 } else {
1065 PerlIO_printf( Perl_debug_log, "W%04x| ",
1066 trie->states[ state ].wordnum
1067 );
1068 }
1069 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1070 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1071 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
1072 SvPV_nolen( *tmp ),
1073 TRIE_LIST_ITEM(state,charid).forid,
1074 (UV)TRIE_LIST_ITEM(state,charid).newstate
1075 );
1076 }
1077
1078 }
1079 PerlIO_printf( Perl_debug_log, "\n\n" );
1080 });
1081
1082 Newz( 848203, trie->trans, transcount ,reg_trie_trans );
1083 {
1084 U32 state;
1085 U32 tp = 0;
1086 U32 zp = 0;
1087
1088
1089 for( state=1 ; state < next_alloc ; state ++ ) {
1090 U32 base=0;
1091
1092 /*
1093 DEBUG_TRIE_COMPILE_MORE_r(
1094 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1095 );
1096 */
1097
1098 if (trie->states[state].trans.list) {
1099 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1100 U16 maxid=minid;
1101 U16 idx;
1102
1103 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1104 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1105 minid=TRIE_LIST_ITEM( state, idx).forid;
1106 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1107 maxid=TRIE_LIST_ITEM( state, idx).forid;
1108 }
1109 }
1110 if ( transcount < tp + maxid - minid + 1) {
1111 transcount *= 2;
1112 Renew( trie->trans, transcount, reg_trie_trans );
1113 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1114 }
1115 base = trie->uniquecharcount + tp - minid;
1116 if ( maxid == minid ) {
1117 U32 set = 0;
1118 for ( ; zp < tp ; zp++ ) {
1119 if ( ! trie->trans[ zp ].next ) {
1120 base = trie->uniquecharcount + zp - minid;
1121 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1122 trie->trans[ zp ].check = state;
1123 set = 1;
1124 break;
1125 }
1126 }
1127 if ( !set ) {
1128 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1129 trie->trans[ tp ].check = state;
1130 tp++;
1131 zp = tp;
1132 }
1133 } else {
1134 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1135 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1136 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1137 trie->trans[ tid ].check = state;
1138 }
1139 tp += ( maxid - minid + 1 );
1140 }
1141 Safefree(trie->states[ state ].trans.list);
1142 }
1143 /*
1144 DEBUG_TRIE_COMPILE_MORE_r(
1145 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1146 );
1147 */
1148 trie->states[ state ].trans.base=base;
1149 }
1150 trie->lasttrans = tp + 1;
1151 }
1152 } else {
1153 /*
1154 Second Pass -- Flat Table Representation.
1155
1156 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1157 We know that we will need Charcount+1 trans at most to store the data
1158 (one row per char at worst case) So we preallocate both structures
1159 assuming worst case.
1160
1161 We then construct the trie using only the .next slots of the entry
1162 structs.
1163
1164 We use the .check field of the first entry of the node temporarily to
1165 make compression both faster and easier by keeping track of how many non
1166 zero fields are in the node.
1167
1168 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1169 transition.
1170
1171 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1172 number representing the first entry of the node, and state as a
1173 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1174 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1175 are 2 entrys per node. eg:
1176
1177 A B A B
1178 1. 2 4 1. 3 7
1179 2. 0 3 3. 0 5
1180 3. 0 0 5. 0 0
1181 4. 0 0 7. 0 0
1182
1183 The table is internally in the right hand, idx form. However as we also
1184 have to deal with the states array which is indexed by nodenum we have to
1185 use TRIE_NODENUM() to convert.
1186
1187 */
1188
1189 Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1190 reg_trie_trans );
1191 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
1192 next_alloc = trie->uniquecharcount + 1;
1193
1194 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1195
1196 regnode *noper = NEXTOPER( cur );
1197 const U8 *uc = (U8*)STRING( noper );
1198 const U8 * const e = uc + STR_LEN( noper );
1199
1200 U32 state = 1; /* required init */
1201
1202 U16 charid = 0; /* sanity init */
1203 U32 accept_state = 0; /* sanity init */
1204 U8 *scan = (U8*)NULL; /* sanity init */
1205
1206 STRLEN foldlen = 0; /* required init */
1207 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1208
1209
1210 for ( ; uc < e ; uc += len ) {
1211
1212 TRIE_READ_CHAR;
1213
1214 if ( uvc < 256 ) {
1215 charid = trie->charmap[ uvc ];
1216 } else {
1217 SV** svpp=(SV**)NULL;
1218 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1219 if ( !svpp ) {
1220 charid = 0;
1221 } else {
1222 charid=(U16)SvIV( *svpp );
1223 }
1224 }
1225 if ( charid ) {
1226 charid--;
1227 if ( !trie->trans[ state + charid ].next ) {
1228 trie->trans[ state + charid ].next = next_alloc;
1229 trie->trans[ state ].check++;
1230 next_alloc += trie->uniquecharcount;
1231 }
1232 state = trie->trans[ state + charid ].next;
1233 } else {
1234 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1235 }
1236 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1237 }
1238
1239 accept_state = TRIE_NODENUM( state );
1240 if ( !trie->states[ accept_state ].wordnum ) {
1241 /* we havent inserted this word into the structure yet. */
1242 trie->states[ accept_state ].wordnum = ++curword;
1243
1244 DEBUG_r({
1245 /* store the word for dumping */
1246 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1247 if ( UTF ) SvUTF8_on( tmp );
1248 av_push( trie->words, tmp );
1249 });
1250
1251 } else {
1252 /* Its a dupe. So ignore it. */
1253 }
1254
1255 } /* end second pass */
1256
1257 DEBUG_TRIE_COMPILE_MORE_r({
1258 /*
1259 print out the table precompression so that we can do a visual check
1260 that they are identical.
1261 */
1262 U32 state;
1263 U16 charid;
1264 PerlIO_printf( Perl_debug_log, "\nChar : " );
1265
1266 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1267 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1268 if ( tmp ) {
1269 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1270 }
1271 }
1272
1273 PerlIO_printf( Perl_debug_log, "\nState+-" );
1274
1275 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1276 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1277 }
1278
1279 PerlIO_printf( Perl_debug_log, "\n" );
1280
1281 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1282
1283 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
1284
1285 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1286 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1287 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1288 }
1289 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1290 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
1291 } else {
1292 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
1293 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1294 }
1295 }
1296 PerlIO_printf( Perl_debug_log, "\n\n" );
1297 });
1298 {
1299 /*
1300 * Inplace compress the table.*
1301
1302 For sparse data sets the table constructed by the trie algorithm will
1303 be mostly 0/FAIL transitions or to put it another way mostly empty.
1304 (Note that leaf nodes will not contain any transitions.)
1305
1306 This algorithm compresses the tables by eliminating most such
1307 transitions, at the cost of a modest bit of extra work during lookup:
1308
1309 - Each states[] entry contains a .base field which indicates the
1310 index in the state[] array wheres its transition data is stored.
1311
1312 - If .base is 0 there are no valid transitions from that node.
1313
1314 - If .base is nonzero then charid is added to it to find an entry in
1315 the trans array.
1316
1317 -If trans[states[state].base+charid].check!=state then the
1318 transition is taken to be a 0/Fail transition. Thus if there are fail
1319 transitions at the front of the node then the .base offset will point
1320 somewhere inside the previous nodes data (or maybe even into a node
1321 even earlier), but the .check field determines if the transition is
1322 valid.
1323
1324 The following process inplace converts the table to the compressed
1325 table: We first do not compress the root node 1,and mark its all its
1326 .check pointers as 1 and set its .base pointer as 1 as well. This
1327 allows to do a DFA construction from the compressed table later, and
1328 ensures that any .base pointers we calculate later are greater than
1329 0.
1330
1331 - We set 'pos' to indicate the first entry of the second node.
1332
1333 - We then iterate over the columns of the node, finding the first and
1334 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1335 and set the .check pointers accordingly, and advance pos
1336 appropriately and repreat for the next node. Note that when we copy
1337 the next pointers we have to convert them from the original
1338 NODEIDX form to NODENUM form as the former is not valid post
1339 compression.
1340
1341 - If a node has no transitions used we mark its base as 0 and do not
1342 advance the pos pointer.
1343
1344 - If a node only has one transition we use a second pointer into the
1345 structure to fill in allocated fail transitions from other states.
1346 This pointer is independent of the main pointer and scans forward
1347 looking for null transitions that are allocated to a state. When it
1348 finds one it writes the single transition into the "hole". If the
1349 pointer doesnt find one the single transition is appeneded as normal.
1350
1351 - Once compressed we can Renew/realloc the structures to release the
1352 excess space.
1353
1354 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1355 specifically Fig 3.47 and the associated pseudocode.
1356
1357 demq
1358 */
1359 const U32 laststate = TRIE_NODENUM( next_alloc );
1360 U32 state, charid;
1361 U32 pos = 0, zp=0;
1362 trie->laststate = laststate;
1363
1364 for ( state = 1 ; state < laststate ; state++ ) {
1365 U8 flag = 0;
1366 const U32 stateidx = TRIE_NODEIDX( state );
1367 const U32 o_used = trie->trans[ stateidx ].check;
1368 U32 used = trie->trans[ stateidx ].check;
1369 trie->trans[ stateidx ].check = 0;
1370
1371 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1372 if ( flag || trie->trans[ stateidx + charid ].next ) {
1373 if ( trie->trans[ stateidx + charid ].next ) {
1374 if (o_used == 1) {
1375 for ( ; zp < pos ; zp++ ) {
1376 if ( ! trie->trans[ zp ].next ) {
1377 break;
1378 }
1379 }
1380 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1381 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1382 trie->trans[ zp ].check = state;
1383 if ( ++zp > pos ) pos = zp;
1384 break;
1385 }
1386 used--;
1387 }
1388 if ( !flag ) {
1389 flag = 1;
1390 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1391 }
1392 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1393 trie->trans[ pos ].check = state;
1394 pos++;
1395 }
1396 }
1397 }
1398 trie->lasttrans = pos + 1;
1399 Renew( trie->states, laststate + 1, reg_trie_state);
1400 DEBUG_TRIE_COMPILE_MORE_r(
1401 PerlIO_printf( Perl_debug_log,
1402 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1403 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1404 (IV)next_alloc,
1405 (IV)pos,
1406 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1407 );
1408
1409 } /* end table compress */
1410 }
1411 /* resize the trans array to remove unused space */
1412 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1413
1414 DEBUG_TRIE_COMPILE_r({
1415 U32 state;
1416 /*
1417 Now we print it out again, in a slightly different form as there is additional
1418 info we want to be able to see when its compressed. They are close enough for
1419 visual comparison though.
1420 */
1421 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1422
1423 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1424 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1425 if ( tmp ) {
1426 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1427 }
1428 }
1429 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1430
1431 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1432 PerlIO_printf( Perl_debug_log, "-----");
1433 PerlIO_printf( Perl_debug_log, "\n");
1434
1435 for( state = 1 ; state < trie->laststate ; state++ ) {
1436 const U32 base = trie->states[ state ].trans.base;
1437
1438 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
1439
1440 if ( trie->states[ state ].wordnum ) {
1441 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1442 } else {
1443 PerlIO_printf( Perl_debug_log, "%6s", "" );
1444 }
1445
1446 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
1447
1448 if ( base ) {
1449 U32 ofs = 0;
1450
1451 while( ( base + ofs < trie->uniquecharcount ) ||
1452 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1453 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1454 ofs++;
1455
1456 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
1457
1458 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1459 if ( ( base + ofs >= trie->uniquecharcount ) &&
1460 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1461 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1462 {
1463 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1464 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1465 } else {
1466 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1467 }
1468 }
1469
1470 PerlIO_printf( Perl_debug_log, "]");
1471
1472 }
1473 PerlIO_printf( Perl_debug_log, "\n" );
1474 }
1475 });
1476
1477 {
1478 /* now finally we "stitch in" the new TRIE node
1479 This means we convert either the first branch or the first Exact,
1480 depending on whether the thing following (in 'last') is a branch
1481 or not and whther first is the startbranch (ie is it a sub part of
1482 the alternation or is it the whole thing.)
1483 Assuming its a sub part we conver the EXACT otherwise we convert
1484 the whole branch sequence, including the first.
1485 */
1486 regnode *convert;
1487
1488
1489
1490
1491 if ( first == startbranch && OP( last ) != BRANCH ) {
1492 convert = first;
1493 } else {
1494 convert = NEXTOPER( first );
1495 NEXT_OFF( first ) = (U16)(last - first);
1496 }
1497
1498 OP( convert ) = TRIE + (U8)( flags - EXACT );
1499 NEXT_OFF( convert ) = (U16)(tail - convert);
1500 ARG_SET( convert, data_slot );
1501
1502 /* tells us if we need to handle accept buffers specially */
1503 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1504
1505
1506 /* needed for dumping*/
1507 DEBUG_r({
1508 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1509 /* We now need to mark all of the space originally used by the
1510 branches as optimized away. This keeps the dumpuntil from
1511 throwing a wobbly as it doesnt use regnext() to traverse the
1512 opcodes.
1513 */
1514 while( optimize < last ) {
1515 OP( optimize ) = OPTIMIZED;
1516 optimize++;
1517 }
1518 });
1519 } /* end node insert */
1520 return 1;
1521}
1522
1523
1524
1525/*
1526 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1527 * These need to be revisited when a newer toolchain becomes available.
1528 */
1529#if defined(__sparc64__) && defined(__GNUC__)
1530# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1531# undef SPARC64_GCC_WORKAROUND
1532# define SPARC64_GCC_WORKAROUND 1
1533# endif
1534#endif
1535
1536/* REx optimizer. Converts nodes into quickier variants "in place".
1537 Finds fixed substrings. */
1538
1539/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1540 to the position after last scanned or to NULL. */
1541
1542
1543STATIC I32
1544S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
1545 /* scanp: Start here (read-write). */
1546 /* deltap: Write maxlen-minlen here. */
1547 /* last: Stop before this one. */
1548{
1549 I32 min = 0, pars = 0, code;
1550 regnode *scan = *scanp, *next;
1551 I32 delta = 0;
1552 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1553 int is_inf_internal = 0; /* The studied chunk is infinite */
1554 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1555 scan_data_t data_fake;
1556 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1557 SV *re_trie_maxbuff = NULL;
1558
1559 GET_RE_DEBUG_FLAGS_DECL;
1560
1561 while (scan && OP(scan) != END && scan < last) {
1562 /* Peephole optimizer: */
1563 DEBUG_OPTIMISE_r({
1564 SV *mysv=sv_newmortal();
1565 regprop( mysv, scan);
1566 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1567 (int)depth*2, "", SvPV_nolen(mysv), PTR2UV(scan));
1568 });
1569
1570 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1571 /* Merge several consecutive EXACTish nodes into one. */
1572 regnode *n = regnext(scan);
1573 U32 stringok = 1;
1574#ifdef DEBUGGING
1575 regnode *stop = scan;
1576#endif
1577
1578 next = scan + NODE_SZ_STR(scan);
1579 /* Skip NOTHING, merge EXACT*. */
1580 while (n &&
1581 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1582 (stringok && (OP(n) == OP(scan))))
1583 && NEXT_OFF(n)
1584 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1585 if (OP(n) == TAIL || n > next)
1586 stringok = 0;
1587 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1588 NEXT_OFF(scan) += NEXT_OFF(n);
1589 next = n + NODE_STEP_REGNODE;
1590#ifdef DEBUGGING
1591 if (stringok)
1592 stop = n;
1593#endif
1594 n = regnext(n);
1595 }
1596 else if (stringok) {
1597 const int oldl = STR_LEN(scan);
1598 regnode *nnext = regnext(n);
1599
1600 if (oldl + STR_LEN(n) > U8_MAX)
1601 break;
1602 NEXT_OFF(scan) += NEXT_OFF(n);
1603 STR_LEN(scan) += STR_LEN(n);
1604 next = n + NODE_SZ_STR(n);
1605 /* Now we can overwrite *n : */
1606 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1607#ifdef DEBUGGING
1608 stop = next - 1;
1609#endif
1610 n = nnext;
1611 }
1612 }
1613
1614 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1615/*
1616 Two problematic code points in Unicode casefolding of EXACT nodes:
1617
1618 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1619 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1620
1621 which casefold to
1622
1623 Unicode UTF-8
1624
1625 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1626 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1627
1628 This means that in case-insensitive matching (or "loose matching",
1629 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1630 length of the above casefolded versions) can match a target string
1631 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1632 This would rather mess up the minimum length computation.
1633
1634 What we'll do is to look for the tail four bytes, and then peek
1635 at the preceding two bytes to see whether we need to decrease
1636 the minimum length by four (six minus two).
1637
1638 Thanks to the design of UTF-8, there cannot be false matches:
1639 A sequence of valid UTF-8 bytes cannot be a subsequence of
1640 another valid sequence of UTF-8 bytes.
1641
1642*/
1643 char *s0 = STRING(scan), *s, *t;
1644 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
1645 const char * const t0 = "\xcc\x88\xcc\x81";
1646 const char * const t1 = t0 + 3;
1647
1648 for (s = s0 + 2;
1649 s < s2 && (t = ninstr(s, s1, t0, t1));
1650 s = t + 4) {
1651 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1652 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1653 min -= 4;
1654 }
1655 }
1656
1657#ifdef DEBUGGING
1658 /* Allow dumping */
1659 n = scan + NODE_SZ_STR(scan);
1660 while (n <= stop) {
1661 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1662 OP(n) = OPTIMIZED;
1663 NEXT_OFF(n) = 0;
1664 }
1665 n++;
1666 }
1667#endif
1668 }
1669
1670
1671
1672 /* Follow the next-chain of the current node and optimize
1673 away all the NOTHINGs from it. */
1674 if (OP(scan) != CURLYX) {
1675 const int max = (reg_off_by_arg[OP(scan)]
1676 ? I32_MAX
1677 /* I32 may be smaller than U16 on CRAYs! */
1678 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1679 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1680 int noff;
1681 regnode *n = scan;
1682
1683 /* Skip NOTHING and LONGJMP. */
1684 while ((n = regnext(n))
1685 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1686 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1687 && off + noff < max)
1688 off += noff;
1689 if (reg_off_by_arg[OP(scan)])
1690 ARG(scan) = off;
1691 else
1692 NEXT_OFF(scan) = off;
1693 }
1694
1695 /* The principal pseudo-switch. Cannot be a switch, since we
1696 look into several different things. */
1697 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1698 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1699 next = regnext(scan);
1700 code = OP(scan);
1701 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1702
1703 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1704 I32 max1 = 0, min1 = I32_MAX, num = 0;
1705 struct regnode_charclass_class accum;
1706 regnode *startbranch=scan;
1707
1708 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1709 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1710 if (flags & SCF_DO_STCLASS)
1711 cl_init_zero(pRExC_state, &accum);
1712
1713 while (OP(scan) == code) {
1714 I32 deltanext, minnext, f = 0, fake;
1715 struct regnode_charclass_class this_class;
1716
1717 num++;
1718 data_fake.flags = 0;
1719 if (data) {
1720 data_fake.whilem_c = data->whilem_c;
1721 data_fake.last_closep = data->last_closep;
1722 }
1723 else
1724 data_fake.last_closep = &fake;
1725 next = regnext(scan);
1726 scan = NEXTOPER(scan);
1727 if (code != BRANCH)
1728 scan = NEXTOPER(scan);
1729 if (flags & SCF_DO_STCLASS) {
1730 cl_init(pRExC_state, &this_class);
1731 data_fake.start_class = &this_class;
1732 f = SCF_DO_STCLASS_AND;
1733 }
1734 if (flags & SCF_WHILEM_VISITED_POS)
1735 f |= SCF_WHILEM_VISITED_POS;
1736
1737 /* we suppose the run is continuous, last=next...*/
1738 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1739 next, &data_fake, f,depth+1);
1740 if (min1 > minnext)
1741 min1 = minnext;
1742 if (max1 < minnext + deltanext)
1743 max1 = minnext + deltanext;
1744 if (deltanext == I32_MAX)
1745 is_inf = is_inf_internal = 1;
1746 scan = next;
1747 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1748 pars++;
1749 if (data && (data_fake.flags & SF_HAS_EVAL))
1750 data->flags |= SF_HAS_EVAL;
1751 if (data)
1752 data->whilem_c = data_fake.whilem_c;
1753 if (flags & SCF_DO_STCLASS)
1754 cl_or(pRExC_state, &accum, &this_class);
1755 if (code == SUSPEND)
1756 break;
1757 }
1758 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1759 min1 = 0;
1760 if (flags & SCF_DO_SUBSTR) {
1761 data->pos_min += min1;
1762 data->pos_delta += max1 - min1;
1763 if (max1 != min1 || is_inf)
1764 data->longest = &(data->longest_float);
1765 }
1766 min += min1;
1767 delta += max1 - min1;
1768 if (flags & SCF_DO_STCLASS_OR) {
1769 cl_or(pRExC_state, data->start_class, &accum);
1770 if (min1) {
1771 cl_and(data->start_class, &and_with);
1772 flags &= ~SCF_DO_STCLASS;
1773 }
1774 }
1775 else if (flags & SCF_DO_STCLASS_AND) {
1776 if (min1) {
1777 cl_and(data->start_class, &accum);
1778 flags &= ~SCF_DO_STCLASS;
1779 }
1780 else {
1781 /* Switch to OR mode: cache the old value of
1782 * data->start_class */
1783 StructCopy(data->start_class, &and_with,
1784 struct regnode_charclass_class);
1785 flags &= ~SCF_DO_STCLASS_AND;
1786 StructCopy(&accum, data->start_class,
1787 struct regnode_charclass_class);
1788 flags |= SCF_DO_STCLASS_OR;
1789 data->start_class->flags |= ANYOF_EOS;
1790 }
1791 }
1792
1793 /* demq.
1794
1795 Assuming this was/is a branch we are dealing with: 'scan' now
1796 points at the item that follows the branch sequence, whatever
1797 it is. We now start at the beginning of the sequence and look
1798 for subsequences of
1799
1800 BRANCH->EXACT=>X
1801 BRANCH->EXACT=>X
1802
1803 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1804
1805 If we can find such a subseqence we need to turn the first
1806 element into a trie and then add the subsequent branch exact
1807 strings to the trie.
1808
1809 We have two cases
1810
1811 1. patterns where the whole set of branch can be converted to a trie,
1812
1813 2. patterns where only a subset of the alternations can be
1814 converted to a trie.
1815
1816 In case 1 we can replace the whole set with a single regop
1817 for the trie. In case 2 we need to keep the start and end
1818 branchs so
1819
1820 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1821 becomes BRANCH TRIE; BRANCH X;
1822
1823 Hypthetically when we know the regex isnt anchored we can
1824 turn a case 1 into a DFA and let it rip... Every time it finds a match
1825 it would just call its tail, no WHILEM/CURLY needed.
1826
1827 */
1828 if (DO_TRIE) {
1829 if (!re_trie_maxbuff) {
1830 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1831 if (!SvIOK(re_trie_maxbuff))
1832 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1833 }
1834 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1835 regnode *cur;
1836 regnode *first = (regnode *)NULL;
1837 regnode *last = (regnode *)NULL;
1838 regnode *tail = scan;
1839 U8 optype = 0;
1840 U32 count=0;
1841
1842#ifdef DEBUGGING
1843 SV *mysv = sv_newmortal(); /* for dumping */
1844#endif
1845 /* var tail is used because there may be a TAIL
1846 regop in the way. Ie, the exacts will point to the
1847 thing following the TAIL, but the last branch will
1848 point at the TAIL. So we advance tail. If we
1849 have nested (?:) we may have to move through several
1850 tails.
1851 */
1852
1853 while ( OP( tail ) == TAIL ) {
1854 /* this is the TAIL generated by (?:) */
1855 tail = regnext( tail );
1856 }
1857
1858 DEBUG_OPTIMISE_r({
1859 regprop( mysv, tail );
1860 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1861 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen( mysv ),
1862 (RExC_seen_evals) ? "[EVAL]" : ""
1863 );
1864 });
1865 /*
1866
1867 step through the branches, cur represents each
1868 branch, noper is the first thing to be matched
1869 as part of that branch and noper_next is the
1870 regnext() of that node. if noper is an EXACT
1871 and noper_next is the same as scan (our current
1872 position in the regex) then the EXACT branch is
1873 a possible optimization target. Once we have
1874 two or more consequetive such branches we can
1875 create a trie of the EXACT's contents and stich
1876 it in place. If the sequence represents all of
1877 the branches we eliminate the whole thing and
1878 replace it with a single TRIE. If it is a
1879 subsequence then we need to stitch it in. This
1880 means the first branch has to remain, and needs
1881 to be repointed at the item on the branch chain
1882 following the last branch optimized. This could
1883 be either a BRANCH, in which case the
1884 subsequence is internal, or it could be the
1885 item following the branch sequence in which
1886 case the subsequence is at the end.
1887
1888 */
1889
1890 /* dont use tail as the end marker for this traverse */
1891 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1892 regnode *noper = NEXTOPER( cur );
1893 regnode *noper_next = regnext( noper );
1894
1895 DEBUG_OPTIMISE_r({
1896 regprop( mysv, cur);
1897 PerlIO_printf( Perl_debug_log, "%*s%s",
1898 (int)depth * 2 + 2," ", SvPV_nolen( mysv ) );
1899
1900 regprop( mysv, noper);
1901 PerlIO_printf( Perl_debug_log, " -> %s",
1902 SvPV_nolen(mysv));
1903
1904 if ( noper_next ) {
1905 regprop( mysv, noper_next );
1906 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1907 SvPV_nolen(mysv));
1908 }
1909 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1910 first, last, cur );
1911 });
1912 if ( ( first ? OP( noper ) == optype
1913 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1914 && noper_next == tail && count<U16_MAX)
1915 {
1916 count++;
1917 if ( !first ) {
1918 first = cur;
1919 optype = OP( noper );
1920 } else {
1921 DEBUG_OPTIMISE_r(
1922 if (!last ) {
1923 regprop( mysv, first);
1924 PerlIO_printf( Perl_debug_log, "%*s%s",
1925 (int)depth * 2 + 2, "F:", SvPV_nolen( mysv ) );
1926 regprop( mysv, NEXTOPER(first) );
1927 PerlIO_printf( Perl_debug_log, " -> %s\n",
1928 SvPV_nolen( mysv ) );
1929 }
1930 );
1931 last = cur;
1932 DEBUG_OPTIMISE_r({
1933 regprop( mysv, cur);
1934 PerlIO_printf( Perl_debug_log, "%*s%s",
1935 (int)depth * 2 + 2, "N:", SvPV_nolen( mysv ) );
1936 regprop( mysv, noper );
1937 PerlIO_printf( Perl_debug_log, " -> %s\n",
1938 SvPV_nolen( mysv ) );
1939 });
1940 }
1941 } else {
1942 if ( last ) {
1943 DEBUG_OPTIMISE_r(
1944 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1945 (int)depth * 2 + 2, "E:", "**END**" );
1946 );
1947 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1948 }
1949 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1950 && noper_next == tail )
1951 {
1952 count = 1;
1953 first = cur;
1954 optype = OP( noper );
1955 } else {
1956 count = 0;
1957 first = NULL;
1958 optype = 0;
1959 }
1960 last = NULL;
1961 }
1962 }
1963 DEBUG_OPTIMISE_r({
1964 regprop( mysv, cur);
1965 PerlIO_printf( Perl_debug_log,
1966 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
1967 " ", SvPV_nolen( mysv ), first, last, cur);
1968
1969 });
1970 if ( last ) {
1971 DEBUG_OPTIMISE_r(
1972 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1973 (int)depth * 2 + 2, "E:", "==END==" );
1974 );
1975 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1976 }
1977 }
1978 }
1979 }
1980 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1981 scan = NEXTOPER(NEXTOPER(scan));
1982 } else /* single branch is optimized. */
1983 scan = NEXTOPER(scan);
1984 continue;
1985 }
1986 else if (OP(scan) == EXACT) {
1987 I32 l = STR_LEN(scan);
1988 UV uc = *((U8*)STRING(scan));
1989 if (UTF) {
1990 const U8 * const s = (U8*)STRING(scan);
1991 l = utf8_length(s, s + l);
1992 uc = utf8_to_uvchr(s, NULL);
1993 }
1994 min += l;
1995 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
1996 /* The code below prefers earlier match for fixed
1997 offset, later match for variable offset. */
1998 if (data->last_end == -1) { /* Update the start info. */
1999 data->last_start_min = data->pos_min;
2000 data->last_start_max = is_inf
2001 ? I32_MAX : data->pos_min + data->pos_delta;
2002 }
2003 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2004 {
2005 SV * sv = data->last_found;
2006 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2007 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2008 if (mg && mg->mg_len >= 0)
2009 mg->mg_len += utf8_length((U8*)STRING(scan),
2010 (U8*)STRING(scan)+STR_LEN(scan));
2011 }
2012 if (UTF)
2013 SvUTF8_on(data->last_found);
2014 data->last_end = data->pos_min + l;
2015 data->pos_min += l; /* As in the first entry. */
2016 data->flags &= ~SF_BEFORE_EOL;
2017 }
2018 if (flags & SCF_DO_STCLASS_AND) {
2019 /* Check whether it is compatible with what we know already! */
2020 int compat = 1;
2021
2022 if (uc >= 0x100 ||
2023 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2024 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2025 && (!(data->start_class->flags & ANYOF_FOLD)
2026 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2027 )
2028 compat = 0;
2029 ANYOF_CLASS_ZERO(data->start_class);
2030 ANYOF_BITMAP_ZERO(data->start_class);
2031 if (compat)
2032 ANYOF_BITMAP_SET(data->start_class, uc);
2033 data->start_class->flags &= ~ANYOF_EOS;
2034 if (uc < 0x100)
2035 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2036 }
2037 else if (flags & SCF_DO_STCLASS_OR) {
2038 /* false positive possible if the class is case-folded */
2039 if (uc < 0x100)
2040 ANYOF_BITMAP_SET(data->start_class, uc);
2041 else
2042 data->start_class->flags |= ANYOF_UNICODE_ALL;
2043 data->start_class->flags &= ~ANYOF_EOS;
2044 cl_and(data->start_class, &and_with);
2045 }
2046 flags &= ~SCF_DO_STCLASS;
2047 }
2048 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2049 I32 l = STR_LEN(scan);
2050 UV uc = *((U8*)STRING(scan));
2051
2052 /* Search for fixed substrings supports EXACT only. */
2053 if (flags & SCF_DO_SUBSTR)
2054 scan_commit(pRExC_state, data);
2055 if (UTF) {
2056 U8 *s = (U8 *)STRING(scan);
2057 l = utf8_length(s, s + l);
2058 uc = utf8_to_uvchr(s, NULL);
2059 }
2060 min += l;
2061 if (data && (flags & SCF_DO_SUBSTR))
2062 data->pos_min += l;
2063 if (flags & SCF_DO_STCLASS_AND) {
2064 /* Check whether it is compatible with what we know already! */
2065 int compat = 1;
2066
2067 if (uc >= 0x100 ||
2068 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2069 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2070 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2071 compat = 0;
2072 ANYOF_CLASS_ZERO(data->start_class);
2073 ANYOF_BITMAP_ZERO(data->start_class);
2074 if (compat) {
2075 ANYOF_BITMAP_SET(data->start_class, uc);
2076 data->start_class->flags &= ~ANYOF_EOS;
2077 data->start_class->flags |= ANYOF_FOLD;
2078 if (OP(scan) == EXACTFL)
2079 data->start_class->flags |= ANYOF_LOCALE;
2080 }
2081 }
2082 else if (flags & SCF_DO_STCLASS_OR) {
2083 if (data->start_class->flags & ANYOF_FOLD) {
2084 /* false positive possible if the class is case-folded.
2085 Assume that the locale settings are the same... */
2086 if (uc < 0x100)
2087 ANYOF_BITMAP_SET(data->start_class, uc);
2088 data->start_class->flags &= ~ANYOF_EOS;
2089 }
2090 cl_and(data->start_class, &and_with);
2091 }
2092 flags &= ~SCF_DO_STCLASS;
2093 }
2094 else if (strchr((const char*)PL_varies,OP(scan))) {
2095 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2096 I32 f = flags, pos_before = 0;
2097 regnode *oscan = scan;
2098 struct regnode_charclass_class this_class;
2099 struct regnode_charclass_class *oclass = NULL;
2100 I32 next_is_eval = 0;
2101
2102 switch (PL_regkind[(U8)OP(scan)]) {
2103 case WHILEM: /* End of (?:...)* . */
2104 scan = NEXTOPER(scan);
2105 goto finish;
2106 case PLUS:
2107 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2108 next = NEXTOPER(scan);
2109 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2110 mincount = 1;
2111 maxcount = REG_INFTY;
2112 next = regnext(scan);
2113 scan = NEXTOPER(scan);
2114 goto do_curly;
2115 }
2116 }
2117 if (flags & SCF_DO_SUBSTR)
2118 data->pos_min++;
2119 min++;
2120 /* Fall through. */
2121 case STAR:
2122 if (flags & SCF_DO_STCLASS) {
2123 mincount = 0;
2124 maxcount = REG_INFTY;
2125 next = regnext(scan);
2126 scan = NEXTOPER(scan);
2127 goto do_curly;
2128 }
2129 is_inf = is_inf_internal = 1;
2130 scan = regnext(scan);
2131 if (flags & SCF_DO_SUBSTR) {
2132 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2133 data->longest = &(data->longest_float);
2134 }
2135 goto optimize_curly_tail;
2136 case CURLY:
2137 mincount = ARG1(scan);
2138 maxcount = ARG2(scan);
2139 next = regnext(scan);
2140 if (OP(scan) == CURLYX) {
2141 I32 lp = (data ? *(data->last_closep) : 0);
2142 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2143 }
2144 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2145 next_is_eval = (OP(scan) == EVAL);
2146 do_curly:
2147 if (flags & SCF_DO_SUBSTR) {
2148 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2149 pos_before = data->pos_min;
2150 }
2151 if (data) {
2152 fl = data->flags;
2153 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2154 if (is_inf)
2155 data->flags |= SF_IS_INF;
2156 }
2157 if (flags & SCF_DO_STCLASS) {
2158 cl_init(pRExC_state, &this_class);
2159 oclass = data->start_class;
2160 data->start_class = &this_class;
2161 f |= SCF_DO_STCLASS_AND;
2162 f &= ~SCF_DO_STCLASS_OR;
2163 }
2164 /* These are the cases when once a subexpression
2165 fails at a particular position, it cannot succeed
2166 even after backtracking at the enclosing scope.
2167
2168 XXXX what if minimal match and we are at the
2169 initial run of {n,m}? */
2170 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2171 f &= ~SCF_WHILEM_VISITED_POS;
2172
2173 /* This will finish on WHILEM, setting scan, or on NULL: */
2174 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2175 (mincount == 0
2176 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2177
2178 if (flags & SCF_DO_STCLASS)
2179 data->start_class = oclass;
2180 if (mincount == 0 || minnext == 0) {
2181 if (flags & SCF_DO_STCLASS_OR) {
2182 cl_or(pRExC_state, data->start_class, &this_class);
2183 }
2184 else if (flags & SCF_DO_STCLASS_AND) {
2185 /* Switch to OR mode: cache the old value of
2186 * data->start_class */
2187 StructCopy(data->start_class, &and_with,
2188 struct regnode_charclass_class);
2189 flags &= ~SCF_DO_STCLASS_AND;
2190 StructCopy(&this_class, data->start_class,
2191 struct regnode_charclass_class);
2192 flags |= SCF_DO_STCLASS_OR;
2193 data->start_class->flags |= ANYOF_EOS;
2194 }
2195 } else { /* Non-zero len */
2196 if (flags & SCF_DO_STCLASS_OR) {
2197 cl_or(pRExC_state, data->start_class, &this_class);
2198 cl_and(data->start_class, &and_with);
2199 }
2200 else if (flags & SCF_DO_STCLASS_AND)
2201 cl_and(data->start_class, &this_class);
2202 flags &= ~SCF_DO_STCLASS;
2203 }
2204 if (!scan) /* It was not CURLYX, but CURLY. */
2205 scan = next;
2206 if (ckWARN(WARN_REGEXP)
2207 /* ? quantifier ok, except for (?{ ... }) */
2208 && (next_is_eval || !(mincount == 0 && maxcount == 1))
2209 && (minnext == 0) && (deltanext == 0)
2210 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2211 && maxcount <= REG_INFTY/3) /* Complement check for big count */
2212 {
2213 vWARN(RExC_parse,
2214 "Quantifier unexpected on zero-length expression");
2215 }
2216
2217 min += minnext * mincount;
2218 is_inf_internal |= ((maxcount == REG_INFTY
2219 && (minnext + deltanext) > 0)
2220 || deltanext == I32_MAX);
2221 is_inf |= is_inf_internal;
2222 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2223
2224 /* Try powerful optimization CURLYX => CURLYN. */
2225 if ( OP(oscan) == CURLYX && data
2226 && data->flags & SF_IN_PAR
2227 && !(data->flags & SF_HAS_EVAL)
2228 && !deltanext && minnext == 1 ) {
2229 /* Try to optimize to CURLYN. */
2230 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2231 regnode *nxt1 = nxt;
2232#ifdef DEBUGGING
2233 regnode *nxt2;
2234#endif
2235
2236 /* Skip open. */
2237 nxt = regnext(nxt);
2238 if (!strchr((const char*)PL_simple,OP(nxt))
2239 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2240 && STR_LEN(nxt) == 1))
2241 goto nogo;
2242#ifdef DEBUGGING
2243 nxt2 = nxt;
2244#endif
2245 nxt = regnext(nxt);
2246 if (OP(nxt) != CLOSE)
2247 goto nogo;
2248 /* Now we know that nxt2 is the only contents: */
2249 oscan->flags = (U8)ARG(nxt);
2250 OP(oscan) = CURLYN;
2251 OP(nxt1) = NOTHING; /* was OPEN. */
2252#ifdef DEBUGGING
2253 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2254 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2255 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2256 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2257 OP(nxt + 1) = OPTIMIZED; /* was count. */
2258 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2259#endif
2260 }
2261 nogo:
2262
2263 /* Try optimization CURLYX => CURLYM. */
2264 if ( OP(oscan) == CURLYX && data
2265 && !(data->flags & SF_HAS_PAR)
2266 && !(data->flags & SF_HAS_EVAL)
2267 && !deltanext /* atom is fixed width */
2268 && minnext != 0 /* CURLYM can't handle zero width */
2269 ) {
2270 /* XXXX How to optimize if data == 0? */
2271 /* Optimize to a simpler form. */
2272 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2273 regnode *nxt2;
2274
2275 OP(oscan) = CURLYM;
2276 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2277 && (OP(nxt2) != WHILEM))
2278 nxt = nxt2;
2279 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2280 /* Need to optimize away parenths. */
2281 if (data->flags & SF_IN_PAR) {
2282 /* Set the parenth number. */
2283 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2284
2285 if (OP(nxt) != CLOSE)
2286 FAIL("Panic opt close");
2287 oscan->flags = (U8)ARG(nxt);
2288 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2289 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2290#ifdef DEBUGGING
2291 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2292 OP(nxt + 1) = OPTIMIZED; /* was count. */
2293 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2294 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2295#endif
2296#if 0
2297 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2298 regnode *nnxt = regnext(nxt1);
2299
2300 if (nnxt == nxt) {
2301 if (reg_off_by_arg[OP(nxt1)])
2302 ARG_SET(nxt1, nxt2 - nxt1);
2303 else if (nxt2 - nxt1 < U16_MAX)
2304 NEXT_OFF(nxt1) = nxt2 - nxt1;
2305 else
2306 OP(nxt) = NOTHING; /* Cannot beautify */
2307 }
2308 nxt1 = nnxt;
2309 }
2310#endif
2311 /* Optimize again: */
2312 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2313 NULL, 0,depth+1);
2314 }
2315 else
2316 oscan->flags = 0;
2317 }
2318 else if ((OP(oscan) == CURLYX)
2319 && (flags & SCF_WHILEM_VISITED_POS)
2320 /* See the comment on a similar expression above.
2321 However, this time it not a subexpression
2322 we care about, but the expression itself. */
2323 && (maxcount == REG_INFTY)
2324 && data && ++data->whilem_c < 16) {
2325 /* This stays as CURLYX, we can put the count/of pair. */
2326 /* Find WHILEM (as in regexec.c) */
2327 regnode *nxt = oscan + NEXT_OFF(oscan);
2328
2329 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2330 nxt += ARG(nxt);
2331 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2332 | (RExC_whilem_seen << 4)); /* On WHILEM */
2333 }
2334 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2335 pars++;
2336 if (flags & SCF_DO_SUBSTR) {
2337 SV *last_str = Nullsv;
2338 int counted = mincount != 0;
2339
2340 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2341#if defined(SPARC64_GCC_WORKAROUND)
2342 I32 b = 0;
2343 STRLEN l = 0;
2344 char *s = NULL;
2345 I32 old = 0;
2346
2347 if (pos_before >= data->last_start_min)
2348 b = pos_before;
2349 else
2350 b = data->last_start_min;
2351
2352 l = 0;
2353 s = SvPV(data->last_found, l);
2354 old = b - data->last_start_min;
2355
2356#else
2357 I32 b = pos_before >= data->last_start_min
2358 ? pos_before : data->last_start_min;
2359 STRLEN l;
2360 char *s = SvPV(data->last_found, l);
2361 I32 old = b - data->last_start_min;
2362#endif
2363
2364 if (UTF)
2365 old = utf8_hop((U8*)s, old) - (U8*)s;
2366
2367 l -= old;
2368 /* Get the added string: */
2369 last_str = newSVpvn(s + old, l);
2370 if (UTF)
2371 SvUTF8_on(last_str);
2372 if (deltanext == 0 && pos_before == b) {
2373 /* What was added is a constant string */
2374 if (mincount > 1) {
2375 SvGROW(last_str, (mincount * l) + 1);
2376 repeatcpy(SvPVX(last_str) + l,
2377 SvPVX_const(last_str), l, mincount - 1);
2378 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2379 /* Add additional parts. */
2380 SvCUR_set(data->last_found,
2381 SvCUR(data->last_found) - l);
2382 sv_catsv(data->last_found, last_str);
2383 {
2384 SV * sv = data->last_found;
2385 MAGIC *mg =
2386 SvUTF8(sv) && SvMAGICAL(sv) ?
2387 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2388 if (mg && mg->mg_len >= 0)
2389 mg->mg_len += CHR_SVLEN(last_str);
2390 }
2391 data->last_end += l * (mincount - 1);
2392 }
2393 } else {
2394 /* start offset must point into the last copy */
2395 data->last_start_min += minnext * (mincount - 1);
2396 data->last_start_max += is_inf ? I32_MAX
2397 : (maxcount - 1) * (minnext + data->pos_delta);
2398 }
2399 }
2400 /* It is counted once already... */
2401 data->pos_min += minnext * (mincount - counted);
2402 data->pos_delta += - counted * deltanext +
2403 (minnext + deltanext) * maxcount - minnext * mincount;
2404 if (mincount != maxcount) {
2405 /* Cannot extend fixed substrings found inside
2406 the group. */
2407 scan_commit(pRExC_state,data);
2408 if (mincount && last_str) {
2409 sv_setsv(data->last_found, last_str);
2410 data->last_end = data->pos_min;
2411 data->last_start_min =
2412 data->pos_min - CHR_SVLEN(last_str);
2413 data->last_start_max = is_inf
2414 ? I32_MAX
2415 : data->pos_min + data->pos_delta
2416 - CHR_SVLEN(last_str);
2417 }
2418 data->longest = &(data->longest_float);
2419 }
2420 SvREFCNT_dec(last_str);
2421 }
2422 if (data && (fl & SF_HAS_EVAL))
2423 data->flags |= SF_HAS_EVAL;
2424 optimize_curly_tail:
2425 if (OP(oscan) != CURLYX) {
2426 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2427 && NEXT_OFF(next))
2428 NEXT_OFF(oscan) += NEXT_OFF(next);
2429 }
2430 continue;
2431 default: /* REF and CLUMP only? */
2432 if (flags & SCF_DO_SUBSTR) {
2433 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2434 data->longest = &(data->longest_float);
2435 }
2436 is_inf = is_inf_internal = 1;
2437 if (flags & SCF_DO_STCLASS_OR)
2438 cl_anything(pRExC_state, data->start_class);
2439 flags &= ~SCF_DO_STCLASS;
2440 break;
2441 }
2442 }
2443 else if (strchr((const char*)PL_simple,OP(scan))) {
2444 int value = 0;
2445
2446 if (flags & SCF_DO_SUBSTR) {
2447 scan_commit(pRExC_state,data);
2448 data->pos_min++;
2449 }
2450 min++;
2451 if (flags & SCF_DO_STCLASS) {
2452 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2453
2454 /* Some of the logic below assumes that switching
2455 locale on will only add false positives. */
2456 switch (PL_regkind[(U8)OP(scan)]) {
2457 case SANY:
2458 default:
2459 do_default:
2460 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2461 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2462 cl_anything(pRExC_state, data->start_class);
2463 break;
2464 case REG_ANY:
2465 if (OP(scan) == SANY)
2466 goto do_default;
2467 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2468 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2469 || (data->start_class->flags & ANYOF_CLASS));
2470 cl_anything(pRExC_state, data->start_class);
2471 }
2472 if (flags & SCF_DO_STCLASS_AND || !value)
2473 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2474 break;
2475 case ANYOF:
2476 if (flags & SCF_DO_STCLASS_AND)
2477 cl_and(data->start_class,
2478 (struct regnode_charclass_class*)scan);
2479 else
2480 cl_or(pRExC_state, data->start_class,
2481 (struct regnode_charclass_class*)scan);
2482 break;
2483 case ALNUM:
2484 if (flags & SCF_DO_STCLASS_AND) {
2485 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2486 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2487 for (value = 0; value < 256; value++)
2488 if (!isALNUM(value))
2489 ANYOF_BITMAP_CLEAR(data->start_class, value);
2490 }
2491 }
2492 else {
2493 if (data->start_class->flags & ANYOF_LOCALE)
2494 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2495 else {
2496 for (value = 0; value < 256; value++)
2497 if (isALNUM(value))
2498 ANYOF_BITMAP_SET(data->start_class, value);
2499 }
2500 }
2501 break;
2502 case ALNUML:
2503 if (flags & SCF_DO_STCLASS_AND) {
2504 if (data->start_class->flags & ANYOF_LOCALE)
2505 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2506 }
2507 else {
2508 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2509 data->start_class->flags |= ANYOF_LOCALE;
2510 }
2511 break;
2512 case NALNUM:
2513 if (flags & SCF_DO_STCLASS_AND) {
2514 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2515 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2516 for (value = 0; value < 256; value++)
2517 if (isALNUM(value))
2518 ANYOF_BITMAP_CLEAR(data->start_class, value);
2519 }
2520 }
2521 else {
2522 if (data->start_class->flags & ANYOF_LOCALE)
2523 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2524 else {
2525 for (value = 0; value < 256; value++)
2526 if (!isALNUM(value))
2527 ANYOF_BITMAP_SET(data->start_class, value);
2528 }
2529 }
2530 break;
2531 case NALNUML:
2532 if (flags & SCF_DO_STCLASS_AND) {
2533 if (data->start_class->flags & ANYOF_LOCALE)
2534 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2535 }
2536 else {
2537 data->start_class->flags |= ANYOF_LOCALE;
2538 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2539 }
2540 break;
2541 case SPACE:
2542 if (flags & SCF_DO_STCLASS_AND) {
2543 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2544 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2545 for (value = 0; value < 256; value++)
2546 if (!isSPACE(value))
2547 ANYOF_BITMAP_CLEAR(data->start_class, value);
2548 }
2549 }
2550 else {
2551 if (data->start_class->flags & ANYOF_LOCALE)
2552 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2553 else {
2554 for (value = 0; value < 256; value++)
2555 if (isSPACE(value))
2556 ANYOF_BITMAP_SET(data->start_class, value);
2557 }
2558 }
2559 break;
2560 case SPACEL:
2561 if (flags & SCF_DO_STCLASS_AND) {
2562 if (data->start_class->flags & ANYOF_LOCALE)
2563 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2564 }
2565 else {
2566 data->start_class->flags |= ANYOF_LOCALE;
2567 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2568 }
2569 break;
2570 case NSPACE:
2571 if (flags & SCF_DO_STCLASS_AND) {
2572 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2573 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2574 for (value = 0; value < 256; value++)
2575 if (isSPACE(value))
2576 ANYOF_BITMAP_CLEAR(data->start_class, value);
2577 }
2578 }
2579 else {
2580 if (data->start_class->flags & ANYOF_LOCALE)
2581 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2582 else {
2583 for (value = 0; value < 256; value++)
2584 if (!isSPACE(value))
2585 ANYOF_BITMAP_SET(data->start_class, value);
2586 }
2587 }
2588 break;
2589 case NSPACEL:
2590 if (flags & SCF_DO_STCLASS_AND) {
2591 if (data->start_class->flags & ANYOF_LOCALE) {
2592 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2593 for (value = 0; value < 256; value++)
2594 if (!isSPACE(value))
2595 ANYOF_BITMAP_CLEAR(data->start_class, value);
2596 }
2597 }
2598 else {
2599 data->start_class->flags |= ANYOF_LOCALE;
2600 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2601 }
2602 break;
2603 case DIGIT:
2604 if (flags & SCF_DO_STCLASS_AND) {
2605 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2606 for (value = 0; value < 256; value++)
2607 if (!isDIGIT(value))
2608 ANYOF_BITMAP_CLEAR(data->start_class, value);
2609 }
2610 else {
2611 if (data->start_class->flags & ANYOF_LOCALE)
2612 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2613 else {
2614 for (value = 0; value < 256; value++)
2615 if (isDIGIT(value))
2616 ANYOF_BITMAP_SET(data->start_class, value);
2617 }
2618 }
2619 break;
2620 case NDIGIT:
2621 if (flags & SCF_DO_STCLASS_AND) {
2622 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2623 for (value = 0; value < 256; value++)
2624 if (isDIGIT(value))
2625 ANYOF_BITMAP_CLEAR(data->start_class, value);
2626 }
2627 else {
2628 if (data->start_class->flags & ANYOF_LOCALE)
2629 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2630 else {
2631 for (value = 0; value < 256; value++)
2632 if (!isDIGIT(value))
2633 ANYOF_BITMAP_SET(data->start_class, value);
2634 }
2635 }
2636 break;
2637 }
2638 if (flags & SCF_DO_STCLASS_OR)
2639 cl_and(data->start_class, &and_with);
2640 flags &= ~SCF_DO_STCLASS;
2641 }
2642 }
2643 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2644 data->flags |= (OP(scan) == MEOL
2645 ? SF_BEFORE_MEOL
2646 : SF_BEFORE_SEOL);
2647 }
2648 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2649 /* Lookbehind, or need to calculate parens/evals/stclass: */
2650 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2651 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2652 /* Lookahead/lookbehind */
2653 I32 deltanext, minnext, fake = 0;
2654 regnode *nscan;
2655 struct regnode_charclass_class intrnl;
2656 int f = 0;
2657
2658 data_fake.flags = 0;
2659 if (data) {
2660 data_fake.whilem_c = data->whilem_c;
2661 data_fake.last_closep = data->last_closep;
2662 }
2663 else
2664 data_fake.last_closep = &fake;
2665 if ( flags & SCF_DO_STCLASS && !scan->flags
2666 && OP(scan) == IFMATCH ) { /* Lookahead */
2667 cl_init(pRExC_state, &intrnl);
2668 data_fake.start_class = &intrnl;
2669 f |= SCF_DO_STCLASS_AND;
2670 }
2671 if (flags & SCF_WHILEM_VISITED_POS)
2672 f |= SCF_WHILEM_VISITED_POS;
2673 next = regnext(scan);
2674 nscan = NEXTOPER(NEXTOPER(scan));
2675 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2676 if (scan->flags) {
2677 if (deltanext) {
2678 vFAIL("Variable length lookbehind not implemented");
2679 }
2680 else if (minnext > U8_MAX) {
2681 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2682 }
2683 scan->flags = (U8)minnext;
2684 }
2685 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2686 pars++;
2687 if (data && (data_fake.flags & SF_HAS_EVAL))
2688 data->flags |= SF_HAS_EVAL;
2689 if (data)
2690 data->whilem_c = data_fake.whilem_c;
2691 if (f & SCF_DO_STCLASS_AND) {
2692 const int was = (data->start_class->flags & ANYOF_EOS);
2693
2694 cl_and(data->start_class, &intrnl);
2695 if (was)
2696 data->start_class->flags |= ANYOF_EOS;
2697 }
2698 }
2699 else if (OP(scan) == OPEN) {
2700 pars++;
2701 }
2702 else if (OP(scan) == CLOSE) {
2703 if ((I32)ARG(scan) == is_par) {
2704 next = regnext(scan);
2705
2706 if ( next && (OP(next) != WHILEM) && next < last)
2707 is_par = 0; /* Disable optimization */
2708 }
2709 if (data)
2710 *(data->last_closep) = ARG(scan);
2711 }
2712 else if (OP(scan) == EVAL) {
2713 if (data)
2714 data->flags |= SF_HAS_EVAL;
2715 }
2716 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2717 if (flags & SCF_DO_SUBSTR) {
2718 scan_commit(pRExC_state,data);
2719 data->longest = &(data->longest_float);
2720 }
2721 is_inf = is_inf_internal = 1;
2722 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2723 cl_anything(pRExC_state, data->start_class);
2724 flags &= ~SCF_DO_STCLASS;
2725 }
2726 /* Else: zero-length, ignore. */
2727 scan = regnext(scan);
2728 }
2729
2730 finish:
2731 *scanp = scan;
2732 *deltap = is_inf_internal ? I32_MAX : delta;
2733 if (flags & SCF_DO_SUBSTR && is_inf)
2734 data->pos_delta = I32_MAX - data->pos_min;
2735 if (is_par > U8_MAX)
2736 is_par = 0;
2737 if (is_par && pars==1 && data) {
2738 data->flags |= SF_IN_PAR;
2739 data->flags &= ~SF_HAS_PAR;
2740 }
2741 else if (pars && data) {
2742 data->flags |= SF_HAS_PAR;
2743 data->flags &= ~SF_IN_PAR;
2744 }
2745 if (flags & SCF_DO_STCLASS_OR)
2746 cl_and(data->start_class, &and_with);
2747 return min;
2748}
2749
2750STATIC I32
2751S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
2752{
2753 if (RExC_rx->data) {
2754 Renewc(RExC_rx->data,
2755 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2756 char, struct reg_data);
2757 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2758 RExC_rx->data->count += n;
2759 }
2760 else {
2761 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2762 char, struct reg_data);
2763 New(1208, RExC_rx->data->what, n, U8);
2764 RExC_rx->data->count = n;
2765 }
2766 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2767 return RExC_rx->data->count - n;
2768}
2769
2770void
2771Perl_reginitcolors(pTHX)
2772{
2773 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
2774 if (s) {
2775 char *t = savepv(s);
2776 int i = 0;
2777 PL_colors[0] = t;
2778 while (++i < 6) {
2779 t = strchr(t, '\t');
2780 if (t) {
2781 *t = '\0';
2782 PL_colors[i] = ++t;
2783 }
2784 else
2785 PL_colors[i] = t = (char *)"";
2786 }
2787 } else {
2788 int i = 0;
2789 while (i < 6)
2790 PL_colors[i++] = (char *)"";
2791 }
2792 PL_colorset = 1;
2793}
2794
2795
2796/*
2797 - pregcomp - compile a regular expression into internal code
2798 *
2799 * We can't allocate space until we know how big the compiled form will be,
2800 * but we can't compile it (and thus know how big it is) until we've got a
2801 * place to put the code. So we cheat: we compile it twice, once with code
2802 * generation turned off and size counting turned on, and once "for real".
2803 * This also means that we don't allocate space until we are sure that the
2804 * thing really will compile successfully, and we never have to move the
2805 * code and thus invalidate pointers into it. (Note that it has to be in
2806 * one piece because free() must be able to free it all.) [NB: not true in perl]
2807 *
2808 * Beware that the optimization-preparation code in here knows about some
2809 * of the structure of the compiled regexp. [I'll say.]
2810 */
2811regexp *
2812Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2813{
2814 register regexp *r;
2815 regnode *scan;
2816 regnode *first;
2817 I32 flags;
2818 I32 minlen = 0;
2819 I32 sawplus = 0;
2820 I32 sawopen = 0;
2821 scan_data_t data;
2822 RExC_state_t RExC_state;
2823 RExC_state_t *pRExC_state = &RExC_state;
2824
2825 GET_RE_DEBUG_FLAGS_DECL;
2826
2827 if (exp == NULL)
2828 FAIL("NULL regexp argument");
2829
2830 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2831
2832 RExC_precomp = exp;
2833 DEBUG_r(if (!PL_colorset) reginitcolors());
2834 DEBUG_COMPILE_r({
2835 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2836 PL_colors[4],PL_colors[5],PL_colors[0],
2837 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2838 });
2839 RExC_flags = pm->op_pmflags;
2840 RExC_sawback = 0;
2841
2842 RExC_seen = 0;
2843 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2844 RExC_seen_evals = 0;
2845 RExC_extralen = 0;
2846
2847 /* First pass: determine size, legality. */
2848 RExC_parse = exp;
2849 RExC_start = exp;
2850 RExC_end = xend;
2851 RExC_naughty = 0;
2852 RExC_npar = 1;
2853 RExC_size = 0L;
2854 RExC_emit = &PL_regdummy;
2855 RExC_whilem_seen = 0;
2856#if 0 /* REGC() is (currently) a NOP at the first pass.
2857 * Clever compilers notice this and complain. --jhi */
2858 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2859#endif
2860 if (reg(pRExC_state, 0, &flags) == NULL) {
2861 RExC_precomp = Nullch;
2862 return(NULL);
2863 }
2864 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2865
2866 /* Small enough for pointer-storage convention?
2867 If extralen==0, this means that we will not need long jumps. */
2868 if (RExC_size >= 0x10000L && RExC_extralen)
2869 RExC_size += RExC_extralen;
2870 else
2871 RExC_extralen = 0;
2872 if (RExC_whilem_seen > 15)
2873 RExC_whilem_seen = 15;
2874
2875 /* Allocate space and initialize. */
2876 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2877 char, regexp);
2878 if (r == NULL)
2879 FAIL("Regexp out of space");
2880
2881#ifdef DEBUGGING
2882 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2883 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2884#endif
2885 r->refcnt = 1;
2886 r->prelen = xend - exp;
2887 r->precomp = savepvn(RExC_precomp, r->prelen);
2888 r->subbeg = NULL;
2889#ifdef PERL_OLD_COPY_ON_WRITE
2890 r->saved_copy = Nullsv;
2891#endif
2892 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2893 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2894
2895 r->substrs = 0; /* Useful during FAIL. */
2896 r->startp = 0; /* Useful during FAIL. */
2897 r->endp = 0; /* Useful during FAIL. */
2898
2899 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2900 if (r->offsets) {
2901 r->offsets[0] = RExC_size;
2902 }
2903 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2904 "%s %"UVuf" bytes for offset annotations.\n",
2905 r->offsets ? "Got" : "Couldn't get",
2906 (UV)((2*RExC_size+1) * sizeof(U32))));
2907
2908 RExC_rx = r;
2909
2910 /* Second pass: emit code. */
2911 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2912 RExC_parse = exp;
2913 RExC_end = xend;
2914 RExC_naughty = 0;
2915 RExC_npar = 1;
2916 RExC_emit_start = r->program;
2917 RExC_emit = r->program;
2918 /* Store the count of eval-groups for security checks: */
2919 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2920 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2921 r->data = 0;
2922 if (reg(pRExC_state, 0, &flags) == NULL)
2923 return(NULL);
2924
2925
2926 /* Dig out information for optimizations. */
2927 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2928 pm->op_pmflags = RExC_flags;
2929 if (UTF)
2930 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2931 r->regstclass = NULL;
2932 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2933 r->reganch |= ROPT_NAUGHTY;
2934 scan = r->program + 1; /* First BRANCH. */
2935
2936 /* XXXX To minimize changes to RE engine we always allocate
2937 3-units-long substrs field. */
2938 Newz(1004, r->substrs, 1, struct reg_substr_data);
2939
2940 StructCopy(&zero_scan_data, &data, scan_data_t);
2941 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2942 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2943 I32 fake;
2944 STRLEN longest_float_length, longest_fixed_length;
2945 struct regnode_charclass_class ch_class;
2946 int stclass_flag;
2947 I32 last_close = 0;
2948
2949 first = scan;
2950 /* Skip introductions and multiplicators >= 1. */
2951 while ((OP(first) == OPEN && (sawopen = 1)) ||
2952 /* An OR of *one* alternative - should not happen now. */
2953 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2954 (OP(first) == PLUS) ||
2955 (OP(first) == MINMOD) ||
2956 /* An {n,m} with n>0 */
2957 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2958 if (OP(first) == PLUS)
2959 sawplus = 1;
2960 else
2961 first += regarglen[(U8)OP(first)];
2962 first = NEXTOPER(first);
2963 }
2964
2965 /* Starting-point info. */
2966 again:
2967 if (PL_regkind[(U8)OP(first)] == EXACT) {
2968 if (OP(first) == EXACT)
2969 ; /* Empty, get anchored substr later. */
2970 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2971 r->regstclass = first;
2972 }
2973 else if (strchr((const char*)PL_simple,OP(first)))
2974 r->regstclass = first;
2975 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2976 PL_regkind[(U8)OP(first)] == NBOUND)
2977 r->regstclass = first;
2978 else if (PL_regkind[(U8)OP(first)] == BOL) {
2979 r->reganch |= (OP(first) == MBOL
2980 ? ROPT_ANCH_MBOL
2981 : (OP(first) == SBOL
2982 ? ROPT_ANCH_SBOL
2983 : ROPT_ANCH_BOL));
2984 first = NEXTOPER(first);
2985 goto again;
2986 }
2987 else if (OP(first) == GPOS) {
2988 r->reganch |= ROPT_ANCH_GPOS;
2989 first = NEXTOPER(first);
2990 goto again;
2991 }
2992 else if (!sawopen && (OP(first) == STAR &&
2993 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
2994 !(r->reganch & ROPT_ANCH) )
2995 {
2996 /* turn .* into ^.* with an implied $*=1 */
2997 const int type =
2998 (OP(NEXTOPER(first)) == REG_ANY)
2999 ? ROPT_ANCH_MBOL
3000 : ROPT_ANCH_SBOL;
3001 r->reganch |= type | ROPT_IMPLICIT;
3002 first = NEXTOPER(first);
3003 goto again;
3004 }
3005 if (sawplus && (!sawopen || !RExC_sawback)
3006 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3007 /* x+ must match at the 1st pos of run of x's */
3008 r->reganch |= ROPT_SKIP;
3009
3010 /* Scan is after the zeroth branch, first is atomic matcher. */
3011 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3012 (IV)(first - scan + 1)));
3013 /*
3014 * If there's something expensive in the r.e., find the
3015 * longest literal string that must appear and make it the
3016 * regmust. Resolve ties in favor of later strings, since
3017 * the regstart check works with the beginning of the r.e.
3018 * and avoiding duplication strengthens checking. Not a
3019 * strong reason, but sufficient in the absence of others.
3020 * [Now we resolve ties in favor of the earlier string if
3021 * it happens that c_offset_min has been invalidated, since the
3022 * earlier string may buy us something the later one won't.]
3023 */
3024 minlen = 0;
3025
3026 data.longest_fixed = newSVpvn("",0);
3027 data.longest_float = newSVpvn("",0);
3028 data.last_found = newSVpvn("",0);
3029 data.longest = &(data.longest_fixed);
3030 first = scan;
3031 if (!r->regstclass) {
3032 cl_init(pRExC_state, &ch_class);
3033 data.start_class = &ch_class;
3034 stclass_flag = SCF_DO_STCLASS_AND;
3035 } else /* XXXX Check for BOUND? */
3036 stclass_flag = 0;
3037 data.last_closep = &last_close;
3038
3039 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3040 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3041 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3042 && data.last_start_min == 0 && data.last_end > 0
3043 && !RExC_seen_zerolen
3044 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3045 r->reganch |= ROPT_CHECK_ALL;
3046 scan_commit(pRExC_state, &data);
3047 SvREFCNT_dec(data.last_found);
3048
3049 longest_float_length = CHR_SVLEN(data.longest_float);
3050 if (longest_float_length
3051 || (data.flags & SF_FL_BEFORE_EOL
3052 && (!(data.flags & SF_FL_BEFORE_MEOL)
3053 || (RExC_flags & PMf_MULTILINE)))) {
3054 int t;
3055
3056 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3057 && data.offset_fixed == data.offset_float_min
3058 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3059 goto remove_float; /* As in (a)+. */
3060
3061 if (SvUTF8(data.longest_float)) {
3062 r->float_utf8 = data.longest_float;
3063 r->float_substr = Nullsv;
3064 } else {
3065 r->float_substr = data.longest_float;
3066 r->float_utf8 = Nullsv;
3067 }
3068 r->float_min_offset = data.offset_float_min;
3069 r->float_max_offset = data.offset_float_max;
3070 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3071 && (!(data.flags & SF_FL_BEFORE_MEOL)
3072 || (RExC_flags & PMf_MULTILINE)));
3073 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3074 }
3075 else {
3076 remove_float:
3077 r->float_substr = r->float_utf8 = Nullsv;
3078 SvREFCNT_dec(data.longest_float);
3079 longest_float_length = 0;
3080 }
3081
3082 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3083 if (longest_fixed_length
3084 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3085 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3086 || (RExC_flags & PMf_MULTILINE)))) {
3087 int t;
3088
3089 if (SvUTF8(data.longest_fixed)) {
3090 r->anchored_utf8 = data.longest_fixed;
3091 r->anchored_substr = Nullsv;
3092 } else {
3093 r->anchored_substr = data.longest_fixed;
3094 r->anchored_utf8 = Nullsv;
3095 }
3096 r->anchored_offset = data.offset_fixed;
3097 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3098 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3099 || (RExC_flags & PMf_MULTILINE)));
3100 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3101 }
3102 else {
3103 r->anchored_substr = r->anchored_utf8 = Nullsv;
3104 SvREFCNT_dec(data.longest_fixed);
3105 longest_fixed_length = 0;
3106 }
3107 if (r->regstclass
3108 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3109 r->regstclass = NULL;
3110 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3111 && stclass_flag
3112 && !(data.start_class->flags & ANYOF_EOS)
3113 && !cl_is_anything(data.start_class))
3114 {
3115 const I32 n = add_data(pRExC_state, 1, "f");
3116
3117 New(1006, RExC_rx->data->data[n], 1,
3118 struct regnode_charclass_class);
3119 StructCopy(data.start_class,
3120 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3121 struct regnode_charclass_class);
3122 r->regstclass = (regnode*)RExC_rx->data->data[n];
3123 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3124 PL_regdata = r->data; /* for regprop() */
3125 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3126 regprop(sv, (regnode*)data.start_class);
3127 PerlIO_printf(Perl_debug_log,
3128 "synthetic stclass \"%s\".\n",
3129 SvPVX_const(sv));});
3130 }
3131
3132 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3133 if (longest_fixed_length > longest_float_length) {
3134 r->check_substr = r->anchored_substr;
3135 r->check_utf8 = r->anchored_utf8;
3136 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3137 if (r->reganch & ROPT_ANCH_SINGLE)
3138 r->reganch |= ROPT_NOSCAN;
3139 }
3140 else {
3141 r->check_substr = r->float_substr;
3142 r->check_utf8 = r->float_utf8;
3143 r->check_offset_min = data.offset_float_min;
3144 r->check_offset_max = data.offset_float_max;
3145 }
3146 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3147 This should be changed ASAP! */
3148 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3149 r->reganch |= RE_USE_INTUIT;
3150 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3151 r->reganch |= RE_INTUIT_TAIL;
3152 }
3153 }
3154 else {
3155 /* Several toplevels. Best we can is to set minlen. */
3156 I32 fake;
3157 struct regnode_charclass_class ch_class;
3158 I32 last_close = 0;
3159
3160 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3161 scan = r->program + 1;
3162 cl_init(pRExC_state, &ch_class);
3163 data.start_class = &ch_class;
3164 data.last_closep = &last_close;
3165 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3166 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3167 = r->float_substr = r->float_utf8 = Nullsv;
3168 if (!(data.start_class->flags & ANYOF_EOS)
3169 && !cl_is_anything(data.start_class))
3170 {
3171 const I32 n = add_data(pRExC_state, 1, "f");
3172
3173 New(1006, RExC_rx->data->data[n], 1,
3174 struct regnode_charclass_class);
3175 StructCopy(data.start_class,
3176 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3177 struct regnode_charclass_class);
3178 r->regstclass = (regnode*)RExC_rx->data->data[n];
3179 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3180 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3181 regprop(sv, (regnode*)data.start_class);
3182 PerlIO_printf(Perl_debug_log,
3183 "synthetic stclass \"%s\".\n",
3184 SvPVX_const(sv));});
3185 }
3186 }
3187
3188 r->minlen = minlen;
3189 if (RExC_seen & REG_SEEN_GPOS)
3190 r->reganch |= ROPT_GPOS_SEEN;
3191 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3192 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3193 if (RExC_seen & REG_SEEN_EVAL)
3194 r->reganch |= ROPT_EVAL_SEEN;
3195 if (RExC_seen & REG_SEEN_CANY)
3196 r->reganch |= ROPT_CANY_SEEN;
3197 Newz(1002, r->startp, RExC_npar, I32);
3198 Newz(1002, r->endp, RExC_npar, I32);
3199 PL_regdata = r->data; /* for regprop() */
3200 DEBUG_COMPILE_r(regdump(r));
3201 return(r);
3202}
3203
3204/*
3205 - reg - regular expression, i.e. main body or parenthesized thing
3206 *
3207 * Caller must absorb opening parenthesis.
3208 *
3209 * Combining parenthesis handling with the base level of regular expression
3210 * is a trifle forced, but the need to tie the tails of the branches to what
3211 * follows makes it hard to avoid.
3212 */
3213STATIC regnode *
3214S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3215 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3216{
3217 dVAR;
3218 register regnode *ret; /* Will be the head of the group. */
3219 register regnode *br;
3220 register regnode *lastbr;
3221 register regnode *ender = 0;
3222 register I32 parno = 0;
3223 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
3224
3225 /* for (?g), (?gc), and (?o) warnings; warning
3226 about (?c) will warn about (?g) -- japhy */
3227
3228 I32 wastedflags = 0x00,
3229 wasted_o = 0x01,
3230 wasted_g = 0x02,
3231 wasted_gc = 0x02 | 0x04,
3232 wasted_c = 0x04;
3233
3234 char * parse_start = RExC_parse; /* MJD */
3235 char * const oregcomp_parse = RExC_parse;
3236 char c;
3237
3238 *flagp = 0; /* Tentatively. */
3239
3240
3241 /* Make an OPEN node, if parenthesized. */
3242 if (paren) {
3243 if (*RExC_parse == '?') { /* (?...) */
3244 U32 posflags = 0, negflags = 0;
3245 U32 *flagsp = &posflags;
3246 int logical = 0;
3247 const char * const seqstart = RExC_parse;
3248
3249 RExC_parse++;
3250 paren = *RExC_parse++;
3251 ret = NULL; /* For look-ahead/behind. */
3252 switch (paren) {
3253 case '<': /* (?<...) */
3254 RExC_seen |= REG_SEEN_LOOKBEHIND;
3255 if (*RExC_parse == '!')
3256 paren = ',';
3257 if (*RExC_parse != '=' && *RExC_parse != '!')
3258 goto unknown;
3259 RExC_parse++;
3260 case '=': /* (?=...) */
3261 case '!': /* (?!...) */
3262 RExC_seen_zerolen++;
3263 case ':': /* (?:...) */
3264 case '>': /* (?>...) */
3265 break;
3266 case '$': /* (?$...) */
3267 case '@': /* (?@...) */
3268 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3269 break;
3270 case '#': /* (?#...) */
3271 while (*RExC_parse && *RExC_parse != ')')
3272 RExC_parse++;
3273 if (*RExC_parse != ')')
3274 FAIL("Sequence (?#... not terminated");
3275 nextchar(pRExC_state);
3276 *flagp = TRYAGAIN;
3277 return NULL;
3278 case 'p': /* (?p...) */
3279 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3280 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3281 /* FALL THROUGH*/
3282 case '?': /* (??...) */
3283 logical = 1;
3284 if (*RExC_parse != '{')
3285 goto unknown;
3286 paren = *RExC_parse++;
3287 /* FALL THROUGH */
3288 case '{': /* (?{...}) */
3289 {
3290 I32 count = 1, n = 0;
3291 char c;
3292 char *s = RExC_parse;
3293 SV *sv;
3294 OP_4tree *sop, *rop;
3295
3296 RExC_seen_zerolen++;
3297 RExC_seen |= REG_SEEN_EVAL;
3298 while (count && (c = *RExC_parse)) {
3299 if (c == '\\' && RExC_parse[1])
3300 RExC_parse++;
3301 else if (c == '{')
3302 count++;
3303 else if (c == '}')
3304 count--;
3305 RExC_parse++;
3306 }
3307 if (*RExC_parse != ')')
3308 {
3309 RExC_parse = s;
3310 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3311 }
3312 if (!SIZE_ONLY) {
3313 PAD *pad;
3314
3315 if (RExC_parse - 1 - s)
3316 sv = newSVpvn(s, RExC_parse - 1 - s);
3317 else
3318 sv = newSVpvn("", 0);
3319
3320 ENTER;
3321 Perl_save_re_context(aTHX);
3322 rop = sv_compile_2op(sv, &sop, "re", &pad);
3323 sop->op_private |= OPpREFCOUNTED;
3324 /* re_dup will OpREFCNT_inc */
3325 OpREFCNT_set(sop, 1);
3326 LEAVE;
3327
3328 n = add_data(pRExC_state, 3, "nop");
3329 RExC_rx->data->data[n] = (void*)rop;
3330 RExC_rx->data->data[n+1] = (void*)sop;
3331 RExC_rx->data->data[n+2] = (void*)pad;
3332 SvREFCNT_dec(sv);
3333 }
3334 else { /* First pass */
3335 if (PL_reginterp_cnt < ++RExC_seen_evals
3336 && IN_PERL_RUNTIME)
3337 /* No compiled RE interpolated, has runtime
3338 components ===> unsafe. */
3339 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3340 if (PL_tainting && PL_tainted)
3341 FAIL("Eval-group in insecure regular expression");
3342 if (IN_PERL_COMPILETIME)
3343 PL_cv_has_eval = 1;
3344 }
3345
3346 nextchar(pRExC_state);
3347 if (logical) {
3348 ret = reg_node(pRExC_state, LOGICAL);
3349 if (!SIZE_ONLY)
3350 ret->flags = 2;
3351 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3352 /* deal with the length of this later - MJD */
3353 return ret;
3354 }
3355 ret = reganode(pRExC_state, EVAL, n);
3356 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3357 Set_Node_Offset(ret, parse_start);
3358 return ret;
3359 }
3360 case '(': /* (?(?{...})...) and (?(?=...)...) */
3361 {
3362 if (RExC_parse[0] == '?') { /* (?(?...)) */
3363 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3364 || RExC_parse[1] == '<'
3365 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3366 I32 flag;
3367
3368 ret = reg_node(pRExC_state, LOGICAL);
3369 if (!SIZE_ONLY)
3370 ret->flags = 1;
3371 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3372 goto insert_if;
3373 }
3374 }
3375 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3376 /* (?(1)...) */
3377 parno = atoi(RExC_parse++);
3378
3379 while (isDIGIT(*RExC_parse))
3380 RExC_parse++;
3381 ret = reganode(pRExC_state, GROUPP, parno);
3382
3383 if ((c = *nextchar(pRExC_state)) != ')')
3384 vFAIL("Switch condition not recognized");
3385 insert_if:
3386 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3387 br = regbranch(pRExC_state, &flags, 1);
3388 if (br == NULL)
3389 br = reganode(pRExC_state, LONGJMP, 0);
3390 else
3391 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3392 c = *nextchar(pRExC_state);
3393 if (flags&HASWIDTH)
3394 *flagp |= HASWIDTH;
3395 if (c == '|') {
3396 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3397 regbranch(pRExC_state, &flags, 1);
3398 regtail(pRExC_state, ret, lastbr);
3399 if (flags&HASWIDTH)
3400 *flagp |= HASWIDTH;
3401 c = *nextchar(pRExC_state);
3402 }
3403 else
3404 lastbr = NULL;
3405 if (c != ')')
3406 vFAIL("Switch (?(condition)... contains too many branches");
3407 ender = reg_node(pRExC_state, TAIL);
3408 regtail(pRExC_state, br, ender);
3409 if (lastbr) {
3410 regtail(pRExC_state, lastbr, ender);
3411 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3412 }
3413 else
3414 regtail(pRExC_state, ret, ender);
3415 return ret;
3416 }
3417 else {
3418 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3419 }
3420 }
3421 case 0:
3422 RExC_parse--; /* for vFAIL to print correctly */
3423 vFAIL("Sequence (? incomplete");
3424 break;
3425 default:
3426 --RExC_parse;
3427 parse_flags: /* (?i) */
3428 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3429 /* (?g), (?gc) and (?o) are useless here
3430 and must be globally applied -- japhy */
3431
3432 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3433 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3434 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3435 if (! (wastedflags & wflagbit) ) {
3436 wastedflags |= wflagbit;
3437 vWARN5(
3438 RExC_parse + 1,
3439 "Useless (%s%c) - %suse /%c modifier",
3440 flagsp == &negflags ? "?-" : "?",
3441 *RExC_parse,
3442 flagsp == &negflags ? "don't " : "",
3443 *RExC_parse
3444 );
3445 }
3446 }
3447 }
3448 else if (*RExC_parse == 'c') {
3449 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3450 if (! (wastedflags & wasted_c) ) {
3451 wastedflags |= wasted_gc;
3452 vWARN3(
3453 RExC_parse + 1,
3454 "Useless (%sc) - %suse /gc modifier",
3455 flagsp == &negflags ? "?-" : "?",
3456 flagsp == &negflags ? "don't " : ""
3457 );
3458 }
3459 }
3460 }
3461 else { pmflag(flagsp, *RExC_parse); }
3462
3463 ++RExC_parse;
3464 }
3465 if (*RExC_parse == '-') {
3466 flagsp = &negflags;
3467 wastedflags = 0; /* reset so (?g-c) warns twice */
3468 ++RExC_parse;
3469 goto parse_flags;
3470 }
3471 RExC_flags |= posflags;
3472 RExC_flags &= ~negflags;
3473 if (*RExC_parse == ':') {
3474 RExC_parse++;
3475 paren = ':';
3476 break;
3477 }
3478 unknown:
3479 if (*RExC_parse != ')') {
3480 RExC_parse++;
3481 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3482 }
3483 nextchar(pRExC_state);
3484 *flagp = TRYAGAIN;
3485 return NULL;
3486 }
3487 }
3488 else { /* (...) */
3489 parno = RExC_npar;
3490 RExC_npar++;
3491 ret = reganode(pRExC_state, OPEN, parno);
3492 Set_Node_Length(ret, 1); /* MJD */
3493 Set_Node_Offset(ret, RExC_parse); /* MJD */
3494 open = 1;
3495 }
3496 }
3497 else /* ! paren */
3498 ret = NULL;
3499
3500 /* Pick up the branches, linking them together. */
3501 parse_start = RExC_parse; /* MJD */
3502 br = regbranch(pRExC_state, &flags, 1);
3503 /* branch_len = (paren != 0); */
3504
3505 if (br == NULL)
3506 return(NULL);
3507 if (*RExC_parse == '|') {
3508 if (!SIZE_ONLY && RExC_extralen) {
3509 reginsert(pRExC_state, BRANCHJ, br);
3510 }
3511 else { /* MJD */
3512 reginsert(pRExC_state, BRANCH, br);
3513 Set_Node_Length(br, paren != 0);
3514 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3515 }
3516 have_branch = 1;
3517 if (SIZE_ONLY)
3518 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3519 }
3520 else if (paren == ':') {
3521 *flagp |= flags&SIMPLE;
3522 }
3523 if (open) { /* Starts with OPEN. */
3524 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3525 }
3526 else if (paren != '?') /* Not Conditional */
3527 ret = br;
3528 *flagp |= flags & (SPSTART | HASWIDTH);
3529 lastbr = br;
3530 while (*RExC_parse == '|') {
3531 if (!SIZE_ONLY && RExC_extralen) {
3532 ender = reganode(pRExC_state, LONGJMP,0);
3533 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3534 }
3535 if (SIZE_ONLY)
3536 RExC_extralen += 2; /* Account for LONGJMP. */
3537 nextchar(pRExC_state);
3538 br = regbranch(pRExC_state, &flags, 0);
3539
3540 if (br == NULL)
3541 return(NULL);
3542 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3543 lastbr = br;
3544 if (flags&HASWIDTH)
3545 *flagp |= HASWIDTH;
3546 *flagp |= flags&SPSTART;
3547 }
3548
3549 if (have_branch || paren != ':') {
3550 /* Make a closing node, and hook it on the end. */
3551 switch (paren) {
3552 case ':':
3553 ender = reg_node(pRExC_state, TAIL);
3554 break;
3555 case 1:
3556 ender = reganode(pRExC_state, CLOSE, parno);
3557 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3558 Set_Node_Length(ender,1); /* MJD */
3559 break;
3560 case '<':
3561 case ',':
3562 case '=':
3563 case '!':
3564 *flagp &= ~HASWIDTH;
3565 /* FALL THROUGH */
3566 case '>':
3567 ender = reg_node(pRExC_state, SUCCEED);
3568 break;
3569 case 0:
3570 ender = reg_node(pRExC_state, END);
3571 break;
3572 }
3573 regtail(pRExC_state, lastbr, ender);
3574
3575 if (have_branch) {
3576 /* Hook the tails of the branches to the closing node. */
3577 for (br = ret; br != NULL; br = regnext(br)) {
3578 regoptail(pRExC_state, br, ender);
3579 }
3580 }
3581 }
3582
3583 {
3584 const char *p;
3585 static const char parens[] = "=!<,>";
3586
3587 if (paren && (p = strchr(parens, paren))) {
3588 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3589 int flag = (p - parens) > 1;
3590
3591 if (paren == '>')
3592 node = SUSPEND, flag = 0;
3593 reginsert(pRExC_state, node,ret);
3594 Set_Node_Cur_Length(ret);
3595 Set_Node_Offset(ret, parse_start + 1);
3596 ret->flags = flag;
3597 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3598 }
3599 }
3600
3601 /* Check for proper termination. */
3602 if (paren) {
3603 RExC_flags = oregflags;
3604 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3605 RExC_parse = oregcomp_parse;
3606 vFAIL("Unmatched (");
3607 }
3608 }
3609 else if (!paren && RExC_parse < RExC_end) {
3610 if (*RExC_parse == ')') {
3611 RExC_parse++;
3612 vFAIL("Unmatched )");
3613 }
3614 else
3615 FAIL("Junk on end of regexp"); /* "Can't happen". */
3616 /* NOTREACHED */
3617 }
3618
3619 return(ret);
3620}
3621
3622/*
3623 - regbranch - one alternative of an | operator
3624 *
3625 * Implements the concatenation operator.
3626 */
3627STATIC regnode *
3628S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3629{
3630 register regnode *ret;
3631 register regnode *chain = NULL;
3632 register regnode *latest;
3633 I32 flags = 0, c = 0;
3634
3635 if (first)
3636 ret = NULL;
3637 else {
3638 if (!SIZE_ONLY && RExC_extralen)
3639 ret = reganode(pRExC_state, BRANCHJ,0);
3640 else {
3641 ret = reg_node(pRExC_state, BRANCH);
3642 Set_Node_Length(ret, 1);
3643 }
3644 }
3645
3646 if (!first && SIZE_ONLY)
3647 RExC_extralen += 1; /* BRANCHJ */
3648
3649 *flagp = WORST; /* Tentatively. */
3650
3651 RExC_parse--;
3652 nextchar(pRExC_state);
3653 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3654 flags &= ~TRYAGAIN;
3655 latest = regpiece(pRExC_state, &flags);
3656 if (latest == NULL) {
3657 if (flags & TRYAGAIN)
3658 continue;
3659 return(NULL);
3660 }
3661 else if (ret == NULL)
3662 ret = latest;
3663 *flagp |= flags&HASWIDTH;
3664 if (chain == NULL) /* First piece. */
3665 *flagp |= flags&SPSTART;
3666 else {
3667 RExC_naughty++;
3668 regtail(pRExC_state, chain, latest);
3669 }
3670 chain = latest;
3671 c++;
3672 }
3673 if (chain == NULL) { /* Loop ran zero times. */
3674 chain = reg_node(pRExC_state, NOTHING);
3675 if (ret == NULL)
3676 ret = chain;
3677 }
3678 if (c == 1) {
3679 *flagp |= flags&SIMPLE;
3680 }
3681
3682 return(ret);
3683}
3684
3685/*
3686 - regpiece - something followed by possible [*+?]
3687 *
3688 * Note that the branching code sequences used for ? and the general cases
3689 * of * and + are somewhat optimized: they use the same NOTHING node as
3690 * both the endmarker for their branch list and the body of the last branch.
3691 * It might seem that this node could be dispensed with entirely, but the
3692 * endmarker role is not redundant.
3693 */
3694STATIC regnode *
3695S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3696{
3697 register regnode *ret;
3698 register char op;
3699 register char *next;
3700 I32 flags;
3701 const char * const origparse = RExC_parse;
3702 char *maxpos;
3703 I32 min;
3704 I32 max = REG_INFTY;
3705 char *parse_start;
3706
3707 ret = regatom(pRExC_state, &flags);
3708 if (ret == NULL) {
3709 if (flags & TRYAGAIN)
3710 *flagp |= TRYAGAIN;
3711 return(NULL);
3712 }
3713
3714 op = *RExC_parse;
3715
3716 if (op == '{' && regcurly(RExC_parse)) {
3717 parse_start = RExC_parse; /* MJD */
3718 next = RExC_parse + 1;
3719 maxpos = Nullch;
3720 while (isDIGIT(*next) || *next == ',') {
3721 if (*next == ',') {
3722 if (maxpos)
3723 break;
3724 else
3725 maxpos = next;
3726 }
3727 next++;
3728 }
3729 if (*next == '}') { /* got one */
3730 if (!maxpos)
3731 maxpos = next;
3732 RExC_parse++;
3733 min = atoi(RExC_parse);
3734 if (*maxpos == ',')
3735 maxpos++;
3736 else
3737 maxpos = RExC_parse;
3738 max = atoi(maxpos);
3739 if (!max && *maxpos != '0')
3740 max = REG_INFTY; /* meaning "infinity" */
3741 else if (max >= REG_INFTY)
3742 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3743 RExC_parse = next;
3744 nextchar(pRExC_state);
3745
3746 do_curly:
3747 if ((flags&SIMPLE)) {
3748 RExC_naughty += 2 + RExC_naughty / 2;
3749 reginsert(pRExC_state, CURLY, ret);
3750 Set_Node_Offset(ret, parse_start+1); /* MJD */
3751 Set_Node_Cur_Length(ret);
3752 }
3753 else {
3754 regnode *w = reg_node(pRExC_state, WHILEM);
3755
3756 w->flags = 0;
3757 regtail(pRExC_state, ret, w);
3758 if (!SIZE_ONLY && RExC_extralen) {
3759 reginsert(pRExC_state, LONGJMP,ret);
3760 reginsert(pRExC_state, NOTHING,ret);
3761 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3762 }
3763 reginsert(pRExC_state, CURLYX,ret);
3764 /* MJD hk */
3765 Set_Node_Offset(ret, parse_start+1);
3766 Set_Node_Length(ret,
3767 op == '{' ? (RExC_parse - parse_start) : 1);
3768
3769 if (!SIZE_ONLY && RExC_extralen)
3770 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3771 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3772 if (SIZE_ONLY)
3773 RExC_whilem_seen++, RExC_extralen += 3;
3774 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3775 }
3776 ret->flags = 0;
3777
3778 if (min > 0)
3779 *flagp = WORST;
3780 if (max > 0)
3781 *flagp |= HASWIDTH;
3782 if (max && max < min)
3783 vFAIL("Can't do {n,m} with n > m");
3784 if (!SIZE_ONLY) {
3785 ARG1_SET(ret, (U16)min);
3786 ARG2_SET(ret, (U16)max);
3787 }
3788
3789 goto nest_check;
3790 }
3791 }
3792
3793 if (!ISMULT1(op)) {
3794 *flagp = flags;
3795 return(ret);
3796 }
3797
3798#if 0 /* Now runtime fix should be reliable. */
3799
3800 /* if this is reinstated, don't forget to put this back into perldiag:
3801
3802 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3803
3804 (F) The part of the regexp subject to either the * or + quantifier
3805 could match an empty string. The {#} shows in the regular
3806 expression about where the problem was discovered.
3807
3808 */
3809
3810 if (!(flags&HASWIDTH) && op != '?')
3811 vFAIL("Regexp *+ operand could be empty");
3812#endif
3813
3814 parse_start = RExC_parse;
3815 nextchar(pRExC_state);
3816
3817 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3818
3819 if (op == '*' && (flags&SIMPLE)) {
3820 reginsert(pRExC_state, STAR, ret);
3821 ret->flags = 0;
3822 RExC_naughty += 4;
3823 }
3824 else if (op == '*') {
3825 min = 0;
3826 goto do_curly;
3827 }
3828 else if (op == '+' && (flags&SIMPLE)) {
3829 reginsert(pRExC_state, PLUS, ret);
3830 ret->flags = 0;
3831 RExC_naughty += 3;
3832 }
3833 else if (op == '+') {
3834 min = 1;
3835 goto do_curly;
3836 }
3837 else if (op == '?') {
3838 min = 0; max = 1;
3839 goto do_curly;
3840 }
3841 nest_check:
3842 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
3843 vWARN3(RExC_parse,
3844 "%.*s matches null string many times",
3845 RExC_parse - origparse,
3846 origparse);
3847 }
3848
3849 if (*RExC_parse == '?') {
3850 nextchar(pRExC_state);
3851 reginsert(pRExC_state, MINMOD, ret);
3852 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3853 }
3854 if (ISMULT2(RExC_parse)) {
3855 RExC_parse++;
3856 vFAIL("Nested quantifiers");
3857 }
3858
3859 return(ret);
3860}
3861
3862/*
3863 - regatom - the lowest level
3864 *
3865 * Optimization: gobbles an entire sequence of ordinary characters so that
3866 * it can turn them into a single node, which is smaller to store and
3867 * faster to run. Backslashed characters are exceptions, each becoming a
3868 * separate node; the code is simpler that way and it's not worth fixing.
3869 *
3870 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3871STATIC regnode *
3872S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3873{
3874 register regnode *ret = 0;
3875 I32 flags;
3876 char *parse_start = RExC_parse;
3877
3878 *flagp = WORST; /* Tentatively. */
3879
3880tryagain:
3881 switch (*RExC_parse) {
3882 case '^':
3883 RExC_seen_zerolen++;
3884 nextchar(pRExC_state);
3885 if (RExC_flags & PMf_MULTILINE)
3886 ret = reg_node(pRExC_state, MBOL);
3887 else if (RExC_flags & PMf_SINGLELINE)
3888 ret = reg_node(pRExC_state, SBOL);
3889 else
3890 ret = reg_node(pRExC_state, BOL);
3891 Set_Node_Length(ret, 1); /* MJD */
3892 break;
3893 case '$':
3894 nextchar(pRExC_state);
3895 if (*RExC_parse)
3896 RExC_seen_zerolen++;
3897 if (RExC_flags & PMf_MULTILINE)
3898 ret = reg_node(pRExC_state, MEOL);
3899 else if (RExC_flags & PMf_SINGLELINE)
3900 ret = reg_node(pRExC_state, SEOL);
3901 else
3902 ret = reg_node(pRExC_state, EOL);
3903 Set_Node_Length(ret, 1); /* MJD */
3904 break;
3905 case '.':
3906 nextchar(pRExC_state);
3907 if (RExC_flags & PMf_SINGLELINE)
3908 ret = reg_node(pRExC_state, SANY);
3909 else
3910 ret = reg_node(pRExC_state, REG_ANY);
3911 *flagp |= HASWIDTH|SIMPLE;
3912 RExC_naughty++;
3913 Set_Node_Length(ret, 1); /* MJD */
3914 break;
3915 case '[':
3916 {
3917 char *oregcomp_parse = ++RExC_parse;
3918 ret = regclass(pRExC_state);
3919 if (*RExC_parse != ']') {
3920 RExC_parse = oregcomp_parse;
3921 vFAIL("Unmatched [");
3922 }
3923 nextchar(pRExC_state);
3924 *flagp |= HASWIDTH|SIMPLE;
3925 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3926 break;
3927 }
3928 case '(':
3929 nextchar(pRExC_state);
3930 ret = reg(pRExC_state, 1, &flags);
3931 if (ret == NULL) {
3932 if (flags & TRYAGAIN) {
3933 if (RExC_parse == RExC_end) {
3934 /* Make parent create an empty node if needed. */
3935 *flagp |= TRYAGAIN;
3936 return(NULL);
3937 }
3938 goto tryagain;
3939 }
3940 return(NULL);
3941 }
3942 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3943 break;
3944 case '|':
3945 case ')':
3946 if (flags & TRYAGAIN) {
3947 *flagp |= TRYAGAIN;
3948 return NULL;
3949 }
3950 vFAIL("Internal urp");
3951 /* Supposed to be caught earlier. */
3952 break;
3953 case '{':
3954 if (!regcurly(RExC_parse)) {
3955 RExC_parse++;
3956 goto defchar;
3957 }
3958 /* FALL THROUGH */
3959 case '?':
3960 case '+':
3961 case '*':
3962 RExC_parse++;
3963 vFAIL("Quantifier follows nothing");
3964 break;
3965 case '\\':
3966 switch (*++RExC_parse) {
3967 case 'A':
3968 RExC_seen_zerolen++;
3969 ret = reg_node(pRExC_state, SBOL);
3970 *flagp |= SIMPLE;
3971 nextchar(pRExC_state);
3972 Set_Node_Length(ret, 2); /* MJD */
3973 break;
3974 case 'G':
3975 ret = reg_node(pRExC_state, GPOS);
3976 RExC_seen |= REG_SEEN_GPOS;
3977 *flagp |= SIMPLE;
3978 nextchar(pRExC_state);
3979 Set_Node_Length(ret, 2); /* MJD */
3980 break;
3981 case 'Z':
3982 ret = reg_node(pRExC_state, SEOL);
3983 *flagp |= SIMPLE;
3984 RExC_seen_zerolen++; /* Do not optimize RE away */
3985 nextchar(pRExC_state);
3986 break;
3987 case 'z':
3988 ret = reg_node(pRExC_state, EOS);
3989 *flagp |= SIMPLE;
3990 RExC_seen_zerolen++; /* Do not optimize RE away */
3991 nextchar(pRExC_state);
3992 Set_Node_Length(ret, 2); /* MJD */
3993 break;
3994 case 'C':
3995 ret = reg_node(pRExC_state, CANY);
3996 RExC_seen |= REG_SEEN_CANY;
3997 *flagp |= HASWIDTH|SIMPLE;
3998 nextchar(pRExC_state);
3999 Set_Node_Length(ret, 2); /* MJD */
4000 break;
4001 case 'X':
4002 ret = reg_node(pRExC_state, CLUMP);
4003 *flagp |= HASWIDTH;
4004 nextchar(pRExC_state);
4005 Set_Node_Length(ret, 2); /* MJD */
4006 break;
4007 case 'w':
4008 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4009 *flagp |= HASWIDTH|SIMPLE;
4010 nextchar(pRExC_state);
4011 Set_Node_Length(ret, 2); /* MJD */
4012 break;
4013 case 'W':
4014 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4015 *flagp |= HASWIDTH|SIMPLE;
4016 nextchar(pRExC_state);
4017 Set_Node_Length(ret, 2); /* MJD */
4018 break;
4019 case 'b':
4020 RExC_seen_zerolen++;
4021 RExC_seen |= REG_SEEN_LOOKBEHIND;
4022 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4023 *flagp |= SIMPLE;
4024 nextchar(pRExC_state);
4025 Set_Node_Length(ret, 2); /* MJD */
4026 break;
4027 case 'B':
4028 RExC_seen_zerolen++;
4029 RExC_seen |= REG_SEEN_LOOKBEHIND;
4030 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4031 *flagp |= SIMPLE;
4032 nextchar(pRExC_state);
4033 Set_Node_Length(ret, 2); /* MJD */
4034 break;
4035 case 's':
4036 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4037 *flagp |= HASWIDTH|SIMPLE;
4038 nextchar(pRExC_state);
4039 Set_Node_Length(ret, 2); /* MJD */
4040 break;
4041 case 'S':
4042 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4043 *flagp |= HASWIDTH|SIMPLE;
4044 nextchar(pRExC_state);
4045 Set_Node_Length(ret, 2); /* MJD */
4046 break;
4047 case 'd':
4048 ret = reg_node(pRExC_state, DIGIT);
4049 *flagp |= HASWIDTH|SIMPLE;
4050 nextchar(pRExC_state);
4051 Set_Node_Length(ret, 2); /* MJD */
4052 break;
4053 case 'D':
4054 ret = reg_node(pRExC_state, NDIGIT);
4055 *flagp |= HASWIDTH|SIMPLE;
4056 nextchar(pRExC_state);
4057 Set_Node_Length(ret, 2); /* MJD */
4058 break;
4059 case 'p':
4060 case 'P':
4061 {
4062 char* oldregxend = RExC_end;
4063 char* parse_start = RExC_parse - 2;
4064
4065 if (RExC_parse[1] == '{') {
4066 /* a lovely hack--pretend we saw [\pX] instead */
4067 RExC_end = strchr(RExC_parse, '}');
4068 if (!RExC_end) {
4069 U8 c = (U8)*RExC_parse;
4070 RExC_parse += 2;
4071 RExC_end = oldregxend;
4072 vFAIL2("Missing right brace on \\%c{}", c);
4073 }
4074 RExC_end++;
4075 }
4076 else {
4077 RExC_end = RExC_parse + 2;
4078 if (RExC_end > oldregxend)
4079 RExC_end = oldregxend;
4080 }
4081 RExC_parse--;
4082
4083 ret = regclass(pRExC_state);
4084
4085 RExC_end = oldregxend;
4086 RExC_parse--;
4087
4088 Set_Node_Offset(ret, parse_start + 2);
4089 Set_Node_Cur_Length(ret);
4090 nextchar(pRExC_state);
4091 *flagp |= HASWIDTH|SIMPLE;
4092 }
4093 break;
4094 case 'n':
4095 case 'r':
4096 case 't':
4097 case 'f':
4098 case 'e':
4099 case 'a':
4100 case 'x':
4101 case 'c':
4102 case '0':
4103 goto defchar;
4104 case '1': case '2': case '3': case '4':
4105 case '5': case '6': case '7': case '8': case '9':
4106 {
4107 const I32 num = atoi(RExC_parse);
4108
4109 if (num > 9 && num >= RExC_npar)
4110 goto defchar;
4111 else {
4112 char * parse_start = RExC_parse - 1; /* MJD */
4113 while (isDIGIT(*RExC_parse))
4114 RExC_parse++;
4115
4116 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4117 vFAIL("Reference to nonexistent group");
4118 RExC_sawback = 1;
4119 ret = reganode(pRExC_state,
4120 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4121 num);
4122 *flagp |= HASWIDTH;
4123
4124 /* override incorrect value set in reganode MJD */
4125 Set_Node_Offset(ret, parse_start+1);
4126 Set_Node_Cur_Length(ret); /* MJD */
4127 RExC_parse--;
4128 nextchar(pRExC_state);
4129 }
4130 }
4131 break;
4132 case '\0':
4133 if (RExC_parse >= RExC_end)
4134 FAIL("Trailing \\");
4135 /* FALL THROUGH */
4136 default:
4137 /* Do not generate "unrecognized" warnings here, we fall
4138 back into the quick-grab loop below */
4139 parse_start--;
4140 goto defchar;
4141 }
4142 break;
4143
4144 case '#':
4145 if (RExC_flags & PMf_EXTENDED) {
4146 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4147 if (RExC_parse < RExC_end)
4148 goto tryagain;
4149 }
4150 /* FALL THROUGH */
4151
4152 default: {
4153 register STRLEN len;
4154 register UV ender;
4155 register char *p;
4156 char *oldp, *s;
4157 STRLEN foldlen;
4158 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4159
4160 parse_start = RExC_parse - 1;
4161
4162 RExC_parse++;
4163
4164 defchar:
4165 ender = 0;
4166 ret = reg_node(pRExC_state,
4167 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4168 s = STRING(ret);
4169 for (len = 0, p = RExC_parse - 1;
4170 len < 127 && p < RExC_end;
4171 len++)
4172 {
4173 oldp = p;
4174
4175 if (RExC_flags & PMf_EXTENDED)
4176 p = regwhite(p, RExC_end);
4177 switch (*p) {
4178 case '^':
4179 case '$':
4180 case '.':
4181 case '[':
4182 case '(':
4183 case ')':
4184 case '|':
4185 goto loopdone;
4186 case '\\':
4187 switch (*++p) {
4188 case 'A':
4189 case 'C':
4190 case 'X':
4191 case 'G':
4192 case 'Z':
4193 case 'z':
4194 case 'w':
4195 case 'W':
4196 case 'b':
4197 case 'B':
4198 case 's':
4199 case 'S':
4200 case 'd':
4201 case 'D':
4202 case 'p':
4203 case 'P':
4204 --p;
4205 goto loopdone;
4206 case 'n':
4207 ender = '\n';
4208 p++;
4209 break;
4210 case 'r':
4211 ender = '\r';
4212 p++;
4213 break;
4214 case 't':
4215 ender = '\t';
4216 p++;
4217 break;
4218 case 'f':
4219 ender = '\f';
4220 p++;
4221 break;
4222 case 'e':
4223 ender = ASCII_TO_NATIVE('\033');
4224 p++;
4225 break;
4226 case 'a':
4227 ender = ASCII_TO_NATIVE('\007');
4228 p++;
4229 break;
4230 case 'x':
4231 if (*++p == '{') {
4232 char* const e = strchr(p, '}');
4233
4234 if (!e) {
4235 RExC_parse = p + 1;
4236 vFAIL("Missing right brace on \\x{}");
4237 }
4238 else {
4239 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4240 | PERL_SCAN_DISALLOW_PREFIX;
4241 STRLEN numlen = e - p - 1;
4242 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4243 if (ender > 0xff)
4244 RExC_utf8 = 1;
4245 p = e + 1;
4246 }
4247 }
4248 else {
4249 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4250 STRLEN numlen = 2;
4251 ender = grok_hex(p, &numlen, &flags, NULL);
4252 p += numlen;
4253 }
4254 break;
4255 case 'c':
4256 p++;
4257 ender = UCHARAT(p++);
4258 ender = toCTRL(ender);
4259 break;
4260 case '0': case '1': case '2': case '3':case '4':
4261 case '5': case '6': case '7': case '8':case '9':
4262 if (*p == '0' ||
4263 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4264 I32 flags = 0;
4265 STRLEN numlen = 3;
4266 ender = grok_oct(p, &numlen, &flags, NULL);
4267 p += numlen;
4268 }
4269 else {
4270 --p;
4271 goto loopdone;
4272 }
4273 break;
4274 case '\0':
4275 if (p >= RExC_end)
4276 FAIL("Trailing \\");
4277 /* FALL THROUGH */
4278 default:
4279 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4280 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4281 goto normal_default;
4282 }
4283 break;
4284 default:
4285 normal_default:
4286 if (UTF8_IS_START(*p) && UTF) {
4287 STRLEN numlen;
4288 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4289 &numlen, 0);
4290 p += numlen;
4291 }
4292 else
4293 ender = *p++;
4294 break;
4295 }
4296 if (RExC_flags & PMf_EXTENDED)
4297 p = regwhite(p, RExC_end);
4298 if (UTF && FOLD) {
4299 /* Prime the casefolded buffer. */
4300 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4301 }
4302 if (ISMULT2(p)) { /* Back off on ?+*. */
4303 if (len)
4304 p = oldp;
4305 else if (UTF) {
4306 STRLEN unilen;
4307
4308 if (FOLD) {
4309 /* Emit all the Unicode characters. */
4310 STRLEN numlen;
4311 for (foldbuf = tmpbuf;
4312 foldlen;
4313 foldlen -= numlen) {
4314 ender = utf8_to_uvchr(foldbuf, &numlen);
4315 if (numlen > 0) {
4316 reguni(pRExC_state, ender, s, &unilen);
4317 s += unilen;
4318 len += unilen;
4319 /* In EBCDIC the numlen
4320 * and unilen can differ. */
4321 foldbuf += numlen;
4322 if (numlen >= foldlen)
4323 break;
4324 }
4325 else
4326 break; /* "Can't happen." */
4327 }
4328 }
4329 else {
4330 reguni(pRExC_state, ender, s, &unilen);
4331 if (unilen > 0) {
4332 s += unilen;
4333 len += unilen;
4334 }
4335 }
4336 }
4337 else {
4338 len++;
4339 REGC((char)ender, s++);
4340 }
4341 break;
4342 }
4343 if (UTF) {
4344 STRLEN unilen;
4345
4346 if (FOLD) {
4347 /* Emit all the Unicode characters. */
4348 STRLEN numlen;
4349 for (foldbuf = tmpbuf;
4350 foldlen;
4351 foldlen -= numlen) {
4352 ender = utf8_to_uvchr(foldbuf, &numlen);
4353 if (numlen > 0) {
4354 reguni(pRExC_state, ender, s, &unilen);
4355 len += unilen;
4356 s += unilen;
4357 /* In EBCDIC the numlen
4358 * and unilen can differ. */
4359 foldbuf += numlen;
4360 if (numlen >= foldlen)
4361 break;
4362 }
4363 else
4364 break;
4365 }
4366 }
4367 else {
4368 reguni(pRExC_state, ender, s, &unilen);
4369 if (unilen > 0) {
4370 s += unilen;
4371 len += unilen;
4372 }
4373 }
4374 len--;
4375 }
4376 else
4377 REGC((char)ender, s++);
4378 }
4379 loopdone:
4380 RExC_parse = p - 1;
4381 Set_Node_Cur_Length(ret); /* MJD */
4382 nextchar(pRExC_state);
4383 {
4384 /* len is STRLEN which is unsigned, need to copy to signed */
4385 IV iv = len;
4386 if (iv < 0)
4387 vFAIL("Internal disaster");
4388 }
4389 if (len > 0)
4390 *flagp |= HASWIDTH;
4391 if (len == 1 && UNI_IS_INVARIANT(ender))
4392 *flagp |= SIMPLE;
4393 if (!SIZE_ONLY)
4394 STR_LEN(ret) = len;
4395 if (SIZE_ONLY)
4396 RExC_size += STR_SZ(len);
4397 else
4398 RExC_emit += STR_SZ(len);
4399 }
4400 break;
4401 }
4402
4403 /* If the encoding pragma is in effect recode the text of
4404 * any EXACT-kind nodes. */
4405 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4406 STRLEN oldlen = STR_LEN(ret);
4407 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4408
4409 if (RExC_utf8)
4410 SvUTF8_on(sv);
4411 if (sv_utf8_downgrade(sv, TRUE)) {
4412 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4413 const STRLEN newlen = SvCUR(sv);
4414
4415 if (SvUTF8(sv))
4416 RExC_utf8 = 1;
4417 if (!SIZE_ONLY) {
4418 GET_RE_DEBUG_FLAGS_DECL;
4419 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4420 (int)oldlen, STRING(ret),
4421 (int)newlen, s));
4422 Copy(s, STRING(ret), newlen, char);
4423 STR_LEN(ret) += newlen - oldlen;
4424 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4425 } else
4426 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4427 }
4428 }
4429
4430 return(ret);
4431}
4432
4433STATIC char *
4434S_regwhite(pTHX_ char *p, const char *e)
4435{
4436 while (p < e) {
4437 if (isSPACE(*p))
4438 ++p;
4439 else if (*p == '#') {
4440 do {
4441 p++;
4442 } while (p < e && *p != '\n');
4443 }
4444 else
4445 break;
4446 }
4447 return p;
4448}
4449
4450/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4451 Character classes ([:foo:]) can also be negated ([:^foo:]).
4452 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4453 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4454 but trigger failures because they are currently unimplemented. */
4455
4456#define POSIXCC_DONE(c) ((c) == ':')
4457#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4458#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4459
4460STATIC I32
4461S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4462{
4463 I32 namedclass = OOB_NAMEDCLASS;
4464
4465 if (value == '[' && RExC_parse + 1 < RExC_end &&
4466 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4467 POSIXCC(UCHARAT(RExC_parse))) {
4468 const char c = UCHARAT(RExC_parse);
4469 char* s = RExC_parse++;
4470
4471 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4472 RExC_parse++;
4473 if (RExC_parse == RExC_end)
4474 /* Grandfather lone [:, [=, [. */
4475 RExC_parse = s;
4476 else {
4477 const char* t = RExC_parse++; /* skip over the c */
4478 const char *posixcc;
4479
4480 assert(*t == c);
4481
4482 if (UCHARAT(RExC_parse) == ']') {
4483 RExC_parse++; /* skip over the ending ] */
4484 posixcc = s + 1;
4485 if (*s == ':') {
4486 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4487 const I32 skip = t - posixcc;
4488
4489 /* Initially switch on the length of the name. */
4490 switch (skip) {
4491 case 4:
4492 if (memEQ(posixcc, "word", 4)) {
4493 /* this is not POSIX, this is the Perl \w */;
4494 namedclass
4495 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4496 }
4497 break;
4498 case 5:
4499 /* Names all of length 5. */
4500 /* alnum alpha ascii blank cntrl digit graph lower
4501 print punct space upper */
4502 /* Offset 4 gives the best switch position. */
4503 switch (posixcc[4]) {
4504 case 'a':
4505 if (memEQ(posixcc, "alph", 4)) {
4506 /* a */
4507 namedclass
4508 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4509 }
4510 break;
4511 case 'e':
4512 if (memEQ(posixcc, "spac", 4)) {
4513 /* e */
4514 namedclass
4515 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4516 }
4517 break;
4518 case 'h':
4519 if (memEQ(posixcc, "grap", 4)) {
4520 /* h */
4521 namedclass
4522 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4523 }
4524 break;
4525 case 'i':
4526 if (memEQ(posixcc, "asci", 4)) {
4527 /* i */
4528 namedclass
4529 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4530 }
4531 break;
4532 case 'k':
4533 if (memEQ(posixcc, "blan", 4)) {
4534 /* k */
4535 namedclass
4536 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4537 }
4538 break;
4539 case 'l':
4540 if (memEQ(posixcc, "cntr", 4)) {
4541 /* l */
4542 namedclass
4543 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4544 }
4545 break;
4546 case 'm':
4547 if (memEQ(posixcc, "alnu", 4)) {
4548 /* m */
4549 namedclass
4550 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4551 }
4552 break;
4553 case 'r':
4554 if (memEQ(posixcc, "lowe", 4)) {
4555 /* r */
4556 namedclass
4557 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4558 }
4559 if (memEQ(posixcc, "uppe", 4)) {
4560 /* r */
4561 namedclass
4562 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4563 }
4564 break;
4565 case 't':
4566 if (memEQ(posixcc, "digi", 4)) {
4567 /* t */
4568 namedclass
4569 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4570 }
4571 if (memEQ(posixcc, "prin", 4)) {
4572 /* t */
4573 namedclass
4574 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4575 }
4576 if (memEQ(posixcc, "punc", 4)) {
4577 /* t */
4578 namedclass
4579 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4580 }
4581 break;
4582 }
4583 break;
4584 case 6:
4585 if (memEQ(posixcc, "xdigit", 6)) {
4586 namedclass
4587 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4588 }
4589 break;
4590 }
4591
4592 if (namedclass == OOB_NAMEDCLASS)
4593 {
4594 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4595 t - s - 1, s + 1);
4596 }
4597 assert (posixcc[skip] == ':');
4598 assert (posixcc[skip+1] == ']');
4599 } else if (!SIZE_ONLY) {
4600 /* [[=foo=]] and [[.foo.]] are still future. */
4601
4602 /* adjust RExC_parse so the warning shows after
4603 the class closes */
4604 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4605 RExC_parse++;
4606 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4607 }
4608 } else {
4609 /* Maternal grandfather:
4610 * "[:" ending in ":" but not in ":]" */
4611 RExC_parse = s;
4612 }
4613 }
4614 }
4615
4616 return namedclass;
4617}
4618
4619STATIC void
4620S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4621{
4622 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4623 const char *s = RExC_parse;
4624 const char c = *s++;
4625
4626 while(*s && isALNUM(*s))
4627 s++;
4628 if (*s && c == *s && s[1] == ']') {
4629 if (ckWARN(WARN_REGEXP))
4630 vWARN3(s+2,
4631 "POSIX syntax [%c %c] belongs inside character classes",
4632 c, c);
4633
4634 /* [[=foo=]] and [[.foo.]] are still future. */
4635 if (POSIXCC_NOTYET(c)) {
4636 /* adjust RExC_parse so the error shows after
4637 the class closes */
4638 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4639 ;
4640 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4641 }
4642 }
4643 }
4644}
4645
4646STATIC regnode *
4647S_regclass(pTHX_ RExC_state_t *pRExC_state)
4648{
4649 register UV value;
4650 register UV nextvalue;
4651 register IV prevvalue = OOB_UNICODE;
4652 register IV range = 0;
4653 register regnode *ret;
4654 STRLEN numlen;
4655 IV namedclass;
4656 char *rangebegin = 0;
4657 bool need_class = 0;
4658 SV *listsv = Nullsv;
4659 register char *e;
4660 UV n;
4661 bool optimize_invert = TRUE;
4662 AV* unicode_alternate = 0;
4663#ifdef EBCDIC
4664 UV literal_endpoint = 0;
4665#endif
4666
4667 ret = reganode(pRExC_state, ANYOF, 0);
4668
4669 if (!SIZE_ONLY)
4670 ANYOF_FLAGS(ret) = 0;
4671
4672 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4673 RExC_naughty++;
4674 RExC_parse++;
4675 if (!SIZE_ONLY)
4676 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4677 }
4678
4679 if (SIZE_ONLY)
4680 RExC_size += ANYOF_SKIP;
4681 else {
4682 RExC_emit += ANYOF_SKIP;
4683 if (FOLD)
4684 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4685 if (LOC)
4686 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4687 ANYOF_BITMAP_ZERO(ret);
4688 listsv = newSVpvn("# comment\n", 10);
4689 }
4690
4691 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4692
4693 if (!SIZE_ONLY && POSIXCC(nextvalue))
4694 checkposixcc(pRExC_state);
4695
4696 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4697 if (UCHARAT(RExC_parse) == ']')
4698 goto charclassloop;
4699
4700 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4701
4702 charclassloop:
4703
4704 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4705
4706 if (!range)
4707 rangebegin = RExC_parse;
4708 if (UTF) {
4709 value = utf8n_to_uvchr((U8*)RExC_parse,
4710 RExC_end - RExC_parse,
4711 &numlen, 0);
4712 RExC_parse += numlen;
4713 }
4714 else
4715 value = UCHARAT(RExC_parse++);
4716 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4717 if (value == '[' && POSIXCC(nextvalue))
4718 namedclass = regpposixcc(pRExC_state, value);
4719 else if (value == '\\') {
4720 if (UTF) {
4721 value = utf8n_to_uvchr((U8*)RExC_parse,
4722 RExC_end - RExC_parse,
4723 &numlen, 0);
4724 RExC_parse += numlen;
4725 }
4726 else
4727 value = UCHARAT(RExC_parse++);
4728 /* Some compilers cannot handle switching on 64-bit integer
4729 * values, therefore value cannot be an UV. Yes, this will
4730 * be a problem later if we want switch on Unicode.
4731 * A similar issue a little bit later when switching on
4732 * namedclass. --jhi */
4733 switch ((I32)value) {
4734 case 'w': namedclass = ANYOF_ALNUM; break;
4735 case 'W': namedclass = ANYOF_NALNUM; break;
4736 case 's': namedclass = ANYOF_SPACE; break;
4737 case 'S': namedclass = ANYOF_NSPACE; break;
4738 case 'd': namedclass = ANYOF_DIGIT; break;
4739 case 'D': namedclass = ANYOF_NDIGIT; break;
4740 case 'p':
4741 case 'P':
4742 if (RExC_parse >= RExC_end)
4743 vFAIL2("Empty \\%c{}", (U8)value);
4744 if (*RExC_parse == '{') {
4745 const U8 c = (U8)value;
4746 e = strchr(RExC_parse++, '}');
4747 if (!e)
4748 vFAIL2("Missing right brace on \\%c{}", c);
4749 while (isSPACE(UCHARAT(RExC_parse)))
4750 RExC_parse++;
4751 if (e == RExC_parse)
4752 vFAIL2("Empty \\%c{}", c);
4753 n = e - RExC_parse;
4754 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4755 n--;
4756 }
4757 else {
4758 e = RExC_parse;
4759 n = 1;
4760 }
4761 if (!SIZE_ONLY) {
4762 if (UCHARAT(RExC_parse) == '^') {
4763 RExC_parse++;
4764 n--;
4765 value = value == 'p' ? 'P' : 'p'; /* toggle */
4766 while (isSPACE(UCHARAT(RExC_parse))) {
4767 RExC_parse++;
4768 n--;
4769 }
4770 }
4771 if (value == 'p')
4772 Perl_sv_catpvf(aTHX_ listsv,
4773 "+utf8::%.*s\n", (int)n, RExC_parse);
4774 else
4775 Perl_sv_catpvf(aTHX_ listsv,
4776 "!utf8::%.*s\n", (int)n, RExC_parse);
4777 }
4778 RExC_parse = e + 1;
4779 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4780 namedclass = ANYOF_MAX; /* no official name, but it's named */
4781 break;
4782 case 'n': value = '\n'; break;
4783 case 'r': value = '\r'; break;
4784 case 't': value = '\t'; break;
4785 case 'f': value = '\f'; break;
4786 case 'b': value = '\b'; break;
4787 case 'e': value = ASCII_TO_NATIVE('\033');break;
4788 case 'a': value = ASCII_TO_NATIVE('\007');break;
4789 case 'x':
4790 if (*RExC_parse == '{') {
4791 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4792 | PERL_SCAN_DISALLOW_PREFIX;
4793 e = strchr(RExC_parse++, '}');
4794 if (!e)
4795 vFAIL("Missing right brace on \\x{}");
4796
4797 numlen = e - RExC_parse;
4798 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4799 RExC_parse = e + 1;
4800 }
4801 else {
4802 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4803 numlen = 2;
4804 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4805 RExC_parse += numlen;
4806 }
4807 break;
4808 case 'c':
4809 value = UCHARAT(RExC_parse++);
4810 value = toCTRL(value);
4811 break;
4812 case '0': case '1': case '2': case '3': case '4':
4813 case '5': case '6': case '7': case '8': case '9':
4814 {
4815 I32 flags = 0;
4816 numlen = 3;
4817 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4818 RExC_parse += numlen;
4819 break;
4820 }
4821 default:
4822 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
4823 vWARN2(RExC_parse,
4824 "Unrecognized escape \\%c in character class passed through",
4825 (int)value);
4826 break;
4827 }
4828 } /* end of \blah */
4829#ifdef EBCDIC
4830 else
4831 literal_endpoint++;
4832#endif
4833
4834 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4835
4836 if (!SIZE_ONLY && !need_class)
4837 ANYOF_CLASS_ZERO(ret);
4838
4839 need_class = 1;
4840
4841 /* a bad range like a-\d, a-[:digit:] ? */
4842 if (range) {
4843 if (!SIZE_ONLY) {
4844 if (ckWARN(WARN_REGEXP))
4845 vWARN4(RExC_parse,
4846 "False [] range \"%*.*s\"",
4847 RExC_parse - rangebegin,
4848 RExC_parse - rangebegin,
4849 rangebegin);
4850 if (prevvalue < 256) {
4851 ANYOF_BITMAP_SET(ret, prevvalue);
4852 ANYOF_BITMAP_SET(ret, '-');
4853 }
4854 else {
4855 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4856 Perl_sv_catpvf(aTHX_ listsv,
4857 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4858 }
4859 }
4860
4861 range = 0; /* this was not a true range */
4862 }
4863
4864 if (!SIZE_ONLY) {
4865 const char *what = NULL;
4866 char yesno = 0;
4867
4868 if (namedclass > OOB_NAMEDCLASS)
4869 optimize_invert = FALSE;
4870 /* Possible truncation here but in some 64-bit environments
4871 * the compiler gets heartburn about switch on 64-bit values.
4872 * A similar issue a little earlier when switching on value.
4873 * --jhi */
4874 switch ((I32)namedclass) {
4875 case ANYOF_ALNUM:
4876 if (LOC)
4877 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4878 else {
4879 for (value = 0; value < 256; value++)
4880 if (isALNUM(value))
4881 ANYOF_BITMAP_SET(ret, value);
4882 }
4883 yesno = '+';
4884 what = "Word";
4885 break;
4886 case ANYOF_NALNUM:
4887 if (LOC)
4888 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4889 else {
4890 for (value = 0; value < 256; value++)
4891 if (!isALNUM(value))
4892 ANYOF_BITMAP_SET(ret, value);
4893 }
4894 yesno = '!';
4895 what = "Word";
4896 break;
4897 case ANYOF_ALNUMC:
4898 if (LOC)
4899 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4900 else {
4901 for (value = 0; value < 256; value++)
4902 if (isALNUMC(value))
4903 ANYOF_BITMAP_SET(ret, value);
4904 }
4905 yesno = '+';
4906 what = "Alnum";
4907 break;
4908 case ANYOF_NALNUMC:
4909 if (LOC)
4910 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4911 else {
4912 for (value = 0; value < 256; value++)
4913 if (!isALNUMC(value))
4914 ANYOF_BITMAP_SET(ret, value);
4915 }
4916 yesno = '!';
4917 what = "Alnum";
4918 break;
4919 case ANYOF_ALPHA:
4920 if (LOC)
4921 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4922 else {
4923 for (value = 0; value < 256; value++)
4924 if (isALPHA(value))
4925 ANYOF_BITMAP_SET(ret, value);
4926 }
4927 yesno = '+';
4928 what = "Alpha";
4929 break;
4930 case ANYOF_NALPHA:
4931 if (LOC)
4932 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4933 else {
4934 for (value = 0; value < 256; value++)
4935 if (!isALPHA(value))
4936 ANYOF_BITMAP_SET(ret, value);
4937 }
4938 yesno = '!';
4939 what = "Alpha";
4940 break;
4941 case ANYOF_ASCII:
4942 if (LOC)
4943 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4944 else {
4945#ifndef EBCDIC
4946 for (value = 0; value < 128; value++)
4947 ANYOF_BITMAP_SET(ret, value);
4948#else /* EBCDIC */
4949 for (value = 0; value < 256; value++) {
4950 if (isASCII(value))
4951 ANYOF_BITMAP_SET(ret, value);
4952 }
4953#endif /* EBCDIC */
4954 }
4955 yesno = '+';
4956 what = "ASCII";
4957 break;
4958 case ANYOF_NASCII:
4959 if (LOC)
4960 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4961 else {
4962#ifndef EBCDIC
4963 for (value = 128; value < 256; value++)
4964 ANYOF_BITMAP_SET(ret, value);
4965#else /* EBCDIC */
4966 for (value = 0; value < 256; value++) {
4967 if (!isASCII(value))
4968 ANYOF_BITMAP_SET(ret, value);
4969 }
4970#endif /* EBCDIC */
4971 }
4972 yesno = '!';
4973 what = "ASCII";
4974 break;
4975 case ANYOF_BLANK:
4976 if (LOC)
4977 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4978 else {
4979 for (value = 0; value < 256; value++)
4980 if (isBLANK(value))
4981 ANYOF_BITMAP_SET(ret, value);
4982 }
4983 yesno = '+';
4984 what = "Blank";
4985 break;
4986 case ANYOF_NBLANK:
4987 if (LOC)
4988 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4989 else {
4990 for (value = 0; value < 256; value++)
4991 if (!isBLANK(value))
4992 ANYOF_BITMAP_SET(ret, value);
4993 }
4994 yesno = '!';
4995 what = "Blank";
4996 break;
4997 case ANYOF_CNTRL:
4998 if (LOC)
4999 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5000 else {
5001 for (value = 0; value < 256; value++)
5002 if (isCNTRL(value))
5003 ANYOF_BITMAP_SET(ret, value);
5004 }
5005 yesno = '+';
5006 what = "Cntrl";
5007 break;
5008 case ANYOF_NCNTRL:
5009 if (LOC)
5010 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5011 else {
5012 for (value = 0; value < 256; value++)
5013 if (!isCNTRL(value))
5014 ANYOF_BITMAP_SET(ret, value);
5015 }
5016 yesno = '!';
5017 what = "Cntrl";
5018 break;
5019 case ANYOF_DIGIT:
5020 if (LOC)
5021 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5022 else {
5023 /* consecutive digits assumed */
5024 for (value = '0'; value <= '9'; value++)
5025 ANYOF_BITMAP_SET(ret, value);
5026 }
5027 yesno = '+';
5028 what = "Digit";
5029 break;
5030 case ANYOF_NDIGIT:
5031 if (LOC)
5032 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5033 else {
5034 /* consecutive digits assumed */
5035 for (value = 0; value < '0'; value++)
5036 ANYOF_BITMAP_SET(ret, value);
5037 for (value = '9' + 1; value < 256; value++)
5038 ANYOF_BITMAP_SET(ret, value);
5039 }
5040 yesno = '!';
5041 what = "Digit";
5042 break;
5043 case ANYOF_GRAPH:
5044 if (LOC)
5045 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5046 else {
5047 for (value = 0; value < 256; value++)
5048 if (isGRAPH(value))
5049 ANYOF_BITMAP_SET(ret, value);
5050 }
5051 yesno = '+';
5052 what = "Graph";
5053 break;
5054 case ANYOF_NGRAPH:
5055 if (LOC)
5056 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5057 else {
5058 for (value = 0; value < 256; value++)
5059 if (!isGRAPH(value))
5060 ANYOF_BITMAP_SET(ret, value);
5061 }
5062 yesno = '!';
5063 what = "Graph";
5064 break;
5065 case ANYOF_LOWER:
5066 if (LOC)
5067 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5068 else {
5069 for (value = 0; value < 256; value++)
5070 if (isLOWER(value))
5071 ANYOF_BITMAP_SET(ret, value);
5072 }
5073 yesno = '+';
5074 what = "Lower";
5075 break;
5076 case ANYOF_NLOWER:
5077 if (LOC)
5078 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5079 else {
5080 for (value = 0; value < 256; value++)
5081 if (!isLOWER(value))
5082 ANYOF_BITMAP_SET(ret, value);
5083 }
5084 yesno = '!';
5085 what = "Lower";
5086 break;
5087 case ANYOF_PRINT:
5088 if (LOC)
5089 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5090 else {
5091 for (value = 0; value < 256; value++)
5092 if (isPRINT(value))
5093 ANYOF_BITMAP_SET(ret, value);
5094 }
5095 yesno = '+';
5096 what = "Print";
5097 break;
5098 case ANYOF_NPRINT:
5099 if (LOC)
5100 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5101 else {
5102 for (value = 0; value < 256; value++)
5103 if (!isPRINT(value))
5104 ANYOF_BITMAP_SET(ret, value);
5105 }
5106 yesno = '!';
5107 what = "Print";
5108 break;
5109 case ANYOF_PSXSPC:
5110 if (LOC)
5111 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5112 else {
5113 for (value = 0; value < 256; value++)
5114 if (isPSXSPC(value))
5115 ANYOF_BITMAP_SET(ret, value);
5116 }
5117 yesno = '+';
5118 what = "Space";
5119 break;
5120 case ANYOF_NPSXSPC:
5121 if (LOC)
5122 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5123 else {
5124 for (value = 0; value < 256; value++)
5125 if (!isPSXSPC(value))
5126 ANYOF_BITMAP_SET(ret, value);
5127 }
5128 yesno = '!';
5129 what = "Space";
5130 break;
5131 case ANYOF_PUNCT:
5132 if (LOC)
5133 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5134 else {
5135 for (value = 0; value < 256; value++)
5136 if (isPUNCT(value))
5137 ANYOF_BITMAP_SET(ret, value);
5138 }
5139 yesno = '+';
5140 what = "Punct";
5141 break;
5142 case ANYOF_NPUNCT:
5143 if (LOC)
5144 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5145 else {
5146 for (value = 0; value < 256; value++)
5147 if (!isPUNCT(value))
5148 ANYOF_BITMAP_SET(ret, value);
5149 }
5150 yesno = '!';
5151 what = "Punct";
5152 break;
5153 case ANYOF_SPACE:
5154 if (LOC)
5155 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5156 else {
5157 for (value = 0; value < 256; value++)
5158 if (isSPACE(value))
5159 ANYOF_BITMAP_SET(ret, value);
5160 }
5161 yesno = '+';
5162 what = "SpacePerl";
5163 break;
5164 case ANYOF_NSPACE:
5165 if (LOC)
5166 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5167 else {
5168 for (value = 0; value < 256; value++)
5169 if (!isSPACE(value))
5170 ANYOF_BITMAP_SET(ret, value);
5171 }
5172 yesno = '!';
5173 what = "SpacePerl";
5174 break;
5175 case ANYOF_UPPER:
5176 if (LOC)
5177 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5178 else {
5179 for (value = 0; value < 256; value++)
5180 if (isUPPER(value))
5181 ANYOF_BITMAP_SET(ret, value);
5182 }
5183 yesno = '+';
5184 what = "Upper";
5185 break;
5186 case ANYOF_NUPPER:
5187 if (LOC)
5188 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5189 else {
5190 for (value = 0; value < 256; value++)
5191 if (!isUPPER(value))
5192 ANYOF_BITMAP_SET(ret, value);
5193 }
5194 yesno = '!';
5195 what = "Upper";
5196 break;
5197 case ANYOF_XDIGIT:
5198 if (LOC)
5199 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5200 else {
5201 for (value = 0; value < 256; value++)
5202 if (isXDIGIT(value))
5203 ANYOF_BITMAP_SET(ret, value);
5204 }
5205 yesno = '+';
5206 what = "XDigit";
5207 break;
5208 case ANYOF_NXDIGIT:
5209 if (LOC)
5210 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5211 else {
5212 for (value = 0; value < 256; value++)
5213 if (!isXDIGIT(value))
5214 ANYOF_BITMAP_SET(ret, value);
5215 }
5216 yesno = '!';
5217 what = "XDigit";
5218 break;
5219 case ANYOF_MAX:
5220 /* this is to handle \p and \P */
5221 break;
5222 default:
5223 vFAIL("Invalid [::] class");
5224 break;
5225 }
5226 if (what) {
5227 /* Strings such as "+utf8::isWord\n" */
5228 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5229 }
5230 if (LOC)
5231 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5232 continue;
5233 }
5234 } /* end of namedclass \blah */
5235
5236 if (range) {
5237 if (prevvalue > (IV)value) /* b-a */ {
5238 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
5239 RExC_parse - rangebegin,
5240 RExC_parse - rangebegin,
5241 rangebegin);
5242 range = 0; /* not a valid range */
5243 }
5244 }
5245 else {
5246 prevvalue = value; /* save the beginning of the range */
5247 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5248 RExC_parse[1] != ']') {
5249 RExC_parse++;
5250
5251 /* a bad range like \w-, [:word:]- ? */
5252 if (namedclass > OOB_NAMEDCLASS) {
5253 if (ckWARN(WARN_REGEXP))
5254 vWARN4(RExC_parse,
5255 "False [] range \"%*.*s\"",
5256 RExC_parse - rangebegin,
5257 RExC_parse - rangebegin,
5258 rangebegin);
5259 if (!SIZE_ONLY)
5260 ANYOF_BITMAP_SET(ret, '-');
5261 } else
5262 range = 1; /* yeah, it's a range! */
5263 continue; /* but do it the next time */
5264 }
5265 }
5266
5267 /* now is the next time */
5268 if (!SIZE_ONLY) {
5269 IV i;
5270
5271 if (prevvalue < 256) {
5272 const IV ceilvalue = value < 256 ? value : 255;
5273
5274#ifdef EBCDIC
5275 /* In EBCDIC [\x89-\x91] should include
5276 * the \x8e but [i-j] should not. */
5277 if (literal_endpoint == 2 &&
5278 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5279 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5280 {
5281 if (isLOWER(prevvalue)) {
5282 for (i = prevvalue; i <= ceilvalue; i++)
5283 if (isLOWER(i))
5284 ANYOF_BITMAP_SET(ret, i);
5285 } else {
5286 for (i = prevvalue; i <= ceilvalue; i++)
5287 if (isUPPER(i))
5288 ANYOF_BITMAP_SET(ret, i);
5289 }
5290 }
5291 else
5292#endif
5293 for (i = prevvalue; i <= ceilvalue; i++)
5294 ANYOF_BITMAP_SET(ret, i);
5295 }
5296 if (value > 255 || UTF) {
5297 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5298 const UV natvalue = NATIVE_TO_UNI(value);
5299
5300 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5301 if (prevnatvalue < natvalue) { /* what about > ? */
5302 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5303 prevnatvalue, natvalue);
5304 }
5305 else if (prevnatvalue == natvalue) {
5306 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5307 if (FOLD) {
5308 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5309 STRLEN foldlen;
5310 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5311
5312 /* If folding and foldable and a single
5313 * character, insert also the folded version
5314 * to the charclass. */
5315 if (f != value) {
5316 if (foldlen == (STRLEN)UNISKIP(f))
5317 Perl_sv_catpvf(aTHX_ listsv,
5318 "%04"UVxf"\n", f);
5319 else {
5320 /* Any multicharacter foldings
5321 * require the following transform:
5322 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5323 * where E folds into "pq" and F folds
5324 * into "rst", all other characters
5325 * fold to single characters. We save
5326 * away these multicharacter foldings,
5327 * to be later saved as part of the
5328 * additional "s" data. */
5329 SV *sv;
5330
5331 if (!unicode_alternate)
5332 unicode_alternate = newAV();
5333 sv = newSVpvn((char*)foldbuf, foldlen);
5334 SvUTF8_on(sv);
5335 av_push(unicode_alternate, sv);
5336 }
5337 }
5338
5339 /* If folding and the value is one of the Greek
5340 * sigmas insert a few more sigmas to make the
5341 * folding rules of the sigmas to work right.
5342 * Note that not all the possible combinations
5343 * are handled here: some of them are handled
5344 * by the standard folding rules, and some of
5345 * them (literal or EXACTF cases) are handled
5346 * during runtime in regexec.c:S_find_byclass(). */
5347 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5348 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5349 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5350 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5351 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5352 }
5353 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5354 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5355 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5356 }
5357 }
5358 }
5359#ifdef EBCDIC
5360 literal_endpoint = 0;
5361#endif
5362 }
5363
5364 range = 0; /* this range (if it was one) is done now */
5365 }
5366
5367 if (need_class) {
5368 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5369 if (SIZE_ONLY)
5370 RExC_size += ANYOF_CLASS_ADD_SKIP;
5371 else
5372 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5373 }
5374
5375 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5376 if (!SIZE_ONLY &&
5377 /* If the only flag is folding (plus possibly inversion). */
5378 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5379 ) {
5380 for (value = 0; value < 256; ++value) {
5381 if (ANYOF_BITMAP_TEST(ret, value)) {
5382 UV fold = PL_fold[value];
5383
5384 if (fold != value)
5385 ANYOF_BITMAP_SET(ret, fold);
5386 }
5387 }
5388 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5389 }
5390
5391 /* optimize inverted simple patterns (e.g. [^a-z]) */
5392 if (!SIZE_ONLY && optimize_invert &&
5393 /* If the only flag is inversion. */
5394 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5395 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5396 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5397 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5398 }
5399
5400 if (!SIZE_ONLY) {
5401 AV *av = newAV();
5402 SV *rv;
5403
5404 /* The 0th element stores the character class description
5405 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5406 * to initialize the appropriate swash (which gets stored in
5407 * the 1st element), and also useful for dumping the regnode.
5408 * The 2nd element stores the multicharacter foldings,
5409 * used later (regexec.c:S_reginclass()). */
5410 av_store(av, 0, listsv);
5411 av_store(av, 1, NULL);
5412 av_store(av, 2, (SV*)unicode_alternate);
5413 rv = newRV_noinc((SV*)av);
5414 n = add_data(pRExC_state, 1, "s");
5415 RExC_rx->data->data[n] = (void*)rv;
5416 ARG_SET(ret, n);
5417 }
5418
5419 return ret;
5420}
5421
5422STATIC char*
5423S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5424{
5425 char* retval = RExC_parse++;
5426
5427 for (;;) {
5428 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5429 RExC_parse[2] == '#') {
5430 while (*RExC_parse != ')') {
5431 if (RExC_parse == RExC_end)
5432 FAIL("Sequence (?#... not terminated");
5433 RExC_parse++;
5434 }
5435 RExC_parse++;
5436 continue;
5437 }
5438 if (RExC_flags & PMf_EXTENDED) {
5439 if (isSPACE(*RExC_parse)) {
5440 RExC_parse++;
5441 continue;
5442 }
5443 else if (*RExC_parse == '#') {
5444 while (RExC_parse < RExC_end)
5445 if (*RExC_parse++ == '\n') break;
5446 continue;
5447 }
5448 }
5449 return retval;
5450 }
5451}
5452
5453/*
5454- reg_node - emit a node
5455*/
5456STATIC regnode * /* Location. */
5457S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5458{
5459 register regnode *ptr;
5460 regnode * const ret = RExC_emit;
5461
5462 if (SIZE_ONLY) {
5463 SIZE_ALIGN(RExC_size);
5464 RExC_size += 1;
5465 return(ret);
5466 }
5467
5468 NODE_ALIGN_FILL(ret);
5469 ptr = ret;
5470 FILL_ADVANCE_NODE(ptr, op);
5471 if (RExC_offsets) { /* MJD */
5472 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5473 "reg_node", __LINE__,
5474 reg_name[op],
5475 RExC_emit - RExC_emit_start > RExC_offsets[0]
5476 ? "Overwriting end of array!\n" : "OK",
5477 RExC_emit - RExC_emit_start,
5478 RExC_parse - RExC_start,
5479 RExC_offsets[0]));
5480 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5481 }
5482
5483 RExC_emit = ptr;
5484
5485 return(ret);
5486}
5487
5488/*
5489- reganode - emit a node with an argument
5490*/
5491STATIC regnode * /* Location. */
5492S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5493{
5494 register regnode *ptr;
5495 regnode * const ret = RExC_emit;
5496
5497 if (SIZE_ONLY) {
5498 SIZE_ALIGN(RExC_size);
5499 RExC_size += 2;
5500 return(ret);
5501 }
5502
5503 NODE_ALIGN_FILL(ret);
5504 ptr = ret;
5505 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5506 if (RExC_offsets) { /* MJD */
5507 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5508 "reganode",
5509 __LINE__,
5510 reg_name[op],
5511 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5512 "Overwriting end of array!\n" : "OK",
5513 RExC_emit - RExC_emit_start,
5514 RExC_parse - RExC_start,
5515 RExC_offsets[0]));
5516 Set_Cur_Node_Offset;
5517 }
5518
5519 RExC_emit = ptr;
5520
5521 return(ret);
5522}
5523
5524/*
5525- reguni - emit (if appropriate) a Unicode character
5526*/
5527STATIC void
5528S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5529{
5530 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5531}
5532
5533/*
5534- reginsert - insert an operator in front of already-emitted operand
5535*
5536* Means relocating the operand.
5537*/
5538STATIC void
5539S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5540{
5541 register regnode *src;
5542 register regnode *dst;
5543 register regnode *place;
5544 const int offset = regarglen[(U8)op];
5545
5546/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5547
5548 if (SIZE_ONLY) {
5549 RExC_size += NODE_STEP_REGNODE + offset;
5550 return;
5551 }
5552
5553 src = RExC_emit;
5554 RExC_emit += NODE_STEP_REGNODE + offset;
5555 dst = RExC_emit;
5556 while (src > opnd) {
5557 StructCopy(--src, --dst, regnode);
5558 if (RExC_offsets) { /* MJD 20010112 */
5559 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5560 "reg_insert",
5561 __LINE__,
5562 reg_name[op],
5563 dst - RExC_emit_start > RExC_offsets[0]
5564 ? "Overwriting end of array!\n" : "OK",
5565 src - RExC_emit_start,
5566 dst - RExC_emit_start,
5567 RExC_offsets[0]));
5568 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5569 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5570 }
5571 }
5572
5573
5574 place = opnd; /* Op node, where operand used to be. */
5575 if (RExC_offsets) { /* MJD */
5576 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5577 "reginsert",
5578 __LINE__,
5579 reg_name[op],
5580 place - RExC_emit_start > RExC_offsets[0]
5581 ? "Overwriting end of array!\n" : "OK",
5582 place - RExC_emit_start,
5583 RExC_parse - RExC_start,
5584 RExC_offsets[0]));
5585 Set_Node_Offset(place, RExC_parse);
5586 Set_Node_Length(place, 1);
5587 }
5588 src = NEXTOPER(place);
5589 FILL_ADVANCE_NODE(place, op);
5590 Zero(src, offset, regnode);
5591}
5592
5593/*
5594- regtail - set the next-pointer at the end of a node chain of p to val.
5595*/
5596STATIC void
5597S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5598{
5599 register regnode *scan;
5600
5601 if (SIZE_ONLY)
5602 return;
5603
5604 /* Find last node. */
5605 scan = p;
5606 for (;;) {
5607 regnode * const temp = regnext(scan);
5608 if (temp == NULL)
5609 break;
5610 scan = temp;
5611 }
5612
5613 if (reg_off_by_arg[OP(scan)]) {
5614 ARG_SET(scan, val - scan);
5615 }
5616 else {
5617 NEXT_OFF(scan) = val - scan;
5618 }
5619}
5620
5621/*
5622- regoptail - regtail on operand of first argument; nop if operandless
5623*/
5624STATIC void
5625S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5626{
5627 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5628 if (p == NULL || SIZE_ONLY)
5629 return;
5630 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5631 regtail(pRExC_state, NEXTOPER(p), val);
5632 }
5633 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5634 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5635 }
5636 else
5637 return;
5638}
5639
5640/*
5641 - regcurly - a little FSA that accepts {\d+,?\d*}
5642 */
5643STATIC I32
5644S_regcurly(pTHX_ register const char *s)
5645{
5646 if (*s++ != '{')
5647 return FALSE;
5648 if (!isDIGIT(*s))
5649 return FALSE;
5650 while (isDIGIT(*s))
5651 s++;
5652 if (*s == ',')
5653 s++;
5654 while (isDIGIT(*s))
5655 s++;
5656 if (*s != '}')
5657 return FALSE;
5658 return TRUE;
5659}
5660
5661
5662/*
5663 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5664 */
5665void
5666Perl_regdump(pTHX_ regexp *r)
5667{
5668#ifdef DEBUGGING
5669 SV *sv = sv_newmortal();
5670
5671 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
5672
5673 /* Header fields of interest. */
5674 if (r->anchored_substr)
5675 PerlIO_printf(Perl_debug_log,
5676 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
5677 PL_colors[0],
5678 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5679 SvPVX_const(r->anchored_substr),
5680 PL_colors[1],
5681 SvTAIL(r->anchored_substr) ? "$" : "",
5682 (IV)r->anchored_offset);
5683 else if (r->anchored_utf8)
5684 PerlIO_printf(Perl_debug_log,
5685 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
5686 PL_colors[0],
5687 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5688 SvPVX_const(r->anchored_utf8),
5689 PL_colors[1],
5690 SvTAIL(r->anchored_utf8) ? "$" : "",
5691 (IV)r->anchored_offset);
5692 if (r->float_substr)
5693 PerlIO_printf(Perl_debug_log,
5694 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5695 PL_colors[0],
5696 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5697 SvPVX_const(r->float_substr),
5698 PL_colors[1],
5699 SvTAIL(r->float_substr) ? "$" : "",
5700 (IV)r->float_min_offset, (UV)r->float_max_offset);
5701 else if (r->float_utf8)
5702 PerlIO_printf(Perl_debug_log,
5703 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5704 PL_colors[0],
5705 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5706 SvPVX_const(r->float_utf8),
5707 PL_colors[1],
5708 SvTAIL(r->float_utf8) ? "$" : "",
5709 (IV)r->float_min_offset, (UV)r->float_max_offset);
5710 if (r->check_substr || r->check_utf8)
5711 PerlIO_printf(Perl_debug_log,
5712 r->check_substr == r->float_substr
5713 && r->check_utf8 == r->float_utf8
5714 ? "(checking floating" : "(checking anchored");
5715 if (r->reganch & ROPT_NOSCAN)
5716 PerlIO_printf(Perl_debug_log, " noscan");
5717 if (r->reganch & ROPT_CHECK_ALL)
5718 PerlIO_printf(Perl_debug_log, " isall");
5719 if (r->check_substr || r->check_utf8)
5720 PerlIO_printf(Perl_debug_log, ") ");
5721
5722 if (r->regstclass) {
5723 regprop(sv, r->regstclass);
5724 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
5725 }
5726 if (r->reganch & ROPT_ANCH) {
5727 PerlIO_printf(Perl_debug_log, "anchored");
5728 if (r->reganch & ROPT_ANCH_BOL)
5729 PerlIO_printf(Perl_debug_log, "(BOL)");
5730 if (r->reganch & ROPT_ANCH_MBOL)
5731 PerlIO_printf(Perl_debug_log, "(MBOL)");
5732 if (r->reganch & ROPT_ANCH_SBOL)
5733 PerlIO_printf(Perl_debug_log, "(SBOL)");
5734 if (r->reganch & ROPT_ANCH_GPOS)
5735 PerlIO_printf(Perl_debug_log, "(GPOS)");
5736 PerlIO_putc(Perl_debug_log, ' ');
5737 }
5738 if (r->reganch & ROPT_GPOS_SEEN)
5739 PerlIO_printf(Perl_debug_log, "GPOS ");
5740 if (r->reganch & ROPT_SKIP)
5741 PerlIO_printf(Perl_debug_log, "plus ");
5742 if (r->reganch & ROPT_IMPLICIT)
5743 PerlIO_printf(Perl_debug_log, "implicit ");
5744 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5745 if (r->reganch & ROPT_EVAL_SEEN)
5746 PerlIO_printf(Perl_debug_log, "with eval ");
5747 PerlIO_printf(Perl_debug_log, "\n");
5748 if (r->offsets) {
5749 const U32 len = r->offsets[0];
5750 GET_RE_DEBUG_FLAGS_DECL;
5751 DEBUG_OFFSETS_r({
5752 U32 i;
5753 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5754 for (i = 1; i <= len; i++)
5755 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5756 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5757 PerlIO_printf(Perl_debug_log, "\n");
5758 });
5759 }
5760#endif /* DEBUGGING */
5761}
5762
5763/*
5764- regprop - printable representation of opcode
5765*/
5766void
5767Perl_regprop(pTHX_ SV *sv, const regnode *o)
5768{
5769#ifdef DEBUGGING
5770 register int k;
5771
5772 sv_setpvn(sv, "", 0);
5773 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5774 /* It would be nice to FAIL() here, but this may be called from
5775 regexec.c, and it would be hard to supply pRExC_state. */
5776 Perl_croak(aTHX_ "Corrupted regexp opcode");
5777 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
5778
5779 k = PL_regkind[(U8)OP(o)];
5780
5781 if (k == EXACT) {
5782 SV *dsv = sv_2mortal(newSVpvn("", 0));
5783 /* Using is_utf8_string() is a crude hack but it may
5784 * be the best for now since we have no flag "this EXACTish
5785 * node was UTF-8" --jhi */
5786 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5787 const char *s = do_utf8 ?
5788 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5789 UNI_DISPLAY_REGEX) :
5790 STRING(o);
5791 const int len = do_utf8 ?
5792 strlen(s) :
5793 STR_LEN(o);
5794 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5795 PL_colors[0],
5796 len, s,
5797 PL_colors[1]);
5798 } else if (k == TRIE) {/*
5799 this isn't always safe, as Pl_regdata may not be for this regex yet
5800 (depending on where its called from) so its being moved to dumpuntil
5801 I32 n = ARG(o);
5802 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5803 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5804 trie->wordcount,
5805 trie->charcount,
5806 trie->uniquecharcount,
5807 trie->laststate);
5808 */
5809 } else if (k == CURLY) {
5810 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5811 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5812 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5813 }
5814 else if (k == WHILEM && o->flags) /* Ordinal/of */
5815 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5816 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5817 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5818 else if (k == LOGICAL)
5819 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5820 else if (k == ANYOF) {
5821 int i, rangestart = -1;
5822 U8 flags = ANYOF_FLAGS(o);
5823 const char * const anyofs[] = { /* Should be synchronized with
5824 * ANYOF_ #xdefines in regcomp.h */
5825 "\\w",
5826 "\\W",
5827 "\\s",
5828 "\\S",
5829 "\\d",
5830 "\\D",
5831 "[:alnum:]",
5832 "[:^alnum:]",
5833 "[:alpha:]",
5834 "[:^alpha:]",
5835 "[:ascii:]",
5836 "[:^ascii:]",
5837 "[:ctrl:]",
5838 "[:^ctrl:]",
5839 "[:graph:]",
5840 "[:^graph:]",
5841 "[:lower:]",
5842 "[:^lower:]",
5843 "[:print:]",
5844 "[:^print:]",
5845 "[:punct:]",
5846 "[:^punct:]",
5847 "[:upper:]",
5848 "[:^upper:]",
5849 "[:xdigit:]",
5850 "[:^xdigit:]",
5851 "[:space:]",
5852 "[:^space:]",
5853 "[:blank:]",
5854 "[:^blank:]"
5855 };
5856
5857 if (flags & ANYOF_LOCALE)
5858 sv_catpv(sv, "{loc}");
5859 if (flags & ANYOF_FOLD)
5860 sv_catpv(sv, "{i}");
5861 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
5862 if (flags & ANYOF_INVERT)
5863 sv_catpv(sv, "^");
5864 for (i = 0; i <= 256; i++) {
5865 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5866 if (rangestart == -1)
5867 rangestart = i;
5868 } else if (rangestart != -1) {
5869 if (i <= rangestart + 3)
5870 for (; rangestart < i; rangestart++)
5871 put_byte(sv, rangestart);
5872 else {
5873 put_byte(sv, rangestart);
5874 sv_catpv(sv, "-");
5875 put_byte(sv, i - 1);
5876 }
5877 rangestart = -1;
5878 }
5879 }
5880
5881 if (o->flags & ANYOF_CLASS)
5882 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5883 if (ANYOF_CLASS_TEST(o,i))
5884 sv_catpv(sv, anyofs[i]);
5885
5886 if (flags & ANYOF_UNICODE)
5887 sv_catpv(sv, "{unicode}");
5888 else if (flags & ANYOF_UNICODE_ALL)
5889 sv_catpv(sv, "{unicode_all}");
5890
5891 {
5892 SV *lv;
5893 SV *sw = regclass_swash(o, FALSE, &lv, 0);
5894
5895 if (lv) {
5896 if (sw) {
5897 U8 s[UTF8_MAXBYTES_CASE+1];
5898
5899 for (i = 0; i <= 256; i++) { /* just the first 256 */
5900 uvchr_to_utf8(s, i);
5901
5902 if (i < 256 && swash_fetch(sw, s, TRUE)) {
5903 if (rangestart == -1)
5904 rangestart = i;
5905 } else if (rangestart != -1) {
5906 U8 *p;
5907
5908 if (i <= rangestart + 3)
5909 for (; rangestart < i; rangestart++) {
5910 U8 *e;
5911 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
5912 put_byte(sv, *p);
5913 }
5914 else {
5915 U8 *e;
5916 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
5917 put_byte(sv, *p);
5918 sv_catpv(sv, "-");
5919 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
5920 put_byte(sv, *p);
5921 }
5922 rangestart = -1;
5923 }
5924 }
5925
5926 sv_catpv(sv, "..."); /* et cetera */
5927 }
5928
5929 {
5930 char *s = savesvpv(lv);
5931 char *origs = s;
5932
5933 while(*s && *s != '\n') s++;
5934
5935 if (*s == '\n') {
5936 const char *t = ++s;
5937
5938 while (*s) {
5939 if (*s == '\n')
5940 *s = ' ';
5941 s++;
5942 }
5943 if (s[-1] == ' ')
5944 s[-1] = 0;
5945
5946 sv_catpv(sv, t);
5947 }
5948
5949 Safefree(origs);
5950 }
5951 }
5952 }
5953
5954 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5955 }
5956 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
5957 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
5958#endif /* DEBUGGING */
5959}
5960
5961SV *
5962Perl_re_intuit_string(pTHX_ regexp *prog)
5963{ /* Assume that RE_INTUIT is set */
5964 GET_RE_DEBUG_FLAGS_DECL;
5965 DEBUG_COMPILE_r(
5966 { STRLEN n_a;
5967 const char *s = SvPV(prog->check_substr
5968 ? prog->check_substr : prog->check_utf8, n_a);
5969
5970 if (!PL_colorset) reginitcolors();
5971 PerlIO_printf(Perl_debug_log,
5972 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
5973 PL_colors[4],
5974 prog->check_substr ? "" : "utf8 ",
5975 PL_colors[5],PL_colors[0],
5976 s,
5977 PL_colors[1],
5978 (strlen(s) > 60 ? "..." : ""));
5979 } );
5980
5981 return prog->check_substr ? prog->check_substr : prog->check_utf8;
5982}
5983
5984void
5985Perl_pregfree(pTHX_ struct regexp *r)
5986{
5987 dVAR;
5988#ifdef DEBUGGING
5989 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
5990 SV *re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
5991#endif
5992
5993
5994 if (!r || (--r->refcnt > 0))
5995 return;
5996 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
5997 const char *s = (r->reganch & ROPT_UTF8)
5998 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
5999 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6000 const int len = SvCUR(dsv);
6001 if (!PL_colorset)
6002 reginitcolors();
6003 PerlIO_printf(Perl_debug_log,
6004 "%sFreeing REx:%s %s%*.*s%s%s\n",
6005 PL_colors[4],PL_colors[5],PL_colors[0],
6006 len, len, s,
6007 PL_colors[1],
6008 len > 60 ? "..." : "");
6009 });
6010
6011 if (r->precomp)
6012 Safefree(r->precomp);
6013 if (r->offsets) /* 20010421 MJD */
6014 Safefree(r->offsets);
6015 RX_MATCH_COPY_FREE(r);
6016#ifdef PERL_OLD_COPY_ON_WRITE
6017 if (r->saved_copy)
6018 SvREFCNT_dec(r->saved_copy);
6019#endif
6020 if (r->substrs) {
6021 if (r->anchored_substr)
6022 SvREFCNT_dec(r->anchored_substr);
6023 if (r->anchored_utf8)
6024 SvREFCNT_dec(r->anchored_utf8);
6025 if (r->float_substr)
6026 SvREFCNT_dec(r->float_substr);
6027 if (r->float_utf8)
6028 SvREFCNT_dec(r->float_utf8);
6029 Safefree(r->substrs);
6030 }
6031 if (r->data) {
6032 int n = r->data->count;
6033 PAD* new_comppad = NULL;
6034 PAD* old_comppad;
6035 PADOFFSET refcnt;
6036
6037 while (--n >= 0) {
6038 /* If you add a ->what type here, update the comment in regcomp.h */
6039 switch (r->data->what[n]) {
6040 case 's':
6041 SvREFCNT_dec((SV*)r->data->data[n]);
6042 break;
6043 case 'f':
6044 Safefree(r->data->data[n]);
6045 break;
6046 case 'p':
6047 new_comppad = (AV*)r->data->data[n];
6048 break;
6049 case 'o':
6050 if (new_comppad == NULL)
6051 Perl_croak(aTHX_ "panic: pregfree comppad");
6052 PAD_SAVE_LOCAL(old_comppad,
6053 /* Watch out for global destruction's random ordering. */
6054 (SvTYPE(new_comppad) == SVt_PVAV) ?
6055 new_comppad : Null(PAD *)
6056 );
6057 OP_REFCNT_LOCK;
6058 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6059 OP_REFCNT_UNLOCK;
6060 if (!refcnt)
6061 op_free((OP_4tree*)r->data->data[n]);
6062
6063 PAD_RESTORE_LOCAL(old_comppad);
6064 SvREFCNT_dec((SV*)new_comppad);
6065 new_comppad = NULL;
6066 break;
6067 case 'n':
6068 break;
6069 case 't':
6070 {
6071 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6072 U32 refcount;
6073 OP_REFCNT_LOCK;
6074 refcount = trie->refcount--;
6075 OP_REFCNT_UNLOCK;
6076 if ( !refcount ) {
6077 if (trie->charmap)
6078 Safefree(trie->charmap);
6079 if (trie->widecharmap)
6080 SvREFCNT_dec((SV*)trie->widecharmap);
6081 if (trie->states)
6082 Safefree(trie->states);
6083 if (trie->trans)
6084 Safefree(trie->trans);
6085#ifdef DEBUGGING
6086 if (trie->words)
6087 SvREFCNT_dec((SV*)trie->words);
6088 if (trie->revcharmap)
6089 SvREFCNT_dec((SV*)trie->revcharmap);
6090#endif
6091 Safefree(r->data->data[n]); /* do this last!!!! */
6092 }
6093 break;
6094 }
6095 default:
6096 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6097 }
6098 }
6099 Safefree(r->data->what);
6100 Safefree(r->data);
6101 }
6102 Safefree(r->startp);
6103 Safefree(r->endp);
6104 Safefree(r);
6105}
6106
6107/*
6108 - regnext - dig the "next" pointer out of a node
6109 */
6110regnode *
6111Perl_regnext(pTHX_ register regnode *p)
6112{
6113 register I32 offset;
6114
6115 if (p == &PL_regdummy)
6116 return(NULL);
6117
6118 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6119 if (offset == 0)
6120 return(NULL);
6121
6122 return(p+offset);
6123}
6124
6125STATIC void
6126S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6127{
6128 va_list args;
6129 STRLEN l1 = strlen(pat1);
6130 STRLEN l2 = strlen(pat2);
6131 char buf[512];
6132 SV *msv;
6133 const char *message;
6134
6135 if (l1 > 510)
6136 l1 = 510;
6137 if (l1 + l2 > 510)
6138 l2 = 510 - l1;
6139 Copy(pat1, buf, l1 , char);
6140 Copy(pat2, buf + l1, l2 , char);
6141 buf[l1 + l2] = '\n';
6142 buf[l1 + l2 + 1] = '\0';
6143#ifdef I_STDARG
6144 /* ANSI variant takes additional second argument */
6145 va_start(args, pat2);
6146#else
6147 va_start(args);
6148#endif
6149 msv = vmess(buf, &args);
6150 va_end(args);
6151 message = SvPV(msv,l1);
6152 if (l1 > 512)
6153 l1 = 512;
6154 Copy(message, buf, l1 , char);
6155 buf[l1-1] = '\0'; /* Overwrite \n */
6156 Perl_croak(aTHX_ "%s", buf);
6157}
6158
6159/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6160
6161void
6162Perl_save_re_context(pTHX)
6163{
6164 SAVEI32(PL_reg_flags); /* from regexec.c */
6165 SAVEPPTR(PL_bostr);
6166 SAVEPPTR(PL_reginput); /* String-input pointer. */
6167 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6168 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
6169 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6170 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6171 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
6172 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
6173 SAVEPPTR(PL_regtill); /* How far we are required to go. */
6174 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
6175 PL_reg_start_tmp = 0;
6176 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6177 PL_reg_start_tmpl = 0;
6178 SAVEVPTR(PL_regdata);
6179 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6180 SAVEI32(PL_regnarrate); /* from regexec.c */
6181 SAVEVPTR(PL_regprogram); /* from regexec.c */
6182 SAVEINT(PL_regindent); /* from regexec.c */
6183 SAVEVPTR(PL_regcc); /* from regexec.c */
6184 SAVEVPTR(PL_curcop);
6185 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6186 SAVEVPTR(PL_reg_re); /* from regexec.c */
6187 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6188 SAVESPTR(PL_reg_sv); /* from regexec.c */
6189 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
6190 SAVEVPTR(PL_reg_magic); /* from regexec.c */
6191 SAVEI32(PL_reg_oldpos); /* from regexec.c */
6192 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6193 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
6194 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6195 PL_reg_oldsaved = Nullch;
6196 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6197 PL_reg_oldsavedlen = 0;
6198#ifdef PERL_OLD_COPY_ON_WRITE
6199 SAVESPTR(PL_nrs);
6200 PL_nrs = Nullsv;
6201#endif
6202 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6203 PL_reg_maxiter = 0;
6204 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6205 PL_reg_leftiter = 0;
6206 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6207 PL_reg_poscache = Nullch;
6208 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6209 PL_reg_poscache_size = 0;
6210 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
6211 SAVEI32(PL_regnpar); /* () count. */
6212 SAVEI32(PL_regsize); /* from regexec.c */
6213
6214 {
6215 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6216 REGEXP *rx;
6217
6218 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
6219 U32 i;
6220 for (i = 1; i <= rx->nparens; i++) {
6221 GV *mgv;
6222 char digits[TYPE_CHARS(long)];
6223 sprintf(digits, "%lu", (long)i);
6224 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
6225 save_scalar(mgv);
6226 }
6227 }
6228 }
6229
6230#ifdef DEBUGGING
6231 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
6232#endif
6233}
6234
6235static void
6236clear_re(pTHX_ void *r)
6237{
6238 ReREFCNT_dec((regexp *)r);
6239}
6240
6241#ifdef DEBUGGING
6242
6243STATIC void
6244S_put_byte(pTHX_ SV *sv, int c)
6245{
6246 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6247 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6248 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6249 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6250 else
6251 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6252}
6253
6254
6255STATIC regnode *
6256S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
6257{
6258 register U8 op = EXACT; /* Arbitrary non-END op. */
6259 register regnode *next;
6260
6261 while (op != END && (!last || node < last)) {
6262 /* While that wasn't END last time... */
6263
6264 NODE_ALIGN(node);
6265 op = OP(node);
6266 if (op == CLOSE)
6267 l--;
6268 next = regnext(node);
6269 /* Where, what. */
6270 if (OP(node) == OPTIMIZED)
6271 goto after_print;
6272 regprop(sv, node);
6273 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6274 (int)(2*l + 1), "", SvPVX_const(sv));
6275 if (next == NULL) /* Next ptr. */
6276 PerlIO_printf(Perl_debug_log, "(0)");
6277 else
6278 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6279 (void)PerlIO_putc(Perl_debug_log, '\n');
6280 after_print:
6281 if (PL_regkind[(U8)op] == BRANCHJ) {
6282 register regnode *nnode = (OP(next) == LONGJMP
6283 ? regnext(next)
6284 : next);
6285 if (last && nnode > last)
6286 nnode = last;
6287 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6288 }
6289 else if (PL_regkind[(U8)op] == BRANCH) {
6290 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
6291 }
6292 else if ( PL_regkind[(U8)op] == TRIE ) {
6293 const I32 n = ARG(node);
6294 const reg_trie_data * const trie = (reg_trie_data*)PL_regdata->data[n];
6295 const I32 arry_len = av_len(trie->words)+1;
6296 I32 word_idx;
6297 PerlIO_printf(Perl_debug_log,
6298 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6299 (int)(2*(l+3)),
6300 "",
6301 trie->wordcount,
6302 (int)trie->charcount,
6303 trie->uniquecharcount,
6304 (IV)trie->laststate-1,
6305 node->flags ? " EVAL mode" : "");
6306
6307 for (word_idx=0; word_idx < arry_len; word_idx++) {
6308 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
6309 if (elem_ptr) {
6310 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6311 (int)(2*(l+4)), "",
6312 PL_colors[0],
6313 SvPV_nolen(*elem_ptr),
6314 PL_colors[1]
6315 );
6316 /*
6317 if (next == NULL)
6318 PerlIO_printf(Perl_debug_log, "(0)\n");
6319 else
6320 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6321 */
6322 }
6323
6324 }
6325
6326 node = NEXTOPER(node);
6327 node += regarglen[(U8)op];
6328
6329 }
6330 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6331 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6332 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6333 }
6334 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6335 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6336 next, sv, l + 1);
6337 }
6338 else if ( op == PLUS || op == STAR) {
6339 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6340 }
6341 else if (op == ANYOF) {
6342 /* arglen 1 + class block */
6343 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6344 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6345 node = NEXTOPER(node);
6346 }
6347 else if (PL_regkind[(U8)op] == EXACT) {
6348 /* Literal string, where present. */
6349 node += NODE_SZ_STR(node) - 1;
6350 node = NEXTOPER(node);
6351 }
6352 else {
6353 node = NEXTOPER(node);
6354 node += regarglen[(U8)op];
6355 }
6356 if (op == CURLYX || op == OPEN)
6357 l++;
6358 else if (op == WHILEM)
6359 l--;
6360 }
6361 return node;
6362}
6363
6364#endif /* DEBUGGING */
6365
6366/*
6367 * Local variables:
6368 * c-indentation-style: bsd
6369 * c-basic-offset: 4
6370 * indent-tabs-mode: t
6371 * End:
6372 *
6373 * ex: set ts=8 sts=4 sw=4 noet:
6374 */