This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add comment to top of reentr.c and fix typos in other files
[perl5.git] / regcomp.c
... / ...
CommitLineData
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
11 */
12
13/* NOTE: this is derived from Henry Spencer's regexp code, and should not
14 * confused with the original package (see point 3 below). Thanks, Henry!
15 */
16
17/* Additional note: this code is very heavily munged from Henry's version
18 * in places. In some spots I've traded clarity for efficiency, so don't
19 * blame Henry for some of the lack of readability.
20 */
21
22/* The names of the functions have been changed from regcomp and
23 * regexec to pregcomp and pregexec in order to avoid conflicts
24 * with the POSIX routines of the same names.
25*/
26
27#ifdef PERL_EXT_RE_BUILD
28/* need to replace pregcomp et al, so enable that */
29# ifndef PERL_IN_XSUB_RE
30# define PERL_IN_XSUB_RE
31# endif
32/* need access to debugger hooks */
33# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
34# define DEBUGGING
35# endif
36#endif
37
38#ifdef PERL_IN_XSUB_RE
39/* We *really* need to overwrite these symbols: */
40# define Perl_pregcomp my_regcomp
41# define Perl_regdump my_regdump
42# define Perl_regprop my_regprop
43# define Perl_pregfree my_regfree
44# define Perl_re_intuit_string my_re_intuit_string
45/* *These* symbols are masked to allow static link. */
46# define Perl_regnext my_regnext
47# define Perl_save_re_context my_save_re_context
48# define Perl_reginitcolors my_reginitcolors
49
50# define PERL_NO_GET_CONTEXT
51#endif
52
53/*SUPPRESS 112*/
54/*
55 * pregcomp and pregexec -- regsub and regerror are not used in perl
56 *
57 * Copyright (c) 1986 by University of Toronto.
58 * Written by Henry Spencer. Not derived from licensed software.
59 *
60 * Permission is granted to anyone to use this software for any
61 * purpose on any computer system, and to redistribute it freely,
62 * subject to the following restrictions:
63 *
64 * 1. The author is not responsible for the consequences of use of
65 * this software, no matter how awful, even if they arise
66 * from defects in it.
67 *
68 * 2. The origin of this software must not be misrepresented, either
69 * by explicit claim or by omission.
70 *
71 * 3. Altered versions must be plainly marked as such, and must not
72 * be misrepresented as being the original software.
73 *
74 *
75 **** Alterations to Henry's code are...
76 ****
77 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
78 **** 2000, 2001, 2002, 2003, by Larry Wall and others
79 ****
80 **** You may distribute under the terms of either the GNU General Public
81 **** License or the Artistic License, as specified in the README file.
82
83 *
84 * Beware that some of this code is subtly aware of the way operator
85 * precedence is structured in regular expressions. Serious changes in
86 * regular-expression syntax might require a total rethink.
87 */
88#include "EXTERN.h"
89#define PERL_IN_REGCOMP_C
90#include "perl.h"
91
92#ifndef PERL_IN_XSUB_RE
93# include "INTERN.h"
94#endif
95
96#define REG_COMP_C
97#include "regcomp.h"
98
99#ifdef op
100#undef op
101#endif /* op */
102
103#ifdef MSDOS
104# if defined(BUGGY_MSC6)
105 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
106# pragma optimize("a",off)
107 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
108# pragma optimize("w",on )
109# endif /* BUGGY_MSC6 */
110#endif /* MSDOS */
111
112#ifndef STATIC
113#define STATIC static
114#endif
115
116typedef struct RExC_state_t {
117 U32 flags; /* are we folding, multilining? */
118 char *precomp; /* uncompiled string. */
119 regexp *rx;
120 char *start; /* Start of input for compile */
121 char *end; /* End of input for compile */
122 char *parse; /* Input-scan pointer. */
123 I32 whilem_seen; /* number of WHILEM in this expr */
124 regnode *emit_start; /* Start of emitted-code area */
125 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
126 I32 naughty; /* How bad is this pattern? */
127 I32 sawback; /* Did we see \1, ...? */
128 U32 seen;
129 I32 size; /* Code size. */
130 I32 npar; /* () count. */
131 I32 extralen;
132 I32 seen_zerolen;
133 I32 seen_evals;
134 I32 utf8;
135#if ADD_TO_REGEXEC
136 char *starttry; /* -Dr: where regtry was called. */
137#define RExC_starttry (pRExC_state->starttry)
138#endif
139} RExC_state_t;
140
141#define RExC_flags (pRExC_state->flags)
142#define RExC_precomp (pRExC_state->precomp)
143#define RExC_rx (pRExC_state->rx)
144#define RExC_start (pRExC_state->start)
145#define RExC_end (pRExC_state->end)
146#define RExC_parse (pRExC_state->parse)
147#define RExC_whilem_seen (pRExC_state->whilem_seen)
148#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
149#define RExC_emit (pRExC_state->emit)
150#define RExC_emit_start (pRExC_state->emit_start)
151#define RExC_naughty (pRExC_state->naughty)
152#define RExC_sawback (pRExC_state->sawback)
153#define RExC_seen (pRExC_state->seen)
154#define RExC_size (pRExC_state->size)
155#define RExC_npar (pRExC_state->npar)
156#define RExC_extralen (pRExC_state->extralen)
157#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
158#define RExC_seen_evals (pRExC_state->seen_evals)
159#define RExC_utf8 (pRExC_state->utf8)
160
161#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
162#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
163 ((*s) == '{' && regcurly(s)))
164
165#ifdef SPSTART
166#undef SPSTART /* dratted cpp namespace... */
167#endif
168/*
169 * Flags to be passed up and down.
170 */
171#define WORST 0 /* Worst case. */
172#define HASWIDTH 0x1 /* Known to match non-null strings. */
173#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
174#define SPSTART 0x4 /* Starts with * or +. */
175#define TRYAGAIN 0x8 /* Weeded out a declaration. */
176
177/* Length of a variant. */
178
179typedef struct scan_data_t {
180 I32 len_min;
181 I32 len_delta;
182 I32 pos_min;
183 I32 pos_delta;
184 SV *last_found;
185 I32 last_end; /* min value, <0 unless valid. */
186 I32 last_start_min;
187 I32 last_start_max;
188 SV **longest; /* Either &l_fixed, or &l_float. */
189 SV *longest_fixed;
190 I32 offset_fixed;
191 SV *longest_float;
192 I32 offset_float_min;
193 I32 offset_float_max;
194 I32 flags;
195 I32 whilem_c;
196 I32 *last_closep;
197 struct regnode_charclass_class *start_class;
198} scan_data_t;
199
200/*
201 * Forward declarations for pregcomp()'s friends.
202 */
203
204static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
205 0, 0, 0, 0, 0, 0};
206
207#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
208#define SF_BEFORE_SEOL 0x1
209#define SF_BEFORE_MEOL 0x2
210#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
211#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
212
213#ifdef NO_UNARY_PLUS
214# define SF_FIX_SHIFT_EOL (0+2)
215# define SF_FL_SHIFT_EOL (0+4)
216#else
217# define SF_FIX_SHIFT_EOL (+2)
218# define SF_FL_SHIFT_EOL (+4)
219#endif
220
221#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
222#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
223
224#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
225#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
226#define SF_IS_INF 0x40
227#define SF_HAS_PAR 0x80
228#define SF_IN_PAR 0x100
229#define SF_HAS_EVAL 0x200
230#define SCF_DO_SUBSTR 0x400
231#define SCF_DO_STCLASS_AND 0x0800
232#define SCF_DO_STCLASS_OR 0x1000
233#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
234#define SCF_WHILEM_VISITED_POS 0x2000
235
236#define UTF (RExC_utf8 != 0)
237#define LOC ((RExC_flags & PMf_LOCALE) != 0)
238#define FOLD ((RExC_flags & PMf_FOLD) != 0)
239
240#define OOB_UNICODE 12345678
241#define OOB_NAMEDCLASS -1
242
243#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
244#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
245
246
247/* length of regex to show in messages that don't mark a position within */
248#define RegexLengthToShowInErrorMessages 127
249
250/*
251 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
252 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
253 * op/pragma/warn/regcomp.
254 */
255#define MARKER1 "<-- HERE" /* marker as it appears in the description */
256#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
257
258#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
259
260/*
261 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
262 * arg. Show regex, up to a maximum length. If it's too long, chop and add
263 * "...".
264 */
265#define FAIL(msg) STMT_START { \
266 char *ellipses = ""; \
267 IV len = RExC_end - RExC_precomp; \
268 \
269 if (!SIZE_ONLY) \
270 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
271 if (len > RegexLengthToShowInErrorMessages) { \
272 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
273 len = RegexLengthToShowInErrorMessages - 10; \
274 ellipses = "..."; \
275 } \
276 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
277 msg, (int)len, RExC_precomp, ellipses); \
278} STMT_END
279
280/*
281 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
282 * args. Show regex, up to a maximum length. If it's too long, chop and add
283 * "...".
284 */
285#define FAIL2(pat,msg) STMT_START { \
286 char *ellipses = ""; \
287 IV len = RExC_end - RExC_precomp; \
288 \
289 if (!SIZE_ONLY) \
290 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
291 if (len > RegexLengthToShowInErrorMessages) { \
292 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
293 len = RegexLengthToShowInErrorMessages - 10; \
294 ellipses = "..."; \
295 } \
296 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
297 msg, (int)len, RExC_precomp, ellipses); \
298} STMT_END
299
300
301/*
302 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
303 */
304#define Simple_vFAIL(m) STMT_START { \
305 IV offset = RExC_parse - RExC_precomp; \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
308} STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) STMT_START { \
314 if (!SIZE_ONLY) \
315 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
316 Simple_vFAIL(m); \
317} STMT_END
318
319/*
320 * Like Simple_vFAIL(), but accepts two arguments.
321 */
322#define Simple_vFAIL2(m,a1) STMT_START { \
323 IV offset = RExC_parse - RExC_precomp; \
324 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
325 (int)offset, RExC_precomp, RExC_precomp + offset); \
326} STMT_END
327
328/*
329 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
330 */
331#define vFAIL2(m,a1) STMT_START { \
332 if (!SIZE_ONLY) \
333 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
334 Simple_vFAIL2(m, a1); \
335} STMT_END
336
337
338/*
339 * Like Simple_vFAIL(), but accepts three arguments.
340 */
341#define Simple_vFAIL3(m, a1, a2) STMT_START { \
342 IV offset = RExC_parse - RExC_precomp; \
343 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
344 (int)offset, RExC_precomp, RExC_precomp + offset); \
345} STMT_END
346
347/*
348 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
349 */
350#define vFAIL3(m,a1,a2) STMT_START { \
351 if (!SIZE_ONLY) \
352 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
353 Simple_vFAIL3(m, a1, a2); \
354} STMT_END
355
356/*
357 * Like Simple_vFAIL(), but accepts four arguments.
358 */
359#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
360 IV offset = RExC_parse - RExC_precomp; \
361 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
362 (int)offset, RExC_precomp, RExC_precomp + offset); \
363} STMT_END
364
365/*
366 * Like Simple_vFAIL(), but accepts five arguments.
367 */
368#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
369 IV offset = RExC_parse - RExC_precomp; \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
371 (int)offset, RExC_precomp, RExC_precomp + offset); \
372} STMT_END
373
374
375#define vWARN(loc,m) STMT_START { \
376 IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
378 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
379} STMT_END
380
381#define vWARNdep(loc,m) STMT_START { \
382 IV offset = loc - RExC_precomp; \
383 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
384 "%s" REPORT_LOCATION, \
385 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
386} STMT_END
387
388
389#define vWARN2(loc, m, a1) STMT_START { \
390 IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
394
395#define vWARN3(loc, m, a1, a2) STMT_START { \
396 IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
399} STMT_END
400
401#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
405} STMT_END
406
407#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
408 IV offset = loc - RExC_precomp; \
409 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
410 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
411} STMT_END
412
413
414/* Allow for side effects in s */
415#define REGC(c,s) STMT_START { \
416 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
417} STMT_END
418
419/* Macros for recording node offsets. 20001227 mjd@plover.com
420 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
421 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
422 * Element 0 holds the number n.
423 */
424
425#define MJD_OFFSET_DEBUG(x)
426/* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
427
428
429#define Set_Node_Offset_To_R(node,byte) STMT_START { \
430 if (! SIZE_ONLY) { \
431 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
432 __LINE__, (node), (byte))); \
433 if((node) < 0) { \
434 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
435 } else { \
436 RExC_offsets[2*(node)-1] = (byte); \
437 } \
438 } \
439} STMT_END
440
441#define Set_Node_Offset(node,byte) \
442 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
443#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
444
445#define Set_Node_Length_To_R(node,len) STMT_START { \
446 if (! SIZE_ONLY) { \
447 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
448 __LINE__, (node), (len))); \
449 if((node) < 0) { \
450 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
451 } else { \
452 RExC_offsets[2*(node)] = (len); \
453 } \
454 } \
455} STMT_END
456
457#define Set_Node_Length(node,len) \
458 Set_Node_Length_To_R((node)-RExC_emit_start, len)
459#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
460#define Set_Node_Cur_Length(node) \
461 Set_Node_Length(node, RExC_parse - parse_start)
462
463/* Get offsets and lengths */
464#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
465#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
466
467static void clear_re(pTHX_ void *r);
468
469/* Mark that we cannot extend a found fixed substring at this point.
470 Updata the longest found anchored substring and the longest found
471 floating substrings if needed. */
472
473STATIC void
474S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
475{
476 STRLEN l = CHR_SVLEN(data->last_found);
477 STRLEN old_l = CHR_SVLEN(*data->longest);
478
479 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
480 SvSetMagicSV(*data->longest, data->last_found);
481 if (*data->longest == data->longest_fixed) {
482 data->offset_fixed = l ? data->last_start_min : data->pos_min;
483 if (data->flags & SF_BEFORE_EOL)
484 data->flags
485 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
486 else
487 data->flags &= ~SF_FIX_BEFORE_EOL;
488 }
489 else {
490 data->offset_float_min = l ? data->last_start_min : data->pos_min;
491 data->offset_float_max = (l
492 ? data->last_start_max
493 : data->pos_min + data->pos_delta);
494 if ((U32)data->offset_float_max > (U32)I32_MAX)
495 data->offset_float_max = I32_MAX;
496 if (data->flags & SF_BEFORE_EOL)
497 data->flags
498 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
499 else
500 data->flags &= ~SF_FL_BEFORE_EOL;
501 }
502 }
503 SvCUR_set(data->last_found, 0);
504 {
505 SV * sv = data->last_found;
506 MAGIC *mg =
507 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
508 if (mg && mg->mg_len > 0)
509 mg->mg_len = 0;
510 }
511 data->last_end = -1;
512 data->flags &= ~SF_BEFORE_EOL;
513}
514
515/* Can match anything (initialization) */
516STATIC void
517S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
518{
519 ANYOF_CLASS_ZERO(cl);
520 ANYOF_BITMAP_SETALL(cl);
521 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
522 if (LOC)
523 cl->flags |= ANYOF_LOCALE;
524}
525
526/* Can match anything (initialization) */
527STATIC int
528S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
529{
530 int value;
531
532 for (value = 0; value <= ANYOF_MAX; value += 2)
533 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
534 return 1;
535 if (!(cl->flags & ANYOF_UNICODE_ALL))
536 return 0;
537 if (!ANYOF_BITMAP_TESTALLSET(cl))
538 return 0;
539 return 1;
540}
541
542/* Can match anything (initialization) */
543STATIC void
544S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
545{
546 Zero(cl, 1, struct regnode_charclass_class);
547 cl->type = ANYOF;
548 cl_anything(pRExC_state, cl);
549}
550
551STATIC void
552S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
553{
554 Zero(cl, 1, struct regnode_charclass_class);
555 cl->type = ANYOF;
556 cl_anything(pRExC_state, cl);
557 if (LOC)
558 cl->flags |= ANYOF_LOCALE;
559}
560
561/* 'And' a given class with another one. Can create false positives */
562/* We assume that cl is not inverted */
563STATIC void
564S_cl_and(pTHX_ struct regnode_charclass_class *cl,
565 struct regnode_charclass_class *and_with)
566{
567 if (!(and_with->flags & ANYOF_CLASS)
568 && !(cl->flags & ANYOF_CLASS)
569 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
570 && !(and_with->flags & ANYOF_FOLD)
571 && !(cl->flags & ANYOF_FOLD)) {
572 int i;
573
574 if (and_with->flags & ANYOF_INVERT)
575 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
576 cl->bitmap[i] &= ~and_with->bitmap[i];
577 else
578 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
579 cl->bitmap[i] &= and_with->bitmap[i];
580 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
581 if (!(and_with->flags & ANYOF_EOS))
582 cl->flags &= ~ANYOF_EOS;
583
584 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
585 !(and_with->flags & ANYOF_INVERT)) {
586 cl->flags &= ~ANYOF_UNICODE_ALL;
587 cl->flags |= ANYOF_UNICODE;
588 ARG_SET(cl, ARG(and_with));
589 }
590 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
591 !(and_with->flags & ANYOF_INVERT))
592 cl->flags &= ~ANYOF_UNICODE_ALL;
593 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
594 !(and_with->flags & ANYOF_INVERT))
595 cl->flags &= ~ANYOF_UNICODE;
596}
597
598/* 'OR' a given class with another one. Can create false positives */
599/* We assume that cl is not inverted */
600STATIC void
601S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
602{
603 if (or_with->flags & ANYOF_INVERT) {
604 /* We do not use
605 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
606 * <= (B1 | !B2) | (CL1 | !CL2)
607 * which is wasteful if CL2 is small, but we ignore CL2:
608 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
609 * XXXX Can we handle case-fold? Unclear:
610 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
611 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
612 */
613 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
614 && !(or_with->flags & ANYOF_FOLD)
615 && !(cl->flags & ANYOF_FOLD) ) {
616 int i;
617
618 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
619 cl->bitmap[i] |= ~or_with->bitmap[i];
620 } /* XXXX: logic is complicated otherwise */
621 else {
622 cl_anything(pRExC_state, cl);
623 }
624 } else {
625 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
626 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
627 && (!(or_with->flags & ANYOF_FOLD)
628 || (cl->flags & ANYOF_FOLD)) ) {
629 int i;
630
631 /* OR char bitmap and class bitmap separately */
632 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
633 cl->bitmap[i] |= or_with->bitmap[i];
634 if (or_with->flags & ANYOF_CLASS) {
635 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
636 cl->classflags[i] |= or_with->classflags[i];
637 cl->flags |= ANYOF_CLASS;
638 }
639 }
640 else { /* XXXX: logic is complicated, leave it along for a moment. */
641 cl_anything(pRExC_state, cl);
642 }
643 }
644 if (or_with->flags & ANYOF_EOS)
645 cl->flags |= ANYOF_EOS;
646
647 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
648 ARG(cl) != ARG(or_with)) {
649 cl->flags |= ANYOF_UNICODE_ALL;
650 cl->flags &= ~ANYOF_UNICODE;
651 }
652 if (or_with->flags & ANYOF_UNICODE_ALL) {
653 cl->flags |= ANYOF_UNICODE_ALL;
654 cl->flags &= ~ANYOF_UNICODE;
655 }
656}
657
658/*
659 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
660 * These need to be revisited when a newer toolchain becomes available.
661 */
662#if defined(__sparc64__) && defined(__GNUC__)
663# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
664# undef SPARC64_GCC_WORKAROUND
665# define SPARC64_GCC_WORKAROUND 1
666# endif
667#endif
668
669/* REx optimizer. Converts nodes into quickier variants "in place".
670 Finds fixed substrings. */
671
672/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
673 to the position after last scanned or to NULL. */
674
675STATIC I32
676S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
677 /* scanp: Start here (read-write). */
678 /* deltap: Write maxlen-minlen here. */
679 /* last: Stop before this one. */
680{
681 I32 min = 0, pars = 0, code;
682 regnode *scan = *scanp, *next;
683 I32 delta = 0;
684 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
685 int is_inf_internal = 0; /* The studied chunk is infinite */
686 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
687 scan_data_t data_fake;
688 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
689
690 while (scan && OP(scan) != END && scan < last) {
691 /* Peephole optimizer: */
692
693 if (PL_regkind[(U8)OP(scan)] == EXACT) {
694 /* Merge several consecutive EXACTish nodes into one. */
695 regnode *n = regnext(scan);
696 U32 stringok = 1;
697#ifdef DEBUGGING
698 regnode *stop = scan;
699#endif
700
701 next = scan + NODE_SZ_STR(scan);
702 /* Skip NOTHING, merge EXACT*. */
703 while (n &&
704 ( PL_regkind[(U8)OP(n)] == NOTHING ||
705 (stringok && (OP(n) == OP(scan))))
706 && NEXT_OFF(n)
707 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
708 if (OP(n) == TAIL || n > next)
709 stringok = 0;
710 if (PL_regkind[(U8)OP(n)] == NOTHING) {
711 NEXT_OFF(scan) += NEXT_OFF(n);
712 next = n + NODE_STEP_REGNODE;
713#ifdef DEBUGGING
714 if (stringok)
715 stop = n;
716#endif
717 n = regnext(n);
718 }
719 else if (stringok) {
720 int oldl = STR_LEN(scan);
721 regnode *nnext = regnext(n);
722
723 if (oldl + STR_LEN(n) > U8_MAX)
724 break;
725 NEXT_OFF(scan) += NEXT_OFF(n);
726 STR_LEN(scan) += STR_LEN(n);
727 next = n + NODE_SZ_STR(n);
728 /* Now we can overwrite *n : */
729 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
730#ifdef DEBUGGING
731 stop = next - 1;
732#endif
733 n = nnext;
734 }
735 }
736
737 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
738/*
739 Two problematic code points in Unicode casefolding of EXACT nodes:
740
741 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
742 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
743
744 which casefold to
745
746 Unicode UTF-8
747
748 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
749 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
750
751 This means that in case-insensitive matching (or "loose matching",
752 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
753 length of the above casefolded versions) can match a target string
754 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
755 This would rather mess up the minimum length computation.
756
757 What we'll do is to look for the tail four bytes, and then peek
758 at the preceding two bytes to see whether we need to decrease
759 the minimum length by four (six minus two).
760
761 Thanks to the design of UTF-8, there cannot be false matches:
762 A sequence of valid UTF-8 bytes cannot be a subsequence of
763 another valid sequence of UTF-8 bytes.
764
765*/
766 char *s0 = STRING(scan), *s, *t;
767 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
768 char *t0 = "\xcc\x88\xcc\x81";
769 char *t1 = t0 + 3;
770
771 for (s = s0 + 2;
772 s < s2 && (t = ninstr(s, s1, t0, t1));
773 s = t + 4) {
774 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
775 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
776 min -= 4;
777 }
778 }
779
780#ifdef DEBUGGING
781 /* Allow dumping */
782 n = scan + NODE_SZ_STR(scan);
783 while (n <= stop) {
784 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
785 OP(n) = OPTIMIZED;
786 NEXT_OFF(n) = 0;
787 }
788 n++;
789 }
790#endif
791 }
792 /* Follow the next-chain of the current node and optimize
793 away all the NOTHINGs from it. */
794 if (OP(scan) != CURLYX) {
795 int max = (reg_off_by_arg[OP(scan)]
796 ? I32_MAX
797 /* I32 may be smaller than U16 on CRAYs! */
798 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
799 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
800 int noff;
801 regnode *n = scan;
802
803 /* Skip NOTHING and LONGJMP. */
804 while ((n = regnext(n))
805 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
806 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
807 && off + noff < max)
808 off += noff;
809 if (reg_off_by_arg[OP(scan)])
810 ARG(scan) = off;
811 else
812 NEXT_OFF(scan) = off;
813 }
814 /* The principal pseudo-switch. Cannot be a switch, since we
815 look into several different things. */
816 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
817 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
818 next = regnext(scan);
819 code = OP(scan);
820
821 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
822 I32 max1 = 0, min1 = I32_MAX, num = 0;
823 struct regnode_charclass_class accum;
824
825 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
826 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
827 if (flags & SCF_DO_STCLASS)
828 cl_init_zero(pRExC_state, &accum);
829 while (OP(scan) == code) {
830 I32 deltanext, minnext, f = 0, fake;
831 struct regnode_charclass_class this_class;
832
833 num++;
834 data_fake.flags = 0;
835 if (data) {
836 data_fake.whilem_c = data->whilem_c;
837 data_fake.last_closep = data->last_closep;
838 }
839 else
840 data_fake.last_closep = &fake;
841 next = regnext(scan);
842 scan = NEXTOPER(scan);
843 if (code != BRANCH)
844 scan = NEXTOPER(scan);
845 if (flags & SCF_DO_STCLASS) {
846 cl_init(pRExC_state, &this_class);
847 data_fake.start_class = &this_class;
848 f = SCF_DO_STCLASS_AND;
849 }
850 if (flags & SCF_WHILEM_VISITED_POS)
851 f |= SCF_WHILEM_VISITED_POS;
852 /* we suppose the run is continuous, last=next...*/
853 minnext = study_chunk(pRExC_state, &scan, &deltanext,
854 next, &data_fake, f);
855 if (min1 > minnext)
856 min1 = minnext;
857 if (max1 < minnext + deltanext)
858 max1 = minnext + deltanext;
859 if (deltanext == I32_MAX)
860 is_inf = is_inf_internal = 1;
861 scan = next;
862 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
863 pars++;
864 if (data && (data_fake.flags & SF_HAS_EVAL))
865 data->flags |= SF_HAS_EVAL;
866 if (data)
867 data->whilem_c = data_fake.whilem_c;
868 if (flags & SCF_DO_STCLASS)
869 cl_or(pRExC_state, &accum, &this_class);
870 if (code == SUSPEND)
871 break;
872 }
873 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
874 min1 = 0;
875 if (flags & SCF_DO_SUBSTR) {
876 data->pos_min += min1;
877 data->pos_delta += max1 - min1;
878 if (max1 != min1 || is_inf)
879 data->longest = &(data->longest_float);
880 }
881 min += min1;
882 delta += max1 - min1;
883 if (flags & SCF_DO_STCLASS_OR) {
884 cl_or(pRExC_state, data->start_class, &accum);
885 if (min1) {
886 cl_and(data->start_class, &and_with);
887 flags &= ~SCF_DO_STCLASS;
888 }
889 }
890 else if (flags & SCF_DO_STCLASS_AND) {
891 if (min1) {
892 cl_and(data->start_class, &accum);
893 flags &= ~SCF_DO_STCLASS;
894 }
895 else {
896 /* Switch to OR mode: cache the old value of
897 * data->start_class */
898 StructCopy(data->start_class, &and_with,
899 struct regnode_charclass_class);
900 flags &= ~SCF_DO_STCLASS_AND;
901 StructCopy(&accum, data->start_class,
902 struct regnode_charclass_class);
903 flags |= SCF_DO_STCLASS_OR;
904 data->start_class->flags |= ANYOF_EOS;
905 }
906 }
907 }
908 else if (code == BRANCHJ) /* single branch is optimized. */
909 scan = NEXTOPER(NEXTOPER(scan));
910 else /* single branch is optimized. */
911 scan = NEXTOPER(scan);
912 continue;
913 }
914 else if (OP(scan) == EXACT) {
915 I32 l = STR_LEN(scan);
916 UV uc = *((U8*)STRING(scan));
917 if (UTF) {
918 U8 *s = (U8*)STRING(scan);
919 l = utf8_length(s, s + l);
920 uc = utf8_to_uvchr(s, NULL);
921 }
922 min += l;
923 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
924 /* The code below prefers earlier match for fixed
925 offset, later match for variable offset. */
926 if (data->last_end == -1) { /* Update the start info. */
927 data->last_start_min = data->pos_min;
928 data->last_start_max = is_inf
929 ? I32_MAX : data->pos_min + data->pos_delta;
930 }
931 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
932 {
933 SV * sv = data->last_found;
934 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
935 mg_find(sv, PERL_MAGIC_utf8) : NULL;
936 if (mg && mg->mg_len >= 0)
937 mg->mg_len += utf8_length((U8*)STRING(scan),
938 (U8*)STRING(scan)+STR_LEN(scan));
939 }
940 if (UTF)
941 SvUTF8_on(data->last_found);
942 data->last_end = data->pos_min + l;
943 data->pos_min += l; /* As in the first entry. */
944 data->flags &= ~SF_BEFORE_EOL;
945 }
946 if (flags & SCF_DO_STCLASS_AND) {
947 /* Check whether it is compatible with what we know already! */
948 int compat = 1;
949
950 if (uc >= 0x100 ||
951 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
952 && !ANYOF_BITMAP_TEST(data->start_class, uc)
953 && (!(data->start_class->flags & ANYOF_FOLD)
954 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
955 )
956 compat = 0;
957 ANYOF_CLASS_ZERO(data->start_class);
958 ANYOF_BITMAP_ZERO(data->start_class);
959 if (compat)
960 ANYOF_BITMAP_SET(data->start_class, uc);
961 data->start_class->flags &= ~ANYOF_EOS;
962 if (uc < 0x100)
963 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
964 }
965 else if (flags & SCF_DO_STCLASS_OR) {
966 /* false positive possible if the class is case-folded */
967 if (uc < 0x100)
968 ANYOF_BITMAP_SET(data->start_class, uc);
969 else
970 data->start_class->flags |= ANYOF_UNICODE_ALL;
971 data->start_class->flags &= ~ANYOF_EOS;
972 cl_and(data->start_class, &and_with);
973 }
974 flags &= ~SCF_DO_STCLASS;
975 }
976 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
977 I32 l = STR_LEN(scan);
978 UV uc = *((U8*)STRING(scan));
979
980 /* Search for fixed substrings supports EXACT only. */
981 if (flags & SCF_DO_SUBSTR)
982 scan_commit(pRExC_state, data);
983 if (UTF) {
984 U8 *s = (U8 *)STRING(scan);
985 l = utf8_length(s, s + l);
986 uc = utf8_to_uvchr(s, NULL);
987 }
988 min += l;
989 if (data && (flags & SCF_DO_SUBSTR))
990 data->pos_min += l;
991 if (flags & SCF_DO_STCLASS_AND) {
992 /* Check whether it is compatible with what we know already! */
993 int compat = 1;
994
995 if (uc >= 0x100 ||
996 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
997 && !ANYOF_BITMAP_TEST(data->start_class, uc)
998 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
999 compat = 0;
1000 ANYOF_CLASS_ZERO(data->start_class);
1001 ANYOF_BITMAP_ZERO(data->start_class);
1002 if (compat) {
1003 ANYOF_BITMAP_SET(data->start_class, uc);
1004 data->start_class->flags &= ~ANYOF_EOS;
1005 data->start_class->flags |= ANYOF_FOLD;
1006 if (OP(scan) == EXACTFL)
1007 data->start_class->flags |= ANYOF_LOCALE;
1008 }
1009 }
1010 else if (flags & SCF_DO_STCLASS_OR) {
1011 if (data->start_class->flags & ANYOF_FOLD) {
1012 /* false positive possible if the class is case-folded.
1013 Assume that the locale settings are the same... */
1014 if (uc < 0x100)
1015 ANYOF_BITMAP_SET(data->start_class, uc);
1016 data->start_class->flags &= ~ANYOF_EOS;
1017 }
1018 cl_and(data->start_class, &and_with);
1019 }
1020 flags &= ~SCF_DO_STCLASS;
1021 }
1022 else if (strchr((char*)PL_varies,OP(scan))) {
1023 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1024 I32 f = flags, pos_before = 0;
1025 regnode *oscan = scan;
1026 struct regnode_charclass_class this_class;
1027 struct regnode_charclass_class *oclass = NULL;
1028 I32 next_is_eval = 0;
1029
1030 switch (PL_regkind[(U8)OP(scan)]) {
1031 case WHILEM: /* End of (?:...)* . */
1032 scan = NEXTOPER(scan);
1033 goto finish;
1034 case PLUS:
1035 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1036 next = NEXTOPER(scan);
1037 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1038 mincount = 1;
1039 maxcount = REG_INFTY;
1040 next = regnext(scan);
1041 scan = NEXTOPER(scan);
1042 goto do_curly;
1043 }
1044 }
1045 if (flags & SCF_DO_SUBSTR)
1046 data->pos_min++;
1047 min++;
1048 /* Fall through. */
1049 case STAR:
1050 if (flags & SCF_DO_STCLASS) {
1051 mincount = 0;
1052 maxcount = REG_INFTY;
1053 next = regnext(scan);
1054 scan = NEXTOPER(scan);
1055 goto do_curly;
1056 }
1057 is_inf = is_inf_internal = 1;
1058 scan = regnext(scan);
1059 if (flags & SCF_DO_SUBSTR) {
1060 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1061 data->longest = &(data->longest_float);
1062 }
1063 goto optimize_curly_tail;
1064 case CURLY:
1065 mincount = ARG1(scan);
1066 maxcount = ARG2(scan);
1067 next = regnext(scan);
1068 if (OP(scan) == CURLYX) {
1069 I32 lp = (data ? *(data->last_closep) : 0);
1070
1071 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1072 }
1073 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1074 next_is_eval = (OP(scan) == EVAL);
1075 do_curly:
1076 if (flags & SCF_DO_SUBSTR) {
1077 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1078 pos_before = data->pos_min;
1079 }
1080 if (data) {
1081 fl = data->flags;
1082 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1083 if (is_inf)
1084 data->flags |= SF_IS_INF;
1085 }
1086 if (flags & SCF_DO_STCLASS) {
1087 cl_init(pRExC_state, &this_class);
1088 oclass = data->start_class;
1089 data->start_class = &this_class;
1090 f |= SCF_DO_STCLASS_AND;
1091 f &= ~SCF_DO_STCLASS_OR;
1092 }
1093 /* These are the cases when once a subexpression
1094 fails at a particular position, it cannot succeed
1095 even after backtracking at the enclosing scope.
1096
1097 XXXX what if minimal match and we are at the
1098 initial run of {n,m}? */
1099 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1100 f &= ~SCF_WHILEM_VISITED_POS;
1101
1102 /* This will finish on WHILEM, setting scan, or on NULL: */
1103 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1104 mincount == 0
1105 ? (f & ~SCF_DO_SUBSTR) : f);
1106
1107 if (flags & SCF_DO_STCLASS)
1108 data->start_class = oclass;
1109 if (mincount == 0 || minnext == 0) {
1110 if (flags & SCF_DO_STCLASS_OR) {
1111 cl_or(pRExC_state, data->start_class, &this_class);
1112 }
1113 else if (flags & SCF_DO_STCLASS_AND) {
1114 /* Switch to OR mode: cache the old value of
1115 * data->start_class */
1116 StructCopy(data->start_class, &and_with,
1117 struct regnode_charclass_class);
1118 flags &= ~SCF_DO_STCLASS_AND;
1119 StructCopy(&this_class, data->start_class,
1120 struct regnode_charclass_class);
1121 flags |= SCF_DO_STCLASS_OR;
1122 data->start_class->flags |= ANYOF_EOS;
1123 }
1124 } else { /* Non-zero len */
1125 if (flags & SCF_DO_STCLASS_OR) {
1126 cl_or(pRExC_state, data->start_class, &this_class);
1127 cl_and(data->start_class, &and_with);
1128 }
1129 else if (flags & SCF_DO_STCLASS_AND)
1130 cl_and(data->start_class, &this_class);
1131 flags &= ~SCF_DO_STCLASS;
1132 }
1133 if (!scan) /* It was not CURLYX, but CURLY. */
1134 scan = next;
1135 if (ckWARN(WARN_REGEXP)
1136 /* ? quantifier ok, except for (?{ ... }) */
1137 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1138 && (minnext == 0) && (deltanext == 0)
1139 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1140 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1141 {
1142 vWARN(RExC_parse,
1143 "Quantifier unexpected on zero-length expression");
1144 }
1145
1146 min += minnext * mincount;
1147 is_inf_internal |= ((maxcount == REG_INFTY
1148 && (minnext + deltanext) > 0)
1149 || deltanext == I32_MAX);
1150 is_inf |= is_inf_internal;
1151 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1152
1153 /* Try powerful optimization CURLYX => CURLYN. */
1154 if ( OP(oscan) == CURLYX && data
1155 && data->flags & SF_IN_PAR
1156 && !(data->flags & SF_HAS_EVAL)
1157 && !deltanext && minnext == 1 ) {
1158 /* Try to optimize to CURLYN. */
1159 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1160 regnode *nxt1 = nxt;
1161#ifdef DEBUGGING
1162 regnode *nxt2;
1163#endif
1164
1165 /* Skip open. */
1166 nxt = regnext(nxt);
1167 if (!strchr((char*)PL_simple,OP(nxt))
1168 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1169 && STR_LEN(nxt) == 1))
1170 goto nogo;
1171#ifdef DEBUGGING
1172 nxt2 = nxt;
1173#endif
1174 nxt = regnext(nxt);
1175 if (OP(nxt) != CLOSE)
1176 goto nogo;
1177 /* Now we know that nxt2 is the only contents: */
1178 oscan->flags = (U8)ARG(nxt);
1179 OP(oscan) = CURLYN;
1180 OP(nxt1) = NOTHING; /* was OPEN. */
1181#ifdef DEBUGGING
1182 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1183 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1184 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1185 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1186 OP(nxt + 1) = OPTIMIZED; /* was count. */
1187 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1188#endif
1189 }
1190 nogo:
1191
1192 /* Try optimization CURLYX => CURLYM. */
1193 if ( OP(oscan) == CURLYX && data
1194 && !(data->flags & SF_HAS_PAR)
1195 && !(data->flags & SF_HAS_EVAL)
1196 && !deltanext /* atom is fixed width */
1197 && minnext != 0 /* CURLYM can't handle zero width */
1198 ) {
1199 /* XXXX How to optimize if data == 0? */
1200 /* Optimize to a simpler form. */
1201 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1202 regnode *nxt2;
1203
1204 OP(oscan) = CURLYM;
1205 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1206 && (OP(nxt2) != WHILEM))
1207 nxt = nxt2;
1208 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1209 /* Need to optimize away parenths. */
1210 if (data->flags & SF_IN_PAR) {
1211 /* Set the parenth number. */
1212 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1213
1214 if (OP(nxt) != CLOSE)
1215 FAIL("Panic opt close");
1216 oscan->flags = (U8)ARG(nxt);
1217 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1218 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1219#ifdef DEBUGGING
1220 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1221 OP(nxt + 1) = OPTIMIZED; /* was count. */
1222 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1223 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1224#endif
1225#if 0
1226 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1227 regnode *nnxt = regnext(nxt1);
1228
1229 if (nnxt == nxt) {
1230 if (reg_off_by_arg[OP(nxt1)])
1231 ARG_SET(nxt1, nxt2 - nxt1);
1232 else if (nxt2 - nxt1 < U16_MAX)
1233 NEXT_OFF(nxt1) = nxt2 - nxt1;
1234 else
1235 OP(nxt) = NOTHING; /* Cannot beautify */
1236 }
1237 nxt1 = nnxt;
1238 }
1239#endif
1240 /* Optimize again: */
1241 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1242 NULL, 0);
1243 }
1244 else
1245 oscan->flags = 0;
1246 }
1247 else if ((OP(oscan) == CURLYX)
1248 && (flags & SCF_WHILEM_VISITED_POS)
1249 /* See the comment on a similar expression above.
1250 However, this time it not a subexpression
1251 we care about, but the expression itself. */
1252 && (maxcount == REG_INFTY)
1253 && data && ++data->whilem_c < 16) {
1254 /* This stays as CURLYX, we can put the count/of pair. */
1255 /* Find WHILEM (as in regexec.c) */
1256 regnode *nxt = oscan + NEXT_OFF(oscan);
1257
1258 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1259 nxt += ARG(nxt);
1260 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1261 | (RExC_whilem_seen << 4)); /* On WHILEM */
1262 }
1263 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1264 pars++;
1265 if (flags & SCF_DO_SUBSTR) {
1266 SV *last_str = Nullsv;
1267 int counted = mincount != 0;
1268
1269 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1270#if defined(SPARC64_GCC_WORKAROUND)
1271 I32 b = 0;
1272 STRLEN l = 0;
1273 char *s = NULL;
1274 I32 old = 0;
1275
1276 if (pos_before >= data->last_start_min)
1277 b = pos_before;
1278 else
1279 b = data->last_start_min;
1280
1281 l = 0;
1282 s = SvPV(data->last_found, l);
1283 old = b - data->last_start_min;
1284
1285#else
1286 I32 b = pos_before >= data->last_start_min
1287 ? pos_before : data->last_start_min;
1288 STRLEN l;
1289 char *s = SvPV(data->last_found, l);
1290 I32 old = b - data->last_start_min;
1291#endif
1292
1293 if (UTF)
1294 old = utf8_hop((U8*)s, old) - (U8*)s;
1295
1296 l -= old;
1297 /* Get the added string: */
1298 last_str = newSVpvn(s + old, l);
1299 if (UTF)
1300 SvUTF8_on(last_str);
1301 if (deltanext == 0 && pos_before == b) {
1302 /* What was added is a constant string */
1303 if (mincount > 1) {
1304 SvGROW(last_str, (mincount * l) + 1);
1305 repeatcpy(SvPVX(last_str) + l,
1306 SvPVX(last_str), l, mincount - 1);
1307 SvCUR(last_str) *= mincount;
1308 /* Add additional parts. */
1309 SvCUR_set(data->last_found,
1310 SvCUR(data->last_found) - l);
1311 sv_catsv(data->last_found, last_str);
1312 {
1313 SV * sv = data->last_found;
1314 MAGIC *mg =
1315 SvUTF8(sv) && SvMAGICAL(sv) ?
1316 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1317 if (mg && mg->mg_len >= 0)
1318 mg->mg_len += CHR_SVLEN(last_str);
1319 }
1320 data->last_end += l * (mincount - 1);
1321 }
1322 } else {
1323 /* start offset must point into the last copy */
1324 data->last_start_min += minnext * (mincount - 1);
1325 data->last_start_max += is_inf ? I32_MAX
1326 : (maxcount - 1) * (minnext + data->pos_delta);
1327 }
1328 }
1329 /* It is counted once already... */
1330 data->pos_min += minnext * (mincount - counted);
1331 data->pos_delta += - counted * deltanext +
1332 (minnext + deltanext) * maxcount - minnext * mincount;
1333 if (mincount != maxcount) {
1334 /* Cannot extend fixed substrings found inside
1335 the group. */
1336 scan_commit(pRExC_state,data);
1337 if (mincount && last_str) {
1338 sv_setsv(data->last_found, last_str);
1339 data->last_end = data->pos_min;
1340 data->last_start_min =
1341 data->pos_min - CHR_SVLEN(last_str);
1342 data->last_start_max = is_inf
1343 ? I32_MAX
1344 : data->pos_min + data->pos_delta
1345 - CHR_SVLEN(last_str);
1346 }
1347 data->longest = &(data->longest_float);
1348 }
1349 SvREFCNT_dec(last_str);
1350 }
1351 if (data && (fl & SF_HAS_EVAL))
1352 data->flags |= SF_HAS_EVAL;
1353 optimize_curly_tail:
1354 if (OP(oscan) != CURLYX) {
1355 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1356 && NEXT_OFF(next))
1357 NEXT_OFF(oscan) += NEXT_OFF(next);
1358 }
1359 continue;
1360 default: /* REF and CLUMP only? */
1361 if (flags & SCF_DO_SUBSTR) {
1362 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1363 data->longest = &(data->longest_float);
1364 }
1365 is_inf = is_inf_internal = 1;
1366 if (flags & SCF_DO_STCLASS_OR)
1367 cl_anything(pRExC_state, data->start_class);
1368 flags &= ~SCF_DO_STCLASS;
1369 break;
1370 }
1371 }
1372 else if (strchr((char*)PL_simple,OP(scan))) {
1373 int value = 0;
1374
1375 if (flags & SCF_DO_SUBSTR) {
1376 scan_commit(pRExC_state,data);
1377 data->pos_min++;
1378 }
1379 min++;
1380 if (flags & SCF_DO_STCLASS) {
1381 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1382
1383 /* Some of the logic below assumes that switching
1384 locale on will only add false positives. */
1385 switch (PL_regkind[(U8)OP(scan)]) {
1386 case SANY:
1387 default:
1388 do_default:
1389 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1390 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1391 cl_anything(pRExC_state, data->start_class);
1392 break;
1393 case REG_ANY:
1394 if (OP(scan) == SANY)
1395 goto do_default;
1396 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1397 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1398 || (data->start_class->flags & ANYOF_CLASS));
1399 cl_anything(pRExC_state, data->start_class);
1400 }
1401 if (flags & SCF_DO_STCLASS_AND || !value)
1402 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1403 break;
1404 case ANYOF:
1405 if (flags & SCF_DO_STCLASS_AND)
1406 cl_and(data->start_class,
1407 (struct regnode_charclass_class*)scan);
1408 else
1409 cl_or(pRExC_state, data->start_class,
1410 (struct regnode_charclass_class*)scan);
1411 break;
1412 case ALNUM:
1413 if (flags & SCF_DO_STCLASS_AND) {
1414 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1416 for (value = 0; value < 256; value++)
1417 if (!isALNUM(value))
1418 ANYOF_BITMAP_CLEAR(data->start_class, value);
1419 }
1420 }
1421 else {
1422 if (data->start_class->flags & ANYOF_LOCALE)
1423 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1424 else {
1425 for (value = 0; value < 256; value++)
1426 if (isALNUM(value))
1427 ANYOF_BITMAP_SET(data->start_class, value);
1428 }
1429 }
1430 break;
1431 case ALNUML:
1432 if (flags & SCF_DO_STCLASS_AND) {
1433 if (data->start_class->flags & ANYOF_LOCALE)
1434 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1435 }
1436 else {
1437 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1438 data->start_class->flags |= ANYOF_LOCALE;
1439 }
1440 break;
1441 case NALNUM:
1442 if (flags & SCF_DO_STCLASS_AND) {
1443 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1444 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1445 for (value = 0; value < 256; value++)
1446 if (isALNUM(value))
1447 ANYOF_BITMAP_CLEAR(data->start_class, value);
1448 }
1449 }
1450 else {
1451 if (data->start_class->flags & ANYOF_LOCALE)
1452 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1453 else {
1454 for (value = 0; value < 256; value++)
1455 if (!isALNUM(value))
1456 ANYOF_BITMAP_SET(data->start_class, value);
1457 }
1458 }
1459 break;
1460 case NALNUML:
1461 if (flags & SCF_DO_STCLASS_AND) {
1462 if (data->start_class->flags & ANYOF_LOCALE)
1463 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1464 }
1465 else {
1466 data->start_class->flags |= ANYOF_LOCALE;
1467 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1468 }
1469 break;
1470 case SPACE:
1471 if (flags & SCF_DO_STCLASS_AND) {
1472 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1473 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1474 for (value = 0; value < 256; value++)
1475 if (!isSPACE(value))
1476 ANYOF_BITMAP_CLEAR(data->start_class, value);
1477 }
1478 }
1479 else {
1480 if (data->start_class->flags & ANYOF_LOCALE)
1481 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1482 else {
1483 for (value = 0; value < 256; value++)
1484 if (isSPACE(value))
1485 ANYOF_BITMAP_SET(data->start_class, value);
1486 }
1487 }
1488 break;
1489 case SPACEL:
1490 if (flags & SCF_DO_STCLASS_AND) {
1491 if (data->start_class->flags & ANYOF_LOCALE)
1492 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1493 }
1494 else {
1495 data->start_class->flags |= ANYOF_LOCALE;
1496 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1497 }
1498 break;
1499 case NSPACE:
1500 if (flags & SCF_DO_STCLASS_AND) {
1501 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1502 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1503 for (value = 0; value < 256; value++)
1504 if (isSPACE(value))
1505 ANYOF_BITMAP_CLEAR(data->start_class, value);
1506 }
1507 }
1508 else {
1509 if (data->start_class->flags & ANYOF_LOCALE)
1510 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1511 else {
1512 for (value = 0; value < 256; value++)
1513 if (!isSPACE(value))
1514 ANYOF_BITMAP_SET(data->start_class, value);
1515 }
1516 }
1517 break;
1518 case NSPACEL:
1519 if (flags & SCF_DO_STCLASS_AND) {
1520 if (data->start_class->flags & ANYOF_LOCALE) {
1521 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1522 for (value = 0; value < 256; value++)
1523 if (!isSPACE(value))
1524 ANYOF_BITMAP_CLEAR(data->start_class, value);
1525 }
1526 }
1527 else {
1528 data->start_class->flags |= ANYOF_LOCALE;
1529 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1530 }
1531 break;
1532 case DIGIT:
1533 if (flags & SCF_DO_STCLASS_AND) {
1534 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1535 for (value = 0; value < 256; value++)
1536 if (!isDIGIT(value))
1537 ANYOF_BITMAP_CLEAR(data->start_class, value);
1538 }
1539 else {
1540 if (data->start_class->flags & ANYOF_LOCALE)
1541 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1542 else {
1543 for (value = 0; value < 256; value++)
1544 if (isDIGIT(value))
1545 ANYOF_BITMAP_SET(data->start_class, value);
1546 }
1547 }
1548 break;
1549 case NDIGIT:
1550 if (flags & SCF_DO_STCLASS_AND) {
1551 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1552 for (value = 0; value < 256; value++)
1553 if (isDIGIT(value))
1554 ANYOF_BITMAP_CLEAR(data->start_class, value);
1555 }
1556 else {
1557 if (data->start_class->flags & ANYOF_LOCALE)
1558 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1559 else {
1560 for (value = 0; value < 256; value++)
1561 if (!isDIGIT(value))
1562 ANYOF_BITMAP_SET(data->start_class, value);
1563 }
1564 }
1565 break;
1566 }
1567 if (flags & SCF_DO_STCLASS_OR)
1568 cl_and(data->start_class, &and_with);
1569 flags &= ~SCF_DO_STCLASS;
1570 }
1571 }
1572 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1573 data->flags |= (OP(scan) == MEOL
1574 ? SF_BEFORE_MEOL
1575 : SF_BEFORE_SEOL);
1576 }
1577 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1578 /* Lookbehind, or need to calculate parens/evals/stclass: */
1579 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1580 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1581 /* Lookahead/lookbehind */
1582 I32 deltanext, minnext, fake = 0;
1583 regnode *nscan;
1584 struct regnode_charclass_class intrnl;
1585 int f = 0;
1586
1587 data_fake.flags = 0;
1588 if (data) {
1589 data_fake.whilem_c = data->whilem_c;
1590 data_fake.last_closep = data->last_closep;
1591 }
1592 else
1593 data_fake.last_closep = &fake;
1594 if ( flags & SCF_DO_STCLASS && !scan->flags
1595 && OP(scan) == IFMATCH ) { /* Lookahead */
1596 cl_init(pRExC_state, &intrnl);
1597 data_fake.start_class = &intrnl;
1598 f |= SCF_DO_STCLASS_AND;
1599 }
1600 if (flags & SCF_WHILEM_VISITED_POS)
1601 f |= SCF_WHILEM_VISITED_POS;
1602 next = regnext(scan);
1603 nscan = NEXTOPER(NEXTOPER(scan));
1604 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1605 if (scan->flags) {
1606 if (deltanext) {
1607 vFAIL("Variable length lookbehind not implemented");
1608 }
1609 else if (minnext > U8_MAX) {
1610 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1611 }
1612 scan->flags = (U8)minnext;
1613 }
1614 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1615 pars++;
1616 if (data && (data_fake.flags & SF_HAS_EVAL))
1617 data->flags |= SF_HAS_EVAL;
1618 if (data)
1619 data->whilem_c = data_fake.whilem_c;
1620 if (f & SCF_DO_STCLASS_AND) {
1621 int was = (data->start_class->flags & ANYOF_EOS);
1622
1623 cl_and(data->start_class, &intrnl);
1624 if (was)
1625 data->start_class->flags |= ANYOF_EOS;
1626 }
1627 }
1628 else if (OP(scan) == OPEN) {
1629 pars++;
1630 }
1631 else if (OP(scan) == CLOSE) {
1632 if ((I32)ARG(scan) == is_par) {
1633 next = regnext(scan);
1634
1635 if ( next && (OP(next) != WHILEM) && next < last)
1636 is_par = 0; /* Disable optimization */
1637 }
1638 if (data)
1639 *(data->last_closep) = ARG(scan);
1640 }
1641 else if (OP(scan) == EVAL) {
1642 if (data)
1643 data->flags |= SF_HAS_EVAL;
1644 }
1645 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1646 if (flags & SCF_DO_SUBSTR) {
1647 scan_commit(pRExC_state,data);
1648 data->longest = &(data->longest_float);
1649 }
1650 is_inf = is_inf_internal = 1;
1651 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1652 cl_anything(pRExC_state, data->start_class);
1653 flags &= ~SCF_DO_STCLASS;
1654 }
1655 /* Else: zero-length, ignore. */
1656 scan = regnext(scan);
1657 }
1658
1659 finish:
1660 *scanp = scan;
1661 *deltap = is_inf_internal ? I32_MAX : delta;
1662 if (flags & SCF_DO_SUBSTR && is_inf)
1663 data->pos_delta = I32_MAX - data->pos_min;
1664 if (is_par > U8_MAX)
1665 is_par = 0;
1666 if (is_par && pars==1 && data) {
1667 data->flags |= SF_IN_PAR;
1668 data->flags &= ~SF_HAS_PAR;
1669 }
1670 else if (pars && data) {
1671 data->flags |= SF_HAS_PAR;
1672 data->flags &= ~SF_IN_PAR;
1673 }
1674 if (flags & SCF_DO_STCLASS_OR)
1675 cl_and(data->start_class, &and_with);
1676 return min;
1677}
1678
1679STATIC I32
1680S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1681{
1682 if (RExC_rx->data) {
1683 Renewc(RExC_rx->data,
1684 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1685 char, struct reg_data);
1686 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1687 RExC_rx->data->count += n;
1688 }
1689 else {
1690 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1691 char, struct reg_data);
1692 New(1208, RExC_rx->data->what, n, U8);
1693 RExC_rx->data->count = n;
1694 }
1695 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1696 return RExC_rx->data->count - n;
1697}
1698
1699void
1700Perl_reginitcolors(pTHX)
1701{
1702 int i = 0;
1703 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1704
1705 if (s) {
1706 PL_colors[0] = s = savepv(s);
1707 while (++i < 6) {
1708 s = strchr(s, '\t');
1709 if (s) {
1710 *s = '\0';
1711 PL_colors[i] = ++s;
1712 }
1713 else
1714 PL_colors[i] = s = "";
1715 }
1716 } else {
1717 while (i < 6)
1718 PL_colors[i++] = "";
1719 }
1720 PL_colorset = 1;
1721}
1722
1723
1724/*
1725 - pregcomp - compile a regular expression into internal code
1726 *
1727 * We can't allocate space until we know how big the compiled form will be,
1728 * but we can't compile it (and thus know how big it is) until we've got a
1729 * place to put the code. So we cheat: we compile it twice, once with code
1730 * generation turned off and size counting turned on, and once "for real".
1731 * This also means that we don't allocate space until we are sure that the
1732 * thing really will compile successfully, and we never have to move the
1733 * code and thus invalidate pointers into it. (Note that it has to be in
1734 * one piece because free() must be able to free it all.) [NB: not true in perl]
1735 *
1736 * Beware that the optimization-preparation code in here knows about some
1737 * of the structure of the compiled regexp. [I'll say.]
1738 */
1739regexp *
1740Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1741{
1742 register regexp *r;
1743 regnode *scan;
1744 regnode *first;
1745 I32 flags;
1746 I32 minlen = 0;
1747 I32 sawplus = 0;
1748 I32 sawopen = 0;
1749 scan_data_t data;
1750 RExC_state_t RExC_state;
1751 RExC_state_t *pRExC_state = &RExC_state;
1752
1753 if (exp == NULL)
1754 FAIL("NULL regexp argument");
1755
1756 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1757
1758 RExC_precomp = exp;
1759 DEBUG_r({
1760 if (!PL_colorset) reginitcolors();
1761 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1762 PL_colors[4],PL_colors[5],PL_colors[0],
1763 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1764 });
1765 RExC_flags = pm->op_pmflags;
1766 RExC_sawback = 0;
1767
1768 RExC_seen = 0;
1769 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1770 RExC_seen_evals = 0;
1771 RExC_extralen = 0;
1772
1773 /* First pass: determine size, legality. */
1774 RExC_parse = exp;
1775 RExC_start = exp;
1776 RExC_end = xend;
1777 RExC_naughty = 0;
1778 RExC_npar = 1;
1779 RExC_size = 0L;
1780 RExC_emit = &PL_regdummy;
1781 RExC_whilem_seen = 0;
1782#if 0 /* REGC() is (currently) a NOP at the first pass.
1783 * Clever compilers notice this and complain. --jhi */
1784 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1785#endif
1786 if (reg(pRExC_state, 0, &flags) == NULL) {
1787 RExC_precomp = Nullch;
1788 return(NULL);
1789 }
1790 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1791
1792 /* Small enough for pointer-storage convention?
1793 If extralen==0, this means that we will not need long jumps. */
1794 if (RExC_size >= 0x10000L && RExC_extralen)
1795 RExC_size += RExC_extralen;
1796 else
1797 RExC_extralen = 0;
1798 if (RExC_whilem_seen > 15)
1799 RExC_whilem_seen = 15;
1800
1801 /* Allocate space and initialize. */
1802 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1803 char, regexp);
1804 if (r == NULL)
1805 FAIL("Regexp out of space");
1806
1807#ifdef DEBUGGING
1808 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1809 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1810#endif
1811 r->refcnt = 1;
1812 r->prelen = xend - exp;
1813 r->precomp = savepvn(RExC_precomp, r->prelen);
1814 r->subbeg = NULL;
1815#ifdef PERL_COPY_ON_WRITE
1816 r->saved_copy = Nullsv;
1817#endif
1818 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1819 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1820
1821 r->substrs = 0; /* Useful during FAIL. */
1822 r->startp = 0; /* Useful during FAIL. */
1823 r->endp = 0; /* Useful during FAIL. */
1824
1825 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1826 if (r->offsets) {
1827 r->offsets[0] = RExC_size;
1828 }
1829 DEBUG_r(PerlIO_printf(Perl_debug_log,
1830 "%s %"UVuf" bytes for offset annotations.\n",
1831 r->offsets ? "Got" : "Couldn't get",
1832 (UV)((2*RExC_size+1) * sizeof(U32))));
1833
1834 RExC_rx = r;
1835
1836 /* Second pass: emit code. */
1837 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1838 RExC_parse = exp;
1839 RExC_end = xend;
1840 RExC_naughty = 0;
1841 RExC_npar = 1;
1842 RExC_emit_start = r->program;
1843 RExC_emit = r->program;
1844 /* Store the count of eval-groups for security checks: */
1845 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1846 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1847 r->data = 0;
1848 if (reg(pRExC_state, 0, &flags) == NULL)
1849 return(NULL);
1850
1851 /* Dig out information for optimizations. */
1852 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1853 pm->op_pmflags = RExC_flags;
1854 if (UTF)
1855 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1856 r->regstclass = NULL;
1857 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1858 r->reganch |= ROPT_NAUGHTY;
1859 scan = r->program + 1; /* First BRANCH. */
1860
1861 /* XXXX To minimize changes to RE engine we always allocate
1862 3-units-long substrs field. */
1863 Newz(1004, r->substrs, 1, struct reg_substr_data);
1864
1865 StructCopy(&zero_scan_data, &data, scan_data_t);
1866 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1867 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1868 I32 fake;
1869 STRLEN longest_float_length, longest_fixed_length;
1870 struct regnode_charclass_class ch_class;
1871 int stclass_flag;
1872 I32 last_close = 0;
1873
1874 first = scan;
1875 /* Skip introductions and multiplicators >= 1. */
1876 while ((OP(first) == OPEN && (sawopen = 1)) ||
1877 /* An OR of *one* alternative - should not happen now. */
1878 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1879 (OP(first) == PLUS) ||
1880 (OP(first) == MINMOD) ||
1881 /* An {n,m} with n>0 */
1882 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1883 if (OP(first) == PLUS)
1884 sawplus = 1;
1885 else
1886 first += regarglen[(U8)OP(first)];
1887 first = NEXTOPER(first);
1888 }
1889
1890 /* Starting-point info. */
1891 again:
1892 if (PL_regkind[(U8)OP(first)] == EXACT) {
1893 if (OP(first) == EXACT)
1894 ; /* Empty, get anchored substr later. */
1895 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1896 r->regstclass = first;
1897 }
1898 else if (strchr((char*)PL_simple,OP(first)))
1899 r->regstclass = first;
1900 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1901 PL_regkind[(U8)OP(first)] == NBOUND)
1902 r->regstclass = first;
1903 else if (PL_regkind[(U8)OP(first)] == BOL) {
1904 r->reganch |= (OP(first) == MBOL
1905 ? ROPT_ANCH_MBOL
1906 : (OP(first) == SBOL
1907 ? ROPT_ANCH_SBOL
1908 : ROPT_ANCH_BOL));
1909 first = NEXTOPER(first);
1910 goto again;
1911 }
1912 else if (OP(first) == GPOS) {
1913 r->reganch |= ROPT_ANCH_GPOS;
1914 first = NEXTOPER(first);
1915 goto again;
1916 }
1917 else if (!sawopen && (OP(first) == STAR &&
1918 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1919 !(r->reganch & ROPT_ANCH) )
1920 {
1921 /* turn .* into ^.* with an implied $*=1 */
1922 int type = OP(NEXTOPER(first));
1923
1924 if (type == REG_ANY)
1925 type = ROPT_ANCH_MBOL;
1926 else
1927 type = ROPT_ANCH_SBOL;
1928
1929 r->reganch |= type | ROPT_IMPLICIT;
1930 first = NEXTOPER(first);
1931 goto again;
1932 }
1933 if (sawplus && (!sawopen || !RExC_sawback)
1934 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1935 /* x+ must match at the 1st pos of run of x's */
1936 r->reganch |= ROPT_SKIP;
1937
1938 /* Scan is after the zeroth branch, first is atomic matcher. */
1939 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1940 (IV)(first - scan + 1)));
1941 /*
1942 * If there's something expensive in the r.e., find the
1943 * longest literal string that must appear and make it the
1944 * regmust. Resolve ties in favor of later strings, since
1945 * the regstart check works with the beginning of the r.e.
1946 * and avoiding duplication strengthens checking. Not a
1947 * strong reason, but sufficient in the absence of others.
1948 * [Now we resolve ties in favor of the earlier string if
1949 * it happens that c_offset_min has been invalidated, since the
1950 * earlier string may buy us something the later one won't.]
1951 */
1952 minlen = 0;
1953
1954 data.longest_fixed = newSVpvn("",0);
1955 data.longest_float = newSVpvn("",0);
1956 data.last_found = newSVpvn("",0);
1957 data.longest = &(data.longest_fixed);
1958 first = scan;
1959 if (!r->regstclass) {
1960 cl_init(pRExC_state, &ch_class);
1961 data.start_class = &ch_class;
1962 stclass_flag = SCF_DO_STCLASS_AND;
1963 } else /* XXXX Check for BOUND? */
1964 stclass_flag = 0;
1965 data.last_closep = &last_close;
1966
1967 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1968 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1969 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1970 && data.last_start_min == 0 && data.last_end > 0
1971 && !RExC_seen_zerolen
1972 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1973 r->reganch |= ROPT_CHECK_ALL;
1974 scan_commit(pRExC_state, &data);
1975 SvREFCNT_dec(data.last_found);
1976
1977 longest_float_length = CHR_SVLEN(data.longest_float);
1978 if (longest_float_length
1979 || (data.flags & SF_FL_BEFORE_EOL
1980 && (!(data.flags & SF_FL_BEFORE_MEOL)
1981 || (RExC_flags & PMf_MULTILINE)))) {
1982 int t;
1983
1984 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1985 && data.offset_fixed == data.offset_float_min
1986 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1987 goto remove_float; /* As in (a)+. */
1988
1989 if (SvUTF8(data.longest_float)) {
1990 r->float_utf8 = data.longest_float;
1991 r->float_substr = Nullsv;
1992 } else {
1993 r->float_substr = data.longest_float;
1994 r->float_utf8 = Nullsv;
1995 }
1996 r->float_min_offset = data.offset_float_min;
1997 r->float_max_offset = data.offset_float_max;
1998 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1999 && (!(data.flags & SF_FL_BEFORE_MEOL)
2000 || (RExC_flags & PMf_MULTILINE)));
2001 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
2002 }
2003 else {
2004 remove_float:
2005 r->float_substr = r->float_utf8 = Nullsv;
2006 SvREFCNT_dec(data.longest_float);
2007 longest_float_length = 0;
2008 }
2009
2010 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2011 if (longest_fixed_length
2012 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2013 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2014 || (RExC_flags & PMf_MULTILINE)))) {
2015 int t;
2016
2017 if (SvUTF8(data.longest_fixed)) {
2018 r->anchored_utf8 = data.longest_fixed;
2019 r->anchored_substr = Nullsv;
2020 } else {
2021 r->anchored_substr = data.longest_fixed;
2022 r->anchored_utf8 = Nullsv;
2023 }
2024 r->anchored_offset = data.offset_fixed;
2025 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2026 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2027 || (RExC_flags & PMf_MULTILINE)));
2028 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2029 }
2030 else {
2031 r->anchored_substr = r->anchored_utf8 = Nullsv;
2032 SvREFCNT_dec(data.longest_fixed);
2033 longest_fixed_length = 0;
2034 }
2035 if (r->regstclass
2036 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2037 r->regstclass = NULL;
2038 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2039 && stclass_flag
2040 && !(data.start_class->flags & ANYOF_EOS)
2041 && !cl_is_anything(data.start_class))
2042 {
2043 I32 n = add_data(pRExC_state, 1, "f");
2044
2045 New(1006, RExC_rx->data->data[n], 1,
2046 struct regnode_charclass_class);
2047 StructCopy(data.start_class,
2048 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2049 struct regnode_charclass_class);
2050 r->regstclass = (regnode*)RExC_rx->data->data[n];
2051 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2052 PL_regdata = r->data; /* for regprop() */
2053 DEBUG_r({ SV *sv = sv_newmortal();
2054 regprop(sv, (regnode*)data.start_class);
2055 PerlIO_printf(Perl_debug_log,
2056 "synthetic stclass `%s'.\n",
2057 SvPVX(sv));});
2058 }
2059
2060 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2061 if (longest_fixed_length > longest_float_length) {
2062 r->check_substr = r->anchored_substr;
2063 r->check_utf8 = r->anchored_utf8;
2064 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2065 if (r->reganch & ROPT_ANCH_SINGLE)
2066 r->reganch |= ROPT_NOSCAN;
2067 }
2068 else {
2069 r->check_substr = r->float_substr;
2070 r->check_utf8 = r->float_utf8;
2071 r->check_offset_min = data.offset_float_min;
2072 r->check_offset_max = data.offset_float_max;
2073 }
2074 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2075 This should be changed ASAP! */
2076 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2077 r->reganch |= RE_USE_INTUIT;
2078 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2079 r->reganch |= RE_INTUIT_TAIL;
2080 }
2081 }
2082 else {
2083 /* Several toplevels. Best we can is to set minlen. */
2084 I32 fake;
2085 struct regnode_charclass_class ch_class;
2086 I32 last_close = 0;
2087
2088 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2089 scan = r->program + 1;
2090 cl_init(pRExC_state, &ch_class);
2091 data.start_class = &ch_class;
2092 data.last_closep = &last_close;
2093 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2094 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2095 = r->float_substr = r->float_utf8 = Nullsv;
2096 if (!(data.start_class->flags & ANYOF_EOS)
2097 && !cl_is_anything(data.start_class))
2098 {
2099 I32 n = add_data(pRExC_state, 1, "f");
2100
2101 New(1006, RExC_rx->data->data[n], 1,
2102 struct regnode_charclass_class);
2103 StructCopy(data.start_class,
2104 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2105 struct regnode_charclass_class);
2106 r->regstclass = (regnode*)RExC_rx->data->data[n];
2107 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2108 DEBUG_r({ SV* sv = sv_newmortal();
2109 regprop(sv, (regnode*)data.start_class);
2110 PerlIO_printf(Perl_debug_log,
2111 "synthetic stclass `%s'.\n",
2112 SvPVX(sv));});
2113 }
2114 }
2115
2116 r->minlen = minlen;
2117 if (RExC_seen & REG_SEEN_GPOS)
2118 r->reganch |= ROPT_GPOS_SEEN;
2119 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2120 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2121 if (RExC_seen & REG_SEEN_EVAL)
2122 r->reganch |= ROPT_EVAL_SEEN;
2123 if (RExC_seen & REG_SEEN_CANY)
2124 r->reganch |= ROPT_CANY_SEEN;
2125 Newz(1002, r->startp, RExC_npar, I32);
2126 Newz(1002, r->endp, RExC_npar, I32);
2127 PL_regdata = r->data; /* for regprop() */
2128 DEBUG_r(regdump(r));
2129 return(r);
2130}
2131
2132/*
2133 - reg - regular expression, i.e. main body or parenthesized thing
2134 *
2135 * Caller must absorb opening parenthesis.
2136 *
2137 * Combining parenthesis handling with the base level of regular expression
2138 * is a trifle forced, but the need to tie the tails of the branches to what
2139 * follows makes it hard to avoid.
2140 */
2141STATIC regnode *
2142S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2143 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2144{
2145 register regnode *ret; /* Will be the head of the group. */
2146 register regnode *br;
2147 register regnode *lastbr;
2148 register regnode *ender = 0;
2149 register I32 parno = 0;
2150 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2151
2152 /* for (?g), (?gc), and (?o) warnings; warning
2153 about (?c) will warn about (?g) -- japhy */
2154
2155 I32 wastedflags = 0x00,
2156 wasted_o = 0x01,
2157 wasted_g = 0x02,
2158 wasted_gc = 0x02 | 0x04,
2159 wasted_c = 0x04;
2160
2161 char * parse_start = RExC_parse; /* MJD */
2162 char *oregcomp_parse = RExC_parse;
2163 char c;
2164
2165 *flagp = 0; /* Tentatively. */
2166
2167
2168 /* Make an OPEN node, if parenthesized. */
2169 if (paren) {
2170 if (*RExC_parse == '?') { /* (?...) */
2171 U32 posflags = 0, negflags = 0;
2172 U32 *flagsp = &posflags;
2173 int logical = 0;
2174 char *seqstart = RExC_parse;
2175
2176 RExC_parse++;
2177 paren = *RExC_parse++;
2178 ret = NULL; /* For look-ahead/behind. */
2179 switch (paren) {
2180 case '<': /* (?<...) */
2181 RExC_seen |= REG_SEEN_LOOKBEHIND;
2182 if (*RExC_parse == '!')
2183 paren = ',';
2184 if (*RExC_parse != '=' && *RExC_parse != '!')
2185 goto unknown;
2186 RExC_parse++;
2187 case '=': /* (?=...) */
2188 case '!': /* (?!...) */
2189 RExC_seen_zerolen++;
2190 case ':': /* (?:...) */
2191 case '>': /* (?>...) */
2192 break;
2193 case '$': /* (?$...) */
2194 case '@': /* (?@...) */
2195 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2196 break;
2197 case '#': /* (?#...) */
2198 while (*RExC_parse && *RExC_parse != ')')
2199 RExC_parse++;
2200 if (*RExC_parse != ')')
2201 FAIL("Sequence (?#... not terminated");
2202 nextchar(pRExC_state);
2203 *flagp = TRYAGAIN;
2204 return NULL;
2205 case 'p': /* (?p...) */
2206 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2207 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2208 /* FALL THROUGH*/
2209 case '?': /* (??...) */
2210 logical = 1;
2211 if (*RExC_parse != '{')
2212 goto unknown;
2213 paren = *RExC_parse++;
2214 /* FALL THROUGH */
2215 case '{': /* (?{...}) */
2216 {
2217 I32 count = 1, n = 0;
2218 char c;
2219 char *s = RExC_parse;
2220 SV *sv;
2221 OP_4tree *sop, *rop;
2222
2223 RExC_seen_zerolen++;
2224 RExC_seen |= REG_SEEN_EVAL;
2225 while (count && (c = *RExC_parse)) {
2226 if (c == '\\' && RExC_parse[1])
2227 RExC_parse++;
2228 else if (c == '{')
2229 count++;
2230 else if (c == '}')
2231 count--;
2232 RExC_parse++;
2233 }
2234 if (*RExC_parse != ')')
2235 {
2236 RExC_parse = s;
2237 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2238 }
2239 if (!SIZE_ONLY) {
2240 PAD *pad;
2241
2242 if (RExC_parse - 1 - s)
2243 sv = newSVpvn(s, RExC_parse - 1 - s);
2244 else
2245 sv = newSVpvn("", 0);
2246
2247 ENTER;
2248 Perl_save_re_context(aTHX);
2249 rop = sv_compile_2op(sv, &sop, "re", &pad);
2250 sop->op_private |= OPpREFCOUNTED;
2251 /* re_dup will OpREFCNT_inc */
2252 OpREFCNT_set(sop, 1);
2253 LEAVE;
2254
2255 n = add_data(pRExC_state, 3, "nop");
2256 RExC_rx->data->data[n] = (void*)rop;
2257 RExC_rx->data->data[n+1] = (void*)sop;
2258 RExC_rx->data->data[n+2] = (void*)pad;
2259 SvREFCNT_dec(sv);
2260 }
2261 else { /* First pass */
2262 if (PL_reginterp_cnt < ++RExC_seen_evals
2263 && IN_PERL_RUNTIME)
2264 /* No compiled RE interpolated, has runtime
2265 components ===> unsafe. */
2266 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2267 if (PL_tainting && PL_tainted)
2268 FAIL("Eval-group in insecure regular expression");
2269 if (IN_PERL_COMPILETIME)
2270 PL_cv_has_eval = 1;
2271 }
2272
2273 nextchar(pRExC_state);
2274 if (logical) {
2275 ret = reg_node(pRExC_state, LOGICAL);
2276 if (!SIZE_ONLY)
2277 ret->flags = 2;
2278 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2279 /* deal with the length of this later - MJD */
2280 return ret;
2281 }
2282 ret = reganode(pRExC_state, EVAL, n);
2283 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2284 Set_Node_Offset(ret, parse_start);
2285 return ret;
2286 }
2287 case '(': /* (?(?{...})...) and (?(?=...)...) */
2288 {
2289 if (RExC_parse[0] == '?') { /* (?(?...)) */
2290 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2291 || RExC_parse[1] == '<'
2292 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2293 I32 flag;
2294
2295 ret = reg_node(pRExC_state, LOGICAL);
2296 if (!SIZE_ONLY)
2297 ret->flags = 1;
2298 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2299 goto insert_if;
2300 }
2301 }
2302 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2303 /* (?(1)...) */
2304 parno = atoi(RExC_parse++);
2305
2306 while (isDIGIT(*RExC_parse))
2307 RExC_parse++;
2308 ret = reganode(pRExC_state, GROUPP, parno);
2309
2310 if ((c = *nextchar(pRExC_state)) != ')')
2311 vFAIL("Switch condition not recognized");
2312 insert_if:
2313 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2314 br = regbranch(pRExC_state, &flags, 1);
2315 if (br == NULL)
2316 br = reganode(pRExC_state, LONGJMP, 0);
2317 else
2318 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2319 c = *nextchar(pRExC_state);
2320 if (flags&HASWIDTH)
2321 *flagp |= HASWIDTH;
2322 if (c == '|') {
2323 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2324 regbranch(pRExC_state, &flags, 1);
2325 regtail(pRExC_state, ret, lastbr);
2326 if (flags&HASWIDTH)
2327 *flagp |= HASWIDTH;
2328 c = *nextchar(pRExC_state);
2329 }
2330 else
2331 lastbr = NULL;
2332 if (c != ')')
2333 vFAIL("Switch (?(condition)... contains too many branches");
2334 ender = reg_node(pRExC_state, TAIL);
2335 regtail(pRExC_state, br, ender);
2336 if (lastbr) {
2337 regtail(pRExC_state, lastbr, ender);
2338 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2339 }
2340 else
2341 regtail(pRExC_state, ret, ender);
2342 return ret;
2343 }
2344 else {
2345 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2346 }
2347 }
2348 case 0:
2349 RExC_parse--; /* for vFAIL to print correctly */
2350 vFAIL("Sequence (? incomplete");
2351 break;
2352 default:
2353 --RExC_parse;
2354 parse_flags: /* (?i) */
2355 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2356 /* (?g), (?gc) and (?o) are useless here
2357 and must be globally applied -- japhy */
2358
2359 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2360 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2361 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2362 if (! (wastedflags & wflagbit) ) {
2363 wastedflags |= wflagbit;
2364 vWARN5(
2365 RExC_parse + 1,
2366 "Useless (%s%c) - %suse /%c modifier",
2367 flagsp == &negflags ? "?-" : "?",
2368 *RExC_parse,
2369 flagsp == &negflags ? "don't " : "",
2370 *RExC_parse
2371 );
2372 }
2373 }
2374 }
2375 else if (*RExC_parse == 'c') {
2376 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2377 if (! (wastedflags & wasted_c) ) {
2378 wastedflags |= wasted_gc;
2379 vWARN3(
2380 RExC_parse + 1,
2381 "Useless (%sc) - %suse /gc modifier",
2382 flagsp == &negflags ? "?-" : "?",
2383 flagsp == &negflags ? "don't " : ""
2384 );
2385 }
2386 }
2387 }
2388 else { pmflag(flagsp, *RExC_parse); }
2389
2390 ++RExC_parse;
2391 }
2392 if (*RExC_parse == '-') {
2393 flagsp = &negflags;
2394 wastedflags = 0; /* reset so (?g-c) warns twice */
2395 ++RExC_parse;
2396 goto parse_flags;
2397 }
2398 RExC_flags |= posflags;
2399 RExC_flags &= ~negflags;
2400 if (*RExC_parse == ':') {
2401 RExC_parse++;
2402 paren = ':';
2403 break;
2404 }
2405 unknown:
2406 if (*RExC_parse != ')') {
2407 RExC_parse++;
2408 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2409 }
2410 nextchar(pRExC_state);
2411 *flagp = TRYAGAIN;
2412 return NULL;
2413 }
2414 }
2415 else { /* (...) */
2416 parno = RExC_npar;
2417 RExC_npar++;
2418 ret = reganode(pRExC_state, OPEN, parno);
2419 Set_Node_Length(ret, 1); /* MJD */
2420 Set_Node_Offset(ret, RExC_parse); /* MJD */
2421 open = 1;
2422 }
2423 }
2424 else /* ! paren */
2425 ret = NULL;
2426
2427 /* Pick up the branches, linking them together. */
2428 parse_start = RExC_parse; /* MJD */
2429 br = regbranch(pRExC_state, &flags, 1);
2430 /* branch_len = (paren != 0); */
2431
2432 if (br == NULL)
2433 return(NULL);
2434 if (*RExC_parse == '|') {
2435 if (!SIZE_ONLY && RExC_extralen) {
2436 reginsert(pRExC_state, BRANCHJ, br);
2437 }
2438 else { /* MJD */
2439 reginsert(pRExC_state, BRANCH, br);
2440 Set_Node_Length(br, paren != 0);
2441 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2442 }
2443 have_branch = 1;
2444 if (SIZE_ONLY)
2445 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2446 }
2447 else if (paren == ':') {
2448 *flagp |= flags&SIMPLE;
2449 }
2450 if (open) { /* Starts with OPEN. */
2451 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2452 }
2453 else if (paren != '?') /* Not Conditional */
2454 ret = br;
2455 *flagp |= flags & (SPSTART | HASWIDTH);
2456 lastbr = br;
2457 while (*RExC_parse == '|') {
2458 if (!SIZE_ONLY && RExC_extralen) {
2459 ender = reganode(pRExC_state, LONGJMP,0);
2460 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2461 }
2462 if (SIZE_ONLY)
2463 RExC_extralen += 2; /* Account for LONGJMP. */
2464 nextchar(pRExC_state);
2465 br = regbranch(pRExC_state, &flags, 0);
2466
2467 if (br == NULL)
2468 return(NULL);
2469 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2470 lastbr = br;
2471 if (flags&HASWIDTH)
2472 *flagp |= HASWIDTH;
2473 *flagp |= flags&SPSTART;
2474 }
2475
2476 if (have_branch || paren != ':') {
2477 /* Make a closing node, and hook it on the end. */
2478 switch (paren) {
2479 case ':':
2480 ender = reg_node(pRExC_state, TAIL);
2481 break;
2482 case 1:
2483 ender = reganode(pRExC_state, CLOSE, parno);
2484 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2485 Set_Node_Length(ender,1); /* MJD */
2486 break;
2487 case '<':
2488 case ',':
2489 case '=':
2490 case '!':
2491 *flagp &= ~HASWIDTH;
2492 /* FALL THROUGH */
2493 case '>':
2494 ender = reg_node(pRExC_state, SUCCEED);
2495 break;
2496 case 0:
2497 ender = reg_node(pRExC_state, END);
2498 break;
2499 }
2500 regtail(pRExC_state, lastbr, ender);
2501
2502 if (have_branch) {
2503 /* Hook the tails of the branches to the closing node. */
2504 for (br = ret; br != NULL; br = regnext(br)) {
2505 regoptail(pRExC_state, br, ender);
2506 }
2507 }
2508 }
2509
2510 {
2511 char *p;
2512 static char parens[] = "=!<,>";
2513
2514 if (paren && (p = strchr(parens, paren))) {
2515 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2516 int flag = (p - parens) > 1;
2517
2518 if (paren == '>')
2519 node = SUSPEND, flag = 0;
2520 reginsert(pRExC_state, node,ret);
2521 Set_Node_Cur_Length(ret);
2522 Set_Node_Offset(ret, parse_start + 1);
2523 ret->flags = flag;
2524 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2525 }
2526 }
2527
2528 /* Check for proper termination. */
2529 if (paren) {
2530 RExC_flags = oregflags;
2531 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2532 RExC_parse = oregcomp_parse;
2533 vFAIL("Unmatched (");
2534 }
2535 }
2536 else if (!paren && RExC_parse < RExC_end) {
2537 if (*RExC_parse == ')') {
2538 RExC_parse++;
2539 vFAIL("Unmatched )");
2540 }
2541 else
2542 FAIL("Junk on end of regexp"); /* "Can't happen". */
2543 /* NOTREACHED */
2544 }
2545
2546 return(ret);
2547}
2548
2549/*
2550 - regbranch - one alternative of an | operator
2551 *
2552 * Implements the concatenation operator.
2553 */
2554STATIC regnode *
2555S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2556{
2557 register regnode *ret;
2558 register regnode *chain = NULL;
2559 register regnode *latest;
2560 I32 flags = 0, c = 0;
2561
2562 if (first)
2563 ret = NULL;
2564 else {
2565 if (!SIZE_ONLY && RExC_extralen)
2566 ret = reganode(pRExC_state, BRANCHJ,0);
2567 else {
2568 ret = reg_node(pRExC_state, BRANCH);
2569 Set_Node_Length(ret, 1);
2570 }
2571 }
2572
2573 if (!first && SIZE_ONLY)
2574 RExC_extralen += 1; /* BRANCHJ */
2575
2576 *flagp = WORST; /* Tentatively. */
2577
2578 RExC_parse--;
2579 nextchar(pRExC_state);
2580 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2581 flags &= ~TRYAGAIN;
2582 latest = regpiece(pRExC_state, &flags);
2583 if (latest == NULL) {
2584 if (flags & TRYAGAIN)
2585 continue;
2586 return(NULL);
2587 }
2588 else if (ret == NULL)
2589 ret = latest;
2590 *flagp |= flags&HASWIDTH;
2591 if (chain == NULL) /* First piece. */
2592 *flagp |= flags&SPSTART;
2593 else {
2594 RExC_naughty++;
2595 regtail(pRExC_state, chain, latest);
2596 }
2597 chain = latest;
2598 c++;
2599 }
2600 if (chain == NULL) { /* Loop ran zero times. */
2601 chain = reg_node(pRExC_state, NOTHING);
2602 if (ret == NULL)
2603 ret = chain;
2604 }
2605 if (c == 1) {
2606 *flagp |= flags&SIMPLE;
2607 }
2608
2609 return(ret);
2610}
2611
2612/*
2613 - regpiece - something followed by possible [*+?]
2614 *
2615 * Note that the branching code sequences used for ? and the general cases
2616 * of * and + are somewhat optimized: they use the same NOTHING node as
2617 * both the endmarker for their branch list and the body of the last branch.
2618 * It might seem that this node could be dispensed with entirely, but the
2619 * endmarker role is not redundant.
2620 */
2621STATIC regnode *
2622S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2623{
2624 register regnode *ret;
2625 register char op;
2626 register char *next;
2627 I32 flags;
2628 char *origparse = RExC_parse;
2629 char *maxpos;
2630 I32 min;
2631 I32 max = REG_INFTY;
2632 char *parse_start;
2633
2634 ret = regatom(pRExC_state, &flags);
2635 if (ret == NULL) {
2636 if (flags & TRYAGAIN)
2637 *flagp |= TRYAGAIN;
2638 return(NULL);
2639 }
2640
2641 op = *RExC_parse;
2642
2643 if (op == '{' && regcurly(RExC_parse)) {
2644 parse_start = RExC_parse; /* MJD */
2645 next = RExC_parse + 1;
2646 maxpos = Nullch;
2647 while (isDIGIT(*next) || *next == ',') {
2648 if (*next == ',') {
2649 if (maxpos)
2650 break;
2651 else
2652 maxpos = next;
2653 }
2654 next++;
2655 }
2656 if (*next == '}') { /* got one */
2657 if (!maxpos)
2658 maxpos = next;
2659 RExC_parse++;
2660 min = atoi(RExC_parse);
2661 if (*maxpos == ',')
2662 maxpos++;
2663 else
2664 maxpos = RExC_parse;
2665 max = atoi(maxpos);
2666 if (!max && *maxpos != '0')
2667 max = REG_INFTY; /* meaning "infinity" */
2668 else if (max >= REG_INFTY)
2669 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2670 RExC_parse = next;
2671 nextchar(pRExC_state);
2672
2673 do_curly:
2674 if ((flags&SIMPLE)) {
2675 RExC_naughty += 2 + RExC_naughty / 2;
2676 reginsert(pRExC_state, CURLY, ret);
2677 Set_Node_Offset(ret, parse_start+1); /* MJD */
2678 Set_Node_Cur_Length(ret);
2679 }
2680 else {
2681 regnode *w = reg_node(pRExC_state, WHILEM);
2682
2683 w->flags = 0;
2684 regtail(pRExC_state, ret, w);
2685 if (!SIZE_ONLY && RExC_extralen) {
2686 reginsert(pRExC_state, LONGJMP,ret);
2687 reginsert(pRExC_state, NOTHING,ret);
2688 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2689 }
2690 reginsert(pRExC_state, CURLYX,ret);
2691 /* MJD hk */
2692 Set_Node_Offset(ret, parse_start+1);
2693 Set_Node_Length(ret,
2694 op == '{' ? (RExC_parse - parse_start) : 1);
2695
2696 if (!SIZE_ONLY && RExC_extralen)
2697 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2698 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2699 if (SIZE_ONLY)
2700 RExC_whilem_seen++, RExC_extralen += 3;
2701 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2702 }
2703 ret->flags = 0;
2704
2705 if (min > 0)
2706 *flagp = WORST;
2707 if (max > 0)
2708 *flagp |= HASWIDTH;
2709 if (max && max < min)
2710 vFAIL("Can't do {n,m} with n > m");
2711 if (!SIZE_ONLY) {
2712 ARG1_SET(ret, (U16)min);
2713 ARG2_SET(ret, (U16)max);
2714 }
2715
2716 goto nest_check;
2717 }
2718 }
2719
2720 if (!ISMULT1(op)) {
2721 *flagp = flags;
2722 return(ret);
2723 }
2724
2725#if 0 /* Now runtime fix should be reliable. */
2726
2727 /* if this is reinstated, don't forget to put this back into perldiag:
2728
2729 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2730
2731 (F) The part of the regexp subject to either the * or + quantifier
2732 could match an empty string. The {#} shows in the regular
2733 expression about where the problem was discovered.
2734
2735 */
2736
2737 if (!(flags&HASWIDTH) && op != '?')
2738 vFAIL("Regexp *+ operand could be empty");
2739#endif
2740
2741 parse_start = RExC_parse;
2742 nextchar(pRExC_state);
2743
2744 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2745
2746 if (op == '*' && (flags&SIMPLE)) {
2747 reginsert(pRExC_state, STAR, ret);
2748 ret->flags = 0;
2749 RExC_naughty += 4;
2750 }
2751 else if (op == '*') {
2752 min = 0;
2753 goto do_curly;
2754 }
2755 else if (op == '+' && (flags&SIMPLE)) {
2756 reginsert(pRExC_state, PLUS, ret);
2757 ret->flags = 0;
2758 RExC_naughty += 3;
2759 }
2760 else if (op == '+') {
2761 min = 1;
2762 goto do_curly;
2763 }
2764 else if (op == '?') {
2765 min = 0; max = 1;
2766 goto do_curly;
2767 }
2768 nest_check:
2769 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2770 vWARN3(RExC_parse,
2771 "%.*s matches null string many times",
2772 RExC_parse - origparse,
2773 origparse);
2774 }
2775
2776 if (*RExC_parse == '?') {
2777 nextchar(pRExC_state);
2778 reginsert(pRExC_state, MINMOD, ret);
2779 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2780 }
2781 if (ISMULT2(RExC_parse)) {
2782 RExC_parse++;
2783 vFAIL("Nested quantifiers");
2784 }
2785
2786 return(ret);
2787}
2788
2789/*
2790 - regatom - the lowest level
2791 *
2792 * Optimization: gobbles an entire sequence of ordinary characters so that
2793 * it can turn them into a single node, which is smaller to store and
2794 * faster to run. Backslashed characters are exceptions, each becoming a
2795 * separate node; the code is simpler that way and it's not worth fixing.
2796 *
2797 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2798STATIC regnode *
2799S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2800{
2801 register regnode *ret = 0;
2802 I32 flags;
2803 char *parse_start = RExC_parse;
2804
2805 *flagp = WORST; /* Tentatively. */
2806
2807tryagain:
2808 switch (*RExC_parse) {
2809 case '^':
2810 RExC_seen_zerolen++;
2811 nextchar(pRExC_state);
2812 if (RExC_flags & PMf_MULTILINE)
2813 ret = reg_node(pRExC_state, MBOL);
2814 else if (RExC_flags & PMf_SINGLELINE)
2815 ret = reg_node(pRExC_state, SBOL);
2816 else
2817 ret = reg_node(pRExC_state, BOL);
2818 Set_Node_Length(ret, 1); /* MJD */
2819 break;
2820 case '$':
2821 nextchar(pRExC_state);
2822 if (*RExC_parse)
2823 RExC_seen_zerolen++;
2824 if (RExC_flags & PMf_MULTILINE)
2825 ret = reg_node(pRExC_state, MEOL);
2826 else if (RExC_flags & PMf_SINGLELINE)
2827 ret = reg_node(pRExC_state, SEOL);
2828 else
2829 ret = reg_node(pRExC_state, EOL);
2830 Set_Node_Length(ret, 1); /* MJD */
2831 break;
2832 case '.':
2833 nextchar(pRExC_state);
2834 if (RExC_flags & PMf_SINGLELINE)
2835 ret = reg_node(pRExC_state, SANY);
2836 else
2837 ret = reg_node(pRExC_state, REG_ANY);
2838 *flagp |= HASWIDTH|SIMPLE;
2839 RExC_naughty++;
2840 Set_Node_Length(ret, 1); /* MJD */
2841 break;
2842 case '[':
2843 {
2844 char *oregcomp_parse = ++RExC_parse;
2845 ret = regclass(pRExC_state);
2846 if (*RExC_parse != ']') {
2847 RExC_parse = oregcomp_parse;
2848 vFAIL("Unmatched [");
2849 }
2850 nextchar(pRExC_state);
2851 *flagp |= HASWIDTH|SIMPLE;
2852 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2853 break;
2854 }
2855 case '(':
2856 nextchar(pRExC_state);
2857 ret = reg(pRExC_state, 1, &flags);
2858 if (ret == NULL) {
2859 if (flags & TRYAGAIN) {
2860 if (RExC_parse == RExC_end) {
2861 /* Make parent create an empty node if needed. */
2862 *flagp |= TRYAGAIN;
2863 return(NULL);
2864 }
2865 goto tryagain;
2866 }
2867 return(NULL);
2868 }
2869 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2870 break;
2871 case '|':
2872 case ')':
2873 if (flags & TRYAGAIN) {
2874 *flagp |= TRYAGAIN;
2875 return NULL;
2876 }
2877 vFAIL("Internal urp");
2878 /* Supposed to be caught earlier. */
2879 break;
2880 case '{':
2881 if (!regcurly(RExC_parse)) {
2882 RExC_parse++;
2883 goto defchar;
2884 }
2885 /* FALL THROUGH */
2886 case '?':
2887 case '+':
2888 case '*':
2889 RExC_parse++;
2890 vFAIL("Quantifier follows nothing");
2891 break;
2892 case '\\':
2893 switch (*++RExC_parse) {
2894 case 'A':
2895 RExC_seen_zerolen++;
2896 ret = reg_node(pRExC_state, SBOL);
2897 *flagp |= SIMPLE;
2898 nextchar(pRExC_state);
2899 Set_Node_Length(ret, 2); /* MJD */
2900 break;
2901 case 'G':
2902 ret = reg_node(pRExC_state, GPOS);
2903 RExC_seen |= REG_SEEN_GPOS;
2904 *flagp |= SIMPLE;
2905 nextchar(pRExC_state);
2906 Set_Node_Length(ret, 2); /* MJD */
2907 break;
2908 case 'Z':
2909 ret = reg_node(pRExC_state, SEOL);
2910 *flagp |= SIMPLE;
2911 RExC_seen_zerolen++; /* Do not optimize RE away */
2912 nextchar(pRExC_state);
2913 break;
2914 case 'z':
2915 ret = reg_node(pRExC_state, EOS);
2916 *flagp |= SIMPLE;
2917 RExC_seen_zerolen++; /* Do not optimize RE away */
2918 nextchar(pRExC_state);
2919 Set_Node_Length(ret, 2); /* MJD */
2920 break;
2921 case 'C':
2922 ret = reg_node(pRExC_state, CANY);
2923 RExC_seen |= REG_SEEN_CANY;
2924 *flagp |= HASWIDTH|SIMPLE;
2925 nextchar(pRExC_state);
2926 Set_Node_Length(ret, 2); /* MJD */
2927 break;
2928 case 'X':
2929 ret = reg_node(pRExC_state, CLUMP);
2930 *flagp |= HASWIDTH;
2931 nextchar(pRExC_state);
2932 Set_Node_Length(ret, 2); /* MJD */
2933 break;
2934 case 'w':
2935 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2936 *flagp |= HASWIDTH|SIMPLE;
2937 nextchar(pRExC_state);
2938 Set_Node_Length(ret, 2); /* MJD */
2939 break;
2940 case 'W':
2941 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2942 *flagp |= HASWIDTH|SIMPLE;
2943 nextchar(pRExC_state);
2944 Set_Node_Length(ret, 2); /* MJD */
2945 break;
2946 case 'b':
2947 RExC_seen_zerolen++;
2948 RExC_seen |= REG_SEEN_LOOKBEHIND;
2949 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2950 *flagp |= SIMPLE;
2951 nextchar(pRExC_state);
2952 Set_Node_Length(ret, 2); /* MJD */
2953 break;
2954 case 'B':
2955 RExC_seen_zerolen++;
2956 RExC_seen |= REG_SEEN_LOOKBEHIND;
2957 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2958 *flagp |= SIMPLE;
2959 nextchar(pRExC_state);
2960 Set_Node_Length(ret, 2); /* MJD */
2961 break;
2962 case 's':
2963 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2964 *flagp |= HASWIDTH|SIMPLE;
2965 nextchar(pRExC_state);
2966 Set_Node_Length(ret, 2); /* MJD */
2967 break;
2968 case 'S':
2969 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2970 *flagp |= HASWIDTH|SIMPLE;
2971 nextchar(pRExC_state);
2972 Set_Node_Length(ret, 2); /* MJD */
2973 break;
2974 case 'd':
2975 ret = reg_node(pRExC_state, DIGIT);
2976 *flagp |= HASWIDTH|SIMPLE;
2977 nextchar(pRExC_state);
2978 Set_Node_Length(ret, 2); /* MJD */
2979 break;
2980 case 'D':
2981 ret = reg_node(pRExC_state, NDIGIT);
2982 *flagp |= HASWIDTH|SIMPLE;
2983 nextchar(pRExC_state);
2984 Set_Node_Length(ret, 2); /* MJD */
2985 break;
2986 case 'p':
2987 case 'P':
2988 {
2989 char* oldregxend = RExC_end;
2990 char* parse_start = RExC_parse - 2;
2991
2992 if (RExC_parse[1] == '{') {
2993 /* a lovely hack--pretend we saw [\pX] instead */
2994 RExC_end = strchr(RExC_parse, '}');
2995 if (!RExC_end) {
2996 U8 c = (U8)*RExC_parse;
2997 RExC_parse += 2;
2998 RExC_end = oldregxend;
2999 vFAIL2("Missing right brace on \\%c{}", c);
3000 }
3001 RExC_end++;
3002 }
3003 else {
3004 RExC_end = RExC_parse + 2;
3005 if (RExC_end > oldregxend)
3006 RExC_end = oldregxend;
3007 }
3008 RExC_parse--;
3009
3010 ret = regclass(pRExC_state);
3011
3012 RExC_end = oldregxend;
3013 RExC_parse--;
3014
3015 Set_Node_Offset(ret, parse_start + 2);
3016 Set_Node_Cur_Length(ret);
3017 nextchar(pRExC_state);
3018 *flagp |= HASWIDTH|SIMPLE;
3019 }
3020 break;
3021 case 'n':
3022 case 'r':
3023 case 't':
3024 case 'f':
3025 case 'e':
3026 case 'a':
3027 case 'x':
3028 case 'c':
3029 case '0':
3030 goto defchar;
3031 case '1': case '2': case '3': case '4':
3032 case '5': case '6': case '7': case '8': case '9':
3033 {
3034 I32 num = atoi(RExC_parse);
3035
3036 if (num > 9 && num >= RExC_npar)
3037 goto defchar;
3038 else {
3039 char * parse_start = RExC_parse - 1; /* MJD */
3040 while (isDIGIT(*RExC_parse))
3041 RExC_parse++;
3042
3043 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3044 vFAIL("Reference to nonexistent group");
3045 RExC_sawback = 1;
3046 ret = reganode(pRExC_state,
3047 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3048 num);
3049 *flagp |= HASWIDTH;
3050
3051 /* override incorrect value set in reganode MJD */
3052 Set_Node_Offset(ret, parse_start+1);
3053 Set_Node_Cur_Length(ret); /* MJD */
3054 RExC_parse--;
3055 nextchar(pRExC_state);
3056 }
3057 }
3058 break;
3059 case '\0':
3060 if (RExC_parse >= RExC_end)
3061 FAIL("Trailing \\");
3062 /* FALL THROUGH */
3063 default:
3064 /* Do not generate `unrecognized' warnings here, we fall
3065 back into the quick-grab loop below */
3066 parse_start--;
3067 goto defchar;
3068 }
3069 break;
3070
3071 case '#':
3072 if (RExC_flags & PMf_EXTENDED) {
3073 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3074 if (RExC_parse < RExC_end)
3075 goto tryagain;
3076 }
3077 /* FALL THROUGH */
3078
3079 default: {
3080 register STRLEN len;
3081 register UV ender;
3082 register char *p;
3083 char *oldp, *s;
3084 STRLEN numlen;
3085 STRLEN foldlen;
3086 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3087
3088 parse_start = RExC_parse - 1;
3089
3090 RExC_parse++;
3091
3092 defchar:
3093 ender = 0;
3094 ret = reg_node(pRExC_state,
3095 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3096 s = STRING(ret);
3097 for (len = 0, p = RExC_parse - 1;
3098 len < 127 && p < RExC_end;
3099 len++)
3100 {
3101 oldp = p;
3102
3103 if (RExC_flags & PMf_EXTENDED)
3104 p = regwhite(p, RExC_end);
3105 switch (*p) {
3106 case '^':
3107 case '$':
3108 case '.':
3109 case '[':
3110 case '(':
3111 case ')':
3112 case '|':
3113 goto loopdone;
3114 case '\\':
3115 switch (*++p) {
3116 case 'A':
3117 case 'C':
3118 case 'X':
3119 case 'G':
3120 case 'Z':
3121 case 'z':
3122 case 'w':
3123 case 'W':
3124 case 'b':
3125 case 'B':
3126 case 's':
3127 case 'S':
3128 case 'd':
3129 case 'D':
3130 case 'p':
3131 case 'P':
3132 --p;
3133 goto loopdone;
3134 case 'n':
3135 ender = '\n';
3136 p++;
3137 break;
3138 case 'r':
3139 ender = '\r';
3140 p++;
3141 break;
3142 case 't':
3143 ender = '\t';
3144 p++;
3145 break;
3146 case 'f':
3147 ender = '\f';
3148 p++;
3149 break;
3150 case 'e':
3151 ender = ASCII_TO_NATIVE('\033');
3152 p++;
3153 break;
3154 case 'a':
3155 ender = ASCII_TO_NATIVE('\007');
3156 p++;
3157 break;
3158 case 'x':
3159 if (*++p == '{') {
3160 char* e = strchr(p, '}');
3161
3162 if (!e) {
3163 RExC_parse = p + 1;
3164 vFAIL("Missing right brace on \\x{}");
3165 }
3166 else {
3167 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3168 | PERL_SCAN_DISALLOW_PREFIX;
3169 numlen = e - p - 1;
3170 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3171 if (ender > 0xff)
3172 RExC_utf8 = 1;
3173 p = e + 1;
3174 }
3175 }
3176 else {
3177 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3178 numlen = 2;
3179 ender = grok_hex(p, &numlen, &flags, NULL);
3180 p += numlen;
3181 }
3182 break;
3183 case 'c':
3184 p++;
3185 ender = UCHARAT(p++);
3186 ender = toCTRL(ender);
3187 break;
3188 case '0': case '1': case '2': case '3':case '4':
3189 case '5': case '6': case '7': case '8':case '9':
3190 if (*p == '0' ||
3191 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3192 I32 flags = 0;
3193 numlen = 3;
3194 ender = grok_oct(p, &numlen, &flags, NULL);
3195 p += numlen;
3196 }
3197 else {
3198 --p;
3199 goto loopdone;
3200 }
3201 break;
3202 case '\0':
3203 if (p >= RExC_end)
3204 FAIL("Trailing \\");
3205 /* FALL THROUGH */
3206 default:
3207 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3208 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3209 goto normal_default;
3210 }
3211 break;
3212 default:
3213 normal_default:
3214 if (UTF8_IS_START(*p) && UTF) {
3215 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3216 &numlen, 0);
3217 p += numlen;
3218 }
3219 else
3220 ender = *p++;
3221 break;
3222 }
3223 if (RExC_flags & PMf_EXTENDED)
3224 p = regwhite(p, RExC_end);
3225 if (UTF && FOLD) {
3226 /* Prime the casefolded buffer. */
3227 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3228 }
3229 if (ISMULT2(p)) { /* Back off on ?+*. */
3230 if (len)
3231 p = oldp;
3232 else if (UTF) {
3233 STRLEN unilen;
3234
3235 if (FOLD) {
3236 /* Emit all the Unicode characters. */
3237 for (foldbuf = tmpbuf;
3238 foldlen;
3239 foldlen -= numlen) {
3240 ender = utf8_to_uvchr(foldbuf, &numlen);
3241 if (numlen > 0) {
3242 reguni(pRExC_state, ender, s, &unilen);
3243 s += unilen;
3244 len += unilen;
3245 /* In EBCDIC the numlen
3246 * and unilen can differ. */
3247 foldbuf += numlen;
3248 if (numlen >= foldlen)
3249 break;
3250 }
3251 else
3252 break; /* "Can't happen." */
3253 }
3254 }
3255 else {
3256 reguni(pRExC_state, ender, s, &unilen);
3257 if (unilen > 0) {
3258 s += unilen;
3259 len += unilen;
3260 }
3261 }
3262 }
3263 else {
3264 len++;
3265 REGC((char)ender, s++);
3266 }
3267 break;
3268 }
3269 if (UTF) {
3270 STRLEN unilen;
3271
3272 if (FOLD) {
3273 /* Emit all the Unicode characters. */
3274 for (foldbuf = tmpbuf;
3275 foldlen;
3276 foldlen -= numlen) {
3277 ender = utf8_to_uvchr(foldbuf, &numlen);
3278 if (numlen > 0) {
3279 reguni(pRExC_state, ender, s, &unilen);
3280 len += unilen;
3281 s += unilen;
3282 /* In EBCDIC the numlen
3283 * and unilen can differ. */
3284 foldbuf += numlen;
3285 if (numlen >= foldlen)
3286 break;
3287 }
3288 else
3289 break;
3290 }
3291 }
3292 else {
3293 reguni(pRExC_state, ender, s, &unilen);
3294 if (unilen > 0) {
3295 s += unilen;
3296 len += unilen;
3297 }
3298 }
3299 len--;
3300 }
3301 else
3302 REGC((char)ender, s++);
3303 }
3304 loopdone:
3305 RExC_parse = p - 1;
3306 Set_Node_Cur_Length(ret); /* MJD */
3307 nextchar(pRExC_state);
3308 {
3309 /* len is STRLEN which is unsigned, need to copy to signed */
3310 IV iv = len;
3311 if (iv < 0)
3312 vFAIL("Internal disaster");
3313 }
3314 if (len > 0)
3315 *flagp |= HASWIDTH;
3316 if (len == 1 && UNI_IS_INVARIANT(ender))
3317 *flagp |= SIMPLE;
3318 if (!SIZE_ONLY)
3319 STR_LEN(ret) = len;
3320 if (SIZE_ONLY)
3321 RExC_size += STR_SZ(len);
3322 else
3323 RExC_emit += STR_SZ(len);
3324 }
3325 break;
3326 }
3327
3328 /* If the encoding pragma is in effect recode the text of
3329 * any EXACT-kind nodes. */
3330 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3331 STRLEN oldlen = STR_LEN(ret);
3332 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3333
3334 if (RExC_utf8)
3335 SvUTF8_on(sv);
3336 if (sv_utf8_downgrade(sv, TRUE)) {
3337 char *s = sv_recode_to_utf8(sv, PL_encoding);
3338 STRLEN newlen = SvCUR(sv);
3339
3340 if (SvUTF8(sv))
3341 RExC_utf8 = 1;
3342 if (!SIZE_ONLY) {
3343 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3344 (int)oldlen, STRING(ret),
3345 (int)newlen, s));
3346 Copy(s, STRING(ret), newlen, char);
3347 STR_LEN(ret) += newlen - oldlen;
3348 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3349 } else
3350 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3351 }
3352 }
3353
3354 return(ret);
3355}
3356
3357STATIC char *
3358S_regwhite(pTHX_ char *p, char *e)
3359{
3360 while (p < e) {
3361 if (isSPACE(*p))
3362 ++p;
3363 else if (*p == '#') {
3364 do {
3365 p++;
3366 } while (p < e && *p != '\n');
3367 }
3368 else
3369 break;
3370 }
3371 return p;
3372}
3373
3374/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3375 Character classes ([:foo:]) can also be negated ([:^foo:]).
3376 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3377 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3378 but trigger failures because they are currently unimplemented. */
3379
3380#define POSIXCC_DONE(c) ((c) == ':')
3381#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3382#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3383
3384STATIC I32
3385S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3386{
3387 char *posixcc = 0;
3388 I32 namedclass = OOB_NAMEDCLASS;
3389
3390 if (value == '[' && RExC_parse + 1 < RExC_end &&
3391 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3392 POSIXCC(UCHARAT(RExC_parse))) {
3393 char c = UCHARAT(RExC_parse);
3394 char* s = RExC_parse++;
3395
3396 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3397 RExC_parse++;
3398 if (RExC_parse == RExC_end)
3399 /* Grandfather lone [:, [=, [. */
3400 RExC_parse = s;
3401 else {
3402 char* t = RExC_parse++; /* skip over the c */
3403
3404 if (UCHARAT(RExC_parse) == ']') {
3405 RExC_parse++; /* skip over the ending ] */
3406 posixcc = s + 1;
3407 if (*s == ':') {
3408 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3409 I32 skip = 5; /* the most common skip */
3410
3411 switch (*posixcc) {
3412 case 'a':
3413 if (strnEQ(posixcc, "alnum", 5))
3414 namedclass =
3415 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3416 else if (strnEQ(posixcc, "alpha", 5))
3417 namedclass =
3418 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3419 else if (strnEQ(posixcc, "ascii", 5))
3420 namedclass =
3421 complement ? ANYOF_NASCII : ANYOF_ASCII;
3422 break;
3423 case 'b':
3424 if (strnEQ(posixcc, "blank", 5))
3425 namedclass =
3426 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3427 break;
3428 case 'c':
3429 if (strnEQ(posixcc, "cntrl", 5))
3430 namedclass =
3431 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3432 break;
3433 case 'd':
3434 if (strnEQ(posixcc, "digit", 5))
3435 namedclass =
3436 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3437 break;
3438 case 'g':
3439 if (strnEQ(posixcc, "graph", 5))
3440 namedclass =
3441 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3442 break;
3443 case 'l':
3444 if (strnEQ(posixcc, "lower", 5))
3445 namedclass =
3446 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3447 break;
3448 case 'p':
3449 if (strnEQ(posixcc, "print", 5))
3450 namedclass =
3451 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3452 else if (strnEQ(posixcc, "punct", 5))
3453 namedclass =
3454 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3455 break;
3456 case 's':
3457 if (strnEQ(posixcc, "space", 5))
3458 namedclass =
3459 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3460 break;
3461 case 'u':
3462 if (strnEQ(posixcc, "upper", 5))
3463 namedclass =
3464 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3465 break;
3466 case 'w': /* this is not POSIX, this is the Perl \w */
3467 if (strnEQ(posixcc, "word", 4)) {
3468 namedclass =
3469 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3470 skip = 4;
3471 }
3472 break;
3473 case 'x':
3474 if (strnEQ(posixcc, "xdigit", 6)) {
3475 namedclass =
3476 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3477 skip = 6;
3478 }
3479 break;
3480 }
3481 if (namedclass == OOB_NAMEDCLASS ||
3482 posixcc[skip] != ':' ||
3483 posixcc[skip+1] != ']')
3484 {
3485 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3486 t - s - 1, s + 1);
3487 }
3488 } else if (!SIZE_ONLY) {
3489 /* [[=foo=]] and [[.foo.]] are still future. */
3490
3491 /* adjust RExC_parse so the warning shows after
3492 the class closes */
3493 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3494 RExC_parse++;
3495 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3496 }
3497 } else {
3498 /* Maternal grandfather:
3499 * "[:" ending in ":" but not in ":]" */
3500 RExC_parse = s;
3501 }
3502 }
3503 }
3504
3505 return namedclass;
3506}
3507
3508STATIC void
3509S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3510{
3511 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3512 char *s = RExC_parse;
3513 char c = *s++;
3514
3515 while(*s && isALNUM(*s))
3516 s++;
3517 if (*s && c == *s && s[1] == ']') {
3518 if (ckWARN(WARN_REGEXP))
3519 vWARN3(s+2,
3520 "POSIX syntax [%c %c] belongs inside character classes",
3521 c, c);
3522
3523 /* [[=foo=]] and [[.foo.]] are still future. */
3524 if (POSIXCC_NOTYET(c)) {
3525 /* adjust RExC_parse so the error shows after
3526 the class closes */
3527 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3528 ;
3529 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3530 }
3531 }
3532 }
3533}
3534
3535STATIC regnode *
3536S_regclass(pTHX_ RExC_state_t *pRExC_state)
3537{
3538 register UV value;
3539 register UV nextvalue;
3540 register IV prevvalue = OOB_UNICODE;
3541 register IV range = 0;
3542 register regnode *ret;
3543 STRLEN numlen;
3544 IV namedclass;
3545 char *rangebegin = 0;
3546 bool need_class = 0;
3547 SV *listsv = Nullsv;
3548 register char *e;
3549 UV n;
3550 bool optimize_invert = TRUE;
3551 AV* unicode_alternate = 0;
3552#ifdef EBCDIC
3553 UV literal_endpoint = 0;
3554#endif
3555
3556 ret = reganode(pRExC_state, ANYOF, 0);
3557
3558 if (!SIZE_ONLY)
3559 ANYOF_FLAGS(ret) = 0;
3560
3561 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3562 RExC_naughty++;
3563 RExC_parse++;
3564 if (!SIZE_ONLY)
3565 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3566 }
3567
3568 if (SIZE_ONLY)
3569 RExC_size += ANYOF_SKIP;
3570 else {
3571 RExC_emit += ANYOF_SKIP;
3572 if (FOLD)
3573 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3574 if (LOC)
3575 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3576 ANYOF_BITMAP_ZERO(ret);
3577 listsv = newSVpvn("# comment\n", 10);
3578 }
3579
3580 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3581
3582 if (!SIZE_ONLY && POSIXCC(nextvalue))
3583 checkposixcc(pRExC_state);
3584
3585 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3586 if (UCHARAT(RExC_parse) == ']')
3587 goto charclassloop;
3588
3589 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3590
3591 charclassloop:
3592
3593 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3594
3595 if (!range)
3596 rangebegin = RExC_parse;
3597 if (UTF) {
3598 value = utf8n_to_uvchr((U8*)RExC_parse,
3599 RExC_end - RExC_parse,
3600 &numlen, 0);
3601 RExC_parse += numlen;
3602 }
3603 else
3604 value = UCHARAT(RExC_parse++);
3605 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3606 if (value == '[' && POSIXCC(nextvalue))
3607 namedclass = regpposixcc(pRExC_state, value);
3608 else if (value == '\\') {
3609 if (UTF) {
3610 value = utf8n_to_uvchr((U8*)RExC_parse,
3611 RExC_end - RExC_parse,
3612 &numlen, 0);
3613 RExC_parse += numlen;
3614 }
3615 else
3616 value = UCHARAT(RExC_parse++);
3617 /* Some compilers cannot handle switching on 64-bit integer
3618 * values, therefore value cannot be an UV. Yes, this will
3619 * be a problem later if we want switch on Unicode.
3620 * A similar issue a little bit later when switching on
3621 * namedclass. --jhi */
3622 switch ((I32)value) {
3623 case 'w': namedclass = ANYOF_ALNUM; break;
3624 case 'W': namedclass = ANYOF_NALNUM; break;
3625 case 's': namedclass = ANYOF_SPACE; break;
3626 case 'S': namedclass = ANYOF_NSPACE; break;
3627 case 'd': namedclass = ANYOF_DIGIT; break;
3628 case 'D': namedclass = ANYOF_NDIGIT; break;
3629 case 'p':
3630 case 'P':
3631 if (RExC_parse >= RExC_end)
3632 vFAIL2("Empty \\%c{}", (U8)value);
3633 if (*RExC_parse == '{') {
3634 U8 c = (U8)value;
3635 e = strchr(RExC_parse++, '}');
3636 if (!e)
3637 vFAIL2("Missing right brace on \\%c{}", c);
3638 while (isSPACE(UCHARAT(RExC_parse)))
3639 RExC_parse++;
3640 if (e == RExC_parse)
3641 vFAIL2("Empty \\%c{}", c);
3642 n = e - RExC_parse;
3643 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3644 n--;
3645 }
3646 else {
3647 e = RExC_parse;
3648 n = 1;
3649 }
3650 if (!SIZE_ONLY) {
3651 if (UCHARAT(RExC_parse) == '^') {
3652 RExC_parse++;
3653 n--;
3654 value = value == 'p' ? 'P' : 'p'; /* toggle */
3655 while (isSPACE(UCHARAT(RExC_parse))) {
3656 RExC_parse++;
3657 n--;
3658 }
3659 }
3660 if (value == 'p')
3661 Perl_sv_catpvf(aTHX_ listsv,
3662 "+utf8::%.*s\n", (int)n, RExC_parse);
3663 else
3664 Perl_sv_catpvf(aTHX_ listsv,
3665 "!utf8::%.*s\n", (int)n, RExC_parse);
3666 }
3667 RExC_parse = e + 1;
3668 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3669 namedclass = ANYOF_MAX; /* no official name, but it's named */
3670 break;
3671 case 'n': value = '\n'; break;
3672 case 'r': value = '\r'; break;
3673 case 't': value = '\t'; break;
3674 case 'f': value = '\f'; break;
3675 case 'b': value = '\b'; break;
3676 case 'e': value = ASCII_TO_NATIVE('\033');break;
3677 case 'a': value = ASCII_TO_NATIVE('\007');break;
3678 case 'x':
3679 if (*RExC_parse == '{') {
3680 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3681 | PERL_SCAN_DISALLOW_PREFIX;
3682 e = strchr(RExC_parse++, '}');
3683 if (!e)
3684 vFAIL("Missing right brace on \\x{}");
3685
3686 numlen = e - RExC_parse;
3687 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3688 RExC_parse = e + 1;
3689 }
3690 else {
3691 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3692 numlen = 2;
3693 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3694 RExC_parse += numlen;
3695 }
3696 break;
3697 case 'c':
3698 value = UCHARAT(RExC_parse++);
3699 value = toCTRL(value);
3700 break;
3701 case '0': case '1': case '2': case '3': case '4':
3702 case '5': case '6': case '7': case '8': case '9':
3703 {
3704 I32 flags = 0;
3705 numlen = 3;
3706 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3707 RExC_parse += numlen;
3708 break;
3709 }
3710 default:
3711 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3712 vWARN2(RExC_parse,
3713 "Unrecognized escape \\%c in character class passed through",
3714 (int)value);
3715 break;
3716 }
3717 } /* end of \blah */
3718#ifdef EBCDIC
3719 else
3720 literal_endpoint++;
3721#endif
3722
3723 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3724
3725 if (!SIZE_ONLY && !need_class)
3726 ANYOF_CLASS_ZERO(ret);
3727
3728 need_class = 1;
3729
3730 /* a bad range like a-\d, a-[:digit:] ? */
3731 if (range) {
3732 if (!SIZE_ONLY) {
3733 if (ckWARN(WARN_REGEXP))
3734 vWARN4(RExC_parse,
3735 "False [] range \"%*.*s\"",
3736 RExC_parse - rangebegin,
3737 RExC_parse - rangebegin,
3738 rangebegin);
3739 if (prevvalue < 256) {
3740 ANYOF_BITMAP_SET(ret, prevvalue);
3741 ANYOF_BITMAP_SET(ret, '-');
3742 }
3743 else {
3744 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3745 Perl_sv_catpvf(aTHX_ listsv,
3746 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3747 }
3748 }
3749
3750 range = 0; /* this was not a true range */
3751 }
3752
3753 if (!SIZE_ONLY) {
3754 if (namedclass > OOB_NAMEDCLASS)
3755 optimize_invert = FALSE;
3756 /* Possible truncation here but in some 64-bit environments
3757 * the compiler gets heartburn about switch on 64-bit values.
3758 * A similar issue a little earlier when switching on value.
3759 * --jhi */
3760 switch ((I32)namedclass) {
3761 case ANYOF_ALNUM:
3762 if (LOC)
3763 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3764 else {
3765 for (value = 0; value < 256; value++)
3766 if (isALNUM(value))
3767 ANYOF_BITMAP_SET(ret, value);
3768 }
3769 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3770 break;
3771 case ANYOF_NALNUM:
3772 if (LOC)
3773 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3774 else {
3775 for (value = 0; value < 256; value++)
3776 if (!isALNUM(value))
3777 ANYOF_BITMAP_SET(ret, value);
3778 }
3779 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3780 break;
3781 case ANYOF_ALNUMC:
3782 if (LOC)
3783 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3784 else {
3785 for (value = 0; value < 256; value++)
3786 if (isALNUMC(value))
3787 ANYOF_BITMAP_SET(ret, value);
3788 }
3789 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3790 break;
3791 case ANYOF_NALNUMC:
3792 if (LOC)
3793 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3794 else {
3795 for (value = 0; value < 256; value++)
3796 if (!isALNUMC(value))
3797 ANYOF_BITMAP_SET(ret, value);
3798 }
3799 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3800 break;
3801 case ANYOF_ALPHA:
3802 if (LOC)
3803 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3804 else {
3805 for (value = 0; value < 256; value++)
3806 if (isALPHA(value))
3807 ANYOF_BITMAP_SET(ret, value);
3808 }
3809 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3810 break;
3811 case ANYOF_NALPHA:
3812 if (LOC)
3813 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3814 else {
3815 for (value = 0; value < 256; value++)
3816 if (!isALPHA(value))
3817 ANYOF_BITMAP_SET(ret, value);
3818 }
3819 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3820 break;
3821 case ANYOF_ASCII:
3822 if (LOC)
3823 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3824 else {
3825#ifndef EBCDIC
3826 for (value = 0; value < 128; value++)
3827 ANYOF_BITMAP_SET(ret, value);
3828#else /* EBCDIC */
3829 for (value = 0; value < 256; value++) {
3830 if (isASCII(value))
3831 ANYOF_BITMAP_SET(ret, value);
3832 }
3833#endif /* EBCDIC */
3834 }
3835 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3836 break;
3837 case ANYOF_NASCII:
3838 if (LOC)
3839 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3840 else {
3841#ifndef EBCDIC
3842 for (value = 128; value < 256; value++)
3843 ANYOF_BITMAP_SET(ret, value);
3844#else /* EBCDIC */
3845 for (value = 0; value < 256; value++) {
3846 if (!isASCII(value))
3847 ANYOF_BITMAP_SET(ret, value);
3848 }
3849#endif /* EBCDIC */
3850 }
3851 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3852 break;
3853 case ANYOF_BLANK:
3854 if (LOC)
3855 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3856 else {
3857 for (value = 0; value < 256; value++)
3858 if (isBLANK(value))
3859 ANYOF_BITMAP_SET(ret, value);
3860 }
3861 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3862 break;
3863 case ANYOF_NBLANK:
3864 if (LOC)
3865 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3866 else {
3867 for (value = 0; value < 256; value++)
3868 if (!isBLANK(value))
3869 ANYOF_BITMAP_SET(ret, value);
3870 }
3871 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3872 break;
3873 case ANYOF_CNTRL:
3874 if (LOC)
3875 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3876 else {
3877 for (value = 0; value < 256; value++)
3878 if (isCNTRL(value))
3879 ANYOF_BITMAP_SET(ret, value);
3880 }
3881 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3882 break;
3883 case ANYOF_NCNTRL:
3884 if (LOC)
3885 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3886 else {
3887 for (value = 0; value < 256; value++)
3888 if (!isCNTRL(value))
3889 ANYOF_BITMAP_SET(ret, value);
3890 }
3891 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3892 break;
3893 case ANYOF_DIGIT:
3894 if (LOC)
3895 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3896 else {
3897 /* consecutive digits assumed */
3898 for (value = '0'; value <= '9'; value++)
3899 ANYOF_BITMAP_SET(ret, value);
3900 }
3901 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3902 break;
3903 case ANYOF_NDIGIT:
3904 if (LOC)
3905 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3906 else {
3907 /* consecutive digits assumed */
3908 for (value = 0; value < '0'; value++)
3909 ANYOF_BITMAP_SET(ret, value);
3910 for (value = '9' + 1; value < 256; value++)
3911 ANYOF_BITMAP_SET(ret, value);
3912 }
3913 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3914 break;
3915 case ANYOF_GRAPH:
3916 if (LOC)
3917 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3918 else {
3919 for (value = 0; value < 256; value++)
3920 if (isGRAPH(value))
3921 ANYOF_BITMAP_SET(ret, value);
3922 }
3923 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3924 break;
3925 case ANYOF_NGRAPH:
3926 if (LOC)
3927 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3928 else {
3929 for (value = 0; value < 256; value++)
3930 if (!isGRAPH(value))
3931 ANYOF_BITMAP_SET(ret, value);
3932 }
3933 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3934 break;
3935 case ANYOF_LOWER:
3936 if (LOC)
3937 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3938 else {
3939 for (value = 0; value < 256; value++)
3940 if (isLOWER(value))
3941 ANYOF_BITMAP_SET(ret, value);
3942 }
3943 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3944 break;
3945 case ANYOF_NLOWER:
3946 if (LOC)
3947 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3948 else {
3949 for (value = 0; value < 256; value++)
3950 if (!isLOWER(value))
3951 ANYOF_BITMAP_SET(ret, value);
3952 }
3953 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3954 break;
3955 case ANYOF_PRINT:
3956 if (LOC)
3957 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3958 else {
3959 for (value = 0; value < 256; value++)
3960 if (isPRINT(value))
3961 ANYOF_BITMAP_SET(ret, value);
3962 }
3963 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3964 break;
3965 case ANYOF_NPRINT:
3966 if (LOC)
3967 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3968 else {
3969 for (value = 0; value < 256; value++)
3970 if (!isPRINT(value))
3971 ANYOF_BITMAP_SET(ret, value);
3972 }
3973 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3974 break;
3975 case ANYOF_PSXSPC:
3976 if (LOC)
3977 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3978 else {
3979 for (value = 0; value < 256; value++)
3980 if (isPSXSPC(value))
3981 ANYOF_BITMAP_SET(ret, value);
3982 }
3983 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3984 break;
3985 case ANYOF_NPSXSPC:
3986 if (LOC)
3987 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3988 else {
3989 for (value = 0; value < 256; value++)
3990 if (!isPSXSPC(value))
3991 ANYOF_BITMAP_SET(ret, value);
3992 }
3993 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3994 break;
3995 case ANYOF_PUNCT:
3996 if (LOC)
3997 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3998 else {
3999 for (value = 0; value < 256; value++)
4000 if (isPUNCT(value))
4001 ANYOF_BITMAP_SET(ret, value);
4002 }
4003 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
4004 break;
4005 case ANYOF_NPUNCT:
4006 if (LOC)
4007 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4008 else {
4009 for (value = 0; value < 256; value++)
4010 if (!isPUNCT(value))
4011 ANYOF_BITMAP_SET(ret, value);
4012 }
4013 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4014 break;
4015 case ANYOF_SPACE:
4016 if (LOC)
4017 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4018 else {
4019 for (value = 0; value < 256; value++)
4020 if (isSPACE(value))
4021 ANYOF_BITMAP_SET(ret, value);
4022 }
4023 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4024 break;
4025 case ANYOF_NSPACE:
4026 if (LOC)
4027 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4028 else {
4029 for (value = 0; value < 256; value++)
4030 if (!isSPACE(value))
4031 ANYOF_BITMAP_SET(ret, value);
4032 }
4033 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4034 break;
4035 case ANYOF_UPPER:
4036 if (LOC)
4037 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4038 else {
4039 for (value = 0; value < 256; value++)
4040 if (isUPPER(value))
4041 ANYOF_BITMAP_SET(ret, value);
4042 }
4043 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4044 break;
4045 case ANYOF_NUPPER:
4046 if (LOC)
4047 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4048 else {
4049 for (value = 0; value < 256; value++)
4050 if (!isUPPER(value))
4051 ANYOF_BITMAP_SET(ret, value);
4052 }
4053 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4054 break;
4055 case ANYOF_XDIGIT:
4056 if (LOC)
4057 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4058 else {
4059 for (value = 0; value < 256; value++)
4060 if (isXDIGIT(value))
4061 ANYOF_BITMAP_SET(ret, value);
4062 }
4063 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4064 break;
4065 case ANYOF_NXDIGIT:
4066 if (LOC)
4067 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4068 else {
4069 for (value = 0; value < 256; value++)
4070 if (!isXDIGIT(value))
4071 ANYOF_BITMAP_SET(ret, value);
4072 }
4073 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4074 break;
4075 case ANYOF_MAX:
4076 /* this is to handle \p and \P */
4077 break;
4078 default:
4079 vFAIL("Invalid [::] class");
4080 break;
4081 }
4082 if (LOC)
4083 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4084 continue;
4085 }
4086 } /* end of namedclass \blah */
4087
4088 if (range) {
4089 if (prevvalue > (IV)value) /* b-a */ {
4090 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4091 RExC_parse - rangebegin,
4092 RExC_parse - rangebegin,
4093 rangebegin);
4094 range = 0; /* not a valid range */
4095 }
4096 }
4097 else {
4098 prevvalue = value; /* save the beginning of the range */
4099 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4100 RExC_parse[1] != ']') {
4101 RExC_parse++;
4102
4103 /* a bad range like \w-, [:word:]- ? */
4104 if (namedclass > OOB_NAMEDCLASS) {
4105 if (ckWARN(WARN_REGEXP))
4106 vWARN4(RExC_parse,
4107 "False [] range \"%*.*s\"",
4108 RExC_parse - rangebegin,
4109 RExC_parse - rangebegin,
4110 rangebegin);
4111 if (!SIZE_ONLY)
4112 ANYOF_BITMAP_SET(ret, '-');
4113 } else
4114 range = 1; /* yeah, it's a range! */
4115 continue; /* but do it the next time */
4116 }
4117 }
4118
4119 /* now is the next time */
4120 if (!SIZE_ONLY) {
4121 IV i;
4122
4123 if (prevvalue < 256) {
4124 IV ceilvalue = value < 256 ? value : 255;
4125
4126#ifdef EBCDIC
4127 /* In EBCDIC [\x89-\x91] should include
4128 * the \x8e but [i-j] should not. */
4129 if (literal_endpoint == 2 &&
4130 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4131 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4132 {
4133 if (isLOWER(prevvalue)) {
4134 for (i = prevvalue; i <= ceilvalue; i++)
4135 if (isLOWER(i))
4136 ANYOF_BITMAP_SET(ret, i);
4137 } else {
4138 for (i = prevvalue; i <= ceilvalue; i++)
4139 if (isUPPER(i))
4140 ANYOF_BITMAP_SET(ret, i);
4141 }
4142 }
4143 else
4144#endif
4145 for (i = prevvalue; i <= ceilvalue; i++)
4146 ANYOF_BITMAP_SET(ret, i);
4147 }
4148 if (value > 255 || UTF) {
4149 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4150 UV natvalue = NATIVE_TO_UNI(value);
4151
4152 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4153 if (prevnatvalue < natvalue) { /* what about > ? */
4154 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4155 prevnatvalue, natvalue);
4156 }
4157 else if (prevnatvalue == natvalue) {
4158 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4159 if (FOLD) {
4160 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4161 STRLEN foldlen;
4162 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4163
4164 /* If folding and foldable and a single
4165 * character, insert also the folded version
4166 * to the charclass. */
4167 if (f != value) {
4168 if (foldlen == (STRLEN)UNISKIP(f))
4169 Perl_sv_catpvf(aTHX_ listsv,
4170 "%04"UVxf"\n", f);
4171 else {
4172 /* Any multicharacter foldings
4173 * require the following transform:
4174 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4175 * where E folds into "pq" and F folds
4176 * into "rst", all other characters
4177 * fold to single characters. We save
4178 * away these multicharacter foldings,
4179 * to be later saved as part of the
4180 * additional "s" data. */
4181 SV *sv;
4182
4183 if (!unicode_alternate)
4184 unicode_alternate = newAV();
4185 sv = newSVpvn((char*)foldbuf, foldlen);
4186 SvUTF8_on(sv);
4187 av_push(unicode_alternate, sv);
4188 }
4189 }
4190
4191 /* If folding and the value is one of the Greek
4192 * sigmas insert a few more sigmas to make the
4193 * folding rules of the sigmas to work right.
4194 * Note that not all the possible combinations
4195 * are handled here: some of them are handled
4196 * by the standard folding rules, and some of
4197 * them (literal or EXACTF cases) are handled
4198 * during runtime in regexec.c:S_find_byclass(). */
4199 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4200 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4201 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4202 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4203 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4204 }
4205 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4206 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4207 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4208 }
4209 }
4210 }
4211#ifdef EBCDIC
4212 literal_endpoint = 0;
4213#endif
4214 }
4215
4216 range = 0; /* this range (if it was one) is done now */
4217 }
4218
4219 if (need_class) {
4220 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4221 if (SIZE_ONLY)
4222 RExC_size += ANYOF_CLASS_ADD_SKIP;
4223 else
4224 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4225 }
4226
4227 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4228 if (!SIZE_ONLY &&
4229 /* If the only flag is folding (plus possibly inversion). */
4230 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4231 ) {
4232 for (value = 0; value < 256; ++value) {
4233 if (ANYOF_BITMAP_TEST(ret, value)) {
4234 UV fold = PL_fold[value];
4235
4236 if (fold != value)
4237 ANYOF_BITMAP_SET(ret, fold);
4238 }
4239 }
4240 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4241 }
4242
4243 /* optimize inverted simple patterns (e.g. [^a-z]) */
4244 if (!SIZE_ONLY && optimize_invert &&
4245 /* If the only flag is inversion. */
4246 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4247 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4248 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4249 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4250 }
4251
4252 if (!SIZE_ONLY) {
4253 AV *av = newAV();
4254 SV *rv;
4255
4256 /* The 0th element stores the character class description
4257 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4258 * to initialize the appropriate swash (which gets stored in
4259 * the 1st element), and also useful for dumping the regnode.
4260 * The 2nd element stores the multicharacter foldings,
4261 * used later (regexec.c:S_reginclass()). */
4262 av_store(av, 0, listsv);
4263 av_store(av, 1, NULL);
4264 av_store(av, 2, (SV*)unicode_alternate);
4265 rv = newRV_noinc((SV*)av);
4266 n = add_data(pRExC_state, 1, "s");
4267 RExC_rx->data->data[n] = (void*)rv;
4268 ARG_SET(ret, n);
4269 }
4270
4271 return ret;
4272}
4273
4274STATIC char*
4275S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4276{
4277 char* retval = RExC_parse++;
4278
4279 for (;;) {
4280 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4281 RExC_parse[2] == '#') {
4282 while (*RExC_parse != ')') {
4283 if (RExC_parse == RExC_end)
4284 FAIL("Sequence (?#... not terminated");
4285 RExC_parse++;
4286 }
4287 RExC_parse++;
4288 continue;
4289 }
4290 if (RExC_flags & PMf_EXTENDED) {
4291 if (isSPACE(*RExC_parse)) {
4292 RExC_parse++;
4293 continue;
4294 }
4295 else if (*RExC_parse == '#') {
4296 while (RExC_parse < RExC_end)
4297 if (*RExC_parse++ == '\n') break;
4298 continue;
4299 }
4300 }
4301 return retval;
4302 }
4303}
4304
4305/*
4306- reg_node - emit a node
4307*/
4308STATIC regnode * /* Location. */
4309S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4310{
4311 register regnode *ret;
4312 register regnode *ptr;
4313
4314 ret = RExC_emit;
4315 if (SIZE_ONLY) {
4316 SIZE_ALIGN(RExC_size);
4317 RExC_size += 1;
4318 return(ret);
4319 }
4320
4321 NODE_ALIGN_FILL(ret);
4322 ptr = ret;
4323 FILL_ADVANCE_NODE(ptr, op);
4324 if (RExC_offsets) { /* MJD */
4325 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4326 "reg_node", __LINE__,
4327 reg_name[op],
4328 RExC_emit - RExC_emit_start > RExC_offsets[0]
4329 ? "Overwriting end of array!\n" : "OK",
4330 RExC_emit - RExC_emit_start,
4331 RExC_parse - RExC_start,
4332 RExC_offsets[0]));
4333 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4334 }
4335
4336 RExC_emit = ptr;
4337
4338 return(ret);
4339}
4340
4341/*
4342- reganode - emit a node with an argument
4343*/
4344STATIC regnode * /* Location. */
4345S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4346{
4347 register regnode *ret;
4348 register regnode *ptr;
4349
4350 ret = RExC_emit;
4351 if (SIZE_ONLY) {
4352 SIZE_ALIGN(RExC_size);
4353 RExC_size += 2;
4354 return(ret);
4355 }
4356
4357 NODE_ALIGN_FILL(ret);
4358 ptr = ret;
4359 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4360 if (RExC_offsets) { /* MJD */
4361 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4362 "reganode",
4363 __LINE__,
4364 reg_name[op],
4365 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4366 "Overwriting end of array!\n" : "OK",
4367 RExC_emit - RExC_emit_start,
4368 RExC_parse - RExC_start,
4369 RExC_offsets[0]));
4370 Set_Cur_Node_Offset;
4371 }
4372
4373 RExC_emit = ptr;
4374
4375 return(ret);
4376}
4377
4378/*
4379- reguni - emit (if appropriate) a Unicode character
4380*/
4381STATIC void
4382S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4383{
4384 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4385}
4386
4387/*
4388- reginsert - insert an operator in front of already-emitted operand
4389*
4390* Means relocating the operand.
4391*/
4392STATIC void
4393S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4394{
4395 register regnode *src;
4396 register regnode *dst;
4397 register regnode *place;
4398 register int offset = regarglen[(U8)op];
4399
4400/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4401
4402 if (SIZE_ONLY) {
4403 RExC_size += NODE_STEP_REGNODE + offset;
4404 return;
4405 }
4406
4407 src = RExC_emit;
4408 RExC_emit += NODE_STEP_REGNODE + offset;
4409 dst = RExC_emit;
4410 while (src > opnd) {
4411 StructCopy(--src, --dst, regnode);
4412 if (RExC_offsets) { /* MJD 20010112 */
4413 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4414 "reg_insert",
4415 __LINE__,
4416 reg_name[op],
4417 dst - RExC_emit_start > RExC_offsets[0]
4418 ? "Overwriting end of array!\n" : "OK",
4419 src - RExC_emit_start,
4420 dst - RExC_emit_start,
4421 RExC_offsets[0]));
4422 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4423 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4424 }
4425 }
4426
4427
4428 place = opnd; /* Op node, where operand used to be. */
4429 if (RExC_offsets) { /* MJD */
4430 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4431 "reginsert",
4432 __LINE__,
4433 reg_name[op],
4434 place - RExC_emit_start > RExC_offsets[0]
4435 ? "Overwriting end of array!\n" : "OK",
4436 place - RExC_emit_start,
4437 RExC_parse - RExC_start,
4438 RExC_offsets[0]));
4439 Set_Node_Offset(place, RExC_parse);
4440 Set_Node_Length(place, 1);
4441 }
4442 src = NEXTOPER(place);
4443 FILL_ADVANCE_NODE(place, op);
4444 Zero(src, offset, regnode);
4445}
4446
4447/*
4448- regtail - set the next-pointer at the end of a node chain of p to val.
4449*/
4450STATIC void
4451S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4452{
4453 register regnode *scan;
4454 register regnode *temp;
4455
4456 if (SIZE_ONLY)
4457 return;
4458
4459 /* Find last node. */
4460 scan = p;
4461 for (;;) {
4462 temp = regnext(scan);
4463 if (temp == NULL)
4464 break;
4465 scan = temp;
4466 }
4467
4468 if (reg_off_by_arg[OP(scan)]) {
4469 ARG_SET(scan, val - scan);
4470 }
4471 else {
4472 NEXT_OFF(scan) = val - scan;
4473 }
4474}
4475
4476/*
4477- regoptail - regtail on operand of first argument; nop if operandless
4478*/
4479STATIC void
4480S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4481{
4482 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4483 if (p == NULL || SIZE_ONLY)
4484 return;
4485 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4486 regtail(pRExC_state, NEXTOPER(p), val);
4487 }
4488 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4489 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4490 }
4491 else
4492 return;
4493}
4494
4495/*
4496 - regcurly - a little FSA that accepts {\d+,?\d*}
4497 */
4498STATIC I32
4499S_regcurly(pTHX_ register char *s)
4500{
4501 if (*s++ != '{')
4502 return FALSE;
4503 if (!isDIGIT(*s))
4504 return FALSE;
4505 while (isDIGIT(*s))
4506 s++;
4507 if (*s == ',')
4508 s++;
4509 while (isDIGIT(*s))
4510 s++;
4511 if (*s != '}')
4512 return FALSE;
4513 return TRUE;
4514}
4515
4516
4517#ifdef DEBUGGING
4518
4519STATIC regnode *
4520S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4521{
4522 register U8 op = EXACT; /* Arbitrary non-END op. */
4523 register regnode *next;
4524
4525 while (op != END && (!last || node < last)) {
4526 /* While that wasn't END last time... */
4527
4528 NODE_ALIGN(node);
4529 op = OP(node);
4530 if (op == CLOSE)
4531 l--;
4532 next = regnext(node);
4533 /* Where, what. */
4534 if (OP(node) == OPTIMIZED)
4535 goto after_print;
4536 regprop(sv, node);
4537 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4538 (int)(2*l + 1), "", SvPVX(sv));
4539 if (next == NULL) /* Next ptr. */
4540 PerlIO_printf(Perl_debug_log, "(0)");
4541 else
4542 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4543 (void)PerlIO_putc(Perl_debug_log, '\n');
4544 after_print:
4545 if (PL_regkind[(U8)op] == BRANCHJ) {
4546 register regnode *nnode = (OP(next) == LONGJMP
4547 ? regnext(next)
4548 : next);
4549 if (last && nnode > last)
4550 nnode = last;
4551 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4552 }
4553 else if (PL_regkind[(U8)op] == BRANCH) {
4554 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4555 }
4556 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4557 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4558 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4559 }
4560 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4561 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4562 next, sv, l + 1);
4563 }
4564 else if ( op == PLUS || op == STAR) {
4565 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4566 }
4567 else if (op == ANYOF) {
4568 /* arglen 1 + class block */
4569 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4570 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4571 node = NEXTOPER(node);
4572 }
4573 else if (PL_regkind[(U8)op] == EXACT) {
4574 /* Literal string, where present. */
4575 node += NODE_SZ_STR(node) - 1;
4576 node = NEXTOPER(node);
4577 }
4578 else {
4579 node = NEXTOPER(node);
4580 node += regarglen[(U8)op];
4581 }
4582 if (op == CURLYX || op == OPEN)
4583 l++;
4584 else if (op == WHILEM)
4585 l--;
4586 }
4587 return node;
4588}
4589
4590#endif /* DEBUGGING */
4591
4592/*
4593 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4594 */
4595void
4596Perl_regdump(pTHX_ regexp *r)
4597{
4598#ifdef DEBUGGING
4599 SV *sv = sv_newmortal();
4600
4601 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4602
4603 /* Header fields of interest. */
4604 if (r->anchored_substr)
4605 PerlIO_printf(Perl_debug_log,
4606 "anchored `%s%.*s%s'%s at %"IVdf" ",
4607 PL_colors[0],
4608 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4609 SvPVX(r->anchored_substr),
4610 PL_colors[1],
4611 SvTAIL(r->anchored_substr) ? "$" : "",
4612 (IV)r->anchored_offset);
4613 else if (r->anchored_utf8)
4614 PerlIO_printf(Perl_debug_log,
4615 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4616 PL_colors[0],
4617 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4618 SvPVX(r->anchored_utf8),
4619 PL_colors[1],
4620 SvTAIL(r->anchored_utf8) ? "$" : "",
4621 (IV)r->anchored_offset);
4622 if (r->float_substr)
4623 PerlIO_printf(Perl_debug_log,
4624 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4625 PL_colors[0],
4626 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4627 SvPVX(r->float_substr),
4628 PL_colors[1],
4629 SvTAIL(r->float_substr) ? "$" : "",
4630 (IV)r->float_min_offset, (UV)r->float_max_offset);
4631 else if (r->float_utf8)
4632 PerlIO_printf(Perl_debug_log,
4633 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4634 PL_colors[0],
4635 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4636 SvPVX(r->float_utf8),
4637 PL_colors[1],
4638 SvTAIL(r->float_utf8) ? "$" : "",
4639 (IV)r->float_min_offset, (UV)r->float_max_offset);
4640 if (r->check_substr || r->check_utf8)
4641 PerlIO_printf(Perl_debug_log,
4642 r->check_substr == r->float_substr
4643 && r->check_utf8 == r->float_utf8
4644 ? "(checking floating" : "(checking anchored");
4645 if (r->reganch & ROPT_NOSCAN)
4646 PerlIO_printf(Perl_debug_log, " noscan");
4647 if (r->reganch & ROPT_CHECK_ALL)
4648 PerlIO_printf(Perl_debug_log, " isall");
4649 if (r->check_substr || r->check_utf8)
4650 PerlIO_printf(Perl_debug_log, ") ");
4651
4652 if (r->regstclass) {
4653 regprop(sv, r->regstclass);
4654 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4655 }
4656 if (r->reganch & ROPT_ANCH) {
4657 PerlIO_printf(Perl_debug_log, "anchored");
4658 if (r->reganch & ROPT_ANCH_BOL)
4659 PerlIO_printf(Perl_debug_log, "(BOL)");
4660 if (r->reganch & ROPT_ANCH_MBOL)
4661 PerlIO_printf(Perl_debug_log, "(MBOL)");
4662 if (r->reganch & ROPT_ANCH_SBOL)
4663 PerlIO_printf(Perl_debug_log, "(SBOL)");
4664 if (r->reganch & ROPT_ANCH_GPOS)
4665 PerlIO_printf(Perl_debug_log, "(GPOS)");
4666 PerlIO_putc(Perl_debug_log, ' ');
4667 }
4668 if (r->reganch & ROPT_GPOS_SEEN)
4669 PerlIO_printf(Perl_debug_log, "GPOS ");
4670 if (r->reganch & ROPT_SKIP)
4671 PerlIO_printf(Perl_debug_log, "plus ");
4672 if (r->reganch & ROPT_IMPLICIT)
4673 PerlIO_printf(Perl_debug_log, "implicit ");
4674 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4675 if (r->reganch & ROPT_EVAL_SEEN)
4676 PerlIO_printf(Perl_debug_log, "with eval ");
4677 PerlIO_printf(Perl_debug_log, "\n");
4678 if (r->offsets) {
4679 U32 i;
4680 U32 len = r->offsets[0];
4681 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4682 for (i = 1; i <= len; i++)
4683 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4684 (UV)r->offsets[i*2-1],
4685 (UV)r->offsets[i*2]);
4686 PerlIO_printf(Perl_debug_log, "\n");
4687 }
4688#endif /* DEBUGGING */
4689}
4690
4691#ifdef DEBUGGING
4692
4693STATIC void
4694S_put_byte(pTHX_ SV *sv, int c)
4695{
4696 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4697 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4698 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4699 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4700 else
4701 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4702}
4703
4704#endif /* DEBUGGING */
4705
4706/*
4707- regprop - printable representation of opcode
4708*/
4709void
4710Perl_regprop(pTHX_ SV *sv, regnode *o)
4711{
4712#ifdef DEBUGGING
4713 register int k;
4714
4715 sv_setpvn(sv, "", 0);
4716 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4717 /* It would be nice to FAIL() here, but this may be called from
4718 regexec.c, and it would be hard to supply pRExC_state. */
4719 Perl_croak(aTHX_ "Corrupted regexp opcode");
4720 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4721
4722 k = PL_regkind[(U8)OP(o)];
4723
4724 if (k == EXACT) {
4725 SV *dsv = sv_2mortal(newSVpvn("", 0));
4726 /* Using is_utf8_string() is a crude hack but it may
4727 * be the best for now since we have no flag "this EXACTish
4728 * node was UTF-8" --jhi */
4729 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4730 char *s = do_utf8 ?
4731 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4732 UNI_DISPLAY_REGEX) :
4733 STRING(o);
4734 int len = do_utf8 ?
4735 strlen(s) :
4736 STR_LEN(o);
4737 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4738 PL_colors[0],
4739 len, s,
4740 PL_colors[1]);
4741 }
4742 else if (k == CURLY) {
4743 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4744 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4745 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4746 }
4747 else if (k == WHILEM && o->flags) /* Ordinal/of */
4748 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4749 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4750 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4751 else if (k == LOGICAL)
4752 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4753 else if (k == ANYOF) {
4754 int i, rangestart = -1;
4755 U8 flags = ANYOF_FLAGS(o);
4756 const char * const anyofs[] = { /* Should be synchronized with
4757 * ANYOF_ #xdefines in regcomp.h */
4758 "\\w",
4759 "\\W",
4760 "\\s",
4761 "\\S",
4762 "\\d",
4763 "\\D",
4764 "[:alnum:]",
4765 "[:^alnum:]",
4766 "[:alpha:]",
4767 "[:^alpha:]",
4768 "[:ascii:]",
4769 "[:^ascii:]",
4770 "[:ctrl:]",
4771 "[:^ctrl:]",
4772 "[:graph:]",
4773 "[:^graph:]",
4774 "[:lower:]",
4775 "[:^lower:]",
4776 "[:print:]",
4777 "[:^print:]",
4778 "[:punct:]",
4779 "[:^punct:]",
4780 "[:upper:]",
4781 "[:^upper:]",
4782 "[:xdigit:]",
4783 "[:^xdigit:]",
4784 "[:space:]",
4785 "[:^space:]",
4786 "[:blank:]",
4787 "[:^blank:]"
4788 };
4789
4790 if (flags & ANYOF_LOCALE)
4791 sv_catpv(sv, "{loc}");
4792 if (flags & ANYOF_FOLD)
4793 sv_catpv(sv, "{i}");
4794 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4795 if (flags & ANYOF_INVERT)
4796 sv_catpv(sv, "^");
4797 for (i = 0; i <= 256; i++) {
4798 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4799 if (rangestart == -1)
4800 rangestart = i;
4801 } else if (rangestart != -1) {
4802 if (i <= rangestart + 3)
4803 for (; rangestart < i; rangestart++)
4804 put_byte(sv, rangestart);
4805 else {
4806 put_byte(sv, rangestart);
4807 sv_catpv(sv, "-");
4808 put_byte(sv, i - 1);
4809 }
4810 rangestart = -1;
4811 }
4812 }
4813
4814 if (o->flags & ANYOF_CLASS)
4815 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4816 if (ANYOF_CLASS_TEST(o,i))
4817 sv_catpv(sv, anyofs[i]);
4818
4819 if (flags & ANYOF_UNICODE)
4820 sv_catpv(sv, "{unicode}");
4821 else if (flags & ANYOF_UNICODE_ALL)
4822 sv_catpv(sv, "{unicode_all}");
4823
4824 {
4825 SV *lv;
4826 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4827
4828 if (lv) {
4829 if (sw) {
4830 U8 s[UTF8_MAXLEN+1];
4831
4832 for (i = 0; i <= 256; i++) { /* just the first 256 */
4833 U8 *e = uvchr_to_utf8(s, i);
4834
4835 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4836 if (rangestart == -1)
4837 rangestart = i;
4838 } else if (rangestart != -1) {
4839 U8 *p;
4840
4841 if (i <= rangestart + 3)
4842 for (; rangestart < i; rangestart++) {
4843 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4844 put_byte(sv, *p);
4845 }
4846 else {
4847 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4848 put_byte(sv, *p);
4849 sv_catpv(sv, "-");
4850 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4851 put_byte(sv, *p);
4852 }
4853 rangestart = -1;
4854 }
4855 }
4856
4857 sv_catpv(sv, "..."); /* et cetera */
4858 }
4859
4860 {
4861 char *s = savepv(SvPVX(lv));
4862 char *origs = s;
4863
4864 while(*s && *s != '\n') s++;
4865
4866 if (*s == '\n') {
4867 char *t = ++s;
4868
4869 while (*s) {
4870 if (*s == '\n')
4871 *s = ' ';
4872 s++;
4873 }
4874 if (s[-1] == ' ')
4875 s[-1] = 0;
4876
4877 sv_catpv(sv, t);
4878 }
4879
4880 Safefree(origs);
4881 }
4882 }
4883 }
4884
4885 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4886 }
4887 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4888 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4889#endif /* DEBUGGING */
4890}
4891
4892SV *
4893Perl_re_intuit_string(pTHX_ regexp *prog)
4894{ /* Assume that RE_INTUIT is set */
4895 DEBUG_r(
4896 { STRLEN n_a;
4897 char *s = SvPV(prog->check_substr
4898 ? prog->check_substr : prog->check_utf8, n_a);
4899
4900 if (!PL_colorset) reginitcolors();
4901 PerlIO_printf(Perl_debug_log,
4902 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4903 PL_colors[4],
4904 prog->check_substr ? "" : "utf8 ",
4905 PL_colors[5],PL_colors[0],
4906 s,
4907 PL_colors[1],
4908 (strlen(s) > 60 ? "..." : ""));
4909 } );
4910
4911 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4912}
4913
4914void
4915Perl_pregfree(pTHX_ struct regexp *r)
4916{
4917#ifdef DEBUGGING
4918 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4919#endif
4920
4921 if (!r || (--r->refcnt > 0))
4922 return;
4923 DEBUG_r({
4924 int len;
4925 char *s;
4926
4927 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4928 r->prelen, 60, UNI_DISPLAY_REGEX)
4929 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4930 len = SvCUR(dsv);
4931 if (!PL_colorset)
4932 reginitcolors();
4933 PerlIO_printf(Perl_debug_log,
4934 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4935 PL_colors[4],PL_colors[5],PL_colors[0],
4936 len, len, s,
4937 PL_colors[1],
4938 len > 60 ? "..." : "");
4939 });
4940
4941 if (r->precomp)
4942 Safefree(r->precomp);
4943 if (r->offsets) /* 20010421 MJD */
4944 Safefree(r->offsets);
4945 RX_MATCH_COPY_FREE(r);
4946#ifdef PERL_COPY_ON_WRITE
4947 if (r->saved_copy)
4948 SvREFCNT_dec(r->saved_copy);
4949#endif
4950 if (r->substrs) {
4951 if (r->anchored_substr)
4952 SvREFCNT_dec(r->anchored_substr);
4953 if (r->anchored_utf8)
4954 SvREFCNT_dec(r->anchored_utf8);
4955 if (r->float_substr)
4956 SvREFCNT_dec(r->float_substr);
4957 if (r->float_utf8)
4958 SvREFCNT_dec(r->float_utf8);
4959 Safefree(r->substrs);
4960 }
4961 if (r->data) {
4962 int n = r->data->count;
4963 PAD* new_comppad = NULL;
4964 PAD* old_comppad;
4965
4966 while (--n >= 0) {
4967 /* If you add a ->what type here, update the comment in regcomp.h */
4968 switch (r->data->what[n]) {
4969 case 's':
4970 SvREFCNT_dec((SV*)r->data->data[n]);
4971 break;
4972 case 'f':
4973 Safefree(r->data->data[n]);
4974 break;
4975 case 'p':
4976 new_comppad = (AV*)r->data->data[n];
4977 break;
4978 case 'o':
4979 if (new_comppad == NULL)
4980 Perl_croak(aTHX_ "panic: pregfree comppad");
4981 PAD_SAVE_LOCAL(old_comppad,
4982 /* Watch out for global destruction's random ordering. */
4983 (SvTYPE(new_comppad) == SVt_PVAV) ?
4984 new_comppad : Null(PAD *)
4985 );
4986 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4987 op_free((OP_4tree*)r->data->data[n]);
4988 }
4989
4990 PAD_RESTORE_LOCAL(old_comppad);
4991 SvREFCNT_dec((SV*)new_comppad);
4992 new_comppad = NULL;
4993 break;
4994 case 'n':
4995 break;
4996 default:
4997 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4998 }
4999 }
5000 Safefree(r->data->what);
5001 Safefree(r->data);
5002 }
5003 Safefree(r->startp);
5004 Safefree(r->endp);
5005 Safefree(r);
5006}
5007
5008/*
5009 - regnext - dig the "next" pointer out of a node
5010 *
5011 * [Note, when REGALIGN is defined there are two places in regmatch()
5012 * that bypass this code for speed.]
5013 */
5014regnode *
5015Perl_regnext(pTHX_ register regnode *p)
5016{
5017 register I32 offset;
5018
5019 if (p == &PL_regdummy)
5020 return(NULL);
5021
5022 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5023 if (offset == 0)
5024 return(NULL);
5025
5026 return(p+offset);
5027}
5028
5029STATIC void
5030S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5031{
5032 va_list args;
5033 STRLEN l1 = strlen(pat1);
5034 STRLEN l2 = strlen(pat2);
5035 char buf[512];
5036 SV *msv;
5037 char *message;
5038
5039 if (l1 > 510)
5040 l1 = 510;
5041 if (l1 + l2 > 510)
5042 l2 = 510 - l1;
5043 Copy(pat1, buf, l1 , char);
5044 Copy(pat2, buf + l1, l2 , char);
5045 buf[l1 + l2] = '\n';
5046 buf[l1 + l2 + 1] = '\0';
5047#ifdef I_STDARG
5048 /* ANSI variant takes additional second argument */
5049 va_start(args, pat2);
5050#else
5051 va_start(args);
5052#endif
5053 msv = vmess(buf, &args);
5054 va_end(args);
5055 message = SvPV(msv,l1);
5056 if (l1 > 512)
5057 l1 = 512;
5058 Copy(message, buf, l1 , char);
5059 buf[l1-1] = '\0'; /* Overwrite \n */
5060 Perl_croak(aTHX_ "%s", buf);
5061}
5062
5063/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5064
5065void
5066Perl_save_re_context(pTHX)
5067{
5068 SAVEI32(PL_reg_flags); /* from regexec.c */
5069 SAVEPPTR(PL_bostr);
5070 SAVEPPTR(PL_reginput); /* String-input pointer. */
5071 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5072 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5073 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5074 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5075 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5076 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
5077 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5078 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5079 PL_reg_start_tmp = 0;
5080 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5081 PL_reg_start_tmpl = 0;
5082 SAVEVPTR(PL_regdata);
5083 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5084 SAVEI32(PL_regnarrate); /* from regexec.c */
5085 SAVEVPTR(PL_regprogram); /* from regexec.c */
5086 SAVEINT(PL_regindent); /* from regexec.c */
5087 SAVEVPTR(PL_regcc); /* from regexec.c */
5088 SAVEVPTR(PL_curcop);
5089 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5090 SAVEVPTR(PL_reg_re); /* from regexec.c */
5091 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5092 SAVESPTR(PL_reg_sv); /* from regexec.c */
5093 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
5094 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5095 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5096 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5097 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5098 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5099 PL_reg_oldsaved = Nullch;
5100 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5101 PL_reg_oldsavedlen = 0;
5102#ifdef PERL_COPY_ON_WRITE
5103 SAVESPTR(PL_nrs);
5104 PL_nrs = Nullsv;
5105#endif
5106 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5107 PL_reg_maxiter = 0;
5108 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5109 PL_reg_leftiter = 0;
5110 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5111 PL_reg_poscache = Nullch;
5112 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5113 PL_reg_poscache_size = 0;
5114 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5115 SAVEI32(PL_regnpar); /* () count. */
5116 SAVEI32(PL_regsize); /* from regexec.c */
5117
5118 {
5119 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5120 U32 i;
5121 GV *mgv;
5122 REGEXP *rx;
5123 char digits[16];
5124
5125 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5126 for (i = 1; i <= rx->nparens; i++) {
5127 sprintf(digits, "%lu", (long)i);
5128 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5129 save_scalar(mgv);
5130 }
5131 }
5132 }
5133
5134#ifdef DEBUGGING
5135 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5136#endif
5137}
5138
5139static void
5140clear_re(pTHX_ void *r)
5141{
5142 ReREFCNT_dec((regexp *)r);
5143}
5144