This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen embedvar.h
[perl5.git] / regcomp.h
CommitLineData
a0d0e21e 1/* regcomp.h
a687059c
LW
2 */
3
c277df42
IZ
4typedef OP OP_4tree; /* Will be redefined later. */
5
a687059c
LW
6/*
7 * The "internal use only" fields in regexp.h are present to pass info from
8 * compile to execute that permits the execute phase to run lots faster on
9 * simple cases. They are:
10 *
79072805 11 * regstart sv that must begin a match; Nullch if none obvious
a687059c
LW
12 * reganch is the match anchored (at beginning-of-line only)?
13 * regmust string (pointer into program) that match must include, or NULL
79072805 14 * [regmust changed to SV* for bminstr()--law]
a687059c
LW
15 * regmlen length of regmust string
16 * [regmlen not used currently]
17 *
18 * Regstart and reganch permit very fast decisions on suitable starting points
19 * for a match, cutting down the work a lot. Regmust permits fast rejection
20 * of lines that cannot possibly match. The regmust tests are costly enough
e50aee73 21 * that pregcomp() supplies a regmust only if the r.e. contains something
a687059c
LW
22 * potentially expensive (at present, the only such thing detected is * or +
23 * at the start of the r.e., which can involve a lot of backup). Regmlen is
e50aee73 24 * supplied because the test in pregexec() needs it and pregcomp() is computing
a687059c
LW
25 * it anyway.
26 * [regmust is now supplied always. The tests that use regmust have a
27 * heuristic that disables the test if it usually matches.]
28 *
29 * [In fact, we now use regmust in many cases to locate where the search
30 * starts in the string, so if regback is >= 0, the regmust search is never
31 * wasted effort. The regback variable says how many characters back from
32 * where regmust matched is the earliest possible start of the match.
33 * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.]
34 */
35
c277df42
IZ
36/* #ifndef gould */
37/* #ifndef cray */
38/* #ifndef eta10 */
39#define REGALIGN
40/* #endif */
41/* #endif */
42/* #endif */
43
44#ifdef REGALIGN
45# define REGALIGN_STRUCT
46#endif
47
a687059c
LW
48/*
49 * Structure for regexp "program". This is essentially a linear encoding
50 * of a nondeterministic finite-state machine (aka syntax charts or
51 * "railroad normal form" in parsing technology). Each node is an opcode
52 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
53 * all nodes except BRANCH implement concatenation; a "next" pointer with
54 * a BRANCH on both ends of it is connecting two alternatives. (Here we
55 * have one of the subtle syntax dependencies: an individual BRANCH (as
56 * opposed to a collection of them) is never concatenated with anything
57 * because of operator precedence.) The operand of some types of node is
58 * a literal string; for others, it is a node leading into a sub-FSM. In
59 * particular, the operand of a BRANCH node is the first node of the branch.
60 * (NB this is *not* a tree structure: the tail of the branch connects
61 * to the thing following the set of BRANCHes.) The opcodes are:
62 */
63
64/* definition number opnd? meaning */
bbce6d69
PP
65#define END 0 /* no End of program. */
66#define BOL 1 /* no Match "" at beginning of line. */
67#define MBOL 2 /* no Same, assuming multiline. */
68#define SBOL 3 /* no Same, assuming singleline. */
69#define EOL 4 /* no Match "" at end of line. */
70#define MEOL 5 /* no Same, assuming multiline. */
71#define SEOL 6 /* no Same, assuming singleline. */
72#define ANY 7 /* no Match any one character (except newline). */
73#define SANY 8 /* no Match any one character. */
74#define ANYOF 9 /* sv Match character in (or not in) this class. */
a0d0e21e
LW
75#define CURLY 10 /* sv Match this simple thing {n,m} times. */
76#define CURLYX 11 /* sv Match this complex thing {n,m} times. */
77#define BRANCH 12 /* node Match this alternative, or the next... */
78#define BACK 13 /* no Match "", "next" ptr points backward. */
bbce6d69
PP
79#define EXACT 14 /* sv Match this string (preceded by length). */
80#define EXACTF 15 /* sv Match this string, folded (prec. by length). */
81#define EXACTFL 16 /* sv Match this string, folded in locale (w/len). */
82#define NOTHING 17 /* no Match empty string. */
83#define STAR 18 /* node Match this (simple) thing 0 or more times. */
84#define PLUS 19 /* node Match this (simple) thing 1 or more times. */
a0d0e21e 85#define BOUND 20 /* no Match "" at any word boundary */
bbce6d69
PP
86#define BOUNDL 21 /* no Match "" at any word boundary */
87#define NBOUND 22 /* no Match "" at any word non-boundary */
88#define NBOUNDL 23 /* no Match "" at any word non-boundary */
c277df42
IZ
89#define REF 24 /* num Match some already matched string */
90#define OPEN 25 /* num Mark this point in input as start of #n. */
91#define CLOSE 26 /* num Analogous to OPEN. */
92#define MINMOD 27 /* no Next operator is not greedy. */
93#define GPOS 28 /* no Matches where last m//g left off. */
94#define IFMATCH 29 /* off Succeeds if the following matches. */
95#define UNLESSM 30 /* off Fails if the following matches. */
96#define SUCCEED 31 /* no Return from a subroutine, basically. */
97#define WHILEM 32 /* no Do curly processing and see if rest matches. */
98#define ALNUM 33 /* no Match any alphanumeric character */
99#define ALNUML 34 /* no Match any alphanumeric char in locale */
100#define NALNUM 35 /* no Match any non-alphanumeric character */
101#define NALNUML 36 /* no Match any non-alphanumeric char in locale */
102#define SPACE 37 /* no Match any whitespace character */
103#define SPACEL 38 /* no Match any whitespace char in locale */
104#define NSPACE 39 /* no Match any non-whitespace character */
105#define NSPACEL 40 /* no Match any non-whitespace char in locale */
106#define DIGIT 41 /* no Match any numeric character */
107#define NDIGIT 42 /* no Match any non-numeric character */
108#define CURLYM 43 /* no Match this medium-complex thing {n,m} times. */
109#define CURLYN 44 /* no Match next-after-this simple thing
110 {n,m} times, set parenths. */
111#define TAIL 45 /* no Match empty string. Can jump here from outside. */
112#define REFF 46 /* num Match already matched string, folded */
113#define REFFL 47 /* num Match already matched string, folded in loc. */
114#define EVAL 48 /* evl Execute some Perl code. */
115#define LONGJMP 49 /* off Jump far away, requires REGALIGN_STRUCT. */
116#define BRANCHJ 50 /* off BRANCH with long offset, requires REGALIGN_STRUCT. */
117#define IFTHEN 51 /* off Switch, should be preceeded by switcher . */
118#define GROUPP 52 /* num Whether the group matched. */
119#define LOGICAL 53 /* no Next opcode should set the flag only. */
120#define SUSPEND 54 /* off "Independent" sub-RE. */
121#define RENUM 55 /* off Group with independently numbered parens. */
122#define OPTIMIZED 56 /* off Placeholder for dump. */
a687059c
LW
123
124/*
125 * Opcode notes:
126 *
127 * BRANCH The set of branches constituting a single choice are hooked
128 * together with their "next" pointers, since precedence prevents
129 * anything being concatenated to any individual branch. The
130 * "next" pointer of the last BRANCH in a choice points to the
131 * thing following the whole choice. This is also where the
132 * final "next" pointer of each individual branch points; each
133 * branch starts with the operand node of a BRANCH node.
134 *
135 * BACK Normal "next" pointers all implicitly point forward; BACK
136 * exists to make loop structures possible.
137 *
138 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
139 * BRANCH structures using BACK. Simple cases (one character
140 * per match) are implemented with STAR and PLUS for speed
141 * and to minimize recursive plunges.
142 *
c277df42 143 * OPEN,CLOSE,GROUPP ...are numbered at compile time.
a687059c
LW
144 */
145
fe14fcc3 146#ifndef DOINIT
161b471a 147EXTCONST U8 regkind[];
a0d0e21e 148#else
161b471a 149EXTCONST U8 regkind[] = {
a0d0e21e
LW
150 END,
151 BOL,
152 BOL,
153 BOL,
154 EOL,
155 EOL,
156 EOL,
157 ANY,
158 ANY,
159 ANYOF,
160 CURLY,
161 CURLY,
162 BRANCH,
163 BACK,
bbce6d69
PP
164 EXACT,
165 EXACT,
166 EXACT,
a0d0e21e
LW
167 NOTHING,
168 STAR,
169 PLUS,
bbce6d69 170 BOUND,
a0d0e21e
LW
171 BOUND,
172 NBOUND,
bbce6d69 173 NBOUND,
a0d0e21e
LW
174 REF,
175 OPEN,
176 CLOSE,
177 MINMOD,
774d564b 178 GPOS,
c277df42
IZ
179 BRANCHJ,
180 BRANCHJ,
a0d0e21e 181 END,
bbce6d69
PP
182 WHILEM,
183 ALNUM,
184 ALNUM,
185 NALNUM,
186 NALNUM,
187 SPACE,
188 SPACE,
189 NSPACE,
190 NSPACE,
191 DIGIT,
192 NDIGIT,
c277df42
IZ
193 CURLY,
194 CURLY,
195 NOTHING,
196 REF,
197 REF,
198 EVAL,
199 LONGJMP,
200 BRANCHJ,
201 BRANCHJ,
202 GROUPP,
203 LOGICAL,
204 BRANCHJ,
205 BRANCHJ,
206 NOTHING,
a0d0e21e 207};
fe14fcc3
LW
208#endif
209
c277df42 210/* The following have no fixed length. char* since we do strchr on it. */
a687059c 211#ifndef DOINIT
fc1ce8cc 212EXTCONST char varies[];
a687059c 213#else
fc1ce8cc 214EXTCONST char varies[] = {
c277df42
IZ
215 BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL,
216 WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, 0
bbce6d69 217};
a687059c
LW
218#endif
219
c277df42 220/* The following always have a length of 1. char* since we do strchr on it. */
a687059c 221#ifndef DOINIT
fc1ce8cc 222EXTCONST char simple[];
a687059c 223#else
fc1ce8cc 224EXTCONST char simple[] = {
bbce6d69
PP
225 ANY, SANY, ANYOF,
226 ALNUM, ALNUML, NALNUM, NALNUML,
227 SPACE, SPACEL, NSPACE, NSPACEL,
228 DIGIT, NDIGIT, 0
229};
a687059c
LW
230#endif
231
a687059c
LW
232/*
233 * A node is one char of opcode followed by two chars of "next" pointer.
234 * "Next" pointers are stored as two 8-bit pieces, high order first. The
235 * value is a positive offset from the opcode of the node containing it.
236 * An operand, if any, simply follows the node. (Note that much of the
237 * code generation knows about this implicit relationship.)
238 *
239 * Using two bytes for the "next" pointer is vast overkill for most things,
240 * but allows patterns to get big without disasters.
241 *
242 * [If REGALIGN is defined, the "next" pointer is always aligned on an even
243 * boundary, and reads the offset directly as a short. Also, there is no
244 * special test to reverse the sign of BACK pointers since the offset is
245 * stored negative.]
246 */
247
c277df42
IZ
248#ifdef REGALIGN_STRUCT
249
250struct regnode_string {
251 U8 flags;
252 U8 type;
253 U16 next_off;
254 U8 string[1];
255};
a687059c 256
c277df42
IZ
257struct regnode_1 {
258 U8 flags;
259 U8 type;
260 U16 next_off;
261 U32 arg1;
262};
263
264struct regnode_2 {
265 U8 flags;
266 U8 type;
267 U16 next_off;
268 U16 arg1;
269 U16 arg2;
270};
271
272#endif
273
83cfe48c
IZ
274#ifndef
275# define REG_INFTY I16_MAX
276#endif
a687059c 277
a687059c 278#ifdef REGALIGN
c277df42
IZ
279# define ARG_VALUE(arg) (arg)
280# define ARG__SET(arg,val) ((arg) = (val))
a687059c 281#else
c277df42
IZ
282# define ARG_VALUE(arg) (((*((char*)&arg)&0377)<<8) + (*(((char*)&arg)+1)&0377))
283# define ARG__SET(arg,val) (((char*)&arg)[0] = (val) >> 8; ((char*)&arg)[1] = (val) & 0377;)
a687059c 284#endif
c277df42
IZ
285
286#define ARG(p) ARG_VALUE(ARG_LOC(p))
287#define ARG1(p) ARG_VALUE(ARG1_LOC(p))
288#define ARG2(p) ARG_VALUE(ARG2_LOC(p))
289#define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val))
290#define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val))
291#define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val))
292
293#ifndef lint
294# ifdef REGALIGN
295# ifdef REGALIGN_STRUCT
296# define NEXT_OFF(p) ((p)->next_off)
297# define NODE_ALIGN(node)
298# define NODE_ALIGN_FILL(node) ((node)->flags = 0xde) /* deadbeef */
299# else
300# define NEXT_OFF(p) (*(short*)(p+1))
301# define NODE_ALIGN(node) ((!((long)node & 1)) ? node++ : 0)
302# define NODE_ALIGN_FILL(node) ((!((long)node & 1)) ? *node++ = 127 : 0)
303# endif
304# else
305# define NEXT_OFF(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
306# define NODE_ALIGN(node)
307# define NODE_ALIGN_FILL(node)
308# endif
a687059c 309#else /* lint */
c277df42
IZ
310# define NEXT_OFF(p) 0
311# define NODE_ALIGN(node)
312# define NODE_ALIGN_FILL(node)
a687059c
LW
313#endif /* lint */
314
c277df42
IZ
315#define SIZE_ALIGN NODE_ALIGN
316
317#ifdef REGALIGN_STRUCT
318# define OP(p) ((p)->type)
319# define OPERAND(p) (((struct regnode_string *)p)->string)
320# define NODE_ALIGN(node)
321# define ARG_LOC(p) (((struct regnode_1 *)p)->arg1)
322# define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1)
323# define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2)
324# define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */
325# define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2)
326#else
327# define OP(p) (*(p))
328# define OPERAND(p) ((p) + 3)
329# define ARG_LOC(p) (*(unsigned short*)(p+3))
330# define ARG1_LOC(p) (*(unsigned short*)(p+3))
331# define ARG2_LOC(p) (*(unsigned short*)(p+5))
332typedef char* regnode;
333# define NODE_STEP_REGNODE NODE_STEP_B
334# define EXTRA_STEP_2ARGS 4
335#endif
a687059c
LW
336
337#ifdef REGALIGN
c277df42 338# define NODE_STEP_B 4
a687059c 339#else
c277df42
IZ
340# define NODE_STEP_B 3
341#endif
342
343#define NEXTOPER(p) ((p) + NODE_STEP_REGNODE)
344#define PREVOPER(p) ((p) - NODE_STEP_REGNODE)
345
346#ifdef REGALIGN_STRUCT
347# define FILL_ADVANCE_NODE(ptr, op) STMT_START { \
348 (ptr)->type = op; (ptr)->next_off = 0; (ptr)++; } STMT_END
349# define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \
350 ARG_SET(ptr, arg); FILL_ADVANCE_NODE(ptr, op); (ptr) += 1; } STMT_END
351#else
352# define FILL_ADVANCE_NODE(ptr, op) STMT_START { \
353 *(ptr)++ = op; *(ptr)++ = '\0'; *(ptr)++ = '\0'; } STMT_END
354# define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \
355 ARG_SET(ptr, arg); FILL_ADVANCE_NODE(ptr, op); (ptr) += 2; } STMT_END
a687059c
LW
356#endif
357
358#define MAGIC 0234
359
c277df42
IZ
360#define SIZE_ONLY (regcode == &regdummy)
361
bbce6d69
PP
362/* Flags for first parameter byte of ANYOF */
363#define ANYOF_INVERT 0x40
364#define ANYOF_FOLD 0x20
365#define ANYOF_LOCALE 0x10
366#define ANYOF_ISA 0x0F
367#define ANYOF_ALNUML 0x08
368#define ANYOF_NALNUML 0x04
369#define ANYOF_SPACEL 0x02
370#define ANYOF_NSPACEL 0x01
371
ae5c130c
GS
372/* Utility macros for bitmap of ANYOF */
373#define ANYOF_BYTE(p,c) (p)[1 + (((c) >> 3) & 31)]
374#define ANYOF_BIT(c) (1 << ((c) & 7))
375#define ANYOF_SET(p,c) (ANYOF_BYTE(p,c) |= ANYOF_BIT(c))
376#define ANYOF_CLEAR(p,c) (ANYOF_BYTE(p,c) &= ~ANYOF_BIT(c))
377#define ANYOF_TEST(p,c) (ANYOF_BYTE(p,c) & ANYOF_BIT(c))
378
c277df42
IZ
379#ifdef REGALIGN_STRUCT
380#define ANY_SKIP ((33 - 1)/sizeof(regnode) + 1)
381#else
382#define ANY_SKIP 32 /* overwrite the first byte of
383 * the next guy. */
384#endif
385
a687059c
LW
386/*
387 * Utility definitions.
388 */
389#ifndef lint
2304df62 390#ifndef CHARMASK
a687059c
LW
391#define UCHARAT(p) ((int)*(unsigned char *)(p))
392#else
2304df62 393#define UCHARAT(p) ((int)*(p)&CHARMASK)
a687059c
LW
394#endif
395#else /* lint */
396#define UCHARAT(p) regdummy
397#endif /* lint */
398
c277df42
IZ
399#define FAIL(m) croak ("/%.127s/: %s", regprecomp,m)
400#define FAIL2(pat,m) re_croak2("/%.127s/: ",pat,regprecomp,m)
401
402#define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode))
403
404#ifdef REG_COMP_C
405const static U8 regarglen[] = {
406# ifdef REGALIGN_STRUCT
407 0,0,0,0,0,0,0,0,0,0,
408 /*CURLY*/ EXTRA_SIZE(struct regnode_2),
409 /*CURLYX*/ EXTRA_SIZE(struct regnode_2),
410 0,0,0,0,0,0,0,0,0,0,0,0,
411 /*REF*/ EXTRA_SIZE(struct regnode_1),
412 /*OPEN*/ EXTRA_SIZE(struct regnode_1),
413 /*CLOSE*/ EXTRA_SIZE(struct regnode_1),
414 0,0,
415 /*IFMATCH*/ EXTRA_SIZE(struct regnode_1),
416 /*UNLESSM*/ EXTRA_SIZE(struct regnode_1),
417 0,0,0,0,0,0,0,0,0,0,0,0,
418 /*CURLYM*/ EXTRA_SIZE(struct regnode_2),
419 /*CURLYN*/ EXTRA_SIZE(struct regnode_2),
420 0,
421 /*REFF*/ EXTRA_SIZE(struct regnode_1),
422 /*REFFL*/ EXTRA_SIZE(struct regnode_1),
423 /*EVAL*/ EXTRA_SIZE(struct regnode_1),
424 /*LONGJMP*/ EXTRA_SIZE(struct regnode_1),
425 /*BRANCHJ*/ EXTRA_SIZE(struct regnode_1),
426 /*IFTHEN*/ EXTRA_SIZE(struct regnode_1),
427 /*GROUPP*/ EXTRA_SIZE(struct regnode_1),
428 /*LOGICAL*/ 0,
429 /*SUSPEND*/ EXTRA_SIZE(struct regnode_1),
430 /*RENUM*/ EXTRA_SIZE(struct regnode_1), 0,
431# else
432 0,0,0,0,0,0,0,0,0,0,
433 /*CURLY*/ 4, /*CURLYX*/ 4,
434 0,0,0,0,0,0,0,0,0,0,0,0,
435 /*REF*/ 2, /*OPEN*/ 2, /*CLOSE*/ 2,
436 0,0, /*IFMATCH*/ 2, /*UNLESSM*/ 2,
437 0,0,0,0,0,0,0,0,0,0,0,0,/*CURLYM*/ 4,/*CURLYN*/ 4,
438 0, /*REFF*/ 2, /*REFFL*/ 2, /*EVAL*/ 2, /*LONGJMP*/ 2, /*BRANCHJ*/ 2,
439 /*IFTHEN*/ 2, /*GROUPP*/ 2, /*LOGICAL*/ 0, /*RENUM*/ 2, /*RENUM*/ 2, 0,
440# endif
441};
442
443const static char reg_off_by_arg[] = {
444 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0 .. 15 */
445 0,0,0,0,0,0,0,0,0,0,0,0,0, /*IFMATCH*/ 2, /*UNLESSM*/ 2, 0,
446 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 32 .. 47 */
447 0, /*LONGJMP*/ 1, /*BRANCHJ*/ 1, /*IFTHEN*/ 1, 0, 0,
448 /*RENUM*/ 1, /*RENUM*/ 1,0,
449};
450#endif
451
c277df42
IZ
452#define REG_SEEN_ZERO_LEN 1
453#define REG_SEEN_LOOKBEHIND 2
454#define REG_SEEN_GPOS 4
ce862d02 455#define REG_SEEN_EVAL 8
c277df42 456