This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix unnecessary re-linking
[perl5.git] / regcomp.h
CommitLineData
a0d0e21e 1/* regcomp.h
a687059c
LW
2 */
3
4/*
5 * The "internal use only" fields in regexp.h are present to pass info from
6 * compile to execute that permits the execute phase to run lots faster on
7 * simple cases. They are:
8 *
79072805 9 * regstart sv that must begin a match; Nullch if none obvious
a687059c
LW
10 * reganch is the match anchored (at beginning-of-line only)?
11 * regmust string (pointer into program) that match must include, or NULL
79072805 12 * [regmust changed to SV* for bminstr()--law]
a687059c
LW
13 * regmlen length of regmust string
14 * [regmlen not used currently]
15 *
16 * Regstart and reganch permit very fast decisions on suitable starting points
17 * for a match, cutting down the work a lot. Regmust permits fast rejection
18 * of lines that cannot possibly match. The regmust tests are costly enough
e50aee73 19 * that pregcomp() supplies a regmust only if the r.e. contains something
a687059c
LW
20 * potentially expensive (at present, the only such thing detected is * or +
21 * at the start of the r.e., which can involve a lot of backup). Regmlen is
e50aee73 22 * supplied because the test in pregexec() needs it and pregcomp() is computing
a687059c
LW
23 * it anyway.
24 * [regmust is now supplied always. The tests that use regmust have a
25 * heuristic that disables the test if it usually matches.]
26 *
27 * [In fact, we now use regmust in many cases to locate where the search
28 * starts in the string, so if regback is >= 0, the regmust search is never
29 * wasted effort. The regback variable says how many characters back from
30 * where regmust matched is the earliest possible start of the match.
31 * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.]
32 */
33
34/*
35 * Structure for regexp "program". This is essentially a linear encoding
36 * of a nondeterministic finite-state machine (aka syntax charts or
37 * "railroad normal form" in parsing technology). Each node is an opcode
38 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
39 * all nodes except BRANCH implement concatenation; a "next" pointer with
40 * a BRANCH on both ends of it is connecting two alternatives. (Here we
41 * have one of the subtle syntax dependencies: an individual BRANCH (as
42 * opposed to a collection of them) is never concatenated with anything
43 * because of operator precedence.) The operand of some types of node is
44 * a literal string; for others, it is a node leading into a sub-FSM. In
45 * particular, the operand of a BRANCH node is the first node of the branch.
46 * (NB this is *not* a tree structure: the tail of the branch connects
47 * to the thing following the set of BRANCHes.) The opcodes are:
48 */
49
50/* definition number opnd? meaning */
bbce6d69 51#define END 0 /* no End of program. */
52#define BOL 1 /* no Match "" at beginning of line. */
53#define MBOL 2 /* no Same, assuming multiline. */
54#define SBOL 3 /* no Same, assuming singleline. */
55#define EOL 4 /* no Match "" at end of line. */
56#define MEOL 5 /* no Same, assuming multiline. */
57#define SEOL 6 /* no Same, assuming singleline. */
58#define ANY 7 /* no Match any one character (except newline). */
59#define SANY 8 /* no Match any one character. */
60#define ANYOF 9 /* sv Match character in (or not in) this class. */
a0d0e21e
LW
61#define CURLY 10 /* sv Match this simple thing {n,m} times. */
62#define CURLYX 11 /* sv Match this complex thing {n,m} times. */
63#define BRANCH 12 /* node Match this alternative, or the next... */
64#define BACK 13 /* no Match "", "next" ptr points backward. */
bbce6d69 65#define EXACT 14 /* sv Match this string (preceded by length). */
66#define EXACTF 15 /* sv Match this string, folded (prec. by length). */
67#define EXACTFL 16 /* sv Match this string, folded in locale (w/len). */
68#define NOTHING 17 /* no Match empty string. */
69#define STAR 18 /* node Match this (simple) thing 0 or more times. */
70#define PLUS 19 /* node Match this (simple) thing 1 or more times. */
a0d0e21e 71#define BOUND 20 /* no Match "" at any word boundary */
bbce6d69 72#define BOUNDL 21 /* no Match "" at any word boundary */
73#define NBOUND 22 /* no Match "" at any word non-boundary */
74#define NBOUNDL 23 /* no Match "" at any word non-boundary */
75#define REF 24 /* num Match some already matched string */
76#define OPEN 25 /* num Mark this point in input as start of #n. */
77#define CLOSE 26 /* num Analogous to OPEN. */
78#define MINMOD 27 /* no Next operator is not greedy. */
774d564b 79#define GPOS 28 /* no Matches where last m//g left off. */
bbce6d69 80#define IFMATCH 29 /* no Succeeds if the following matches. */
81#define UNLESSM 30 /* no Fails if the following matches. */
82#define SUCCEED 31 /* no Return from a subroutine, basically. */
83#define WHILEM 32 /* no Do curly processing and see if rest matches. */
84#define ALNUM 33 /* no Match any alphanumeric character */
85#define ALNUML 34 /* no Match any alphanumeric char in locale */
86#define NALNUM 35 /* no Match any non-alphanumeric character */
87#define NALNUML 36 /* no Match any non-alphanumeric char in locale */
88#define SPACE 37 /* no Match any whitespace character */
89#define SPACEL 38 /* no Match any whitespace char in locale */
90#define NSPACE 39 /* no Match any non-whitespace character */
91#define NSPACEL 40 /* no Match any non-whitespace char in locale */
92#define DIGIT 41 /* no Match any numeric character */
93#define NDIGIT 42 /* no Match any non-numeric character */
a687059c
LW
94
95/*
96 * Opcode notes:
97 *
98 * BRANCH The set of branches constituting a single choice are hooked
99 * together with their "next" pointers, since precedence prevents
100 * anything being concatenated to any individual branch. The
101 * "next" pointer of the last BRANCH in a choice points to the
102 * thing following the whole choice. This is also where the
103 * final "next" pointer of each individual branch points; each
104 * branch starts with the operand node of a BRANCH node.
105 *
106 * BACK Normal "next" pointers all implicitly point forward; BACK
107 * exists to make loop structures possible.
108 *
109 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
110 * BRANCH structures using BACK. Simple cases (one character
111 * per match) are implemented with STAR and PLUS for speed
112 * and to minimize recursive plunges.
113 *
114 * OPEN,CLOSE ...are numbered at compile time.
115 */
116
fe14fcc3 117#ifndef DOINIT
a0d0e21e
LW
118EXT char regarglen[];
119#else
bbce6d69 120EXT char regarglen[] = {
121 0,0,0,0,0,0,0,0,0,0,
122 /*CURLY*/ 4, /*CURLYX*/ 4,
123 0,0,0,0,0,0,0,0,0,0,0,0,
124 /*REF*/ 2, /*OPEN*/ 2, /*CLOSE*/ 2,
125 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
126};
a0d0e21e
LW
127#endif
128
129#ifndef DOINIT
130EXT char regkind[];
fe14fcc3 131#else
a0d0e21e
LW
132EXT char regkind[] = {
133 END,
134 BOL,
135 BOL,
136 BOL,
137 EOL,
138 EOL,
139 EOL,
140 ANY,
141 ANY,
142 ANYOF,
143 CURLY,
144 CURLY,
145 BRANCH,
146 BACK,
bbce6d69 147 EXACT,
148 EXACT,
149 EXACT,
a0d0e21e
LW
150 NOTHING,
151 STAR,
152 PLUS,
bbce6d69 153 BOUND,
a0d0e21e
LW
154 BOUND,
155 NBOUND,
bbce6d69 156 NBOUND,
a0d0e21e
LW
157 REF,
158 OPEN,
159 CLOSE,
160 MINMOD,
774d564b 161 GPOS,
a0d0e21e
LW
162 BRANCH,
163 BRANCH,
164 END,
bbce6d69 165 WHILEM,
166 ALNUM,
167 ALNUM,
168 NALNUM,
169 NALNUM,
170 SPACE,
171 SPACE,
172 NSPACE,
173 NSPACE,
174 DIGIT,
175 NDIGIT,
a0d0e21e 176};
fe14fcc3
LW
177#endif
178
a687059c
LW
179/* The following have no fixed length. */
180#ifndef DOINIT
a0d0e21e 181EXT char varies[];
a687059c 182#else
bbce6d69 183EXT char varies[] = {
184 BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, WHILEM, 0
185};
a687059c
LW
186#endif
187
188/* The following always have a length of 1. */
189#ifndef DOINIT
a0d0e21e 190EXT char simple[];
a687059c 191#else
bbce6d69 192EXT char simple[] = {
193 ANY, SANY, ANYOF,
194 ALNUM, ALNUML, NALNUM, NALNUML,
195 SPACE, SPACEL, NSPACE, NSPACEL,
196 DIGIT, NDIGIT, 0
197};
a687059c
LW
198#endif
199
200EXT char regdummy;
201
202/*
203 * A node is one char of opcode followed by two chars of "next" pointer.
204 * "Next" pointers are stored as two 8-bit pieces, high order first. The
205 * value is a positive offset from the opcode of the node containing it.
206 * An operand, if any, simply follows the node. (Note that much of the
207 * code generation knows about this implicit relationship.)
208 *
209 * Using two bytes for the "next" pointer is vast overkill for most things,
210 * but allows patterns to get big without disasters.
211 *
212 * [If REGALIGN is defined, the "next" pointer is always aligned on an even
213 * boundary, and reads the offset directly as a short. Also, there is no
214 * special test to reverse the sign of BACK pointers since the offset is
215 * stored negative.]
216 */
217
218#ifndef gould
219#ifndef cray
34de22dd 220#ifndef eta10
a687059c
LW
221#define REGALIGN
222#endif
223#endif
34de22dd 224#endif
a687059c
LW
225
226#define OP(p) (*(p))
227
228#ifndef lint
229#ifdef REGALIGN
230#define NEXT(p) (*(short*)(p+1))
00bf170e
LW
231#define ARG1(p) (*(unsigned short*)(p+3))
232#define ARG2(p) (*(unsigned short*)(p+5))
a687059c
LW
233#else
234#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
00bf170e
LW
235#define ARG1(p) (((*((p)+3)&0377)<<8) + (*((p)+4)&0377))
236#define ARG2(p) (((*((p)+5)&0377)<<8) + (*((p)+6)&0377))
a687059c
LW
237#endif
238#else /* lint */
239#define NEXT(p) 0
240#endif /* lint */
241
242#define OPERAND(p) ((p) + 3)
243
244#ifdef REGALIGN
245#define NEXTOPER(p) ((p) + 4)
a0d0e21e 246#define PREVOPER(p) ((p) - 4)
a687059c
LW
247#else
248#define NEXTOPER(p) ((p) + 3)
a0d0e21e 249#define PREVOPER(p) ((p) - 3)
a687059c
LW
250#endif
251
252#define MAGIC 0234
253
bbce6d69 254/* Flags for first parameter byte of ANYOF */
255#define ANYOF_INVERT 0x40
256#define ANYOF_FOLD 0x20
257#define ANYOF_LOCALE 0x10
258#define ANYOF_ISA 0x0F
259#define ANYOF_ALNUML 0x08
260#define ANYOF_NALNUML 0x04
261#define ANYOF_SPACEL 0x02
262#define ANYOF_NSPACEL 0x01
263
a687059c
LW
264/*
265 * Utility definitions.
266 */
267#ifndef lint
2304df62 268#ifndef CHARMASK
a687059c
LW
269#define UCHARAT(p) ((int)*(unsigned char *)(p))
270#else
2304df62 271#define UCHARAT(p) ((int)*(p)&CHARMASK)
a687059c
LW
272#endif
273#else /* lint */
274#define UCHARAT(p) regdummy
275#endif /* lint */
276
a0d0e21e 277#define FAIL(m) croak("/%.127s/: %s",regprecomp,m)