This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
win32: additional default libraries
[perl5.git] / regcomp.h
CommitLineData
a0d0e21e 1/* regcomp.h
a687059c
LW
2 */
3
4/*
5 * The "internal use only" fields in regexp.h are present to pass info from
6 * compile to execute that permits the execute phase to run lots faster on
7 * simple cases. They are:
8 *
79072805 9 * regstart sv that must begin a match; Nullch if none obvious
a687059c
LW
10 * reganch is the match anchored (at beginning-of-line only)?
11 * regmust string (pointer into program) that match must include, or NULL
79072805 12 * [regmust changed to SV* for bminstr()--law]
a687059c
LW
13 * regmlen length of regmust string
14 * [regmlen not used currently]
15 *
16 * Regstart and reganch permit very fast decisions on suitable starting points
17 * for a match, cutting down the work a lot. Regmust permits fast rejection
18 * of lines that cannot possibly match. The regmust tests are costly enough
e50aee73 19 * that pregcomp() supplies a regmust only if the r.e. contains something
a687059c
LW
20 * potentially expensive (at present, the only such thing detected is * or +
21 * at the start of the r.e., which can involve a lot of backup). Regmlen is
e50aee73 22 * supplied because the test in pregexec() needs it and pregcomp() is computing
a687059c
LW
23 * it anyway.
24 * [regmust is now supplied always. The tests that use regmust have a
25 * heuristic that disables the test if it usually matches.]
26 *
27 * [In fact, we now use regmust in many cases to locate where the search
28 * starts in the string, so if regback is >= 0, the regmust search is never
29 * wasted effort. The regback variable says how many characters back from
30 * where regmust matched is the earliest possible start of the match.
31 * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.]
32 */
33
34/*
35 * Structure for regexp "program". This is essentially a linear encoding
36 * of a nondeterministic finite-state machine (aka syntax charts or
37 * "railroad normal form" in parsing technology). Each node is an opcode
38 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
39 * all nodes except BRANCH implement concatenation; a "next" pointer with
40 * a BRANCH on both ends of it is connecting two alternatives. (Here we
41 * have one of the subtle syntax dependencies: an individual BRANCH (as
42 * opposed to a collection of them) is never concatenated with anything
43 * because of operator precedence.) The operand of some types of node is
44 * a literal string; for others, it is a node leading into a sub-FSM. In
45 * particular, the operand of a BRANCH node is the first node of the branch.
46 * (NB this is *not* a tree structure: the tail of the branch connects
47 * to the thing following the set of BRANCHes.) The opcodes are:
48 */
49
50/* definition number opnd? meaning */
bbce6d69 51#define END 0 /* no End of program. */
52#define BOL 1 /* no Match "" at beginning of line. */
53#define MBOL 2 /* no Same, assuming multiline. */
54#define SBOL 3 /* no Same, assuming singleline. */
55#define EOL 4 /* no Match "" at end of line. */
56#define MEOL 5 /* no Same, assuming multiline. */
57#define SEOL 6 /* no Same, assuming singleline. */
58#define ANY 7 /* no Match any one character (except newline). */
59#define SANY 8 /* no Match any one character. */
60#define ANYOF 9 /* sv Match character in (or not in) this class. */
a0d0e21e
LW
61#define CURLY 10 /* sv Match this simple thing {n,m} times. */
62#define CURLYX 11 /* sv Match this complex thing {n,m} times. */
63#define BRANCH 12 /* node Match this alternative, or the next... */
64#define BACK 13 /* no Match "", "next" ptr points backward. */
bbce6d69 65#define EXACT 14 /* sv Match this string (preceded by length). */
66#define EXACTF 15 /* sv Match this string, folded (prec. by length). */
67#define EXACTFL 16 /* sv Match this string, folded in locale (w/len). */
68#define NOTHING 17 /* no Match empty string. */
69#define STAR 18 /* node Match this (simple) thing 0 or more times. */
70#define PLUS 19 /* node Match this (simple) thing 1 or more times. */
a0d0e21e 71#define BOUND 20 /* no Match "" at any word boundary */
bbce6d69 72#define BOUNDL 21 /* no Match "" at any word boundary */
73#define NBOUND 22 /* no Match "" at any word non-boundary */
74#define NBOUNDL 23 /* no Match "" at any word non-boundary */
c8756f30
AK
75#define REF 24 /* num Match already matched string */
76#define REFF 25 /* num Match already matched string, folded */
77#define REFFL 26 /* num Match already matched string, folded in loc. */
78#define OPEN 27 /* num Mark this point in input as start of #n. */
79#define CLOSE 28 /* num Analogous to OPEN. */
80#define MINMOD 29 /* no Next operator is not greedy. */
81#define GPOS 30 /* no Matches where last m//g left off. */
82#define IFMATCH 31 /* no Succeeds if the following matches. */
83#define UNLESSM 32 /* no Fails if the following matches. */
84#define SUCCEED 33 /* no Return from a subroutine, basically. */
85#define WHILEM 34 /* no Do curly processing and see if rest matches. */
86#define ALNUM 35 /* no Match any alphanumeric character */
87#define ALNUML 36 /* no Match any alphanumeric char in locale */
88#define NALNUM 37 /* no Match any non-alphanumeric character */
89#define NALNUML 38 /* no Match any non-alphanumeric char in locale */
90#define SPACE 39 /* no Match any whitespace character */
91#define SPACEL 40 /* no Match any whitespace char in locale */
92#define NSPACE 41 /* no Match any non-whitespace character */
93#define NSPACEL 42 /* no Match any non-whitespace char in locale */
94#define DIGIT 43 /* no Match any numeric character */
95#define NDIGIT 44 /* no Match any non-numeric character */
a687059c
LW
96
97/*
98 * Opcode notes:
99 *
100 * BRANCH The set of branches constituting a single choice are hooked
101 * together with their "next" pointers, since precedence prevents
102 * anything being concatenated to any individual branch. The
103 * "next" pointer of the last BRANCH in a choice points to the
104 * thing following the whole choice. This is also where the
105 * final "next" pointer of each individual branch points; each
106 * branch starts with the operand node of a BRANCH node.
107 *
108 * BACK Normal "next" pointers all implicitly point forward; BACK
109 * exists to make loop structures possible.
110 *
111 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
112 * BRANCH structures using BACK. Simple cases (one character
113 * per match) are implemented with STAR and PLUS for speed
114 * and to minimize recursive plunges.
115 *
116 * OPEN,CLOSE ...are numbered at compile time.
117 */
118
fe14fcc3 119#ifndef DOINIT
a0d0e21e
LW
120EXT char regarglen[];
121#else
bbce6d69 122EXT char regarglen[] = {
123 0,0,0,0,0,0,0,0,0,0,
124 /*CURLY*/ 4, /*CURLYX*/ 4,
125 0,0,0,0,0,0,0,0,0,0,0,0,
c8756f30 126 /*REF*/ 2, 2, 2, /*OPEN*/ 2, /*CLOSE*/ 2,
bbce6d69 127 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
128};
a0d0e21e
LW
129#endif
130
131#ifndef DOINIT
132EXT char regkind[];
fe14fcc3 133#else
a0d0e21e
LW
134EXT char regkind[] = {
135 END,
136 BOL,
137 BOL,
138 BOL,
139 EOL,
140 EOL,
141 EOL,
142 ANY,
143 ANY,
144 ANYOF,
145 CURLY,
146 CURLY,
147 BRANCH,
148 BACK,
bbce6d69 149 EXACT,
150 EXACT,
151 EXACT,
a0d0e21e
LW
152 NOTHING,
153 STAR,
154 PLUS,
bbce6d69 155 BOUND,
a0d0e21e
LW
156 BOUND,
157 NBOUND,
bbce6d69 158 NBOUND,
a0d0e21e 159 REF,
c8756f30
AK
160 REF,
161 REF,
a0d0e21e
LW
162 OPEN,
163 CLOSE,
164 MINMOD,
774d564b 165 GPOS,
a0d0e21e
LW
166 BRANCH,
167 BRANCH,
168 END,
bbce6d69 169 WHILEM,
170 ALNUM,
171 ALNUM,
172 NALNUM,
173 NALNUM,
174 SPACE,
175 SPACE,
176 NSPACE,
177 NSPACE,
178 DIGIT,
179 NDIGIT,
a0d0e21e 180};
fe14fcc3
LW
181#endif
182
a687059c
LW
183/* The following have no fixed length. */
184#ifndef DOINIT
a0d0e21e 185EXT char varies[];
a687059c 186#else
bbce6d69 187EXT char varies[] = {
c8756f30 188 BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL, WHILEM, 0
bbce6d69 189};
a687059c
LW
190#endif
191
192/* The following always have a length of 1. */
193#ifndef DOINIT
a0d0e21e 194EXT char simple[];
a687059c 195#else
bbce6d69 196EXT char simple[] = {
197 ANY, SANY, ANYOF,
198 ALNUM, ALNUML, NALNUM, NALNUML,
199 SPACE, SPACEL, NSPACE, NSPACEL,
200 DIGIT, NDIGIT, 0
201};
a687059c
LW
202#endif
203
204EXT char regdummy;
205
206/*
207 * A node is one char of opcode followed by two chars of "next" pointer.
208 * "Next" pointers are stored as two 8-bit pieces, high order first. The
209 * value is a positive offset from the opcode of the node containing it.
210 * An operand, if any, simply follows the node. (Note that much of the
211 * code generation knows about this implicit relationship.)
212 *
213 * Using two bytes for the "next" pointer is vast overkill for most things,
214 * but allows patterns to get big without disasters.
215 *
216 * [If REGALIGN is defined, the "next" pointer is always aligned on an even
217 * boundary, and reads the offset directly as a short. Also, there is no
218 * special test to reverse the sign of BACK pointers since the offset is
219 * stored negative.]
220 */
221
222#ifndef gould
223#ifndef cray
34de22dd 224#ifndef eta10
a687059c
LW
225#define REGALIGN
226#endif
227#endif
34de22dd 228#endif
a687059c
LW
229
230#define OP(p) (*(p))
231
232#ifndef lint
233#ifdef REGALIGN
234#define NEXT(p) (*(short*)(p+1))
00bf170e
LW
235#define ARG1(p) (*(unsigned short*)(p+3))
236#define ARG2(p) (*(unsigned short*)(p+5))
a687059c
LW
237#else
238#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
00bf170e
LW
239#define ARG1(p) (((*((p)+3)&0377)<<8) + (*((p)+4)&0377))
240#define ARG2(p) (((*((p)+5)&0377)<<8) + (*((p)+6)&0377))
a687059c
LW
241#endif
242#else /* lint */
243#define NEXT(p) 0
244#endif /* lint */
245
246#define OPERAND(p) ((p) + 3)
247
248#ifdef REGALIGN
249#define NEXTOPER(p) ((p) + 4)
a0d0e21e 250#define PREVOPER(p) ((p) - 4)
a687059c
LW
251#else
252#define NEXTOPER(p) ((p) + 3)
a0d0e21e 253#define PREVOPER(p) ((p) - 3)
a687059c
LW
254#endif
255
256#define MAGIC 0234
257
bbce6d69 258/* Flags for first parameter byte of ANYOF */
259#define ANYOF_INVERT 0x40
260#define ANYOF_FOLD 0x20
261#define ANYOF_LOCALE 0x10
262#define ANYOF_ISA 0x0F
263#define ANYOF_ALNUML 0x08
264#define ANYOF_NALNUML 0x04
265#define ANYOF_SPACEL 0x02
266#define ANYOF_NSPACEL 0x01
267
a687059c
LW
268/*
269 * Utility definitions.
270 */
271#ifndef lint
2304df62 272#ifndef CHARMASK
a687059c
LW
273#define UCHARAT(p) ((int)*(unsigned char *)(p))
274#else
2304df62 275#define UCHARAT(p) ((int)*(p)&CHARMASK)
a687059c
LW
276#endif
277#else /* lint */
278#define UCHARAT(p) regdummy
279#endif /* lint */
280
a0d0e21e 281#define FAIL(m) croak("/%.127s/: %s",regprecomp,m)