This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Update SH_PATH comment.
[perl5.git] / regcomp.h
CommitLineData
a0d0e21e 1/* regcomp.h
a687059c
LW
2 */
3
4/*
5 * The "internal use only" fields in regexp.h are present to pass info from
6 * compile to execute that permits the execute phase to run lots faster on
7 * simple cases. They are:
8 *
79072805 9 * regstart sv that must begin a match; Nullch if none obvious
a687059c
LW
10 * reganch is the match anchored (at beginning-of-line only)?
11 * regmust string (pointer into program) that match must include, or NULL
79072805 12 * [regmust changed to SV* for bminstr()--law]
a687059c
LW
13 * regmlen length of regmust string
14 * [regmlen not used currently]
15 *
16 * Regstart and reganch permit very fast decisions on suitable starting points
17 * for a match, cutting down the work a lot. Regmust permits fast rejection
18 * of lines that cannot possibly match. The regmust tests are costly enough
e50aee73 19 * that pregcomp() supplies a regmust only if the r.e. contains something
a687059c
LW
20 * potentially expensive (at present, the only such thing detected is * or +
21 * at the start of the r.e., which can involve a lot of backup). Regmlen is
e50aee73 22 * supplied because the test in pregexec() needs it and pregcomp() is computing
a687059c
LW
23 * it anyway.
24 * [regmust is now supplied always. The tests that use regmust have a
25 * heuristic that disables the test if it usually matches.]
26 *
27 * [In fact, we now use regmust in many cases to locate where the search
28 * starts in the string, so if regback is >= 0, the regmust search is never
29 * wasted effort. The regback variable says how many characters back from
30 * where regmust matched is the earliest possible start of the match.
31 * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.]
32 */
33
34/*
35 * Structure for regexp "program". This is essentially a linear encoding
36 * of a nondeterministic finite-state machine (aka syntax charts or
37 * "railroad normal form" in parsing technology). Each node is an opcode
38 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
39 * all nodes except BRANCH implement concatenation; a "next" pointer with
40 * a BRANCH on both ends of it is connecting two alternatives. (Here we
41 * have one of the subtle syntax dependencies: an individual BRANCH (as
42 * opposed to a collection of them) is never concatenated with anything
43 * because of operator precedence.) The operand of some types of node is
44 * a literal string; for others, it is a node leading into a sub-FSM. In
45 * particular, the operand of a BRANCH node is the first node of the branch.
46 * (NB this is *not* a tree structure: the tail of the branch connects
47 * to the thing following the set of BRANCHes.) The opcodes are:
48 */
49
50/* definition number opnd? meaning */
51#define END 0 /* no End of program. */
52#define BOL 1 /* no Match "" at beginning of line. */
a0d0e21e
LW
53#define MBOL 2 /* no Same, assuming multiline. */
54#define SBOL 3 /* no Same, assuming singleline. */
55#define EOL 4 /* no Match "" at end of line. */
56#define MEOL 5 /* no Same, assuming multiline. */
57#define SEOL 6 /* no Same, assuming singleline. */
58#define ANY 7 /* no Match any one character (except newline). */
59#define SANY 8 /* no Match any one character. */
60#define ANYOF 9 /* sv Match character in (or not in) this class. */
61#define CURLY 10 /* sv Match this simple thing {n,m} times. */
62#define CURLYX 11 /* sv Match this complex thing {n,m} times. */
63#define BRANCH 12 /* node Match this alternative, or the next... */
64#define BACK 13 /* no Match "", "next" ptr points backward. */
65#define EXACTLY 14 /* sv Match this string (preceded by length). */
66#define NOTHING 15 /* no Match empty string. */
67#define STAR 16 /* node Match this (simple) thing 0 or more times. */
68#define PLUS 17 /* node Match this (simple) thing 1 or more times. */
69#define ALNUM 18 /* no Match any alphanumeric character */
70#define NALNUM 19 /* no Match any non-alphanumeric character */
71#define BOUND 20 /* no Match "" at any word boundary */
72#define NBOUND 21 /* no Match "" at any word non-boundary */
73#define SPACE 22 /* no Match any whitespace character */
74#define NSPACE 23 /* no Match any non-whitespace character */
75#define DIGIT 24 /* no Match any numeric character */
76#define NDIGIT 25 /* no Match any non-numeric character */
77#define REF 26 /* num Match some already matched string */
78#define OPEN 27 /* num Mark this point in input as start of #n. */
79#define CLOSE 28 /* num Analogous to OPEN. */
80#define MINMOD 29 /* no Next operator is not greedy. */
81#define GBOL 30 /* no Matches where last m//g left off. */
82#define IFMATCH 31 /* no Succeeds if the following matches. */
83#define UNLESSM 32 /* no Fails if the following matches. */
84#define SUCCEED 33 /* no Return from a subroutine, basically. */
85#define WHILEM 34 /* no Do curly processing and see if rest matches. */
a687059c
LW
86
87/*
88 * Opcode notes:
89 *
90 * BRANCH The set of branches constituting a single choice are hooked
91 * together with their "next" pointers, since precedence prevents
92 * anything being concatenated to any individual branch. The
93 * "next" pointer of the last BRANCH in a choice points to the
94 * thing following the whole choice. This is also where the
95 * final "next" pointer of each individual branch points; each
96 * branch starts with the operand node of a BRANCH node.
97 *
98 * BACK Normal "next" pointers all implicitly point forward; BACK
99 * exists to make loop structures possible.
100 *
101 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
102 * BRANCH structures using BACK. Simple cases (one character
103 * per match) are implemented with STAR and PLUS for speed
104 * and to minimize recursive plunges.
105 *
106 * OPEN,CLOSE ...are numbered at compile time.
107 */
108
fe14fcc3 109#ifndef DOINIT
a0d0e21e
LW
110EXT char regarglen[];
111#else
112EXT char regarglen[] = {0,0,0,0,0,0,0,0,0,0,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,0,0,0};
113#endif
114
115#ifndef DOINIT
116EXT char regkind[];
fe14fcc3 117#else
a0d0e21e
LW
118EXT char regkind[] = {
119 END,
120 BOL,
121 BOL,
122 BOL,
123 EOL,
124 EOL,
125 EOL,
126 ANY,
127 ANY,
128 ANYOF,
129 CURLY,
130 CURLY,
131 BRANCH,
132 BACK,
133 EXACTLY,
134 NOTHING,
135 STAR,
136 PLUS,
137 ALNUM,
138 NALNUM,
139 BOUND,
140 NBOUND,
141 SPACE,
142 NSPACE,
143 DIGIT,
144 NDIGIT,
145 REF,
146 OPEN,
147 CLOSE,
148 MINMOD,
149 BOL,
150 BRANCH,
151 BRANCH,
152 END,
153 WHILEM
154};
fe14fcc3
LW
155#endif
156
a687059c
LW
157/* The following have no fixed length. */
158#ifndef DOINIT
a0d0e21e 159EXT char varies[];
a687059c 160#else
a0d0e21e 161EXT char varies[] = {BRANCH,BACK,STAR,PLUS,CURLY,CURLYX,REF,WHILEM,0};
a687059c
LW
162#endif
163
164/* The following always have a length of 1. */
165#ifndef DOINIT
a0d0e21e 166EXT char simple[];
a687059c 167#else
a0d0e21e 168EXT char simple[] = {ANY,SANY,ANYOF,ALNUM,NALNUM,SPACE,NSPACE,DIGIT,NDIGIT,0};
a687059c
LW
169#endif
170
171EXT char regdummy;
172
173/*
174 * A node is one char of opcode followed by two chars of "next" pointer.
175 * "Next" pointers are stored as two 8-bit pieces, high order first. The
176 * value is a positive offset from the opcode of the node containing it.
177 * An operand, if any, simply follows the node. (Note that much of the
178 * code generation knows about this implicit relationship.)
179 *
180 * Using two bytes for the "next" pointer is vast overkill for most things,
181 * but allows patterns to get big without disasters.
182 *
183 * [If REGALIGN is defined, the "next" pointer is always aligned on an even
184 * boundary, and reads the offset directly as a short. Also, there is no
185 * special test to reverse the sign of BACK pointers since the offset is
186 * stored negative.]
187 */
188
189#ifndef gould
190#ifndef cray
34de22dd 191#ifndef eta10
a687059c
LW
192#define REGALIGN
193#endif
194#endif
34de22dd 195#endif
a687059c
LW
196
197#define OP(p) (*(p))
198
199#ifndef lint
200#ifdef REGALIGN
201#define NEXT(p) (*(short*)(p+1))
00bf170e
LW
202#define ARG1(p) (*(unsigned short*)(p+3))
203#define ARG2(p) (*(unsigned short*)(p+5))
a687059c
LW
204#else
205#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
00bf170e
LW
206#define ARG1(p) (((*((p)+3)&0377)<<8) + (*((p)+4)&0377))
207#define ARG2(p) (((*((p)+5)&0377)<<8) + (*((p)+6)&0377))
a687059c
LW
208#endif
209#else /* lint */
210#define NEXT(p) 0
211#endif /* lint */
212
213#define OPERAND(p) ((p) + 3)
214
215#ifdef REGALIGN
216#define NEXTOPER(p) ((p) + 4)
a0d0e21e 217#define PREVOPER(p) ((p) - 4)
a687059c
LW
218#else
219#define NEXTOPER(p) ((p) + 3)
a0d0e21e 220#define PREVOPER(p) ((p) - 3)
a687059c
LW
221#endif
222
223#define MAGIC 0234
224
225/*
226 * Utility definitions.
227 */
228#ifndef lint
2304df62 229#ifndef CHARMASK
a687059c
LW
230#define UCHARAT(p) ((int)*(unsigned char *)(p))
231#else
2304df62 232#define UCHARAT(p) ((int)*(p)&CHARMASK)
a687059c
LW
233#endif
234#else /* lint */
235#define UCHARAT(p) regdummy
236#endif /* lint */
237
a0d0e21e 238#define FAIL(m) croak("/%.127s/: %s",regprecomp,m)