This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Make sort respect overloading
[perl5.git] / regcomp.h
CommitLineData
a0d0e21e 1/* regcomp.h
a687059c
LW
2 */
3
c277df42
IZ
4typedef OP OP_4tree; /* Will be redefined later. */
5
a687059c
LW
6/*
7 * The "internal use only" fields in regexp.h are present to pass info from
8 * compile to execute that permits the execute phase to run lots faster on
9 * simple cases. They are:
10 *
79072805 11 * regstart sv that must begin a match; Nullch if none obvious
a687059c
LW
12 * reganch is the match anchored (at beginning-of-line only)?
13 * regmust string (pointer into program) that match must include, or NULL
79072805 14 * [regmust changed to SV* for bminstr()--law]
a687059c
LW
15 * regmlen length of regmust string
16 * [regmlen not used currently]
17 *
18 * Regstart and reganch permit very fast decisions on suitable starting points
19 * for a match, cutting down the work a lot. Regmust permits fast rejection
20 * of lines that cannot possibly match. The regmust tests are costly enough
e50aee73 21 * that pregcomp() supplies a regmust only if the r.e. contains something
a687059c
LW
22 * potentially expensive (at present, the only such thing detected is * or +
23 * at the start of the r.e., which can involve a lot of backup). Regmlen is
e50aee73 24 * supplied because the test in pregexec() needs it and pregcomp() is computing
a687059c
LW
25 * it anyway.
26 * [regmust is now supplied always. The tests that use regmust have a
27 * heuristic that disables the test if it usually matches.]
28 *
29 * [In fact, we now use regmust in many cases to locate where the search
30 * starts in the string, so if regback is >= 0, the regmust search is never
31 * wasted effort. The regback variable says how many characters back from
32 * where regmust matched is the earliest possible start of the match.
33 * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.]
34 */
35
36/*
37 * Structure for regexp "program". This is essentially a linear encoding
38 * of a nondeterministic finite-state machine (aka syntax charts or
39 * "railroad normal form" in parsing technology). Each node is an opcode
40 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
41 * all nodes except BRANCH implement concatenation; a "next" pointer with
42 * a BRANCH on both ends of it is connecting two alternatives. (Here we
43 * have one of the subtle syntax dependencies: an individual BRANCH (as
44 * opposed to a collection of them) is never concatenated with anything
45 * because of operator precedence.) The operand of some types of node is
46 * a literal string; for others, it is a node leading into a sub-FSM. In
47 * particular, the operand of a BRANCH node is the first node of the branch.
48 * (NB this is *not* a tree structure: the tail of the branch connects
49 * to the thing following the set of BRANCHes.) The opcodes are:
50 */
51
a687059c
LW
52/*
53 * A node is one char of opcode followed by two chars of "next" pointer.
54 * "Next" pointers are stored as two 8-bit pieces, high order first. The
55 * value is a positive offset from the opcode of the node containing it.
56 * An operand, if any, simply follows the node. (Note that much of the
57 * code generation knows about this implicit relationship.)
58 *
59 * Using two bytes for the "next" pointer is vast overkill for most things,
60 * but allows patterns to get big without disasters.
61 *
e91177ed 62 * [The "next" pointer is always aligned on an even
a687059c
LW
63 * boundary, and reads the offset directly as a short. Also, there is no
64 * special test to reverse the sign of BACK pointers since the offset is
65 * stored negative.]
66 */
67
c277df42
IZ
68struct regnode_string {
69 U8 flags;
70 U8 type;
71 U16 next_off;
72 U8 string[1];
73};
a687059c 74
c277df42
IZ
75struct regnode_1 {
76 U8 flags;
77 U8 type;
78 U16 next_off;
79 U32 arg1;
80};
81
82struct regnode_2 {
83 U8 flags;
84 U8 type;
85 U16 next_off;
86 U16 arg1;
87 U16 arg2;
88};
89
9c7e81e8
AD
90/* XXX fix this description.
91 Impose a limit of REG_INFTY on various pattern matching operations
92 to limit stack growth and to avoid "infinite" recursions.
93*/
94/* The default size for REG_INFTY is I16_MAX, which is the same as
95 SHORT_MAX (see perl.h). Unfortunately I16 isn't necessarily 16 bits
96 (see handy.h). On the Cray C90, sizeof(short)==4 and hence I16_MAX is
97 ((1<<31)-1), while on the Cray T90, sizeof(short)==8 and I16_MAX is
98 ((1<<63)-1). To limit stack growth to reasonable sizes, supply a
99 smaller default.
100 --Andy Dougherty 11 June 1998
101*/
102#if SHORTSIZE > 2
103# ifndef REG_INFTY
104# define REG_INFTY ((1<<15)-1)
105# endif
106#endif
107
108#ifndef REG_INFTY
83cfe48c
IZ
109# define REG_INFTY I16_MAX
110#endif
a687059c 111
e91177ed
IZ
112#define ARG_VALUE(arg) (arg)
113#define ARG__SET(arg,val) ((arg) = (val))
c277df42
IZ
114
115#define ARG(p) ARG_VALUE(ARG_LOC(p))
116#define ARG1(p) ARG_VALUE(ARG1_LOC(p))
117#define ARG2(p) ARG_VALUE(ARG2_LOC(p))
118#define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val))
119#define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val))
120#define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val))
121
122#ifndef lint
e91177ed
IZ
123# define NEXT_OFF(p) ((p)->next_off)
124# define NODE_ALIGN(node)
125# define NODE_ALIGN_FILL(node) ((node)->flags = 0xde) /* deadbeef */
a687059c 126#else /* lint */
c277df42 127# define NEXT_OFF(p) 0
e91177ed
IZ
128# define NODE_ALIGN(node)
129# define NODE_ALIGN_FILL(node)
a687059c
LW
130#endif /* lint */
131
c277df42
IZ
132#define SIZE_ALIGN NODE_ALIGN
133
e91177ed
IZ
134#define OP(p) ((p)->type)
135#define OPERAND(p) (((struct regnode_string *)p)->string)
136#define NODE_ALIGN(node)
137#define ARG_LOC(p) (((struct regnode_1 *)p)->arg1)
138#define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1)
139#define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2)
140#define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */
141#define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2)
142
143#define NODE_STEP_B 4
c277df42
IZ
144
145#define NEXTOPER(p) ((p) + NODE_STEP_REGNODE)
146#define PREVOPER(p) ((p) - NODE_STEP_REGNODE)
147
e91177ed 148#define FILL_ADVANCE_NODE(ptr, op) STMT_START { \
c277df42 149 (ptr)->type = op; (ptr)->next_off = 0; (ptr)++; } STMT_END
e91177ed 150#define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \
c277df42 151 ARG_SET(ptr, arg); FILL_ADVANCE_NODE(ptr, op); (ptr) += 1; } STMT_END
a687059c
LW
152
153#define MAGIC 0234
154
3280af22 155#define SIZE_ONLY (PL_regcode == &PL_regdummy)
c277df42 156
bbce6d69 157/* Flags for first parameter byte of ANYOF */
158#define ANYOF_INVERT 0x40
159#define ANYOF_FOLD 0x20
160#define ANYOF_LOCALE 0x10
161#define ANYOF_ISA 0x0F
162#define ANYOF_ALNUML 0x08
163#define ANYOF_NALNUML 0x04
164#define ANYOF_SPACEL 0x02
165#define ANYOF_NSPACEL 0x01
166
ae5c130c
GS
167/* Utility macros for bitmap of ANYOF */
168#define ANYOF_BYTE(p,c) (p)[1 + (((c) >> 3) & 31)]
169#define ANYOF_BIT(c) (1 << ((c) & 7))
170#define ANYOF_SET(p,c) (ANYOF_BYTE(p,c) |= ANYOF_BIT(c))
171#define ANYOF_CLEAR(p,c) (ANYOF_BYTE(p,c) &= ~ANYOF_BIT(c))
172#define ANYOF_TEST(p,c) (ANYOF_BYTE(p,c) & ANYOF_BIT(c))
173
c277df42 174#define ANY_SKIP ((33 - 1)/sizeof(regnode) + 1)
c277df42 175
a687059c
LW
176/*
177 * Utility definitions.
178 */
179#ifndef lint
2304df62 180#ifndef CHARMASK
a687059c
LW
181#define UCHARAT(p) ((int)*(unsigned char *)(p))
182#else
2304df62 183#define UCHARAT(p) ((int)*(p)&CHARMASK)
a687059c
LW
184#endif
185#else /* lint */
6b88bc9c 186#define UCHARAT(p) PL_regdummy
a687059c
LW
187#endif /* lint */
188
3280af22
NIS
189#define FAIL(m) croak ("/%.127s/: %s", PL_regprecomp,m)
190#define FAIL2(pat,m) re_croak2("/%.127s/: ",pat,PL_regprecomp,m)
c277df42
IZ
191
192#define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode))
193
d09b2d29
IZ
194#define REG_SEEN_ZERO_LEN 1
195#define REG_SEEN_LOOKBEHIND 2
196#define REG_SEEN_GPOS 4
197#define REG_SEEN_EVAL 8
198
199#include "regnodes.h"
200
201/* The following have no fixed length. char* since we do strchr on it. */
202#ifndef DOINIT
203EXTCONST char varies[];
204#else
205EXTCONST char varies[] = {
206 BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL,
a0ed51b3 207 WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, CLUMP, 0
c277df42 208};
d09b2d29 209#endif
c277df42 210
d09b2d29 211/* The following always have a length of 1. char* since we do strchr on it. */
a0ed51b3 212/* (Note that lenght 1 means "one character" under UTF8, not "one octet".) */
d09b2d29
IZ
213#ifndef DOINIT
214EXTCONST char simple[];
215#else
216EXTCONST char simple[] = {
a0ed51b3
LW
217 ANY, ANYUTF8, SANY, SANYUTF8, ANYOF, ANYOFUTF8,
218 ALNUM, ALNUMUTF8, ALNUML, ALNUMLUTF8,
219 NALNUM, NALNUMUTF8, NALNUML, NALNUMLUTF8,
220 SPACE, SPACEUTF8, SPACEL, SPACELUTF8,
221 NSPACE, NSPACEUTF8, NSPACEL, NSPACELUTF8,
222 DIGIT, DIGITUTF8, NDIGIT, NDIGITUTF8, 0
c277df42
IZ
223};
224#endif
225