perl5.git.perl.org Git - perl5.git/blame_incremental

Commit	Line	Data
	1	# regcomp.sym
	2	#
	3	# File has two sections, divided by a line of dashes '-'.
	4	#
	5	# Rows that are empty once #-comments are removed from the input are ignored
	6	#
	7	# First section is for regops, second section is for regmatch-states
	8	#
	9	# Note that the order in this file is important.
	10	#
	11	# Format for first section:
	12	# NAME \s+ TYPE, arg-description [num-args] [flags] [longjump-len] ; DESCRIPTION
	13	# flag <S> means is REGNODE_SIMPLE; flag <V> means is REGNODE_VARIES
	14	#
	15	#
	16	# run perl regen.pl after editing this file
	17	# Also update perlredebguts.pod
	18
	19
	20
	21	#* Exit points
	22
	23	END END, no ; End of program.
	24	SUCCEED END, no ; Return from a subroutine, basically.
	25
	26	#* Anchors:
	27
	28	BOL BOL, no ; Match "" at beginning of line.
	29	MBOL BOL, no ; Same, assuming multiline.
	30	SBOL BOL, no ; Same, assuming singleline.
	31	EOS EOL, no ; Match "" at end of string.
	32	EOL EOL, no ; Match "" at end of line.
	33	MEOL EOL, no ; Same, assuming multiline.
	34	SEOL EOL, no ; Same, assuming singleline.
	35	# The regops that have varieties that vary depending on the character set regex
	36	# modifiers have to be ordered thusly: /d, /l, /u, /a, /aa. This is because code
	37	# in regcomp.c uses the enum value of the modifier as an offset from the /d
	38	# version. The complements must come after the non-complements.
	39	# BOUND, ALNUM, SPACE, DIGIT, and their complements are affected, as well as
	40	# EXACTF.
	41	BOUND BOUND, no ; Match "" at any word boundary using native charset semantics for non-utf8
	42	BOUNDL BOUND, no ; Match "" at any locale word boundary
	43	BOUNDU BOUND, no ; Match "" at any word boundary using Unicode semantics
	44	BOUNDA BOUND, no ; Match "" at any word boundary using ASCII semantics
	45	# All NBOUND nodes are required by code in regexec.c to be greater than all BOUND ones
	46	NBOUND NBOUND, no ; Match "" at any word non-boundary using native charset semantics for non-utf8
	47	NBOUNDL NBOUND, no ; Match "" at any locale word non-boundary
	48	NBOUNDU NBOUND, no ; Match "" at any word non-boundary using Unicode semantics
	49	NBOUNDA NBOUND, no ; Match "" at any word non-boundary using ASCII semantics
	50	GPOS GPOS, no ; Matches where last m//g left off.
	51
	52	#* [Special] alternatives:
	53
	54	REG_ANY REG_ANY, no 0 S ; Match any one character (except newline).
	55	SANY REG_ANY, no 0 S ; Match any one character.
	56	CANY REG_ANY, no 0 S ; Match any one byte.
	57	ANYOF ANYOF, sv 0 S ; Match character in (or not in) this class, single char match only
	58
	59	# Order (within each group) of the below is important. See ordering comment
	60	# above. The PLACEHOLDERn ones are wasting a value. Right now, we have plenty
	61	# to spare, but these would be obvious candidates if ever we ran out of node
	62	# types in a U8.
	63	ALNUM ALNUM, no 0 S ; Match any alphanumeric character using native charset semantics for non-utf8
	64	ALNUML ALNUM, no 0 S ; Match any alphanumeric char in locale
	65	ALNUMU ALNUM, no 0 S ; Match any alphanumeric char using Unicode semantics
	66	ALNUMA ALNUM, no 0 S ; Match [A-Za-z_0-9]
	67	NALNUM NALNUM, no 0 S ; Match any non-alphanumeric character using native charset semantics for non-utf8
	68	NALNUML NALNUM, no 0 S ; Match any non-alphanumeric char in locale
	69	NALNUMU NALNUM, no 0 S ; Match any non-alphanumeric char using Unicode semantics
	70	NALNUMA NALNUM, no 0 S ; Match [^A-Za-z_0-9]
	71	SPACE SPACE, no 0 S ; Match any whitespace character using native charset semantics for non-utf8
	72	SPACEL SPACE, no 0 S ; Match any whitespace char in locale
	73	SPACEU SPACE, no 0 S ; Match any whitespace char using Unicode semantics
	74	SPACEA SPACE, no 0 S ; Match [ \t\n\f\r]
	75	NSPACE NSPACE, no 0 S ; Match any non-whitespace character using native charset semantics for non-utf8
	76	NSPACEL NSPACE, no 0 S ; Match any non-whitespace char in locale
	77	NSPACEU NSPACE, no 0 S ; Match any non-whitespace char using Unicode semantics
	78	NSPACEA NSPACE, no 0 S ; Match [^ \t\n\f\r]
	79	DIGIT DIGIT, no 0 S ; Match any numeric character using native charset semantics for non-utf8
	80	DIGITL DIGIT, no 0 S ; Match any numeric character in locale
	81	PLACEHOLDER1 NOTHING, no ; placeholder for missing DIGITU
	82	DIGITA DIGIT, no 0 S ; Match [0-9]
	83	NDIGIT NDIGIT, no 0 S ; Match any non-numeric character using native charset semantics for non-utf8
	84	NDIGITL NDIGIT, no 0 S ; Match any non-numeric character in locale
	85	PLACEHOLDER2 NOTHING, no ; placeholder for missing NDIGITU
	86	NDIGITA NDIGIT, no 0 S ; Match [^0-9]
	87
	88	POSIXD POSIXD, none 0 S ; currently unused except as a placeholder
	89	POSIXL POSIXD, none 0 S ; currently unused except as a placeholder
	90	POSIXU POSIXD, none 0 S ; currently unused except as a placeholder
	91	POSIXA POSIXD, none 0 S ; Some [[:class:]] under /a; the FLAGS field gives which one
	92	NPOSIXD NPOSIXD, none 0 S ; currently unused except as a placeholder
	93	NPOSIXL NPOSIXD, none 0 S ; currently unused except as a placeholder
	94	NPOSIXU NPOSIXD, none 0 S ; currently unused except as a placeholder
	95	NPOSIXA NPOSIXD, none 0 S ; complement of POSIXA, [[:^class:]]
	96	# End of order is important (within groups)
	97
	98	CLUMP CLUMP, no 0 V ; Match any extended grapheme cluster sequence
	99
	100	#* Alternation
	101
	102	# BRANCH The set of branches constituting a single choice are hooked
	103	# together with their "next" pointers, since precedence prevents
	104	# anything being concatenated to any individual branch. The
	105	# "next" pointer of the last BRANCH in a choice points to the
	106	# thing following the whole choice. This is also where the
	107	# final "next" pointer of each individual branch points; each
	108	# branch starts with the operand node of a BRANCH node.
	109	#
	110	BRANCH BRANCH, node 0 V ; Match this alternative, or the next...
	111
	112	#*Back pointer
	113
	114	# BACK Normal "next" pointers all implicitly point forward; BACK
	115	# exists to make loop structures possible.
	116	# not used
	117	BACK BACK, no 0 V ; Match "", "next" ptr points backward.
	118
	119	#*Literals - NOTE: the relative ordering of these types is important; do not change it
	120
	121	EXACT EXACT, str ; Match this string (preceded by length).
	122	EXACTF EXACT, str ; Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len).
	123	EXACTFL EXACT, str ; Match this string (not guaranteed to be folded) using /il rules (w/len).
	124	EXACTFU EXACT, str ; Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len).
	125	EXACTFA EXACT, str ; Match this string (not guaranteed to be folded) using /iaa rules (w/len).
	126	EXACTFU_SS EXACT, str ; Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len).
	127	EXACTFU_TRICKYFOLD EXACT, str ; Match this folded UTF-8 string using /iu rules
	128
	129	#*Do nothing types
	130
	131	NOTHING NOTHING, no ; Match empty string.
	132	# A variant of above which delimits a group, thus stops optimizations
	133	TAIL NOTHING, no ; Match empty string. Can jump here from outside.
	134
	135	#*Loops
	136
	137	# STAR,PLUS '?', and complex '*' and '+', are implemented as circular
	138	# BRANCH structures using BACK. Simple cases (one character
	139	# per match) are implemented with STAR and PLUS for speed
	140	# and to minimize recursive plunges.
	141	#
	142	STAR STAR, node 0 V ; Match this (simple) thing 0 or more times.
	143	PLUS PLUS, node 0 V ; Match this (simple) thing 1 or more times.
	144
	145	CURLY CURLY, sv 2 V ; Match this simple thing {n,m} times.
	146	CURLYN CURLY, no 2 V ; Capture next-after-this simple thing
	147	CURLYM CURLY, no 2 V ; Capture this medium-complex thing {n,m} times.
	148	CURLYX CURLY, sv 2 V ; Match this complex thing {n,m} times.
	149
	150	# This terminator creates a loop structure for CURLYX
	151	WHILEM WHILEM, no 0 V ; Do curly processing and see if rest matches.
	152
	153	#*Buffer related
	154
	155	# OPEN,CLOSE,GROUPP ...are numbered at compile time.
	156	OPEN OPEN, num 1 ; Mark this point in input as start of #n.
	157	CLOSE CLOSE, num 1 ; Analogous to OPEN.
	158
	159	REF REF, num 1 V ; Match some already matched string
	160	REFF REF, num 1 V ; Match already matched string, folded using native charset semantics for non-utf8
	161	REFFL REF, num 1 V ; Match already matched string, folded in loc.
	162	# N?REFF[AU] could have been implemented using the FLAGS field of the
	163	# regnode, but by having a separate node type, we can use the existing switch
	164	# statement to avoid some tests
	165	REFFU REF, num 1 V ; Match already matched string, folded using unicode semantics for non-utf8
	166	REFFA REF, num 1 V ; Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII
	167
	168	#*Named references. Code in regcomp.c assumes that these all are after the numbered references
	169	NREF REF, no-sv 1 V ; Match some already matched string
	170	NREFF REF, no-sv 1 V ; Match already matched string, folded using native charset semantics for non-utf8
	171	NREFFL REF, no-sv 1 V ; Match already matched string, folded in loc.
	172	NREFFU REF, num 1 V ; Match already matched string, folded using unicode semantics for non-utf8
	173	NREFFA REF, num 1 V ; Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII
	174
	175	IFMATCH BRANCHJ, off 1 . 2 ; Succeeds if the following matches.
	176	UNLESSM BRANCHJ, off 1 . 2 ; Fails if the following matches.
	177	SUSPEND BRANCHJ, off 1 V 1 ; "Independent" sub-RE.
	178	IFTHEN BRANCHJ, off 1 V 1 ; Switch, should be preceded by switcher .
	179	GROUPP GROUPP, num 1 ; Whether the group matched.
	180
	181	#*Support for long RE
	182
	183	LONGJMP LONGJMP, off 1 . 1 ; Jump far away.
	184	BRANCHJ BRANCHJ, off 1 V 1 ; BRANCH with long offset.
	185
	186	#*The heavy worker
	187
	188	EVAL EVAL, evl 1 ; Execute some Perl code.
	189
	190	#*Modifiers
	191
	192	MINMOD MINMOD, no ; Next operator is not greedy.
	193	LOGICAL LOGICAL, no ; Next opcode should set the flag only.
	194
	195	# This is not used yet
	196	RENUM BRANCHJ, off 1 . 1 ; Group with independently numbered parens.
	197
	198	#*Trie Related
	199
	200	# Behave the same as A|LIST|OF|WORDS would. The '..C' variants have
	201	# inline charclass data (ascii only), the 'C' store it in the structure.
	202	# NOTE: the relative order of the TRIE-like regops is significant
	203
	204	TRIE TRIE, trie 1 ; Match many EXACT(F[ALU]?)? at once. flags==type
	205	TRIEC TRIE,trie charclass ; Same as TRIE, but with embedded charclass data
	206
	207	# For start classes, contains an added fail table.
	208	AHOCORASICK TRIE, trie 1 ; Aho Corasick stclass. flags==type
	209	AHOCORASICKC TRIE,trie charclass ; Same as AHOCORASICK, but with embedded charclass data
	210
	211	#*Regex Subroutines
	212	GOSUB GOSUB, num/ofs 2L ; recurse to paren arg1 at (signed) ofs arg2
	213	GOSTART GOSTART, no ; recurse to start of pattern
	214
	215	#*Special conditionals
	216	NGROUPP NGROUPP, no-sv 1 ; Whether the group matched.
	217	INSUBP INSUBP, num 1 ; Whether we are in a specific recurse.
	218	DEFINEP DEFINEP, none 1 ; Never execute directly.
	219
	220	#*Backtracking Verbs
	221	ENDLIKE ENDLIKE, none ; Used only for the type field of verbs
	222	OPFAIL ENDLIKE, none ; Same as (?!)
	223	ACCEPT ENDLIKE, parno 1 ; Accepts the current matched string.
	224
	225
	226	#*Verbs With Arguments
	227	VERB VERB, no-sv 1 ; Used only for the type field of verbs
	228	PRUNE VERB, no-sv 1 ; Pattern fails at this startpoint if no-backtracking through this
	229	MARKPOINT VERB, no-sv 1 ; Push the current location for rollback by cut.
	230	SKIP VERB, no-sv 1 ; On failure skip forward (to the mark) before retrying
	231	COMMIT VERB, no-sv 1 ; Pattern fails outright if backtracking through this
	232	CUTGROUP VERB, no-sv 1 ; On failure go to the next alternation in the group
	233
	234	#*Control what to keep in $&.
	235	KEEPS KEEPS, no ; $& begins here.
	236
	237	#*New charclass like patterns
	238	LNBREAK LNBREAK, none ; generic newline pattern
	239
	240	# regcomp.c expects the node number of the complement to be one greater than
	241	# the non-complement
	242	VERTWS VERTWS, none 0 S ; vertical whitespace (Perl 6)
	243	NVERTWS NVERTWS, none 0 S ; not vertical whitespace (Perl 6)
	244	HORIZWS HORIZWS, none 0 S ; horizontal whitespace (Perl 6)
	245	NHORIZWS NHORIZWS, none 0 S ; not horizontal whitespace (Perl 6)
	246
	247	# NEW STUFF SOMEWHERE ABOVE THIS LINE
	248
	249	################################################################################
	250
	251	#*SPECIAL REGOPS
	252
	253	# This is not really a node, but an optimized away piece of a "long" node.
	254	# To simplify debugging output, we mark it as if it were a node
	255	OPTIMIZED NOTHING, off ; Placeholder for dump.
	256
	257	# Special opcode with the property that no opcode in a compiled program
	258	# will ever be of this type. Thus it can be used as a flag value that
	259	# no other opcode has been seen. END is used similarly, in that an END
	260	# node can't be optimized. So END implies "unoptimizable" and PSEUDO means
	261	# "not seen anything to optimize yet".
	262	PSEUDO PSEUDO, off ; Pseudo opcode for internal use.
	263
	264	-------------------------------------------------------------------------------
	265	# Format for second section:
	266	# REGOP \t typelist [ \t typelist] [# Comment]
	267	# typelist= namelist
	268	# = namelist:FAIL
	269	# = name:count
	270
	271	# Anything below is a state
	272	#
	273	#
	274	TRIE next:FAIL
	275	EVAL AB:FAIL
	276	CURLYX end:FAIL
	277	WHILEM A_pre,A_min,A_max,B_min,B_max:FAIL
	278	BRANCH next:FAIL
	279	CURLYM A,B:FAIL
	280	IFMATCH A:FAIL
	281	CURLY B_min_known,B_min,B_max:FAIL
	282	COMMIT next:FAIL
	283	MARKPOINT next:FAIL
	284	SKIP next:FAIL
	285	CUTGROUP next:FAIL
	286	KEEPS next:FAIL

1

# regcomp.sym

2

#

3

# File has two sections, divided by a line of dashes '-'.

4

#

5

# Rows that are empty once #-comments are removed from the input are ignored

6

#

7

# First section is for regops, second section is for regmatch-states

8

#

9

# Note that the order in this file is important.

10

#

11

# Format for first section:

12

# NAME \s+ TYPE, arg-description [num-args] [flags] [longjump-len] ; DESCRIPTION

13

# flag <S> means is REGNODE_SIMPLE; flag <V> means is REGNODE_VARIES

14

#

15

#

16

# run perl regen.pl after editing this file

17

# Also update perlredebguts.pod

#* Exit points

END END, no ; End of program.

24

SUCCEED END, no ; Return from a subroutine, basically.

#* Anchors:

BOL BOL, no ; Match "" at beginning of line.

29

MBOL BOL, no ; Same, assuming multiline.

30

SBOL BOL, no ; Same, assuming singleline.

31

EOS EOL, no ; Match "" at end of string.

32

EOL EOL, no ; Match "" at end of line.

33

MEOL EOL, no ; Same, assuming multiline.

34

SEOL EOL, no ; Same, assuming singleline.

35

# The regops that have varieties that vary depending on the character set regex

36

# modifiers have to be ordered thusly: /d, /l, /u, /a, /aa. This is because code

37

# in regcomp.c uses the enum value of the modifier as an offset from the /d

38

# version. The complements must come after the non-complements.

39

# BOUND, ALNUM, SPACE, DIGIT, and their complements are affected, as well as

40

# EXACTF.

41

BOUND BOUND, no ; Match "" at any word boundary using native charset semantics for non-utf8

42

BOUNDL BOUND, no ; Match "" at any locale word boundary

43

BOUNDU BOUND, no ; Match "" at any word boundary using Unicode semantics

44

BOUNDA BOUND, no ; Match "" at any word boundary using ASCII semantics

45

# All NBOUND nodes are required by code in regexec.c to be greater than all BOUND ones

46

NBOUND NBOUND, no ; Match "" at any word non-boundary using native charset semantics for non-utf8

47

NBOUNDL NBOUND, no ; Match "" at any locale word non-boundary

48

NBOUNDU NBOUND, no ; Match "" at any word non-boundary using Unicode semantics

49

NBOUNDA NBOUND, no ; Match "" at any word non-boundary using ASCII semantics

50

GPOS GPOS, no ; Matches where last m//g left off.

51

52

#* [Special] alternatives:

53

54

REG_ANY REG_ANY, no 0 S ; Match any one character (except newline).

55

SANY REG_ANY, no 0 S ; Match any one character.

56

CANY REG_ANY, no 0 S ; Match any one byte.

57

ANYOF ANYOF, sv 0 S ; Match character in (or not in) this class, single char match only

58

59

# Order (within each group) of the below is important. See ordering comment

60

# above. The PLACEHOLDERn ones are wasting a value. Right now, we have plenty

61

# to spare, but these would be obvious candidates if ever we ran out of node

62

# types in a U8.

63

ALNUM ALNUM, no 0 S ; Match any alphanumeric character using native charset semantics for non-utf8

64

ALNUML ALNUM, no 0 S ; Match any alphanumeric char in locale

65

ALNUMU ALNUM, no 0 S ; Match any alphanumeric char using Unicode semantics

66

ALNUMA ALNUM, no 0 S ; Match [A-Za-z_0-9]

67

NALNUM NALNUM, no 0 S ; Match any non-alphanumeric character using native charset semantics for non-utf8

68

NALNUML NALNUM, no 0 S ; Match any non-alphanumeric char in locale

69

NALNUMU NALNUM, no 0 S ; Match any non-alphanumeric char using Unicode semantics

70

NALNUMA NALNUM, no 0 S ; Match [^A-Za-z_0-9]

71

SPACE SPACE, no 0 S ; Match any whitespace character using native charset semantics for non-utf8

72

SPACEL SPACE, no 0 S ; Match any whitespace char in locale

73

SPACEU SPACE, no 0 S ; Match any whitespace char using Unicode semantics

74

SPACEA SPACE, no 0 S ; Match [ \t\n\f\r]

75

NSPACE NSPACE, no 0 S ; Match any non-whitespace character using native charset semantics for non-utf8

76

NSPACEL NSPACE, no 0 S ; Match any non-whitespace char in locale

77

NSPACEU NSPACE, no 0 S ; Match any non-whitespace char using Unicode semantics

78

NSPACEA NSPACE, no 0 S ; Match [^ \t\n\f\r]

79

DIGIT DIGIT, no 0 S ; Match any numeric character using native charset semantics for non-utf8

80

DIGITL DIGIT, no 0 S ; Match any numeric character in locale

81

PLACEHOLDER1 NOTHING, no ; placeholder for missing DIGITU

82

DIGITA DIGIT, no 0 S ; Match [0-9]

83

NDIGIT NDIGIT, no 0 S ; Match any non-numeric character using native charset semantics for non-utf8

84

NDIGITL NDIGIT, no 0 S ; Match any non-numeric character in locale

85

PLACEHOLDER2 NOTHING, no ; placeholder for missing NDIGITU

86

NDIGITA NDIGIT, no 0 S ; Match [^0-9]

87

88

POSIXD POSIXD, none 0 S ; currently unused except as a placeholder

89

POSIXL POSIXD, none 0 S ; currently unused except as a placeholder

90

POSIXU POSIXD, none 0 S ; currently unused except as a placeholder

91

POSIXA POSIXD, none 0 S ; Some [[:class:]] under /a; the FLAGS field gives which one

92

NPOSIXD NPOSIXD, none 0 S ; currently unused except as a placeholder

93

NPOSIXL NPOSIXD, none 0 S ; currently unused except as a placeholder

94

NPOSIXU NPOSIXD, none 0 S ; currently unused except as a placeholder

95

NPOSIXA NPOSIXD, none 0 S ; complement of POSIXA, [[:^class:]]

96

# End of order is important (within groups)

97

98

CLUMP CLUMP, no 0 V ; Match any extended grapheme cluster sequence

#* Alternation

# BRANCH The set of branches constituting a single choice are hooked

103

# together with their "next" pointers, since precedence prevents

104

# anything being concatenated to any individual branch. The

105

# "next" pointer of the last BRANCH in a choice points to the

106

# thing following the whole choice. This is also where the

107

# final "next" pointer of each individual branch points; each

108

# branch starts with the operand node of a BRANCH node.

109

#

110

BRANCH BRANCH, node 0 V ; Match this alternative, or the next...

#*Back pointer

# BACK Normal "next" pointers all implicitly point forward; BACK

115

# exists to make loop structures possible.

116

# not used

117

BACK BACK, no 0 V ; Match "", "next" ptr points backward.

118

119

#*Literals - NOTE: the relative ordering of these types is important; do not change it

120

121

EXACT EXACT, str ; Match this string (preceded by length).

122

EXACTF EXACT, str ; Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len).

123

EXACTFL EXACT, str ; Match this string (not guaranteed to be folded) using /il rules (w/len).

124

EXACTFU EXACT, str ; Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len).

125

EXACTFA EXACT, str ; Match this string (not guaranteed to be folded) using /iaa rules (w/len).

126

EXACTFU_SS EXACT, str ; Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len).

127

EXACTFU_TRICKYFOLD EXACT, str ; Match this folded UTF-8 string using /iu rules

#*Do nothing types

NOTHING NOTHING, no ; Match empty string.

132

# A variant of above which delimits a group, thus stops optimizations

133

TAIL NOTHING, no ; Match empty string. Can jump here from outside.

#*Loops

# STAR,PLUS '?', and complex '*' and '+', are implemented as circular

138

# BRANCH structures using BACK. Simple cases (one character

139

# per match) are implemented with STAR and PLUS for speed

140

# and to minimize recursive plunges.

141

#

142

STAR STAR, node 0 V ; Match this (simple) thing 0 or more times.

143

PLUS PLUS, node 0 V ; Match this (simple) thing 1 or more times.

144

145

CURLY CURLY, sv 2 V ; Match this simple thing {n,m} times.

146

CURLYN CURLY, no 2 V ; Capture next-after-this simple thing

147

CURLYM CURLY, no 2 V ; Capture this medium-complex thing {n,m} times.

148

CURLYX CURLY, sv 2 V ; Match this complex thing {n,m} times.

149

150

# This terminator creates a loop structure for CURLYX

151

WHILEM WHILEM, no 0 V ; Do curly processing and see if rest matches.

#*Buffer related

# OPEN,CLOSE,GROUPP ...are numbered at compile time.

156

OPEN OPEN, num 1 ; Mark this point in input as start of #n.

157

CLOSE CLOSE, num 1 ; Analogous to OPEN.

158

159

REF REF, num 1 V ; Match some already matched string

160

REFF REF, num 1 V ; Match already matched string, folded using native charset semantics for non-utf8

161

REFFL REF, num 1 V ; Match already matched string, folded in loc.

162

# N?REFF[AU] could have been implemented using the FLAGS field of the

163

# regnode, but by having a separate node type, we can use the existing switch

164

# statement to avoid some tests

165

REFFU REF, num 1 V ; Match already matched string, folded using unicode semantics for non-utf8

166

REFFA REF, num 1 V ; Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII

167

168

#*Named references. Code in regcomp.c assumes that these all are after the numbered references

169

NREF REF, no-sv 1 V ; Match some already matched string

170

NREFF REF, no-sv 1 V ; Match already matched string, folded using native charset semantics for non-utf8

171

NREFFL REF, no-sv 1 V ; Match already matched string, folded in loc.

172

NREFFU REF, num 1 V ; Match already matched string, folded using unicode semantics for non-utf8

173

NREFFA REF, num 1 V ; Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII

174

175

IFMATCH BRANCHJ, off 1 . 2 ; Succeeds if the following matches.

176

UNLESSM BRANCHJ, off 1 . 2 ; Fails if the following matches.

177

SUSPEND BRANCHJ, off 1 V 1 ; "Independent" sub-RE.

178

IFTHEN BRANCHJ, off 1 V 1 ; Switch, should be preceded by switcher .

179

GROUPP GROUPP, num 1 ; Whether the group matched.

180

181

#*Support for long RE

182

183

LONGJMP LONGJMP, off 1 . 1 ; Jump far away.

184

BRANCHJ BRANCHJ, off 1 V 1 ; BRANCH with long offset.

#*The heavy worker

EVAL EVAL, evl 1 ; Execute some Perl code.

#*Modifiers

MINMOD MINMOD, no ; Next operator is not greedy.

193

LOGICAL LOGICAL, no ; Next opcode should set the flag only.

194

195

# This is not used yet

196

RENUM BRANCHJ, off 1 . 1 ; Group with independently numbered parens.

#*Trie Related

# Behave the same as A|LIST|OF|WORDS would. The '..C' variants have

201

# inline charclass data (ascii only), the 'C' store it in the structure.

202

# NOTE: the relative order of the TRIE-like regops is significant

203

204

TRIE TRIE, trie 1 ; Match many EXACT(F[ALU]?)? at once. flags==type

205

TRIEC TRIE,trie charclass ; Same as TRIE, but with embedded charclass data

206

207

# For start classes, contains an added fail table.

208

AHOCORASICK TRIE, trie 1 ; Aho Corasick stclass. flags==type

209

AHOCORASICKC TRIE,trie charclass ; Same as AHOCORASICK, but with embedded charclass data

210

211

#*Regex Subroutines

212

GOSUB GOSUB, num/ofs 2L ; recurse to paren arg1 at (signed) ofs arg2

213

GOSTART GOSTART, no ; recurse to start of pattern

214

215

#*Special conditionals

216

NGROUPP NGROUPP, no-sv 1 ; Whether the group matched.

217

INSUBP INSUBP, num 1 ; Whether we are in a specific recurse.

218

DEFINEP DEFINEP, none 1 ; Never execute directly.

219

220

#*Backtracking Verbs

221

ENDLIKE ENDLIKE, none ; Used only for the type field of verbs

222

OPFAIL ENDLIKE, none ; Same as (?!)

223

ACCEPT ENDLIKE, parno 1 ; Accepts the current matched string.

224

225

226

#*Verbs With Arguments

227

VERB VERB, no-sv 1 ; Used only for the type field of verbs

228

PRUNE VERB, no-sv 1 ; Pattern fails at this startpoint if no-backtracking through this

229

MARKPOINT VERB, no-sv 1 ; Push the current location for rollback by cut.

230

SKIP VERB, no-sv 1 ; On failure skip forward (to the mark) before retrying

231

COMMIT VERB, no-sv 1 ; Pattern fails outright if backtracking through this

232

CUTGROUP VERB, no-sv 1 ; On failure go to the next alternation in the group

233

234

#*Control what to keep in $&.

235

KEEPS KEEPS, no ; $& begins here.

236

237

#*New charclass like patterns

238

LNBREAK LNBREAK, none ; generic newline pattern

239

240

# regcomp.c expects the node number of the complement to be one greater than

241

# the non-complement

242

VERTWS VERTWS, none 0 S ; vertical whitespace (Perl 6)

243

NVERTWS NVERTWS, none 0 S ; not vertical whitespace (Perl 6)

244

HORIZWS HORIZWS, none 0 S ; horizontal whitespace (Perl 6)

245

NHORIZWS NHORIZWS, none 0 S ; not horizontal whitespace (Perl 6)

246

247

# NEW STUFF SOMEWHERE ABOVE THIS LINE

248

249

################################################################################

#*SPECIAL REGOPS

# This is not really a node, but an optimized away piece of a "long" node.

254

# To simplify debugging output, we mark it as if it were a node

255

OPTIMIZED NOTHING, off ; Placeholder for dump.

256

257

# Special opcode with the property that no opcode in a compiled program

258

# will ever be of this type. Thus it can be used as a flag value that

259

# no other opcode has been seen. END is used similarly, in that an END

260

# node can't be optimized. So END implies "unoptimizable" and PSEUDO means

261

# "not seen anything to optimize yet".

262

PSEUDO PSEUDO, off ; Pseudo opcode for internal use.

263

264

-------------------------------------------------------------------------------

265

# Format for second section:

266

# REGOP \t typelist [ \t typelist] [# Comment]

# typelist= namelist

# = namelist:FAIL

# = name:count

# Anything below is a state

#

#

TRIE next:FAIL

EVAL AB:FAIL

CURLYX end:FAIL

WHILEM A_pre,A_min,A_max,B_min,B_max:FAIL

BRANCH next:FAIL

CURLYM A,B:FAIL

IFMATCH A:FAIL

CURLY B_min_known,B_min,B_max:FAIL

COMMIT next:FAIL

MARKPOINT next:FAIL

SKIP next:FAIL

CUTGROUP next:FAIL

KEEPS next:FAIL