This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
msgrcv: properly downgrade the receive buffer
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
f65819ce
CO
5 * One Ring to rule them all, One Ring to find them
6 *
4ac71550
TC
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
b992490d 83#include "invlist_inline.h"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
526e4b9d
KW
86static const char b_utf8_locale_required[] =
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong."
88 " Assuming a UTF-8 locale";
89
90#define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND \
91 STMT_START { \
92 if (! IN_UTF8_CTYPE_LOCALE) { \
93 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
94 b_utf8_locale_required); \
95 } \
96 } STMT_END
bbac6b20 97
25f81fd5 98static const char sets_utf8_locale_required[] =
a0bd1a30
KW
99 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
100
b272adb4
KW
101#define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n) \
102 STMT_START { \
103 if (! IN_UTF8_CTYPE_LOCALE && ANYOFL_UTF8_LOCALE_REQD(FLAGS(n))) { \
104 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
105 sets_utf8_locale_required); \
106 } \
107 } STMT_END
108
e1cf74e3
CB
109#ifdef DEBUGGING
110/* At least one required character in the target string is expressible only in
111 * UTF-8. */
e7a474c0 112static const char non_utf8_target_but_utf8_required[]
e1cf74e3
CB
113 = "Can't match, because target string needs to be in UTF-8\n";
114#endif
115
7b031478 116#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
6ad9a8ab 117 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
7b031478 118 goto target; \
e1cf74e3
CB
119} STMT_END
120
a687059c
LW
121#ifndef STATIC
122#define STATIC static
123#endif
124
c277df42
IZ
125/*
126 * Forwards.
127 */
128
f2ed9b32 129#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 130
3dab1dad 131#define HOPc(pos,off) \
ba44c216 132 (char *)(reginfo->is_utf8_target \
220db18a 133 ? reghop3((U8*)pos, off, \
9d9163fb 134 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad 135 : (U8*)(pos + off))
557f47af 136
bb152a4b
DM
137/* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
138#define HOPBACK3(pos, off, lim) \
139 (reginfo->is_utf8_target \
140 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
141 : (pos - off >= lim) \
142 ? (U8*)pos - off \
3dab1dad 143 : NULL)
efb30f32 144
bb152a4b
DM
145#define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
146
ba44c216 147#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 148#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 149
557f47af
DM
150/* lim must be +ve. Returns NULL on overshoot */
151#define HOPMAYBE3(pos,off,lim) \
152 (reginfo->is_utf8_target \
153 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
154 : ((U8*)pos + off <= lim) \
155 ? (U8*)pos + off \
156 : NULL)
157
8e9f2289
DM
158/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
159 * off must be >=0; args should be vars rather than expressions */
160#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
161 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
162 : (U8*)((pos + off) > lim ? lim : (pos + off)))
67853908 163#define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
8e9f2289 164
2974eaec
DM
165#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
166 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
167 : (U8*)(pos + off))
168#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb 169
c7304fe2 170#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
171/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
172
5f80c4cf 173/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
174/* it would be nice to rework regcomp.sym to generate this stuff. sigh
175 *
176 * NOTE that *nothing* that affects backtracking should be in here, specifically
177 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
178 * node that is in between two EXACT like nodes when ascertaining what the required
179 * "follow" character is. This should probably be moved to regex compile time
180 * although it may be done at run time beause of the REF possibility - more
181 * investigation required. -- demerphq
182*/
baa60164
KW
183#define JUMPABLE(rn) ( \
184 OP(rn) == OPEN || \
24be3102
YO
185 (OP(rn) == CLOSE && \
186 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
baa60164
KW
187 OP(rn) == EVAL || \
188 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
189 OP(rn) == PLUS || OP(rn) == MINMOD || \
190 OP(rn) == KEEPS || \
191 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 192)
ee9b8eae 193#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 194
ee9b8eae
YO
195#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
196
a84d97b6
HS
197/*
198 Search for mandatory following text node; for lookahead, the text must
199 follow but for lookbehind (rn->flags != 0) we skip to the next step.
200*/
baa60164 201#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
202 while (JUMPABLE(rn)) { \
203 const OPCODE type = OP(rn); \
204 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 205 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 206 else if (type == PLUS) \
cca55fe3 207 rn = NEXTOPER(rn); \
3dab1dad 208 else if (type == IFMATCH) \
a84d97b6 209 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 210 else rn += NEXT_OFF(rn); \
3dab1dad 211 } \
5f80c4cf 212} STMT_END
74750237 213
006f26b2
DM
214#define SLAB_FIRST(s) (&(s)->states[0])
215#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
216
a75351a1 217static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 218static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 219static regmatch_state * S_push_slab(pTHX);
51371543 220
87c0511b 221#define REGCP_PAREN_ELEMS 3
f067efbf 222#define REGCP_OTHER_ELEMS 3
e0fa7e2b 223#define REGCP_FRAME_ELEMS 1
620d5b66
NC
224/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
225 * are needed for the regexp context stack bookkeeping. */
226
76e3520e 227STATIC CHECKPOINT
21553840 228S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
a0d0e21e 229{
a3b680e6 230 const int retval = PL_savestack_ix;
92da3157
DM
231 const int paren_elems_to_push =
232 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
233 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
234 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 235 I32 p;
271b36b1 236 DECLARE_AND_GET_RE_DEBUG_FLAGS;
a0d0e21e 237
b93070ed
DM
238 PERL_ARGS_ASSERT_REGCPPUSH;
239
e49a9654 240 if (paren_elems_to_push < 0)
e8a85d26
JH
241 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
242 (int)paren_elems_to_push, (int)maxopenparen,
243 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
e49a9654 244
e0fa7e2b 245 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
147e3846 246 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
5df417d0 247 " out of range (%lu-%ld)",
92da3157
DM
248 total_elems,
249 (unsigned long)maxopenparen,
250 (long)parenfloor);
e0fa7e2b 251
620d5b66 252 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 253
495f47a5 254 DEBUG_BUFFERS_r(
92da3157 255 if ((int)maxopenparen > (int)parenfloor)
2b1a3689 256 Perl_re_exec_indentf( aTHX_
147e3846 257 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
2b1a3689
YO
258 depth,
259 PTR2UV(rex),
495f47a5
DM
260 PTR2UV(rex->offs)
261 );
262 );
92da3157 263 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 264/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
265 SSPUSHIV(rex->offs[p].end);
266 SSPUSHIV(rex->offs[p].start);
1ca2007e 267 SSPUSHINT(rex->offs[p].start_tmp);
2b1a3689 268 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
147e3846 269 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
2b1a3689
YO
270 depth,
271 (UV)p,
495f47a5
DM
272 (IV)rex->offs[p].start,
273 (IV)rex->offs[p].start_tmp,
274 (IV)rex->offs[p].end
40a82448 275 ));
a0d0e21e 276 }
b1ce53c5 277/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 278 SSPUSHINT(maxopenparen);
b93070ed
DM
279 SSPUSHINT(rex->lastparen);
280 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 281 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 282
a0d0e21e
LW
283 return retval;
284}
285
c277df42 286/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
287#define REGCP_SET(cp) \
288 DEBUG_STATE_r( \
147e3846
KW
289 Perl_re_exec_indentf( aTHX_ \
290 "Setting an EVAL scope, savestack=%" IVdf ",\n", \
cb41e5d6
YO
291 depth, (IV)PL_savestack_ix \
292 ) \
293 ); \
ab3bbdeb 294 cp = PL_savestack_ix
c3464db5 295
ab3bbdeb 296#define REGCP_UNWIND(cp) \
e4f74956 297 DEBUG_STATE_r( \
cb41e5d6 298 if (cp != PL_savestack_ix) \
147e3846
KW
299 Perl_re_exec_indentf( aTHX_ \
300 "Clearing an EVAL scope, savestack=%" \
301 IVdf "..%" IVdf "\n", \
cb41e5d6
YO
302 depth, (IV)(cp), (IV)PL_savestack_ix \
303 ) \
304 ); \
ab3bbdeb 305 regcpblow(cp)
c277df42 306
8e9ed7cc
DM
307/* set the start and end positions of capture ix */
308#define CLOSE_CAPTURE(ix, s, e) \
309 rex->offs[ix].start = s; \
310 rex->offs[ix].end = e; \
3be2a9fd
DM
311 if (ix > rex->lastparen) \
312 rex->lastparen = ix; \
313 rex->lastcloseparen = ix; \
69cd2617 314 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
befca383 315 "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
69cd2617
DM
316 depth, \
317 PTR2UV(rex), \
318 PTR2UV(rex->offs), \
8e9ed7cc
DM
319 (UV)ix, \
320 (IV)rex->offs[ix].start, \
3be2a9fd
DM
321 (IV)rex->offs[ix].end, \
322 (UV)rex->lastparen \
69cd2617
DM
323 ))
324
a8d1f4b4 325#define UNWIND_PAREN(lp, lcp) \
befca383
DM
326 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
327 "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
328 depth, \
329 PTR2UV(rex), \
330 PTR2UV(rex->offs), \
331 (UV)(lp), \
332 (UV)(rex->lastparen), \
333 (UV)(lcp) \
334 )); \
a8d1f4b4
DM
335 for (n = rex->lastparen; n > lp; n--) \
336 rex->offs[n].end = -1; \
337 rex->lastparen = n; \
338 rex->lastcloseparen = lcp;
339
340
f067efbf 341STATIC void
21553840 342S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
a0d0e21e 343{
e0fa7e2b 344 UV i;
87c0511b 345 U32 paren;
271b36b1 346 DECLARE_AND_GET_RE_DEBUG_FLAGS;
a3621e74 347
7918f24d
NC
348 PERL_ARGS_ASSERT_REGCPPOP;
349
b1ce53c5 350 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 351 i = SSPOPUV;
e0fa7e2b
NC
352 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
353 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
354 rex->lastcloseparen = SSPOPINT;
355 rex->lastparen = SSPOPINT;
92da3157 356 *maxopenparen_p = SSPOPINT;
b1ce53c5 357
620d5b66 358 i -= REGCP_OTHER_ELEMS;
b1ce53c5 359 /* Now restore the parentheses context. */
495f47a5
DM
360 DEBUG_BUFFERS_r(
361 if (i || rex->lastparen + 1 <= rex->nparens)
2b1a3689 362 Perl_re_exec_indentf( aTHX_
147e3846 363 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
2b1a3689
YO
364 depth,
365 PTR2UV(rex),
495f47a5
DM
366 PTR2UV(rex->offs)
367 );
368 );
92da3157 369 paren = *maxopenparen_p;
620d5b66 370 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 371 SSize_t tmps;
1ca2007e 372 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
373 rex->offs[paren].start = SSPOPIV;
374 tmps = SSPOPIV;
b93070ed
DM
375 if (paren <= rex->lastparen)
376 rex->offs[paren].end = tmps;
2b1a3689 377 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
147e3846 378 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
2b1a3689
YO
379 depth,
380 (UV)paren,
495f47a5
DM
381 (IV)rex->offs[paren].start,
382 (IV)rex->offs[paren].start_tmp,
383 (IV)rex->offs[paren].end,
384 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 385 );
87c0511b 386 paren--;
a0d0e21e 387 }
daf18116 388#if 1
dafc8851
JH
389 /* It would seem that the similar code in regtry()
390 * already takes care of this, and in fact it is in
391 * a better location to since this code can #if 0-ed out
392 * but the code in regtry() is needed or otherwise tests
393 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
394 * (as of patchlevel 7877) will fail. Then again,
395 * this code seems to be necessary or otherwise
225593e1
DM
396 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
397 * --jhi updated by dapm */
b93070ed 398 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 399 if (i > *maxopenparen_p)
b93070ed
DM
400 rex->offs[i].start = -1;
401 rex->offs[i].end = -1;
2b1a3689 402 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
147e3846 403 " \\%" UVuf ": %s ..-1 undeffing\n",
2b1a3689
YO
404 depth,
405 (UV)i,
92da3157 406 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 407 ));
a0d0e21e 408 }
dafc8851 409#endif
a0d0e21e
LW
410}
411
74088413
DM
412/* restore the parens and associated vars at savestack position ix,
413 * but without popping the stack */
414
415STATIC void
21553840 416S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
74088413
DM
417{
418 I32 tmpix = PL_savestack_ix;
85882954
YO
419 PERL_ARGS_ASSERT_REGCP_RESTORE;
420
74088413 421 PL_savestack_ix = ix;
21553840 422 regcppop(rex, maxopenparen_p);
74088413
DM
423 PL_savestack_ix = tmpix;
424}
425
02db2b7b 426#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 427
9637d2a5
CB
428#ifndef PERL_IN_XSUB_RE
429
24e16d7b
KW
430bool
431Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
31c7f561
KW
432{
433 /* Returns a boolean as to whether or not 'character' is a member of the
434 * Posix character class given by 'classnum' that should be equivalent to a
435 * value in the typedef '_char_class_number'.
436 *
437 * Ideally this could be replaced by a just an array of function pointers
438 * to the C library functions that implement the macros this calls.
439 * However, to compile, the precise function signatures are required, and
a3815e44 440 * these may vary from platform to platform. To avoid having to figure
31c7f561 441 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
442 * which adds an extra layer of function call overhead (unless the C
443 * optimizer strips it away). But we don't particularly care about
444 * performance with locales anyway. */
31c7f561
KW
445
446 switch ((_char_class_number) classnum) {
15861f94 447 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 448 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
449 case _CC_ENUM_ASCII: return isASCII_LC(character);
450 case _CC_ENUM_BLANK: return isBLANK_LC(character);
cee69f79 451 case _CC_ENUM_CASED: return isLOWER_LC(character)
b0d691b2 452 || isUPPER_LC(character);
e8d596e0 453 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
454 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
455 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
456 case _CC_ENUM_LOWER: return isLOWER_LC(character);
457 case _CC_ENUM_PRINT: return isPRINT_LC(character);
458 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 459 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
460 case _CC_ENUM_UPPER: return isUPPER_LC(character);
461 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 462 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
463 default: /* VERTSPACE should never occur in locales */
464 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
465 }
466
e5964223 467 NOT_REACHED; /* NOTREACHED */
31c7f561
KW
468 return FALSE;
469}
470
9637d2a5
CB
471#endif
472
53362e85
KW
473PERL_STATIC_INLINE I32
474S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
475{
476 /* Compare non-UTF-8 using Unicode (Latin1) semantics. s2 must already be
477 * folded. Works on all folds representable without UTF-8, except for
478 * LATIN_SMALL_LETTER_SHARP_S, and does not check for this. Nor does it
479 * check that the strings each have at least 'len' characters.
480 *
481 * There is almost an identical API function where s2 need not be folded:
482 * Perl_foldEQ_latin1() */
483
484 const U8 *a = (const U8 *)s1;
485 const U8 *b = (const U8 *)s2;
486
487 PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
488
489 assert(len >= 0);
490
491 while (len--) {
492 assert(! isUPPER_L1(*b));
493 if (toLOWER_L1(*a) != *b) {
494 return 0;
495 }
496 a++, b++;
497 }
498 return 1;
499}
500
3018b823 501STATIC bool
a78c2fa6 502S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
3018b823
KW
503{
504 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
505 * 'character' is a member of the Posix character class given by 'classnum'
506 * that should be equivalent to a value in the typedef
507 * '_char_class_number'.
508 *
509 * This just calls isFOO_lc on the code point for the character if it is in
2f306ab9 510 * the range 0-255. Outside that range, all characters use Unicode
3018b823 511 * rules, ignoring any locale. So use the Unicode function if this class
4c404f26 512 * requires an inversion list, and use the Unicode macro otherwise. */
3018b823 513
1565c085 514
3018b823
KW
515 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
516
517 if (UTF8_IS_INVARIANT(*character)) {
518 return isFOO_lc(classnum, *character);
519 }
520 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
521 return isFOO_lc(classnum,
a62b247b 522 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
523 }
524
a78c2fa6 525 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
613abc6d 526
3018b823 527 switch ((_char_class_number) classnum) {
779cf272 528 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
3018b823
KW
529 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
530 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
531 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
4d3d8522
KW
532 default:
533 return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
a78c2fa6 534 utf8_to_uvchr_buf(character, e, NULL));
3018b823
KW
535 }
536
e1ee3960 537 return FALSE; /* Things like CNTRL are always below 256 */
3018b823
KW
538}
539
2b1f9c71
KW
540STATIC U8 *
541S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
ab1efbdc
KW
542{
543 /* Returns the position of the first byte in the sequence between 's' and
544 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
545 * */
546
547 PERL_ARGS_ASSERT_FIND_SPAN_END;
548
549 assert(send >= s);
550
551 if ((STRLEN) (send - s) >= PERL_WORDSIZE
552 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
553 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
554 {
555 PERL_UINTMAX_T span_word;
556
557 /* Process per-byte until reach word boundary. XXX This loop could be
558 * eliminated if we knew that this platform had fast unaligned reads */
559 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
560 if (*s != span_byte) {
561 return s;
562 }
563 s++;
564 }
565
566 /* Create a word filled with the bytes we are spanning */
567 span_word = PERL_COUNT_MULTIPLIER * span_byte;
568
569 /* Process per-word as long as we have at least a full word left */
570 do {
571
572 /* Keep going if the whole word is composed of 'span_byte's */
573 if ((* (PERL_UINTMAX_T *) s) == span_word) {
574 s += PERL_WORDSIZE;
575 continue;
576 }
577
23a7ee81
KW
578 /* Here, at least one byte in the word isn't 'span_byte'. */
579
580#ifdef EBCDIC
581
582 break;
583
584#else
585
586 /* This xor leaves 1 bits only in those non-matching bytes */
ab1efbdc
KW
587 span_word ^= * (PERL_UINTMAX_T *) s;
588
589 /* Make sure the upper bit of each non-matching byte is set. This
590 * makes each such byte look like an ASCII platform variant byte */
591 span_word |= span_word << 1;
592 span_word |= span_word << 2;
593 span_word |= span_word << 4;
594
595 /* That reduces the problem to what this function solves */
73f0a2eb 596 return s + variant_byte_number(span_word);
ab1efbdc 597
23a7ee81
KW
598#endif
599
ab1efbdc
KW
600 } while (s + PERL_WORDSIZE <= send);
601 }
602
603 /* Process the straggler bytes beyond the final word boundary */
604 while (s < send) {
605 if (*s != span_byte) {
606 return s;
607 }
608 s++;
609 }
610
611 return s;
612}
613
2b1f9c71
KW
614STATIC U8 *
615S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
2813d4ad
KW
616{
617 /* Returns the position of the first byte in the sequence between 's'
618 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
619 * returns 'send' if none found. It uses word-level operations instead of
620 * byte to speed up the process */
621
622 PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
623
624 assert(send >= s);
625 assert((byte & mask) == byte);
626
23a7ee81
KW
627#ifndef EBCDIC
628
2813d4ad
KW
629 if ((STRLEN) (send - s) >= PERL_WORDSIZE
630 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
631 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
632 {
1d8aafa0 633 PERL_UINTMAX_T word, mask_word;
2813d4ad
KW
634
635 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
2b1f9c71 636 if (((*s) & mask) == byte) {
2813d4ad
KW
637 return s;
638 }
639 s++;
640 }
641
1d8aafa0
KW
642 word = PERL_COUNT_MULTIPLIER * byte;
643 mask_word = PERL_COUNT_MULTIPLIER * mask;
2813d4ad
KW
644
645 do {
646 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
647
1d8aafa0
KW
648 /* If 'masked' contains bytes with the bit pattern of 'byte' within
649 * it, xoring with 'word' will leave each of the 8 bits in such
650 * bytes be 0, and no byte containing any other bit pattern will be
651 * 0. */
652 masked ^= word;
2813d4ad
KW
653
654 /* This causes the most significant bit to be set to 1 for any
655 * bytes in the word that aren't completely 0 */
656 masked |= masked << 1;
657 masked |= masked << 2;
658 masked |= masked << 4;
659
660 /* The msbits are the same as what marks a byte as variant, so we
661 * can use this mask. If all msbits are 1, the word doesn't
662 * contain 'byte' */
663 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
664 s += PERL_WORDSIZE;
665 continue;
666 }
667
668 /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
669 * and any that are, are 0. Complement and re-AND to swap that */
670 masked = ~ masked;
671 masked &= PERL_VARIANTS_WORD_MASK;
672
673 /* This reduces the problem to that solved by this function */
73f0a2eb 674 s += variant_byte_number(masked);
2813d4ad
KW
675 return s;
676
677 } while (s + PERL_WORDSIZE <= send);
678 }
679
23a7ee81
KW
680#endif
681
2813d4ad 682 while (s < send) {
2b1f9c71 683 if (((*s) & mask) == byte) {
2813d4ad
KW
684 return s;
685 }
686 s++;
687 }
688
689 return s;
690}
691
070e8b2e
KW
692STATIC U8 *
693S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
694{
695 /* Returns the position of the first byte in the sequence between 's' and
696 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
697 * 'span_byte' should have been ANDed with 'mask' in the call of this
698 * function. Returns 'send' if none found. Works like find_span_end(),
699 * except for the AND */
700
701 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
702
703 assert(send >= s);
704 assert((span_byte & mask) == span_byte);
705
706 if ((STRLEN) (send - s) >= PERL_WORDSIZE
707 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
708 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
709 {
710 PERL_UINTMAX_T span_word, mask_word;
711
712 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
2b1f9c71 713 if (((*s) & mask) != span_byte) {
070e8b2e
KW
714 return s;
715 }
716 s++;
717 }
718
719 span_word = PERL_COUNT_MULTIPLIER * span_byte;
720 mask_word = PERL_COUNT_MULTIPLIER * mask;
721
722 do {
723 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
724
725 if (masked == span_word) {
726 s += PERL_WORDSIZE;
727 continue;
728 }
729
23a7ee81
KW
730#ifdef EBCDIC
731
732 break;
733
734#else
735
070e8b2e
KW
736 masked ^= span_word;
737 masked |= masked << 1;
738 masked |= masked << 2;
739 masked |= masked << 4;
73f0a2eb 740 return s + variant_byte_number(masked);
070e8b2e 741
23a7ee81
KW
742#endif
743
070e8b2e
KW
744 } while (s + PERL_WORDSIZE <= send);
745 }
746
747 while (s < send) {
2b1f9c71 748 if (((*s) & mask) != span_byte) {
070e8b2e
KW
749 return s;
750 }
751 s++;
752 }
753
754 return s;
755}
756
a687059c 757/*
e50aee73 758 * pregexec and friends
a687059c
LW
759 */
760
76234dfb 761#ifndef PERL_IN_XSUB_RE
a687059c 762/*
c277df42 763 - pregexec - match a regexp against a string
a687059c 764 */
c277df42 765I32
5aaab254 766Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 767 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
768/* stringarg: the point in the string at which to begin matching */
769/* strend: pointer to null at end of string */
770/* strbeg: real beginning of string */
771/* minend: end of match must be >= minend bytes after stringarg. */
772/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
773 * itself is accessed via the pointers above */
774/* nosave: For optimizations. */
c277df42 775{
7918f24d
NC
776 PERL_ARGS_ASSERT_PREGEXEC;
777
c277df42 778 return
9041c2e3 779 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
780 nosave ? 0 : REXEC_COPY_STR);
781}
76234dfb 782#endif
22e551b9 783
cad2e5aa 784
6eb5f6b9 785
1a4edc3c
DM
786/* re_intuit_start():
787 *
788 * Based on some optimiser hints, try to find the earliest position in the
789 * string where the regex could match.
790 *
791 * rx: the regex to match against
792 * sv: the SV being matched: only used for utf8 flag; the string
793 * itself is accessed via the pointers below. Note that on
794 * something like an overloaded SV, SvPOK(sv) may be false
795 * and the string pointers may point to something unrelated to
796 * the SV itself.
797 * strbeg: real beginning of string
798 * strpos: the point in the string at which to begin matching
799 * strend: pointer to the byte following the last char of the string
800 * flags currently unused; set to 0
801 * data: currently unused; set to NULL
802 *
803 * The basic idea of re_intuit_start() is to use some known information
804 * about the pattern, namely:
805 *
806 * a) the longest known anchored substring (i.e. one that's at a
807 * constant offset from the beginning of the pattern; but not
808 * necessarily at a fixed offset from the beginning of the
809 * string);
810 * b) the longest floating substring (i.e. one that's not at a constant
811 * offset from the beginning of the pattern);
812 * c) Whether the pattern is anchored to the string; either
813 * an absolute anchor: /^../, or anchored to \n: /^.../m,
814 * or anchored to pos(): /\G/;
815 * d) A start class: a real or synthetic character class which
816 * represents which characters are legal at the start of the pattern;
817 *
818 * to either quickly reject the match, or to find the earliest position
819 * within the string at which the pattern might match, thus avoiding
820 * running the full NFA engine at those earlier locations, only to
821 * eventually fail and retry further along.
822 *
823 * Returns NULL if the pattern can't match, or returns the address within
824 * the string which is the earliest place the match could occur.
825 *
826 * The longest of the anchored and floating substrings is called 'check'
827 * and is checked first. The other is called 'other' and is checked
828 * second. The 'other' substring may not be present. For example,
829 *
830 * /(abc|xyz)ABC\d{0,3}DEFG/
831 *
832 * will have
833 *
834 * check substr (float) = "DEFG", offset 6..9 chars
835 * other substr (anchored) = "ABC", offset 3..3 chars
836 * stclass = [ax]
837 *
838 * Be aware that during the course of this function, sometimes 'anchored'
839 * refers to a substring being anchored relative to the start of the
840 * pattern, and sometimes to the pattern itself being anchored relative to
841 * the string. For example:
842 *
843 * /\dabc/: "abc" is anchored to the pattern;
844 * /^\dabc/: "abc" is anchored to the pattern and the string;
845 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
846 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
847 * but the pattern is anchored to the string.
52a21eb3
DM
848 */
849
cad2e5aa 850char *
52a21eb3
DM
851Perl_re_intuit_start(pTHX_
852 REGEXP * const rx,
853 SV *sv,
854 const char * const strbeg,
855 char *strpos,
856 char *strend,
857 const U32 flags,
858 re_scream_pos_data *data)
cad2e5aa 859{
8d919b0a 860 struct regexp *const prog = ReANY(rx);
b2ad2123
DM
861 SSize_t start_shift = prog->check_offset_min;
862 /* Should be nonnegative! */
863 SSize_t end_shift = 0;
0fc004dd
DM
864 /* current lowest pos in string where the regex can start matching */
865 char *rx_origin = strpos;
eb578fdb 866 SV *check;
f2ed9b32 867 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 868 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 869 bool ml_anch = 0;
8f4bf5fc 870 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 871 char *check_at = NULL; /* check substr found at this pos */
bbe252da 872 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 873 RXi_GET_DECL(prog,progi);
02d5137b
DM
874 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
875 regmatch_info *const reginfo = &reginfo_buf;
271b36b1 876 DECLARE_AND_GET_RE_DEBUG_FLAGS;
a3621e74 877
7918f24d 878 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
879 PERL_UNUSED_ARG(flags);
880 PERL_UNUSED_ARG(data);
7918f24d 881
6ad9a8ab 882 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0
DM
883 "Intuit: trying to determine minimum start position...\n"));
884
fb9bbddb 885 /* for now, assume that all substr offsets are positive. If at some point
f67a5002 886 * in the future someone wants to do clever things with lookbehind and
fb9bbddb
DM
887 * -ve offsets, they'll need to fix up any code in this function
888 * which uses these offsets. See the thread beginning
889 * <20140113145929.GF27210@iabyn.com>
890 */
891 assert(prog->substrs->data[0].min_offset >= 0);
892 assert(prog->substrs->data[0].max_offset >= 0);
893 assert(prog->substrs->data[1].min_offset >= 0);
894 assert(prog->substrs->data[1].max_offset >= 0);
895 assert(prog->substrs->data[2].min_offset >= 0);
896 assert(prog->substrs->data[2].max_offset >= 0);
897
f7022b5a 898 /* for now, assume that if both present, that the floating substring
83f2232d 899 * doesn't start before the anchored substring.
f7022b5a
DM
900 * If you break this assumption (e.g. doing better optimisations
901 * with lookahead/behind), then you'll need to audit the code in this
902 * function carefully first
903 */
904 assert(
905 ! ( (prog->anchored_utf8 || prog->anchored_substr)
906 && (prog->float_utf8 || prog->float_substr))
907 || (prog->float_min_offset >= prog->anchored_offset));
908
1a4edc3c
DM
909 /* byte rather than char calculation for efficiency. It fails
910 * to quickly reject some cases that can't match, but will reject
911 * them later after doing full char arithmetic */
c344f387 912 if (prog->minlen > strend - strpos) {
6ad9a8ab 913 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 914 " String too short...\n"));
cad2e5aa 915 goto fail;
2c2d71f5 916 }
d8da0584 917
196a02af 918 RXp_MATCH_UTF8_set(prog, utf8_target);
6c3fea77 919 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 920 reginfo->info_aux = NULL;
9d9163fb 921 reginfo->strbeg = strbeg;
220db18a 922 reginfo->strend = strend;
aed7b151 923 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 924 reginfo->intuit = 1;
1cb48e53
DM
925 /* not actually used within intuit, but zero for safety anyway */
926 reginfo->poscache_maxiter = 0;
02d5137b 927
f2ed9b32 928 if (utf8_target) {
2814f4b3
HS
929 if ((!prog->anchored_utf8 && prog->anchored_substr)
930 || (!prog->float_utf8 && prog->float_substr))
33b8afdf
JH
931 to_utf8_substr(prog);
932 check = prog->check_utf8;
933 } else {
7e0d5ad7
KW
934 if (!prog->check_substr && prog->check_utf8) {
935 if (! to_byte_substr(prog)) {
6b54ddc5 936 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
937 }
938 }
33b8afdf
JH
939 check = prog->check_substr;
940 }
274cd312 941
1dc475d0
DM
942 /* dump the various substring data */
943 DEBUG_OPTIMISE_MORE_r({
944 int i;
945 for (i=0; i<=2; i++) {
946 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
947 : prog->substrs->data[i].substr);
948 if (!sv)
949 continue;
950
6ad9a8ab 951 Perl_re_printf( aTHX_
147e3846
KW
952 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
953 " useful=%" IVdf " utf8=%d [%s]\n",
1dc475d0
DM
954 i,
955 (IV)prog->substrs->data[i].min_offset,
956 (IV)prog->substrs->data[i].max_offset,
957 (IV)prog->substrs->data[i].end_shift,
958 BmUSEFUL(sv),
959 utf8_target ? 1 : 0,
960 SvPEEK(sv));
961 }
962 });
963
8e1490ee 964 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
9fc7410e
DM
965
966 /* ml_anch: check after \n?
967 *
0fa70a06 968 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
9fc7410e
DM
969 * with /.*.../, these flags will have been added by the
970 * compiler:
971 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
972 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
973 */
7d2d37f5
DM
974 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
975 && !(prog->intflags & PREGf_IMPLICIT);
cad2e5aa 976
343c8a29 977 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
c889ccc8
DM
978 /* we are only allowed to match at BOS or \G */
979
57fcbfa7 980 /* trivially reject if there's a BOS anchor and we're not at BOS.
7bb3b9eb
DM
981 *
982 * Note that we don't try to do a similar quick reject for
983 * \G, since generally the caller will have calculated strpos
984 * based on pos() and gofs, so the string is already correctly
985 * anchored by definition; and handling the exceptions would
986 * be too fiddly (e.g. REXEC_IGNOREPOS).
57fcbfa7 987 */
7bb3b9eb 988 if ( strpos != strbeg
d3d47aac 989 && (prog->intflags & PREGf_ANCH_SBOL))
c889ccc8 990 {
6ad9a8ab 991 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 992 " Not at start...\n"));
c889ccc8
DM
993 goto fail;
994 }
995
a5d12a4b
DM
996 /* in the presence of an anchor, the anchored (relative to the
997 * start of the regex) substr must also be anchored relative
66b7ec5c
DM
998 * to strpos. So quickly reject if substr isn't found there.
999 * This works for \G too, because the caller will already have
1000 * subtracted gofs from pos, and gofs is the offset from the
1001 * \G to the start of the regex. For example, in /.abc\Gdef/,
1002 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
1003 * caller will have set strpos=pos()-4; we look for the substr
1004 * at position pos()-4+1, which lines up with the "a" */
a5d12a4b 1005
33c28ab2 1006 if (prog->check_offset_min == prog->check_offset_max) {
c889ccc8 1007 /* Substring at constant offset from beg-of-str... */
b2ad2123 1008 SSize_t slen = SvCUR(check);
343c8a29 1009 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 1010
6ad9a8ab 1011 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1012 " Looking for check substr at fixed offset %" IVdf "...\n",
1dc475d0
DM
1013 (IV)prog->check_offset_min));
1014
7742aa66
DM
1015 if (SvTAIL(check)) {
1016 /* In this case, the regex is anchored at the end too.
1017 * Unless it's a multiline match, the lengths must match
b2ad2123 1018 * exactly, give or take a \n. NB: slen >= 1 since
7742aa66
DM
1019 * the last char of check is \n */
1020 if (!multiline
b2ad2123
DM
1021 && ( strend - s > slen
1022 || strend - s < slen - 1
1023 || (strend - s == slen && strend[-1] != '\n')))
c889ccc8 1024 {
6ad9a8ab 1025 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1026 " String too long...\n"));
c889ccc8
DM
1027 goto fail_finish;
1028 }
b2ad2123
DM
1029 /* Now should match s[0..slen-2] */
1030 slen--;
c889ccc8 1031 }
b2ad2123 1032 if (slen && (strend - s < slen
26fb2318 1033 || *SvPVX_const(check) != *s
b2ad2123 1034 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
d307bf57 1035 {
6ad9a8ab 1036 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1037 " String not equal...\n"));
d307bf57
DM
1038 goto fail_finish;
1039 }
c889ccc8
DM
1040
1041 check_at = s;
1042 goto success_at_start;
cad2e5aa 1043 }
cad2e5aa 1044 }
cad2e5aa 1045 }
0fc004dd 1046
b2ad2123 1047 end_shift = prog->check_end_shift;
cad2e5aa 1048
19188028 1049#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
b2ad2123 1050 if (end_shift < 0)
147e3846 1051 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
b2ad2123 1052 (IV)end_shift, RX_PRECOMP(rx));
2c2d71f5
JH
1053#endif
1054
2c2d71f5 1055 restart:
1de06328 1056
66b7ec5c
DM
1057 /* This is the (re)entry point of the main loop in this function.
1058 * The goal of this loop is to:
1059 * 1) find the "check" substring in the region rx_origin..strend
b2ad2123 1060 * (adjusted by start_shift / end_shift). If not found, reject
66b7ec5c
DM
1061 * immediately.
1062 * 2) If it exists, look for the "other" substr too if defined; for
1063 * example, if the check substr maps to the anchored substr, then
1064 * check the floating substr, and vice-versa. If not found, go
1065 * back to (1) with rx_origin suitably incremented.
1066 * 3) If we find an rx_origin position that doesn't contradict
1067 * either of the substrings, then check the possible additional
1068 * constraints on rx_origin of /^.../m or a known start class.
1069 * If these fail, then depending on which constraints fail, jump
1070 * back to here, or to various other re-entry points further along
1071 * that skip some of the first steps.
1072 * 4) If we pass all those tests, update the BmUSEFUL() count on the
1073 * substring. If the start position was determined to be at the
1074 * beginning of the string - so, not rejected, but not optimised,
1075 * since we have to run regmatch from position 0 - decrement the
1076 * BmUSEFUL() count. Otherwise increment it.
1077 */
1078
1a4edc3c
DM
1079
1080 /* first, look for the 'check' substring */
1081
1de06328 1082 {
c33e64f0
FC
1083 U8* start_point;
1084 U8* end_point;
c889ccc8 1085
c889ccc8 1086 DEBUG_OPTIMISE_MORE_r({
6ad9a8ab 1087 Perl_re_printf( aTHX_
147e3846
KW
1088 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1089 " Start shift: %" IVdf " End shift %" IVdf
1090 " Real end Shift: %" IVdf "\n",
675e93ee 1091 (IV)(rx_origin - strbeg),
c889ccc8 1092 (IV)prog->check_offset_min,
b2ad2123
DM
1093 (IV)start_shift,
1094 (IV)end_shift,
c889ccc8
DM
1095 (IV)prog->check_end_shift);
1096 });
1de06328 1097
b2ad2123 1098 end_point = HOPBACK3(strend, end_shift, rx_origin);
bb152a4b
DM
1099 if (!end_point)
1100 goto fail_finish;
b2ad2123 1101 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
33c28ab2
DM
1102 if (!start_point)
1103 goto fail_finish;
c889ccc8 1104
557f47af 1105
e0362b86 1106 /* If the regex is absolutely anchored to either the start of the
d3d47aac 1107 * string (SBOL) or to pos() (ANCH_GPOS), then
e0362b86
DM
1108 * check_offset_max represents an upper bound on the string where
1109 * the substr could start. For the ANCH_GPOS case, we assume that
1110 * the caller of intuit will have already set strpos to
1111 * pos()-gofs, so in this case strpos + offset_max will still be
1112 * an upper bound on the substr.
1113 */
c19c836a
DM
1114 if (!ml_anch
1115 && prog->intflags & PREGf_ANCH
e0362b86 1116 && prog->check_offset_max != SSize_t_MAX)
c19c836a 1117 {
b2ad2123 1118 SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
e0362b86
DM
1119 const char * const anchor =
1120 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
b2ad2123 1121 SSize_t targ_len = (char*)end_point - anchor;
2ce94a86 1122
b2ad2123 1123 if (check_len > targ_len) {
2ce94a86 1124 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
12453e29 1125 "Target string too short to match required substring...\n"));
2ce94a86
DM
1126 goto fail_finish;
1127 }
e0362b86
DM
1128
1129 /* do a bytes rather than chars comparison. It's conservative;
1130 * so it skips doing the HOP if the result can't possibly end
1131 * up earlier than the old value of end_point.
1132 */
b2ad2123
DM
1133 assert(anchor + check_len <= (char *)end_point);
1134 if (prog->check_offset_max + check_len < targ_len) {
e0362b86
DM
1135 end_point = HOP3lim((U8*)anchor,
1136 prog->check_offset_max,
b2ad2123 1137 end_point - check_len
2ce94a86 1138 )
b2ad2123 1139 + check_len;
12453e29
YO
1140 if (end_point < start_point)
1141 goto fail_finish;
e0362b86 1142 }
d6ef1678
DM
1143 }
1144
ae5d4331 1145 check_at = fbm_instr( start_point, end_point,
7fba1cd6 1146 check, multiline ? FBMrf_MULTILINE : 0);
c889ccc8 1147
6ad9a8ab 1148 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1149 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
675e93ee
DM
1150 (IV)((char*)start_point - strbeg),
1151 (IV)((char*)end_point - strbeg),
1152 (IV)(check_at ? check_at - strbeg : -1)
1153 ));
1154
8fd34720
DM
1155 /* Update the count-of-usability, remove useless subpatterns,
1156 unshift s. */
1157
1158 DEBUG_EXECUTE_r({
1159 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1160 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
6ad9a8ab 1161 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
8fd34720
DM
1162 (check_at ? "Found" : "Did not find"),
1163 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1164 ? "anchored" : "floating"),
1165 quoted,
1166 RE_SV_TAIL(check),
1167 (check_at ? " at offset " : "...\n") );
1168 });
2c2d71f5 1169
8fd34720
DM
1170 if (!check_at)
1171 goto fail_finish;
8fd34720
DM
1172 /* set rx_origin to the minimum position where the regex could start
1173 * matching, given the constraint of the just-matched check substring.
1174 * But don't set it lower than previously.
1175 */
fdc003fd 1176
8fd34720
DM
1177 if (check_at - rx_origin > prog->check_offset_max)
1178 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
675e93ee 1179 /* Finish the diagnostic message */
6ad9a8ab 1180 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1181 "%ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1182 (long)(check_at - strbeg),
1183 (IV)(rx_origin - strbeg)
1184 ));
8fd34720 1185 }
fdc003fd
DM
1186
1187
1a4edc3c 1188 /* now look for the 'other' substring if defined */
2c2d71f5 1189
fd8def15
HS
1190 if (prog->substrs->data[other_ix].utf8_substr
1191 || prog->substrs->data[other_ix].substr)
1de06328 1192 {
30944b6d 1193 /* Take into account the "other" substring. */
6c3343a6
DM
1194 char *last, *last1;
1195 char *s;
1196 SV* must;
1197 struct reg_substr_datum *other;
1198
1199 do_other_substr:
1200 other = &prog->substrs->data[other_ix];
fd8def15
HS
1201 if (!utf8_target && !other->substr) {
1202 if (!to_byte_substr(prog)) {
1203 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1204 }
1205 }
6c3343a6
DM
1206
1207 /* if "other" is anchored:
1208 * we've previously found a floating substr starting at check_at.
1209 * This means that the regex origin must lie somewhere
1210 * between min (rx_origin): HOP3(check_at, -check_offset_max)
1211 * and max: HOP3(check_at, -check_offset_min)
1212 * (except that min will be >= strpos)
1213 * So the fixed substr must lie somewhere between
1214 * HOP3(min, anchored_offset)
1215 * HOP3(max, anchored_offset) + SvCUR(substr)
1216 */
1217
1218 /* if "other" is floating
1219 * Calculate last1, the absolute latest point where the
1220 * floating substr could start in the string, ignoring any
1221 * constraints from the earlier fixed match. It is calculated
1222 * as follows:
1223 *
1224 * strend - prog->minlen (in chars) is the absolute latest
1225 * position within the string where the origin of the regex
1226 * could appear. The latest start point for the floating
1227 * substr is float_min_offset(*) on from the start of the
1228 * regex. last1 simply combines thee two offsets.
1229 *
1230 * (*) You might think the latest start point should be
1231 * float_max_offset from the regex origin, and technically
1232 * you'd be correct. However, consider
1233 * /a\d{2,4}bcd\w/
1234 * Here, float min, max are 3,5 and minlen is 7.
1235 * This can match either
1236 * /a\d\dbcd\w/
1237 * /a\d\d\dbcd\w/
1238 * /a\d\d\d\dbcd\w/
1239 * In the first case, the regex matches minlen chars; in the
1240 * second, minlen+1, in the third, minlen+2.
1241 * In the first case, the floating offset is 3 (which equals
1242 * float_min), in the second, 4, and in the third, 5 (which
1243 * equals float_max). In all cases, the floating string bcd
1244 * can never start more than 4 chars from the end of the
1245 * string, which equals minlen - float_min. As the substring
1246 * starts to match more than float_min from the start of the
1247 * regex, it makes the regex match more than minlen chars,
1248 * and the two cancel each other out. So we can always use
1249 * float_min - minlen, rather than float_max - minlen for the
1250 * latest position in the string.
1251 *
1252 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1253 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1254 */
1255
e7a14a9c 1256 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
1257 last1 = HOP3c(strend,
1258 other->min_offset - prog->minlen, strbeg);
1259
4d006249 1260 if (other_ix) {/* i.e. if (other-is-float) */
6c3343a6
DM
1261 /* last is the latest point where the floating substr could
1262 * start, *given* any constraints from the earlier fixed
1263 * match. This constraint is that the floating string starts
1264 * <= float_max_offset chars from the regex origin (rx_origin).
1265 * If this value is less than last1, use it instead.
eb3831ce 1266 */
6c3343a6
DM
1267 assert(rx_origin <= last1);
1268 last =
1269 /* this condition handles the offset==infinity case, and
1270 * is a short-cut otherwise. Although it's comparing a
1271 * byte offset to a char length, it does so in a safe way,
1272 * since 1 char always occupies 1 or more bytes,
1273 * so if a string range is (last1 - rx_origin) bytes,
1274 * it will be less than or equal to (last1 - rx_origin)
1275 * chars; meaning it errs towards doing the accurate HOP3
1276 * rather than just using last1 as a short-cut */
1277 (last1 - rx_origin) < other->max_offset
1278 ? last1
1279 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1280 }
1281 else {
b2ad2123
DM
1282 assert(strpos + start_shift <= check_at);
1283 last = HOP4c(check_at, other->min_offset - start_shift,
6c3343a6
DM
1284 strbeg, strend);
1285 }
ead917d0 1286
6c3343a6
DM
1287 s = HOP3c(rx_origin, other->min_offset, strend);
1288 if (s < other_last) /* These positions already checked */
1289 s = other_last;
1290
1291 must = utf8_target ? other->utf8_substr : other->substr;
1292 assert(SvPOK(must));
675e93ee
DM
1293 {
1294 char *from = s;
1295 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1296
71a9d105
DM
1297 if (to > strend)
1298 to = strend;
88203927
DM
1299 if (from > to) {
1300 s = NULL;
6ad9a8ab 1301 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1302 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
88203927
DM
1303 (IV)(from - strbeg),
1304 (IV)(to - strbeg)
1305 ));
1306 }
1307 else {
1308 s = fbm_instr(
1309 (unsigned char*)from,
1310 (unsigned char*)to,
1311 must,
1312 multiline ? FBMrf_MULTILINE : 0
1313 );
6ad9a8ab 1314 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1315 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
88203927
DM
1316 (IV)(from - strbeg),
1317 (IV)(to - strbeg),
1318 (IV)(s ? s - strbeg : -1)
1319 ));
1320 }
675e93ee
DM
1321 }
1322
6c3343a6
DM
1323 DEBUG_EXECUTE_r({
1324 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1325 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
6ad9a8ab 1326 Perl_re_printf( aTHX_ " %s %s substr %s%s",
6c3343a6
DM
1327 s ? "Found" : "Contradicts",
1328 other_ix ? "floating" : "anchored",
1329 quoted, RE_SV_TAIL(must));
1330 });
ead917d0 1331
ead917d0 1332
6c3343a6
DM
1333 if (!s) {
1334 /* last1 is latest possible substr location. If we didn't
1335 * find it before there, we never will */
1336 if (last >= last1) {
6ad9a8ab 1337 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee 1338 "; giving up...\n"));
6c3343a6 1339 goto fail_finish;
ead917d0
DM
1340 }
1341
6c3343a6
DM
1342 /* try to find the check substr again at a later
1343 * position. Maybe next time we'll find the "other" substr
1344 * in range too */
6c3343a6
DM
1345 other_last = HOP3c(last, 1, strend) /* highest failure */;
1346 rx_origin =
4d006249 1347 other_ix /* i.e. if other-is-float */
6c3343a6
DM
1348 ? HOP3c(rx_origin, 1, strend)
1349 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
6ad9a8ab 1350 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1351 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1352 (other_ix ? "floating" : "anchored"),
1353 (long)(HOP3c(check_at, 1, strend) - strbeg),
1354 (IV)(rx_origin - strbeg)
1355 ));
6c3343a6
DM
1356 goto restart;
1357 }
1358 else {
4d006249 1359 if (other_ix) { /* if (other-is-float) */
6c3343a6
DM
1360 /* other_last is set to s, not s+1, since its possible for
1361 * a floating substr to fail first time, then succeed
1362 * second time at the same floating position; e.g.:
1363 * "-AB--AABZ" =~ /\wAB\d*Z/
1364 * The first time round, anchored and float match at
1365 * "-(AB)--AAB(Z)" then fail on the initial \w character
1366 * class. Second time round, they match at "-AB--A(AB)(Z)".
1367 */
1368 other_last = s;
ead917d0
DM
1369 }
1370 else {
6c3343a6
DM
1371 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1372 other_last = HOP3c(s, 1, strend);
ead917d0 1373 }
6ad9a8ab 1374 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1375 " at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1376 (long)(s - strbeg),
1377 (IV)(rx_origin - strbeg)
1378 ));
1379
6c3343a6 1380 }
cad2e5aa 1381 }
acba93e8
DM
1382 else {
1383 DEBUG_OPTIMISE_MORE_r(
6ad9a8ab 1384 Perl_re_printf( aTHX_
147e3846
KW
1385 " Check-only match: offset min:%" IVdf " max:%" IVdf
1386 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1387 " strend:%" IVdf "\n",
acba93e8
DM
1388 (IV)prog->check_offset_min,
1389 (IV)prog->check_offset_max,
675e93ee
DM
1390 (IV)(check_at-strbeg),
1391 (IV)(rx_origin-strbeg),
1c1c599d 1392 (IV)(rx_origin-check_at),
675e93ee 1393 (IV)(strend-strbeg)
acba93e8
DM
1394 )
1395 );
1396 }
2c2d71f5 1397
acba93e8 1398 postprocess_substr_matches:
0991020e 1399
1a4edc3c 1400 /* handle the extra constraint of /^.../m if present */
e3c6feb0 1401
7d2d37f5 1402 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
4620cb61
DM
1403 char *s;
1404
6ad9a8ab 1405 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
a62659bd 1406 " looking for /^/m anchor"));
d0880ea7
DM
1407
1408 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1409 * Find the next \n, if any, even if it's beyond the current
1410 * anchored and/or floating substrings. Whether we should be
1411 * scanning ahead for the next \n or the next substr is debatable.
1412 * On the one hand you'd expect rare substrings to appear less
1413 * often than \n's. On the other hand, searching for \n means
675e93ee 1414 * we're effectively flipping between check_substr and "\n" on each
2e759faa
DM
1415 * iteration as the current "rarest" string candidate, which
1416 * means for example that we'll quickly reject the whole string if
1417 * hasn't got a \n, rather than trying every substr position
1418 * first
1419 */
d0880ea7 1420
4620cb61
DM
1421 s = HOP3c(strend, - prog->minlen, strpos);
1422 if (s <= rx_origin ||
1423 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1424 {
6ad9a8ab 1425 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
d0880ea7
DM
1426 " Did not find /%s^%s/m...\n",
1427 PL_colors[0], PL_colors[1]));
a62659bd
DM
1428 goto fail_finish;
1429 }
d0880ea7 1430
4ada1233
DM
1431 /* earliest possible origin is 1 char after the \n.
1432 * (since *rx_origin == '\n', it's safe to ++ here rather than
1433 * HOP(rx_origin, 1)) */
1434 rx_origin++;
d0880ea7 1435
f4f115de 1436 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1437 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1438 {
d0880ea7
DM
1439 /* Position contradicts check-string; either because
1440 * check was anchored (and thus has no wiggle room),
4ada1233 1441 * or check was float and rx_origin is above the float range */
6ad9a8ab 1442 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee
DM
1443 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1444 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
d0880ea7
DM
1445 goto restart;
1446 }
1447
1448 /* if we get here, the check substr must have been float,
2e759faa 1449 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1450 * "other" substr which still contradicts */
1451 assert(prog->substrs->check_ix); /* check is float */
1452
1453 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1454 /* whoops, the anchored "other" substr exists, so we still
1455 * contradict. On the other hand, the float "check" substr
1456 * didn't contradict, so just retry the anchored "other"
1457 * substr */
6ad9a8ab 1458 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1459 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
d0880ea7 1460 PL_colors[0], PL_colors[1],
73e8ff00
DM
1461 (IV)(rx_origin - strbeg + prog->anchored_offset),
1462 (IV)(rx_origin - strbeg)
675e93ee 1463 ));
d0880ea7
DM
1464 goto do_other_substr;
1465 }
1466
1467 /* success: we don't contradict the found floating substring
1468 * (and there's no anchored substr). */
6ad9a8ab 1469 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee
DM
1470 " Found /%s^%s/m with rx_origin %ld...\n",
1471 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
e3c6feb0
DM
1472 }
1473 else {
6ad9a8ab 1474 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
fe4f3442 1475 " (multiline anchor test skipped)\n"));
e3c6feb0
DM
1476 }
1477
ffad1e6a 1478 success_at_start:
e3c6feb0 1479
cad2e5aa 1480
dd170ff5
DM
1481 /* if we have a starting character class, then test that extra constraint.
1482 * (trie stclasses are too expensive to use here, we are better off to
1483 * leave it to regmatch itself) */
1484
f8fc2ecf 1485 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
f8fc2ecf 1486 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e 1487
b2ad2123 1488 /* XXX this value could be pre-computed */
7d769928 1489 const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362 1490 ? (reginfo->is_utf8_pat
7d769928
DM
1491 ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1492 : (SSize_t)STR_LEN(progi->regstclass))
66e933ab 1493 : 1);
1de06328 1494 char * endpos;
fa3bb21d 1495 char *s;
000dfd2d
DM
1496 /* latest pos that a matching float substr constrains rx start to */
1497 char *rx_max_float = NULL;
1498
c75a3985
DM
1499 /* if the current rx_origin is anchored, either by satisfying an
1500 * anchored substring constraint, or a /^.../m constraint, then we
1501 * can reject the current origin if the start class isn't found
1502 * at the current position. If we have a float-only match, then
1503 * rx_origin is constrained to a range; so look for the start class
1504 * in that range. if neither, then look for the start class in the
1505 * whole rest of the string */
1506
dd170ff5
DM
1507 /* XXX DAPM it's not clear what the minlen test is for, and why
1508 * it's not used in the floating case. Nothing in the test suite
1509 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1510 * Here are some old comments, which may or may not be correct:
1511 *
1512 * minlen == 0 is possible if regstclass is \b or \B,
1513 * and the fixed substr is ''$.
1514 * Since minlen is already taken into account, rx_origin+1 is
1515 * before strend; accidentally, minlen >= 1 guaranties no false
1516 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1517 * 0) below assumes that regstclass does not come from lookahead...
1518 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1519 * This leaves EXACTF-ish only, which are dealt with in
1520 * find_byclass().
1521 */
1522
7d2d37f5 1523 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
b2ad2123 1524 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
000dfd2d 1525 else if (prog->float_substr || prog->float_utf8) {
b2ad2123
DM
1526 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1527 endpos = HOP3clim(rx_max_float, cl_l, strend);
000dfd2d 1528 }
1de06328
YO
1529 else
1530 endpos= strend;
1531
6ad9a8ab 1532 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
b2ad2123 1533 " looking for class: start_shift: %" IVdf " check_at: %" IVdf
147e3846 1534 " rx_origin: %" IVdf " endpos: %" IVdf "\n",
b2ad2123 1535 (IV)start_shift, (IV)(check_at - strbeg),
c43b5520 1536 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
d8080198 1537
c43b5520 1538 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
f9176b44 1539 reginfo);
be778b1a 1540 if (!s) {
6eb5f6b9 1541 if (endpos == strend) {
6ad9a8ab 1542 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1dc475d0 1543 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1544 goto fail;
1545 }
6ad9a8ab 1546 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1dc475d0 1547 " This position contradicts STCLASS...\n") );
e0eb31e7
DM
1548 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1549 && !(prog->intflags & PREGf_IMPLICIT))
653099ff 1550 goto fail;
9fed8d02 1551
6eb5f6b9 1552 /* Contradict one of substrings */
97136c8a
DM
1553 if (prog->anchored_substr || prog->anchored_utf8) {
1554 if (prog->substrs->check_ix == 1) { /* check is float */
1555 /* Have both, check_string is floating */
b2ad2123
DM
1556 assert(rx_origin + start_shift <= check_at);
1557 if (rx_origin + start_shift != check_at) {
97136c8a 1558 /* not at latest position float substr could match:
c75a3985
DM
1559 * Recheck anchored substring, but not floating.
1560 * The condition above is in bytes rather than
1561 * chars for efficiency. It's conservative, in
1562 * that it errs on the side of doing 'goto
88203927
DM
1563 * do_other_substr'. In this case, at worst,
1564 * an extra anchored search may get done, but in
1565 * practice the extra fbm_instr() is likely to
1566 * get skipped anyway. */
6ad9a8ab 1567 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
147e3846 1568 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1569 (long)(other_last - strbeg),
1570 (IV)(rx_origin - strbeg)
1571 ));
97136c8a 1572 goto do_other_substr;
3369914b 1573 }
3369914b
DM
1574 }
1575 }
97136c8a 1576 else {
9fed8d02
DM
1577 /* float-only */
1578
7d2d37f5 1579 if (ml_anch) {
c75a3985
DM
1580 /* In the presence of ml_anch, we might be able to
1581 * find another \n without breaking the current float
1582 * constraint. */
1583
1584 /* strictly speaking this should be HOP3c(..., 1, ...),
1585 * but since we goto a block of code that's going to
1586 * search for the next \n if any, its safe here */
9fed8d02 1587 rx_origin++;
6ad9a8ab 1588 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
675e93ee 1589 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
9fed8d02 1590 PL_colors[0], PL_colors[1],
675e93ee 1591 (long)(rx_origin - strbeg)) );
9fed8d02 1592 goto postprocess_substr_matches;
ab60c45a 1593 }
c75a3985
DM
1594
1595 /* strictly speaking this can never be true; but might
1596 * be if we ever allow intuit without substrings */
1597 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
9fed8d02 1598 goto fail;
c75a3985 1599
000dfd2d 1600 rx_origin = rx_max_float;
9fed8d02
DM
1601 }
1602
c75a3985
DM
1603 /* at this point, any matching substrings have been
1604 * contradicted. Start again... */
1605
9fed8d02 1606 rx_origin = HOP3c(rx_origin, 1, strend);
557f47af
DM
1607
1608 /* uses bytes rather than char calculations for efficiency.
1609 * It's conservative: it errs on the side of doing 'goto restart',
1610 * where there is code that does a proper char-based test */
b2ad2123 1611 if (rx_origin + start_shift + end_shift > strend) {
6ad9a8ab 1612 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
9fed8d02
DM
1613 " Could not match STCLASS...\n") );
1614 goto fail;
1615 }
6ad9a8ab 1616 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
147e3846 1617 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
9fed8d02 1618 (prog->substrs->check_ix ? "floating" : "anchored"),
b2ad2123 1619 (long)(rx_origin + start_shift - strbeg),
675e93ee
DM
1620 (IV)(rx_origin - strbeg)
1621 ));
9fed8d02 1622 goto restart;
6eb5f6b9 1623 }
9fed8d02 1624
c75a3985
DM
1625 /* Success !!! */
1626
5f9c6575 1627 if (rx_origin != s) {
6ad9a8ab 1628 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1629 " By STCLASS: moving %ld --> %ld\n",
675e93ee 1630 (long)(rx_origin - strbeg), (long)(s - strbeg))
b7953727
JH
1631 );
1632 }
1633 else {
6ad9a8ab 1634 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1635 " Does not contradict STCLASS...\n");
b7953727
JH
1636 );
1637 }
6eb5f6b9 1638 }
ffad1e6a
DM
1639
1640 /* Decide whether using the substrings helped */
1641
1642 if (rx_origin != strpos) {
1643 /* Fixed substring is found far enough so that the match
1644 cannot start at strpos. */
1645
6ad9a8ab 1646 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
ffad1e6a
DM
1647 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1648 }
1649 else {
70563e16
DM
1650 /* The found rx_origin position does not prohibit matching at
1651 * strpos, so calling intuit didn't gain us anything. Decrement
1652 * the BmUSEFUL() count on the check substring, and if we reach
1653 * zero, free it. */
1654 if (!(prog->intflags & PREGf_NAUGHTY)
ffad1e6a
DM
1655 && (utf8_target ? (
1656 prog->check_utf8 /* Could be deleted already */
1657 && --BmUSEFUL(prog->check_utf8) < 0
1658 && (prog->check_utf8 == prog->float_utf8)
1659 ) : (
1660 prog->check_substr /* Could be deleted already */
1661 && --BmUSEFUL(prog->check_substr) < 0
1662 && (prog->check_substr == prog->float_substr)
1663 )))
1664 {
1665 /* If flags & SOMETHING - do not do it many times on the same match */
6ad9a8ab 1666 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
ffad1e6a
DM
1667 /* XXX Does the destruction order has to change with utf8_target? */
1668 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1669 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1670 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1671 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1672 check = NULL; /* abort */
ffad1e6a
DM
1673 /* XXXX This is a remnant of the old implementation. It
1674 looks wasteful, since now INTUIT can use many
1675 other heuristics. */
1676 prog->extflags &= ~RXf_USE_INTUIT;
ffad1e6a
DM
1677 }
1678 }
1679
6ad9a8ab 1680 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
ffad1e6a 1681 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
675e93ee 1682 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
ffad1e6a 1683
c765d6e0 1684 return rx_origin;
2c2d71f5
JH
1685
1686 fail_finish: /* Substring not found */
33b8afdf 1687 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1688 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1689 fail:
6ad9a8ab 1690 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
e4584336 1691 PL_colors[4], PL_colors[5]));
bd61b366 1692 return NULL;
cad2e5aa 1693}
9661b544 1694
70563e16 1695
a0a388a1 1696#define DECL_TRIE_TYPE(scan) \
e7fd4aa1 1697 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
a4525e78 1698 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
3ed3004a 1699 trie_utf8l, trie_flu8, trie_flu8_latin } \
e7fd4aa1
KW
1700 trie_type = ((scan->flags == EXACT) \
1701 ? (utf8_target ? trie_utf8 : trie_plain) \
a4525e78
KW
1702 : (scan->flags == EXACTL) \
1703 ? (utf8_target ? trie_utf8l : trie_plain) \
89829bb5 1704 : (scan->flags == EXACTFAA) \
a4525e78
KW
1705 ? (utf8_target \
1706 ? trie_utf8_exactfa_fold \
1707 : trie_latin_utf8_exactfa_fold) \
1708 : (scan->flags == EXACTFLU8 \
3ed3004a
YO
1709 ? (utf8_target \
1710 ? trie_flu8 \
1711 : trie_flu8_latin) \
a4525e78
KW
1712 : (utf8_target \
1713 ? trie_utf8_fold \
3ed3004a 1714 : trie_latin_utf8_fold)))
fab2782b 1715
9ad8cac4
KW
1716/* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1717 * 'foldbuf+sizeof(foldbuf)' */
1718#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1719STMT_START { \
fab2782b 1720 STRLEN skiplen; \
baa60164 1721 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1722 switch (trie_type) { \
a4525e78 1723 case trie_flu8: \
780fcc9f 1724 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
3ed3004a 1725 if (UTF8_IS_ABOVE_LATIN1(*uc)) { \
e1a2878a 1726 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
613abc6d 1727 } \
a4525e78 1728 goto do_trie_utf8_fold; \
31f05a37 1729 case trie_utf8_exactfa_fold: \
baa60164 1730 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1731 /* FALLTHROUGH */ \
fab2782b 1732 case trie_utf8_fold: \
a4525e78 1733 do_trie_utf8_fold: \
fab2782b 1734 if ( foldlen>0 ) { \
9ad8cac4 1735 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
fab2782b
YO
1736 foldlen -= len; \
1737 uscan += len; \
1738 len=0; \
1739 } else { \
9ad8cac4 1740 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \
a1a5ec35 1741 flags); \
695717ad 1742 len = UTF8_SAFE_SKIP(uc, uc_end); \
5f560d8a 1743 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1744 foldlen -= skiplen; \
1745 uscan = foldbuf + skiplen; \
1746 } \
1747 break; \
3ed3004a
YO
1748 case trie_flu8_latin: \
1749 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1750 goto do_trie_latin_utf8_fold; \
baa60164
KW
1751 case trie_latin_utf8_exactfa_fold: \
1752 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1753 /* FALLTHROUGH */ \
fab2782b 1754 case trie_latin_utf8_fold: \
3ed3004a 1755 do_trie_latin_utf8_fold: \
fab2782b 1756 if ( foldlen>0 ) { \
9ad8cac4 1757 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
fab2782b
YO
1758 foldlen -= len; \
1759 uscan += len; \
1760 len=0; \
1761 } else { \
1762 len = 1; \
31f05a37 1763 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
5f560d8a 1764 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1765 foldlen -= skiplen; \
1766 uscan = foldbuf + skiplen; \
1767 } \
1768 break; \
a4525e78 1769 case trie_utf8l: \
780fcc9f 1770 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d 1771 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
14f657d4 1772 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
613abc6d 1773 } \
780fcc9f 1774 /* FALLTHROUGH */ \
fab2782b 1775 case trie_utf8: \
9ad8cac4 1776 uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \
fab2782b
YO
1777 break; \
1778 case trie_plain: \
1779 uvc = (UV)*uc; \
1780 len = 1; \
1781 } \
1782 if (uvc < 256) { \
1783 charid = trie->charmap[ uvc ]; \
1784 } \
1785 else { \
1786 charid = 0; \
1787 if (widecharmap) { \
1788 SV** const svpp = hv_fetch(widecharmap, \
1789 (char*)&uvc, sizeof(UV), 0); \
1790 if (svpp) \
1791 charid = (U16)SvIV(*svpp); \
1792 } \
1793 } \
4cadc6a9
YO
1794} STMT_END
1795
cb41e5d6 1796#define DUMP_EXEC_POS(li,s,doutf8,depth) \
ae7c5b9b 1797 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
cb41e5d6 1798 startpos, doutf8, depth)
ae7c5b9b 1799
56ff0609 1800#define REXEC_FBC_UTF8_SCAN(CODE) \
da10aa09
KW
1801 STMT_START { \
1802 while (s < strend) { \
1803 CODE \
56ff0609
KW
1804 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1805 } \
1806 } STMT_END
1807
1808#define REXEC_FBC_NON_UTF8_SCAN(CODE) \
1809 STMT_START { \
1810 while (s < strend) { \
1811 CODE \
1812 s++; \
1813 } \
1814 } STMT_END
1815
1816#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1817 STMT_START { \
1818 while (s < strend) { \
1819 REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
da10aa09
KW
1820 } \
1821 } STMT_END
4cadc6a9 1822
56ff0609 1823#define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND) \
d990bd30
KW
1824 STMT_START { \
1825 while (s < strend) { \
56ff0609 1826 REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
d990bd30
KW
1827 } \
1828 } STMT_END
4cadc6a9 1829
56ff0609 1830#define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
05bd126c 1831 if (COND) { \
19719145 1832 FBC_CHECK_AND_TRY \
56ff0609 1833 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
21d1ed54 1834 previous_occurrence_end = s; \
05bd126c 1835 } \
21d1ed54 1836 else { \
56ff0609 1837 s += UTF8SKIP(s); \
21d1ed54 1838 }
4cadc6a9 1839
56ff0609
KW
1840#define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
1841 if (COND) { \
1842 FBC_CHECK_AND_TRY \
1843 s++; \
1844 previous_occurrence_end = s; \
e1d1eefb
YO
1845 } \
1846 else { \
56ff0609 1847 s++; \
d981ef24 1848 }
05bd126c 1849
a9448551
KW
1850/* We keep track of where the next character should start after an occurrence
1851 * of the one we're looking for. Knowing that, we can see right away if the
1852 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we
1853 * don't accept the 2nd and succeeding adjacent occurrences */
8c9c2723
KW
1854#define FBC_CHECK_AND_TRY \
1855 if ( ( doevery \
1856 || s != previous_occurrence_end) \
1857 && ( reginfo->intuit \
1858 || (s <= reginfo->strend && regtry(reginfo, &s)))) \
1859 { \
1860 goto got_it; \
19719145
KW
1861 }
1862
a9448551 1863
56ff0609
KW
1864/* These differ from the above macros in that they call a function which
1865 * returns the next occurrence of the thing being looked for in 's'; and
1866 * 'strend' if there is no such occurrence. */
1867#define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f) \
a9448551 1868 while (s < strend) { \
2b1f9c71 1869 s = (f); \
a9448551
KW
1870 if (s >= strend) { \
1871 break; \
1872 } \
1873 \
1874 FBC_CHECK_AND_TRY \
56ff0609
KW
1875 s += UTF8SKIP(s); \
1876 previous_occurrence_end = s; \
1877 }
1878
1879#define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f) \
1880 while (s < strend) { \
1881 s = (f); \
1882 if (s >= strend) { \
1883 break; \
1884 } \
1885 \
1886 FBC_CHECK_AND_TRY \
1887 s++; \
a9448551
KW
1888 previous_occurrence_end = s; \
1889 }
1890
ff8b8b42
KW
1891/* This differs from the above macros in that it is passed a single byte that
1892 * is known to begin the next occurrence of the thing being looked for in 's'.
1893 * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1894 * at that position. */
1895#define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \
1896 while (s < strend) { \
1897 s = (char *) memchr(s, byte, strend -s); \
1898 if (s == NULL) { \
1899 s = (char *) strend; \
1900 break; \
1901 } \
1902 \
04532561
KW
1903 if (COND) { \
1904 FBC_CHECK_AND_TRY \
8c9c2723 1905 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
04532561
KW
1906 previous_occurrence_end = s; \
1907 } \
1908 else { \
1909 s += UTF8SKIP(s); \
1910 } \
ff8b8b42
KW
1911 }
1912
56ff0609 1913/* The four macros below are slightly different versions of the same logic.
05bd126c
KW
1914 *
1915 * The first is for /a and /aa when the target string is UTF-8. This can only
56ff0609
KW
1916 * match ascii, but it must advance based on UTF-8. The other three handle
1917 * the non-UTF-8 and the more generic UTF-8 cases. In all four, we are
1918 * looking for the boundary (or non-boundary) between a word and non-word
1919 * character. The utf8 and non-utf8 cases have the same logic, but the details
1920 * must be different. Find the "wordness" of the character just prior to this
1921 * one, and compare it with the wordness of this one. If they differ, we have
1922 * a boundary. At the beginning of the string, pretend that the previous
05bd126c
KW
1923 * character was a new-line.
1924 *
1925 * All these macros uncleanly have side-effects with each other and outside
1926 * variables. So far it's been too much trouble to clean-up
1927 *
1928 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1929 * a word character or not.
1930 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1931 * word/non-word
1932 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1933 *
1934 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1935 * are looking for a boundary or for a non-boundary. If we are looking for a
1936 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1937 * see if this tentative match actually works, and if so, to quit the loop
1938 * here. And vice-versa if we are looking for a non-boundary.
1939 *
56ff0609
KW
1940 * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and
1941 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
05bd126c
KW
1942 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1943 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1944 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1945 * complement. But in that branch we complement tmp, meaning that at the
1946 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1947 * which means at the top of the loop in the next iteration, it is
1948 * TEST_NON_UTF8(s-1) */
b2f4e957 1949#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1950 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1951 tmp = TEST_NON_UTF8(tmp); \
56ff0609 1952 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
05bd126c
KW
1953 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1954 tmp = !tmp; \
1955 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1956 } \
1957 else { \
1958 IF_FAIL; \
1959 } \
1960 ); \
1961
1962/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1963 * TEST_UTF8 is a macro that for the same input code points returns identically
92c5bde9
KW
1964 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
1965 * end pointer as well) */
236d82fd 1966#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1967 if (s == reginfo->strbeg) { \
1968 tmp = '\n'; \
1969 } \
1970 else { /* Back-up to the start of the previous character */ \
1971 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1972 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
3db24e1e 1973 0, UTF8_ALLOW_DEFAULT); \
05bd126c
KW
1974 } \
1975 tmp = TEST_UV(tmp); \
56ff0609 1976 REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */ \
7a207065 1977 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
05bd126c
KW
1978 tmp = !tmp; \
1979 IF_SUCCESS; \
1980 } \
1981 else { \
1982 IF_FAIL; \
1983 } \
1984 );
cfaf538b 1985
56ff0609
KW
1986/* Like the above two macros, for a UTF-8 target string. UTF8_CODE is the
1987 * complete code for handling UTF-8. Common to the BOUND and NBOUND cases,
1988 * set-up by the FBC_BOUND, etc macros below */
1989#define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1990 UTF8_CODE; \
c8519dc7 1991 /* Here, things have been set up by the previous code so that tmp is the \
56ff0609
KW
1992 * return of TEST_NON_UTF8(s-1). We also have to check if this matches \
1993 * against the EOS, which we treat as a \n */ \
c8519dc7
KW
1994 if (tmp == ! TEST_NON_UTF8('\n')) { \
1995 IF_SUCCESS; \
1996 } \
1997 else { \
1998 IF_FAIL; \
1999 }
63ac0dad 2000
56ff0609
KW
2001/* Same as the macro above, but the target isn't UTF-8 */
2002#define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
2003 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
2004 tmp = TEST_NON_UTF8(tmp); \
2005 REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */ \
2006 if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) { \
2007 IF_SUCCESS; \
2008 tmp = !tmp; \
2009 } \
2010 else { \
2011 IF_FAIL; \
2012 } \
2013 ); \
2014 /* Here, things have been set up by the previous code so that tmp is \
2015 * the return of TEST_NON_UTF8(s-1). We also have to check if this \
2016 * matches against the EOS, which we treat as a \n */ \
2017 if (tmp == ! TEST_NON_UTF8('\n')) { \
2018 IF_SUCCESS; \
2019 } \
2020 else { \
2021 IF_FAIL; \
2022 }
2023
ae7c5b9b 2024/* This is the macro to use when we want to see if something that looks like it
b92b2705
KW
2025 * could match, actually does, and if so exits the loop. It needs to be used
2026 * only for bounds checking macros, as it allows for matching beyond the end of
2027 * string (which should be zero length without having to look at the string
2028 * contents) */
2029#define REXEC_FBC_TRYIT \
8c9c2723 2030 if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \
ae7c5b9b
KW
2031 goto got_it
2032
2033/* The only difference between the BOUND and NBOUND cases is that
2034 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2035 * NBOUND. This is accomplished by passing it as either the if or else clause,
2036 * with the other one being empty (PLACEHOLDER is defined as empty).
2037 *
2038 * The TEST_FOO parameters are for operating on different forms of input, but
2039 * all should be ones that return identically for the same underlying code
2040 * points */
56ff0609
KW
2041
2042#define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2043 FBC_BOUND_COMMON_UTF8( \
2044 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
ae7c5b9b
KW
2045 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2046
56ff0609
KW
2047#define FBC_BOUND_NON_UTF8(TEST_NON_UTF8) \
2048 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
ae7c5b9b 2049
56ff0609
KW
2050#define FBC_BOUND_A_UTF8(TEST_NON_UTF8) \
2051 FBC_BOUND_COMMON_UTF8( \
2052 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\
2053 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
ae7c5b9b 2054
56ff0609
KW
2055#define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8) \
2056 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2057
2058#define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2059 FBC_BOUND_COMMON_UTF8( \
2060 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2061 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2062
2063#define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8) \
2064 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2065
2066#define FBC_NBOUND_A_UTF8(TEST_NON_UTF8) \
2067 FBC_BOUND_COMMON_UTF8( \
2068 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
ae7c5b9b
KW
2069 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2070
56ff0609
KW
2071#define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8) \
2072 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2073
8bde5eaf
JH
2074#ifdef DEBUGGING
2075static IV
2076S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
9ffbcbcf 2077 IV cp_out = _invlist_search(invlist, cp_in);
8bde5eaf
JH
2078 assert(cp_out >= 0);
2079 return cp_out;
2080}
2081# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2082 invmap[S_get_break_val_cp_checked(invlist, cp)]
2083#else
2084# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2085 invmap[_invlist_search(invlist, cp)]
2086#endif
2087
64935bc6
KW
2088/* Takes a pointer to an inversion list, a pointer to its corresponding
2089 * inversion map, and a code point, and returns the code point's value
2090 * according to the two arrays. It assumes that all code points have a value.
2091 * This is used as the base macro for macros for particular properties */
2092#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
8bde5eaf 2093 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
64935bc6
KW
2094
2095/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2096 * of a code point, returning the value for the first code point in the string.
2097 * And it takes the particular macro name that finds the desired value given a
2098 * code point. Merely convert the UTF-8 to code point and call the cp macro */
2099#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
2100 (__ASSERT_(pos < strend) \
2101 /* Note assumes is valid UTF-8 */ \
2102 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2103
2104/* Returns the GCB value for the input code point */
2105#define getGCB_VAL_CP(cp) \
2106 _generic_GET_BREAK_VAL_CP( \
2107 PL_GCB_invlist, \
02f811dd 2108 _Perl_GCB_invmap, \
64935bc6
KW
2109 (cp))
2110
2111/* Returns the GCB value for the first code point in the UTF-8 encoded string
2112 * bounded by pos and strend */
2113#define getGCB_VAL_UTF8(pos, strend) \
2114 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
05bd126c 2115
6b659339
KW
2116/* Returns the LB value for the input code point */
2117#define getLB_VAL_CP(cp) \
2118 _generic_GET_BREAK_VAL_CP( \
2119 PL_LB_invlist, \
2120 _Perl_LB_invmap, \
2121 (cp))
2122
2123/* Returns the LB value for the first code point in the UTF-8 encoded string
2124 * bounded by pos and strend */
2125#define getLB_VAL_UTF8(pos, strend) \
2126 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2127
06ae2722
KW
2128
2129/* Returns the SB value for the input code point */
2130#define getSB_VAL_CP(cp) \
2131 _generic_GET_BREAK_VAL_CP( \
2132 PL_SB_invlist, \
bf4268fa 2133 _Perl_SB_invmap, \
06ae2722
KW
2134 (cp))
2135
2136/* Returns the SB value for the first code point in the UTF-8 encoded string
2137 * bounded by pos and strend */
2138#define getSB_VAL_UTF8(pos, strend) \
2139 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2140
ae3bb8ea
KW
2141/* Returns the WB value for the input code point */
2142#define getWB_VAL_CP(cp) \
2143 _generic_GET_BREAK_VAL_CP( \
2144 PL_WB_invlist, \
bf4268fa 2145 _Perl_WB_invmap, \
ae3bb8ea
KW
2146 (cp))
2147
2148/* Returns the WB value for the first code point in the UTF-8 encoded string
2149 * bounded by pos and strend */
2150#define getWB_VAL_UTF8(pos, strend) \
2151 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2152
786e8c11 2153/* We know what class REx starts with. Try to find this position... */
02d5137b 2154/* if reginfo->intuit, its a dryrun */
786e8c11
YO
2155/* annoyingly all the vars in this routine have different names from their counterparts
2156 in regmatch. /grrr */
3c3eec57 2157STATIC char *
07be1b83 2158S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 2159 const char *strend, regmatch_info *reginfo)
a687059c 2160{
a9448551
KW
2161
2162 /* TRUE if x+ need not match at just the 1st pos of run of x's */
73104a1b 2163 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
a9448551 2164
73104a1b
KW
2165 char *pat_string; /* The pattern's exactish string */
2166 char *pat_end; /* ptr to end char of pat_string */
2167 re_fold_t folder; /* Function for computing non-utf8 folds */
2168 const U8 *fold_array; /* array for folding ords < 256 */
2169 STRLEN ln;
2170 STRLEN lnc;
73104a1b
KW
2171 U8 c1;
2172 U8 c2;
743dd5b8 2173 char *e = NULL;
21d1ed54
KW
2174
2175 /* In some cases we accept only the first occurence of 'x' in a sequence of
2176 * them. This variable points to just beyond the end of the previous
2177 * occurrence of 'x', hence we can tell if we are in a sequence. (Having
2178 * it point to beyond the 'x' allows us to work for UTF-8 without having to
2179 * hop back.) */
2180 char * previous_occurrence_end = 0;
2181
3b6c52ce 2182 I32 tmp; /* Scratch variable */
ba44c216 2183 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 2184 UV utf8_fold_flags = 0;
f9176b44 2185 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
2186 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
2187 with a result inverts that result, as 0^1 =
2188 1 and 1^1 = 0 */
2189 _char_class_number classnum;
2190
73104a1b 2191 RXi_GET_DECL(prog,progi);
2f7f8cb1 2192
73104a1b 2193 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 2194
56ff0609
KW
2195 /* We know what class it must start with. The case statements below have
2196 * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb'
2197 * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern
2198 * ('p8' and 'pb'. */
dd8dc88c 2199 switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
56ff0609
KW
2200
2201 case ANYOFPOSIXL_t8_pb:
2202 case ANYOFPOSIXL_t8_p8:
2203 case ANYOFL_t8_pb:
2204 case ANYOFL_t8_p8:
780fcc9f 2205 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
b272adb4 2206 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
a0bd1a30 2207
780fcc9f 2208 /* FALLTHROUGH */
56ff0609
KW
2209
2210 case ANYOFD_t8_pb:
2211 case ANYOFD_t8_p8:
2212 case ANYOF_t8_pb:
2213 case ANYOF_t8_p8:
2214 REXEC_FBC_UTF8_CLASS_SCAN(
2215 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2216 break;
2217
2218 case ANYOFPOSIXL_tb_pb:
2219 case ANYOFPOSIXL_tb_p8:
2220 case ANYOFL_tb_pb:
2221 case ANYOFL_tb_p8:
2222 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
b272adb4 2223 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
56ff0609
KW
2224
2225 /* FALLTHROUGH */
2226
2227 case ANYOFD_tb_pb:
2228 case ANYOFD_tb_p8:
2229 case ANYOF_tb_pb:
2230 case ANYOF_tb_p8:
2231 if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
daced5ae
KW
2232 /* We know that s is in the bitmap range since the target isn't
2233 * UTF-8, so what happens for out-of-range values is not relevant,
2234 * so exclude that from the flags */
56ff0609
KW
2235 REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
2236 0));
1451f692 2237 }
73104a1b 2238 else {
56ff0609 2239 REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
73104a1b
KW
2240 }
2241 break;
73104a1b 2242
56ff0609
KW
2243 case ANYOFM_tb_pb: /* ARG() is the base byte; FLAGS() the mask byte */
2244 case ANYOFM_tb_p8:
2245 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2246 (char *) find_next_masked((U8 *) s, (U8 *) strend,
2247 (U8) ARG(c), FLAGS(c)));
2813d4ad
KW
2248 break;
2249
56ff0609
KW
2250 case ANYOFM_t8_pb:
2251 case ANYOFM_t8_p8:
2252 /* UTF-8ness doesn't matter because only matches UTF-8 invariants. But
2253 * we do anyway for performance reasons, as otherwise we would have to
2254 * examine all the continuation characters */
2255 REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2256 (char *) find_next_masked((U8 *) s, (U8 *) strend,
2257 (U8) ARG(c), FLAGS(c)));
3db0bccc
KW
2258 break;
2259
56ff0609
KW
2260 case NANYOFM_tb_pb:
2261 case NANYOFM_tb_p8:
2262 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2263 (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2264 (U8) ARG(c), FLAGS(c)));
2265 break;
2266
2267 case NANYOFM_t8_pb:
2268 case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8
2269 variants. */
2270 REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2271 (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2272 (U8) ARG(c), FLAGS(c)));
2273 break;
2274
2275 /* These nodes all require at least one code point to be in UTF-8 to
2276 * match */
2277 case ANYOFH_tb_pb:
2278 case ANYOFH_tb_p8:
2279 case ANYOFHb_tb_pb:
2280 case ANYOFHb_tb_p8:
2281 case ANYOFHr_tb_pb:
2282 case ANYOFHr_tb_p8:
2283 case ANYOFHs_tb_pb:
2284 case ANYOFHs_tb_p8:
2285 case EXACTFLU8_tb_pb:
2286 case EXACTFLU8_tb_p8:
2287 case EXACTFU_REQ8_tb_pb:
2288 case EXACTFU_REQ8_tb_p8:
6966a05b
KW
2289 break;
2290
56ff0609
KW
2291 case ANYOFH_t8_pb:
2292 case ANYOFH_t8_p8:
2293 REXEC_FBC_UTF8_CLASS_SCAN(
2294 ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2295 && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)));
2296 break;
6966a05b 2297
56ff0609
KW
2298 case ANYOFHb_t8_pb:
2299 case ANYOFHb_t8_p8:
2300 {
2301 /* We know what the first byte of any matched string should be. */
ff8b8b42
KW
2302 U8 first_byte = FLAGS(c);
2303
6966a05b 2304 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
56ff0609 2305 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
ff8b8b42 2306 }
c316b824
KW
2307 break;
2308
56ff0609
KW
2309 case ANYOFHr_t8_pb:
2310 case ANYOFHr_t8_p8:
2311 REXEC_FBC_UTF8_CLASS_SCAN(
2312 ( inRANGE(NATIVE_UTF8_TO_I8(*s),
2313 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2314 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2315 && reginclass(prog, c, (U8*)s, (U8*) strend,
2316 1 /* is utf8 */)));
3146c00a
KW
2317 break;
2318
56ff0609
KW
2319 case ANYOFHs_t8_pb:
2320 case ANYOFHs_t8_p8:
2321 REXEC_FBC_UTF8_CLASS_SCAN(
2322 ( strend -s >= FLAGS(c)
2323 && memEQ(s, ((struct regnode_anyofhs *) c)->string, FLAGS(c))
2324 && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)));
34924db0
KW
2325 break;
2326
56ff0609
KW
2327 case ANYOFR_tb_pb:
2328 case ANYOFR_tb_p8:
2329 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2330 ANYOFRbase(c), ANYOFRdelta(c)));
2331 break;
2332
2333 case ANYOFR_t8_pb:
2334 case ANYOFR_t8_p8:
2335 REXEC_FBC_UTF8_CLASS_SCAN(
2336 ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2337 && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2338 (U8 *) strend,
2339 NULL),
2340 ANYOFRbase(c), ANYOFRdelta(c))));
13fcf652
KW
2341 break;
2342
56ff0609
KW
2343 case ANYOFRb_tb_pb:
2344 case ANYOFRb_tb_p8:
2345 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2346 ANYOFRbase(c), ANYOFRdelta(c)));
2347 break;
2d5613be 2348
56ff0609
KW
2349 case ANYOFRb_t8_pb:
2350 case ANYOFRb_t8_p8:
2351 { /* We know what the first byte of any matched string should be */
2d5613be
KW
2352 U8 first_byte = FLAGS(c);
2353
2354 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
56ff0609
KW
2355 withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2356 (U8 *) strend,
2357 NULL),
2358 ANYOFRbase(c), ANYOFRdelta(c)));
2d5613be
KW
2359 }
2360 break;
2361
56ff0609 2362 case EXACTFAA_tb_pb:
aa419ff3
KW
2363
2364 /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2365 * which these functions don't handle anyway */
2366 fold_array = PL_fold_latin1;
2367 folder = foldEQ_latin1_s2_folded;
2368 goto do_exactf_non_utf8;
77a6d856 2369
56ff0609 2370 case EXACTF_tb_pb:
73104a1b
KW
2371 fold_array = PL_fold;
2372 folder = foldEQ;
2373 goto do_exactf_non_utf8;
2374
56ff0609 2375 case EXACTFL_tb_pb:
780fcc9f 2376 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
56ff0609
KW
2377
2378 if (IN_UTF8_CTYPE_LOCALE) {
cea315b6 2379 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
2380 goto do_exactf_utf8;
2381 }
56ff0609 2382
73104a1b
KW
2383 fold_array = PL_fold_locale;
2384 folder = foldEQ_locale;
2385 goto do_exactf_non_utf8;
3c760661 2386
56ff0609
KW
2387 case EXACTFU_tb_pb:
2388 /* Any 'ss' in the pattern should have been replaced by regcomp, so we
2389 * don't have to worry here about this single special case in the
2390 * Latin1 range */
73104a1b 2391 fold_array = PL_fold_latin1;
0fbec7cf 2392 folder = foldEQ_latin1_s2_folded;
73104a1b 2393
924ba076 2394 /* FALLTHROUGH */
73104a1b 2395
56ff0609
KW
2396 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2397 are no glitches with fold-length differences
2398 between the target string and pattern */
73104a1b 2399
56ff0609
KW
2400 /* The idea in the non-utf8 EXACTF* cases is to first find the first
2401 * character of the EXACTF* node and then, if necessary,
73104a1b
KW
2402 * case-insensitively compare the full text of the node. c1 is the
2403 * first character. c2 is its fold. This logic will not work for
56ff0609
KW
2404 * Unicode semantics and the german sharp ss, which hence should not be
2405 * compiled into a node that gets here. */
ae06e581
KW
2406 pat_string = STRINGs(c);
2407 ln = STR_LENs(c); /* length to match in octets/bytes */
73104a1b 2408
56ff0609
KW
2409 /* We know that we have to match at least 'ln' bytes (which is the same
2410 * as characters, since not utf8). If we have to match 3 characters,
2411 * and there are only 2 availabe, we know without trying that it will
2412 * fail; so don't start a match past the required minimum number from
2413 * the far end */
ea3daa5d 2414 e = HOP3c(strend, -((SSize_t)ln), s);
dda01918
HS
2415 if (e < s)
2416 break;
fac1af77 2417
73104a1b
KW
2418 c1 = *pat_string;
2419 c2 = fold_array[c1];
2420 if (c1 == c2) { /* If char and fold are the same */
c05cc3b6
KW
2421 while (s <= e) {
2422 s = (char *) memchr(s, c1, e + 1 - s);
2423 if (s == NULL) {
2424 break;
2425 }
2426
2427 /* Check that the rest of the node matches */
2428 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2429 && (reginfo->intuit || regtry(reginfo, &s)) )
2430 {
2431 goto got_it;
2432 }
2433 s++;
2434 }
73104a1b
KW
2435 }
2436 else {
c05cc3b6
KW
2437 U8 bits_differing = c1 ^ c2;
2438
2439 /* If the folds differ in one bit position only, we can mask to
2440 * match either of them, and can use this faster find method. Both
2441 * ASCII and EBCDIC tend to have their case folds differ in only
2442 * one position, so this is very likely */
2443 if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2444 bits_differing = ~ bits_differing;
2445 while (s <= e) {
2b1f9c71 2446 s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
c05cc3b6
KW
2447 (c1 & bits_differing), bits_differing);
2448 if (s > e) {
2449 break;
2450 }
2451
2452 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2453 && (reginfo->intuit || regtry(reginfo, &s)) )
2454 {
2455 goto got_it;
2456 }
2457 s++;
2458 }
2459 }
2460 else { /* Otherwise, stuck with looking byte-at-a-time. This
2461 should actually happen only in EXACTFL nodes */
2462 while (s <= e) {
2463 if ( (*(U8*)s == c1 || *(U8*)s == c2)
2464 && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2465 && (reginfo->intuit || regtry(reginfo, &s)) )
2466 {
2467 goto got_it;
2468 }
2469 s++;
2470 }
2471 }
73104a1b
KW
2472 }
2473 break;
fac1af77 2474
56ff0609
KW
2475 case EXACTFAA_tb_p8:
2476 case EXACTFAA_t8_p8:
2477 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2478 |FOLDEQ_S2_ALREADY_FOLDED
2479 |FOLDEQ_S2_FOLDS_SANE;
2480 goto do_exactf_utf8;
73104a1b 2481
56ff0609
KW
2482 case EXACTFAA_NO_TRIE_tb_pb:
2483 case EXACTFAA_NO_TRIE_t8_pb:
2484 case EXACTFAA_t8_pb:
236d82fd 2485
56ff0609
KW
2486 /* Here, and elsewhere in this file, the reason we can't consider a
2487 * non-UTF-8 pattern already folded in the presence of a UTF-8 target
2488 * is because any MICRO SIGN in the pattern won't be folded. Since the
2489 * fold of the MICRO SIGN requires UTF-8 to represent, we can consider
2490 * a non-UTF-8 pattern folded when matching a non-UTF-8 target */
2491 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2492 goto do_exactf_utf8;
2493
2494 case EXACTFL_tb_p8:
2495 case EXACTFL_t8_pb:
2496 case EXACTFL_t8_p8:
780fcc9f 2497 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
56ff0609
KW
2498 utf8_fold_flags = FOLDEQ_LOCALE;
2499 goto do_exactf_utf8;
64935bc6 2500
56ff0609
KW
2501 case EXACTFLU8_t8_pb:
2502 case EXACTFLU8_t8_p8:
2503 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2504 | FOLDEQ_S2_FOLDS_SANE;
2505 goto do_exactf_utf8;
64935bc6 2506
56ff0609
KW
2507 case EXACTFU_REQ8_t8_p8:
2508 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2509 goto do_exactf_utf8;
2510
2511 case EXACTFU_tb_p8:
2512 case EXACTFU_t8_pb:
2513 case EXACTFU_t8_p8:
2514 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2515 goto do_exactf_utf8;
2516
2517 /* The following are problematic even though pattern isn't UTF-8. Use
2518 * full functionality normally not done except for UTF-8. */
2519 case EXACTF_t8_pb:
2520 case EXACTFUP_tb_pb:
2521 case EXACTFUP_t8_pb:
2522
2523 do_exactf_utf8:
2524 {
2525 unsigned expansion;
2526
2527 /* If one of the operands is in utf8, we can't use the simpler
2528 * folding above, due to the fact that many different characters
2529 * can have the same fold, or portion of a fold, or different-
2530 * length fold */
2531 pat_string = STRINGs(c);
2532 ln = STR_LENs(c); /* length to match in octets/bytes */
2533 pat_end = pat_string + ln;
2534 lnc = is_utf8_pat /* length to match in characters */
2535 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2536 : ln;
2537
2538 /* We have 'lnc' characters to match in the pattern, but because of
2539 * multi-character folding, each character in the target can match
2540 * up to 3 characters (Unicode guarantees it will never exceed
2541 * this) if it is utf8-encoded; and up to 2 if not (based on the
2542 * fact that the Latin 1 folds are already determined, and the only
2543 * multi-char fold in that range is the sharp-s folding to 'ss'.
2544 * Thus, a pattern character can match as little as 1/3 of a string
2545 * character. Adjust lnc accordingly, rounding up, so that if we
2546 * need to match at least 4+1/3 chars, that really is 5. */
2547 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2548 lnc = (lnc + expansion - 1) / expansion;
2549
2550 /* As in the non-UTF8 case, if we have to match 3 characters, and
2551 * only 2 are left, it's guaranteed to fail, so don't start a match
2552 * that would require us to go beyond the end of the string */
2553 e = HOP3c(strend, -((SSize_t)lnc), s);
2554
2555 /* XXX Note that we could recalculate e to stop the loop earlier,
2556 * as the worst case expansion above will rarely be met, and as we
2557 * go along we would usually find that e moves further to the left.
2558 * This would happen only after we reached the point in the loop
2559 * where if there were no expansion we should fail. Unclear if
2560 * worth the expense */
2561
2562 while (s <= e) {
2563 char *my_strend= (char *)strend;
2564 if ( foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
2565 pat_string, NULL, ln, is_utf8_pat,
2566 utf8_fold_flags)
2567 && (reginfo->intuit || regtry(reginfo, &s)) )
2568 {
2569 goto got_it;
2570 }
2571 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
89ad707a 2572 }
64935bc6 2573 }
73104a1b 2574 break;
64935bc6 2575
56ff0609
KW
2576 case BOUNDA_tb_pb:
2577 case BOUNDA_tb_p8:
2578 case BOUND_tb_pb: /* /d without utf8 target is /a */
2579 case BOUND_tb_p8:
2580 /* regcomp.c makes sure that these only have the traditional \b
2581 * meaning. */
64935bc6
KW
2582 assert(FLAGS(c) == TRADITIONAL_BOUND);
2583
56ff0609 2584 FBC_BOUND_A_NON_UTF8(isWORDCHAR_A);
73104a1b 2585 break;
64935bc6 2586
56ff0609
KW
2587 case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2588 case BOUNDA_t8_p8:
2589 /* regcomp.c makes sure that these only have the traditional \b
2590 * meaning. */
64935bc6
KW
2591 assert(FLAGS(c) == TRADITIONAL_BOUND);
2592
56ff0609 2593 FBC_BOUND_A_UTF8(isWORDCHAR_A);
73104a1b 2594 break;
64935bc6 2595
56ff0609
KW
2596 case NBOUNDA_tb_pb:
2597 case NBOUNDA_tb_p8:
2598 case NBOUND_tb_pb: /* /d without utf8 target is /a */
2599 case NBOUND_tb_p8:
2600 /* regcomp.c makes sure that these only have the traditional \b
2601 * meaning. */
64935bc6
KW
2602 assert(FLAGS(c) == TRADITIONAL_BOUND);
2603
56ff0609 2604 FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A);
73104a1b 2605 break;
64935bc6 2606
56ff0609
KW
2607 case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2608 case NBOUNDA_t8_p8:
2609 /* regcomp.c makes sure that these only have the traditional \b
2610 * meaning. */
64935bc6
KW
2611 assert(FLAGS(c) == TRADITIONAL_BOUND);
2612
56ff0609 2613 FBC_NBOUND_A_UTF8(isWORDCHAR_A);
73104a1b 2614 break;
64935bc6 2615
56ff0609
KW
2616 case NBOUNDU_tb_pb:
2617 case NBOUNDU_tb_p8:
64935bc6 2618 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
56ff0609
KW
2619 FBC_NBOUND_NON_UTF8(isWORDCHAR_L1);
2620 break;
2621 }
2622
2623 to_complement = 1;
2624 goto do_boundu_non_utf8;
2625
2626 case NBOUNDL_tb_pb:
2627 case NBOUNDL_tb_p8:
2628 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2629 if (FLAGS(c) == TRADITIONAL_BOUND) {
2630 FBC_NBOUND_NON_UTF8(isWORDCHAR_LC);
64935bc6
KW
2631 break;
2632 }
2633
526e4b9d 2634 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
64935bc6
KW
2635
2636 to_complement = 1;
56ff0609
KW
2637 goto do_boundu_non_utf8;
2638
2639 case BOUNDL_tb_pb:
2640 case BOUNDL_tb_p8:
2641 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2642 if (FLAGS(c) == TRADITIONAL_BOUND) {
2643 FBC_BOUND_NON_UTF8(isWORDCHAR_LC);
2644 break;
2645 }
64935bc6 2646
526e4b9d 2647 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
56ff0609
KW
2648
2649 goto do_boundu_non_utf8;
2650
2651 case BOUNDU_tb_pb:
2652 case BOUNDU_tb_p8:
6ebdcce0 2653 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
56ff0609 2654 FBC_BOUND_NON_UTF8(isWORDCHAR_L1);
6ebdcce0
KW
2655 break;
2656 }
2657
56ff0609 2658 do_boundu_non_utf8:
1a651719
KW
2659 if (s == reginfo->strbeg) {
2660 if (reginfo->intuit || regtry(reginfo, &s))
2661 {
2662 goto got_it;
2663 }
a7a8bd1e 2664
1a651719 2665 /* Didn't match. Try at the next position (if there is one) */
56ff0609 2666 s++;
1a651719 2667 if (UNLIKELY(s >= reginfo->strend)) {
6ebdcce0 2668 break;
1a651719
KW
2669 }
2670 }
64935bc6 2671
1a651719
KW
2672 switch((bound_type) FLAGS(c)) {
2673 case TRADITIONAL_BOUND: /* Should have already been handled */
2674 assert(0);
2675 break;
2676
2677 case GCB_BOUND:
56ff0609
KW
2678 /* Not utf8. Everything is a GCB except between CR and LF */
2679 while (s < strend) {
2680 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2681 || UCHARAT(s) != '\n'))
2682 && (reginfo->intuit || regtry(reginfo, &s)))
2683 {
2684 goto got_it;
2685 }
2686 s++;
2687 }
2688
2689 break;
2690
2691 case LB_BOUND:
2692 {
2693 LB_enum before = getLB_VAL_CP((U8) *(s -1));
1a651719 2694 while (s < strend) {
56ff0609
KW
2695 LB_enum after = getLB_VAL_CP((U8) *s);
2696 if (to_complement ^ isLB(before,
2697 after,
2698 (U8*) reginfo->strbeg,
2699 (U8*) s,
2700 (U8*) reginfo->strend,
2701 0 /* target not utf8 */ )
1a651719
KW
2702 && (reginfo->intuit || regtry(reginfo, &s)))
2703 {
2704 goto got_it;
64935bc6 2705 }
1a651719 2706 before = after;
56ff0609 2707 s++;
64935bc6 2708 }
1a651719 2709 }
56ff0609
KW
2710
2711 break;
2712
2713 case SB_BOUND:
2714 {
2715 SB_enum before = getSB_VAL_CP((U8) *(s -1));
1a651719 2716 while (s < strend) {
56ff0609
KW
2717 SB_enum after = getSB_VAL_CP((U8) *s);
2718 if ((to_complement ^ isSB(before,
2719 after,
2720 (U8*) reginfo->strbeg,
2721 (U8*) s,
2722 (U8*) reginfo->strend,
2723 0 /* target not utf8 */ ))
1a651719
KW
2724 && (reginfo->intuit || regtry(reginfo, &s)))
2725 {
2726 goto got_it;
64935bc6 2727 }
56ff0609 2728 before = after;
1a651719 2729 s++;
64935bc6 2730 }
1a651719 2731 }
64935bc6 2732
1a651719 2733 break;
ae3bb8ea 2734
56ff0609
KW
2735 case WB_BOUND:
2736 {
2737 WB_enum previous = WB_UNKNOWN;
2738 WB_enum before = getWB_VAL_CP((U8) *(s -1));
1a651719 2739 while (s < strend) {
56ff0609
KW
2740 WB_enum after = getWB_VAL_CP((U8) *s);
2741 if ((to_complement ^ isWB(previous,
2742 before,
2743 after,
2744 (U8*) reginfo->strbeg,
2745 (U8*) s,
2746 (U8*) reginfo->strend,
2747 0 /* target not utf8 */ ))
2748 && (reginfo->intuit || regtry(reginfo, &s)))
2749 {
2750 goto got_it;
2751 }
2752 previous = before;
2753 before = after;
2754 s++;
2755 }
2756 }
2757 }
2758
2759 /* Here are at the final position in the target string, which is a
2760 * boundary by definition, so matches, depending on other constraints.
2761 * */
2762 if ( reginfo->intuit
2763 || (s <= reginfo->strend && regtry(reginfo, &s)))
2764 {
2765 goto got_it;
2766 }
2767
2768 break;
2769
2770 case BOUNDL_t8_pb:
2771 case BOUNDL_t8_p8:
2772 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2773 if (FLAGS(c) == TRADITIONAL_BOUND) {
2774 FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2775 isWORDCHAR_LC_utf8_safe);
2776 break;
2777 }
2778
526e4b9d 2779 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
56ff0609
KW
2780
2781 to_complement = 1;
2782 goto do_boundu_utf8;
2783
2784 case NBOUNDL_t8_pb:
2785 case NBOUNDL_t8_p8:
2786 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2787 if (FLAGS(c) == TRADITIONAL_BOUND) {
2788 FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2789 isWORDCHAR_LC_utf8_safe);
2790 break;
2791 }
2792
526e4b9d 2793 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
56ff0609
KW
2794
2795 to_complement = 1;
2796 goto do_boundu_utf8;
2797
2798 case NBOUND_t8_pb:
2799 case NBOUND_t8_p8:
2800 /* regcomp.c makes sure that these only have the traditional \b
2801 * meaning. */
2802 assert(FLAGS(c) == TRADITIONAL_BOUND);
2803
2804 /* FALLTHROUGH */
2805
2806 case NBOUNDU_t8_pb:
2807 case NBOUNDU_t8_p8:
2808 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2809 FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni,
2810 isWORDCHAR_utf8_safe);
2811 break;
2812 }
2813
2814 to_complement = 1;
2815 goto do_boundu_utf8;
2816
2817 case BOUND_t8_pb:
2818 case BOUND_t8_p8:
2819 /* regcomp.c makes sure that these only have the traditional \b
2820 * meaning. */
2821 assert(FLAGS(c) == TRADITIONAL_BOUND);
2822
2823 /* FALLTHROUGH */
2824
2825 case BOUNDU_t8_pb:
2826 case BOUNDU_t8_p8:
2827 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2828 FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2829 break;
2830 }
2831
2832 do_boundu_utf8:
2833 if (s == reginfo->strbeg) {
2834 if (reginfo->intuit || regtry(reginfo, &s))
2835 {
2836 goto got_it;
2837 }
2838
2839 /* Didn't match. Try at the next position (if there is one) */
2840 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2841 if (UNLIKELY(s >= reginfo->strend)) {
2842 break;
2843 }
2844 }
2845
2846 switch((bound_type) FLAGS(c)) {
2847 case TRADITIONAL_BOUND: /* Should have already been handled */
2848 assert(0);
2849 break;
2850
2851 case GCB_BOUND:
2852 {
2853 GCB_enum before = getGCB_VAL_UTF8(
2854 reghop3((U8*)s, -1,
2855 (U8*)(reginfo->strbeg)),
2856 (U8*) reginfo->strend);
2857 while (s < strend) {
2858 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2859 (U8*) reginfo->strend);
2860 if ( (to_complement ^ isGCB(before,
2861 after,
2862 (U8*) reginfo->strbeg,
2863 (U8*) s,
2864 1 /* target is utf8 */ ))
1a651719
KW
2865 && (reginfo->intuit || regtry(reginfo, &s)))
2866 {
2867 goto got_it;
6b659339 2868 }
1a651719
KW
2869 before = after;
2870 s += UTF8_SAFE_SKIP(s, reginfo->strend);
6b659339 2871 }
1a651719 2872 }
56ff0609
KW
2873 break;
2874
2875 case LB_BOUND:
2876 {
2877 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2878 -1,
2879 (U8*)(reginfo->strbeg)),
2880 (U8*) reginfo->strend);
1a651719 2881 while (s < strend) {
56ff0609
KW
2882 LB_enum after = getLB_VAL_UTF8((U8*) s,
2883 (U8*) reginfo->strend);
1a651719
KW
2884 if (to_complement ^ isLB(before,
2885 after,
2886 (U8*) reginfo->strbeg,
2887 (U8*) s,
2888 (U8*) reginfo->strend,
56ff0609 2889 1 /* target is utf8 */ )
1a651719
KW
2890 && (reginfo->intuit || regtry(reginfo, &s)))
2891 {
2892 goto got_it;
6b659339 2893 }
1a651719 2894 before = after;
56ff0609 2895 s += UTF8_SAFE_SKIP(s, reginfo->strend);
6b659339 2896 }
1a651719 2897 }
6b659339 2898
1a651719 2899 break;
6b659339 2900
1a651719 2901 case SB_BOUND:
56ff0609 2902 {
1a651719
KW
2903 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2904 -1,
2905 (U8*)(reginfo->strbeg)),
2906 (U8*) reginfo->strend);
2907 while (s < strend) {
2908 SB_enum after = getSB_VAL_UTF8((U8*) s,
2909 (U8*) reginfo->strend);
2910 if ((to_complement ^ isSB(before,
2911 after,
2912 (U8*) reginfo->strbeg,
2913 (U8*) s,
2914 (U8*) reginfo->strend,
56ff0609 2915 1 /* target is utf8 */ ))
1a651719
KW
2916 && (reginfo->intuit || regtry(reginfo, &s)))
2917 {
2918 goto got_it;
06ae2722 2919 }
1a651719
KW
2920 before = after;
2921 s += UTF8_SAFE_SKIP(s, reginfo->strend);
06ae2722 2922 }
1a651719 2923 }
06ae2722 2924
1a651719 2925 break;
06ae2722 2926
1a651719 2927 case WB_BOUND:
56ff0609 2928 {
1a651719
KW
2929 /* We are at a boundary between char_sub_0 and char_sub_1.
2930 * We also keep track of the value for char_sub_-1 as we
2931 * loop through the line. Context may be needed to make a
2932 * determination, and if so, this can save having to
2933 * recalculate it */
2934 WB_enum previous = WB_UNKNOWN;
2935 WB_enum before = getWB_VAL_UTF8(
2936 reghop3((U8*)s,
2937 -1,
2938 (U8*)(reginfo->strbeg)),
2939 (U8*) reginfo->strend);
2940 while (s < strend) {
2941 WB_enum after = getWB_VAL_UTF8((U8*) s,
2942 (U8*) reginfo->strend);
2943 if ((to_complement ^ isWB(previous,
2944 before,
2945 after,
2946 (U8*) reginfo->strbeg,
2947 (U8*) s,
2948 (U8*) reginfo->strend,
56ff0609 2949 1 /* target is utf8 */ ))
1a651719
KW
2950 && (reginfo->intuit || regtry(reginfo, &s)))
2951 {
2952 goto got_it;
ae3bb8ea 2953 }
1a651719
KW
2954 previous = before;
2955 before = after;
2956 s += UTF8_SAFE_SKIP(s, reginfo->strend);
ae3bb8ea 2957 }
1a651719 2958 }
6ebdcce0
KW
2959 }
2960
2961 /* Here are at the final position in the target string, which is a
2962 * boundary by definition, so matches, depending on other constraints.
2963 * */
ae3bb8ea 2964
56ff0609
KW
2965 if ( reginfo->intuit
2966 || (s <= reginfo->strend && regtry(reginfo, &s)))
2967 {
2968 goto got_it;
2969 }
73104a1b 2970 break;
64935bc6 2971
56ff0609
KW
2972 case LNBREAK_t8_pb:
2973 case LNBREAK_t8_p8:
2974 REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend));
2975 break;
2976
2977 case LNBREAK_tb_pb:
2978 case LNBREAK_tb_p8:
2979 REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend));
73104a1b 2980 break;
3018b823 2981
56ff0609
KW
2982 /* The argument to all the POSIX node types is the class number to pass
2983 * to _generic_isCC() to build a mask for searching in PL_charclass[] */
3018b823 2984
56ff0609
KW
2985 case NPOSIXL_t8_pb:
2986 case NPOSIXL_t8_p8:
3018b823
KW
2987 to_complement = 1;
2988 /* FALLTHROUGH */
2989
56ff0609
KW
2990 case POSIXL_t8_pb:
2991 case POSIXL_t8_p8:
780fcc9f 2992 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
56ff0609
KW
2993 REXEC_FBC_UTF8_CLASS_SCAN(
2994 to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s,
2995 (U8 *) strend)));
73104a1b 2996 break;
3018b823 2997
56ff0609
KW
2998 case NPOSIXL_tb_pb:
2999 case NPOSIXL_tb_p8:
3018b823
KW
3000 to_complement = 1;
3001 /* FALLTHROUGH */
3002
56ff0609
KW
3003 case POSIXL_tb_pb:
3004 case POSIXL_tb_p8:
3005 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3006 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3007 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
3008 break;
3018b823 3009
56ff0609
KW
3010 case NPOSIXA_t8_pb:
3011 case NPOSIXA_t8_p8:
3012 /* The complement of something that matches only ASCII matches all
3013 * non-ASCII, plus everything in ASCII that isn't in the class. */
3014 REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend)
3015 || ! _generic_isCC_A(*s, FLAGS(c)));
3016 break;
3018b823 3017
56ff0609
KW
3018 case POSIXA_t8_pb:
3019 case POSIXA_t8_p8:
73104a1b 3020 /* Don't need to worry about utf8, as it can match only a single
4a6c6db5
KW
3021 * byte invariant character. But we do anyway for performance reasons,
3022 * as otherwise we would have to examine all the continuation
3023 * characters */
56ff0609
KW
3024 REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c)));
3025 break;
4a6c6db5 3026
56ff0609
KW
3027 case NPOSIXD_tb_pb:
3028 case NPOSIXD_tb_p8:
3029 case NPOSIXA_tb_pb:
3030 case NPOSIXA_tb_p8:
3031 to_complement = 1;
3032 /* FALLTHROUGH */
3033
3034 case POSIXD_tb_pb:
3035 case POSIXD_tb_p8:
3036 case POSIXA_tb_pb:
3037 case POSIXA_tb_p8:
3038 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3018b823 3039 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 3040 break;
3018b823 3041
56ff0609
KW
3042 case NPOSIXU_tb_pb:
3043 case NPOSIXU_tb_p8:
3018b823
KW
3044 to_complement = 1;
3045 /* FALLTHROUGH */
3046
56ff0609
KW
3047 case POSIXU_tb_pb:
3048 case POSIXU_tb_p8:
3049 REXEC_FBC_NON_UTF8_CLASS_SCAN(
da10aa09 3050 to_complement ^ cBOOL(_generic_isCC(*s,
3018b823 3051 FLAGS(c))));
56ff0609
KW
3052 break;
3053
3054 case NPOSIXD_t8_pb:
3055 case NPOSIXD_t8_p8:
3056 case NPOSIXU_t8_pb:
3057 case NPOSIXU_t8_p8:
3058 to_complement = 1;
3059 /* FALLTHROUGH */
3018b823 3060
56ff0609
KW
3061 case POSIXD_t8_pb:
3062 case POSIXD_t8_p8:
3063 case POSIXU_t8_pb:
3064 case POSIXU_t8_p8:
3065 classnum = (_char_class_number) FLAGS(c);
3066 switch (classnum) {
3067 default:
3068 REXEC_FBC_UTF8_CLASS_SCAN(
8d692afb 3069 to_complement ^ cBOOL(_invlist_contains_cp(
56ff0609
KW
3070 PL_XPosix_ptrs[classnum],
3071 utf8_to_uvchr_buf((U8 *) s,
8d692afb
KW
3072 (U8 *) strend,
3073 NULL))));
56ff0609
KW
3074 break;
3075
3076 case _CC_ENUM_SPACE:
3077 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 3078 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
56ff0609 3079 break;
3018b823 3080
56ff0609
KW
3081 case _CC_ENUM_BLANK:
3082 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 3083 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
56ff0609 3084 break;
3018b823 3085
56ff0609
KW
3086 case _CC_ENUM_XDIGIT:
3087 REXEC_FBC_UTF8_CLASS_SCAN(
3088 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3089 break;
3018b823 3090
56ff0609
KW
3091 case _CC_ENUM_VERTSPACE:
3092 REXEC_FBC_UTF8_CLASS_SCAN(
3093 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3094 break;
3018b823 3095
56ff0609
KW
3096 case _CC_ENUM_CNTRL:
3097 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 3098 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
56ff0609 3099 break;
3018b823
KW
3100 }
3101 break;
3102
56ff0609
KW
3103 case AHOCORASICKC_tb_pb:
3104 case AHOCORASICKC_tb_p8:
3105 case AHOCORASICKC_t8_pb:
3106 case AHOCORASICKC_t8_p8:
3107 case AHOCORASICK_tb_pb:
3108 case AHOCORASICK_tb_p8:
3109 case AHOCORASICK_t8_pb:
3110 case AHOCORASICK_t8_p8:
73104a1b
KW
3111 {
3112 DECL_TRIE_TYPE(c);
3113 /* what trie are we using right now */
3114 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
56ff0609 3115 reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie];
73104a1b
KW
3116 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
3117
3118 const char *last_start = strend - trie->minlen;
6148ee25 3119#ifdef DEBUGGING
73104a1b 3120 const char *real_start = s;
6148ee25 3121#endif
73104a1b
KW
3122 STRLEN maxlen = trie->maxlen;
3123 SV *sv_points;
3124 U8 **points; /* map of where we were in the input string
3125 when reading a given char. For ASCII this
3126 is unnecessary overhead as the relationship
3127 is always 1:1, but for Unicode, especially
3128 case folded Unicode this is not true. */
3129 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3130 U8 *bitmap=NULL;
3131
3132
271b36b1 3133 DECLARE_AND_GET_RE_DEBUG_FLAGS;
73104a1b
KW
3134
3135 /* We can't just allocate points here. We need to wrap it in
3136 * an SV so it gets freed properly if there is a croak while
3137 * running the match */
3138 ENTER;
3139 SAVETMPS;
3140 sv_points=newSV(maxlen * sizeof(U8 *));
3141 SvCUR_set(sv_points,
3142 maxlen * sizeof(U8 *));
3143 SvPOK_on(sv_points);
3144 sv_2mortal(sv_points);
3145 points=(U8**)SvPV_nolen(sv_points );
3146 if ( trie_type != trie_utf8_fold
3147 && (trie->bitmap || OP(c)==AHOCORASICKC) )
3148 {
3149 if (trie->bitmap)
3150 bitmap=(U8*)trie->bitmap;
3151 else
3152 bitmap=(U8*)ANYOF_BITMAP(c);
3153 }
3154 /* this is the Aho-Corasick algorithm modified a touch
3155 to include special handling for long "unknown char" sequences.
3156 The basic idea being that we use AC as long as we are dealing
3157 with a possible matching char, when we encounter an unknown char
3158 (and we have not encountered an accepting state) we scan forward
3159 until we find a legal starting char.
3160 AC matching is basically that of trie matching, except that when
3161 we encounter a failing transition, we fall back to the current
3162 states "fail state", and try the current char again, a process
3163 we repeat until we reach the root state, state 1, or a legal
3164 transition. If we fail on the root state then we can either
3165 terminate if we have reached an accepting state previously, or
3166 restart the entire process from the beginning if we have not.
3167
3168 */
3169 while (s <= last_start) {
3170 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3171 U8 *uc = (U8*)s;
3172 U16 charid = 0;
3173 U32 base = 1;
3174 U32 state = 1;
3175 UV uvc = 0;
3176 STRLEN len = 0;
3177 STRLEN foldlen = 0;
3178 U8 *uscan = (U8*)NULL;
3179 U8 *leftmost = NULL;
3180#ifdef DEBUGGING
3181 U32 accepted_word= 0;
786e8c11 3182#endif
73104a1b
KW
3183 U32 pointpos = 0;
3184
3185 while ( state && uc <= (U8*)strend ) {
3186 int failed=0;
3187 U32 word = aho->states[ state ].wordnum;
3188
3189 if( state==1 ) {
3190 if ( bitmap ) {
3191 DEBUG_TRIE_EXECUTE_r(
56ff0609
KW
3192 if ( uc <= (U8*)last_start
3193 && !BITMAP_TEST(bitmap,*uc) )
3194 {
3195 dump_exec_pos( (char *)uc, c, strend,
3196 real_start,
cb41e5d6 3197 (char *)uc, utf8_target, 0 );
6ad9a8ab 3198 Perl_re_printf( aTHX_
73104a1b
KW
3199 " Scanning for legal start char...\n");
3200 }
3201 );
3202 if (utf8_target) {
56ff0609
KW
3203 while ( uc <= (U8*)last_start
3204 && !BITMAP_TEST(bitmap,*uc) )
3205 {
73104a1b
KW
3206 uc += UTF8SKIP(uc);
3207 }
3208 } else {
56ff0609
KW
3209 while ( uc <= (U8*)last_start
3210 && ! BITMAP_TEST(bitmap,*uc) )
3211 {
73104a1b
KW
3212 uc++;
3213 }
786e8c11 3214 }
73104a1b 3215 s= (char *)uc;
07be1b83 3216 }
73104a1b
KW
3217 if (uc >(U8*)last_start) break;
3218 }
3219
3220 if ( word ) {
56ff0609
KW
3221 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len)
3222 % maxlen ];
73104a1b
KW
3223 if (!leftmost || lpos < leftmost) {
3224 DEBUG_r(accepted_word=word);
3225 leftmost= lpos;
7016d6eb 3226 }
73104a1b 3227 if (base==0) break;
7016d6eb 3228
73104a1b
KW
3229 }
3230 points[pointpos++ % maxlen]= uc;
3231 if (foldlen || uc < (U8*)strend) {
9ad8cac4
KW
3232 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3233 (U8 *) strend, uscan, len, uvc,
3234 charid, foldlen, foldbuf,
3235 uniflags);
73104a1b
KW
3236 DEBUG_TRIE_EXECUTE_r({
3237 dump_exec_pos( (char *)uc, c, strend,
cb41e5d6 3238 real_start, s, utf8_target, 0);
6ad9a8ab 3239 Perl_re_printf( aTHX_
147e3846 3240 " Charid:%3u CP:%4" UVxf " ",
73104a1b
KW
3241 charid, uvc);
3242 });
3243 }
3244 else {
3245 len = 0;
3246 charid = 0;
3247 }
07be1b83 3248
73104a1b
KW
3249
3250 do {
6148ee25 3251#ifdef DEBUGGING
73104a1b 3252 word = aho->states[ state ].wordnum;
6148ee25 3253#endif
73104a1b
KW
3254 base = aho->states[ state ].trans.base;
3255
3256 DEBUG_TRIE_EXECUTE_r({
3257 if (failed)
56ff0609 3258 dump_exec_pos((char *)uc, c, strend, real_start,
cb41e5d6 3259 s, utf8_target, 0 );
6ad9a8ab 3260 Perl_re_printf( aTHX_
147e3846 3261 "%sState: %4" UVxf ", word=%" UVxf,
73104a1b
KW
3262 failed ? " Fail transition to " : "",
3263 (UV)state, (UV)word);
3264 });
3265 if ( base ) {
3266 U32 tmp;
3267 I32 offset;
3268 if (charid &&
3269 ( ((offset = base + charid
3270 - 1 - trie->uniquecharcount)) >= 0)
3271 && ((U32)offset < trie->lasttrans)
3272 && trie->trans[offset].check == state
3273 && (tmp=trie->trans[offset].next))
3274 {
3275 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3276 Perl_re_printf( aTHX_ " - legal\n"));
73104a1b
KW
3277 state = tmp;
3278 break;
07be1b83
YO
3279 }
3280 else {
786e8c11 3281 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3282 Perl_re_printf( aTHX_ " - fail\n"));
786e8c11 3283 failed = 1;
73104a1b 3284 state = aho->fail[state];
07be1b83 3285 }
07be1b83 3286 }
73104a1b
KW
3287 else {
3288 /* we must be accepting here */
3289 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3290 Perl_re_printf( aTHX_ " - accepting\n"));
73104a1b
KW
3291 failed = 1;
3292 break;
786e8c11 3293 }
73104a1b
KW
3294 } while(state);
3295 uc += len;
3296 if (failed) {
3297 if (leftmost)
3298 break;
3299 if (!state) state = 1;
07be1b83 3300 }
73104a1b
KW
3301 }
3302 if ( aho->states[ state ].wordnum ) {
56ff0609
KW
3303 U8 *lpos = points[ (pointpos
3304 - trie->wordinfo[aho->states[ state ]
3305 .wordnum].len) % maxlen ];
73104a1b
KW
3306 if (!leftmost || lpos < leftmost) {
3307 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3308 leftmost = lpos;
07be1b83
YO
3309 }
3310 }
73104a1b
KW
3311 if (leftmost) {
3312 s = (char*)leftmost;
3313 DEBUG_TRIE_EXECUTE_r({
56ff0609
KW
3314 Perl_re_printf( aTHX_ "Matches word #%" UVxf
3315 " at position %" IVdf ". Trying full"
3316 " pattern...\n",
73104a1b
KW
3317 (UV)accepted_word, (IV)(s - real_start)
3318 );
3319 });
02d5137b 3320 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
3321 FREETMPS;
3322 LEAVE;
3323 goto got_it;
3324 }
8c9c2723
KW
3325 if (s < reginfo->strend) {
3326 s = HOPc(s,1);
3327 }
73104a1b 3328 DEBUG_TRIE_EXECUTE_r({
56ff0609
KW
3329 Perl_re_printf( aTHX_
3330 "Pattern failed. Looking for new start"
3331 " point...\n");
73104a1b
KW
3332 });
3333 } else {
3334 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3335 Perl_re_printf( aTHX_ "No match.\n"));
73104a1b
KW
3336 break;
3337 }
3338 }
3339 FREETMPS;
3340 LEAVE;
3341 }
3342 break;
56ff0609
KW
3343
3344 case EXACTFU_REQ8_t8_pb:
3345 case EXACTFUP_tb_p8:
3346 case EXACTFUP_t8_p8:
3347 case EXACTF_tb_p8:
3348 case EXACTF_t8_p8: /* This node only generated for non-utf8 patterns */
3349 case EXACTFAA_NO_TRIE_tb_p8:
3350 case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8
3351 patterns */
3352 assert(0);
3353
3354 default:
73104a1b 3355 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
56ff0609
KW
3356 } /* End of switch on node type */
3357
73104a1b 3358 return 0;
56ff0609 3359
73104a1b
KW
3360 got_it:
3361 return s;
6eb5f6b9
JH
3362}
3363
60165aa4
DM
3364/* set RX_SAVED_COPY, RX_SUBBEG etc.
3365 * flags have same meanings as with regexec_flags() */
3366
749f4950
DM
3367static void
3368S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
3369 char *strbeg,
3370 char *strend,
3371 SV *sv,
3372 U32 flags,
3373 bool utf8_target)
3374{
3375 struct regexp *const prog = ReANY(rx);
3376
60165aa4
DM
3377 if (flags & REXEC_COPY_STR) {
3378#ifdef PERL_ANY_COW
3379 if (SvCANCOW(sv)) {
eb8fc9fe 3380 DEBUG_C(Perl_re_printf( aTHX_
60165aa4 3381 "Copy on write: regexp capture, type %d\n",
eb8fc9fe 3382 (int) SvTYPE(sv)));
5411a0e5
DM
3383 /* Create a new COW SV to share the match string and store
3384 * in saved_copy, unless the current COW SV in saved_copy
3385 * is valid and suitable for our purpose */
3386 if (( prog->saved_copy
3387 && SvIsCOW(prog->saved_copy)
3388 && SvPOKp(prog->saved_copy)
3389 && SvIsCOW(sv)
3390 && SvPOKp(sv)
3391 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 3392 {
5411a0e5
DM
3393 /* just reuse saved_copy SV */
3394 if (RXp_MATCH_COPIED(prog)) {
3395 Safefree(prog->subbeg);
3396 RXp_MATCH_COPIED_off(prog);
3397 }
3398 }
3399 else {
3400 /* create new COW SV to share string */
196a02af 3401 RXp_MATCH_COPY_FREE(prog);
a76b0e90 3402 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 3403 }
5411a0e5
DM
3404 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3405 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
3406 prog->sublen = strend - strbeg;
3407 prog->suboffset = 0;
3408 prog->subcoffset = 0;
3409 } else
3410#endif
3411 {
99a90e59
FC
3412 SSize_t min = 0;
3413 SSize_t max = strend - strbeg;
ea3daa5d 3414 SSize_t sublen;
60165aa4
DM
3415
3416 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 3417 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
3418 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3419 ) { /* don't copy $' part of string */
3420 U32 n = 0;
3421 max = -1;
3422 /* calculate the right-most part of the string covered
f67a5002 3423 * by a capture. Due to lookahead, this may be to
60165aa4
DM
3424 * the right of $&, so we have to scan all captures */
3425 while (n <= prog->lastparen) {
3426 if (prog->offs[n].end > max)
3427 max = prog->offs[n].end;
3428 n++;
3429 }
3430 if (max == -1)
3431 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3432 ? prog->offs[0].start
3433 : 0;
3434 assert(max >= 0 && max <= strend - strbeg);
3435 }
3436
3437 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 3438 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
3439 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3440 ) { /* don't copy $` part of string */
3441 U32 n = 0;
3442 min = max;
3443 /* calculate the left-most part of the string covered
f67a5002 3444 * by a capture. Due to lookbehind, this may be to
60165aa4
DM
3445 * the left of $&, so we have to scan all captures */
3446 while (min && n <= prog->lastparen) {
3447 if ( prog->offs[n].start != -1
3448 && prog->offs[n].start < min)
3449 {
3450 min = prog->offs[n].start;
3451 }
3452 n++;
3453 }
3454 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3455 && min > prog->offs[0].end
3456 )
3457 min = prog->offs[0].end;
3458
3459 }
3460
3461 assert(min >= 0 && min <= max && min <= strend - strbeg);
3462 sublen = max - min;
3463
196a02af 3464 if (RXp_MATCH_COPIED(prog)) {
60165aa4
DM
3465 if (sublen > prog->sublen)
3466 prog->subbeg =
3467 (char*)saferealloc(prog->subbeg, sublen+1);
3468 }
3469 else
3470 prog->subbeg = (char*)safemalloc(sublen+1);
3471 Copy(strbeg + min, prog->subbeg, sublen, char);
3472 prog->subbeg[sublen] = '\0';
3473 prog->suboffset = min;
3474 prog->sublen = sublen;
196a02af 3475 RXp_MATCH_COPIED_on(prog);
60165aa4
DM
3476 }
3477 prog->subcoffset = prog->suboffset;
3478 if (prog->suboffset && utf8_target) {
3479 /* Convert byte offset to chars.
3480 * XXX ideally should only compute this if @-/@+
3481 * has been seen, a la PL_sawampersand ??? */
3482
3483 /* If there's a direct correspondence between the
3484 * string which we're matching and the original SV,
3485 * then we can use the utf8 len cache associated with
3486 * the SV. In particular, it means that under //g,
3487 * sv_pos_b2u() will use the previously cached
3488 * position to speed up working out the new length of
3489 * subcoffset, rather than counting from the start of
3490 * the string each time. This stops
3491 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3492 * from going quadratic */
3493 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
3494 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3495 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
3496 else
3497 prog->subcoffset = utf8_length((U8*)strbeg,
3498 (U8*)(strbeg+prog->suboffset));
3499 }
3500 }
3501 else {
196a02af 3502 RXp_MATCH_COPY_FREE(prog);
60165aa4
DM
3503 prog->subbeg = strbeg;
3504 prog->suboffset = 0;
3505 prog->subcoffset = 0;
3506 prog->sublen = strend - strbeg;
3507 }
3508}
3509
3510
3511
fae667d5 3512
6eb5f6b9
JH
3513/*
3514 - regexec_flags - match a regexp against a string
3515 */
3516I32
5aaab254 3517Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 3518 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
3519/* stringarg: the point in the string at which to begin matching */
3520/* strend: pointer to null at end of string */
3521/* strbeg: real beginning of string */
3522/* minend: end of match must be >= minend bytes after stringarg. */
3523/* sv: SV being matched: only used for utf8 flag, pos() etc; string
3524 * itself is accessed via the pointers above */
3525/* data: May be used for some additional optimizations.
d058ec57 3526 Currently unused. */
a340edde 3527/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 3528
6eb5f6b9 3529{
8d919b0a 3530 struct regexp *const prog = ReANY(rx);
5aaab254 3531 char *s;
eb578fdb 3532 regnode *c;
03c83e26 3533 char *startpos;
ea3daa5d
FC
3534 SSize_t minlen; /* must match at least this many chars */
3535 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 3536 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 3537 I32 multiline;
f8fc2ecf</