This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
locale.c: Rmv duplicate strlen()
[perl5.git] / regcomp_invlist.c
CommitLineData
85900e28
YO
1#ifdef PERL_EXT_RE_BUILD
2#include "re_top.h"
3#endif
4
5#include "EXTERN.h"
6#define PERL_IN_REGEX_ENGINE
7#define PERL_IN_REGCOMP_ANY
8#define PERL_IN_REGCOMP_INVLIST_C
9#include "perl.h"
10
11#ifdef PERL_IN_XSUB_RE
12# include "re_comp.h"
13#else
14# include "regcomp.h"
15#endif
16
17#include "invlist_inline.h"
18#include "unicode_constants.h"
19#include "regcomp_internal.h"
20
ba6e2c38 21#ifdef PERL_RE_BUILD_AUX
85900e28
YO
22void
23Perl_populate_bitmap_from_invlist(pTHX_ SV * invlist, const UV offset, const U8 * bitmap, const Size_t len)
24{
25 PERL_ARGS_ASSERT_POPULATE_BITMAP_FROM_INVLIST;
26
27 /* As the name says. The zeroth bit corresponds to the code point given by
28 * 'offset' */
29
30 UV start, end;
31
32 Zero(bitmap, len, U8);
33
34 invlist_iterinit(invlist);
35 while (invlist_iternext(invlist, &start, &end)) {
36 assert(start >= offset);
37
38 for (UV i = start; i <= end; i++) {
39 UV adjusted = i - offset;
40
41 BITMAP_BYTE(bitmap, adjusted) |= BITMAP_BIT(adjusted);
42 }
43 }
44 invlist_iterfinish(invlist);
45}
46
47void
48Perl_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap, const Size_t bitmap_len, SV ** invlist, const UV offset)
49{
50 PERL_ARGS_ASSERT_POPULATE_INVLIST_FROM_BITMAP;
51
52 /* As the name says. The zeroth bit corresponds to the code point given by
53 * 'offset' */
54
55 Size_t i;
56
57 for (i = 0; i < bitmap_len; i++) {
58 if (BITMAP_TEST(bitmap, i)) {
59 int start = i++;
60
61 /* Save a little work by adding a range all at once instead of bit
62 * by bit */
63 while (i < bitmap_len && BITMAP_TEST(bitmap, i)) {
64 i++;
65 }
66
67 *invlist = _add_range_to_invlist(*invlist,
68 start + offset,
69 i + offset - 1);
70 }
71 }
72}
ba6e2c38 73#endif /* PERL_RE_BUILD_AUX */
85900e28
YO
74
75/* This section of code defines the inversion list object and its methods. The
76 * interfaces are highly subject to change, so as much as possible is static to
77 * this file. An inversion list is here implemented as a malloc'd C UV array
78 * as an SVt_INVLIST scalar.
79 *
80 * An inversion list for Unicode is an array of code points, sorted by ordinal
81 * number. Each element gives the code point that begins a range that extends
82 * up-to but not including the code point given by the next element. The final
83 * element gives the first code point of a range that extends to the platform's
84 * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
85 * ...) give ranges whose code points are all in the inversion list. We say
86 * that those ranges are in the set. The odd-numbered elements give ranges
87 * whose code points are not in the inversion list, and hence not in the set.
88 * Thus, element [0] is the first code point in the list. Element [1]
89 * is the first code point beyond that not in the list; and element [2] is the
90 * first code point beyond that that is in the list. In other words, the first
91 * range is invlist[0]..(invlist[1]-1), and all code points in that range are
92 * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
93 * all code points in that range are not in the inversion list. The third
94 * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
95 * list, and so forth. Thus every element whose index is divisible by two
96 * gives the beginning of a range that is in the list, and every element whose
97 * index is not divisible by two gives the beginning of a range not in the
98 * list. If the final element's index is divisible by two, the inversion list
99 * extends to the platform's infinity; otherwise the highest code point in the
100 * inversion list is the contents of that element minus 1.
101 *
102 * A range that contains just a single code point N will look like
103 * invlist[i] == N
104 * invlist[i+1] == N+1
105 *
106 * If N is UV_MAX (the highest representable code point on the machine), N+1 is
107 * impossible to represent, so element [i+1] is omitted. The single element
108 * inversion list
109 * invlist[0] == UV_MAX
110 * contains just UV_MAX, but is interpreted as matching to infinity.
111 *
112 * Taking the complement (inverting) an inversion list is quite simple, if the
113 * first element is 0, remove it; otherwise add a 0 element at the beginning.
114 * This implementation reserves an element at the beginning of each inversion
115 * list to always contain 0; there is an additional flag in the header which
116 * indicates if the list begins at the 0, or is offset to begin at the next
117 * element. This means that the inversion list can be inverted without any
118 * copying; just flip the flag.
119 *
120 * More about inversion lists can be found in "Unicode Demystified"
121 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
122 *
123 * The inversion list data structure is currently implemented as an SV pointing
124 * to an array of UVs that the SV thinks are bytes. This allows us to have an
125 * array of UV whose memory management is automatically handled by the existing
126 * facilities for SV's.
127 *
128 * Some of the methods should always be private to the implementation, and some
129 * should eventually be made public */
130
131/* The header definitions are in F<invlist_inline.h> */
132
133#ifndef PERL_IN_XSUB_RE
134
135PERL_STATIC_INLINE UV*
136S__invlist_array_init(SV* const invlist, const bool will_have_0)
137{
138 /* Returns a pointer to the first element in the inversion list's array.
139 * This is called upon initialization of an inversion list. Where the
140 * array begins depends on whether the list has the code point U+0000 in it
141 * or not. The other parameter tells it whether the code that follows this
142 * call is about to put a 0 in the inversion list or not. The first
143 * element is either the element reserved for 0, if TRUE, or the element
144 * after it, if FALSE */
145
146 bool* offset = get_invlist_offset_addr(invlist);
147 UV* zero_addr = (UV *) SvPVX(invlist);
148
149 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
150
151 /* Must be empty */
152 assert(! _invlist_len(invlist));
153
154 *zero_addr = 0;
155
156 /* 1^1 = 0; 1^0 = 1 */
157 *offset = 1 ^ will_have_0;
158 return zero_addr + *offset;
159}
160
161STATIC void
162S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
163{
164 /* Replaces the inversion list in 'dest' with the one from 'src'. It
165 * steals the list from 'src', so 'src' is made to have a NULL list. This
166 * is similar to what SvSetMagicSV() would do, if it were implemented on
167 * inversion lists, though this routine avoids a copy */
168
169 const UV src_len = _invlist_len(src);
170 const bool src_offset = *get_invlist_offset_addr(src);
171 const STRLEN src_byte_len = SvLEN(src);
172 char * array = SvPVX(src);
173
174#ifndef NO_TAINT_SUPPORT
175 const int oldtainted = TAINT_get;
176#endif
177
178 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
179
180 assert(is_invlist(src));
181 assert(is_invlist(dest));
182 assert(! invlist_is_iterating(src));
183 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
184
185 /* Make sure it ends in the right place with a NUL, as our inversion list
186 * manipulations aren't careful to keep this true, but sv_usepvn_flags()
187 * asserts it */
188 array[src_byte_len - 1] = '\0';
189
190 TAINT_NOT; /* Otherwise it breaks */
191 sv_usepvn_flags(dest,
192 (char *) array,
193 src_byte_len - 1,
194
195 /* This flag is documented to cause a copy to be avoided */
196 SV_HAS_TRAILING_NUL);
197 TAINT_set(oldtainted);
198 SvPV_set(src, 0);
199 SvLEN_set(src, 0);
200 SvCUR_set(src, 0);
201
202 /* Finish up copying over the other fields in an inversion list */
203 *get_invlist_offset_addr(dest) = src_offset;
204 invlist_set_len(dest, src_len, src_offset);
205 *get_invlist_previous_index_addr(dest) = 0;
206 invlist_iterfinish(dest);
207}
208
209PERL_STATIC_INLINE IV*
210S_get_invlist_previous_index_addr(SV* invlist)
211{
212 /* Return the address of the IV that is reserved to hold the cached index
213 * */
214 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
215
216 assert(is_invlist(invlist));
217
218 return &(((XINVLIST*) SvANY(invlist))->prev_index);
219}
220
221PERL_STATIC_INLINE IV
222S_invlist_previous_index(SV* const invlist)
223{
224 /* Returns cached index of previous search */
225
226 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
227
228 return *get_invlist_previous_index_addr(invlist);
229}
230
231PERL_STATIC_INLINE void
232S_invlist_set_previous_index(SV* const invlist, const IV index)
233{
234 /* Caches <index> for later retrieval */
235
236 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
237
238 assert(index == 0 || index < (int) _invlist_len(invlist));
239
240 *get_invlist_previous_index_addr(invlist) = index;
241}
242
243PERL_STATIC_INLINE void
244S_invlist_trim(SV* invlist)
245{
246 /* Free the not currently-being-used space in an inversion list */
247
248 /* But don't free up the space needed for the 0 UV that is always at the
249 * beginning of the list, nor the trailing NUL */
250 const UV min_size = TO_INTERNAL_SIZE(1) + 1;
251
252 PERL_ARGS_ASSERT_INVLIST_TRIM;
253
254 assert(is_invlist(invlist));
255
256 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
257}
258
259PERL_STATIC_INLINE void
260S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
261{
262 PERL_ARGS_ASSERT_INVLIST_CLEAR;
263
264 assert(is_invlist(invlist));
265
266 invlist_set_len(invlist, 0, 0);
267 invlist_trim(invlist);
268}
269
270PERL_STATIC_INLINE UV
271S_invlist_max(const SV* const invlist)
272{
273 /* Returns the maximum number of elements storable in the inversion list's
274 * array, without having to realloc() */
275
276 PERL_ARGS_ASSERT_INVLIST_MAX;
277
278 assert(is_invlist(invlist));
279
280 /* Assumes worst case, in which the 0 element is not counted in the
281 * inversion list, so subtracts 1 for that */
282 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
283 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
284 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
285}
286
287STATIC void
288S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
289{
290 PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
291
292 /* First 1 is in case the zero element isn't in the list; second 1 is for
293 * trailing NUL */
294 SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
295 invlist_set_len(invlist, 0, 0);
296
297 /* Force iterinit() to be used to get iteration to work */
298 invlist_iterfinish(invlist);
299
300 *get_invlist_previous_index_addr(invlist) = 0;
301 SvPOK_on(invlist); /* This allows B to extract the PV */
302}
303
304SV*
305Perl__new_invlist(pTHX_ IV initial_size)
306{
307
308 /* Return a pointer to a newly constructed inversion list, with enough
309 * space to store 'initial_size' elements. If that number is negative, a
310 * system default is used instead */
311
312 SV* new_list;
313
314 if (initial_size < 0) {
315 initial_size = 10;
316 }
317
318 new_list = newSV_type(SVt_INVLIST);
319 initialize_invlist_guts(new_list, initial_size);
320
321 return new_list;
322}
323
324SV*
325Perl__new_invlist_C_array(pTHX_ const UV* const list)
326{
327 /* Return a pointer to a newly constructed inversion list, initialized to
328 * point to <list>, which has to be in the exact correct inversion list
329 * form, including internal fields. Thus this is a dangerous routine that
330 * should not be used in the wrong hands. The passed in 'list' contains
331 * several header fields at the beginning that are not part of the
332 * inversion list body proper */
333
334 const STRLEN length = (STRLEN) list[0];
335 const UV version_id = list[1];
336 const bool offset = cBOOL(list[2]);
337#define HEADER_LENGTH 3
338 /* If any of the above changes in any way, you must change HEADER_LENGTH
339 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
340 * perl -E 'say int(rand 2**31-1)'
341 */
342#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
343 data structure type, so that one being
344 passed in can be validated to be an
345 inversion list of the correct vintage.
346 */
347
348 SV* invlist = newSV_type(SVt_INVLIST);
349
350 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
351
352 if (version_id != INVLIST_VERSION_ID) {
353 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
354 }
355
356 /* The generated array passed in includes header elements that aren't part
357 * of the list proper, so start it just after them */
358 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
359
360 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
361 shouldn't touch it */
362
363 *(get_invlist_offset_addr(invlist)) = offset;
364
365 /* The 'length' passed to us is the physical number of elements in the
366 * inversion list. But if there is an offset the logical number is one
367 * less than that */
368 invlist_set_len(invlist, length - offset, offset);
369
370 invlist_set_previous_index(invlist, 0);
371
372 /* Initialize the iteration pointer. */
373 invlist_iterfinish(invlist);
374
375 SvREADONLY_on(invlist);
376 SvPOK_on(invlist);
377
378 return invlist;
379}
380
381STATIC void
382S__append_range_to_invlist(pTHX_ SV* const invlist,
383 const UV start, const UV end)
384{
385 /* Subject to change or removal. Append the range from 'start' to 'end' at
386 * the end of the inversion list. The range must be above any existing
387 * ones. */
388
389 UV* array;
390 UV max = invlist_max(invlist);
391 UV len = _invlist_len(invlist);
392 bool offset;
393
394 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
395
396 if (len == 0) { /* Empty lists must be initialized */
397 offset = start != 0;
398 array = _invlist_array_init(invlist, ! offset);
399 }
400 else {
401 /* Here, the existing list is non-empty. The current max entry in the
402 * list is generally the first value not in the set, except when the
403 * set extends to the end of permissible values, in which case it is
404 * the first entry in that final set, and so this call is an attempt to
405 * append out-of-order */
406
407 UV final_element = len - 1;
408 array = invlist_array(invlist);
409 if ( array[final_element] > start
410 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
411 {
412 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
413 array[final_element], start,
414 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
415 }
416
417 /* Here, it is a legal append. If the new range begins 1 above the end
418 * of the range below it, it is extending the range below it, so the
419 * new first value not in the set is one greater than the newly
420 * extended range. */
421 offset = *get_invlist_offset_addr(invlist);
422 if (array[final_element] == start) {
423 if (end != UV_MAX) {
424 array[final_element] = end + 1;
425 }
426 else {
427 /* But if the end is the maximum representable on the machine,
428 * assume that infinity was actually what was meant. Just let
429 * the range that this would extend to have no end */
430 invlist_set_len(invlist, len - 1, offset);
431 }
432 return;
433 }
434 }
435
436 /* Here the new range doesn't extend any existing set. Add it */
437
438 len += 2; /* Includes an element each for the start and end of range */
439
440 /* If wll overflow the existing space, extend, which may cause the array to
441 * be moved */
442 if (max < len) {
443 invlist_extend(invlist, len);
444
445 /* Have to set len here to avoid assert failure in invlist_array() */
446 invlist_set_len(invlist, len, offset);
447
448 array = invlist_array(invlist);
449 }
450 else {
451 invlist_set_len(invlist, len, offset);
452 }
453
454 /* The next item on the list starts the range, the one after that is
455 * one past the new range. */
456 array[len - 2] = start;
457 if (end != UV_MAX) {
458 array[len - 1] = end + 1;
459 }
460 else {
461 /* But if the end is the maximum representable on the machine, just let
462 * the range have no end */
463 invlist_set_len(invlist, len - 1, offset);
464 }
465}
466
467SSize_t
468Perl__invlist_search(SV* const invlist, const UV cp)
469{
470 /* Searches the inversion list for the entry that contains the input code
471 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
472 * return value is the index into the list's array of the range that
473 * contains <cp>, that is, 'i' such that
474 * array[i] <= cp < array[i+1]
475 */
476
477 IV low = 0;
478 IV mid;
479 IV high = _invlist_len(invlist);
480 const IV highest_element = high - 1;
481 const UV* array;
482
483 PERL_ARGS_ASSERT__INVLIST_SEARCH;
484
485 /* If list is empty, return failure. */
486 if (UNLIKELY(high == 0)) {
487 return -1;
488 }
489
490 /* (We can't get the array unless we know the list is non-empty) */
491 array = invlist_array(invlist);
492
493 mid = invlist_previous_index(invlist);
494 assert(mid >=0);
495 if (UNLIKELY(mid > highest_element)) {
496 mid = highest_element;
497 }
498
499 /* <mid> contains the cache of the result of the previous call to this
500 * function (0 the first time). See if this call is for the same result,
501 * or if it is for mid-1. This is under the theory that calls to this
502 * function will often be for related code points that are near each other.
503 * And benchmarks show that caching gives better results. We also test
504 * here if the code point is within the bounds of the list. These tests
505 * replace others that would have had to be made anyway to make sure that
506 * the array bounds were not exceeded, and these give us extra information
507 * at the same time */
508 if (cp >= array[mid]) {
509 if (cp >= array[highest_element]) {
510 return highest_element;
511 }
512
513 /* Here, array[mid] <= cp < array[highest_element]. This means that
514 * the final element is not the answer, so can exclude it; it also
515 * means that <mid> is not the final element, so can refer to 'mid + 1'
516 * safely */
517 if (cp < array[mid + 1]) {
518 return mid;
519 }
520 high--;
521 low = mid + 1;
522 }
523 else { /* cp < aray[mid] */
524 if (cp < array[0]) { /* Fail if outside the array */
525 return -1;
526 }
527 high = mid;
528 if (cp >= array[mid - 1]) {
529 goto found_entry;
530 }
531 }
532
533 /* Binary search. What we are looking for is <i> such that
534 * array[i] <= cp < array[i+1]
535 * The loop below converges on the i+1. Note that there may not be an
536 * (i+1)th element in the array, and things work nonetheless */
537 while (low < high) {
538 mid = (low + high) / 2;
539 assert(mid <= highest_element);
540 if (array[mid] <= cp) { /* cp >= array[mid] */
541 low = mid + 1;
542
543 /* We could do this extra test to exit the loop early.
544 if (cp < array[low]) {
545 return mid;
546 }
547 */
548 }
549 else { /* cp < array[mid] */
550 high = mid;
551 }
552 }
553
554 found_entry:
555 high--;
556 invlist_set_previous_index(invlist, high);
557 return high;
558}
559
560void
561Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
562 const bool complement_b, SV** output)
563{
564 /* Take the union of two inversion lists and point '*output' to it. On
565 * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
566 * even 'a' or 'b'). If to an inversion list, the contents of the original
567 * list will be replaced by the union. The first list, 'a', may be
568 * NULL, in which case a copy of the second list is placed in '*output'.
569 * If 'complement_b' is TRUE, the union is taken of the complement
570 * (inversion) of 'b' instead of b itself.
571 *
572 * The basis for this comes from "Unicode Demystified" Chapter 13 by
573 * Richard Gillam, published by Addison-Wesley, and explained at some
574 * length there. The preface says to incorporate its examples into your
575 * code at your own risk.
576 *
577 * The algorithm is like a merge sort. */
578
579 const UV* array_a; /* a's array */
580 const UV* array_b;
581 UV len_a; /* length of a's array */
582 UV len_b;
583
584 SV* u; /* the resulting union */
585 UV* array_u;
586 UV len_u = 0;
587
588 UV i_a = 0; /* current index into a's array */
589 UV i_b = 0;
590 UV i_u = 0;
591
592 /* running count, as explained in the algorithm source book; items are
593 * stopped accumulating and are output when the count changes to/from 0.
594 * The count is incremented when we start a range that's in an input's set,
595 * and decremented when we start a range that's not in a set. So this
596 * variable can be 0, 1, or 2. When it is 0 neither input is in their set,
597 * and hence nothing goes into the union; 1, just one of the inputs is in
598 * its set (and its current range gets added to the union); and 2 when both
599 * inputs are in their sets. */
600 UV count = 0;
601
602 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
603 assert(a != b);
604 assert(*output == NULL || is_invlist(*output));
605
606 len_b = _invlist_len(b);
607 if (len_b == 0) {
608
609 /* Here, 'b' is empty, hence it's complement is all possible code
610 * points. So if the union includes the complement of 'b', it includes
611 * everything, and we need not even look at 'a'. It's easiest to
612 * create a new inversion list that matches everything. */
613 if (complement_b) {
614 SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
615
616 if (*output == NULL) { /* If the output didn't exist, just point it
617 at the new list */
618 *output = everything;
619 }
620 else { /* Otherwise, replace its contents with the new list */
621 invlist_replace_list_destroys_src(*output, everything);
622 SvREFCNT_dec_NN(everything);
623 }
624
625 return;
626 }
627
628 /* Here, we don't want the complement of 'b', and since 'b' is empty,
629 * the union will come entirely from 'a'. If 'a' is NULL or empty, the
630 * output will be empty */
631
632 if (a == NULL || _invlist_len(a) == 0) {
633 if (*output == NULL) {
634 *output = _new_invlist(0);
635 }
636 else {
637 invlist_clear(*output);
638 }
639 return;
640 }
641
642 /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
643 * union. We can just return a copy of 'a' if '*output' doesn't point
644 * to an existing list */
645 if (*output == NULL) {
646 *output = invlist_clone(a, NULL);
647 return;
648 }
649
650 /* If the output is to overwrite 'a', we have a no-op, as it's
651 * already in 'a' */
652 if (*output == a) {
653 return;
654 }
655
656 /* Here, '*output' is to be overwritten by 'a' */
657 u = invlist_clone(a, NULL);
658 invlist_replace_list_destroys_src(*output, u);
659 SvREFCNT_dec_NN(u);
660
661 return;
662 }
663
664 /* Here 'b' is not empty. See about 'a' */
665
666 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
667
668 /* Here, 'a' is empty (and b is not). That means the union will come
669 * entirely from 'b'. If '*output' is NULL, we can directly return a
670 * clone of 'b'. Otherwise, we replace the contents of '*output' with
671 * the clone */
672
673 SV ** dest = (*output == NULL) ? output : &u;
674 *dest = invlist_clone(b, NULL);
675 if (complement_b) {
676 _invlist_invert(*dest);
677 }
678
679 if (dest == &u) {
680 invlist_replace_list_destroys_src(*output, u);
681 SvREFCNT_dec_NN(u);
682 }
683
684 return;
685 }
686
687 /* Here both lists exist and are non-empty */
688 array_a = invlist_array(a);
689 array_b = invlist_array(b);
690
691 /* If are to take the union of 'a' with the complement of b, set it
692 * up so are looking at b's complement. */
693 if (complement_b) {
694
695 /* To complement, we invert: if the first element is 0, remove it. To
696 * do this, we just pretend the array starts one later */
697 if (array_b[0] == 0) {
698 array_b++;
699 len_b--;
700 }
701 else {
702
703 /* But if the first element is not zero, we pretend the list starts
704 * at the 0 that is always stored immediately before the array. */
705 array_b--;
706 len_b++;
707 }
708 }
709
710 /* Size the union for the worst case: that the sets are completely
711 * disjoint */
712 u = _new_invlist(len_a + len_b);
713
714 /* Will contain U+0000 if either component does */
715 array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0)
716 || (len_b > 0 && array_b[0] == 0));
717
718 /* Go through each input list item by item, stopping when have exhausted
719 * one of them */
720 while (i_a < len_a && i_b < len_b) {
721 UV cp; /* The element to potentially add to the union's array */
722 bool cp_in_set; /* is it in the input list's set or not */
723
724 /* We need to take one or the other of the two inputs for the union.
725 * Since we are merging two sorted lists, we take the smaller of the
726 * next items. In case of a tie, we take first the one that is in its
727 * set. If we first took the one not in its set, it would decrement
728 * the count, possibly to 0 which would cause it to be output as ending
729 * the range, and the next time through we would take the same number,
730 * and output it again as beginning the next range. By doing it the
731 * opposite way, there is no possibility that the count will be
732 * momentarily decremented to 0, and thus the two adjoining ranges will
733 * be seamlessly merged. (In a tie and both are in the set or both not
734 * in the set, it doesn't matter which we take first.) */
735 if ( array_a[i_a] < array_b[i_b]
736 || ( array_a[i_a] == array_b[i_b]
737 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
738 {
739 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
740 cp = array_a[i_a++];
741 }
742 else {
743 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
744 cp = array_b[i_b++];
745 }
746
747 /* Here, have chosen which of the two inputs to look at. Only output
748 * if the running count changes to/from 0, which marks the
749 * beginning/end of a range that's in the set */
750 if (cp_in_set) {
751 if (count == 0) {
752 array_u[i_u++] = cp;
753 }
754 count++;
755 }
756 else {
757 count--;
758 if (count == 0) {
759 array_u[i_u++] = cp;
760 }
761 }
762 }
763
764
765 /* The loop above increments the index into exactly one of the input lists
766 * each iteration, and ends when either index gets to its list end. That
767 * means the other index is lower than its end, and so something is
768 * remaining in that one. We decrement 'count', as explained below, if
769 * that list is in its set. (i_a and i_b each currently index the element
770 * beyond the one we care about.) */
771 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
772 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
773 {
774 count--;
775 }
776
777 /* Above we decremented 'count' if the list that had unexamined elements in
778 * it was in its set. This has made it so that 'count' being non-zero
779 * means there isn't anything left to output; and 'count' equal to 0 means
780 * that what is left to output is precisely that which is left in the
781 * non-exhausted input list.
782 *
783 * To see why, note first that the exhausted input obviously has nothing
784 * left to add to the union. If it was in its set at its end, that means
785 * the set extends from here to the platform's infinity, and hence so does
786 * the union and the non-exhausted set is irrelevant. The exhausted set
787 * also contributed 1 to 'count'. If 'count' was 2, it got decremented to
788 * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
789 * 'count' remains at 1. This is consistent with the decremented 'count'
790 * != 0 meaning there's nothing left to add to the union.
791 *
792 * But if the exhausted input wasn't in its set, it contributed 0 to
793 * 'count', and the rest of the union will be whatever the other input is.
794 * If 'count' was 0, neither list was in its set, and 'count' remains 0;
795 * otherwise it gets decremented to 0. This is consistent with 'count'
796 * == 0 meaning the remainder of the union is whatever is left in the
797 * non-exhausted list. */
798 if (count != 0) {
799 len_u = i_u;
800 }
801 else {
802 IV copy_count = len_a - i_a;
803 if (copy_count > 0) { /* The non-exhausted input is 'a' */
804 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
805 }
806 else { /* The non-exhausted input is b */
807 copy_count = len_b - i_b;
808 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
809 }
810 len_u = i_u + copy_count;
811 }
812
813 /* Set the result to the final length, which can change the pointer to
814 * array_u, so re-find it. (Note that it is unlikely that this will
815 * change, as we are shrinking the space, not enlarging it) */
816 if (len_u != _invlist_len(u)) {
817 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
818 invlist_trim(u);
819 array_u = invlist_array(u);
820 }
821
822 if (*output == NULL) { /* Simply return the new inversion list */
823 *output = u;
824 }
825 else {
826 /* Otherwise, overwrite the inversion list that was in '*output'. We
827 * could instead free '*output', and then set it to 'u', but experience
828 * has shown [perl #127392] that if the input is a mortal, we can get a
829 * huge build-up of these during regex compilation before they get
830 * freed. */
831 invlist_replace_list_destroys_src(*output, u);
832 SvREFCNT_dec_NN(u);
833 }
834
835 return;
836}
837
838void
839Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
840 const bool complement_b, SV** i)
841{
842 /* Take the intersection of two inversion lists and point '*i' to it. On
843 * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
844 * even 'a' or 'b'). If to an inversion list, the contents of the original
845 * list will be replaced by the intersection. The first list, 'a', may be
846 * NULL, in which case '*i' will be an empty list. If 'complement_b' is
847 * TRUE, the result will be the intersection of 'a' and the complement (or
848 * inversion) of 'b' instead of 'b' directly.
849 *
850 * The basis for this comes from "Unicode Demystified" Chapter 13 by
851 * Richard Gillam, published by Addison-Wesley, and explained at some
852 * length there. The preface says to incorporate its examples into your
853 * code at your own risk. In fact, it had bugs
854 *
855 * The algorithm is like a merge sort, and is essentially the same as the
856 * union above
857 */
858
859 const UV* array_a; /* a's array */
860 const UV* array_b;
861 UV len_a; /* length of a's array */
862 UV len_b;
863
864 SV* r; /* the resulting intersection */
865 UV* array_r;
866 UV len_r = 0;
867
868 UV i_a = 0; /* current index into a's array */
869 UV i_b = 0;
870 UV i_r = 0;
871
872 /* running count of how many of the two inputs are postitioned at ranges
873 * that are in their sets. As explained in the algorithm source book,
874 * items are stopped accumulating and are output when the count changes
875 * to/from 2. The count is incremented when we start a range that's in an
876 * input's set, and decremented when we start a range that's not in a set.
877 * Only when it is 2 are we in the intersection. */
878 UV count = 0;
879
880 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
881 assert(a != b);
882 assert(*i == NULL || is_invlist(*i));
883
884 /* Special case if either one is empty */
885 len_a = (a == NULL) ? 0 : _invlist_len(a);
886 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
887 if (len_a != 0 && complement_b) {
888
889 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
890 * must be empty. Here, also we are using 'b's complement, which
891 * hence must be every possible code point. Thus the intersection
892 * is simply 'a'. */
893
894 if (*i == a) { /* No-op */
895 return;
896 }
897
898 if (*i == NULL) {
899 *i = invlist_clone(a, NULL);
900 return;
901 }
902
903 r = invlist_clone(a, NULL);
904 invlist_replace_list_destroys_src(*i, r);
905 SvREFCNT_dec_NN(r);
906 return;
907 }
908
909 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
910 * intersection must be empty */
911 if (*i == NULL) {
912 *i = _new_invlist(0);
913 return;
914 }
915
916 invlist_clear(*i);
917 return;
918 }
919
920 /* Here both lists exist and are non-empty */
921 array_a = invlist_array(a);
922 array_b = invlist_array(b);
923
924 /* If are to take the intersection of 'a' with the complement of b, set it
925 * up so are looking at b's complement. */
926 if (complement_b) {
927
928 /* To complement, we invert: if the first element is 0, remove it. To
929 * do this, we just pretend the array starts one later */
930 if (array_b[0] == 0) {
931 array_b++;
932 len_b--;
933 }
934 else {
935
936 /* But if the first element is not zero, we pretend the list starts
937 * at the 0 that is always stored immediately before the array. */
938 array_b--;
939 len_b++;
940 }
941 }
942
943 /* Size the intersection for the worst case: that the intersection ends up
944 * fragmenting everything to be completely disjoint */
945 r= _new_invlist(len_a + len_b);
946
947 /* Will contain U+0000 iff both components do */
948 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
949 && len_b > 0 && array_b[0] == 0);
950
951 /* Go through each list item by item, stopping when have exhausted one of
952 * them */
953 while (i_a < len_a && i_b < len_b) {
954 UV cp; /* The element to potentially add to the intersection's
955 array */
956 bool cp_in_set; /* Is it in the input list's set or not */
957
958 /* We need to take one or the other of the two inputs for the
959 * intersection. Since we are merging two sorted lists, we take the
960 * smaller of the next items. In case of a tie, we take first the one
961 * that is not in its set (a difference from the union algorithm). If
962 * we first took the one in its set, it would increment the count,
963 * possibly to 2 which would cause it to be output as starting a range
964 * in the intersection, and the next time through we would take that
965 * same number, and output it again as ending the set. By doing the
966 * opposite of this, there is no possibility that the count will be
967 * momentarily incremented to 2. (In a tie and both are in the set or
968 * both not in the set, it doesn't matter which we take first.) */
969 if ( array_a[i_a] < array_b[i_b]
970 || ( array_a[i_a] == array_b[i_b]
971 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
972 {
973 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
974 cp = array_a[i_a++];
975 }
976 else {
977 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
978 cp= array_b[i_b++];
979 }
980
981 /* Here, have chosen which of the two inputs to look at. Only output
982 * if the running count changes to/from 2, which marks the
983 * beginning/end of a range that's in the intersection */
984 if (cp_in_set) {
985 count++;
986 if (count == 2) {
987 array_r[i_r++] = cp;
988 }
989 }
990 else {
991 if (count == 2) {
992 array_r[i_r++] = cp;
993 }
994 count--;
995 }
996
997 }
998
999 /* The loop above increments the index into exactly one of the input lists
1000 * each iteration, and ends when either index gets to its list end. That
1001 * means the other index is lower than its end, and so something is
1002 * remaining in that one. We increment 'count', as explained below, if the
1003 * exhausted list was in its set. (i_a and i_b each currently index the
1004 * element beyond the one we care about.) */
1005 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
1006 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
1007 {
1008 count++;
1009 }
1010
1011 /* Above we incremented 'count' if the exhausted list was in its set. This
1012 * has made it so that 'count' being below 2 means there is nothing left to
1013 * output; otheriwse what's left to add to the intersection is precisely
1014 * that which is left in the non-exhausted input list.
1015 *
1016 * To see why, note first that the exhausted input obviously has nothing
1017 * left to affect the intersection. If it was in its set at its end, that
1018 * means the set extends from here to the platform's infinity, and hence
1019 * anything in the non-exhausted's list will be in the intersection, and
1020 * anything not in it won't be. Hence, the rest of the intersection is
1021 * precisely what's in the non-exhausted list The exhausted set also
1022 * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
1023 * it means 'count' is now at least 2. This is consistent with the
1024 * incremented 'count' being >= 2 means to add the non-exhausted list to
1025 * the intersection.
1026 *
1027 * But if the exhausted input wasn't in its set, it contributed 0 to
1028 * 'count', and the intersection can't include anything further; the
1029 * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
1030 * incremented. This is consistent with 'count' being < 2 meaning nothing
1031 * further to add to the intersection. */
1032 if (count < 2) { /* Nothing left to put in the intersection. */
1033 len_r = i_r;
1034 }
1035 else { /* copy the non-exhausted list, unchanged. */
1036 IV copy_count = len_a - i_a;
1037 if (copy_count > 0) { /* a is the one with stuff left */
1038 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
1039 }
1040 else { /* b is the one with stuff left */
1041 copy_count = len_b - i_b;
1042 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
1043 }
1044 len_r = i_r + copy_count;
1045 }
1046
1047 /* Set the result to the final length, which can change the pointer to
1048 * array_r, so re-find it. (Note that it is unlikely that this will
1049 * change, as we are shrinking the space, not enlarging it) */
1050 if (len_r != _invlist_len(r)) {
1051 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
1052 invlist_trim(r);
1053 array_r = invlist_array(r);
1054 }
1055
1056 if (*i == NULL) { /* Simply return the calculated intersection */
1057 *i = r;
1058 }
1059 else { /* Otherwise, replace the existing inversion list in '*i'. We could
1060 instead free '*i', and then set it to 'r', but experience has
1061 shown [perl #127392] that if the input is a mortal, we can get a
1062 huge build-up of these during regex compilation before they get
1063 freed. */
1064 if (len_r) {
1065 invlist_replace_list_destroys_src(*i, r);
1066 }
1067 else {
1068 invlist_clear(*i);
1069 }
1070 SvREFCNT_dec_NN(r);
1071 }
1072
1073 return;
1074}
1075
1076SV*
1077Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
1078{
1079 /* Add the range from 'start' to 'end' inclusive to the inversion list's
1080 * set. A pointer to the inversion list is returned. This may actually be
1081 * a new list, in which case the passed in one has been destroyed. The
1082 * passed-in inversion list can be NULL, in which case a new one is created
1083 * with just the one range in it. The new list is not necessarily
1084 * NUL-terminated. Space is not freed if the inversion list shrinks as a
1085 * result of this function. The gain would not be large, and in many
1086 * cases, this is called multiple times on a single inversion list, so
1087 * anything freed may almost immediately be needed again.
1088 *
1089 * This used to mostly call the 'union' routine, but that is much more
1090 * heavyweight than really needed for a single range addition */
1091
1092 UV* array; /* The array implementing the inversion list */
1093 UV len; /* How many elements in 'array' */
1094 SSize_t i_s; /* index into the invlist array where 'start'
1095 should go */
1096 SSize_t i_e = 0; /* And the index where 'end' should go */
1097 UV cur_highest; /* The highest code point in the inversion list
1098 upon entry to this function */
1099
1100 /* This range becomes the whole inversion list if none already existed */
1101 if (invlist == NULL) {
1102 invlist = _new_invlist(2);
1103 _append_range_to_invlist(invlist, start, end);
1104 return invlist;
1105 }
1106
1107 /* Likewise, if the inversion list is currently empty */
1108 len = _invlist_len(invlist);
1109 if (len == 0) {
1110 _append_range_to_invlist(invlist, start, end);
1111 return invlist;
1112 }
1113
1114 /* Starting here, we have to know the internals of the list */
1115 array = invlist_array(invlist);
1116
1117 /* If the new range ends higher than the current highest ... */
1118 cur_highest = invlist_highest(invlist);
1119 if (end > cur_highest) {
1120
1121 /* If the whole range is higher, we can just append it */
1122 if (start > cur_highest) {
1123 _append_range_to_invlist(invlist, start, end);
1124 return invlist;
1125 }
1126
1127 /* Otherwise, add the portion that is higher ... */
1128 _append_range_to_invlist(invlist, cur_highest + 1, end);
1129
1130 /* ... and continue on below to handle the rest. As a result of the
1131 * above append, we know that the index of the end of the range is the
1132 * final even numbered one of the array. Recall that the final element
1133 * always starts a range that extends to infinity. If that range is in
1134 * the set (meaning the set goes from here to infinity), it will be an
1135 * even index, but if it isn't in the set, it's odd, and the final
1136 * range in the set is one less, which is even. */
1137 if (end == UV_MAX) {
1138 i_e = len;
1139 }
1140 else {
1141 i_e = len - 2;
1142 }
1143 }
1144
1145 /* We have dealt with appending, now see about prepending. If the new
1146 * range starts lower than the current lowest ... */
1147 if (start < array[0]) {
1148
1149 /* Adding something which has 0 in it is somewhat tricky, and uncommon.
1150 * Let the union code handle it, rather than having to know the
1151 * trickiness in two code places. */
1152 if (UNLIKELY(start == 0)) {
1153 SV* range_invlist;
1154
1155 range_invlist = _new_invlist(2);
1156 _append_range_to_invlist(range_invlist, start, end);
1157
1158 _invlist_union(invlist, range_invlist, &invlist);
1159
1160 SvREFCNT_dec_NN(range_invlist);
1161
1162 return invlist;
1163 }
1164
1165 /* If the whole new range comes before the first entry, and doesn't
1166 * extend it, we have to insert it as an additional range */
1167 if (end < array[0] - 1) {
1168 i_s = i_e = -1;
1169 goto splice_in_new_range;
1170 }
1171
1172 /* Here the new range adjoins the existing first range, extending it
1173 * downwards. */
1174 array[0] = start;
1175
1176 /* And continue on below to handle the rest. We know that the index of
1177 * the beginning of the range is the first one of the array */
1178 i_s = 0;
1179 }
1180 else { /* Not prepending any part of the new range to the existing list.
1181 * Find where in the list it should go. This finds i_s, such that:
1182 * invlist[i_s] <= start < array[i_s+1]
1183 */
1184 i_s = _invlist_search(invlist, start);
1185 }
1186
1187 /* At this point, any extending before the beginning of the inversion list
1188 * and/or after the end has been done. This has made it so that, in the
1189 * code below, each endpoint of the new range is either in a range that is
1190 * in the set, or is in a gap between two ranges that are. This means we
1191 * don't have to worry about exceeding the array bounds.
1192 *
1193 * Find where in the list the new range ends (but we can skip this if we
1194 * have already determined what it is, or if it will be the same as i_s,
1195 * which we already have computed) */
1196 if (i_e == 0) {
1197 i_e = (start == end)
1198 ? i_s
1199 : _invlist_search(invlist, end);
1200 }
1201
1202 /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
1203 * is a range that goes to infinity there is no element at invlist[i_e+1],
1204 * so only the first relation holds. */
1205
1206 if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
1207
1208 /* Here, the ranges on either side of the beginning of the new range
1209 * are in the set, and this range starts in the gap between them.
1210 *
1211 * The new range extends the range above it downwards if the new range
1212 * ends at or above that range's start */
1213 const bool extends_the_range_above = ( end == UV_MAX
1214 || end + 1 >= array[i_s+1]);
1215
1216 /* The new range extends the range below it upwards if it begins just
1217 * after where that range ends */
1218 if (start == array[i_s]) {
1219
1220 /* If the new range fills the entire gap between the other ranges,
1221 * they will get merged together. Other ranges may also get
1222 * merged, depending on how many of them the new range spans. In
1223 * the general case, we do the merge later, just once, after we
1224 * figure out how many to merge. But in the case where the new
1225 * range exactly spans just this one gap (possibly extending into
1226 * the one above), we do the merge here, and an early exit. This
1227 * is done here to avoid having to special case later. */
1228 if (i_e - i_s <= 1) {
1229
1230 /* If i_e - i_s == 1, it means that the new range terminates
1231 * within the range above, and hence 'extends_the_range_above'
1232 * must be true. (If the range above it extends to infinity,
1233 * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
1234 * will be 0, so no harm done.) */
1235 if (extends_the_range_above) {
1236 Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
1237 invlist_set_len(invlist,
1238 len - 2,
1239 *(get_invlist_offset_addr(invlist)));
1240 return invlist;
1241 }
1242
1243 /* Here, i_e must == i_s. We keep them in sync, as they apply
1244 * to the same range, and below we are about to decrement i_s
1245 * */
1246 i_e--;
1247 }
1248
1249 /* Here, the new range is adjacent to the one below. (It may also
1250 * span beyond the range above, but that will get resolved later.)
1251 * Extend the range below to include this one. */
1252 array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
1253 i_s--;
1254 start = array[i_s];
1255 }
1256 else if (extends_the_range_above) {
1257
1258 /* Here the new range only extends the range above it, but not the
1259 * one below. It merges with the one above. Again, we keep i_e
1260 * and i_s in sync if they point to the same range */
1261 if (i_e == i_s) {
1262 i_e++;
1263 }
1264 i_s++;
1265 array[i_s] = start;
1266 }
1267 }
1268
1269 /* Here, we've dealt with the new range start extending any adjoining
1270 * existing ranges.
1271 *
1272 * If the new range extends to infinity, it is now the final one,
1273 * regardless of what was there before */
1274 if (UNLIKELY(end == UV_MAX)) {
1275 invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
1276 return invlist;
1277 }
1278
1279 /* If i_e started as == i_s, it has also been dealt with,
1280 * and been updated to the new i_s, which will fail the following if */
1281 if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
1282
1283 /* Here, the ranges on either side of the end of the new range are in
1284 * the set, and this range ends in the gap between them.
1285 *
1286 * If this range is adjacent to (hence extends) the range above it, it
1287 * becomes part of that range; likewise if it extends the range below,
1288 * it becomes part of that range */
1289 if (end + 1 == array[i_e+1]) {
1290 i_e++;
1291 array[i_e] = start;
1292 }
1293 else if (start <= array[i_e]) {
1294 array[i_e] = end + 1;
1295 i_e--;
1296 }
1297 }
1298
1299 if (i_s == i_e) {
1300
1301 /* If the range fits entirely in an existing range (as possibly already
1302 * extended above), it doesn't add anything new */
1303 if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
1304 return invlist;
1305 }
1306
1307 /* Here, no part of the range is in the list. Must add it. It will
1308 * occupy 2 more slots */
1309 splice_in_new_range:
1310
1311 invlist_extend(invlist, len + 2);
1312 array = invlist_array(invlist);
1313 /* Move the rest of the array down two slots. Don't include any
1314 * trailing NUL */
1315 Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
1316
1317 /* Do the actual splice */
1318 array[i_e+1] = start;
1319 array[i_e+2] = end + 1;
1320 invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
1321 return invlist;
1322 }
1323
1324 /* Here the new range crossed the boundaries of a pre-existing range. The
1325 * code above has adjusted things so that both ends are in ranges that are
1326 * in the set. This means everything in between must also be in the set.
1327 * Just squash things together */
1328 Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
1329 invlist_set_len(invlist,
1330 len - i_e + i_s,
1331 *(get_invlist_offset_addr(invlist)));
1332
1333 return invlist;
1334}
1335
1336SV*
1337Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
1338 UV** other_elements_ptr)
1339{
1340 /* Create and return an inversion list whose contents are to be populated
1341 * by the caller. The caller gives the number of elements (in 'size') and
1342 * the very first element ('element0'). This function will set
1343 * '*other_elements_ptr' to an array of UVs, where the remaining elements
1344 * are to be placed.
1345 *
1346 * Obviously there is some trust involved that the caller will properly
1347 * fill in the other elements of the array.
1348 *
1349 * (The first element needs to be passed in, as the underlying code does
1350 * things differently depending on whether it is zero or non-zero) */
1351
1352 SV* invlist = _new_invlist(size);
1353 bool offset;
1354
1355 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
1356
1357 invlist = add_cp_to_invlist(invlist, element0);
1358 offset = *get_invlist_offset_addr(invlist);
1359
1360 invlist_set_len(invlist, size, offset);
1361 *other_elements_ptr = invlist_array(invlist) + 1;
1362 return invlist;
1363}
1364
1365#endif
1366
1367#ifndef PERL_IN_XSUB_RE
1368void
1369Perl__invlist_invert(pTHX_ SV* const invlist)
1370{
1371 /* Complement the input inversion list. This adds a 0 if the list didn't
1372 * have a zero; removes it otherwise. As described above, the data
1373 * structure is set up so that this is very efficient */
1374
1375 PERL_ARGS_ASSERT__INVLIST_INVERT;
1376
1377 assert(! invlist_is_iterating(invlist));
1378
1379 /* The inverse of matching nothing is matching everything */
1380 if (_invlist_len(invlist) == 0) {
1381 _append_range_to_invlist(invlist, 0, UV_MAX);
1382 return;
1383 }
1384
1385 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
1386}
1387
1388SV*
1389Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
1390{
1391 /* Return a new inversion list that is a copy of the input one, which is
1392 * unchanged. The new list will not be mortal even if the old one was. */
1393
1394 const STRLEN nominal_length = _invlist_len(invlist);
1395 const STRLEN physical_length = SvCUR(invlist);
1396 const bool offset = *(get_invlist_offset_addr(invlist));
1397
1398 PERL_ARGS_ASSERT_INVLIST_CLONE;
1399
1400 if (new_invlist == NULL) {
1401 new_invlist = _new_invlist(nominal_length);
1402 }
1403 else {
1404 sv_upgrade(new_invlist, SVt_INVLIST);
1405 initialize_invlist_guts(new_invlist, nominal_length);
1406 }
1407
1408 *(get_invlist_offset_addr(new_invlist)) = offset;
1409 invlist_set_len(new_invlist, nominal_length, offset);
1410 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
1411
1412 return new_invlist;
1413}
1414
1415#endif
1416
1417
1418#ifndef PERL_IN_XSUB_RE
1419void
1420Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
1421 const char * const indent, SV* const invlist)
1422{
1423 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
1424 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
1425 * the string 'indent'. The output looks like this:
1426 [0] 0x000A .. 0x000D
1427 [2] 0x0085
1428 [4] 0x2028 .. 0x2029
1429 [6] 0x3104 .. INFTY
1430 * This means that the first range of code points matched by the list are
1431 * 0xA through 0xD; the second range contains only the single code point
1432 * 0x85, etc. An inversion list is an array of UVs. Two array elements
1433 * are used to define each range (except if the final range extends to
1434 * infinity, only a single element is needed). The array index of the
1435 * first element for the corresponding range is given in brackets. */
1436
1437 UV start, end;
1438 STRLEN count = 0;
1439
1440 PERL_ARGS_ASSERT__INVLIST_DUMP;
1441
1442 if (invlist_is_iterating(invlist)) {
1443 Perl_dump_indent(aTHX_ level, file,
1444 "%sCan't dump inversion list because is in middle of iterating\n",
1445 indent);
1446 return;
1447 }
1448
1449 invlist_iterinit(invlist);
1450 while (invlist_iternext(invlist, &start, &end)) {
1451 if (end == UV_MAX) {
1452 Perl_dump_indent(aTHX_ level, file,
1453 "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
1454 indent, (UV)count, start);
1455 }
1456 else if (end != start) {
1457 Perl_dump_indent(aTHX_ level, file,
1458 "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
1459 indent, (UV)count, start, end);
1460 }
1461 else {
1462 Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
1463 indent, (UV)count, start);
1464 }
1465 count += 2;
1466 }
1467}
1468
1469#endif
1470
1471#if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
1472bool
1473Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
1474{
1475 /* Return a boolean as to if the two passed in inversion lists are
1476 * identical. The final argument, if TRUE, says to take the complement of
1477 * the second inversion list before doing the comparison */
1478
1479 const UV len_a = _invlist_len(a);
1480 UV len_b = _invlist_len(b);
1481
1482 const UV* array_a = NULL;
1483 const UV* array_b = NULL;
1484
1485 PERL_ARGS_ASSERT__INVLISTEQ;
1486
1487 /* This code avoids accessing the arrays unless it knows the length is
1488 * non-zero */
1489
1490 if (len_a == 0) {
1491 if (len_b == 0) {
1492 return ! complement_b;
1493 }
1494 }
1495 else {
1496 array_a = invlist_array(a);
1497 }
1498
1499 if (len_b != 0) {
1500 array_b = invlist_array(b);
1501 }
1502
1503 /* If are to compare 'a' with the complement of b, set it
1504 * up so are looking at b's complement. */
1505 if (complement_b) {
1506
1507 /* The complement of nothing is everything, so <a> would have to have
1508 * just one element, starting at zero (ending at infinity) */
1509 if (len_b == 0) {
1510 return (len_a == 1 && array_a[0] == 0);
1511 }
1512 if (array_b[0] == 0) {
1513
1514 /* Otherwise, to complement, we invert. Here, the first element is
1515 * 0, just remove it. To do this, we just pretend the array starts
1516 * one later */
1517
1518 array_b++;
1519 len_b--;
1520 }
1521 else {
1522
1523 /* But if the first element is not zero, we pretend the list starts
1524 * at the 0 that is always stored immediately before the array. */
1525 array_b--;
1526 len_b++;
1527 }
1528 }
1529
1530 return len_a == len_b
1531 && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
1532
1533}
1534#endif
1535
1536#undef HEADER_LENGTH
1537#undef TO_INTERNAL_SIZE
1538#undef FROM_INTERNAL_SIZE
1539#undef INVLIST_VERSION_ID
1540
1541/* End of inversion list object */