regcomp_invlist.c

   1 #ifdef PERL_EXT_RE_BUILD
   2 #include "re_top.h"
   3 #endif
   4
   5 #include "EXTERN.h"
   6 #define PERL_IN_REGEX_ENGINE
   7 #define PERL_IN_REGCOMP_ANY
   8 #define PERL_IN_REGCOMP_INVLIST_C
   9 #include "perl.h"
  10
  11 #ifdef PERL_IN_XSUB_RE
  12 #  include "re_comp.h"
  13 #else
  14 #  include "regcomp.h"
  15 #endif
  16
  17 #include "invlist_inline.h"
  18 #include "unicode_constants.h"
  19 #include "regcomp_internal.h"
  20
  21
  22 void
  23 Perl_populate_bitmap_from_invlist(pTHX_ SV * invlist, const UV offset, const U8 * bitmap, const Size_t len)
  24 {
  25     PERL_ARGS_ASSERT_POPULATE_BITMAP_FROM_INVLIST;
  26
  27     /* As the name says.  The zeroth bit corresponds to the code point given by
  28      * 'offset' */
  29
  30     UV start, end;
  31
  32     Zero(bitmap, len, U8);
  33
  34     invlist_iterinit(invlist);
  35     while (invlist_iternext(invlist, &start, &end)) {
  36         assert(start >= offset);
  37
  38         for (UV i = start; i <= end; i++) {
  39             UV adjusted = i - offset;
  40
  41             BITMAP_BYTE(bitmap, adjusted) |= BITMAP_BIT(adjusted);
  42         }
  43     }
  44     invlist_iterfinish(invlist);
  45 }
  46
  47 void
  48 Perl_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap, const Size_t bitmap_len, SV ** invlist, const UV offset)
  49 {
  50     PERL_ARGS_ASSERT_POPULATE_INVLIST_FROM_BITMAP;
  51
  52     /* As the name says.  The zeroth bit corresponds to the code point given by
  53      * 'offset' */
  54
  55     Size_t i;
  56
  57     for (i = 0; i < bitmap_len; i++) {
  58         if (BITMAP_TEST(bitmap, i)) {
  59             int start = i++;
  60
  61             /* Save a little work by adding a range all at once instead of bit
  62              * by bit */
  63             while (i < bitmap_len && BITMAP_TEST(bitmap, i)) {
  64                 i++;
  65             }
  66
  67             *invlist = _add_range_to_invlist(*invlist,
  68                                              start + offset,
  69                                              i + offset - 1);
  70         }
  71     }
  72 }
  73
  74 /* This section of code defines the inversion list object and its methods.  The
  75  * interfaces are highly subject to change, so as much as possible is static to
  76  * this file.  An inversion list is here implemented as a malloc'd C UV array
  77  * as an SVt_INVLIST scalar.
  78  *
  79  * An inversion list for Unicode is an array of code points, sorted by ordinal
  80  * number.  Each element gives the code point that begins a range that extends
  81  * up-to but not including the code point given by the next element.  The final
  82  * element gives the first code point of a range that extends to the platform's
  83  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
  84  * ...) give ranges whose code points are all in the inversion list.  We say
  85  * that those ranges are in the set.  The odd-numbered elements give ranges
  86  * whose code points are not in the inversion list, and hence not in the set.
  87  * Thus, element [0] is the first code point in the list.  Element [1]
  88  * is the first code point beyond that not in the list; and element [2] is the
  89  * first code point beyond that that is in the list.  In other words, the first
  90  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
  91  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
  92  * all code points in that range are not in the inversion list.  The third
  93  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
  94  * list, and so forth.  Thus every element whose index is divisible by two
  95  * gives the beginning of a range that is in the list, and every element whose
  96  * index is not divisible by two gives the beginning of a range not in the
  97  * list.  If the final element's index is divisible by two, the inversion list
  98  * extends to the platform's infinity; otherwise the highest code point in the
  99  * inversion list is the contents of that element minus 1.
 100  *
 101  * A range that contains just a single code point N will look like
 102  *  invlist[i]   == N
 103  *  invlist[i+1] == N+1
 104  *
 105  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
 106  * impossible to represent, so element [i+1] is omitted.  The single element
 107  * inversion list
 108  *  invlist[0] == UV_MAX
 109  * contains just UV_MAX, but is interpreted as matching to infinity.
 110  *
 111  * Taking the complement (inverting) an inversion list is quite simple, if the
 112  * first element is 0, remove it; otherwise add a 0 element at the beginning.
 113  * This implementation reserves an element at the beginning of each inversion
 114  * list to always contain 0; there is an additional flag in the header which
 115  * indicates if the list begins at the 0, or is offset to begin at the next
 116  * element.  This means that the inversion list can be inverted without any
 117  * copying; just flip the flag.
 118  *
 119  * More about inversion lists can be found in "Unicode Demystified"
 120  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
 121  *
 122  * The inversion list data structure is currently implemented as an SV pointing
 123  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
 124  * array of UV whose memory management is automatically handled by the existing
 125  * facilities for SV's.
 126  *
 127  * Some of the methods should always be private to the implementation, and some
 128  * should eventually be made public */
 129
 130 /* The header definitions are in F<invlist_inline.h> */
 131
 132 #ifndef PERL_IN_XSUB_RE
 133
 134 PERL_STATIC_INLINE UV*
 135 S__invlist_array_init(SV* const invlist, const bool will_have_0)
 136 {
 137     /* Returns a pointer to the first element in the inversion list's array.
 138      * This is called upon initialization of an inversion list.  Where the
 139      * array begins depends on whether the list has the code point U+0000 in it
 140      * or not.  The other parameter tells it whether the code that follows this
 141      * call is about to put a 0 in the inversion list or not.  The first
 142      * element is either the element reserved for 0, if TRUE, or the element
 143      * after it, if FALSE */
 144
 145     bool* offset = get_invlist_offset_addr(invlist);
 146     UV* zero_addr = (UV *) SvPVX(invlist);
 147
 148     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
 149
 150     /* Must be empty */
 151     assert(! _invlist_len(invlist));
 152
 153     *zero_addr = 0;
 154
 155     /* 1^1 = 0; 1^0 = 1 */
 156     *offset = 1 ^ will_have_0;
 157     return zero_addr + *offset;
 158 }
 159
 160 STATIC void
 161 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
 162 {
 163     /* Replaces the inversion list in 'dest' with the one from 'src'.  It
 164      * steals the list from 'src', so 'src' is made to have a NULL list.  This
 165      * is similar to what SvSetMagicSV() would do, if it were implemented on
 166      * inversion lists, though this routine avoids a copy */
 167
 168     const UV src_len          = _invlist_len(src);
 169     const bool src_offset     = *get_invlist_offset_addr(src);
 170     const STRLEN src_byte_len = SvLEN(src);
 171     char * array              = SvPVX(src);
 172
 173 #ifndef NO_TAINT_SUPPORT
 174     const int oldtainted = TAINT_get;
 175 #endif
 176
 177     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
 178
 179     assert(is_invlist(src));
 180     assert(is_invlist(dest));
 181     assert(! invlist_is_iterating(src));
 182     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
 183
 184     /* Make sure it ends in the right place with a NUL, as our inversion list
 185      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
 186      * asserts it */
 187     array[src_byte_len - 1] = '\0';
 188
 189     TAINT_NOT;      /* Otherwise it breaks */
 190     sv_usepvn_flags(dest,
 191                     (char *) array,
 192                     src_byte_len - 1,
 193
 194                     /* This flag is documented to cause a copy to be avoided */
 195                     SV_HAS_TRAILING_NUL);
 196     TAINT_set(oldtainted);
 197     SvPV_set(src, 0);
 198     SvLEN_set(src, 0);
 199     SvCUR_set(src, 0);
 200
 201     /* Finish up copying over the other fields in an inversion list */
 202     *get_invlist_offset_addr(dest) = src_offset;
 203     invlist_set_len(dest, src_len, src_offset);
 204     *get_invlist_previous_index_addr(dest) = 0;
 205     invlist_iterfinish(dest);
 206 }
 207
 208 PERL_STATIC_INLINE IV*
 209 S_get_invlist_previous_index_addr(SV* invlist)
 210 {
 211     /* Return the address of the IV that is reserved to hold the cached index
 212      * */
 213     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
 214
 215     assert(is_invlist(invlist));
 216
 217     return &(((XINVLIST*) SvANY(invlist))->prev_index);
 218 }
 219
 220 PERL_STATIC_INLINE IV
 221 S_invlist_previous_index(SV* const invlist)
 222 {
 223     /* Returns cached index of previous search */
 224
 225     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
 226
 227     return *get_invlist_previous_index_addr(invlist);
 228 }
 229
 230 PERL_STATIC_INLINE void
 231 S_invlist_set_previous_index(SV* const invlist, const IV index)
 232 {
 233     /* Caches <index> for later retrieval */
 234
 235     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
 236
 237     assert(index == 0 || index < (int) _invlist_len(invlist));
 238
 239     *get_invlist_previous_index_addr(invlist) = index;
 240 }
 241
 242 PERL_STATIC_INLINE void
 243 S_invlist_trim(SV* invlist)
 244 {
 245     /* Free the not currently-being-used space in an inversion list */
 246
 247     /* But don't free up the space needed for the 0 UV that is always at the
 248      * beginning of the list, nor the trailing NUL */
 249     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
 250
 251     PERL_ARGS_ASSERT_INVLIST_TRIM;
 252
 253     assert(is_invlist(invlist));
 254
 255     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
 256 }
 257
 258 PERL_STATIC_INLINE void
 259 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
 260 {
 261     PERL_ARGS_ASSERT_INVLIST_CLEAR;
 262
 263     assert(is_invlist(invlist));
 264
 265     invlist_set_len(invlist, 0, 0);
 266     invlist_trim(invlist);
 267 }
 268
 269 PERL_STATIC_INLINE UV
 270 S_invlist_max(const SV* const invlist)
 271 {
 272     /* Returns the maximum number of elements storable in the inversion list's
 273      * array, without having to realloc() */
 274
 275     PERL_ARGS_ASSERT_INVLIST_MAX;
 276
 277     assert(is_invlist(invlist));
 278
 279     /* Assumes worst case, in which the 0 element is not counted in the
 280      * inversion list, so subtracts 1 for that */
 281     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
 282            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
 283            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
 284 }
 285
 286 STATIC void
 287 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
 288 {
 289     PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
 290
 291     /* First 1 is in case the zero element isn't in the list; second 1 is for
 292      * trailing NUL */
 293     SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
 294     invlist_set_len(invlist, 0, 0);
 295
 296     /* Force iterinit() to be used to get iteration to work */
 297     invlist_iterfinish(invlist);
 298
 299     *get_invlist_previous_index_addr(invlist) = 0;
 300     SvPOK_on(invlist);  /* This allows B to extract the PV */
 301 }
 302
 303 SV*
 304 Perl__new_invlist(pTHX_ IV initial_size)
 305 {
 306
 307     /* Return a pointer to a newly constructed inversion list, with enough
 308      * space to store 'initial_size' elements.  If that number is negative, a
 309      * system default is used instead */
 310
 311     SV* new_list;
 312
 313     if (initial_size < 0) {
 314         initial_size = 10;
 315     }
 316
 317     new_list = newSV_type(SVt_INVLIST);
 318     initialize_invlist_guts(new_list, initial_size);
 319
 320     return new_list;
 321 }
 322
 323 SV*
 324 Perl__new_invlist_C_array(pTHX_ const UV* const list)
 325 {
 326     /* Return a pointer to a newly constructed inversion list, initialized to
 327      * point to <list>, which has to be in the exact correct inversion list
 328      * form, including internal fields.  Thus this is a dangerous routine that
 329      * should not be used in the wrong hands.  The passed in 'list' contains
 330      * several header fields at the beginning that are not part of the
 331      * inversion list body proper */
 332
 333     const STRLEN length = (STRLEN) list[0];
 334     const UV version_id =          list[1];
 335     const bool offset   =    cBOOL(list[2]);
 336 #define HEADER_LENGTH 3
 337     /* If any of the above changes in any way, you must change HEADER_LENGTH
 338      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
 339      *      perl -E 'say int(rand 2**31-1)'
 340      */
 341 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
 342                                         data structure type, so that one being
 343                                         passed in can be validated to be an
 344                                         inversion list of the correct vintage.
 345                                        */
 346
 347     SV* invlist = newSV_type(SVt_INVLIST);
 348
 349     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
 350
 351     if (version_id != INVLIST_VERSION_ID) {
 352         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
 353     }
 354
 355     /* The generated array passed in includes header elements that aren't part
 356      * of the list proper, so start it just after them */
 357     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
 358
 359     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
 360                                shouldn't touch it */
 361
 362     *(get_invlist_offset_addr(invlist)) = offset;
 363
 364     /* The 'length' passed to us is the physical number of elements in the
 365      * inversion list.  But if there is an offset the logical number is one
 366      * less than that */
 367     invlist_set_len(invlist, length  - offset, offset);
 368
 369     invlist_set_previous_index(invlist, 0);
 370
 371     /* Initialize the iteration pointer. */
 372     invlist_iterfinish(invlist);
 373
 374     SvREADONLY_on(invlist);
 375     SvPOK_on(invlist);
 376
 377     return invlist;
 378 }
 379
 380 STATIC void
 381 S__append_range_to_invlist(pTHX_ SV* const invlist,
 382                                  const UV start, const UV end)
 383 {
 384    /* Subject to change or removal.  Append the range from 'start' to 'end' at
 385     * the end of the inversion list.  The range must be above any existing
 386     * ones. */
 387
 388     UV* array;
 389     UV max = invlist_max(invlist);
 390     UV len = _invlist_len(invlist);
 391     bool offset;
 392
 393     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
 394
 395     if (len == 0) { /* Empty lists must be initialized */
 396         offset = start != 0;
 397         array = _invlist_array_init(invlist, ! offset);
 398     }
 399     else {
 400         /* Here, the existing list is non-empty. The current max entry in the
 401          * list is generally the first value not in the set, except when the
 402          * set extends to the end of permissible values, in which case it is
 403          * the first entry in that final set, and so this call is an attempt to
 404          * append out-of-order */
 405
 406         UV final_element = len - 1;
 407         array = invlist_array(invlist);
 408         if (   array[final_element] > start
 409             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
 410         {
 411             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
 412                      array[final_element], start,
 413                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
 414         }
 415
 416         /* Here, it is a legal append.  If the new range begins 1 above the end
 417          * of the range below it, it is extending the range below it, so the
 418          * new first value not in the set is one greater than the newly
 419          * extended range.  */
 420         offset = *get_invlist_offset_addr(invlist);
 421         if (array[final_element] == start) {
 422             if (end != UV_MAX) {
 423                 array[final_element] = end + 1;
 424             }
 425             else {
 426                 /* But if the end is the maximum representable on the machine,
 427                  * assume that infinity was actually what was meant.  Just let
 428                  * the range that this would extend to have no end */
 429                 invlist_set_len(invlist, len - 1, offset);
 430             }
 431             return;
 432         }
 433     }
 434
 435     /* Here the new range doesn't extend any existing set.  Add it */
 436
 437     len += 2;   /* Includes an element each for the start and end of range */
 438
 439     /* If wll overflow the existing space, extend, which may cause the array to
 440      * be moved */
 441     if (max < len) {
 442         invlist_extend(invlist, len);
 443
 444         /* Have to set len here to avoid assert failure in invlist_array() */
 445         invlist_set_len(invlist, len, offset);
 446
 447         array = invlist_array(invlist);
 448     }
 449     else {
 450         invlist_set_len(invlist, len, offset);
 451     }
 452
 453     /* The next item on the list starts the range, the one after that is
 454      * one past the new range.  */
 455     array[len - 2] = start;
 456     if (end != UV_MAX) {
 457         array[len - 1] = end + 1;
 458     }
 459     else {
 460         /* But if the end is the maximum representable on the machine, just let
 461          * the range have no end */
 462         invlist_set_len(invlist, len - 1, offset);
 463     }
 464 }
 465
 466 SSize_t
 467 Perl__invlist_search(SV* const invlist, const UV cp)
 468 {
 469     /* Searches the inversion list for the entry that contains the input code
 470      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
 471      * return value is the index into the list's array of the range that
 472      * contains <cp>, that is, 'i' such that
 473      *  array[i] <= cp < array[i+1]
 474      */
 475
 476     IV low = 0;
 477     IV mid;
 478     IV high = _invlist_len(invlist);
 479     const IV highest_element = high - 1;
 480     const UV* array;
 481
 482     PERL_ARGS_ASSERT__INVLIST_SEARCH;
 483
 484     /* If list is empty, return failure. */
 485     if (UNLIKELY(high == 0)) {
 486         return -1;
 487     }
 488
 489     /* (We can't get the array unless we know the list is non-empty) */
 490     array = invlist_array(invlist);
 491
 492     mid = invlist_previous_index(invlist);
 493     assert(mid >=0);
 494     if (UNLIKELY(mid > highest_element)) {
 495         mid = highest_element;
 496     }
 497
 498     /* <mid> contains the cache of the result of the previous call to this
 499      * function (0 the first time).  See if this call is for the same result,
 500      * or if it is for mid-1.  This is under the theory that calls to this
 501      * function will often be for related code points that are near each other.
 502      * And benchmarks show that caching gives better results.  We also test
 503      * here if the code point is within the bounds of the list.  These tests
 504      * replace others that would have had to be made anyway to make sure that
 505      * the array bounds were not exceeded, and these give us extra information
 506      * at the same time */
 507     if (cp >= array[mid]) {
 508         if (cp >= array[highest_element]) {
 509             return highest_element;
 510         }
 511
 512         /* Here, array[mid] <= cp < array[highest_element].  This means that
 513          * the final element is not the answer, so can exclude it; it also
 514          * means that <mid> is not the final element, so can refer to 'mid + 1'
 515          * safely */
 516         if (cp < array[mid + 1]) {
 517             return mid;
 518         }
 519         high--;
 520         low = mid + 1;
 521     }
 522     else { /* cp < aray[mid] */
 523         if (cp < array[0]) { /* Fail if outside the array */
 524             return -1;
 525         }
 526         high = mid;
 527         if (cp >= array[mid - 1]) {
 528             goto found_entry;
 529         }
 530     }
 531
 532     /* Binary search.  What we are looking for is <i> such that
 533      *  array[i] <= cp < array[i+1]
 534      * The loop below converges on the i+1.  Note that there may not be an
 535      * (i+1)th element in the array, and things work nonetheless */
 536     while (low < high) {
 537         mid = (low + high) / 2;
 538         assert(mid <= highest_element);
 539         if (array[mid] <= cp) { /* cp >= array[mid] */
 540             low = mid + 1;
 541
 542             /* We could do this extra test to exit the loop early.
 543             if (cp < array[low]) {
 544                 return mid;
 545             }
 546             */
 547         }
 548         else { /* cp < array[mid] */
 549             high = mid;
 550         }
 551     }
 552
 553   found_entry:
 554     high--;
 555     invlist_set_previous_index(invlist, high);
 556     return high;
 557 }
 558
 559 void
 560 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
 561                                          const bool complement_b, SV** output)
 562 {
 563     /* Take the union of two inversion lists and point '*output' to it.  On
 564      * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
 565      * even 'a' or 'b').  If to an inversion list, the contents of the original
 566      * list will be replaced by the union.  The first list, 'a', may be
 567      * NULL, in which case a copy of the second list is placed in '*output'.
 568      * If 'complement_b' is TRUE, the union is taken of the complement
 569      * (inversion) of 'b' instead of b itself.
 570      *
 571      * The basis for this comes from "Unicode Demystified" Chapter 13 by
 572      * Richard Gillam, published by Addison-Wesley, and explained at some
 573      * length there.  The preface says to incorporate its examples into your
 574      * code at your own risk.
 575      *
 576      * The algorithm is like a merge sort. */
 577
 578     const UV* array_a;    /* a's array */
 579     const UV* array_b;
 580     UV len_a;       /* length of a's array */
 581     UV len_b;
 582
 583     SV* u;                      /* the resulting union */
 584     UV* array_u;
 585     UV len_u = 0;
 586
 587     UV i_a = 0;             /* current index into a's array */
 588     UV i_b = 0;
 589     UV i_u = 0;
 590
 591     /* running count, as explained in the algorithm source book; items are
 592      * stopped accumulating and are output when the count changes to/from 0.
 593      * The count is incremented when we start a range that's in an input's set,
 594      * and decremented when we start a range that's not in a set.  So this
 595      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
 596      * and hence nothing goes into the union; 1, just one of the inputs is in
 597      * its set (and its current range gets added to the union); and 2 when both
 598      * inputs are in their sets.  */
 599     UV count = 0;
 600
 601     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
 602     assert(a != b);
 603     assert(*output == NULL || is_invlist(*output));
 604
 605     len_b = _invlist_len(b);
 606     if (len_b == 0) {
 607
 608         /* Here, 'b' is empty, hence it's complement is all possible code
 609          * points.  So if the union includes the complement of 'b', it includes
 610          * everything, and we need not even look at 'a'.  It's easiest to
 611          * create a new inversion list that matches everything.  */
 612         if (complement_b) {
 613             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
 614
 615             if (*output == NULL) { /* If the output didn't exist, just point it
 616                                       at the new list */
 617                 *output = everything;
 618             }
 619             else { /* Otherwise, replace its contents with the new list */
 620                 invlist_replace_list_destroys_src(*output, everything);
 621                 SvREFCNT_dec_NN(everything);
 622             }
 623
 624             return;
 625         }
 626
 627         /* Here, we don't want the complement of 'b', and since 'b' is empty,
 628          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
 629          * output will be empty */
 630
 631         if (a == NULL || _invlist_len(a) == 0) {
 632             if (*output == NULL) {
 633                 *output = _new_invlist(0);
 634             }
 635             else {
 636                 invlist_clear(*output);
 637             }
 638             return;
 639         }
 640
 641         /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
 642          * union.  We can just return a copy of 'a' if '*output' doesn't point
 643          * to an existing list */
 644         if (*output == NULL) {
 645             *output = invlist_clone(a, NULL);
 646             return;
 647         }
 648
 649         /* If the output is to overwrite 'a', we have a no-op, as it's
 650          * already in 'a' */
 651         if (*output == a) {
 652             return;
 653         }
 654
 655         /* Here, '*output' is to be overwritten by 'a' */
 656         u = invlist_clone(a, NULL);
 657         invlist_replace_list_destroys_src(*output, u);
 658         SvREFCNT_dec_NN(u);
 659
 660         return;
 661     }
 662
 663     /* Here 'b' is not empty.  See about 'a' */
 664
 665     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
 666
 667         /* Here, 'a' is empty (and b is not).  That means the union will come
 668          * entirely from 'b'.  If '*output' is NULL, we can directly return a
 669          * clone of 'b'.  Otherwise, we replace the contents of '*output' with
 670          * the clone */
 671
 672         SV ** dest = (*output == NULL) ? output : &u;
 673         *dest = invlist_clone(b, NULL);
 674         if (complement_b) {
 675             _invlist_invert(*dest);
 676         }
 677
 678         if (dest == &u) {
 679             invlist_replace_list_destroys_src(*output, u);
 680             SvREFCNT_dec_NN(u);
 681         }
 682
 683         return;
 684     }
 685
 686     /* Here both lists exist and are non-empty */
 687     array_a = invlist_array(a);
 688     array_b = invlist_array(b);
 689
 690     /* If are to take the union of 'a' with the complement of b, set it
 691      * up so are looking at b's complement. */
 692     if (complement_b) {
 693
 694         /* To complement, we invert: if the first element is 0, remove it.  To
 695          * do this, we just pretend the array starts one later */
 696         if (array_b[0] == 0) {
 697             array_b++;
 698             len_b--;
 699         }
 700         else {
 701
 702             /* But if the first element is not zero, we pretend the list starts
 703              * at the 0 that is always stored immediately before the array. */
 704             array_b--;
 705             len_b++;
 706         }
 707     }
 708
 709     /* Size the union for the worst case: that the sets are completely
 710      * disjoint */
 711     u = _new_invlist(len_a + len_b);
 712
 713     /* Will contain U+0000 if either component does */
 714     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
 715                                       || (len_b > 0 && array_b[0] == 0));
 716
 717     /* Go through each input list item by item, stopping when have exhausted
 718      * one of them */
 719     while (i_a < len_a && i_b < len_b) {
 720         UV cp;      /* The element to potentially add to the union's array */
 721         bool cp_in_set;   /* is it in the input list's set or not */
 722
 723         /* We need to take one or the other of the two inputs for the union.
 724          * Since we are merging two sorted lists, we take the smaller of the
 725          * next items.  In case of a tie, we take first the one that is in its
 726          * set.  If we first took the one not in its set, it would decrement
 727          * the count, possibly to 0 which would cause it to be output as ending
 728          * the range, and the next time through we would take the same number,
 729          * and output it again as beginning the next range.  By doing it the
 730          * opposite way, there is no possibility that the count will be
 731          * momentarily decremented to 0, and thus the two adjoining ranges will
 732          * be seamlessly merged.  (In a tie and both are in the set or both not
 733          * in the set, it doesn't matter which we take first.) */
 734         if (       array_a[i_a] < array_b[i_b]
 735             || (   array_a[i_a] == array_b[i_b]
 736                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
 737         {
 738             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
 739             cp = array_a[i_a++];
 740         }
 741         else {
 742             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
 743             cp = array_b[i_b++];
 744         }
 745
 746         /* Here, have chosen which of the two inputs to look at.  Only output
 747          * if the running count changes to/from 0, which marks the
 748          * beginning/end of a range that's in the set */
 749         if (cp_in_set) {
 750             if (count == 0) {
 751                 array_u[i_u++] = cp;
 752             }
 753             count++;
 754         }
 755         else {
 756             count--;
 757             if (count == 0) {
 758                 array_u[i_u++] = cp;
 759             }
 760         }
 761     }
 762
 763
 764     /* The loop above increments the index into exactly one of the input lists
 765      * each iteration, and ends when either index gets to its list end.  That
 766      * means the other index is lower than its end, and so something is
 767      * remaining in that one.  We decrement 'count', as explained below, if
 768      * that list is in its set.  (i_a and i_b each currently index the element
 769      * beyond the one we care about.) */
 770     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
 771         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
 772     {
 773         count--;
 774     }
 775
 776     /* Above we decremented 'count' if the list that had unexamined elements in
 777      * it was in its set.  This has made it so that 'count' being non-zero
 778      * means there isn't anything left to output; and 'count' equal to 0 means
 779      * that what is left to output is precisely that which is left in the
 780      * non-exhausted input list.
 781      *
 782      * To see why, note first that the exhausted input obviously has nothing
 783      * left to add to the union.  If it was in its set at its end, that means
 784      * the set extends from here to the platform's infinity, and hence so does
 785      * the union and the non-exhausted set is irrelevant.  The exhausted set
 786      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
 787      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
 788      * 'count' remains at 1.  This is consistent with the decremented 'count'
 789      * != 0 meaning there's nothing left to add to the union.
 790      *
 791      * But if the exhausted input wasn't in its set, it contributed 0 to
 792      * 'count', and the rest of the union will be whatever the other input is.
 793      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
 794      * otherwise it gets decremented to 0.  This is consistent with 'count'
 795      * == 0 meaning the remainder of the union is whatever is left in the
 796      * non-exhausted list. */
 797     if (count != 0) {
 798         len_u = i_u;
 799     }
 800     else {
 801         IV copy_count = len_a - i_a;
 802         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
 803             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
 804         }
 805         else { /* The non-exhausted input is b */
 806             copy_count = len_b - i_b;
 807             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
 808         }
 809         len_u = i_u + copy_count;
 810     }
 811
 812     /* Set the result to the final length, which can change the pointer to
 813      * array_u, so re-find it.  (Note that it is unlikely that this will
 814      * change, as we are shrinking the space, not enlarging it) */
 815     if (len_u != _invlist_len(u)) {
 816         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
 817         invlist_trim(u);
 818         array_u = invlist_array(u);
 819     }
 820
 821     if (*output == NULL) {  /* Simply return the new inversion list */
 822         *output = u;
 823     }
 824     else {
 825         /* Otherwise, overwrite the inversion list that was in '*output'.  We
 826          * could instead free '*output', and then set it to 'u', but experience
 827          * has shown [perl #127392] that if the input is a mortal, we can get a
 828          * huge build-up of these during regex compilation before they get
 829          * freed. */
 830         invlist_replace_list_destroys_src(*output, u);
 831         SvREFCNT_dec_NN(u);
 832     }
 833
 834     return;
 835 }
 836
 837 void
 838 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
 839                                                const bool complement_b, SV** i)
 840 {
 841     /* Take the intersection of two inversion lists and point '*i' to it.  On
 842      * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
 843      * even 'a' or 'b').  If to an inversion list, the contents of the original
 844      * list will be replaced by the intersection.  The first list, 'a', may be
 845      * NULL, in which case '*i' will be an empty list.  If 'complement_b' is
 846      * TRUE, the result will be the intersection of 'a' and the complement (or
 847      * inversion) of 'b' instead of 'b' directly.
 848      *
 849      * The basis for this comes from "Unicode Demystified" Chapter 13 by
 850      * Richard Gillam, published by Addison-Wesley, and explained at some
 851      * length there.  The preface says to incorporate its examples into your
 852      * code at your own risk.  In fact, it had bugs
 853      *
 854      * The algorithm is like a merge sort, and is essentially the same as the
 855      * union above
 856      */
 857
 858     const UV* array_a;          /* a's array */
 859     const UV* array_b;
 860     UV len_a;   /* length of a's array */
 861     UV len_b;
 862
 863     SV* r;                   /* the resulting intersection */
 864     UV* array_r;
 865     UV len_r = 0;
 866
 867     UV i_a = 0;             /* current index into a's array */
 868     UV i_b = 0;
 869     UV i_r = 0;
 870
 871     /* running count of how many of the two inputs are postitioned at ranges
 872      * that are in their sets.  As explained in the algorithm source book,
 873      * items are stopped accumulating and are output when the count changes
 874      * to/from 2.  The count is incremented when we start a range that's in an
 875      * input's set, and decremented when we start a range that's not in a set.
 876      * Only when it is 2 are we in the intersection. */
 877     UV count = 0;
 878
 879     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
 880     assert(a != b);
 881     assert(*i == NULL || is_invlist(*i));
 882
 883     /* Special case if either one is empty */
 884     len_a = (a == NULL) ? 0 : _invlist_len(a);
 885     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
 886         if (len_a != 0 && complement_b) {
 887
 888             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
 889              * must be empty.  Here, also we are using 'b's complement, which
 890              * hence must be every possible code point.  Thus the intersection
 891              * is simply 'a'. */
 892
 893             if (*i == a) {  /* No-op */
 894                 return;
 895             }
 896
 897             if (*i == NULL) {
 898                 *i = invlist_clone(a, NULL);
 899                 return;
 900             }
 901
 902             r = invlist_clone(a, NULL);
 903             invlist_replace_list_destroys_src(*i, r);
 904             SvREFCNT_dec_NN(r);
 905             return;
 906         }
 907
 908         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
 909          * intersection must be empty */
 910         if (*i == NULL) {
 911             *i = _new_invlist(0);
 912             return;
 913         }
 914
 915         invlist_clear(*i);
 916         return;
 917     }
 918
 919     /* Here both lists exist and are non-empty */
 920     array_a = invlist_array(a);
 921     array_b = invlist_array(b);
 922
 923     /* If are to take the intersection of 'a' with the complement of b, set it
 924      * up so are looking at b's complement. */
 925     if (complement_b) {
 926
 927         /* To complement, we invert: if the first element is 0, remove it.  To
 928          * do this, we just pretend the array starts one later */
 929         if (array_b[0] == 0) {
 930             array_b++;
 931             len_b--;
 932         }
 933         else {
 934
 935             /* But if the first element is not zero, we pretend the list starts
 936              * at the 0 that is always stored immediately before the array. */
 937             array_b--;
 938             len_b++;
 939         }
 940     }
 941
 942     /* Size the intersection for the worst case: that the intersection ends up
 943      * fragmenting everything to be completely disjoint */
 944     r= _new_invlist(len_a + len_b);
 945
 946     /* Will contain U+0000 iff both components do */
 947     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
 948                                      && len_b > 0 && array_b[0] == 0);
 949
 950     /* Go through each list item by item, stopping when have exhausted one of
 951      * them */
 952     while (i_a < len_a && i_b < len_b) {
 953         UV cp;      /* The element to potentially add to the intersection's
 954                        array */
 955         bool cp_in_set; /* Is it in the input list's set or not */
 956
 957         /* We need to take one or the other of the two inputs for the
 958          * intersection.  Since we are merging two sorted lists, we take the
 959          * smaller of the next items.  In case of a tie, we take first the one
 960          * that is not in its set (a difference from the union algorithm).  If
 961          * we first took the one in its set, it would increment the count,
 962          * possibly to 2 which would cause it to be output as starting a range
 963          * in the intersection, and the next time through we would take that
 964          * same number, and output it again as ending the set.  By doing the
 965          * opposite of this, there is no possibility that the count will be
 966          * momentarily incremented to 2.  (In a tie and both are in the set or
 967          * both not in the set, it doesn't matter which we take first.) */
 968         if (       array_a[i_a] < array_b[i_b]
 969             || (   array_a[i_a] == array_b[i_b]
 970                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
 971         {
 972             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
 973             cp = array_a[i_a++];
 974         }
 975         else {
 976             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
 977             cp= array_b[i_b++];
 978         }
 979
 980         /* Here, have chosen which of the two inputs to look at.  Only output
 981          * if the running count changes to/from 2, which marks the
 982          * beginning/end of a range that's in the intersection */
 983         if (cp_in_set) {
 984             count++;
 985             if (count == 2) {
 986                 array_r[i_r++] = cp;
 987             }
 988         }
 989         else {
 990             if (count == 2) {
 991                 array_r[i_r++] = cp;
 992             }
 993             count--;
 994         }
 995
 996     }
 997
 998     /* The loop above increments the index into exactly one of the input lists
 999      * each iteration, and ends when either index gets to its list end.  That
1000      * means the other index is lower than its end, and so something is
1001      * remaining in that one.  We increment 'count', as explained below, if the
1002      * exhausted list was in its set.  (i_a and i_b each currently index the
1003      * element beyond the one we care about.) */
1004     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
1005         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
1006     {
1007         count++;
1008     }
1009
1010     /* Above we incremented 'count' if the exhausted list was in its set.  This
1011      * has made it so that 'count' being below 2 means there is nothing left to
1012      * output; otheriwse what's left to add to the intersection is precisely
1013      * that which is left in the non-exhausted input list.
1014      *
1015      * To see why, note first that the exhausted input obviously has nothing
1016      * left to affect the intersection.  If it was in its set at its end, that
1017      * means the set extends from here to the platform's infinity, and hence
1018      * anything in the non-exhausted's list will be in the intersection, and
1019      * anything not in it won't be.  Hence, the rest of the intersection is
1020      * precisely what's in the non-exhausted list  The exhausted set also
1021      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
1022      * it means 'count' is now at least 2.  This is consistent with the
1023      * incremented 'count' being >= 2 means to add the non-exhausted list to
1024      * the intersection.
1025      *
1026      * But if the exhausted input wasn't in its set, it contributed 0 to
1027      * 'count', and the intersection can't include anything further; the
1028      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
1029      * incremented.  This is consistent with 'count' being < 2 meaning nothing
1030      * further to add to the intersection. */
1031     if (count < 2) { /* Nothing left to put in the intersection. */
1032         len_r = i_r;
1033     }
1034     else { /* copy the non-exhausted list, unchanged. */
1035         IV copy_count = len_a - i_a;
1036         if (copy_count > 0) {   /* a is the one with stuff left */
1037             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
1038         }
1039         else {  /* b is the one with stuff left */
1040             copy_count = len_b - i_b;
1041             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
1042         }
1043         len_r = i_r + copy_count;
1044     }
1045
1046     /* Set the result to the final length, which can change the pointer to
1047      * array_r, so re-find it.  (Note that it is unlikely that this will
1048      * change, as we are shrinking the space, not enlarging it) */
1049     if (len_r != _invlist_len(r)) {
1050         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
1051         invlist_trim(r);
1052         array_r = invlist_array(r);
1053     }
1054
1055     if (*i == NULL) { /* Simply return the calculated intersection */
1056         *i = r;
1057     }
1058     else { /* Otherwise, replace the existing inversion list in '*i'.  We could
1059               instead free '*i', and then set it to 'r', but experience has
1060               shown [perl #127392] that if the input is a mortal, we can get a
1061               huge build-up of these during regex compilation before they get
1062               freed. */
1063         if (len_r) {
1064             invlist_replace_list_destroys_src(*i, r);
1065         }
1066         else {
1067             invlist_clear(*i);
1068         }
1069         SvREFCNT_dec_NN(r);
1070     }
1071
1072     return;
1073 }
1074
1075 SV*
1076 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
1077 {
1078     /* Add the range from 'start' to 'end' inclusive to the inversion list's
1079      * set.  A pointer to the inversion list is returned.  This may actually be
1080      * a new list, in which case the passed in one has been destroyed.  The
1081      * passed-in inversion list can be NULL, in which case a new one is created
1082      * with just the one range in it.  The new list is not necessarily
1083      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
1084      * result of this function.  The gain would not be large, and in many
1085      * cases, this is called multiple times on a single inversion list, so
1086      * anything freed may almost immediately be needed again.
1087      *
1088      * This used to mostly call the 'union' routine, but that is much more
1089      * heavyweight than really needed for a single range addition */
1090
1091     UV* array;              /* The array implementing the inversion list */
1092     UV len;                 /* How many elements in 'array' */
1093     SSize_t i_s;            /* index into the invlist array where 'start'
1094                                should go */
1095     SSize_t i_e = 0;        /* And the index where 'end' should go */
1096     UV cur_highest;         /* The highest code point in the inversion list
1097                                upon entry to this function */
1098
1099     /* This range becomes the whole inversion list if none already existed */
1100     if (invlist == NULL) {
1101         invlist = _new_invlist(2);
1102         _append_range_to_invlist(invlist, start, end);
1103         return invlist;
1104     }
1105
1106     /* Likewise, if the inversion list is currently empty */
1107     len = _invlist_len(invlist);
1108     if (len == 0) {
1109         _append_range_to_invlist(invlist, start, end);
1110         return invlist;
1111     }
1112
1113     /* Starting here, we have to know the internals of the list */
1114     array = invlist_array(invlist);
1115
1116     /* If the new range ends higher than the current highest ... */
1117     cur_highest = invlist_highest(invlist);
1118     if (end > cur_highest) {
1119
1120         /* If the whole range is higher, we can just append it */
1121         if (start > cur_highest) {
1122             _append_range_to_invlist(invlist, start, end);
1123             return invlist;
1124         }
1125
1126         /* Otherwise, add the portion that is higher ... */
1127         _append_range_to_invlist(invlist, cur_highest + 1, end);
1128
1129         /* ... and continue on below to handle the rest.  As a result of the
1130          * above append, we know that the index of the end of the range is the
1131          * final even numbered one of the array.  Recall that the final element
1132          * always starts a range that extends to infinity.  If that range is in
1133          * the set (meaning the set goes from here to infinity), it will be an
1134          * even index, but if it isn't in the set, it's odd, and the final
1135          * range in the set is one less, which is even. */
1136         if (end == UV_MAX) {
1137             i_e = len;
1138         }
1139         else {
1140             i_e = len - 2;
1141         }
1142     }
1143
1144     /* We have dealt with appending, now see about prepending.  If the new
1145      * range starts lower than the current lowest ... */
1146     if (start < array[0]) {
1147
1148         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
1149          * Let the union code handle it, rather than having to know the
1150          * trickiness in two code places.  */
1151         if (UNLIKELY(start == 0)) {
1152             SV* range_invlist;
1153
1154             range_invlist = _new_invlist(2);
1155             _append_range_to_invlist(range_invlist, start, end);
1156
1157             _invlist_union(invlist, range_invlist, &invlist);
1158
1159             SvREFCNT_dec_NN(range_invlist);
1160
1161             return invlist;
1162         }
1163
1164         /* If the whole new range comes before the first entry, and doesn't
1165          * extend it, we have to insert it as an additional range */
1166         if (end < array[0] - 1) {
1167             i_s = i_e = -1;
1168             goto splice_in_new_range;
1169         }
1170
1171         /* Here the new range adjoins the existing first range, extending it
1172          * downwards. */
1173         array[0] = start;
1174
1175         /* And continue on below to handle the rest.  We know that the index of
1176          * the beginning of the range is the first one of the array */
1177         i_s = 0;
1178     }
1179     else { /* Not prepending any part of the new range to the existing list.
1180             * Find where in the list it should go.  This finds i_s, such that:
1181             *     invlist[i_s] <= start < array[i_s+1]
1182             */
1183         i_s = _invlist_search(invlist, start);
1184     }
1185
1186     /* At this point, any extending before the beginning of the inversion list
1187      * and/or after the end has been done.  This has made it so that, in the
1188      * code below, each endpoint of the new range is either in a range that is
1189      * in the set, or is in a gap between two ranges that are.  This means we
1190      * don't have to worry about exceeding the array bounds.
1191      *
1192      * Find where in the list the new range ends (but we can skip this if we
1193      * have already determined what it is, or if it will be the same as i_s,
1194      * which we already have computed) */
1195     if (i_e == 0) {
1196         i_e = (start == end)
1197               ? i_s
1198               : _invlist_search(invlist, end);
1199     }
1200
1201     /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
1202      * is a range that goes to infinity there is no element at invlist[i_e+1],
1203      * so only the first relation holds. */
1204
1205     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
1206
1207         /* Here, the ranges on either side of the beginning of the new range
1208          * are in the set, and this range starts in the gap between them.
1209          *
1210          * The new range extends the range above it downwards if the new range
1211          * ends at or above that range's start */
1212         const bool extends_the_range_above = (   end == UV_MAX
1213                                               || end + 1 >= array[i_s+1]);
1214
1215         /* The new range extends the range below it upwards if it begins just
1216          * after where that range ends */
1217         if (start == array[i_s]) {
1218
1219             /* If the new range fills the entire gap between the other ranges,
1220              * they will get merged together.  Other ranges may also get
1221              * merged, depending on how many of them the new range spans.  In
1222              * the general case, we do the merge later, just once, after we
1223              * figure out how many to merge.  But in the case where the new
1224              * range exactly spans just this one gap (possibly extending into
1225              * the one above), we do the merge here, and an early exit.  This
1226              * is done here to avoid having to special case later. */
1227             if (i_e - i_s <= 1) {
1228
1229                 /* If i_e - i_s == 1, it means that the new range terminates
1230                  * within the range above, and hence 'extends_the_range_above'
1231                  * must be true.  (If the range above it extends to infinity,
1232                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
1233                  * will be 0, so no harm done.) */
1234                 if (extends_the_range_above) {
1235                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
1236                     invlist_set_len(invlist,
1237                                     len - 2,
1238                                     *(get_invlist_offset_addr(invlist)));
1239                     return invlist;
1240                 }
1241
1242                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
1243                  * to the same range, and below we are about to decrement i_s
1244                  * */
1245                 i_e--;
1246             }
1247
1248             /* Here, the new range is adjacent to the one below.  (It may also
1249              * span beyond the range above, but that will get resolved later.)
1250              * Extend the range below to include this one. */
1251             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
1252             i_s--;
1253             start = array[i_s];
1254         }
1255         else if (extends_the_range_above) {
1256
1257             /* Here the new range only extends the range above it, but not the
1258              * one below.  It merges with the one above.  Again, we keep i_e
1259              * and i_s in sync if they point to the same range */
1260             if (i_e == i_s) {
1261                 i_e++;
1262             }
1263             i_s++;
1264             array[i_s] = start;
1265         }
1266     }
1267
1268     /* Here, we've dealt with the new range start extending any adjoining
1269      * existing ranges.
1270      *
1271      * If the new range extends to infinity, it is now the final one,
1272      * regardless of what was there before */
1273     if (UNLIKELY(end == UV_MAX)) {
1274         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
1275         return invlist;
1276     }
1277
1278     /* If i_e started as == i_s, it has also been dealt with,
1279      * and been updated to the new i_s, which will fail the following if */
1280     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
1281
1282         /* Here, the ranges on either side of the end of the new range are in
1283          * the set, and this range ends in the gap between them.
1284          *
1285          * If this range is adjacent to (hence extends) the range above it, it
1286          * becomes part of that range; likewise if it extends the range below,
1287          * it becomes part of that range */
1288         if (end + 1 == array[i_e+1]) {
1289             i_e++;
1290             array[i_e] = start;
1291         }
1292         else if (start <= array[i_e]) {
1293             array[i_e] = end + 1;
1294             i_e--;
1295         }
1296     }
1297
1298     if (i_s == i_e) {
1299
1300         /* If the range fits entirely in an existing range (as possibly already
1301          * extended above), it doesn't add anything new */
1302         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
1303             return invlist;
1304         }
1305
1306         /* Here, no part of the range is in the list.  Must add it.  It will
1307          * occupy 2 more slots */
1308       splice_in_new_range:
1309
1310         invlist_extend(invlist, len + 2);
1311         array = invlist_array(invlist);
1312         /* Move the rest of the array down two slots. Don't include any
1313          * trailing NUL */
1314         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
1315
1316         /* Do the actual splice */
1317         array[i_e+1] = start;
1318         array[i_e+2] = end + 1;
1319         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
1320         return invlist;
1321     }
1322
1323     /* Here the new range crossed the boundaries of a pre-existing range.  The
1324      * code above has adjusted things so that both ends are in ranges that are
1325      * in the set.  This means everything in between must also be in the set.
1326      * Just squash things together */
1327     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
1328     invlist_set_len(invlist,
1329                     len - i_e + i_s,
1330                     *(get_invlist_offset_addr(invlist)));
1331
1332     return invlist;
1333 }
1334
1335 SV*
1336 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
1337                                  UV** other_elements_ptr)
1338 {
1339     /* Create and return an inversion list whose contents are to be populated
1340      * by the caller.  The caller gives the number of elements (in 'size') and
1341      * the very first element ('element0').  This function will set
1342      * '*other_elements_ptr' to an array of UVs, where the remaining elements
1343      * are to be placed.
1344      *
1345      * Obviously there is some trust involved that the caller will properly
1346      * fill in the other elements of the array.
1347      *
1348      * (The first element needs to be passed in, as the underlying code does
1349      * things differently depending on whether it is zero or non-zero) */
1350
1351     SV* invlist = _new_invlist(size);
1352     bool offset;
1353
1354     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
1355
1356     invlist = add_cp_to_invlist(invlist, element0);
1357     offset = *get_invlist_offset_addr(invlist);
1358
1359     invlist_set_len(invlist, size, offset);
1360     *other_elements_ptr = invlist_array(invlist) + 1;
1361     return invlist;
1362 }
1363
1364 #endif
1365
1366 #ifndef PERL_IN_XSUB_RE
1367 void
1368 Perl__invlist_invert(pTHX_ SV* const invlist)
1369 {
1370     /* Complement the input inversion list.  This adds a 0 if the list didn't
1371      * have a zero; removes it otherwise.  As described above, the data
1372      * structure is set up so that this is very efficient */
1373
1374     PERL_ARGS_ASSERT__INVLIST_INVERT;
1375
1376     assert(! invlist_is_iterating(invlist));
1377
1378     /* The inverse of matching nothing is matching everything */
1379     if (_invlist_len(invlist) == 0) {
1380         _append_range_to_invlist(invlist, 0, UV_MAX);
1381         return;
1382     }
1383
1384     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
1385 }
1386
1387 SV*
1388 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
1389 {
1390     /* Return a new inversion list that is a copy of the input one, which is
1391      * unchanged.  The new list will not be mortal even if the old one was. */
1392
1393     const STRLEN nominal_length = _invlist_len(invlist);
1394     const STRLEN physical_length = SvCUR(invlist);
1395     const bool offset = *(get_invlist_offset_addr(invlist));
1396
1397     PERL_ARGS_ASSERT_INVLIST_CLONE;
1398
1399     if (new_invlist == NULL) {
1400         new_invlist = _new_invlist(nominal_length);
1401     }
1402     else {
1403         sv_upgrade(new_invlist, SVt_INVLIST);
1404         initialize_invlist_guts(new_invlist, nominal_length);
1405     }
1406
1407     *(get_invlist_offset_addr(new_invlist)) = offset;
1408     invlist_set_len(new_invlist, nominal_length, offset);
1409     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
1410
1411     return new_invlist;
1412 }
1413
1414 #endif
1415
1416
1417 #ifndef PERL_IN_XSUB_RE
1418 void
1419 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
1420                          const char * const indent, SV* const invlist)
1421 {
1422     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
1423      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
1424      * the string 'indent'.  The output looks like this:
1425          [0] 0x000A .. 0x000D
1426          [2] 0x0085
1427          [4] 0x2028 .. 0x2029
1428          [6] 0x3104 .. INFTY
1429      * This means that the first range of code points matched by the list are
1430      * 0xA through 0xD; the second range contains only the single code point
1431      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
1432      * are used to define each range (except if the final range extends to
1433      * infinity, only a single element is needed).  The array index of the
1434      * first element for the corresponding range is given in brackets. */
1435
1436     UV start, end;
1437     STRLEN count = 0;
1438
1439     PERL_ARGS_ASSERT__INVLIST_DUMP;
1440
1441     if (invlist_is_iterating(invlist)) {
1442         Perl_dump_indent(aTHX_ level, file,
1443              "%sCan't dump inversion list because is in middle of iterating\n",
1444              indent);
1445         return;
1446     }
1447
1448     invlist_iterinit(invlist);
1449     while (invlist_iternext(invlist, &start, &end)) {
1450         if (end == UV_MAX) {
1451             Perl_dump_indent(aTHX_ level, file,
1452                                        "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
1453                                    indent, (UV)count, start);
1454         }
1455         else if (end != start) {
1456             Perl_dump_indent(aTHX_ level, file,
1457                                     "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
1458                                 indent, (UV)count, start,         end);
1459         }
1460         else {
1461             Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
1462                                             indent, (UV)count, start);
1463         }
1464         count += 2;
1465     }
1466 }
1467
1468 #endif
1469
1470 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
1471 bool
1472 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
1473 {
1474     /* Return a boolean as to if the two passed in inversion lists are
1475      * identical.  The final argument, if TRUE, says to take the complement of
1476      * the second inversion list before doing the comparison */
1477
1478     const UV len_a = _invlist_len(a);
1479     UV len_b = _invlist_len(b);
1480
1481     const UV* array_a = NULL;
1482     const UV* array_b = NULL;
1483
1484     PERL_ARGS_ASSERT__INVLISTEQ;
1485
1486     /* This code avoids accessing the arrays unless it knows the length is
1487      * non-zero */
1488
1489     if (len_a == 0) {
1490         if (len_b == 0) {
1491             return ! complement_b;
1492         }
1493     }
1494     else {
1495         array_a = invlist_array(a);
1496     }
1497
1498     if (len_b != 0) {
1499         array_b = invlist_array(b);
1500     }
1501
1502     /* If are to compare 'a' with the complement of b, set it
1503      * up so are looking at b's complement. */
1504     if (complement_b) {
1505
1506         /* The complement of nothing is everything, so <a> would have to have
1507          * just one element, starting at zero (ending at infinity) */
1508         if (len_b == 0) {
1509             return (len_a == 1 && array_a[0] == 0);
1510         }
1511         if (array_b[0] == 0) {
1512
1513             /* Otherwise, to complement, we invert.  Here, the first element is
1514              * 0, just remove it.  To do this, we just pretend the array starts
1515              * one later */
1516
1517             array_b++;
1518             len_b--;
1519         }
1520         else {
1521
1522             /* But if the first element is not zero, we pretend the list starts
1523              * at the 0 that is always stored immediately before the array. */
1524             array_b--;
1525             len_b++;
1526         }
1527     }
1528
1529     return    len_a == len_b
1530            && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
1531
1532 }
1533 #endif
1534
1535 #undef HEADER_LENGTH
1536 #undef TO_INTERNAL_SIZE
1537 #undef FROM_INTERNAL_SIZE
1538 #undef INVLIST_VERSION_ID
1539
1540 /* End of inversion list object */