+/* Read a single line of the main body of the swash input text. These are of
+ * the form:
+ * 0053 0056 0073
+ * where each number is hex. The first two numbers form the minimum and
+ * maximum of a range, and the third is the value associated with the range.
+ * Not all swashes should have a third number
+ *
+ * On input: l points to the beginning of the line to be examined; it points
+ * to somewhere in the string of the whole input text, and is
+ * terminated by a \n or the null string terminator.
+ * lend points to the null terminator of that string
+ * wants_value is non-zero if the swash expects a third number
+ * typestr is the name of the swash's mapping, like 'ToLower'
+ * On output: *min, *max, and *val are set to the values read from the line.
+ * returns a pointer just beyond the line examined. If there was no
+ * valid min number on the line, returns lend+1
+ */
+
+STATIC U8*
+S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
+ const bool wants_value, const U8* const typestr)
+{
+ const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
+ STRLEN numlen; /* Length of the number */
+ I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
+
+ /* nl points to the next \n in the scan */
+ U8* const nl = (U8*)memchr(l, '\n', lend - l);
+
+ /* Get the first number on the line: the range minimum */
+ numlen = lend - l;
+ *min = grok_hex((char *)l, &numlen, &flags, NULL);
+ if (numlen) /* If found a hex number, position past it */
+ l += numlen;
+ else if (nl) { /* Else, go handle next line, if any */
+ return nl + 1; /* 1 is length of "\n" */
+ }
+ else { /* Else, no next line */
+ return lend + 1; /* to LIST's end at which \n is not found */
+ }
+
+ /* The max range value follows, separated by a BLANK */
+ if (isBLANK(*l)) {
+ ++l;
+ flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
+ numlen = lend - l;
+ *max = grok_hex((char *)l, &numlen, &flags, NULL);
+ if (numlen)
+ l += numlen;
+ else /* If no value here, it is a single element range */
+ *max = *min;
+
+ /* Non-binary tables have a third entry: what the first element of the
+ * range maps to */
+ if (wants_value) {
+ if (isBLANK(*l)) {
+ ++l;
+ flags = PERL_SCAN_SILENT_ILLDIGIT |
+ PERL_SCAN_DISALLOW_PREFIX;
+ numlen = lend - l;
+ *val = grok_hex((char *)l, &numlen, &flags, NULL);
+ if (numlen)
+ l += numlen;
+ else
+ *val = 0;
+ }
+ else {
+ *val = 0;
+ if (typeto) {
+ Perl_croak(aTHX_ "%s: illegal mapping '%s'",
+ typestr, l);
+ }
+ }
+ }
+ else
+ *val = 0; /* bits == 1, then any val should be ignored */
+ }
+ else { /* Nothing following range min, should be single element with no
+ mapping expected */
+ *max = *min;
+ if (wants_value) {
+ *val = 0;
+ if (typeto) {
+ Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
+ }
+ }
+ else
+ *val = 0; /* bits == 1, then val should be ignored */
+ }
+
+ /* Position to next line if any, or EOF */
+ if (nl)
+ l = nl + 1;
+ else
+ l = lend;
+
+ return l;
+}
+
+/* Note:
+ * Returns a swatch (a bit vector string) for a code point sequence
+ * that starts from the value C<start> and comprises the number C<span>.
+ * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
+ * Should be used via swash_fetch, which will cache the swatch in C<swash>.
+ */
+STATIC SV*
+S_swash_get(pTHX_ SV* swash, UV start, UV span)
+{
+ SV *swatch;
+ U8 *l, *lend, *x, *xend, *s;
+ STRLEN lcur, xcur, scur;
+ HV *const hv = MUTABLE_HV(SvRV(swash));
+
+ /* The string containing the main body of the table */
+ SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
+
+ SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
+ SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+ SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
+ SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
+ const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
+ const STRLEN bits = SvUV(*bitssvp);
+ const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
+ const UV none = SvUV(*nonesvp);
+ const UV end = start + span;
+
+ PERL_ARGS_ASSERT_SWASH_GET;
+
+ if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
+ Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
+ (UV)bits);
+ }
+
+ /* create and initialize $swatch */
+ scur = octets ? (span * octets) : (span + 7) / 8;
+ swatch = newSV(scur);
+ SvPOK_on(swatch);
+ s = (U8*)SvPVX(swatch);
+ if (octets && none) {
+ const U8* const e = s + scur;
+ while (s < e) {
+ if (bits == 8)
+ *s++ = (U8)(none & 0xff);
+ else if (bits == 16) {
+ *s++ = (U8)((none >> 8) & 0xff);
+ *s++ = (U8)( none & 0xff);
+ }
+ else if (bits == 32) {
+ *s++ = (U8)((none >> 24) & 0xff);
+ *s++ = (U8)((none >> 16) & 0xff);
+ *s++ = (U8)((none >> 8) & 0xff);
+ *s++ = (U8)( none & 0xff);
+ }
+ }
+ *s = '\0';
+ }
+ else {
+ (void)memzero((U8*)s, scur + 1);
+ }
+ SvCUR_set(swatch, scur);
+ s = (U8*)SvPVX(swatch);
+
+ /* read $swash->{LIST} */
+ l = (U8*)SvPV(*listsvp, lcur);
+ lend = l + lcur;
+ while (l < lend) {
+ UV min, max, val;
+ l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
+ cBOOL(octets), typestr);
+ if (l > lend) {
+ break;
+ }
+
+ /* If looking for something beyond this range, go try the next one */
+ if (max < start)
+ continue;
+
+ if (octets) {
+ UV key;
+ if (min < start) {
+ if (!none || val < none) {
+ val += start - min;
+ }
+ min = start;
+ }
+ for (key = min; key <= max; key++) {
+ STRLEN offset;
+ if (key >= end)
+ goto go_out_list;
+ /* offset must be non-negative (start <= min <= key < end) */
+ offset = octets * (key - start);
+ if (bits == 8)
+ s[offset] = (U8)(val & 0xff);
+ else if (bits == 16) {
+ s[offset ] = (U8)((val >> 8) & 0xff);
+ s[offset + 1] = (U8)( val & 0xff);
+ }
+ else if (bits == 32) {
+ s[offset ] = (U8)((val >> 24) & 0xff);
+ s[offset + 1] = (U8)((val >> 16) & 0xff);
+ s[offset + 2] = (U8)((val >> 8) & 0xff);
+ s[offset + 3] = (U8)( val & 0xff);
+ }
+
+ if (!none || val < none)
+ ++val;
+ }
+ }
+ else { /* bits == 1, then val should be ignored */
+ UV key;
+ if (min < start)
+ min = start;
+ for (key = min; key <= max; key++) {
+ const STRLEN offset = (STRLEN)(key - start);
+ if (key >= end)
+ goto go_out_list;
+ s[offset >> 3] |= 1 << (offset & 7);
+ }
+ }
+ } /* while */
+ go_out_list:
+
+ /* read $swash->{EXTRAS} */
+ x = (U8*)SvPV(*extssvp, xcur);
+ xend = x + xcur;
+ while (x < xend) {
+ STRLEN namelen;
+ U8 *namestr;
+ SV** othersvp;
+ HV* otherhv;
+ STRLEN otherbits;
+ SV **otherbitssvp, *other;
+ U8 *s, *o, *nl;
+ STRLEN slen, olen;
+
+ const U8 opc = *x++;
+ if (opc == '\n')
+ continue;
+
+ nl = (U8*)memchr(x, '\n', xend - x);
+
+ if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
+ if (nl) {
+ x = nl + 1; /* 1 is length of "\n" */
+ continue;
+ }
+ else {
+ x = xend; /* to EXTRAS' end at which \n is not found */
+ break;
+ }
+ }
+
+ namestr = x;
+ if (nl) {
+ namelen = nl - namestr;
+ x = nl + 1;
+ }
+ else {
+ namelen = xend - namestr;
+ x = xend;
+ }
+
+ othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
+ otherhv = MUTABLE_HV(SvRV(*othersvp));
+ otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
+ otherbits = (STRLEN)SvUV(*otherbitssvp);
+ if (bits < otherbits)
+ Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
+
+ /* The "other" swatch must be destroyed after. */
+ other = swash_get(*othersvp, start, span);
+ o = (U8*)SvPV(other, olen);
+
+ if (!olen)
+ Perl_croak(aTHX_ "panic: swash_get got improper swatch");
+
+ s = (U8*)SvPV(swatch, slen);
+ if (bits == 1 && otherbits == 1) {
+ if (slen != olen)
+ Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
+
+ switch (opc) {
+ case '+':
+ while (slen--)
+ *s++ |= *o++;
+ break;
+ case '!':
+ while (slen--)
+ *s++ |= ~*o++;
+ break;
+ case '-':
+ while (slen--)
+ *s++ &= ~*o++;
+ break;
+ case '&':
+ while (slen--)
+ *s++ &= *o++;
+ break;
+ default:
+ break;
+ }
+ }
+ else {
+ STRLEN otheroctets = otherbits >> 3;
+ STRLEN offset = 0;
+ U8* const send = s + slen;
+
+ while (s < send) {
+ UV otherval = 0;
+
+ if (otherbits == 1) {
+ otherval = (o[offset >> 3] >> (offset & 7)) & 1;
+ ++offset;
+ }
+ else {
+ STRLEN vlen = otheroctets;
+ otherval = *o++;
+ while (--vlen) {
+ otherval <<= 8;
+ otherval |= *o++;
+ }
+ }
+
+ if (opc == '+' && otherval)
+ NOOP; /* replace with otherval */
+ else if (opc == '!' && !otherval)
+ otherval = 1;
+ else if (opc == '-' && otherval)
+ otherval = 0;
+ else if (opc == '&' && !otherval)
+ otherval = 0;
+ else {
+ s += octets; /* no replacement */
+ continue;
+ }
+
+ if (bits == 8)
+ *s++ = (U8)( otherval & 0xff);
+ else if (bits == 16) {
+ *s++ = (U8)((otherval >> 8) & 0xff);
+ *s++ = (U8)( otherval & 0xff);
+ }
+ else if (bits == 32) {
+ *s++ = (U8)((otherval >> 24) & 0xff);
+ *s++ = (U8)((otherval >> 16) & 0xff);
+ *s++ = (U8)((otherval >> 8) & 0xff);
+ *s++ = (U8)( otherval & 0xff);
+ }
+ }
+ }
+ sv_free(other); /* through with it! */
+ } /* while */
+ return swatch;
+}
+
+HV*
+Perl__swash_inversion_hash(pTHX_ SV* swash)
+{
+
+ /* Subject to change or removal. For use only in one place in regexec.c
+ *
+ * Returns a hash which is the inversion and closure of a swash mapping.
+ * For example, consider the input lines:
+ * 004B 006B
+ * 004C 006C
+ * 212A 006B
+ *
+ * The returned hash would have two keys, the utf8 for 006B and the utf8 for
+ * 006C. The value for each key is an array. For 006C, the array would
+ * have a two elements, the utf8 for itself, and for 004C. For 006B, there
+ * would be three elements in its array, the utf8 for 006B, 004B and 212A.
+ *
+ * Essentially, for any code point, it gives all the code points that map to
+ * it, or the list of 'froms' for that point.
+ *
+ * Currently it only looks at the main body of the swash, and ignores any
+ * additions or deletions from other swashes */
+
+ U8 *l, *lend;
+ STRLEN lcur;
+ HV *const hv = MUTABLE_HV(SvRV(swash));
+
+ /* The string containing the main body of the table */
+ SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
+
+ SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
+ SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+ SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
+ /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
+ const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
+ const STRLEN bits = SvUV(*bitssvp);
+ const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
+ const UV none = SvUV(*nonesvp);
+
+ HV* ret = newHV();
+
+ PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
+
+ /* Must have at least 8 bits to get the mappings */
+ if (bits != 8 && bits != 16 && bits != 32) {
+ Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
+ (UV)bits);
+ }
+
+ /* read $swash->{LIST} */
+ l = (U8*)SvPV(*listsvp, lcur);
+ lend = l + lcur;
+
+ /* Go through each input line */
+ while (l < lend) {
+ UV min, max, val;
+ UV inverse;
+ l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
+ cBOOL(octets), typestr);
+ if (l > lend) {
+ break;
+ }
+
+ /* Each element in the range is to be inverted */
+ for (inverse = min; inverse <= max; inverse++) {
+ AV* list;
+ SV* element;
+ SV** listp;
+ IV i;
+ bool found_key = FALSE;
+
+ /* The key is the inverse mapping */
+ char key[UTF8_MAXBYTES+1];
+ char* key_end = (char *) uvuni_to_utf8((U8*) key, val);
+ STRLEN key_len = key_end - key;
+
+ /* And the value is what the forward mapping is from. */
+ char utf8_inverse[UTF8_MAXBYTES+1];
+ char *utf8_inverse_end = (char *) uvuni_to_utf8((U8*) utf8_inverse, inverse);
+
+ /* Get the list for the map */
+ if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
+ list = (AV*) *listp;
+ }
+ else { /* No entry yet for it: create one */
+ list = newAV();
+ if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
+ Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
+ }
+ }
+
+ for (i = 0; i < av_len(list); i++) {
+ SV** entryp = av_fetch(list, i, FALSE);
+ SV* entry;
+ if (entryp == NULL) {
+ Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
+ }
+ entry = *entryp;
+ if (SvCUR(entry) != key_len) {
+ continue;
+ }
+ if (memEQ(key, SvPVX(entry), key_len)) {
+ found_key = TRUE;
+ break;
+ }
+ }
+ if (! found_key) {
+ element = newSVpvn_flags(key, key_len, SVf_UTF8);
+ av_push(list, element);
+ }
+
+
+ /* Simply add the value to the list */
+ element = newSVpvn_flags(utf8_inverse, utf8_inverse_end - utf8_inverse, SVf_UTF8);
+ av_push(list, element);
+
+ /* swash_get() increments the value of val for each element in the
+ * range. That makes more compact tables possible. You can
+ * express the capitalization, for example, of all consecutive
+ * letters with a single line: 0061\t007A\t0041 This maps 0061 to
+ * 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
+ * and it's not documented, and perhaps not even currently used,
+ * but I copied the semantics from swash_get(), just in case */
+ if (!none || val < none) {
+ ++val;
+ }
+ }
+ }
+
+ return ret;
+}