- /* Certain characters are problematic because their folded
- * length is so different from their original length that it
- * isn't handleable by the optimizer. They are therefore not
- * placed in an EXACTish node; and are here handled specially.
- * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
- * putting it in a special node keeps regexec from having to
- * deal with a non-utf8 multi-char fold */
- if (FOLD
- && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC)))
- {
- /* We look for either side of the fold. For example \xDF
- * folds to 'ss'. We look for both the single character
- * \xDF and the sequence 'ss'. When we find something that
- * could be one of those, we stop and flush whatever we
- * have output so far into the EXACTish node that was being
- * built. Then restore the input pointer to what it was.
- * regatom will return that EXACT node, and will be called
- * again, positioned so the first character is the one in
- * question, which we return in a different node type.
- * The multi-char folds are a sequence, so the occurrence
- * of the first character in that sequence doesn't
- * necessarily mean that what follows is the rest of the
- * sequence. We keep track of that with a state machine,
- * with the state being set to the latest character
- * processed before the current one. Most characters will
- * set the state to 0, but if one occurs that is part of a
- * potential tricky fold sequence, the state is set to that
- * character, and the next loop iteration sees if the state
- * should progress towards the final folded-from character,
- * or if it was a false alarm. If it turns out to be a
- * false alarm, the character(s) will be output in a new
- * EXACTish node, and join_exact() will later combine them.
- * In the case of the 'ss' sequence, which is more common
- * and more easily checked, some look-ahead is done to
- * save time by ruling-out some false alarms */
- switch (ender) {
- default:
- latest_char_state = generic_char;
- break;
- case 's':
- case 'S':
- case 0x17F: /* LATIN SMALL LETTER LONG S */
- if (AT_LEAST_UNI_SEMANTICS) {
- if (latest_char_state == char_s) { /* 'ss' */
- ender = LATIN_SMALL_LETTER_SHARP_S;
- goto do_tricky;
- }
- else if (p < RExC_end) {
-
- /* Look-ahead at the next character. If it
- * is also an s, we handle as a sharp s
- * tricky regnode. */
- if (*p == 's' || *p == 'S') {
-
- /* But first flush anything in the
- * EXACTish buffer */
- if (len != 0) {
- p = oldp;
- goto loopdone;
- }
- p++; /* Account for swallowing this
- 's' up */
- ender = LATIN_SMALL_LETTER_SHARP_S;
- goto do_tricky;
- }
- /* Here, the next character is not a
- * literal 's', but still could
- * evaluate to one if part of a \o{},
- * \x or \OCTAL-DIGIT. The minimum
- * length required for that is 4, eg
- * \x53 or \123 */
- else if (*p == '\\'
- && p < RExC_end - 4
- && (isDIGIT(*(p + 1))
- || *(p + 1) == 'x'
- || *(p + 1) == 'o' ))
- {
-
- /* Here, it could be an 's', too much
- * bother to figure it out here. Flush
- * the buffer if any; when come back
- * here, set the state so know that the
- * previous char was an 's' */
- if (len != 0) {
- latest_char_state = generic_char;
- p = oldp;
- goto loopdone;
- }
- latest_char_state = char_s;
- break;
- }
- }
- }
-
- /* Here, can't be an 'ss' sequence, or at least not
- * one that could fold to/from the sharp ss */
- latest_char_state = generic_char;
- break;
- case 0x03C5: /* First char in upsilon series */
- case 0x03A5: /* Also capital UPSILON, which folds to
- 03C5, and hence exhibits the same
- problem */
- if (p < RExC_end - 4) { /* Need >= 4 bytes left */
- latest_char_state = upsilon_1;
- if (len != 0) {
- p = oldp;
- goto loopdone;
- }
- }
- else {
- latest_char_state = generic_char;
- }
- break;
- case 0x03B9: /* First char in iota series */
- case 0x0399: /* Also capital IOTA */
- case 0x1FBE: /* GREEK PROSGEGRAMMENI folds to 3B9 */
- case 0x0345: /* COMBINING GREEK YPOGEGRAMMENI folds
- to 3B9 */
- if (p < RExC_end - 4) {
- latest_char_state = iota_1;
- if (len != 0) {
- p = oldp;
- goto loopdone;
- }
- }
- else {
- latest_char_state = generic_char;
- }
- break;
- case 0x0308:
- if (latest_char_state == upsilon_1) {
- latest_char_state = upsilon_2;
- }
- else if (latest_char_state == iota_1) {
- latest_char_state = iota_2;
- }
- else {
- latest_char_state = generic_char;
- }
- break;
- case 0x301:
- if (latest_char_state == upsilon_2) {
- ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS;
- goto do_tricky;
- }
- else if (latest_char_state == iota_2) {
- ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS;
- goto do_tricky;
- }
- latest_char_state = generic_char;
- break;
-
- /* These are the tricky fold characters. Flush any
- * buffer first. (When adding to this list, also should
- * add them to fold_grind.t to make sure get tested) */
- case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
- case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS:
- case LATIN_SMALL_LETTER_SHARP_S:
- case LATIN_CAPITAL_LETTER_SHARP_S:
- case 0x1FD3: /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA */
- case 0x1FE3: /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA */
- if (len != 0) {
- p = oldp;
- goto loopdone;
- }
- /* FALL THROUGH */
- do_tricky: {
- char* const oldregxend = RExC_end;
- U8 tmpbuf[UTF8_MAXBYTES+1];
-
- /* Here, we know we need to generate a special
- * regnode, and 'ender' contains the tricky
- * character. What's done is to pretend it's in a
- * [bracketed] class, and let the code that deals
- * with those handle it, as that code has all the
- * intelligence necessary. First save the current
- * parse state, get rid of the already allocated
- * but empty EXACT node that the ANYOFV node will
- * replace, and point the parse to a buffer which
- * we fill with the character we want the regclass
- * code to think is being parsed */
- RExC_emit = orig_emit;
- RExC_parse = (char *) tmpbuf;
- if (UTF) {
- U8 *d = uvchr_to_utf8(tmpbuf, ender);
- *d = '\0';
- RExC_end = (char *) d;
- }
- else { /* ender above 255 already excluded */
- tmpbuf[0] = (U8) ender;
- tmpbuf[1] = '\0';
- RExC_end = RExC_parse + 1;
- }
-
- ret = regclass(pRExC_state,depth+1);
-
- /* Here, have parsed the buffer. Reset the parse to
- * the actual input, and return */
- RExC_end = oldregxend;
- RExC_parse = p - 1;
-
- Set_Node_Offset(ret, RExC_parse);
- Set_Node_Cur_Length(ret);
- nextchar(pRExC_state);
- *flagp |= HASWIDTH|SIMPLE;
- return ret;
- }
- }
- }
-