255, which means that the union with cl should just be
what cl has in it, so can ignore this flag
ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
- is 127-255 to match them, but then invert that, so the
- union with cl should just be what cl has in it, so can
+ is (ASCII) 127-255 to match them, but then invert that, so
+ the union with cl should just be what cl has in it, so can
ignore this flag
*/
} else { /* 'or_with' is not inverted */
8: EXACT <baz>(10)
10: END(0)
- d = uvuni_to_utf8_flags(d, uv, 0);
+ d = uvchr_to_utf8_flags(d, uv, 0);
is the recommended Unicode-aware way of saying
if (UTF) { \
SV *zlopp = newSV(7); /* XXX: optimize me */ \
unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
- unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
+ unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
SvCUR_set(zlopp, kapow - flrbbbbb); \
SvPOK_on(zlopp); \
SvUTF8_on(zlopp); \
} \
} STMT_END
-#define TRIE_READ_CHAR STMT_START { \
- wordlen++; \
- if ( UTF ) { \
- /* if it is UTF then it is either already folded, or does not need folding */ \
- uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
- } \
- else if (folder == PL_fold_latin1) { \
- /* if we use this folder we have to obey unicode rules on latin-1 data */ \
- if ( foldlen > 0 ) { \
- uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
- foldlen -= len; \
- scan += len; \
- len = 0; \
- } else { \
- len = 1; \
- uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \
- skiplen = UNISKIP(uvc); \
- foldlen -= skiplen; \
- scan = foldbuf + skiplen; \
- } \
- } else { \
- /* raw data, will be folded later if needed */ \
- uvc = (U32)*uc; \
- len = 1; \
- } \
+/* This gets the next character from the input, folding it if not already
+ * folded. */
+#define TRIE_READ_CHAR STMT_START { \
+ wordlen++; \
+ if ( UTF ) { \
+ /* if it is UTF then it is either already folded, or does not need \
+ * folding */ \
+ uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
+ } \
+ else if (folder == PL_fold_latin1) { \
+ /* This folder implies Unicode rules, which in the range expressible \
+ * by not UTF is the lower case, with the two exceptions, one of \
+ * which should have been taken care of before calling this */ \
+ assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
+ uvc = toLOWER_L1(*uc); \
+ if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
+ len = 1; \
+ } else { \
+ /* raw data, will be folded later if needed */ \
+ uvc = (U32)*uc; \
+ len = 1; \
+ } \
} STMT_END
HV *widecharmap = NULL;
AV *revcharmap = newAV();
regnode *cur;
- const U32 uniflags = UTF8_ALLOW_DEFAULT;
STRLEN len = 0;
UV uvc = 0;
U16 curword = 0;
case EXACT: break;
case EXACTFA:
case EXACTFU_SS:
- case EXACTFU_TRICKYFOLD:
case EXACTFU: folder = PL_fold_latin1; break;
case EXACTF: folder = PL_fold; break;
case EXACTFL: folder = PL_fold_locale; break;
const U8 *uc = (U8*)STRING( noper );
const U8 *e = uc + STR_LEN( noper );
STRLEN foldlen = 0;
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
- STRLEN skiplen = 0;
- const U8 *scan = (U8*)NULL;
U32 wordlen = 0; /* required init */
- STRLEN chars = 0;
+ STRLEN minbytes = 0;
+ STRLEN maxbytes = 0;
bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
if (OP(noper) == NOTHING) {
regardless of encoding */
if (OP( noper ) == EXACTFU_SS) {
/* false positives are ok, so just set this */
- TRIE_BITMAP_SET(trie,0xDF);
+ TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
}
}
for ( ; uc < e ; uc += len ) {
TRIE_CHARCOUNT(trie)++;
TRIE_READ_CHAR;
- chars++;
+
+ /* Accumulate to the current values, the range in the number of
+ * bytes that this character could match. The max is presumed to
+ * be the same as the folded input (which TRIE_READ_CHAR returns),
+ * except that when this is not in UTF-8, it could be matched
+ * against a string which is UTF-8, and the variant characters
+ * could be 2 bytes instead of the 1 here. Likewise, for the
+ * minimum number of bytes when not folded. When folding, the min
+ * is assumed to be 1, since 1 byte could fold to match the single
+ * character here, or in the case of a multi-char fold, 1 byte can fold to
+ * the whole sequence. 'foldlen' is used to denote whether we are
+ * in such a sequence, skipping the min setting if so. XXX TODO
+ * Use the exact list of what folds to each character, from
+ * PL_utf8_foldclosures */
+ if (UTF) {
+ maxbytes += UTF8SKIP(uc);
+ if (! folder) {
+ /* A non-UTF-8 string could be 1 byte to match our 2 */
+ minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
+ ? 1
+ : UTF8SKIP(uc);
+ }
+ else {
+ if (foldlen) {
+ foldlen -= UTF8SKIP(uc);
+ }
+ else {
+ foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
+ minbytes++;
+ }
+ }
+ }
+ else {
+ maxbytes += (UNI_IS_INVARIANT(*uc))
+ ? 1
+ : 2;
+ if (! folder) {
+ minbytes++;
+ }
+ else {
+ if (foldlen) {
+ foldlen--;
+ }
+ else {
+ foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
+ minbytes++;
+ }
+ }
+ }
if ( uvc < 256 ) {
if ( folder ) {
U8 folded= folder[ (U8) uvc ];
if ( !UTF ) {
/* store first byte of utf8 representation of
variant codepoints */
- if (! UNI_IS_INVARIANT(uvc)) {
+ if (! NATIVE_IS_INVARIANT(uvc)) {
TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
}
}
}
}
if( cur == first ) {
- trie->minlen = chars;
- trie->maxlen = chars;
- } else if (chars < trie->minlen) {
- trie->minlen = chars;
- } else if (chars > trie->maxlen) {
- trie->maxlen = chars;
- }
- if (OP( noper ) == EXACTFU_SS) {
- /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
- if (trie->minlen > 1)
- trie->minlen= 1;
+ trie->minlen = minbytes;
+ trie->maxlen = maxbytes;
+ } else if (minbytes < trie->minlen) {
+ trie->minlen = minbytes;
+ } else if (maxbytes > trie->maxlen) {
+ trie->maxlen = maxbytes;
}
- if (OP( noper ) == EXACTFU_TRICKYFOLD) {
- /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
- * - We assume that any such sequence might match a 2 byte string */
- if (trie->minlen > 2 )
- trie->minlen= 2;
- }
-
} /* end first pass */
DEBUG_TRIE_COMPILE_r(
PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
const U8 *e = uc + STR_LEN( noper );
U32 state = 1; /* required init */
U16 charid = 0; /* sanity init */
- U8 *scan = (U8*)NULL; /* sanity init */
- STRLEN foldlen = 0; /* required init */
U32 wordlen = 0; /* required init */
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
- STRLEN skiplen = 0;
if (OP(noper) == NOTHING) {
regnode *noper_next= regnext(noper);
U16 charid = 0; /* sanity init */
U32 accept_state = 0; /* sanity init */
- U8 *scan = (U8*)NULL; /* sanity init */
- STRLEN foldlen = 0; /* required init */
U32 wordlen = 0; /* required init */
- STRLEN skiplen = 0;
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
if (OP(noper) == NOTHING) {
regnode *noper_next= regnext(noper);
* that is "sss".
*
* It turns out that there are problems with all multi-character folds, and not
- * just these three. Now the code is general, for all such cases, but the
- * three still have some special handling. The approach taken is:
+ * just these three. Now the code is general, for all such cases. The
+ * approach taken is:
* 1) This routine examines each EXACTFish node that could contain multi-
* character fold sequences. It returns in *min_subtract how much to
* subtract from the actual length of the string to get a real minimum
* used by the caller to adjust the min length of the match, and the delta
* between min and max, so that the optimizer doesn't reject these
* possibilities based on size constraints.
- * 2) Certain of these sequences require special handling by the trie code,
- * so, if found, this code changes the joined node type to special ops:
- * EXACTFU_TRICKYFOLD and EXACTFU_SS.
- * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
+ * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
* is used for an EXACTFU node that contains at least one "ss" sequence in
* it. For non-UTF-8 patterns and strings, this is the only case where
* there is a possible fold length change. That means that a regular
* this file makes sure that in EXACTFU nodes, the sharp s gets folded to
* 'ss', even if the pattern isn't UTF-8. This avoids the issues
* described in the next item.
- * 4) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
+ * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
* pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
* UTF-8 pattern.) An assumption that the optimizer part of regexec.c
* (probably unwittingly, in Perl_regexec_flags()) makes is that a
* but in a non-UTF8 pattern, folding it to that above-Latin1 string would
* require the pattern to be forced into UTF-8, the overhead of which we
* want to avoid.)
- */
+ *
+ * Similarly, the code that generates tries doesn't currently handle
+ * not-already-folded multi-char folds, and it looks like a pain to change
+ * that. Therefore, trie generation of EXACTFA nodes with the sharp s
+ * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
+ * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
+ * using /iaa matching will be doing so almost entirely with ASCII
+ * strings, so this should rarely be encountered in practice */
#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
if (PL_regkind[OP(scan)] == EXACT) \
}
/* Nodes with 'ss' require special handling, except for EXACTFL
- * and EXACTFA for which there is no multi-char fold to this */
+ * and EXACTFA-ish for which there is no multi-char fold to
+ * this */
if (len == 2 && *s == 's' && *(s+1) == 's'
- && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
+ && OP(scan) != EXACTFL
+ && OP(scan) != EXACTFA
+ && OP(scan) != EXACTFA_NO_TRIE)
{
count = 2;
OP(scan) = EXACTFU_SS;
s += 2;
}
- else if (len == 6 /* len is the same in both ASCII and EBCDIC
- for these */
- && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
- COMBINING_DIAERESIS_UTF8
- COMBINING_ACUTE_ACCENT_UTF8,
- 6)
- || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
- COMBINING_DIAERESIS_UTF8
- COMBINING_ACUTE_ACCENT_UTF8,
- 6)))
- {
- count = 3;
-
- /* These two folds require special handling by trie's, so
- * change the node type to indicate this. If EXACTFA and
- * EXACTFL were ever to be handled by trie's, this would
- * have to be changed. If this node has already been
- * changed to EXACTFU_SS in this loop, leave it as is. (I
- * (khw) think it doesn't matter in regexec.c for UTF
- * patterns, but no need to change it */
- if (OP(scan) == EXACTFU) {
- OP(scan) = EXACTFU_TRICKYFOLD;
- }
- s += 6;
- }
else { /* Here is a generic multi-char fold. */
const U8* multi_end = s + len;
* test for them. The code that generates the
* is_MULTI_foo() macros croaks should one actually get put
* into Unicode.) */
- if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
+ if (OP(scan) != EXACTFL
+ && OP(scan) != EXACTFA
+ && OP(scan) != EXACTFA_NO_TRIE)
+ {
count = utf8_length(s, multi_end);
s = multi_end;
}
/* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
* fold to the ASCII range (and there are no existing ones in the
* upper latin1 range). But, as outlined in the comments preceding
- * this function, we need to flag any occurrences of the sharp s */
+ * this function, we need to flag any occurrences of the sharp s.
+ * This character forbids trie formation (because of added
+ * complexity) */
while (s < s_end) {
if (*s == LATIN_SMALL_LETTER_SHARP_S) {
+ OP(scan) = EXACTFA_NO_TRIE;
*has_exactf_sharp_s = TRUE;
break;
}
EXACT | EXACT
EXACTFU | EXACTFU
EXACTFU_SS | EXACTFU
- EXACTFU_TRICKYFOLD | EXACTFU
- EXACTFA | 0
+ EXACTFA | EXACTFA
*/
#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
( EXACT == (X) ) ? EXACT : \
- ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
+ ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
+ ( EXACTFA == (X) ) ? EXACTFA : \
0 )
/* dont use tail as the end marker for this traverse */
/* All other (EXACTFL handled above) folds except under
* /iaa that include s, S, and sharp_s also may include
* the others */
- if (OP(scan) != EXACTFA) {
+ if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE)
+ {
if (uc == 's' || uc == 'S') {
ANYOF_BITMAP_SET(data->start_class,
LATIN_SMALL_LETTER_SHARP_S);
/* All folds except under /iaa that include s, S,
* and sharp_s also may include the others */
- if (OP(scan) != EXACTFA) {
+ if (OP(scan) != EXACTFA
+ && OP(scan) != EXACTFA_NO_TRIE)
+ {
if (uc == 's' || uc == 'S') {
ANYOF_BITMAP_SET(data->start_class,
LATIN_SMALL_LETTER_SHARP_S);
HV * const table = GvHV(PL_hintgv);
SV **ptr;
- if (!table)
+ if (!table || !(PL_hints & HINT_LOCALIZE_HH))
return &PL_core_reg_engine;
ptr = hv_fetchs(table, "regcomp", FALSE);
if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
Newx(dst, *plen_p * 2 + 1, U8);
while (s < *plen_p) {
- const UV uv = NATIVE_TO_ASCII(src[s]);
- if (UNI_IS_INVARIANT(uv))
- dst[d] = (U8)UTF_TO_NATIVE(uv);
+ if (NATIVE_IS_INVARIANT(src[s]))
+ dst[d] = src[s];
else {
- dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
- dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
+ dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
+ dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
}
if (n < num_code_blocks) {
if (!do_end && pRExC_state->code_blocks[n].start == s) {
STRLEN orig_patlen = 0;
bool code = 0;
SV *msv = use_delim ? delim : *svp;
+ if (!msv) msv = &PL_sv_undef;
/* if we've got a delimiter, we go round the loop twice for each
* svp slot (except the last), using the delimiter the second
* The code in this block is based on S_pushav() */
AV *const av = (AV*)msv;
- const I32 maxarg = AvFILL(av) + 1;
+ const SSize_t maxarg = AvFILL(av) + 1;
SV **array;
if (oplist) {
}
if (SvRMAGICAL(av)) {
- U32 i;
+ SSize_t i;
Newx(array, maxarg, SV*);
SAVEFREEPV(array);
- for (i=0; i < (U32)maxarg; i++) {
+ for (i=0; i < maxarg; i++) {
SV ** const svp = av_fetch(av, i, FALSE);
array[i] = svp ? *svp : &PL_sv_undef;
}
if (! len_passed_in) {
if (UTF) {
if (FOLD && (! LOC || code_point > 255)) {
- _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
+ _to_uni_fold_flags(code_point,
character,
&len,
FOLD_FLAGS_FULL | ((LOC)
p++;
break;
case 'a':
- ender = ASCII_TO_NATIVE('\007');
+ ender = '\a';
p++;
break;
case 'o':
goto loopdone;
case '1': case '2': case '3':case '4':
case '5': case '6': case '7':
- /* When we parse backslash escapes there is ambiguity between
- * backreferences and octal escapes. Any escape from \1 - \9 is
- * a backreference, any multi-digit escape which does not start with
- * 0 and which when evaluated as decimal could refer to an already
- * parsed capture buffer is a backslash. Anything else is octal.
+ /* When we parse backslash escapes there is ambiguity
+ * between backreferences and octal escapes. Any escape
+ * from \1 - \9 is a backreference, any multi-digit
+ * escape which does not start with 0 and which when
+ * evaluated as decimal could refer to an already
+ * parsed capture buffer is a backreference. Anything else
+ * is octal.
*
- * Note this implies that \118 could be interpreted as 118 OR as
- * "\11" . "8" depending on whether there were 118 capture buffers
- * defined already in the pattern.
- */
+ * Note this implies that \118 could be interpreted as
+ * 118 OR as "\11" . "8" depending on whether there
+ * were 118 capture buffers defined already in the
+ * pattern. */
if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar )
{ /* Not to be treated as an octal constant, go
find backref */
REGC((char)ender, s++);
}
}
- else /* FOLD */
- if (! ( UTF
+ else /* FOLD */ if (! ( UTF
/* See comments for join_exact() as to why we fold this
* non-UTF at compile time */
|| (node_type == EXACTFU
* utf8. If we start to fold non-UTF patterns, be sure to
* update join_exact() */
if (LOC && ender < 256) {
- if (UNI_IS_INVARIANT(ender)) {
+ if (NATIVE_IS_INVARIANT(ender)) {
*s = (U8) ender;
foldlen = 1;
} else {
/* No Latin1 characters participate in multi-char
* folds under /l */
if (LOC
- || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
- *s, *(s+1))))
+ || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
+ *s, *(s+1))))
{
break;
}
case 'f': value = '\f'; break;
case 'b': value = '\b'; break;
case 'e': value = ASCII_TO_NATIVE('\033');break;
- case 'a': value = ASCII_TO_NATIVE('\007');break;
+ case 'a': value = '\a'; break;
case 'o':
RExC_parse--; /* function expects to be pointed at the 'o' */
{
* included. literal_endpoint==2 means both ends of the range used
* a literal character, not \x{foo} */
if (literal_endpoint == 2
- && (prevvalue >= 'a' && value <= 'z')
- || (prevvalue >= 'A' && value <= 'Z'))
+ && ((prevvalue >= 'a' && value <= 'z')
+ || (prevvalue >= 'A' && value <= 'Z')))
{
+ _invlist_intersection(this_range, PL_ASCII,
+ &this_range);
_invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
&this_range);
}
/* If the folds haven't been read in, call a fold function
* to force that */
if (! PL_utf8_tofold) {
- U8 dummy[UTF8_MAXBYTES+1];
+ U8 dummy[UTF8_MAXBYTES_CASE+1];
/* This string is just a short named one above \xff */
to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
switch (OP(scan)) {
case EXACT:
case EXACTF:
+ case EXACTFA_NO_TRIE:
case EXACTFA:
case EXACTFU:
case EXACTFU_SS:
- case EXACTFU_TRICKYFOLD:
case EXACTFL:
if( exact == PSEUDO )
exact= OP(scan);