From d1c40ef5b615823c7946ca520b1fedd807df469a Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Thu, 24 Dec 2015 22:42:08 -0700 Subject: [PATCH] regcomp.h: Free up bit in ANYOF FLAGS field I've long been confronted with trying to do things to create a spare bit to use. I thought it easier now, while it's fresh in my mind, to free up one for future use, rather than re-learn things when it next becomes necessary. It would have been a different story if the freed bit had required a performance penalty. This commit also updates the comments about how to create even more spare bits should it become necessary. --- regcomp.c | 18 +++++++-- regcomp.h | 135 +++++++++++++++++++++++++++++--------------------------------- regexec.c | 12 +++--- 3 files changed, 84 insertions(+), 81 deletions(-) diff --git a/regcomp.c b/regcomp.c index cba1e3b..bb51f17 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1366,6 +1366,10 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, &( ANYOF_COMMON_FLAGS |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP); + if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) { + anded_flags &= + ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; + } } } @@ -1522,6 +1526,10 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, |= ANYOF_FLAGS(or_with) & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP); + if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) { + ored_flags |= + ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; + } } } @@ -14201,7 +14209,8 @@ redo_curchar: assert(OP(node) == ANYOF); OP(node) = ANYOFL; - ANYOF_FLAGS(node) |= ANYOFL_UTF8_LOCALE_REQD; + ANYOF_FLAGS(node) + |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; } if (save_fold) { @@ -16098,8 +16107,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * locales, or the class matches at least one 0-255 range code point */ if (LOC && FOLD) { if (only_utf8_locale_list) { - ANYOF_FLAGS(ret) |= ANYOFL_FOLD - |ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE; + ANYOF_FLAGS(ret) + |= ANYOFL_FOLD + |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; } else if (cp_list) { /* Look to see if a 0-255 code point is in list */ UV start, end; @@ -17473,7 +17483,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ if (OP(o) == ANYOFL) { - if (flags & ANYOFL_UTF8_LOCALE_REQD) { + if (ANYOFL_UTF8_LOCALE_REQD(flags)) { sv_catpvs(sv, "{utf8-loc}"); } else { diff --git a/regcomp.h b/regcomp.h index d26cf18..f7c8cc4 100644 --- a/regcomp.h +++ b/regcomp.h @@ -403,19 +403,18 @@ struct regnode_ssc { * 2) A subset of item 1) is if all possible code points outside the bitmap * match. This is a common occurrence when the class is complemented, * like /[^ij]/. Therefore a bit is reserved to indicate this, - * ANYOF_MATCHES_ALL_ABOVE_BITMAP. If it became necessary, this flag could - * be replaced by using the normal swash mechanism, but with a performance - * penalty. + * rather than having an expensive swash created, + * ANYOF_MATCHES_ALL_ABOVE_BITMAP. * 3) Under /d rules, it can happen that code points that are in the upper * latin1 range (\x80-\xFF or their equivalents on EBCDIC platforms) match * only if the runtime target string being matched against is UTF-8. For * example /[\w[:punct:]]/d. This happens only for posix classes (with a - * couple of exceptions, like \d), and all such ones also have - * above-bitmap matches. Thus, 3) implies 1) as well. Note that /d rules - * are no longer encouraged; 'use 5.14' or higher deselects them. But a - * flag is required so that they can be properly handled. But it can be a - * shared flag: see 5) below. - * 4) Also under /d rules, something like /[\Wfoo] will match everything in + * couple of exceptions, like \d where it doesn't happen), and all such + * ones also have above-bitmap matches. Thus, 3) implies 1) as well. + * Note that /d rules are no longer encouraged; 'use 5.14' or higher + * deselects them. But a flag is required so that they can be properly + * handled. But it can be a shared flag: see 5) below. + * 4) Also under /d rules, something like /[\Wfoo]/ will match everything in * the \x80-\xFF range, unless the string being matched against is UTF-8. * A swash could be created for this case, but this is relatively common, * and it turns out that it's all or nothing: if any one of these code @@ -440,65 +439,44 @@ struct regnode_ssc { * traditionally not worried too much about its performance. And this * condition requires the ANYOFL_FOLD flag to be set, so testing for * that flag would be sufficient to rule out most cases of this. So it is - * unclear if this should have a flag or not. But, one is currently - * allocated for this purpose, ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE (and - * the text below indicates how to share it, should another bit be - * needed). + * unclear if this should have a flag or not. But, this flag can be + * shared with another, so it doesn't occupy extra space. * - * At the moment, there are no spare bits, but this could be changed by various - * tricks. + * At the moment, there is one spare bit, but this could be increased by + * various tricks. * - * Note that item ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE is not independent of - * the ANYOFL_FOLD flag below. Also, the ANYOFL_UTF8_LOCALE_REQD flag is set - * only if both these aren't. We can therefore share - * ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE with ANYOFL_UTF8_LOCALE_REQD, so what - * the shared flag means depends on the ANYOFL_FOLD flag. + * If just one more bit is needed, at this writing it seems to khw that the + * best choice would be to make ANYOF_MATCHES_ALL_ABOVE_BITMAP not a flag, but + * something like * - * Beyond that, note that the information may be conveyed by creating new - * regnode types. This is not the best solution, as shown later in this - * paragraph, but it is something that is feasible. We could have a regnode - * for ANYOF_INVERT, for example. A complication of this is that the regexec.c - * REGINCLASS macro assumes that it can just use the bitmap if no flags are - * set. This would have to be changed to add extra tests for the node type, or - * a special flag reserved that means unspecified special handling, and then the - * node-type would be used internally to sort that out. So we could gain a bit - * by having an ANYOF_SPECIAL flag, and a node type for INVERT, and another for - * POSIXL, and still another for INVERT_POSIXL. This example illustrates one - * problem with this, a combinatorial explosion of node types. The one node - * type khw can think of that doesn't have this explosion issue is - * ANYOFL_UTF8_LOCALE_REQD. This flag is a natural candidate for being a separate - * node type because it is a specialization of the current ANYOFL, and because - * no other ANYOFL-only flags are set when it is; also most of its uses are - * actually outside the reginclass() function, so this could be done with no - * performance penalty. But since it can be shared, as noted above, it doesn't - * take up space anyway. Another issue when turning a flag into a node type, is - * that a SSC may use that flag -- not just a regular ANYOF[DL]?. In the case - * of ANYOFL_UTF8_LOCALE_REQD, the only likely problem is accurately settting the - * SSC node-type to the new one, which would likely involve S_ssc_or and - * S_ssc_and, and not how the SSC currently gets set to ANYOFL. + * #define ANYOF_MATCHES_ALL_ABOVE_BITMAP ((U32) -2) * - * Another possibility is to instead rename the ANYOF_POSIXL flag to be - * ANYOFL_LARGE, to mean that the ANYOF node has an extra 32 bits beyond what a - * regular one does. That's what it effectively means now, with the extra - * space all for the POSIX class flags. But those classes actually only occupy - * 30 bits, so 2 of the locale flags could be moved to that extra space. The - * downside of this is that ANYOFL nodes with whichever of the flags get moved - * would have to have the extra space always allocated. + * and access it through the ARG like ANYOF_ONLY_HAS_BITMAP is. This flag is + * used by all ANYOF node types, and it could be used to avoid calling the + * handler function, as the macro REGINCLASS in regexec.c does now for other + * cases. + * + * Another possibility is to instead (or additionally) rename the ANYOF_POSIXL + * flag to be ANYOFL_LARGE, to mean that the ANYOF node has an extra 32 bits + * beyond what a regular one does. That's what it effectively means now, with + * the extra space all for the POSIX class flags. But those classes actually + * only occupy 30 bits, so the ANYOFL_FOLD and + * ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD flags could be moved + * to that extra space. The 30 bits in the extra word would indicate if a + * posix class should be looked up or not. The downside of this is that ANYOFL + * nodes with folding would always have to have the extra space allocated, even + * if they didn't use the 30 posix bits. There isn't an SSC problem as all + * SSCs are this large anyway. * * One could completely remove ANYOFL_LARGE and make all ANYOFL nodes large. - * The 30 bits in the extra word would indicate if a posix class should be - * looked up or not. There isn't an SSC problem as all SSCs are this large - * anyway, and the SSC could be set to this node type. REGINCLASS would have - * to be modified so that if the node type were this, it would call - * reginclass(), as the flag bit that indicates to do this now would be gone. - * If 2 locale flags are moved to the larger structure, this would free up a - * total of 4 bits. If this were done, we could create an ANYOF_INVERT - * node-type without a combinatorial explosion, getting us to 5 bits. And, - * keep in mind that ANYOF_MATCHES_ALL_ABOVE_BITMAP is solely for performance, - * so could be removed. The other performance-related flags are shareable with - * flags that are required. + * REGINCLASS would have to be modified so that if the node type were this, it + * would call reginclass(), as the flag bit that indicates to do this now would + * be gone. + * + * All told, 5 bits could be available for other uses if all of the above were + * done. * - * Several flags are not used in synthetic start class (SSC) nodes, so could be + * Some flags are not used in synthetic start class (SSC) nodes, so could be * shared should new flags be needed for SSCs, like SSC_MATCHES_EMPTY_STRING * now. */ @@ -522,15 +500,30 @@ struct regnode_ssc { * then. Only set under /l; never in an SSC */ #define ANYOFL_FOLD 0x04 -/* If set, ANYOFL_FOLD is also set, and there are potential matches that - * will be valid only if the locale is a UTF-8 one. */ -#define ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE 0x08 - -/* If set, means to warn if runtime locale isn't a UTF-8 one. Only under /l. - * If set, none of INVERT, LOC_FOLD, POSIXL, - * ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP can - * be set. Can be in an SSC */ -#define ANYOFL_UTF8_LOCALE_REQD 0x10 +/* Shared bit set only with ANYOFL and SSC nodes: + * If ANYOFL_FOLD is set, this means there are potential matches valid + * only if the locale is a UTF-8 one. + * If ANYOFL_FOLD is NOT set, this means to warn if the runtime locale + * isn't a UTF-8 one (and the generated node assumes a UTF-8 locale). + * None of INVERT, POSIXL, + * ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP + * can be set. */ +#define ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD 0x08 + +/* Convenience macros for teasing apart the meanings when reading the above bit + * */ +#define ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags) \ + ((flags & ( ANYOFL_FOLD /* Both bits are set */ \ + |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD)) \ + == ( ANYOFL_FOLD \ + |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD)) + +#define ANYOFL_UTF8_LOCALE_REQD(flags) \ + ((flags & ( ANYOFL_FOLD /* Only REQD bit is set */ \ + |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD)) \ + == ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD) + +/* Spare: 0x10 */ /* If set, the node matches every code point NUM_ANYOF_CODE_POINTS and above. * Can be in an SSC */ @@ -567,7 +560,7 @@ struct regnode_ssc { /* These are the flags that apply to both regular ANYOF nodes and synthetic * start class nodes during construction of the SSC. During finalization of * the SSC, other of the flags may get added to it */ -#define ANYOF_COMMON_FLAGS ANYOFL_UTF8_LOCALE_REQD +#define ANYOF_COMMON_FLAGS 0 /* Character classes for node->classflags of ANYOF */ /* Should be synchronized with a table in regprop() */ diff --git a/regexec.c b/regexec.c index 85921cf..170e63e 100644 --- a/regexec.c +++ b/regexec.c @@ -1826,7 +1826,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case ANYOFL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - if ((FLAGS(c) & ANYOFL_UTF8_LOCALE_REQD) && ! IN_UTF8_CTYPE_LOCALE) { + if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) { Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); } @@ -5766,7 +5766,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case ANYOFL: /* /[abc]/l */ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - if ((FLAGS(scan) & ANYOFL_UTF8_LOCALE_REQD) && ! IN_UTF8_CTYPE_LOCALE) + if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE) { Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); } @@ -8301,7 +8301,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, case ANYOFL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - if ((FLAGS(p) & ANYOFL_UTF8_LOCALE_REQD) && ! IN_UTF8_CTYPE_LOCALE) { + if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) { Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); } /* FALLTHROUGH */ @@ -8648,7 +8648,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * UTF8_ALLOW_FFFF */ if (c_len == (STRLEN)-1) Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); - if (c > 255 && OP(n) == ANYOFL && ! (flags & ANYOFL_UTF8_LOCALE_REQD)) { + if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) { _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c); } } @@ -8751,8 +8751,8 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const && c < 256 # endif ))) - || (( flags & ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE) - && IN_UTF8_CTYPE_LOCALE))) + || ( ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags) + && IN_UTF8_CTYPE_LOCALE))) { SV* only_utf8_locale = NULL; SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, -- 1.8.3.1