x => 1 << ($PMMOD_SHIFT + 3),
p => 1 << ($PMMOD_SHIFT + 4),
# special cases:
- l => 1 << ($PMMOD_SHIFT + 5),
- u => 1 << ($PMMOD_SHIFT + 6),
d => 0,
+ l => 1,
+ u => 2,
);
sub setcolor {
if (PL_hints & HINT_RE_TAINT)
pmop->op_pmflags |= PMf_RETAINT;
if (PL_hints & HINT_LOCALE) {
- pmop->op_pmflags |= PMf_LOCALE;
+ set_regex_charset(&(pmop->op_pmflags), REGEX_LOCALE_CHARSET);
}
else if ((! (PL_hints & HINT_BYTES)) && (PL_hints & HINT_UNI_8_BIT)) {
- pmop->op_pmflags |= RXf_PMf_UNICODE;
+ set_regex_charset(&(pmop->op_pmflags), REGEX_UNICODE_CHARSET);
}
if (PL_hints & HINT_RE_FLAGS) {
SV *reflags = Perl_refcounted_he_fetch_pvn(aTHX_
PL_compiling.cop_hints_hash, STR_WITH_LEN("reflags_charset"), 0, 0
);
if (reflags && SvOK(reflags)) {
- pmop->op_pmflags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
- pmop->op_pmflags |= SvIV(reflags);
+ set_regex_charset(&(pmop->op_pmflags), SvIV(reflags));
}
}
#define RXf_PMf_FOLD (1 << (RXf_PMf_STD_PMMOD_SHIFT+2)) /* /i */
#define RXf_PMf_EXTENDED (1 << (RXf_PMf_STD_PMMOD_SHIFT+3)) /* /x */
#define RXf_PMf_KEEPCOPY (1 << (RXf_PMf_STD_PMMOD_SHIFT+4)) /* /p */
-#define RXf_PMf_LOCALE (1 << (RXf_PMf_STD_PMMOD_SHIFT+5))
-#define RXf_PMf_UNICODE (1 << (RXf_PMf_STD_PMMOD_SHIFT+6))
+
+/* The character set for the regex is stored in a field of more than one bit
+ * using an enum, for reasons of compactness and to ensure that the options are
+ * mutually exclusive */
+typedef enum {
+ REGEX_DEPENDS_CHARSET = 0,
+ REGEX_LOCALE_CHARSET,
+ REGEX_UNICODE_CHARSET
+} regex_charset;
+
+#define _RXf_PMf_CHARSET_SHIFT ((RXf_PMf_STD_PMMOD_SHIFT)+5)
+#define RXf_PMf_CHARSET (3 << (_RXf_PMf_CHARSET_SHIFT)) /* 2 bits */
+
+/* embed.pl doesn't yet know how to handle static inline functions, so
+ manually decorate them here with gcc-style attributes.
+*/
+PERL_STATIC_INLINE void
+set_regex_charset(U32 * const flags, const regex_charset cs)
+ __attribute__nonnull__(1);
+
+PERL_STATIC_INLINE void
+set_regex_charset(U32 * const flags, const regex_charset cs)
+{
+ /* Sets the character set portion of 'flags' to 'cs', which is a member of
+ * the above enum */
+
+ *flags &= ~RXf_PMf_CHARSET;
+ *flags |= (cs << _RXf_PMf_CHARSET_SHIFT);
+}
+
+PERL_STATIC_INLINE regex_charset
+get_regex_charset(const U32 flags)
+ __attribute__warn_unused_result__;
+
+PERL_STATIC_INLINE regex_charset
+get_regex_charset(const U32 flags)
+{
+ /* Returns the enum corresponding to the character set in 'flags' */
+
+ return (flags & RXf_PMf_CHARSET) >> _RXf_PMf_CHARSET_SHIFT;
+}
/* Next available bit after the above. Name begins with '_' so won't be
* exported by B */
/* Mask of the above bits. These need to be transferred from op_pmflags to
* re->extflags during compilation */
-#define RXf_PMf_COMPILETIME (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_LOCALE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY|RXf_PMf_UNICODE)
+#define RXf_PMf_COMPILETIME (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_CHARSET|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY)
/* These copies need to be numerical or defsubs_h.PL won't know about them. */
#define PMf_MULTILINE 1<<0
#define PMf_FOLD 1<<2
#define PMf_EXTENDED 1<<3
#define PMf_KEEPCOPY 1<<4
-#define PMf_LOCALE 1<<5
-#if PMf_MULTILINE != RXf_PMf_MULTILINE || PMf_SINGLELINE != RXf_PMf_SINGLELINE || PMf_FOLD != RXf_PMf_FOLD || PMf_EXTENDED != RXf_PMf_EXTENDED || PMf_KEEPCOPY != RXf_PMf_KEEPCOPY || PMf_LOCALE != RXf_PMf_LOCALE
+#if PMf_MULTILINE != RXf_PMf_MULTILINE || PMf_SINGLELINE != RXf_PMf_SINGLELINE || PMf_FOLD != RXf_PMf_FOLD || PMf_EXTENDED != RXf_PMf_EXTENDED || PMf_KEEPCOPY != RXf_PMf_KEEPCOPY
# error RXf_PMf defines are wrong
#endif
should be handled, which is now described in
L<perlunicode/Non-character code points>. See also L</Selected Bug Fixes>.
+=item *
+
+Certain shared flags in the C<pmop.op_pmflags> and C<regexp.extflags>
+structures have been removed. These are: C<Rxf_Pmf_LOCALE>,
+C<Rxf_Pmf_UNICODE>, and C<PMf_LOCALE>. Instead there are encodes and
+three static in-line functions for accessing the information:
+C<get_regex_charset()>, C<set_regex_charset()>, and C<get_regex_charset_name()>,
+which are defined in the places where the orginal flags were.
+
=back
=head1 Selected Bug Fixes
DIE(aTHX_ "panic: pp_split");
rx = PM_GETRE(pm);
- TAINT_IF((RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) &&
+ TAINT_IF(get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET &&
(RX_EXTFLAGS(rx) & (RXf_WHITE | RXf_SKIPWHITE)));
RX_MATCH_UTF8_set(rx, do_utf8);
while (*s == ' ' || is_utf8_space((U8*)s))
s += UTF8SKIP(s);
}
- else if (RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) {
+ else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) {
while (isSPACE_LC(*s))
s++;
}
else
m += t;
}
- } else if (RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) {
+ }
+ else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) {
while (m < strend && !isSPACE_LC(*m))
++m;
} else {
if (do_utf8) {
while (s < strend && ( *s == ' ' || is_utf8_space((U8*)s) ))
s += UTF8SKIP(s);
- } else if (RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) {
+ }
+ else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) {
while (s < strend && isSPACE_LC(*s))
++s;
} else {
#define SCF_SEEN_ACCEPT 0x8000
#define UTF cBOOL(RExC_utf8)
-#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
-#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
+#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
+#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
+
#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
#define OOB_UNICODE 12345678
/* Set to use unicode semantics if the pattern is in utf8 and has the
* 'dual' charset specified, as it means unicode when utf8 */
pm_flags = orig_pm_flags;
- if (RExC_utf8 && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
- pm_flags |= RXf_PMf_UNICODE;
+
+ if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
+ set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
}
RExC_precomp = exp;
r->extflags = pm_flags;
{
bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
- bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
+ bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
/* The caret is output if there are any defaults: if not all the STD
* flags are set, or if no character set specifier is needed */
* covered by the caret */
const STRLEN wraplen = plen + has_p + has_runon
+ has_default /* If needs a caret */
- + has_charset /* If needs a character set specifier */
+
+ /* If needs a character set specifier */
+ + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
+ (sizeof(STD_PAT_MODS) - 1)
+ (sizeof("(?:)") - 1);
*p++= DEFAULT_PAT_MOD;
}
if (has_charset) {
- if (r->extflags & RXf_PMf_LOCALE) {
- *p++ = LOCALE_PAT_MOD;
- } else {
- *p++ = UNICODE_PAT_MOD;
- }
+ STRLEN len;
+ const char* const name = get_regex_charset_name(r->extflags, &len);
+ Copy(name, p, len, char);
+ p += len;
}
if (has_p)
*p++ = KEEPCOPY_PAT_MOD; /*'p'*/
STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
if (RExC_utf8) { /* But the default for a utf8 pattern is
unicode semantics */
- RExC_flags |= RXf_PMf_UNICODE;
+ set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
}
goto parse_flags;
default:
U32 posflags = 0, negflags = 0;
U32 *flagsp = &posflags;
bool has_charset_modifier = 0;
+ regex_charset cs = REGEX_DEPENDS_CHARSET;
while (*RExC_parse) {
/* && strchr("iogcmsx", *RExC_parse) */
if (has_charset_modifier || flagsp == &negflags) {
goto fail_modifiers;
}
- posflags |= RXf_PMf_LOCALE;
- negflags |= RXf_PMf_UNICODE;
+ cs = REGEX_LOCALE_CHARSET;
has_charset_modifier = 1;
break;
case UNICODE_PAT_MOD:
if (has_charset_modifier || flagsp == &negflags) {
goto fail_modifiers;
}
- posflags |= RXf_PMf_UNICODE;
- negflags |= RXf_PMf_LOCALE;
+ cs = REGEX_UNICODE_CHARSET;
has_charset_modifier = 1;
break;
case DUAL_PAT_MOD:
/* The dual charset means unicode semantics if the
* pattern (or target, not known until runtime) are
* utf8 */
- if (RExC_utf8) {
- posflags |= RXf_PMf_UNICODE;
- negflags |= RXf_PMf_LOCALE;
- }
- else {
- negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
- }
+ cs = (RExC_utf8)
+ ? REGEX_UNICODE_CHARSET
+ : REGEX_DEPENDS_CHARSET;
has_charset_modifier = 1;
break;
case ONCE_PAT_MOD: /* 'o' */
case ')':
RExC_flags |= posflags;
RExC_flags &= ~negflags;
+ set_regex_charset(&RExC_flags, cs);
if (paren != ':') {
oregflags |= posflags;
oregflags &= ~negflags;
+ set_regex_charset(&oregflags, cs);
}
nextchar(pRExC_state);
if (paren != ':') {
{
int bit;
int set=0;
+ regex_charset cs;
for (bit=0; bit<32; bit++) {
if (flags & (1<<bit)) {
+ if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
+ continue;
+ }
if (!set++ && lead)
PerlIO_printf(Perl_debug_log, "%s",lead);
PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
}
}
+ if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
+ if (!set++ && lead) {
+ PerlIO_printf(Perl_debug_log, "%s",lead);
+ }
+ switch (cs) {
+ case REGEX_UNICODE_CHARSET:
+ PerlIO_printf(Perl_debug_log, "UNICODE");
+ break;
+ case REGEX_LOCALE_CHARSET:
+ PerlIO_printf(Perl_debug_log, "LOCALE");
+ break;
+ default:
+ PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
+ break;
+ }
+ }
if (lead) {
if (set)
PerlIO_printf(Perl_debug_log, "\n");
/* Note, includes locale, unicode */
#define STD_PMMOD_FLAGS_CLEAR(pmfl) \
- *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_LOCALE|RXf_PMf_UNICODE)
+ *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_CHARSET)
/* chars and strings used as regex pattern modifiers
* Singular is a 'c'har, plural is a "string"
* unshared area without affecting binary compatibility */
#define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT+2)
+/* embed.pl doesn't yet know how to handle static inline functions, so
+ manually decorate them here with gcc-style attributes.
+*/
+PERL_STATIC_INLINE const char *
+get_regex_charset_name(const U32 flags, STRLEN* const lenp)
+ __attribute__warn_unused_result__;
+
+#define MAX_CHARSET_NAME_LENGTH 1
+
+PERL_STATIC_INLINE const char *
+get_regex_charset_name(const U32 flags, STRLEN* const lenp)
+{
+ /* Returns a string that corresponds to the name of the regex character set
+ * given by 'flags', and *lenp is set the length of that string, which
+ * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
+
+ *lenp = 1;
+ switch (get_regex_charset(flags)) {
+ case REGEX_DEPENDS_CHARSET: return DUAL_PAT_MODS;
+ case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
+ case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
+ }
+
+ return "?"; /* Unknown */
+}
+
/* Anchor and GPOS related stuff */
#define RXf_ANCH_BOL (1<<(RXf_BASE_SHIFT+0))
#define RXf_ANCH_MBOL (1<<(RXf_BASE_SHIFT+1))
"FOLD", /* 0x00000004 */
"EXTENDED", /* 0x00000008 */
"KEEPCOPY", /* 0x00000010 */
- "LOCALE", /* 0x00000020 */
- "UNICODE", /* 0x00000040 */
+ "CHARSET", /* 0x00000060 */
+ "CHARSET", /* 0x00000060 */
"UNUSED_BIT_7", /* 0x00000080 */
"UNUSED_BIT_8", /* 0x00000100 */
"ANCH_BOL", /* 0x00000200 */
if ( GIMME_V == G_ARRAY ) {
STRLEN left = 0;
- char reflags[sizeof(INT_PAT_MODS) + 1]; /* The +1 is for the charset
- modifier */
+ char reflags[sizeof(INT_PAT_MODS) + MAX_CHARSET_NAME_LENGTH];
const char *fptr;
char ch;
U16 match_flags;
/*
we are in list context so stringify
the modifiers that apply. We ignore "negative
- modifiers" in this scenario.
+ modifiers" in this scenario, and the default character set
*/
- if (RX_EXTFLAGS(re) & RXf_PMf_LOCALE) {
- reflags[left++] = LOCALE_PAT_MOD;
- }
- else if (RX_EXTFLAGS(re) & RXf_PMf_UNICODE) {
- reflags[left++] = UNICODE_PAT_MOD;
+ if (get_regex_charset(RX_EXTFLAGS(re)) != REGEX_DEPENDS_CHARSET) {
+ STRLEN len;
+ const char* const name = get_regex_charset_name(RX_EXTFLAGS(re),
+ &len);
+ Copy(name, reflags + left, len, char);
+ left += len;
}
fptr = INT_PAT_MODS;
match_flags = (U16)((RX_EXTFLAGS(re) & PMf_COMPILETIME)