#if defined(PERL_IN_REGEXEC_C)
ERs |bool |isFOO_utf8_lc |const U8 classnum|NN const U8* character
+ERns |char *|find_next_ascii|NN char* s|NN const char * send|const bool is_utf8
+ERns |char *|find_next_non_ascii|NN char* s|NN const char * send|const bool is_utf8
ERs |SSize_t|regmatch |NN regmatch_info *reginfo|NN char *startpos|NN regnode *prog
WERs |I32 |regrepeat |NN regexp *prog|NN char **startposp \
|NN const regnode *p \
#define backup_one_SB(a,b,c) S_backup_one_SB(aTHX_ a,b,c)
#define backup_one_WB(a,b,c,d) S_backup_one_WB(aTHX_ a,b,c,d)
#define find_byclass(a,b,c,d,e) S_find_byclass(aTHX_ a,b,c,d,e)
+#define find_next_ascii S_find_next_ascii
+#define find_next_non_ascii S_find_next_non_ascii
#define isFOO_utf8_lc(a,b) S_isFOO_utf8_lc(aTHX_ a,b)
#define isGCB(a,b,c,d,e) S_isGCB(aTHX_ a,b,c,d,e)
#define isLB(a,b,c,d,e,f) S_isLB(aTHX_ a,b,c,d,e,f)
#endif
-#undef PERL_WORDSIZE
-#undef PERL_COUNT_MULTIPLIER
-#undef PERL_WORD_BOUNDARY_MASK
-#undef PERL_VARIANTS_WORD_MASK
+#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
+# undef PERL_WORDSIZE
+# undef PERL_COUNT_MULTIPLIER
+# undef PERL_WORD_BOUNDARY_MASK
+# undef PERL_VARIANTS_WORD_MASK
+#endif
/*
=for apidoc is_utf8_string
#define PERL_ARGS_ASSERT_FIND_BYCLASS \
assert(prog); assert(c); assert(s); assert(strend)
+STATIC char * S_find_next_ascii(char* s, const char * send, const bool is_utf8)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_FIND_NEXT_ASCII \
+ assert(s); assert(send)
+
+STATIC char * S_find_next_non_ascii(char* s, const char * send, const bool is_utf8)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII \
+ assert(s); assert(send)
+
STATIC bool S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_ISFOO_UTF8_LC \
}
break;
+ case NASCII:
+ invert = 1;
+ /* FALLTHROUGH */
+ case ASCII:
+ my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
+
+ /* This can be handled as a Posix class */
+ goto join_posix_and_ascii;
+
case NPOSIXA: /* For these, we always know the exact set of
what's matched */
invert = 1;
/* FALLTHROUGH */
case POSIXA:
- if (FLAGS(scan) == _CC_ASCII) {
- my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
- }
- else {
- _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
- PL_XPosix_ptrs[_CC_ASCII],
- &my_invlist);
- }
- goto join_posix;
+ assert(FLAGS(scan) != _CC_ASCII);
+ _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
+ PL_XPosix_ptrs[_CC_ASCII],
+ &my_invlist);
+ goto join_posix_and_ascii;
case NPOSIXD:
case NPOSIXU:
&my_invlist);
}
- join_posix:
+ join_posix_and_ascii:
if (flags & SCF_DO_STCLASS_AND) {
ssc_intersection(data->start_class, my_invlist, invert);
/* The actual POSIXish node for all the rest depends on the
* charset modifier. The ones in the first set depend only on
* ASCII or, if available on this platform, also locale */
+
case ANYOF_ASCII:
case ANYOF_NASCII:
+
#ifdef HAS_ISASCII
- op = (LOC) ? POSIXL : POSIXA;
-#else
- op = POSIXA;
+ if (LOC) {
+ op = POSIXL;
+ goto join_posix;
+ }
#endif
- goto join_posix;
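+ /* (namedclass - ANYOF_ASCII) is 0 or 1. xor'ing that with
+ * invert flips it to 1 or 0 when the class is inverted, and
+ * leaves it unchanged otherwise; the result is the offset
+ * from ASCII of the node to use */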
+ op = ASCII + ((namedclass - ANYOF_ASCII) ^ invert);
+ break;
/* The following don't have any matches in the upper Latin1
* range, hence /d is equivalent to /u for them. Making it /u
TRUE /* downgradable to EXACT */
);
}
+ else {
+ *flagp |= HASWIDTH|SIMPLE;
+ }
RExC_parse = (char *) cur_parse;
return FALSE; /* Things like CNTRL are always below 256 */
}
+STATIC char *
+S_find_next_ascii(char * s, const char * send, const bool utf8_target)
+{
+    /* Returns the position of the first ASCII byte in the sequence between 's'
+     * and 'send-1' inclusive; returns 'send' if none found (when
+     * 'utf8_target' is true the final per-character loop steps by UTF8SKIP, so
+     * callers should test the result with '< send' rather than '!= send').
+     *
+     * 's' and 'send' must be non-NULL.  'utf8_target' only affects how the
+     * trailing per-character loop advances. */
+
+    PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
+
+#ifndef EBCDIC
+
+    /* Only use the per-word fast path if, after aligning 's', there is at
+     * least one complete word before 'send' */
+    if ((STRLEN) (send - s) >= PERL_WORDSIZE
+
+                            /* This term is wordsize if subword; 0 if not */
+                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
+
+                            /* 'offset' */
+                          - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
+    {
+
+        /* Process per-byte until reach word boundary.  XXX This loop could be
+         * eliminated if we knew that this platform had fast unaligned reads */
+        while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
+            if (isASCII(*s)) {
+                return s;
+            }
+            s++;    /* khw didn't bother creating a separate loop for
+                       utf8_target */
+        }
+
+        /* Here, we know we have at least one full word to process.  Process
+         * per-word as long as we have at least a full word left.
+         *
+         * A byte is ASCII exactly when its high bit is clear, so the word
+         * contains an ASCII byte iff its complement has a high bit set.
+         * Testing 'word & ~mask' instead would look at the low 7 bits and
+         * skip over words made up of only 0x80 and 0x00 bytes, missing the
+         * (ASCII) NULs.  NOTE(review): this assumes PERL_VARIANTS_WORD_MASK
+         * has just the high bit of each byte set -- confirm against its
+         * definition. */
+        do {
+            if (~ (* (PERL_UINTMAX_T *) s) & PERL_VARIANTS_WORD_MASK) {
+                break;  /* Some byte in this word is ASCII; the loops below
+                           pin down which one */
+            }
+            s += PERL_WORDSIZE;
+        } while (s + PERL_WORDSIZE <= send);
+    }
+
+#endif
+
+    /* Process per-character */
+    if (utf8_target) {
+        while (s < send) {
+            if (isASCII(*s)) {
+                return s;
+            }
+            s += UTF8SKIP(s);
+        }
+    }
+    else {
+        while (s < send) {
+            if (isASCII(*s)) {
+                return s;
+            }
+            s++;
+        }
+    }
+
+    return s;
+}
+
+STATIC char *
+S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
+{
+    /* Scans forward from 's' for a non-ASCII byte, stopping before 'send'.
+     * Returns a pointer to the first such byte, or 'send' if every byte in
+     * the range is ASCII */
+
+#ifdef EBCDIC
+
+    PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
+
+    /* A single loop serves both encodings; only the stride differs: a whole
+     * character for a UTF-8 target, otherwise one byte */
+    for (; s < send; s += (utf8_target) ? UTF8SKIP(s) : 1) {
+        if ( ! isASCII(*s)) {
+            break;
+        }
+    }
+
+    return s;
+
+#else
+
+    const U8 * first_variant = NULL;
+
+    PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
+    PERL_UNUSED_ARG(utf8_target);
+
+    /* On ASCII platforms invariants and ASCII are identical, so a string
+     * consisting entirely of invariants has no non-ASCII character in it */
+    if (is_utf8_invariant_string_loc((U8 *) s,
+                                     (STRLEN) (send - s),
+                                     &first_variant))
+    {
+        return (char *) send;
+    }
+
+    /* Otherwise the call above has recorded where the first variant is */
+    return (char *) first_variant;
+
+#endif
+
+}
+
/*
* pregexec and friends
*/
);
break;
+    case ASCII:
+
+        /* Try a match at each ASCII position in turn.  Stopping after the
+         * first candidate would miss a match that can only begin at a later
+         * ASCII character.  NOTE(review): relies on regtry() leaving 's'
+         * where it was when the attempt fails -- confirm. */
+        while (s < strend) {
+            s = find_next_ascii(s, strend, utf8_target);
+            if (s >= strend) {
+                break;  /* No ASCII character remains */
+            }
+
+            if (reginfo->intuit || regtry(reginfo, &s)) {
+                goto got_it;
+            }
+
+            s++;    /* ASCII is always single byte */
+        }
+
+        break;
+
+    case NASCII:
+
+        /* Try a match at each non-ASCII position in turn.  Stopping after
+         * the first candidate would miss a match that can only begin at a
+         * later non-ASCII character.  NOTE(review): relies on regtry()
+         * leaving 's' where it was when the attempt fails -- confirm. */
+        while (s < strend) {
+            s = find_next_non_ascii(s, strend, utf8_target);
+            if (s >= strend) {
+                break;  /* No non-ASCII character remains */
+            }
+
+            if (reginfo->intuit || regtry(reginfo, &s)) {
+                goto got_it;
+            }
+
+            /* Step over the whole character just tried */
+            s += (utf8_target) ? UTF8SKIP(s) : 1;
+        }
+
+        break;
+
/* The argument to all the POSIX node types is the class number to pass to
* _generic_isCC() to build a mask for searching in PL_charclass[] */
}
break;
+ case ASCII:
+ if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) {
+ sayNO;
+ }
+
+ locinput++; /* ASCII is always single byte */
+ break;
+
+ case NASCII:
+ if (NEXTCHR_IS_EOS || isASCII(UCHARAT(locinput))) {
+ sayNO;
+ }
+
+ goto increment_locinput;
+ break;
+
/* The argument (FLAGS) to all the POSIX node types is the class number
* */
}
break;
+ case ASCII:
+ if (utf8_target && loceol - scan > max) {
+
+ /* We didn't adjust <loceol> at the beginning of this routine
+ * because the target is UTF-8, but it is actually ok to do so, since
+ * here, to match, 1 char == 1 byte. */
+ loceol = scan + max;
+ }
+
+ scan = find_next_non_ascii(scan, loceol, utf8_target);
+ break;
+
+ case NASCII:
+ if (utf8_target) {
+ while ( hardcount < max
+ && scan < loceol
+ && ! isASCII_utf8_safe(scan, loceol))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ else {
+ scan = find_next_ascii(scan, loceol, utf8_target);
+ }
+ break;
+
/* The argument (FLAGS) to all the POSIX node types is the class number */
case NPOSIXL: