*/
UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
- STRLEN curlen,
- STRLEN *retlen,
- const U32 flags)
+Perl_utf8n_to_uvchr(const U8 *s,
+ STRLEN curlen,
+ STRLEN *retlen,
+ const U32 flags)
{
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
}
-/* The tables below are adapted from
- * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this copyright
- * notice:
-
-Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#if 0 /* This is the original table given in
- http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ */
-static U8 utf8d_C9[] = {
- /* The first part of the table maps bytes to character classes that
- * to reduce the size of the transition table and create bitmasks. */
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/
- 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/
- 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, /*-FF*/
-
- /* The second part is a transition table that maps a combination
- * of a state of the automaton and a character class to a state. */
- 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
- 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
- 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
- 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
- 12,36,12,12,12,12,12,12,12,12,12,12
-};
-
-#endif
-
-#ifndef EBCDIC
-
-
-/* This is a version of the above table customized for Perl that doesn't
- * exclude surrogates and accepts start bytes up through FD (FE on 64-bit
- * machines). The classes have been renumbered so that the patterns are more
- * evident in the table. The class numbers for start bytes are constrained so
- * that they can be used as a shift count for masking off the leading one bits.
- * It would make the code simpler if start byte FF could also be handled, but
- * doing so would mean adding nodes for each of continuation bytes 6-12
- * remaining, and two more nodes for overlong detection (a total of 9), and
- * there is room only for 4 more nodes unless we make the array U16 instead of
- * U8.
- *
- * The classes are
- * 00-7F 0
- * 80-81 7 Not legal immediately after start bytes E0 F0 F8 FC
- * FE
- * 82-83 8 Not legal immediately after start bytes E0 F0 F8 FC
- * 84-87 9 Not legal immediately after start bytes E0 F0 F8
- * 88-8F 10 Not legal immediately after start bytes E0 F0
- * 90-9F 11 Not legal immediately after start byte E0
- * A0-BF 12
- * C0,C1 1
- * C2-DF 2
- * E0 13
- * E1-EF 3
- * F0 14
- * F1-F7 4
- * F8 15
- * F9-FB 5
- * FC 16
- * FD 6
- * FE 17 (or 1 on 32-bit machines, since it overflows)
- * FF 1
- */
-
-static const U8 dfa_tab_for_perl[] = {
- /* The first part of the table maps bytes to character classes to reduce
- * the size of the transition table and create bitmasks. */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*00-0F*/
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*10-1F*/
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*20-2F*/
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*30-3F*/
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*40-4F*/
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*50-5F*/
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*60-6F*/
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*70-7F*/
- 7, 7, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,10, /*80-8F*/
- 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, /*90-9F*/
- 12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12, /*A0-AF*/
- 12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12, /*B0-BF*/
- 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /*C0-CF*/
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /*D0-DF*/
- 13, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /*E0-EF*/
- 14, 4, 4, 4, 4, 4, 4, 4,15, 5, 5, 5,16, 6, /*F0-FD*/
-# ifdef UV_IS_QUAD
- 17, /*FE*/
-# else
- 1, /*FE*/
-# endif
- 1, /*FF*/
-
-/* The second part is a transition table that maps a combination
- * of a state of the automaton and a character class to a new state, called a
- * node. The nodes are:
- * N0 The initial state, and final accepting one.
- * N1 Any one continuation byte (80-BF) left. This is transitioned to
- * immediately when the start byte indicates a two-byte sequence
- * N2 Any two continuation bytes left.
- * N3 Any three continuation bytes left.
- * N4 Any four continuation bytes left.
- * N5 Any five continuation bytes left.
- * N6 Start byte is E0. Continuation bytes 80-9F are illegal (overlong);
- * the other continuations transition to N1
- * N7 Start byte is F0. Continuation bytes 80-8F are illegal (overlong);
- * the other continuations transition to N2
- * N8 Start byte is F8. Continuation bytes 80-87 are illegal (overlong);
- * the other continuations transition to N3
- * N9 Start byte is FC. Continuation bytes 80-83 are illegal (overlong);
- * the other continuations transition to N4
- * N10 Start byte is FE. Continuation bytes 80-81 are illegal (overlong);
- * the other continuations transition to N5
- * 1 Reject. All transitions not mentioned above (except the single
- * byte ones (as they are always legal) are to this state.
- */
-
-# define NUM_CLASSES 18
-# define N0 0
-# define N1 ((N0) + NUM_CLASSES)
-# define N2 ((N1) + NUM_CLASSES)
-# define N3 ((N2) + NUM_CLASSES)
-# define N4 ((N3) + NUM_CLASSES)
-# define N5 ((N4) + NUM_CLASSES)
-# define N6 ((N5) + NUM_CLASSES)
-# define N7 ((N6) + NUM_CLASSES)
-# define N8 ((N7) + NUM_CLASSES)
-# define N9 ((N8) + NUM_CLASSES)
-# define N10 ((N9) + NUM_CLASSES)
-
-/*Class: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 */
-/*N0*/ 0, 1,N1,N2,N3,N4,N5, 1, 1, 1, 1, 1, 1,N6,N7,N8,N9,N10,
-/*N1*/ 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
-/*N2*/ 1, 1, 1, 1, 1, 1, 1,N1,N1,N1,N1,N1,N1, 1, 1, 1, 1, 1,
-/*N3*/ 1, 1, 1, 1, 1, 1, 1,N2,N2,N2,N2,N2,N2, 1, 1, 1, 1, 1,
-/*N4*/ 1, 1, 1, 1, 1, 1, 1,N3,N3,N3,N3,N3,N3, 1, 1, 1, 1, 1,
-/*N5*/ 1, 1, 1, 1, 1, 1, 1,N4,N4,N4,N4,N4,N4, 1, 1, 1, 1, 1,
-
-/*N6*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,N1, 1, 1, 1, 1, 1,
-/*N7*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,N2,N2, 1, 1, 1, 1, 1,
-/*N8*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,N3,N3,N3, 1, 1, 1, 1, 1,
-/*N9*/ 1, 1, 1, 1, 1, 1, 1, 1, 1,N4,N4,N4,N4, 1, 1, 1, 1, 1,
-/*N10*/ 1, 1, 1, 1, 1, 1, 1, 1,N5,N5,N5,N5,N5, 1, 1, 1, 1, 1,
-};
-
-#endif
-
/*
=for apidoc utf8n_to_uvchr_error
*/
UV
-Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
+Perl_utf8n_to_uvchr_error(const U8 *s,
STRLEN curlen,
STRLEN *retlen,
const U32 flags,
*/
UV
-Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
+Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
STRLEN curlen,
STRLEN *retlen,
const U32 flags,
{
const U8 * const s0 = s;
const U8 * send = s0 + curlen;
- U32 possible_problems = 0; /* A bit is set here for each potential problem
- found as we go along */
- UV uv = (UV) -1;
- STRLEN expectlen = 0; /* How long should this sequence be?
- (initialized to silence compilers' wrong
- warning) */
- STRLEN avail_len = 0; /* When input is too short, gives what that is */
- U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL;
- this gets set and discarded */
+ U32 possible_problems; /* A bit is set here for each potential problem
+ found as we go along */
+ UV uv;
+ STRLEN expectlen; /* How long should this sequence be? */
+ STRLEN avail_len; /* When input is too short, gives what that is */
+ U32 discard_errors; /* Used to save branches when 'errors' is NULL; this
+ gets set and discarded */
/* The below are used only if there is both an overlong malformation and a
* too short one. Otherwise the first two are set to 's0' and 'send', and
* the third not used at all */
- U8 * adjusted_s0 = (U8 *) s0;
+ U8 * adjusted_s0;
U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
routine; see [perl #130921] */
- UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
+ UV uv_so_far;
+ dTHX;
+
+ PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
+
+ /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
+ * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
+ * syllables that the dfa doesn't properly handle. Quickly dispose of the
+ * final case. */
+
+#ifndef EBCDIC
+
+ /* Each of the affected Hanguls starts with \xED */
+
+ if (is_HANGUL_ED_utf8_safe(s0, send)) {
+ if (retlen) {
+ *retlen = 3;
+ }
+ if (errors) {
+ *errors = 0;
+ }
+ if (msgs) {
+ *msgs = NULL;
+ }
- UV state = 0;
+ return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
+ | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
+ | (s0[2] & UTF_CONTINUATION_MASK);
+ }
- PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
+#endif
+
+ /* In conjunction with the exhaustive tests that can be enabled in
+ * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
+ * what it is intended to do, and that no flaws in it are masked by
+ * dropping down and executing the code below
+ assert(! isUTF8_CHAR(s0, send)
+ || UTF8_IS_SURROGATE(s0, send)
+ || UTF8_IS_SUPER(s0, send)
+ || UTF8_IS_NONCHAR(s0,send));
+ */
+
+ s = s0;
+ uv = *s0;
+ possible_problems = 0;
+ expectlen = 0;
+ avail_len = 0;
+ discard_errors = 0;
+ adjusted_s0 = (U8 *) s0;
+ uv_so_far = 0;
if (errors) {
*errors = 0;
*retlen = expectlen;
}
- /* An invariant is trivially well-formed */
- if (UTF8_IS_INVARIANT(*s0)) {
- return *s0;
- }
-
-#ifndef EBCDIC
-
- /* Measurements show that this dfa is somewhat faster than the regular code
- * below, so use it first, dropping down for the non-normal cases. */
-
-# define PERL_UTF8_DECODE_REJECT 1
-
- while (s < send && LIKELY(state != PERL_UTF8_DECODE_REJECT)) {
- UV type = dfa_tab_for_perl[*s];
-
- uv = (state == 0)
- ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
- : UTF8_ACCUMULATE(uv, *s);
- state = dfa_tab_for_perl[256 + state + type];
-
- if (state == 0) {
-
- /* If this could be a code point that the flags don't allow (the first
- * surrogate is the first such possible one), delve further, but we already
- * have calculated 'uv' */
- if ( (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
- |UTF8_DISALLOW_PERL_EXTENDED
- |UTF8_WARN_ILLEGAL_INTERCHANGE
- |UTF8_WARN_PERL_EXTENDED))
- && uv >= UNICODE_SURROGATE_FIRST)
- {
- curlen = s + 1 - s0;
- goto got_uv;
- }
-
- return UNI_TO_NATIVE(uv);
- }
-
- s++;
- }
-
- /* Here, is some sort of failure. Use the full mechanism */
-
- uv = *s0;
-
-#endif
-
/* A continuation character can't start a valid sequence */
if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
possible_problems |= UTF8_GOT_CONTINUATION;
}
}
- got_uv:
-
/* Here, we have found all the possible problems, except for when the input
* is for a problematic code point not allowed by the input parameters. */
return _to_upper_title_latin1((U8) c, p, lenp, 'S');
}
- uvchr_to_utf8(p, c);
- return CALL_UPPER_CASE(c, p, p, lenp);
+ return CALL_UPPER_CASE(c, NULL, p, lenp);
}
UV
return _to_upper_title_latin1((U8) c, p, lenp, 's');
}
- uvchr_to_utf8(p, c);
- return CALL_TITLE_CASE(c, p, p, lenp);
+ return CALL_TITLE_CASE(c, NULL, p, lenp);
}
STATIC U8
return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
}
- uvchr_to_utf8(p, c);
- return CALL_LOWER_CASE(c, p, p, lenp);
+ return CALL_LOWER_CASE(c, NULL, p, lenp);
}
UV
/* Here, above 255. If no special needs, just use the macro */
if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
- uvchr_to_utf8(p, c);
- return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
+ return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
}
else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
the special flags. */
{
PERL_ARGS_ASSERT__IS_UTF8_MARK;
- return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
+ return is_utf8_common(p, NULL, "IsM", PL_utf8_mark);
}
STATIC UV
/* Here, there was no mapping defined, which means that the code point maps
* to itself. Return the inputs */
cases_to_self:
- len = UTF8SKIP(p);
- if (p != ustrp) { /* Don't copy onto itself */
- Copy(p, ustrp, len, U8);
+ if (p) {
+ len = UTF8SKIP(p);
+ if (p != ustrp) { /* Don't copy onto itself */
+ Copy(p, ustrp, len, U8);
+ }
+ *lenp = len;
+ }
+ else {
+ *lenp = uvchr_to_utf8(ustrp, uv1) - ustrp;
}
-
- if (lenp)
- *lenp = len;
return uv1;
PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
+ PL_utf8_mark = _new_invlist_C_array(PL_uni_prop_ptrs[PL_M]);
}
SV *