This function is for code that wants any warning and/or error messages to be
returned to the caller rather than be displayed. All messages that would have
-been displayed if all lexcial warnings are enabled will be returned.
+been displayed if all lexical warnings are enabled will be returned.
It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
placed after all the others, C<msgs>. If this parameter is 0, this function
STRLEN len = OFFUNISKIP(uv);
U8 *p = d+len-1;
while (p > d) {
- *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
- uv >>= UTF_ACCUMULATION_SHIFT;
+ *p-- = I8_TO_NATIVE_UTF8((uv & MASK) | MARK);
+ uv >>= SHIFT;
}
*p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
return d+len;
*/
UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
- STRLEN curlen,
- STRLEN *retlen,
- const U32 flags)
+Perl_utf8n_to_uvchr(const U8 *s,
+ STRLEN curlen,
+ STRLEN *retlen,
+ const U32 flags)
{
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
}
-/* The tables below come from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/,
- * which requires this copyright notice */
-
-/* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#if 0
-static U8 utf8d_C9[] = {
- /* The first part of the table maps bytes to character classes that
- * to reduce the size of the transition table and create bitmasks. */
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/
- 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/
- 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, /*-FF*/
-
- /* The second part is a transition table that maps a combination
- * of a state of the automaton and a character class to a state. */
- 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
- 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
- 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
- 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
- 12,36,12,12,12,12,12,12,12,12,12,12
-};
-
-#endif
-
-#ifndef EBCDIC
-
-/* This is a version of the above table customized for Perl that doesn't
- * exclude surrogates and accepts start bytes up through F7 (representing
- * 2**21 - 1). */
-static U8 dfa_tab_for_perl[] = {
- /* The first part of the table maps bytes to character classes to reduce
- * the size of the transition table and create bitmasks. */
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/
- 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/
- 10,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 11,4,4,4,4,4,4,4,8,8,8,8,8,8,8,8, /*-FF*/
-
- /* The second part is a transition table that maps a combination
- * of a state of the automaton and a character class to a state. */
- 0,12,24,36,96,12,12,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,/*23*/
- 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,/*47*/
- 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,/*71*/
- 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,/*95*/
- 12,36,12,12,12,12,12,36,12,36,12,12 /* 96- 107 */
-
- /* The customization was to repurpose the surrogates type '4' to instead be
- * for start bytes F1-F7. Types 5 and 6 are now unused, and their entries in
- * the transition part of the table are set to 12, so are illegal.
- *
- * To do higher code points would require expansion and some rearrangement of
- * the table. The type '1' entries for continuation bytes 80-8f would have to
- * be split into several types, because they aren't treated uniformly for
- * higher start bytes, since overlongs for F8 are 80-87; FC: 80-83; and FE:
- * 80-81. We start needing to worry about overflow if FE is included.
- * Ignoring, FE and FF, we could use type 5 for F9-FB, and 6 for FD (remember
- * from the web site that these are used to right shift). FE would
- * necessarily be type 7; and FF, type 8. And new states would have to be
- * created for F8 and FC (and FE and FF if used), so quite a bit of work would
- * be involved.
- *
- * XXX Better would be to customize the table so that the noncharacters are
- * excluded. This again is non trivial, but doing so would simplify the code
- * that uses this, and might make it small enough to make it inlinable */
-};
-
-#endif
-
/*
=for apidoc utf8n_to_uvchr_error
*/
UV
-Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
+Perl_utf8n_to_uvchr_error(const U8 *s,
STRLEN curlen,
STRLEN *retlen,
const U32 flags,
*/
UV
-Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
+Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
STRLEN curlen,
STRLEN *retlen,
const U32 flags,
{
const U8 * const s0 = s;
const U8 * send = s0 + curlen;
- U32 possible_problems = 0; /* A bit is set here for each potential problem
- found as we go along */
- UV uv = (UV) -1;
- STRLEN expectlen = 0; /* How long should this sequence be?
- (initialized to silence compilers' wrong
- warning) */
- STRLEN avail_len = 0; /* When input is too short, gives what that is */
- U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL;
- this gets set and discarded */
+ U32 possible_problems; /* A bit is set here for each potential problem
+ found as we go along */
+ UV uv;
+ STRLEN expectlen; /* How long should this sequence be? */
+ STRLEN avail_len; /* When input is too short, gives what that is */
+ U32 discard_errors; /* Used to save branches when 'errors' is NULL; this
+ gets set and discarded */
/* The below are used only if there is both an overlong malformation and a
* too short one. Otherwise the first two are set to 's0' and 'send', and
* the third not used at all */
- U8 * adjusted_s0 = (U8 *) s0;
+ U8 * adjusted_s0;
U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
routine; see [perl #130921] */
- UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
+ UV uv_so_far;
+ dTHX;
+
+ PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
+
+ /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
+ * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
+ * syllables that the dfa doesn't properly handle. Quickly dispose of the
+ * final case. */
+
+#ifndef EBCDIC
+
+ /* Each of the affected Hanguls starts with \xED */
+
+ if (is_HANGUL_ED_utf8_safe(s0, send)) {
+ if (retlen) {
+ *retlen = 3;
+ }
+ if (errors) {
+ *errors = 0;
+ }
+ if (msgs) {
+ *msgs = NULL;
+ }
+
+ return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
+ | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
+ | (s0[2] & UTF_CONTINUATION_MASK);
+ }
- UV state = 0;
+#endif
- PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
+ /* In conjunction with the exhaustive tests that can be enabled in
+ * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
+ * what it is intended to do, and that no flaws in it are masked by
+ * dropping down and executing the code below
+ assert(! isUTF8_CHAR(s0, send)
+ || UTF8_IS_SURROGATE(s0, send)
+ || UTF8_IS_SUPER(s0, send)
+ || UTF8_IS_NONCHAR(s0,send));
+ */
+
+ s = s0;
+ uv = *s0;
+ possible_problems = 0;
+ expectlen = 0;
+ avail_len = 0;
+ discard_errors = 0;
+ adjusted_s0 = (U8 *) s0;
+ uv_so_far = 0;
if (errors) {
*errors = 0;
*retlen = expectlen;
}
- /* An invariant is trivially well-formed */
- if (UTF8_IS_INVARIANT(*s0)) {
- return *s0;
- }
-
-#ifndef EBCDIC
-
- /* Measurements show that this dfa is somewhat faster than the regular code
- * below, so use it first, dropping down for the non-normal cases. */
-
-# define PERL_UTF8_DECODE_REJECT 12
-
- while (s < send && LIKELY(state != PERL_UTF8_DECODE_REJECT)) {
- UV type = dfa_tab_for_perl[*s];
-
- if (state != 0) {
- uv = (*s & 0x3fu) | (uv << UTF_ACCUMULATION_SHIFT);
- state = dfa_tab_for_perl[256 + state + type];
- }
- else {
- uv = (0xff >> type) & (*s);
- state = dfa_tab_for_perl[256 + type];
- }
-
- if (state == 0) {
-
- /* If this could be a code point that the flags don't allow (the first
- * surrogate is the first such possible one), delve further, but we already
- * have calculated 'uv' */
- if ( (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
- |UTF8_WARN_ILLEGAL_INTERCHANGE))
- && uv >= UNICODE_SURROGATE_FIRST)
- {
- curlen = s + 1 - s0;
- goto got_uv;
- }
-
- return uv;
- }
-
- s++;
- }
-
- /* Here, is some sort of failure. Use the full mechanism */
-
- uv = *s0;
-
-#endif
-
/* A continuation character can't start a valid sequence */
if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
possible_problems |= UTF8_GOT_CONTINUATION;
}
}
- got_uv:
-
/* Here, we have found all the possible problems, except for when the input
* is for a problematic code point not allowed by the input parameters. */
return _to_upper_title_latin1((U8) c, p, lenp, 'S');
}
- uvchr_to_utf8(p, c);
- return CALL_UPPER_CASE(c, p, p, lenp);
+ return CALL_UPPER_CASE(c, NULL, p, lenp);
}
UV
return _to_upper_title_latin1((U8) c, p, lenp, 's');
}
- uvchr_to_utf8(p, c);
- return CALL_TITLE_CASE(c, p, p, lenp);
+ return CALL_TITLE_CASE(c, NULL, p, lenp);
}
STATIC U8
return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
}
- uvchr_to_utf8(p, c);
- return CALL_LOWER_CASE(c, p, p, lenp);
+ return CALL_LOWER_CASE(c, NULL, p, lenp);
}
UV
/* Here, above 255. If no special needs, just use the macro */
if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
- uvchr_to_utf8(p, c);
- return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
+ return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
}
else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
the special flags. */
{
PERL_ARGS_ASSERT__IS_UTF8_MARK;
- return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
+ return is_utf8_common(p, NULL, "IsM", PL_utf8_mark);
}
STATIC UV
/* Here, there was no mapping defined, which means that the code point maps
* to itself. Return the inputs */
cases_to_self:
- len = UTF8SKIP(p);
- if (p != ustrp) { /* Don't copy onto itself */
- Copy(p, ustrp, len, U8);
+ if (p) {
+ len = UTF8SKIP(p);
+ if (p != ustrp) { /* Don't copy onto itself */
+ Copy(p, ustrp, len, U8);
+ }
+ *lenp = len;
+ }
+ else {
+ *lenp = uvchr_to_utf8(ustrp, uv1) - ustrp;
}
-
- if (lenp)
- *lenp = len;
return uv1;
/* Get the 0th element, which is needed to setup the inversion list
* */
while (isSPACE(*l)) l++;
+ after_atou = (char *) lend;
if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
Perl_croak(aTHX_ "panic: Expecting a valid 0th element for"
" inversion list");
" elements than available", elements);
}
while (isSPACE(*l)) l++;
+ after_atou = (char *) lend;
if (!grok_atoUV((const char *)l, other_elements_ptr++,
&after_atou))
{
{
/* Set up the inversion list global variables */
- PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(PL_ASCII_invlist);
- PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(PL_XPOSIXALNUM_invlist);
- PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PL_XPOSIXALPHA_invlist);
- PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(PL_XPOSIXBLANK_invlist);
- PL_XPosix_ptrs[_CC_CASED] = _new_invlist_C_array(PL_CASED_invlist);
- PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PL_XPOSIXCNTRL_invlist);
- PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PL_XPOSIXDIGIT_invlist);
- PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PL_XPOSIXGRAPH_invlist);
- PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(PL_XPOSIXLOWER_invlist);
- PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(PL_XPOSIXPRINT_invlist);
- PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PL_XPOSIXPUNCT_invlist);
- PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(PL_XPOSIXSPACE_invlist);
- PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(PL_XPOSIXUPPER_invlist);
- PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(PL_VERTSPACE_invlist);
- PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PL_XPOSIXWORD_invlist);
- PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PL_XPOSIXXDIGIT_invlist);
+ PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_ASCII]);
+ PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXALNUM]);
+ PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXALPHA]);
+ PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXBLANK]);
+ PL_XPosix_ptrs[_CC_CASED] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_CASED]);
+ PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXCNTRL]);
+ PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXDIGIT]);
+ PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXGRAPH]);
+ PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXLOWER]);
+ PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXPRINT]);
+ PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXPUNCT]);
+ PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXSPACE]);
+ PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXUPPER]);
+ PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_VERTSPACE]);
+ PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXWORD]);
+ PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXXDIGIT]);
+
+ PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_ASCII]);
+ PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXALNUM]);
+ PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXALPHA]);
+ PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXBLANK]);
+ PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_CASED]);
+ PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXCNTRL]);
+ PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXDIGIT]);
+ PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXGRAPH]);
+ PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXLOWER]);
+ PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXPRINT]);
+ PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXPUNCT]);
+ PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXSPACE]);
+ PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXUPPER]);
+ PL_Posix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_VERTSPACE]);
+ PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXWORD]);
+ PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXXDIGIT]);
+
PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
- PL_Assigned_invlist = _new_invlist_C_array(PL_ASSIGNED_invlist);
PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
- PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
- PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
- PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
- PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
- PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
- PL_utf8_perl_idstart = _new_invlist_C_array(PL__PERL_IDSTART_invlist);
- PL_utf8_perl_idcont = _new_invlist_C_array(PL__PERL_IDCONT_invlist);
+
PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
- PL_utf8_foldable = _new_invlist_C_array(PL__PERL_ANY_FOLDS_invlist);
- PL_HasMultiCharFold = _new_invlist_C_array(
- PL__PERL_FOLDS_TO_MULTI_CHAR_invlist);
+
+ PL_Assigned_invlist = _new_invlist_C_array(PL_uni_prop_ptrs[PL_ASSIGNED]);
+
+ PL_utf8_perl_idstart = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_IDSTART]);
+ PL_utf8_perl_idcont = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_IDCONT]);
+
+ PL_utf8_charname_begin = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_CHARNAME_BEGIN]);
+ PL_utf8_charname_continue = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_CHARNAME_CONTINUE]);
+
+ PL_utf8_foldable = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_ANY_FOLDS]);
+ PL_HasMultiCharFold = _new_invlist_C_array(PL_uni_prop_ptrs[
+ PL__PERL_FOLDS_TO_MULTI_CHAR]);
PL_NonL1NonFinalFold = _new_invlist_C_array(
NonL1_Perl_Non_Final_Folds_invlist);
- PL_utf8_charname_begin = _new_invlist_C_array(PL__PERL_CHARNAME_BEGIN_invlist);
- PL_utf8_charname_continue = _new_invlist_C_array(PL__PERL_CHARNAME_CONTINUE_invlist);
+
+ PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
+ PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
+ PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
+ PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
+ PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
+ PL_utf8_mark = _new_invlist_C_array(PL_uni_prop_ptrs[PL_M]);
}
SV *
char* lookup_name;
bool stricter = FALSE;
+ bool is_nv_type = FALSE; /* nv= or numeric_value=, or possibly one
+ of the cjk numeric properties (though
+ it requires extra effort to compile
+ them) */
unsigned int i;
- unsigned int j = 0;
+ unsigned int j = 0, lookup_len;
int equals_pos = -1; /* Where the '=' is found, or negative if none */
+ int slash_pos = -1; /* Where the '/' is found, or negative if none */
int table_index = 0;
bool starts_with_In_or_Is = FALSE;
Size_t lookup_offset = 0;
}
/* Then check if it is one of these properties. This is hard-coded
- * because easier this way, and the list is unlikely to change */
- if ( memEQs(lookup_name + lookup_offset,
+ * because easier this way, and the list is unlikely to change. There
+ * are several properties like this in the Unihan DB, which is unlikely
+ * to be compiled, and they all end with 'numeric'. The interiors
+ * aren't checked for the precise property. This would stop working if
+ * a cjk property were to be created that ended with 'numeric' and
+ * wasn't a numeric type */
+ is_nv_type = memEQs(lookup_name + lookup_offset,
+ j - 1 - lookup_offset, "numericvalue")
+ || memEQs(lookup_name + lookup_offset,
+ j - 1 - lookup_offset, "nv")
+ || ( memENDPs(lookup_name + lookup_offset,
+ j - 1 - lookup_offset, "numeric")
+ && ( memBEGINPs(lookup_name + lookup_offset,
+ j - 1 - lookup_offset, "cjk")
+ || memBEGINPs(lookup_name + lookup_offset,
+ j - 1 - lookup_offset, "k")));
+ if ( is_nv_type
+ || memEQs(lookup_name + lookup_offset,
j - 1 - lookup_offset, "canonicalcombiningclass")
|| memEQs(lookup_name + lookup_offset,
j - 1 - lookup_offset, "ccc")
|| memEQs(lookup_name + lookup_offset,
- j - 1 - lookup_offset, "numericvalue")
- || memEQs(lookup_name + lookup_offset,
- j - 1 - lookup_offset, "nv")
- || memEQs(lookup_name + lookup_offset,
j - 1 - lookup_offset, "age")
|| memEQs(lookup_name + lookup_offset,
j - 1 - lookup_offset, "in")
* willy-nilly, as those could be a minus sign. Other stricter
* rules also apply. However, these properties all can have the
* rhs not be a number, in which case they contain at least one
- * alphabetic. In those cases, the stricter rules don't apply. We
- * first parse to look for alphas */
+ * alphabetic. In those cases, the stricter rules don't apply.
+ * But the numeric type properties can have the alphas [Ee] to
+ * signify an exponent, and it is still a number with stricter
+ * rules. So look for an alpha that signifys not-strict */
stricter = TRUE;
for (k = i; k < len; k++) {
- if (isALPHA(name[k])) {
+ if ( isALPHA(name[k])
+ && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
+ {
stricter = FALSE;
break;
}
/* We are now in a position to determine if this property should have
* been parsed using stricter rules. Only a few are like that, and
* unlikely to change. */
- if ( ( memBEGINPs(lookup_name, j, "perl")
- && memNEs(lookup_name + 4, j - 4, "space")
- && memNEs(lookup_name + 4, j - 4, "word"))
- || memEQs(lookup_name, j, "canondcij")
- || memEQs(lookup_name, j, "combabove"))
+ if ( memBEGINPs(lookup_name, j, "perl")
+ && memNEs(lookup_name + 4, j - 4, "space")
+ && memNEs(lookup_name + 4, j - 4, "word"))
{
stricter = TRUE;
lookup_name[j++] = cur;
- /* Unless this is a slash, we are done with it */
- if (cur != '/') {
+ /* Unless this is a non-trailing slash, we are done with it */
+ if (i >= len - 1 || cur != '/') {
continue;
}
+ slash_pos = j;
+
/* A slash in the 'numeric value' property indicates that what follows
* is a denominator. It can have a leading '+' and '0's that should be
* skipped. But we have never allowed a negative denominator, so treat
* a minus like every other character. (No need to rule out a second
* '/', as that won't match anything anyway */
- if ( memEQs(lookup_name + lookup_offset, equals_pos - lookup_offset,
- "nv=")
- || memEQs(lookup_name + lookup_offset, equals_pos - lookup_offset,
- "numericvalue="))
- {
+ if (is_nv_type) {
i++;
if (i < len && name[i] == '+') {
i++;
return NULL;
}
- starts_with_In_or_Is = true;
+ starts_with_In_or_Is = TRUE;
}
+ lookup_len = j; /* Use a more mnemonic name starting here */
+
/* Get the index into our pointer table of the inversion list corresponding
* to the property */
- table_index = match_uniprop((U8 *) lookup_name, j);
+ table_index = match_uniprop((U8 *) lookup_name, lookup_len);
/* If it didn't find the property */
if (table_index == 0) {
/* If didn't find the property, we try again stripping off any initial
* 'In' or 'Is' */
- if (! starts_with_In_or_Is) {
- return NULL;
- }
+ if (starts_with_In_or_Is) {
+ lookup_name += 2;
+ lookup_len -= 2;
+ equals_pos -= 2;
+ slash_pos -= 2;
- lookup_name += 2;
- j -= 2;
+ table_index = match_uniprop((U8 *) lookup_name, lookup_len);
+ }
- /* If still didn't find it, give up */
- table_index = match_uniprop((U8 *) lookup_name, j);
if (table_index == 0) {
- return NULL;
+ char * canonical;
+
+ /* If not found, and not a numeric type property, isn't a legal
+ * property */
+ if (! is_nv_type) {
+ return NULL;
+ }
+
+ /* But the numeric type properties need more work to decide. What
+ * we do is make sure we have the number in canonical form and look
+ * that up. */
+
+ if (slash_pos < 0) { /* No slash */
+
+ /* When it isn't a rational, take the input, convert it to a
+ * NV, then create a canonical string representation of that
+ * NV. */
+
+ NV value;
+
+ /* Get the value */
+ if (my_atof3(lookup_name + equals_pos, &value,
+ lookup_len - equals_pos)
+ != lookup_name + lookup_len)
+ {
+ return NULL;
+ }
+
+ /* If the value is an integer, the canonical value is integral */
+ if (Perl_ceil(value) == value) {
+ canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
+ equals_pos, lookup_name, value);
+ }
+ else { /* Otherwise, it is %e with a known precision */
+ canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
+ equals_pos, lookup_name,
+ PL_E_FORMAT_PRECISION, value);
+ }
+ }
+ else { /* Has a slash. Create a rational in canonical form */
+ UV numerator, denominator, gcd, trial;
+ const char * end_ptr;
+ const char * sign = "";
+
+ /* We can't just find the numerator, denominator, and do the
+ * division, then use the method above, because that is
+ * inexact. And the input could be a rational that is within
+ * epsilon (given our precision) of a valid rational, and would
+ * then incorrectly compare valid.
+ *
+ * We're only interested in the part after the '=' */
+ const char * this_lookup_name = lookup_name + equals_pos;
+ lookup_len -= equals_pos;
+ slash_pos -= equals_pos;
+
+ /* Handle any leading minus */
+ if (this_lookup_name[0] == '-') {
+ sign = "-";
+ this_lookup_name++;
+ lookup_len--;
+ slash_pos--;
+ }
+
+ /* Convert the numerator to numeric */
+ end_ptr = this_lookup_name + slash_pos;
+ if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
+ return NULL;
+ }
+
+ /* It better have included all characters before the slash */
+ if (*end_ptr != '/') {
+ return NULL;
+ }
+
+ /* Set to look at just the denominator */
+ this_lookup_name += slash_pos;
+ lookup_len -= slash_pos;
+ end_ptr = this_lookup_name + lookup_len;
+
+ /* Convert the denominator to numeric */
+ if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
+ return NULL;
+ }
+
+ /* It better be the rest of the characters, and don't divide by
+ * 0 */
+ if ( end_ptr != this_lookup_name + lookup_len
+ || denominator == 0)
+ {
+ return NULL;
+ }
+
+ /* Get the greatest common denominator using
+ http://en.wikipedia.org/wiki/Euclidean_algorithm */
+ gcd = numerator;
+ trial = denominator;
+ while (trial != 0) {
+ UV temp = trial;
+ trial = gcd % trial;
+ gcd = temp;
+ }
+
+ /* If already in lowest possible terms, we have already tried
+ * looking this up */
+ if (gcd == 1) {
+ return NULL;
+ }
+
+ /* Reduce the rational, which should put it in canonical form.
+ * Then look it up */
+ numerator /= gcd;
+ denominator /= gcd;
+
+ canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
+ equals_pos, lookup_name, sign, numerator, denominator);
+ }
+
+ /* Here, we have the number in canonical form. Try that */
+ table_index = match_uniprop((U8 *) canonical, strlen(canonical));
+ if (table_index == 0) {
+ return NULL;
+ }
}
}
if (to_fold) {
if ( table_index == PL_XPOSIXUPPER
|| table_index == PL_XPOSIXLOWER
- || table_index == PL_LT)
+ || table_index == PL_TITLE)
{
table_index = PL_CASED;
}
- else if ( table_index == PL_LU
- || table_index == PL_LL
- || table_index == PL_LT)
- {
- table_index = PL_L_AMP_;
+ else if ( table_index == PL_UPPERCASELETTER
+ || table_index == PL_LOWERCASELETTER
+#ifdef PL_TITLECASELETTER /* Missing from early Unicodes */
+ || table_index == PL_TITLECASELETTER
+#endif
+ ) {
+ table_index = PL_CASEDLETTER;
}
- else if ( table_index == PL_POSIXUPPER
- || table_index == PL_POSIXLOWER)
+ else if ( table_index == PL_POSIXUPPER
+ || table_index == PL_POSIXLOWER)
{
table_index = PL_POSIXALPHA;
}