- itemsize = len;
- if (DO_UTF8(sv)) {
- itemsize = sv_len_utf8(sv);
- if (itemsize != (I32)len) {
- I32 itembytes;
- if (itemsize <= fieldsize) {
- const char *send = chophere = s + itemsize;
- while (s < send) {
- if (*s == '\r') {
- itemsize = s - item;
- chophere = s;
- break;
- }
- if (! isCNTRL(*s))
- gotsome = TRUE;
- s++;
- }
- }
- else {
- const char *send;
- itemsize = fieldsize;
- itembytes = itemsize;
- sv_pos_u2b(sv, &itembytes, 0);
- send = chophere = s + itembytes;
- while (s < send || (s == send && isSPACE(*s))) {
- if (isSPACE(*s)) {
- if (chopspace)
- chophere = s;
- if (*s == '\r')
- break;
- }
- else {
- if (! isCNTRL(*s))
- gotsome = TRUE;
- if (strchr(PL_chopset, *s))
- chophere = s + 1;
- }
- s++;
- }
- itemsize = chophere - item;
- sv_pos_b2u(sv, &itemsize);
- }
- item_is_utf8 = TRUE;
- break;
- }
- }
- item_is_utf8 = FALSE;
- if (itemsize <= fieldsize) {
- const char *const send = chophere = s + itemsize;
- while (s < send) {
- if (*s == '\r') {
- itemsize = s - item;
- chophere = s;
- break;
- }
- if (! isCNTRL(*s))
- gotsome = TRUE;
+ const char *send = s + len;
+ I32 size = 0;
+
+ chophere = NULL;
+ item_is_utf8 = DO_UTF8(sv);
+ while (s < send) {
+ /* look for a legal split position */
+ if (isSPACE(*s)) {
+ if (*s == '\r') {
+ chophere = s;
+ itemsize = size;
+ break;
+ }
+ if (chopspace) {
+ /* provisional split point */
+ chophere = s;
+ itemsize = size;
+ }
+ /* we delay testing fieldsize until after we've
+ * processed the possible split char directly
+ * following the last field char; so if fieldsize=3
+ * and item="a b cdef", we consume "a b", not "a".
+ * Ditto further down.
+ */
+ if (size == fieldsize)
+ break;
+ }
+ else {
+ if (strchr(PL_chopset, *s)) {
+ /* provisional split point */
+ /* for a non-space split char, we include
+ * the split char; hence the '+1' */
+ chophere = s + 1;
+ itemsize = size;
+ }
+ if (size == fieldsize)
+ break;
+ if (!isCNTRL(*s))
+ gotsome = TRUE;
+ }
+
+ if (item_is_utf8)
+ s += UTF8SKIP(s);
+ else