To: vim-dev@vim.org Subject: Patch 7.2.312 Fcc: outbox From: Bram Moolenaar Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ------------ Patch 7.2.312 Problem: iconv() returns an invalid character sequence when conversion fails. It should return an empty string. (Yongwei Wu) Solution: Be more strict about invalid characters in the input. Files: src/mbyte.c *** ../vim-7.2.311/src/mbyte.c 2009-06-16 15:23:07.000000000 +0200 --- src/mbyte.c 2009-11-25 16:10:44.000000000 +0100 *************** *** 133,154 **** static int dbcs_ptr2cells_len __ARGS((char_u *p, int size)); static int dbcs_ptr2char __ARGS((char_u *p)); ! /* Lookup table to quickly get the length in bytes of a UTF-8 character from ! * the first byte of a UTF-8 string. Bytes which are illegal when used as the ! * first byte have a one, because these will be used separately. */ static char utf8len_tab[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, ! 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /*bogus*/ ! 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /*bogus*/ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1, }; /* * XIM often causes trouble. Define XIM_DEBUG to get a log of XIM callbacks * in the "xim.log" file. */ --- 133,172 ---- static int dbcs_ptr2cells_len __ARGS((char_u *p, int size)); static int dbcs_ptr2char __ARGS((char_u *p)); ! /* ! * Lookup table to quickly get the length in bytes of a UTF-8 character from ! * the first byte of a UTF-8 string. ! * Bytes which are illegal when used as the first byte have a 1. ! * The NUL byte has length 1. ! 
*/ static char utf8len_tab[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, ! 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, ! 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1, }; /* + * Like utf8len_tab above, but using a zero for illegal lead bytes. + */ + static char utf8len_tab_zero[256] = + { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0, + }; + + /* * XIM often causes trouble. Define XIM_DEBUG to get a log of XIM callbacks * in the "xim.log" file. */ *************** *** 1352,1358 **** if (size > 0 && *p >= 0x80) { if (utf_ptr2len_len(p, size) < utf8len_tab[*p]) ! return 1; c = utf_ptr2char(p); /* An illegal byte is displayed as <xx>. */ if (utf_ptr2len(p) == 1 || c == NUL) --- 1370,1376 ---- if (size > 0 && *p >= 0x80) { if (utf_ptr2len_len(p, size) < utf8len_tab[*p]) ! return 1; /* truncated */ c = utf_ptr2char(p); /* An illegal byte is displayed as <xx>. */ if (utf_ptr2len(p) == 1 || c == NUL) *************** *** 1473,1479 **** if (p[0] < 0x80) /* be quick for ASCII */ return p[0]; ! 
len = utf8len_tab[p[0]]; if (len > 1 && (p[1] & 0xc0) == 0x80) { if (len == 2) --- 1491,1497 ---- if (p[0] < 0x80) /* be quick for ASCII */ return p[0]; ! len = utf8len_tab_zero[p[0]]; if (len > 1 && (p[1] & 0xc0) == 0x80) { if (len == 2) *************** *** 1723,1728 **** --- 1741,1747 ---- /* * Return length of UTF-8 character, obtained from the first byte. * "b" must be between 0 and 255! + * Returns 1 for an invalid first byte value. */ int utf_byte2len(b) *************** *** 1737,1742 **** --- 1756,1762 ---- * Returns 1 for "". * Returns 1 for an illegal byte sequence (also in incomplete byte seq.). * Returns number > "size" for an incomplete byte sequence. + * Never returns zero. */ int utf_ptr2len_len(p, size) *************** *** 1747,1757 **** int i; int m; ! if (*p == NUL) ! return 1; ! m = len = utf8len_tab[*p]; if (len > size) m = size; /* incomplete byte sequence. */ for (i = 1; i < m; ++i) if ((p[i] & 0xc0) != 0x80) return 1; --- 1767,1779 ---- int i; int m; ! len = utf8len_tab[*p]; ! if (len == 1) ! return 1; /* NUL, ascii or illegal lead byte */ if (len > size) m = size; /* incomplete byte sequence. */ + else + m = len; for (i = 1; i < m; ++i) if ((p[i] & 0xc0) != 0x80) return 1; *************** *** 2505,2510 **** --- 2527,2533 ---- /* * mb_head_off() function pointer. * Return offset from "p" to the first byte of the character it points into. + * If "p" points to the NUL at the end of the string return 0. * Returns 0 when already at the first byte of a character. */ int *************** *** 2524,2530 **** /* It can't be a trailing byte when not using DBCS, at the start of the * string or the previous byte can't start a double-byte. */ ! if (p <= base || MB_BYTE2LEN(p[-1]) == 1) return 0; /* This is slow: need to start at the base and go forward until the --- 2547,2553 ---- /* It can't be a trailing byte when not using DBCS, at the start of the * string or the previous byte can't start a double-byte. */ ! 
if (p <= base || MB_BYTE2LEN(p[-1]) == 1 || *p == NUL) return 0; /* This is slow: need to start at the base and go forward until the *************** *** 2552,2558 **** * lead byte in the current cell. */ if (p <= base || (enc_dbcs == DBCS_JPNU && p[-1] == 0x8e) ! || MB_BYTE2LEN(p[-1]) == 1) return 0; /* This is slow: need to start at the base and go forward until the --- 2575,2582 ---- * lead byte in the current cell. */ if (p <= base || (enc_dbcs == DBCS_JPNU && p[-1] == 0x8e) ! || MB_BYTE2LEN(p[-1]) == 1 ! || *p == NUL) return 0; /* This is slow: need to start at the base and go forward until the *************** *** 2578,2583 **** --- 2602,2608 ---- char_u *q; char_u *s; int c; + int len; #ifdef FEAT_ARABIC char_u *j; #endif *************** *** 2597,2604 **** --q; /* Check for illegal sequence. Do allow an illegal byte after where we * started. */ ! if (utf8len_tab[*q] != (int)(s - q + 1) ! && utf8len_tab[*q] != (int)(p - q + 1)) return 0; if (q <= base) --- 2622,2629 ---- --q; /* Check for illegal sequence. Do allow an illegal byte after where we * started. */ ! len = utf8len_tab[*q]; ! if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) return 0; if (q <= base) *************** *** 2810,2818 **** while (end == NULL ? *p != NUL : p < end) { ! if ((*p & 0xc0) == 0x80) return FALSE; /* invalid lead byte */ - l = utf8len_tab[*p]; if (end != NULL && p + l > end) return FALSE; /* incomplete byte sequence */ ++p; --- 2835,2843 ---- while (end == NULL ? *p != NUL : p < end) { ! l = utf8len_tab_zero[*p]; ! if (l == 0) return FALSE; /* invalid lead byte */ if (end != NULL && p + l > end) return FALSE; /* incomplete byte sequence */ ++p; *************** *** 6117,6128 **** d = retval; for (i = 0; i < len; ++i) { ! l = utf_ptr2len(ptr + i); if (l == 0) *d++ = NUL; else if (l == 1) { ! if (unconvlenp != NULL && utf8len_tab[ptr[i]] > len - i) { /* Incomplete sequence at the end. */ *unconvlenp = len - i; --- 6142,6161 ---- d = retval; for (i = 0; i < len; ++i) { ! 
l = utf_ptr2len_len(ptr + i, len - i); if (l == 0) *d++ = NUL; else if (l == 1) { ! int l_w = utf8len_tab_zero[ptr[i]]; ! ! if (l_w == 0) ! { ! /* Illegal utf-8 byte cannot be converted */ ! vim_free(retval); ! return NULL; ! } ! if (unconvlenp != NULL && l_w > len - i) { /* Incomplete sequence at the end. */ *unconvlenp = len - i; *** ../vim-7.2.311/src/version.c 2009-12-02 13:32:10.000000000 +0100 --- src/version.c 2009-12-02 15:00:23.000000000 +0100 *************** *** 683,684 **** --- 683,686 ---- { /* Add new patch number below this line */ + /**/ + 312, /**/ -- hundred-and-one symptoms of being an internet addict: 6. You refuse to go to a vacation spot with no electricity and no phone lines. /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\ /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\ \\\ download, build and distribute -- http://www.A-A-P.org /// \\\ help me help AIDS victims -- http://ICCF-Holland.org ///