*** a/src/backend/utils/mb/mbutils.c --- b/src/backend/utils/mb/mbutils.c *************** *** 710,715 **** pg_encoding_mb2wchar_with_len(int encoding, --- 710,737 ---- return (*pg_wchar_table[encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len); } + /* convert a wchar string to a multibyte */ + int + pg_wchar2mb(const pg_wchar *from, char *to) + { + return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *)to, pg_wchar_strlen(from)); + } + + /* convert a wchar string to a multibyte with a limited length */ + int + pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len) + { + return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *)to, len); + } + + /* same, with any encoding */ + int + pg_encoding_wchar2mb_with_len(int encoding, + const pg_wchar *from, char *to, int len) + { + return (*pg_wchar_table[encoding].wchar2mb_with_len) (from, (unsigned char *)to, len); + } + /* returns the byte length of a multibyte character */ int pg_mblen(const char *mbstr) *** a/src/backend/utils/mb/wchar.c --- b/src/backend/utils/mb/wchar.c *************** *** 340,345 **** pg_euctw_dsplen(const unsigned char *s) --- 340,393 ---- } /* + * Convert pg_wchar to EUC_* encoding. + * caller must allocate enough space for "to", including a trailing zero! + * len: length of from. + * "from" not necessarily null terminated. + */ + static int + pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len) + { + int cnt = 0; + + while (len > 0 && *from) + { + unsigned char c; + + if ((c = *from >> 24)) + { + *to++ = c; + *to++ = (*from >> 16) & 0xff; + *to++ = (*from >> 8) & 0xff; + *to++ = *from & 0xff; + cnt += 4; + } + else if ((c = *from >> 16)) + { + *to++ = c; + *to++ = (*from >> 8) & 0xff; + *to++ = *from & 0xff; + cnt += 3; + } + else if ((c = *from >> 8)) + { + *to++ = c; + *to++ = *from & 0xff; + cnt += 2; + } + else + { + *to++ = *from; + cnt++; + } + len--; + } + *to = 0; + return cnt; + } + + + /* * JOHAB */ static int *************** *** 453,458 **** unicode_to_utf8(pg_wchar c, unsigned char *utf8string) --- 501,530 ---- return utf8string; } + /* + * Trivial conversion from pg_wchar to UTF-8. + * caller should allocate enough space for "to" + * len: length of from. + * "from" not necessarily null terminated. + */ + static int + pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len) + { + int cnt = 0; + + while (len > 0 && *from) + { + int char_len; + + unicode_to_utf8(*from, to); + char_len = pg_utf_mblen(to); + len--; + cnt += char_len; + to += char_len; + } + *to = 0; + return cnt; + } /* * Return the byte length of a UTF8 character pointed to by s *************** *** 719,724 **** pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) --- 791,848 ---- return cnt; } + /* + * convert pg_wchar to mule internal code + * caller should allocate enough space for "to" + * len: length of from. + * "from" not necessarily null terminated. + */ + static int + pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len) + { + int cnt = 0; + unsigned char lb; + + while (len > 0 && *from) + { + /* + * LBPRV1 is not used in any PostgreSQL supported encodings, it can be + * safely ignored. LCPRV2 is only used for Chinese encodings (EUC_TW + * and BIG5), so prefix byte is 0x9d. + */ + lb = (*from >> 16) & 0xff; + if (IS_LC1(lb)) + { + *to++ = lb; + *to++ = *from & 0xff; + cnt += 2; + } + else if (IS_LC2(lb)) + { + *to++ = lb; + *to++ = (*from >> 8) & 0xff; + *to++ = *from & 0xff; + cnt += 3; + } + else if (IS_LBPRV2(lb)) + { + *to++ = 0x9d; + *to++ = lb; + *to++ = (*from >> 8) & 0xff; + *to++ = *from & 0xff; + cnt += 4; + } + else + { + *to++ = lb; + cnt += 1; + } + len--; + } + *to = 0; + return cnt; + } + int pg_mule_mblen(const unsigned char *s) { *************** *** 774,779 **** pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len) --- 898,925 ---- return cnt; } + /* + * Trivial conversion from pg_wchar to single byte encoding. Just ignores + * high bits. + * caller should allocate enough space for "to" + * len: length of from. + * "from" not necessarily null terminated. + */ + static int + pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len) + { + int cnt = 0; + + while (len > 0 && *from) + { + *to++ = *from++; + len--; + cnt++; + } + *to = 0; + return cnt; + } + static int pg_latin1_mblen(const unsigned char *s) { *************** *** 1550,1597 **** pg_eucjp_increment(unsigned char *charptr, int length) *------------------------------------------------------------------- */ pg_wchar_tbl pg_wchar_table[] = { ! {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */ ! {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */ ! {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */ ! {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* PG_EUC_KR */ ! {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4}, /* PG_EUC_TW */ ! {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JIS_2004 */ ! {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* PG_UTF8 */ ! {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4}, /* PG_MULE_INTERNAL */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */ ! {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */ ! {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */ ! {0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */ ! {0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */ ! {0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* PG_UHC */ ! {0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */ ! {0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */ ! {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */ }; /* returns the byte length of a word for mule internal code */ --- 1696,1743 ---- *------------------------------------------------------------------- */ pg_wchar_tbl pg_wchar_table[] = { ! {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */ ! {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */ ! {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */ ! {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* PG_EUC_KR */ ! {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4}, /* PG_EUC_TW */ ! {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JIS_2004 */ ! {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* PG_UTF8 */ ! {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4}, /* PG_MULE_INTERNAL */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */ ! {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */ ! {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */ ! {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */ ! {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */ ! {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* PG_UHC */ ! {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */ ! {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */ ! {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */ }; /* returns the byte length of a word for mule internal code */ *** a/src/include/mb/pg_wchar.h --- b/src/include/mb/pg_wchar.h *************** *** 58,63 **** typedef unsigned int pg_wchar; --- 58,67 ---- * Is a prefix byte for "private" multibyte encodings? */ #define IS_LCPRV2(c) ((unsigned char)(c) == 0x9c || (unsigned char)(c) == 0x9d) + /* + * Is a leading byte for "private" multibyte encodings? + */ + #define IS_LBPRV2(c) ((unsigned char)(c) >= 0xf0) /*---------------------------------------------------- * leading characters *************** *** 277,283 **** extern pg_enc2gettext pg_enc2gettext_tbl[]; * pg_wchar stuff */ typedef int (*mb2wchar_with_len_converter) (const unsigned char *from, ! pg_wchar *to, int len); typedef int (*mblen_converter) (const unsigned char *mbstr); --- 281,291 ---- * pg_wchar stuff */ typedef int (*mb2wchar_with_len_converter) (const unsigned char *from, ! pg_wchar *to, ! int len); ! ! typedef int (*wchar2mb_with_len_converter) (const pg_wchar *from, ! unsigned char *to, int len); typedef int (*mblen_converter) (const unsigned char *mbstr); *************** *** 290,297 **** typedef int (*mbverifier) (const unsigned char *mbstr, int len); typedef struct { ! mb2wchar_with_len_converter mb2wchar_with_len; /* convert a multibyte ! * string to a wchar */ mblen_converter mblen; /* get byte length of a char */ mbdisplaylen_converter dsplen; /* get display width of a char */ mbverifier mbverify; /* verify multibyte sequence */ --- 298,307 ---- typedef struct { ! mb2wchar_with_len_converter mb2wchar_with_len; /* convert a multibyte ! * string to a wchar */ ! wchar2mb_with_len_converter wchar2mb_with_len; /* convert a wchar ! * string to a multibyte */ mblen_converter mblen; /* get byte length of a char */ mbdisplaylen_converter dsplen; /* get display width of a char */ mbverifier mbverify; /* verify multibyte sequence */ *************** *** 372,377 **** extern int pg_mb2wchar(const char *from, pg_wchar *to); --- 382,391 ---- extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len); extern int pg_encoding_mb2wchar_with_len(int encoding, const char *from, pg_wchar *to, int len); + extern int pg_wchar2mb(const pg_wchar *from, char *to); + extern int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len); + extern int pg_encoding_wchar2mb_with_len(int encoding, + const pg_wchar *from, char *to, int len); extern int pg_char_and_wchar_strcmp(const char *s1, const pg_wchar *s2); extern int pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n); extern int pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n);