From 6c6b7c6128e84f4053badbf777f4f57039f0e92a Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Wed, 6 Mar 2024 17:41:51 -0800 Subject: [PATCH v24 1/5] Support C.utf-8 locale in the new builtin collation provider. MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit ----- CATVERSION ----- The builtin C.utf-8 locale has similar semantics to the libc locale of the same name. That is, code point sort order (fast, memcmp-based) combined with Unicode semantics for character operations such as pattern matching, regular expressions, and LOWER()/INITCAP()/UPPER(). The character semantics are based on Unicode simple case mappings. The builtin provider's C.utf-8 offers several important advantages over libc: * faster sorting -- benefits from additional optimizations such as abbreviated keys and varstrfastcmp_c * faster case conversion, e.g. LOWER(), at least compared with some libc implementations * available on all platforms with identical semantics, and the semantics are stable, testable, and documentable within a given Postgres major version Being based on memcmp, the builtin C.utf-8 locale does not offer natural language sort order. But it is an improvement for most use cases that might otherwise use libc's "C.utf-8" locale, as well as many use cases that use libc's "C" locale. 
Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com Reviewed-by: Daniel Vérité, Peter Eisentraut, Jeremy Schneider --- doc/src/sgml/charset.sgml | 27 +++- doc/src/sgml/ref/create_collation.sgml | 2 +- doc/src/sgml/ref/create_database.sgml | 3 +- doc/src/sgml/ref/initdb.sgml | 2 +- src/backend/regex/regc_pg_locale.c | 36 ++++- src/backend/utils/adt/formatting.c | 123 +++++++++++++++++ src/backend/utils/adt/pg_locale.c | 13 +- src/bin/initdb/initdb.c | 16 ++- src/bin/initdb/t/001_initdb.pl | 17 +++ src/bin/pg_upgrade/t/002_pg_upgrade.pl | 2 +- src/bin/scripts/t/020_createdb.pl | 18 +++ src/include/catalog/pg_collation.dat | 3 + src/test/regress/expected/collate.utf8.out | 136 +++++++++++++++++++ src/test/regress/expected/collate.utf8_1.out | 8 ++ src/test/regress/parallel_schedule | 4 +- src/test/regress/sql/collate.utf8.sql | 67 +++++++++ 16 files changed, 467 insertions(+), 10 deletions(-) create mode 100644 src/test/regress/expected/collate.utf8.out create mode 100644 src/test/regress/expected/collate.utf8_1.out create mode 100644 src/test/regress/sql/collate.utf8.sql diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 7114eb7b52..55bbb20dac 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -377,13 +377,21 @@ initdb --locale-provider=icu --icu-locale=en The builtin provider uses built-in operations. Only - the C locale is supported for this provider. + the C and C.utf-8 locales are + supported for this provider. The C locale behavior is identical to the C locale in the libc provider. When using this locale, the behavior may depend on the database encoding. + + The C.utf-8 locale is available only when the + database encoding is utf-8, and the behavior is + based on Unicode. The collation uses the code point values only. The + regular expression character classes are based on the "POSIX + Compatible" semantics, and the case mapping is the "simple" variant.
+ @@ -878,6 +886,23 @@ SELECT * FROM test1 ORDER BY a || b COLLATE "fr_FR"; + + pg_c_utf8 + + + This collation sorts by Unicode code point values rather than natural + language order. For the functions lower, + initcap, and upper, it uses + Unicode simple case mapping. For pattern matching (including regular + expressions), it uses the POSIX Compatible variant of Unicode Compatibility + Properties. Behavior is efficient and stable within a + Postgres major version. This collation is + only available for encoding UTF8. + + + + C (equivalent to POSIX) diff --git a/doc/src/sgml/ref/create_collation.sgml b/doc/src/sgml/ref/create_collation.sgml index 98cd7d56be..85f18cbbe5 100644 --- a/doc/src/sgml/ref/create_collation.sgml +++ b/doc/src/sgml/ref/create_collation.sgml @@ -99,7 +99,7 @@ CREATE COLLATION [ IF NOT EXISTS ] name FROM If provider is builtin, then locale must be specified and set to - C. + either C or C.utf-8. diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index 6c1fd95602..1f5cdf1271 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -165,7 +165,8 @@ CREATE DATABASE name If is builtin, then locale - must be specified and set to C. + must be specified and set to either C or + C.utf-8. diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index 4760570f6a..08a1c2538f 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -289,7 +289,7 @@ PostgreSQL documentation If is builtin, must be specified and set to - C. + C or C.utf-8. 
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 6a26388bfa..85f3238eb0 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -16,6 +16,8 @@ */ #include "catalog/pg_collation.h" +#include "common/unicode_case.h" +#include "common/unicode_category.h" #include "utils/pg_locale.h" /* @@ -64,6 +66,7 @@ typedef enum { PG_REGEX_LOCALE_C, /* C locale (encoding independent) */ + PG_REGEX_BUILTIN, /* built-in Unicode semantics */ PG_REGEX_LOCALE_WIDE, /* Use functions */ PG_REGEX_LOCALE_1BYTE, /* Use functions */ PG_REGEX_LOCALE_WIDE_L, /* Use locale_t functions */ @@ -266,7 +269,12 @@ pg_set_regex_collation(Oid collation) if (GetDatabaseEncoding() == PG_UTF8) { if (pg_regex_locale) - pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + { + if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN) + pg_regex_strategy = PG_REGEX_BUILTIN; + else + pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + } else pg_regex_strategy = PG_REGEX_LOCALE_WIDE; } @@ -290,6 +298,8 @@ pg_wc_isdigit(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISDIGIT)); + case PG_REGEX_BUILTIN: + return pg_u_isdigit(c, true); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswdigit((wint_t) c); @@ -322,6 +332,8 @@ pg_wc_isalpha(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALPHA)); + case PG_REGEX_BUILTIN: + return pg_u_isalpha(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalpha((wint_t) c); @@ -354,6 +366,8 @@ pg_wc_isalnum(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALNUM)); + case PG_REGEX_BUILTIN: + return pg_u_isalnum(c, true); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalnum((wint_t) c); @@ -395,6 +409,8 @@ pg_wc_isupper(pg_wchar c) case PG_REGEX_LOCALE_C: 
return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISUPPER)); + case PG_REGEX_BUILTIN: + return pg_u_isupper(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswupper((wint_t) c); @@ -427,6 +443,8 @@ pg_wc_islower(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISLOWER)); + case PG_REGEX_BUILTIN: + return pg_u_islower(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswlower((wint_t) c); @@ -459,6 +477,8 @@ pg_wc_isgraph(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISGRAPH)); + case PG_REGEX_BUILTIN: + return pg_u_isgraph(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswgraph((wint_t) c); @@ -491,6 +511,8 @@ pg_wc_isprint(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPRINT)); + case PG_REGEX_BUILTIN: + return pg_u_isprint(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswprint((wint_t) c); @@ -523,6 +545,8 @@ pg_wc_ispunct(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPUNCT)); + case PG_REGEX_BUILTIN: + return pg_u_ispunct(c, true); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswpunct((wint_t) c); @@ -555,6 +579,8 @@ pg_wc_isspace(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISSPACE)); + case PG_REGEX_BUILTIN: + return pg_u_isspace(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswspace((wint_t) c); @@ -588,6 +614,8 @@ pg_wc_toupper(pg_wchar c) if (c <= (pg_wchar) 127) return pg_ascii_toupper((unsigned char) c); return c; + case PG_REGEX_BUILTIN: + return unicode_uppercase_simple(c); case PG_REGEX_LOCALE_WIDE: /* force C behavior for ASCII characters, per comments 
above */ if (c <= (pg_wchar) 127) @@ -628,6 +656,8 @@ pg_wc_tolower(pg_wchar c) if (c <= (pg_wchar) 127) return pg_ascii_tolower((unsigned char) c); return c; + case PG_REGEX_BUILTIN: + return unicode_lowercase_simple(c); case PG_REGEX_LOCALE_WIDE: /* force C behavior for ASCII characters, per comments above */ if (c <= (pg_wchar) 127) @@ -792,6 +822,9 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) max_chr = (pg_wchar) MAX_SIMPLE_CHR; #endif break; + case PG_REGEX_BUILTIN: + max_chr = (pg_wchar) MAX_SIMPLE_CHR; + break; case PG_REGEX_LOCALE_WIDE: case PG_REGEX_LOCALE_WIDE_L: max_chr = (pg_wchar) MAX_SIMPLE_CHR; @@ -809,6 +842,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) max_chr = (pg_wchar) MAX_SIMPLE_CHR; break; default: + Assert(false); max_chr = 0; /* can't get here, but keep compiler quiet */ break; } diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 5f483b8dbc..1a578b8a2b 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -77,6 +77,8 @@ #include "catalog/pg_collation.h" #include "catalog/pg_type.h" +#include "common/unicode_case.h" +#include "common/unicode_category.h" #include "mb/pg_wchar.h" #include "nodes/miscnodes.h" #include "parser/scansup.h" @@ -1679,6 +1681,33 @@ str_tolower(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = unicode_strlower(dst, dstsize, src, srclen); + if (needed + 1 > dstsize) + { + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = unicode_strlower(dst, dstsize, src, srclen); + Assert(needed + 1 == dstsize);
+ } + + result = dst; + } + else { Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC); @@ -1799,6 +1828,33 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = unicode_strupper(dst, dstsize, src, srclen); + if (needed + 1 > dstsize) + { + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = unicode_strupper(dst, dstsize, src, srclen); + Assert(needed + 1 == dstsize); + } + + result = dst; + } + else { Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC); @@ -1920,6 +1976,73 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const unsigned char *src = (unsigned char *) buff; + unsigned char *dst; + size_t dstsize = nbytes + 1; + int srcoff = 0; + int dstoff = 0; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* Start with input length plus NUL; enlarged below if a mapping grows */ + dst = (unsigned char *) palloc(dstsize); + + while (srcoff < nbytes) + { + pg_wchar u1 = utf8_to_unicode(src + srcoff); + pg_wchar u2; + int u1len = unicode_utf8len(u1); + int u2len; + + if (wasalnum) + u2 = unicode_lowercase_simple(u1); + else + u2 = unicode_uppercase_simple(u1); + + u2len = unicode_utf8len(u2); + + wasalnum = pg_u_isalnum(u2, true); + + /* + * If we can't fit the necessary bytes and a terminating NUL, + * reallocate buffer to the maximum size we might need, and + * shrink it later.
+ */ + if (dstoff + u2len + 1 > dstsize) + { + /* Overflow paranoia */ + if ((nbytes + 1) > (INT_MAX / sizeof(pg_wchar))) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + dstsize = (nbytes + 1) * sizeof(pg_wchar); + dst = repalloc(dst, dstsize); + } + + unicode_to_utf8(u2, dst + dstoff); + srcoff += u1len; + dstoff += u2len; + } + + *(dst + dstoff) = '\0'; + dstoff++; + + if (dstsize == dstoff) + { + result = (char *) dst; + } + else + { + /* shrink buffer and store result */ + result = palloc(dstoff); + memcpy(result, dst, dstoff); + pfree(dst); + } + } + else { Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC); diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 39390fbe4e..a5aeabce94 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1731,7 +1731,11 @@ get_collation_actual_version(char collprovider, const char *collcollate) { char *collversion = NULL; - /* the builtin collation provider is not versioned */ + /* + * The only two supported locales (C and C.utf-8) are both based on memcmp + * and do not change. (The ctype behavior can change, but the versioning + * does not track that.) 
+ */ if (collprovider == COLLPROVIDER_BUILTIN) return NULL; @@ -2508,7 +2512,14 @@ builtin_validate_locale(int encoding, const char *locale) int required_encoding = -1; if (strcmp(locale, "C") == 0) + { canonical_name = "C"; + } + else if (strcmp(locale, "C.utf-8") == 0 || strcmp(locale, "C.UTF8") == 0) + { + required_encoding = PG_UTF8; + canonical_name = "C.utf-8"; + } if (!canonical_name) ereport(ERROR, diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 8d53ef4a1f..ac33508d32 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2403,9 +2403,16 @@ setlocales(void) if (locale_provider == COLLPROVIDER_BUILTIN) { - if (strcmp(datlocale, "C") != 0) + if (strcmp(datlocale, "C") == 0) + canonname = "C"; + else if (strcmp(datlocale, "C.utf-8") == 0 || + strcmp(datlocale, "C.UTF8") == 0) + canonname = "C.utf-8"; + else pg_fatal("invalid locale name \"%s\" for builtin provider", datlocale); + + datlocale = canonname; } else if (locale_provider == COLLPROVIDER_ICU) { @@ -2694,6 +2701,13 @@ setup_locale_encoding(void) !check_locale_encoding(lc_collate, encodingid)) exit(1); /* check_locale_encoding printed the error */ + if (locale_provider == COLLPROVIDER_BUILTIN) + { + if (strcmp(datlocale, "C.utf-8") == 0 && encodingid != PG_UTF8) + pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"", + datlocale, "utf-8"); + } + if (locale_provider == COLLPROVIDER_ICU && !check_icu_locale_encoding(encodingid)) exit(1); diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index e719f70dae..c5408b6f2d 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -196,6 +196,23 @@ command_ok( ], 'locale provider builtin with --locale'); +command_ok( + [ + 'initdb', '--no-sync', + '--locale-provider=builtin', '-E utf-8', + '--builtin-locale=C.utf-8', "$tempdir/data8" + ], + 'locale provider builtin with -E utf-8 --builtin-locale=C.utf-8'); + +command_fails( + [ + 'initdb', '--no-sync', + 
'--locale-provider=builtin', '-E SQL_ASCII', + '--builtin-locale=C.utf-8', "$tempdir/data9" + ], + 'locale provider builtin with --builtin-locale=C.utf-8 fails for SQL_ASCII' +); + command_ok( [ 'initdb', '--no-sync', diff --git a/src/bin/pg_upgrade/t/002_pg_upgrade.pl b/src/bin/pg_upgrade/t/002_pg_upgrade.pl index ed79c0930b..3e67121a8d 100644 --- a/src/bin/pg_upgrade/t/002_pg_upgrade.pl +++ b/src/bin/pg_upgrade/t/002_pg_upgrade.pl @@ -140,7 +140,7 @@ if ($oldnode->pg_version >= '17devel') { $original_enc_name = "utf-8"; $original_provider = "b"; - $original_datlocale = "C"; + $original_datlocale = "C.utf-8"; } elsif ($oldnode->pg_version >= 15 && $ENV{with_icu} eq 'yes') { diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl index dfd635bfab..3ba623f9d1 100644 --- a/src/bin/scripts/t/020_createdb.pl +++ b/src/bin/scripts/t/020_createdb.pl @@ -139,6 +139,24 @@ $node->command_ok( ], 'create database with provider "builtin" and LC_CTYPE=C'); +$node->command_ok( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '-E utf-8', '--builtin-locale=C.UTF8', + 'tbuiltin5' + ], + 'create database with provider "builtin" and --builtin-locale C.utf-8'); + +$node->command_fails( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '-E LATIN1', '--builtin-locale=C.utf-8', + 'tbuiltin6' + ], + 'create database with provider "builtin" and --builtin-locale C.utf-8 fails for LATIN1'); + $node->command_fails( [ 'createdb', '-T', diff --git a/src/include/catalog/pg_collation.dat b/src/include/catalog/pg_collation.dat index 938432e8a4..b95a89491d 100644 --- a/src/include/catalog/pg_collation.dat +++ b/src/include/catalog/pg_collation.dat @@ -30,5 +30,8 @@ descr => 'sorts using the Unicode Collation Algorithm with default settings', collname => 'unicode', collprovider => 'i', collencoding => '-1', colllocale => 'und' }, +{ oid => '811', descr => 'sorts by Unicode code point; Unicode & POSIX character semantics', + collname => 'pg_c_utf8',
collprovider => 'b', collencoding => '6', + colllocale => 'C.utf-8' }, ] diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out new file mode 100644 index 0000000000..eff0ef21ac --- /dev/null +++ b/src/test/regress/expected/collate.utf8.out @@ -0,0 +1,136 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.utf-8 locale. + */ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +SET client_encoding TO UTF8; +-- +-- Test PG_C_UTF8 +-- +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C_UTF8'); -- fails +ERROR: invalid locale name "C_UTF8" for builtin provider +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C.UTF8'); +DROP COLLATION regress_pg_c_utf8; +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C.utf-8'); +CREATE TABLE test_pg_c_utf8 ( + t TEXT COLLATE PG_C_UTF8 +); +INSERT INTO test_pg_c_utf8 VALUES + ('abc DEF 123abc'), + ('ábc sßs ßss DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM test_pg_c_utf8; + t | lower | initcap | upper | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes +-----------------+-----------------+-----------------+-----------------+---------+---------------+-----------------+--------------- + abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14 + ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs ßss Déf | ÁBC SßS ßSS DÉF | 19 | 19 | 19 | 19 + DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | DŽxxdž DŽxxdž DŽxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20 + ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 
9 | 8 | 6 + ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6 + ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4 +(6 rows) + +DROP TABLE test_pg_c_utf8; +-- negative test: Final_Sigma not used for builtin locale C.utf-8 +SELECT lower('ΑΣ' COLLATE PG_C_UTF8); + lower +------- + ασ +(1 row) + +SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8); + lower +------- + αͺσͺ +(1 row) + +SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8); + lower +------- + α΄σ΄ +(1 row) + +-- properties +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix + ?column? +---------- + t +(1 row) + +SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix + ?column? +---------- + t +(1 row) + +-- case mapping +SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed + ?column? +---------- + t +(1 row) + diff --git a/src/test/regress/expected/collate.utf8_1.out b/src/test/regress/expected/collate.utf8_1.out new file mode 100644 index 0000000000..e73fdf50c3 --- /dev/null +++ b/src/test/regress/expected/collate.utf8_1.out @@ -0,0 +1,8 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.utf-8 locale. 
+ */ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 1d8a414eea..e48cb4b7a3 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -78,9 +78,9 @@ test: brin_bloom brin_multi # psql depends on create_am # amutils depends on geometry, create_index_spgist, hash_index, brin # ---------- -test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.icu.utf8 incremental_sort create_role without_overlaps +test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.utf8 collate.icu.utf8 incremental_sort create_role without_overlaps -# collate.*.utf8 tests cannot be run in parallel with each other +# collate.linux.utf8 and collate.icu.utf8 tests cannot be run in parallel with each other test: rules psql psql_crosstab amutils stats_ext collate.linux.utf8 collate.windows.win1252 # ---------- diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql new file mode 100644 index 0000000000..1f5f9ef491 --- /dev/null +++ b/src/test/regress/sql/collate.utf8.sql @@ -0,0 +1,67 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.utf-8 locale. 
+ */ + +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +SET client_encoding TO UTF8; + +-- +-- Test PG_C_UTF8 +-- + +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C_UTF8'); -- fails +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C.UTF8'); +DROP COLLATION regress_pg_c_utf8; +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C.utf-8'); + +CREATE TABLE test_pg_c_utf8 ( + t TEXT COLLATE PG_C_UTF8 +); +INSERT INTO test_pg_c_utf8 VALUES + ('abc DEF 123abc'), + ('ábc sßs ßss DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); + +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM test_pg_c_utf8; + +DROP TABLE test_pg_c_utf8; + +-- negative test: Final_Sigma not used for builtin locale C.utf-8 +SELECT lower('ΑΣ' COLLATE PG_C_UTF8); +SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8); +SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8); + +-- properties + +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8; +SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8; +SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8; +SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix +SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8; +SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix + +-- case mapping + +SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8; +SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8; +SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8; +SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8; +SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed -- 2.34.1