From aaa4171a031c7481cb4c1a57e24d33c73432b390 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Mon, 26 Feb 2024 11:58:29 +1300 Subject: [PATCH v2] Add simple codepoint redirections to unaccent.rules. Previously we searched for code points where the Unicode data file listed an equivalent combining character sequence that added accents. Some code points redirect to a single other code point, instead of doing any actual combining. We can follow those references recursively to get the answer. Per bug report #18362, which pointed out some ancient Greek character variants that we were missing. Reported-by: Cees van Zeeland Reviewed-by: Robert Haas Reviewed-by: Peter Eisentraut Discussion: https://postgr.es/m/18362-be6d0cfe122b6354%40postgresql.org --- contrib/unaccent/expected/unaccent.out | 2 +- contrib/unaccent/generate_unaccent_rules.py | 17 +- contrib/unaccent/unaccent.rules | 1013 ++++++++++++++++++- 3 files changed, 1023 insertions(+), 9 deletions(-) diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out index d03374c799a..763f4ed9ffd 100644 --- a/contrib/unaccent/expected/unaccent.out +++ b/contrib/unaccent/expected/unaccent.out @@ -176,6 +176,6 @@ SELECT ts_lexize('unaccent', '〝'); SELECT unaccent('ℌ'); unaccent ---------- - x + H (1 row) diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index cffb7db7cee..a2c1ebf2d87 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -104,10 +104,11 @@ def is_letter_with_marks(codepoint, table): """Returns true for letters combined with one or more marks.""" # See https://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values - # Letter may have no combining characters, in which case it has - # no marks. - if len(codepoint.combining_ids) == 1: - return False + # Some codepoints redirect directly to another, instead of doing any + # "combining"... 
but sometimes they redirect to a codepoint that doesn't + # exist, so ignore those. + if len(codepoint.combining_ids) == 1 and codepoint.combining_ids[0] in table: + return is_letter_with_marks(table[codepoint.combining_ids[0]], table) # A letter without diacritical marks has none of them. if any(is_mark(table[i]) for i in codepoint.combining_ids[1:]) is False: @@ -148,8 +149,7 @@ def get_plain_letter(codepoint, table): def is_ligature(codepoint, table): """Return true for letters combined with letters.""" - return all(is_letter(table[i], table) for i in codepoint.combining_ids) - + return all(i in table and is_letter(table[i], table) for i in codepoint.combining_ids) def get_plain_letters(codepoint, table): """Return a list of plain letters from a ligature.""" @@ -200,6 +200,9 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath): # the parser of unaccent only accepts non-whitespace characters # for "src" and "trg" (see unaccent.c) if not src.isspace() and not trg.isspace(): + if src == "ℌ": + # CLDR maps this to "x", presumably a mistake; skip it + continue charactersSet.add((ord(src), trg)) return charactersSet @@ -251,7 +254,7 @@ def main(args): # walk through all the codepoints looking for interesting mappings for codepoint in all: if codepoint.general_category.startswith('L') and \ - len(codepoint.combining_ids) > 1: + len(codepoint.combining_ids) > 0: if is_letter_with_marks(codepoint, table): charactersSet.add((codepoint.id, chr(get_plain_letter(codepoint, table).id))) diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules index ca6caa51f52..35fd246b71f 100644 --- a/contrib/unaccent/unaccent.rules +++ b/contrib/unaccent/unaccent.rules @@ -1,9 +1,12 @@ ¡ !
© (C) +ª a « << ­ - ® (R) ± +/- +µ μ +º o » >> ¼ " 1/4" ½ " 1/2" @@ -402,6 +405,11 @@ ʦ ts ʪ ls ʫ lz +ʰ h +ʲ j +ʳ r +ʷ w +ʸ y ʹ ' ʺ """" ʻ ' @@ -417,6 +425,9 @@ ˖ + ˗ - ˜ ~ +ˡ l +ˢ s +ˣ x ̀ ́ ̂ @@ -536,6 +547,17 @@ ό ο ύ υ ώ ω +ϐ β +ϑ θ +ϒ Υ +ϕ φ +ϖ π +ϰ κ +ϱ ρ +ϲ ς +ϴ Θ +ϵ ε +Ϲ Σ Ё Е ё е ᴀ A @@ -556,6 +578,50 @@ ᴠ V ᴡ W ᴢ Z +ᴬ A +ᴮ B +ᴰ D +ᴱ E +ᴳ G +ᴴ H +ᴵ I +ᴶ J +ᴷ K +ᴸ L +ᴹ M +ᴺ N +ᴼ O +ᴾ P +ᴿ R +ᵀ T +ᵁ U +ᵂ W +ᵃ a +ᵇ b +ᵈ d +ᵉ e +ᵍ g +ᵏ k +ᵐ m +ᵒ o +ᵖ p +ᵗ t +ᵘ u +ᵛ v +ᵝ β +ᵞ γ +ᵟ δ +ᵠ φ +ᵡ χ +ᵢ i +ᵣ r +ᵤ u +ᵥ v +ᵦ β +ᵧ γ +ᵨ ρ +ᵩ φ +ᵪ χ ᵫ ue ᵬ b ᵭ d @@ -592,6 +658,10 @@ ᶓ e ᶖ i ᶙ u +ᶜ c +ᶠ f +ᶻ z +ᶿ θ Ḁ A ḁ a Ḃ B @@ -947,12 +1017,19 @@ Ὦ Ω Ὧ Ω ὰ α +ά α ὲ ε +έ ε ὴ η +ή η ὶ ι +ί ι ὸ ο +ό ο ὺ υ +ύ υ ὼ ω +ώ ω ᾀ α ᾁ α ᾂ α @@ -1011,26 +1088,33 @@ Ᾰ Α Ᾱ Α Ὰ Α +Ά Α ᾼ Α +ι ι ῂ η ῃ η ῄ η ῆ η ῇ η Ὲ Ε +Έ Ε Ὴ Η +Ή Η ῌ Η ῐ ι ῑ ι ῒ ι +ΐ ι ῖ ι ῗ ι Ῐ Ι Ῑ Ι Ὶ Ι +Ί Ι ῠ υ ῡ υ ῢ υ +ΰ υ ῤ ρ ῥ ρ ῦ υ @@ -1038,6 +1122,7 @@ Ῠ Υ Ῡ Υ Ὺ Υ +Ύ Υ Ῥ Ρ ῲ ω ῳ ω @@ -1045,7 +1130,9 @@ ῶ ω ῷ ω Ὸ Ο +Ό Ο Ὼ Ω +Ώ Ω ῼ Ω ‐ - ‑ - @@ -1077,6 +1164,20 @@ ⁈ ?! ⁉ !? ⁎ * +ⁱ i +ⁿ n +ₐ a +ₑ e +ₒ o +ₓ x +ₕ h +ₖ k +ₗ l +ₘ m +ₙ n +ₚ p +ₛ s +ₜ t ₠ CE ₢ Cr ₣ Fr. 
@@ -1100,7 +1201,7 @@ ℉ °F ℊ g ℋ H -ℌ x +ℌ H ℍ H ℎ h ℐ I @@ -1119,7 +1220,10 @@ ℞ Rx ℡ TEL ℤ Z +Ω Ω ℨ Z +K K +Å A ℬ B ℭ C ℯ e @@ -1129,6 +1233,10 @@ ℴ o ℹ i ℻ FAX +ℼ π +ℽ γ +ℾ Γ +ℿ Π ⅅ D ⅆ d ⅇ e @@ -1281,6 +1389,8 @@ ⱴ v ⱸ e ⱺ o +ⱼ j +ⱽ V Ȿ S Ɀ Z 、 , @@ -1455,6 +1565,9 @@ Ꞩ S ꞩ s Ɦ H +ꟲ C +ꟳ F +ꟴ Q ff ff fi fi fl fl @@ -1611,6 +1724,904 @@ 、 , ← <- → -> +𐞥 q +𝐀 A +𝐁 B +𝐂 C +𝐃 D +𝐄 E +𝐅 F +𝐆 G +𝐇 H +𝐈 I +𝐉 J +𝐊 K +𝐋 L +𝐌 M +𝐍 N +𝐎 O +𝐏 P +𝐐 Q +𝐑 R +𝐒 S +𝐓 T +𝐔 U +𝐕 V +𝐖 W +𝐗 X +𝐘 Y +𝐙 Z +𝐚 a +𝐛 b +𝐜 c +𝐝 d +𝐞 e +𝐟 f +𝐠 g +𝐡 h +𝐢 i +𝐣 j +𝐤 k +𝐥 l +𝐦 m +𝐧 n +𝐨 o +𝐩 p +𝐪 q +𝐫 r +𝐬 s +𝐭 t +𝐮 u +𝐯 v +𝐰 w +𝐱 x +𝐲 y +𝐳 z +𝐴 A +𝐵 B +𝐶 C +𝐷 D +𝐸 E +𝐹 F +𝐺 G +𝐻 H +𝐼 I +𝐽 J +𝐾 K +𝐿 L +𝑀 M +𝑁 N +𝑂 O +𝑃 P +𝑄 Q +𝑅 R +𝑆 S +𝑇 T +𝑈 U +𝑉 V +𝑊 W +𝑋 X +𝑌 Y +𝑍 Z +𝑎 a +𝑏 b +𝑐 c +𝑑 d +𝑒 e +𝑓 f +𝑔 g +𝑖 i +𝑗 j +𝑘 k +𝑙 l +𝑚 m +𝑛 n +𝑜 o +𝑝 p +𝑞 q +𝑟 r +𝑠 s +𝑡 t +𝑢 u +𝑣 v +𝑤 w +𝑥 x +𝑦 y +𝑧 z +𝑨 A +𝑩 B +𝑪 C +𝑫 D +𝑬 E +𝑭 F +𝑮 G +𝑯 H +𝑰 I +𝑱 J +𝑲 K +𝑳 L +𝑴 M +𝑵 N +𝑶 O +𝑷 P +𝑸 Q +𝑹 R +𝑺 S +𝑻 T +𝑼 U +𝑽 V +𝑾 W +𝑿 X +𝒀 Y +𝒁 Z +𝒂 a +𝒃 b +𝒄 c +𝒅 d +𝒆 e +𝒇 f +𝒈 g +𝒉 h +𝒊 i +𝒋 j +𝒌 k +𝒍 l +𝒎 m +𝒏 n +𝒐 o +𝒑 p +𝒒 q +𝒓 r +𝒔 s +𝒕 t +𝒖 u +𝒗 v +𝒘 w +𝒙 x +𝒚 y +𝒛 z +𝒜 A +𝒞 C +𝒟 D +𝒢 G +𝒥 J +𝒦 K +𝒩 N +𝒪 O +𝒫 P +𝒬 Q +𝒮 S +𝒯 T +𝒰 U +𝒱 V +𝒲 W +𝒳 X +𝒴 Y +𝒵 Z +𝒶 a +𝒷 b +𝒸 c +𝒹 d +𝒻 f +𝒽 h +𝒾 i +𝒿 j +𝓀 k +𝓁 l +𝓂 m +𝓃 n +𝓅 p +𝓆 q +𝓇 r +𝓈 s +𝓉 t +𝓊 u +𝓋 v +𝓌 w +𝓍 x +𝓎 y +𝓏 z +𝓐 A +𝓑 B +𝓒 C +𝓓 D +𝓔 E +𝓕 F +𝓖 G +𝓗 H +𝓘 I +𝓙 J +𝓚 K +𝓛 L +𝓜 M +𝓝 N +𝓞 O +𝓟 P +𝓠 Q +𝓡 R +𝓢 S +𝓣 T +𝓤 U +𝓥 V +𝓦 W +𝓧 X +𝓨 Y +𝓩 Z +𝓪 a +𝓫 b +𝓬 c +𝓭 d +𝓮 e +𝓯 f +𝓰 g +𝓱 h +𝓲 i +𝓳 j +𝓴 k +𝓵 l +𝓶 m +𝓷 n +𝓸 o +𝓹 p +𝓺 q +𝓻 r +𝓼 s +𝓽 t +𝓾 u +𝓿 v +𝔀 w +𝔁 x +𝔂 y +𝔃 z +𝔄 A +𝔅 B +𝔇 D +𝔈 E +𝔉 F +𝔊 G +𝔍 J +𝔎 K +𝔏 L +𝔐 M +𝔑 N +𝔒 O +𝔓 P +𝔔 Q +𝔖 S +𝔗 T +𝔘 U +𝔙 V +𝔚 W +𝔛 X +𝔜 Y +𝔞 a +𝔟 b +𝔠 c +𝔡 d +𝔢 e +𝔣 f +𝔤 g +𝔥 h +𝔦 i +𝔧 j +𝔨 k +𝔩 l +𝔪 m +𝔫 n +𝔬 o +𝔭 p +𝔮 q +𝔯 r +𝔰 s +𝔱 t +𝔲 u +𝔳 v +𝔴 w +𝔵 x +𝔶 y +𝔷 z +𝔸 A +𝔹 B +𝔻 D +𝔼 E +𝔽 F +𝔾 G +𝕀 I +𝕁 J +𝕂 K +𝕃 L +𝕄 M +𝕆 O +𝕊 S +𝕋 T +𝕌 U +𝕍 V +𝕎 W +𝕏 X +𝕐 Y +𝕒 a +𝕓 b +𝕔 c +𝕕 d +𝕖 e +𝕗 f +𝕘 g +𝕙 h +𝕚 i +𝕛 j +𝕜 k +𝕝 l +𝕞 m +𝕟 
n +𝕠 o +𝕡 p +𝕢 q +𝕣 r +𝕤 s +𝕥 t +𝕦 u +𝕧 v +𝕨 w +𝕩 x +𝕪 y +𝕫 z +𝕬 A +𝕭 B +𝕮 C +𝕯 D +𝕰 E +𝕱 F +𝕲 G +𝕳 H +𝕴 I +𝕵 J +𝕶 K +𝕷 L +𝕸 M +𝕹 N +𝕺 O +𝕻 P +𝕼 Q +𝕽 R +𝕾 S +𝕿 T +𝖀 U +𝖁 V +𝖂 W +𝖃 X +𝖄 Y +𝖅 Z +𝖆 a +𝖇 b +𝖈 c +𝖉 d +𝖊 e +𝖋 f +𝖌 g +𝖍 h +𝖎 i +𝖏 j +𝖐 k +𝖑 l +𝖒 m +𝖓 n +𝖔 o +𝖕 p +𝖖 q +𝖗 r +𝖘 s +𝖙 t +𝖚 u +𝖛 v +𝖜 w +𝖝 x +𝖞 y +𝖟 z +𝖠 A +𝖡 B +𝖢 C +𝖣 D +𝖤 E +𝖥 F +𝖦 G +𝖧 H +𝖨 I +𝖩 J +𝖪 K +𝖫 L +𝖬 M +𝖭 N +𝖮 O +𝖯 P +𝖰 Q +𝖱 R +𝖲 S +𝖳 T +𝖴 U +𝖵 V +𝖶 W +𝖷 X +𝖸 Y +𝖹 Z +𝖺 a +𝖻 b +𝖼 c +𝖽 d +𝖾 e +𝖿 f +𝗀 g +𝗁 h +𝗂 i +𝗃 j +𝗄 k +𝗅 l +𝗆 m +𝗇 n +𝗈 o +𝗉 p +𝗊 q +𝗋 r +𝗌 s +𝗍 t +𝗎 u +𝗏 v +𝗐 w +𝗑 x +𝗒 y +𝗓 z +𝗔 A +𝗕 B +𝗖 C +𝗗 D +𝗘 E +𝗙 F +𝗚 G +𝗛 H +𝗜 I +𝗝 J +𝗞 K +𝗟 L +𝗠 M +𝗡 N +𝗢 O +𝗣 P +𝗤 Q +𝗥 R +𝗦 S +𝗧 T +𝗨 U +𝗩 V +𝗪 W +𝗫 X +𝗬 Y +𝗭 Z +𝗮 a +𝗯 b +𝗰 c +𝗱 d +𝗲 e +𝗳 f +𝗴 g +𝗵 h +𝗶 i +𝗷 j +𝗸 k +𝗹 l +𝗺 m +𝗻 n +𝗼 o +𝗽 p +𝗾 q +𝗿 r +𝘀 s +𝘁 t +𝘂 u +𝘃 v +𝘄 w +𝘅 x +𝘆 y +𝘇 z +𝘈 A +𝘉 B +𝘊 C +𝘋 D +𝘌 E +𝘍 F +𝘎 G +𝘏 H +𝘐 I +𝘑 J +𝘒 K +𝘓 L +𝘔 M +𝘕 N +𝘖 O +𝘗 P +𝘘 Q +𝘙 R +𝘚 S +𝘛 T +𝘜 U +𝘝 V +𝘞 W +𝘟 X +𝘠 Y +𝘡 Z +𝘢 a +𝘣 b +𝘤 c +𝘥 d +𝘦 e +𝘧 f +𝘨 g +𝘩 h +𝘪 i +𝘫 j +𝘬 k +𝘭 l +𝘮 m +𝘯 n +𝘰 o +𝘱 p +𝘲 q +𝘳 r +𝘴 s +𝘵 t +𝘶 u +𝘷 v +𝘸 w +𝘹 x +𝘺 y +𝘻 z +𝘼 A +𝘽 B +𝘾 C +𝘿 D +𝙀 E +𝙁 F +𝙂 G +𝙃 H +𝙄 I +𝙅 J +𝙆 K +𝙇 L +𝙈 M +𝙉 N +𝙊 O +𝙋 P +𝙌 Q +𝙍 R +𝙎 S +𝙏 T +𝙐 U +𝙑 V +𝙒 W +𝙓 X +𝙔 Y +𝙕 Z +𝙖 a +𝙗 b +𝙘 c +𝙙 d +𝙚 e +𝙛 f +𝙜 g +𝙝 h +𝙞 i +𝙟 j +𝙠 k +𝙡 l +𝙢 m +𝙣 n +𝙤 o +𝙥 p +𝙦 q +𝙧 r +𝙨 s +𝙩 t +𝙪 u +𝙫 v +𝙬 w +𝙭 x +𝙮 y +𝙯 z +𝙰 A +𝙱 B +𝙲 C +𝙳 D +𝙴 E +𝙵 F +𝙶 G +𝙷 H +𝙸 I +𝙹 J +𝙺 K +𝙻 L +𝙼 M +𝙽 N +𝙾 O +𝙿 P +𝚀 Q +𝚁 R +𝚂 S +𝚃 T +𝚄 U +𝚅 V +𝚆 W +𝚇 X +𝚈 Y +𝚉 Z +𝚊 a +𝚋 b +𝚌 c +𝚍 d +𝚎 e +𝚏 f +𝚐 g +𝚑 h +𝚒 i +𝚓 j +𝚔 k +𝚕 l +𝚖 m +𝚗 n +𝚘 o +𝚙 p +𝚚 q +𝚛 r +𝚜 s +𝚝 t +𝚞 u +𝚟 v +𝚠 w +𝚡 x +𝚢 y +𝚣 z +𝚨 Α +𝚩 Β +𝚪 Γ +𝚫 Δ +𝚬 Ε +𝚭 Ζ +𝚮 Η +𝚯 Θ +𝚰 Ι +𝚱 Κ +𝚲 Λ +𝚳 Μ +𝚴 Ν +𝚵 Ξ +𝚶 Ο +𝚷 Π +𝚸 Ρ +𝚺 Σ +𝚻 Τ +𝚼 Υ +𝚽 Φ +𝚾 Χ +𝚿 Ψ +𝛀 Ω +𝛂 α +𝛃 β +𝛄 γ +𝛅 δ +𝛆 ε +𝛇 ζ +𝛈 η +𝛉 θ +𝛊 ι +𝛋 κ +𝛌 λ +𝛍 μ +𝛎 ν +𝛏 ξ +𝛐 ο +𝛑 π +𝛒 ρ +𝛓 ς +𝛔 σ +𝛕 τ +𝛖 υ +𝛗 φ +𝛘 χ +𝛙 ψ +𝛚 ω +𝛢 Α +𝛣 Β +𝛤 Γ +𝛥 Δ +𝛦 Ε +𝛧 Ζ +𝛨 Η +𝛩 Θ +𝛪 Ι +𝛫 Κ +𝛬 Λ +𝛭 Μ +𝛮 Ν +𝛯 Ξ +𝛰 Ο +𝛱 Π +𝛲 Ρ +𝛴 Σ +𝛵 Τ +𝛶 Υ +𝛷 Φ +𝛸 Χ +𝛹 Ψ +𝛺 Ω +𝛼 α +𝛽 β +𝛾 
γ +𝛿 δ +𝜀 ε +𝜁 ζ +𝜂 η +𝜃 θ +𝜄 ι +𝜅 κ +𝜆 λ +𝜇 μ +𝜈 ν +𝜉 ξ +𝜊 ο +𝜋 π +𝜌 ρ +𝜍 ς +𝜎 σ +𝜏 τ +𝜐 υ +𝜑 φ +𝜒 χ +𝜓 ψ +𝜔 ω +𝜜 Α +𝜝 Β +𝜞 Γ +𝜟 Δ +𝜠 Ε +𝜡 Ζ +𝜢 Η +𝜣 Θ +𝜤 Ι +𝜥 Κ +𝜦 Λ +𝜧 Μ +𝜨 Ν +𝜩 Ξ +𝜪 Ο +𝜫 Π +𝜬 Ρ +𝜮 Σ +𝜯 Τ +𝜰 Υ +𝜱 Φ +𝜲 Χ +𝜳 Ψ +𝜴 Ω +𝜶 α +𝜷 β +𝜸 γ +𝜹 δ +𝜺 ε +𝜻 ζ +𝜼 η +𝜽 θ +𝜾 ι +𝜿 κ +𝝀 λ +𝝁 μ +𝝂 ν +𝝃 ξ +𝝄 ο +𝝅 π +𝝆 ρ +𝝇 ς +𝝈 σ +𝝉 τ +𝝊 υ +𝝋 φ +𝝌 χ +𝝍 ψ +𝝎 ω +𝝖 Α +𝝗 Β +𝝘 Γ +𝝙 Δ +𝝚 Ε +𝝛 Ζ +𝝜 Η +𝝝 Θ +𝝞 Ι +𝝟 Κ +𝝠 Λ +𝝡 Μ +𝝢 Ν +𝝣 Ξ +𝝤 Ο +𝝥 Π +𝝦 Ρ +𝝨 Σ +𝝩 Τ +𝝪 Υ +𝝫 Φ +𝝬 Χ +𝝭 Ψ +𝝮 Ω +𝝰 α +𝝱 β +𝝲 γ +𝝳 δ +𝝴 ε +𝝵 ζ +𝝶 η +𝝷 θ +𝝸 ι +𝝹 κ +𝝺 λ +𝝻 μ +𝝼 ν +𝝽 ξ +𝝾 ο +𝝿 π +𝞀 ρ +𝞁 ς +𝞂 σ +𝞃 τ +𝞄 υ +𝞅 φ +𝞆 χ +𝞇 ψ +𝞈 ω +𝞐 Α +𝞑 Β +𝞒 Γ +𝞓 Δ +𝞔 Ε +𝞕 Ζ +𝞖 Η +𝞗 Θ +𝞘 Ι +𝞙 Κ +𝞚 Λ +𝞛 Μ +𝞜 Ν +𝞝 Ξ +𝞞 Ο +𝞟 Π +𝞠 Ρ +𝞢 Σ +𝞣 Τ +𝞤 Υ +𝞥 Φ +𝞦 Χ +𝞧 Ψ +𝞨 Ω +𝞪 α +𝞫 β +𝞬 γ +𝞭 δ +𝞮 ε +𝞯 ζ +𝞰 η +𝞱 θ +𝞲 ι +𝞳 κ +𝞴 λ +𝞵 μ +𝞶 ν +𝞷 ξ +𝞸 ο +𝞹 π +𝞺 ρ +𝞻 ς +𝞼 σ +𝞽 τ +𝞾 υ +𝞿 φ +𝟀 χ +𝟁 ψ +𝟂 ω 🄀 0. 🄁 0, 🄂 1, -- 2.39.3 (Apple Git-146)