From a26e284cb712515232b2a0bb080878eab4afae65 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Mon, 26 Feb 2024 11:58:29 +1300 Subject: [PATCH] Add simple codepoint redirections to unaccent.rules. Previously we searched for code points where the Unicode data file listed an equivalent combining character sequence that added accents. Some code points redirect to a single other code point, instead of doing any actual combining. We can follow those references recursively to get the answer. Per bug report #18362, which pointed out some ancient Greek character variants that we were missing. Reported-by: Cees van Zeeland Discussion: https://postgr.es/m/18362-be6d0cfe122b6354%40postgresql.org --- contrib/unaccent/generate_unaccent_rules.py | 14 +- contrib/unaccent/unaccent.rules | 1012 +++++++++++++++++++ 2 files changed, 1019 insertions(+), 7 deletions(-) diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index cffb7db7cee..c9e003aaca6 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -104,10 +104,11 @@ def is_letter_with_marks(codepoint, table): """Returns true for letters combined with one or more marks.""" # See https://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values - # Letter may have no combining characters, in which case it has - # no marks. - if len(codepoint.combining_ids) == 1: - return False + # Some codepoints redirect directly to another, instead of doing any + # "combining"... but sometimes they redirect to a codepoint that doesn't + # exist, so ignore those. + if len(codepoint.combining_ids) == 1 and codepoint.combining_ids[0] in table: + return is_letter_with_marks(table[codepoint.combining_ids[0]], table) # A letter without diacritical marks has none of them. if any(is_mark(table[i]) for i in codepoint.combining_ids[1:]) is False: @@ -148,8 +149,7 @@ def get_plain_letter(codepoint, table): def is_ligature(codepoint, table): """Return true for letters combined with letters.""" - return all(is_letter(table[i], table) for i in codepoint.combining_ids) - + return all(i in table and is_letter(table[i], table) for i in codepoint.combining_ids) def get_plain_letters(codepoint, table): """Return a list of plain letters from a ligature.""" @@ -251,7 +251,7 @@ def main(args): # walk through all the codepoints looking for interesting mappings for codepoint in all: if codepoint.general_category.startswith('L') and \ - len(codepoint.combining_ids) > 1: + len(codepoint.combining_ids) > 0: if is_letter_with_marks(codepoint, table): charactersSet.add((codepoint.id, chr(get_plain_letter(codepoint, table).id))) diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules index ca6caa51f52..cc21e11858e 100644 --- a/contrib/unaccent/unaccent.rules +++ b/contrib/unaccent/unaccent.rules @@ -1,9 +1,12 @@ ¡ ! © (C) +ª a « << ­ - ® (R) ± +/- +µ μ +º o » >> ¼ " 1/4" ½ " 1/2" @@ -402,6 +405,11 @@ ʦ ts ʪ ls ʫ lz +ʰ h +ʲ j +ʳ r +ʷ w +ʸ y ʹ ' ʺ """" ʻ ' @@ -417,6 +425,9 @@ ˖ + ˗ - ˜ ~ +ˡ l +ˢ s +ˣ x ̀ ́ ̂ @@ -536,6 +547,17 @@ ό ο ύ υ ώ ω +ϐ β +ϑ θ +ϒ Υ +ϕ φ +ϖ π +ϰ κ +ϱ ρ +ϲ ς +ϴ Θ +ϵ ε +Ϲ Σ Ё Е ё е ᴀ A @@ -556,6 +578,50 @@ ᴠ V ᴡ W ᴢ Z +ᴬ A +ᴮ B +ᴰ D +ᴱ E +ᴳ G +ᴴ H +ᴵ I +ᴶ J +ᴷ K +ᴸ L +ᴹ M +ᴺ N +ᴼ O +ᴾ P +ᴿ R +ᵀ T +ᵁ U +ᵂ W +ᵃ a +ᵇ b +ᵈ d +ᵉ e +ᵍ g +ᵏ k +ᵐ m +ᵒ o +ᵖ p +ᵗ t +ᵘ u +ᵛ v +ᵝ β +ᵞ γ +ᵟ δ +ᵠ φ +ᵡ χ +ᵢ i +ᵣ r +ᵤ u +ᵥ v +ᵦ β +ᵧ γ +ᵨ ρ +ᵩ φ +ᵪ χ ᵫ ue ᵬ b ᵭ d @@ -592,6 +658,10 @@ ᶓ e ᶖ i ᶙ u +ᶜ c +ᶠ f +ᶻ z +ᶿ θ Ḁ A ḁ a Ḃ B @@ -947,12 +1017,19 @@ Ὦ Ω Ὧ Ω ὰ α +ά α ὲ ε +έ ε ὴ η +ή η ὶ ι +ί ι ὸ ο +ό ο ὺ υ +ύ υ ὼ ω +ώ ω ᾀ α ᾁ α ᾂ α @@ -1011,26 +1088,33 @@ Ᾰ Α Ᾱ Α Ὰ Α +Ά Α ᾼ Α +ι ι ῂ η ῃ η ῄ η ῆ η ῇ η Ὲ Ε +Έ Ε Ὴ Η +Ή Η ῌ Η ῐ ι ῑ ι ῒ ι +ΐ ι ῖ ι ῗ ι Ῐ Ι Ῑ Ι Ὶ Ι +Ί Ι ῠ υ ῡ υ ῢ υ +ΰ υ ῤ ρ ῥ ρ ῦ υ @@ -1038,6 +1122,7 @@ Ῠ Υ Ῡ Υ Ὺ Υ +Ύ Υ Ῥ Ρ ῲ ω ῳ ω @@ -1045,7 +1130,9 @@ ῶ ω ῷ ω Ὸ Ο +Ό Ο Ὼ Ω +Ώ Ω ῼ Ω ‐ - ‑ - @@ -1077,6 +1164,20 @@ ⁈ ?! ⁉ !? ⁎ * +ⁱ i +ⁿ n +ₐ a +ₑ e +ₒ o +ₓ x +ₕ h +ₖ k +ₗ l +ₘ m +ₙ n +ₚ p +ₛ s +ₜ t ₠ CE ₢ Cr ₣ Fr. @@ -1101,6 +1202,7 @@ ℊ g ℋ H ℌ x +ℌ H ℍ H ℎ h ℐ I @@ -1119,7 +1221,10 @@ ℞ Rx ℡ TEL ℤ Z +Ω Ω ℨ Z +K K +Å A ℬ B ℭ C ℯ e @@ -1129,6 +1234,10 @@ ℴ o ℹ i ℻ FAX +ℼ π +ℽ γ +ℾ Γ +ℿ Π ⅅ D ⅆ d ⅇ e @@ -1281,6 +1390,8 @@ ⱴ v ⱸ e ⱺ o +ⱼ j +ⱽ V Ȿ S Ɀ Z 、 , @@ -1455,6 +1566,9 @@ Ꞩ S ꞩ s Ɦ H +ꟲ C +ꟳ F +ꟴ Q ff ff fi fi fl fl @@ -1611,6 +1725,904 @@ 、 , ← <- → -> +𐞥 q +𝐀 A +𝐁 B +𝐂 C +𝐃 D +𝐄 E +𝐅 F +𝐆 G +𝐇 H +𝐈 I +𝐉 J +𝐊 K +𝐋 L +𝐌 M +𝐍 N +𝐎 O +𝐏 P +𝐐 Q +𝐑 R +𝐒 S +𝐓 T +𝐔 U +𝐕 V +𝐖 W +𝐗 X +𝐘 Y +𝐙 Z +𝐚 a +𝐛 b +𝐜 c +𝐝 d +𝐞 e +𝐟 f +𝐠 g +𝐡 h +𝐢 i +𝐣 j +𝐤 k +𝐥 l +𝐦 m +𝐧 n +𝐨 o +𝐩 p +𝐪 q +𝐫 r +𝐬 s +𝐭 t +𝐮 u +𝐯 v +𝐰 w +𝐱 x +𝐲 y +𝐳 z +𝐴 A +𝐵 B +𝐶 C +𝐷 D +𝐸 E +𝐹 F +𝐺 G +𝐻 H +𝐼 I +𝐽 J +𝐾 K +𝐿 L +𝑀 M +𝑁 N +𝑂 O +𝑃 P +𝑄 Q +𝑅 R +𝑆 S +𝑇 T +𝑈 U +𝑉 V +𝑊 W +𝑋 X +𝑌 Y +𝑍 Z +𝑎 a +𝑏 b +𝑐 c +𝑑 d +𝑒 e +𝑓 f +𝑔 g +𝑖 i +𝑗 j +𝑘 k +𝑙 l +𝑚 m +𝑛 n +𝑜 o +𝑝 p +𝑞 q +𝑟 r +𝑠 s +𝑡 t +𝑢 u +𝑣 v +𝑤 w +𝑥 x +𝑦 y +𝑧 z +𝑨 A +𝑩 B +𝑪 C +𝑫 D +𝑬 E +𝑭 F +𝑮 G +𝑯 H +𝑰 I +𝑱 J +𝑲 K +𝑳 L +𝑴 M +𝑵 N +𝑶 O +𝑷 P +𝑸 Q +𝑹 R +𝑺 S +𝑻 T +𝑼 U +𝑽 V +𝑾 W +𝑿 X +𝒀 Y +𝒁 Z +𝒂 a +𝒃 b +𝒄 c +𝒅 d +𝒆 e +𝒇 f +𝒈 g +𝒉 h +𝒊 i +𝒋 j +𝒌 k +𝒍 l +𝒎 m +𝒏 n +𝒐 o +𝒑 p +𝒒 q +𝒓 r +𝒔 s +𝒕 t +𝒖 u +𝒗 v +𝒘 w +𝒙 x +𝒚 y +𝒛 z +𝒜 A +𝒞 C +𝒟 D +𝒢 G +𝒥 J +𝒦 K +𝒩 N +𝒪 O +𝒫 P +𝒬 Q +𝒮 S +𝒯 T +𝒰 U +𝒱 V +𝒲 W +𝒳 X +𝒴 Y +𝒵 Z +𝒶 a +𝒷 b +𝒸 c +𝒹 d +𝒻 f +𝒽 h +𝒾 i +𝒿 j +𝓀 k +𝓁 l +𝓂 m +𝓃 n +𝓅 p +𝓆 q +𝓇 r +𝓈 s +𝓉 t +𝓊 u +𝓋 v +𝓌 w +𝓍 x +𝓎 y +𝓏 z +𝓐 A +𝓑 B +𝓒 C +𝓓 D +𝓔 E +𝓕 F +𝓖 G +𝓗 H +𝓘 I +𝓙 J +𝓚 K +𝓛 L +𝓜 M +𝓝 N +𝓞 O +𝓟 P +𝓠 Q +𝓡 R +𝓢 S +𝓣 T +𝓤 U +𝓥 V +𝓦 W +𝓧 X +𝓨 Y +𝓩 Z +𝓪 a +𝓫 b +𝓬 c +𝓭 d +𝓮 e +𝓯 f +𝓰 g +𝓱 h +𝓲 i +𝓳 j +𝓴 k +𝓵 l +𝓶 m +𝓷 n +𝓸 o +𝓹 p +𝓺 q +𝓻 r +𝓼 s +𝓽 t +𝓾 u +𝓿 v +𝔀 w +𝔁 x +𝔂 y +𝔃 z +𝔄 A +𝔅 B +𝔇 D +𝔈 E +𝔉 F +𝔊 G +𝔍 J +𝔎 K +𝔏 L +𝔐 M +𝔑 N +𝔒 O +𝔓 P +𝔔 Q +𝔖 S +𝔗 T +𝔘 U +𝔙 V +𝔚 W +𝔛 X +𝔜 Y +𝔞 a +𝔟 b +𝔠 c +𝔡 d +𝔢 e +𝔣 f +𝔤 g +𝔥 h +𝔦 i +𝔧 j +𝔨 k +𝔩 l +𝔪 m +𝔫 n +𝔬 o +𝔭 p +𝔮 q +𝔯 r +𝔰 s +𝔱 t +𝔲 u +𝔳 v +𝔴 w +𝔵 x +𝔶 y +𝔷 z +𝔸 A +𝔹 B +𝔻 D +𝔼 E +𝔽 F +𝔾 G +𝕀 I +𝕁 J +𝕂 K +𝕃 L +𝕄 M +𝕆 O +𝕊 S +𝕋 T +𝕌 U +𝕍 V +𝕎 W +𝕏 X +𝕐 Y +𝕒 a +𝕓 b +𝕔 c +𝕕 d +𝕖 e +𝕗 f +𝕘 g +𝕙 h +𝕚 i +𝕛 j +𝕜 k +𝕝 l +𝕞 m +𝕟 n +𝕠 o +𝕡 p +𝕢 q +𝕣 r +𝕤 s +𝕥 t +𝕦 u +𝕧 v +𝕨 w +𝕩 x +𝕪 y +𝕫 z +𝕬 A +𝕭 B +𝕮 C +𝕯 D +𝕰 E +𝕱 F +𝕲 G +𝕳 H +𝕴 I +𝕵 J +𝕶 K +𝕷 L +𝕸 M +𝕹 N +𝕺 O +𝕻 P +𝕼 Q +𝕽 R +𝕾 S +𝕿 T +𝖀 U +𝖁 V +𝖂 W +𝖃 X +𝖄 Y +𝖅 Z +𝖆 a +𝖇 b +𝖈 c +𝖉 d +𝖊 e +𝖋 f +𝖌 g +𝖍 h +𝖎 i +𝖏 j +𝖐 k +𝖑 l +𝖒 m +𝖓 n +𝖔 o +𝖕 p +𝖖 q +𝖗 r +𝖘 s +𝖙 t +𝖚 u +𝖛 v +𝖜 w +𝖝 x +𝖞 y +𝖟 z +𝖠 A +𝖡 B +𝖢 C +𝖣 D +𝖤 E +𝖥 F +𝖦 G +𝖧 H +𝖨 I +𝖩 J +𝖪 K +𝖫 L +𝖬 M +𝖭 N +𝖮 O +𝖯 P +𝖰 Q +𝖱 R +𝖲 S +𝖳 T +𝖴 U +𝖵 V +𝖶 W +𝖷 X +𝖸 Y +𝖹 Z +𝖺 a +𝖻 b +𝖼 c +𝖽 d +𝖾 e +𝖿 f +𝗀 g +𝗁 h +𝗂 i +𝗃 j +𝗄 k +𝗅 l +𝗆 m +𝗇 n +𝗈 o +𝗉 p +𝗊 q +𝗋 r +𝗌 s +𝗍 t +𝗎 u +𝗏 v +𝗐 w +𝗑 x +𝗒 y +𝗓 z +𝗔 A +𝗕 B +𝗖 C +𝗗 D +𝗘 E +𝗙 F +𝗚 G +𝗛 H +𝗜 I +𝗝 J +𝗞 K +𝗟 L +𝗠 M +𝗡 N +𝗢 O +𝗣 P +𝗤 Q +𝗥 R +𝗦 S +𝗧 T +𝗨 U +𝗩 V +𝗪 W +𝗫 X +𝗬 Y +𝗭 Z +𝗮 a +𝗯 b +𝗰 c +𝗱 d +𝗲 e +𝗳 f +𝗴 g +𝗵 h +𝗶 i +𝗷 j +𝗸 k +𝗹 l +𝗺 m +𝗻 n +𝗼 o +𝗽 p +𝗾 q +𝗿 r +𝘀 s +𝘁 t +𝘂 u +𝘃 v +𝘄 w +𝘅 x +𝘆 y +𝘇 z +𝘈 A +𝘉 B +𝘊 C +𝘋 D +𝘌 E +𝘍 F +𝘎 G +𝘏 H +𝘐 I +𝘑 J +𝘒 K +𝘓 L +𝘔 M +𝘕 N +𝘖 O +𝘗 P +𝘘 Q +𝘙 R +𝘚 S +𝘛 T +𝘜 U +𝘝 V +𝘞 W +𝘟 X +𝘠 Y +𝘡 Z +𝘢 a +𝘣 b +𝘤 c +𝘥 d +𝘦 e +𝘧 f +𝘨 g +𝘩 h +𝘪 i +𝘫 j +𝘬 k +𝘭 l +𝘮 m +𝘯 n +𝘰 o +𝘱 p +𝘲 q +𝘳 r +𝘴 s +𝘵 t +𝘶 u +𝘷 v +𝘸 w +𝘹 x +𝘺 y +𝘻 z +𝘼 A +𝘽 B +𝘾 C +𝘿 D +𝙀 E +𝙁 F +𝙂 G +𝙃 H +𝙄 I +𝙅 J +𝙆 K +𝙇 L +𝙈 M +𝙉 N +𝙊 O +𝙋 P +𝙌 Q +𝙍 R +𝙎 S +𝙏 T +𝙐 U +𝙑 V +𝙒 W +𝙓 X +𝙔 Y +𝙕 Z +𝙖 a +𝙗 b +𝙘 c +𝙙 d +𝙚 e +𝙛 f +𝙜 g +𝙝 h +𝙞 i +𝙟 j +𝙠 k +𝙡 l +𝙢 m +𝙣 n +𝙤 o +𝙥 p +𝙦 q +𝙧 r +𝙨 s +𝙩 t +𝙪 u +𝙫 v +𝙬 w +𝙭 x +𝙮 y +𝙯 z +𝙰 A +𝙱 B +𝙲 C +𝙳 D +𝙴 E +𝙵 F +𝙶 G +𝙷 H +𝙸 I +𝙹 J +𝙺 K +𝙻 L +𝙼 M +𝙽 N +𝙾 O +𝙿 P +𝚀 Q +𝚁 R +𝚂 S +𝚃 T +𝚄 U +𝚅 V +𝚆 W +𝚇 X +𝚈 Y +𝚉 Z +𝚊 a +𝚋 b +𝚌 c +𝚍 d +𝚎 e +𝚏 f +𝚐 g +𝚑 h +𝚒 i +𝚓 j +𝚔 k +𝚕 l +𝚖 m +𝚗 n +𝚘 o +𝚙 p +𝚚 q +𝚛 r +𝚜 s +𝚝 t +𝚞 u +𝚟 v +𝚠 w +𝚡 x +𝚢 y +𝚣 z +𝚨 Α +𝚩 Β +𝚪 Γ +𝚫 Δ +𝚬 Ε +𝚭 Ζ +𝚮 Η +𝚯 Θ +𝚰 Ι +𝚱 Κ +𝚲 Λ +𝚳 Μ +𝚴 Ν +𝚵 Ξ +𝚶 Ο +𝚷 Π +𝚸 Ρ +𝚺 Σ +𝚻 Τ +𝚼 Υ +𝚽 Φ +𝚾 Χ +𝚿 Ψ +𝛀 Ω +𝛂 α +𝛃 β +𝛄 γ +𝛅 δ +𝛆 ε +𝛇 ζ +𝛈 η +𝛉 θ +𝛊 ι +𝛋 κ +𝛌 λ +𝛍 μ +𝛎 ν +𝛏 ξ +𝛐 ο +𝛑 π +𝛒 ρ +𝛓 ς +𝛔 σ +𝛕 τ +𝛖 υ +𝛗 φ +𝛘 χ +𝛙 ψ +𝛚 ω +𝛢 Α +𝛣 Β +𝛤 Γ +𝛥 Δ +𝛦 Ε +𝛧 Ζ +𝛨 Η +𝛩 Θ +𝛪 Ι +𝛫 Κ +𝛬 Λ +𝛭 Μ +𝛮 Ν +𝛯 Ξ +𝛰 Ο +𝛱 Π +𝛲 Ρ +𝛴 Σ +𝛵 Τ +𝛶 Υ +𝛷 Φ +𝛸 Χ +𝛹 Ψ +𝛺 Ω +𝛼 α +𝛽 β +𝛾 γ +𝛿 δ +𝜀 ε +𝜁 ζ +𝜂 η +𝜃 θ +𝜄 ι +𝜅 κ +𝜆 λ +𝜇 μ +𝜈 ν +𝜉 ξ +𝜊 ο +𝜋 π +𝜌 ρ +𝜍 ς +𝜎 σ +𝜏 τ +𝜐 υ +𝜑 φ +𝜒 χ +𝜓 ψ +𝜔 ω +𝜜 Α +𝜝 Β +𝜞 Γ +𝜟 Δ +𝜠 Ε +𝜡 Ζ +𝜢 Η +𝜣 Θ +𝜤 Ι +𝜥 Κ +𝜦 Λ +𝜧 Μ +𝜨 Ν +𝜩 Ξ +𝜪 Ο +𝜫 Π +𝜬 Ρ +𝜮 Σ +𝜯 Τ +𝜰 Υ +𝜱 Φ +𝜲 Χ +𝜳 Ψ +𝜴 Ω +𝜶 α +𝜷 β +𝜸 γ +𝜹 δ +𝜺 ε +𝜻 ζ +𝜼 η +𝜽 θ +𝜾 ι +𝜿 κ +𝝀 λ +𝝁 μ +𝝂 ν +𝝃 ξ +𝝄 ο +𝝅 π +𝝆 ρ +𝝇 ς +𝝈 σ +𝝉 τ +𝝊 υ +𝝋 φ +𝝌 χ +𝝍 ψ +𝝎 ω +𝝖 Α +𝝗 Β +𝝘 Γ +𝝙 Δ +𝝚 Ε +𝝛 Ζ +𝝜 Η +𝝝 Θ +𝝞 Ι +𝝟 Κ +𝝠 Λ +𝝡 Μ +𝝢 Ν +𝝣 Ξ +𝝤 Ο +𝝥 Π +𝝦 Ρ +𝝨 Σ +𝝩 Τ +𝝪 Υ +𝝫 Φ +𝝬 Χ +𝝭 Ψ +𝝮 Ω +𝝰 α +𝝱 β +𝝲 γ +𝝳 δ +𝝴 ε +𝝵 ζ +𝝶 η +𝝷 θ +𝝸 ι +𝝹 κ +𝝺 λ +𝝻 μ +𝝼 ν +𝝽 ξ +𝝾 ο +𝝿 π +𝞀 ρ +𝞁 ς +𝞂 σ +𝞃 τ +𝞄 υ +𝞅 φ +𝞆 χ +𝞇 ψ +𝞈 ω +𝞐 Α +𝞑 Β +𝞒 Γ +𝞓 Δ +𝞔 Ε +𝞕 Ζ +𝞖 Η +𝞗 Θ +𝞘 Ι +𝞙 Κ +𝞚 Λ +𝞛 Μ +𝞜 Ν +𝞝 Ξ +𝞞 Ο +𝞟 Π +𝞠 Ρ +𝞢 Σ +𝞣 Τ +𝞤 Υ +𝞥 Φ +𝞦 Χ +𝞧 Ψ +𝞨 Ω +𝞪 α +𝞫 β +𝞬 γ +𝞭 δ +𝞮 ε +𝞯 ζ +𝞰 η +𝞱 θ +𝞲 ι +𝞳 κ +𝞴 λ +𝞵 μ +𝞶 ν +𝞷 ξ +𝞸 ο +𝞹 π +𝞺 ρ +𝞻 ς +𝞼 σ +𝞽 τ +𝞾 υ +𝞿 φ +𝟀 χ +𝟁 ψ +𝟂 ω 🄀 0. 🄁 0, 🄂 1, -- 2.39.2