From 8d37b2d70b95c57cc4f02727bc7f064b7fffecec Mon Sep 17 00:00:00 2001 From: Przemyslaw Sztoch Date: Fri, 1 Jul 2022 13:43:30 +0200 Subject: [PATCH 2/2] Unaccent More characters category --- contrib/unaccent/expected/unaccent.out | 6 + contrib/unaccent/generate_unaccent_rules.py | 16 +- contrib/unaccent/sql/unaccent.sql | 2 + contrib/unaccent/unaccent.rules | 292 ++++++++++++++++++++ 4 files changed, 311 insertions(+), 5 deletions(-) diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out index 090a0b3889..b3e9310842 100644 --- a/contrib/unaccent/expected/unaccent.out +++ b/contrib/unaccent/expected/unaccent.out @@ -121,3 +121,9 @@ SELECT unaccent('unaccent', '⅓ ⅜ ℃ ℉ ℗'); 1/3 3/8 °C °F (P) (1 row) +SELECT unaccent('unaccent', '㎡ ㎥ ㎢ ㎣ ㎧'); + unaccent +------------------- + m2 m3 km2 mm3 m/s +(1 row) + diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index 71932c8224..e9076a8ffe 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -106,7 +106,7 @@ def is_letter_with_marks(codepoint, table): # Letter may have no combining characters, in which case it has # no marks. if len(codepoint.combining_ids) == 1: - return False + return codepoint.combining_ids[0] in table and is_letter(table[codepoint.combining_ids[0]], table) # A letter without diacritical marks has none of them. if any(is_mark(table[i]) for i in codepoint.combining_ids[1:]) is False: @@ -131,13 +131,13 @@ def get_plain_letter(codepoint, table): than one combining character, do a recursive lookup on the table to find out its plain base letter.""" if is_letter_with_marks(codepoint, table): - if len(table[codepoint.combining_ids[0]].combining_ids) > 1: + if len(table[codepoint.combining_ids[0]].combining_ids) >= 1: return get_plain_letter(table[codepoint.combining_ids[0]], table) elif is_plain_letter(table[codepoint.combining_ids[0]]): return table[codepoint.combining_ids[0]] # Should not come here - assert(False) + assert False, 'Codepoint U+%0.2X' % codepoint.id elif is_plain_letter(codepoint): return codepoint @@ -208,8 +208,8 @@ def special_cases(): """Returns the special cases which are not handled by other methods""" charactersDict = {} - charactersDict[0x2103] = "\xb0C" # DEGREE CELSIUS - charactersDict[0x2109] = "\xb0F" # DEGREE FAHRENHEIT + # Template example (already unnecessary): + #charactersDict[0x2103] = "\xb0C" # DEGREE CELSIUS return charactersDict @@ -252,6 +252,12 @@ def main(args): charactersDict[codepoint.id] = "".join(chr(combining_codepoint.id) for combining_codepoint in get_plain_letters(codepoint, table)) + elif (codepoint.general_category.startswith('N') or codepoint.general_category.startswith('So')) and \ + len(codepoint.combining_ids) > 0 and \ + args.noLigaturesExpansion is False and is_ligature(codepoint, table): + charactersDict[codepoint.id] = "".join(chr(combining_codepoint.id) + for combining_codepoint + in get_plain_letters(codepoint, table)) elif is_mark_to_remove(codepoint): charactersDict[codepoint.id] = None diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql index 4c6b8c0cd8..eac5d89dae 100644 --- a/contrib/unaccent/sql/unaccent.sql +++ b/contrib/unaccent/sql/unaccent.sql @@ -27,3 +27,5 @@ SELECT ts_lexize('unaccent', '˃˖˗˜'); SELECT ts_lexize('unaccent', 'À'); SELECT unaccent('unaccent', '⅓ ⅜ ℃ ℉ ℗'); + +SELECT unaccent('unaccent', '㎡ ㎥ ㎢ ㎣ ㎧'); diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules index 3a5d35627a..0e709af1da 100644 --- a/contrib/unaccent/unaccent.rules +++ b/contrib/unaccent/unaccent.rules @@ -5,7 +5,10 @@ ­ - ® (R) ± +/- +² 2 +³ 3 µ μ +¹ 1 º o » >> ¼ 1/4 @@ -550,6 +553,8 @@ ϐ β ϑ θ ϒ Υ +ϓ Υ +ϔ Υ ϕ φ ϖ π ϰ κ @@ -860,6 +865,7 @@ ẘ w ẙ y ẚ a +ẛ s ẜ s ẝ s ẞ SS @@ -1207,8 +1213,25 @@ ⁈ ?! ⁉ !? ⁎ * +⁰ 0 ⁱ i +⁴ 4 +⁵ 5 +⁶ 6 +⁷ 7 +⁸ 8 +⁹ 9 ⁿ n +₀ 0 +₁ 1 +₂ 2 +₃ 3 +₄ 4 +₅ 5 +₆ 6 +₇ 7 +₈ 8 +₉ 9 ₐ a ₑ e ₒ o @@ -1261,7 +1284,9 @@ ℜ R ℝ R ℞ Rx +℠ SM ℡ TEL +™ TM ℤ Z Ω Ω ℨ Z @@ -1341,6 +1366,26 @@ ∥ || ≪ << ≫ >> +① 1 +② 2 +③ 3 +④ 4 +⑤ 5 +⑥ 6 +⑦ 7 +⑧ 8 +⑨ 9 +⑩ 10 +⑪ 11 +⑫ 12 +⑬ 13 +⑭ 14 +⑮ 15 +⑯ 16 +⑰ 17 +⑱ 18 +⑲ 19 +⑳ 20 ⑴ (1) ⑵ (2) ⑶ (3) @@ -1407,6 +1452,59 @@ ⒳ (x) ⒴ (y) ⒵ (z) +Ⓐ A +Ⓑ B +Ⓒ C +Ⓓ D +Ⓔ E +Ⓕ F +Ⓖ G +Ⓗ H +Ⓘ I +Ⓙ J +Ⓚ K +Ⓛ L +Ⓜ M +Ⓝ N +Ⓞ O +Ⓟ P +Ⓠ Q +Ⓡ R +Ⓢ S +Ⓣ T +Ⓤ U +Ⓥ V +Ⓦ W +Ⓧ X +Ⓨ Y +Ⓩ Z +ⓐ a +ⓑ b +ⓒ c +ⓓ d +ⓔ e +ⓕ f +ⓖ g +ⓗ h +ⓘ i +ⓙ j +ⓚ k +ⓛ l +ⓜ m +ⓝ n +ⓞ o +ⓟ p +ⓠ q +ⓡ r +ⓢ s +ⓣ t +ⓤ u +ⓥ v +ⓦ w +ⓧ x +ⓨ y +ⓩ z +⓪ 0 ⦅ (( ⦆ )) ⩴ ::= @@ -1451,6 +1549,41 @@ 〛 ] 〝 " 〞 " +㉐ PTE +㉑ 21 +㉒ 22 +㉓ 23 +㉔ 24 +㉕ 25 +㉖ 26 +㉗ 27 +㉘ 28 +㉙ 29 +㉚ 30 +㉛ 31 +㉜ 32 +㉝ 33 +㉞ 34 +㉟ 35 +㊱ 36 +㊲ 37 +㊳ 38 +㊴ 39 +㊵ 40 +㊶ 41 +㊷ 42 +㊸ 43 +㊹ 44 +㊺ 45 +㊻ 46 +㊼ 47 +㊽ 48 +㊾ 49 +㊿ 50 +㋌ Hg +㋍ erg +㋎ eV +㋏ LTD ㍱ hPa ㍲ da ㍳ AU @@ -1458,9 +1591,12 @@ ㍵ oV ㍶ pc ㍷ dm +㍸ dm2 +㍹ dm3 ㍺ IU ㎀ pA ㎁ nA +㎂ μA ㎃ mA ㎄ kA ㎅ KB @@ -1470,6 +1606,8 @@ ㎉ kcal ㎊ pF ㎋ nF +㎌ μF +㎍ μg ㎎ mg ㎏ kg ㎐ Hz @@ -1477,11 +1615,24 @@ ㎒ MHz ㎓ GHz ㎔ THz +㎕ μl +㎖ ml +㎗ dl +㎘ kl ㎙ fm ㎚ nm +㎛ μm ㎜ mm ㎝ cm ㎞ km +㎟ mm2 +㎠ cm2 +㎡ m2 +㎢ km2 +㎣ mm3 +㎤ cm3 +㎥ m3 +㎦ km3 ㎧ m/s ㎩ Pa ㎪ kPa @@ -1491,17 +1642,22 @@ ㎮ rad/s ㎰ ps ㎱ ns +㎲ μs ㎳ ms ㎴ pV ㎵ nV +㎶ μV ㎷ mV ㎸ kV ㎹ MV ㎺ pW ㎻ nW +㎼ μW ㎽ mW ㎾ kW ㎿ MW +㏀ kΩ +㏁ MΩ ㏂ a.m. ㏃ Bq ㏄ cc @@ -1532,6 +1688,7 @@ ㏝ Wb ㏞ V/m ㏟ A/m +㏿ gal ꚜ ъ ꚝ ь ꜰ F @@ -2439,6 +2596,7 @@ 𝚶 Ο 𝚷 Π 𝚸 Ρ +𝚹 Θ 𝚺 Σ 𝚻 Τ 𝚼 Υ @@ -2471,6 +2629,12 @@ 𝛘 χ 𝛙 ψ 𝛚 ω +𝛜 ε +𝛝 θ +𝛞 κ +𝛟 φ +𝛠 ρ +𝛡 π 𝛢 Α 𝛣 Β 𝛤 Γ @@ -2488,6 +2652,7 @@ 𝛰 Ο 𝛱 Π 𝛲 Ρ +𝛳 Θ 𝛴 Σ 𝛵 Τ 𝛶 Υ @@ -2520,6 +2685,12 @@ 𝜒 χ 𝜓 ψ 𝜔 ω +𝜖 ε +𝜗 θ +𝜘 κ +𝜙 φ +𝜚 ρ +𝜛 π 𝜜 Α 𝜝 Β 𝜞 Γ @@ -2537,6 +2708,7 @@ 𝜪 Ο 𝜫 Π 𝜬 Ρ +𝜭 Θ 𝜮 Σ 𝜯 Τ 𝜰 Υ @@ -2569,6 +2741,12 @@ 𝝌 χ 𝝍 ψ 𝝎 ω +𝝐 ε +𝝑 θ +𝝒 κ +𝝓 φ +𝝔 ρ +𝝕 π 𝝖 Α 𝝗 Β 𝝘 Γ @@ -2586,6 +2764,7 @@ 𝝤 Ο 𝝥 Π 𝝦 Ρ +𝝧 Θ 𝝨 Σ 𝝩 Τ 𝝪 Υ @@ -2618,6 +2797,12 @@ 𝞆 χ 𝞇 ψ 𝞈 ω +𝞊 ε +𝞋 θ +𝞌 κ +𝞍 φ +𝞎 ρ +𝞏 π 𝞐 Α 𝞑 Β 𝞒 Γ @@ -2635,6 +2820,7 @@ 𝞞 Ο 𝞟 Π 𝞠 Ρ +𝞡 Θ 𝞢 Σ 𝞣 Τ 𝞤 Υ @@ -2667,6 +2853,62 @@ 𝟀 χ 𝟁 ψ 𝟂 ω +𝟄 ε +𝟅 θ +𝟆 κ +𝟇 φ +𝟈 ρ +𝟉 π +𝟎 0 +𝟏 1 +𝟐 2 +𝟑 3 +𝟒 4 +𝟓 5 +𝟔 6 +𝟕 7 +𝟖 8 +𝟗 9 +𝟘 0 +𝟙 1 +𝟚 2 +𝟛 3 +𝟜 4 +𝟝 5 +𝟞 6 +𝟟 7 +𝟠 8 +𝟡 9 +𝟢 0 +𝟣 1 +𝟤 2 +𝟥 3 +𝟦 4 +𝟧 5 +𝟨 6 +𝟩 7 +𝟪 8 +𝟫 9 +𝟬 0 +𝟭 1 +𝟮 2 +𝟯 3 +𝟰 4 +𝟱 5 +𝟲 6 +𝟳 7 +𝟴 8 +𝟵 9 +𝟶 0 +𝟷 1 +𝟸 2 +𝟹 3 +𝟺 4 +𝟻 5 +𝟼 6 +𝟽 7 +𝟾 8 +𝟿 9 🄀 0. 🄁 0, 🄂 1, @@ -2704,3 +2946,53 @@ 🄧 (X) 🄨 (Y) 🄩 (Z) +🄫 C +🄬 R +🄭 CD +🄮 WZ +🄰 A +🄱 B +🄲 C +🄳 D +🄴 E +🄵 F +🄶 G +🄷 H +🄸 I +🄹 J +🄺 K +🄻 L +🄼 M +🄽 N +🄾 O +🄿 P +🅀 Q +🅁 R +🅂 S +🅃 T +🅄 U +🅅 V +🅆 W +🅇 X +🅈 Y +🅉 Z +🅊 HV +🅋 MV +🅌 SD +🅍 SS +🅎 PPV +🅏 WC +🅪 MC +🅫 MD +🅬 MR +🆐 DJ +🯰 0 +🯱 1 +🯲 2 +🯳 3 +🯴 4 +🯵 5 +🯶 6 +🯷 7 +🯸 8 +🯹 9 -- 2.34.1