Skip to content

Commit 3ca02f1

Browse files
committed
Fix rules that have a single space on the RHS. Closes #2
1 parent c95e844 commit 3ca02f1

77 files changed

Lines changed: 21856 additions & 785 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,17 @@
11
# Changelog
22

3+
## Unicode Transform 0.4.0
4+
5+
This is the changelog for Unicode Transform 0.4.0 released on 8th December, 2025. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_transform/tags)
6+
7+
### Bug Fixes
8+
9+
* Fix the rules generator so that a rule whose replacement is a single ASCII space is no longer trimmed to an empty string (which caused the matched character to be deleted instead of replaced with a space). Thanks to @arcanemachine for the report. Closes #2.
10+
11+
### Enhancements
12+
13+
* Updates to CLDR 47 Transform rules (most of which remain unimplemented in this release)
14+
315
## Unicode Transform 0.3.0
416

517
This is the changelog for Unicode Transform 0.3.0 released on April 18th, 2022. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_transform/tags)

lib/transforms/latin_ascii.ex

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ defmodule Unicode.Transform.LatinAscii do
55
# Source: Latin
66
# Target: ASCII
77
# Transform direction: both
8-
# Transform alias: und-t-d0-ascii und-Latn-t-s0-ascii
8+
# Transform alias: und-t-und-latn-d0-ascii
99

1010
#
1111
# This handles only Latin, Common, and IDEOGRAPHIC NUMBER ZERO (Han).
@@ -1391,29 +1391,29 @@ defmodule Unicode.Transform.LatinAscii do
13911391
# Spaces
13921392
#
13931393
# 00A0;NO-BREAK SPACE
1394-
replace("\u00A0", "")
1394+
replace("\u00A0", " ")
13951395
# 2002;EN SPACE (compat)
1396-
replace("\u2002", "")
1396+
replace("\u2002", " ")
13971397
# 2003;EM SPACE (compat)
1398-
replace("\u2003", "")
1398+
replace("\u2003", " ")
13991399
# 2004;THREE-PER-EM SPACE (compat)
1400-
replace("\u2004", "")
1400+
replace("\u2004", " ")
14011401
# 2005;FOUR-PER-EM SPACE (compat)
1402-
replace("\u2005", "")
1402+
replace("\u2005", " ")
14031403
# 2006;SIX-PER-EM SPACE (compat)
1404-
replace("\u2006", "")
1404+
replace("\u2006", " ")
14051405
# 2007;FIGURE SPACE (compat)
1406-
replace("\u2007", "")
1406+
replace("\u2007", " ")
14071407
# 2008;PUNCTUATION SPACE (compat)
1408-
replace("\u2008", "")
1408+
replace("\u2008", " ")
14091409
# 2009;THIN SPACE (compat)
1410-
replace("\u2009", "")
1410+
replace("\u2009", " ")
14111411
# 200A;HAIR SPACE (compat)
1412-
replace("\u200A", "")
1412+
replace("\u200A", " ")
14131413
# 205F;MEDIUM MATHEMATICAL SPACE (compat)
1414-
replace("\u205F", "")
1414+
replace("\u205F", " ")
14151415
# 3000;IDEOGRAPHIC SPACE (from ‹character-fallback›)
1416-
replace("\u3000", "")
1416+
replace("\u3000", " ")
14171417
#
14181418
# Quotes, apostrophes
14191419
#

lib/unicode/transform/rule/conversion.ex

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,7 @@ defmodule Unicode.Transform.Rule.Conversion do
287287
Enum.map(list, fn
288288
nil -> nil
289289
"" -> nil
290+
" " -> " "
290291
other -> String.trim(other)
291292
end)
292293
end

mix.exs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
defmodule UnicodeTransform.MixProject do
22
use Mix.Project
33

4-
@version "0.3.0"
4+
@version "0.4.0"
55

66
def project do
77
[

mix.lock

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
%{
2-
"dialyxir": {:hex, :dialyxir, "1.1.0", "c5aab0d6e71e5522e77beff7ba9e08f8e02bad90dfbeffae60eaf0cb47e29488", [:mix], [{:erlex, ">= 0.2.6", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "07ea8e49c45f15264ebe6d5b93799d4dd56a44036cf42d0ad9c960bc266c0b9a"},
3-
"earmark_parser": {:hex, :earmark_parser, "1.4.25", "2024618731c55ebfcc5439d756852ec4e85978a39d0d58593763924d9a15916f", [:mix], [], "hexpm", "56749c5e1c59447f7b7a23ddb235e4b3defe276afc220a6227237f3efe83f51e"},
4-
"erlex": {:hex, :erlex, "0.2.6", "c7987d15e899c7a2f34f5420d2a2ea0d659682c06ac607572df55a43753aa12e", [:mix], [], "hexpm", "2ed2e25711feb44d52b17d2780eabf998452f6efda104877a3881c2f8c0c0c75"},
5-
"ex_doc": {:hex, :ex_doc, "0.28.3", "6eea2f69995f5fba94cd6dd398df369fe4e777a47cd887714a0976930615c9e6", [:mix], [{:earmark_parser, "~> 1.4.19", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "05387a6a2655b5f9820f3f627450ed20b4325c25977b2ee69bed90af6688e718"},
2+
"dialyxir": {:hex, :dialyxir, "1.4.7", "dda948fcee52962e4b6c5b4b16b2d8fa7d50d8645bbae8b8685c3f9ecb7f5f4d", [:mix], [{:erlex, ">= 0.2.8", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "b34527202e6eb8cee198efec110996c25c5898f43a4094df157f8d28f27d9efe"},
3+
"earmark_parser": {:hex, :earmark_parser, "1.4.44", "f20830dd6b5c77afe2b063777ddbbff09f9759396500cdbe7523efd58d7a339c", [:mix], [], "hexpm", "4778ac752b4701a5599215f7030989c989ffdc4f6df457c5f36938cc2d2a2750"},
4+
"erlex": {:hex, :erlex, "0.2.8", "cd8116f20f3c0afe376d1e8d1f0ae2452337729f68be016ea544a72f767d9c12", [:mix], [], "hexpm", "9d66ff9fedf69e49dc3fd12831e12a8a37b76f8651dd21cd45fcf5561a8a7590"},
5+
"ex_doc": {:hex, :ex_doc, "0.39.2", "da5549bbce34c5fb0811f829f9f6b7a13d5607b222631d9e989447096f295c57", [:mix], [{:earmark_parser, "~> 1.4.44", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "62665526a88c207653dbcee2aac66c2c229d7c18a70ca4ffc7f74f9e01324daa"},
66
"ex_unicode": {:hex, :ex_unicode, "1.11.1", "483d9ad4da4b6a83e88a9063d62de6ea431735857c47360b6a73b5b231b13133", [:mix], [], "hexpm", "5946bc5b4f8edfc5124dcf82f42aebd3832eb0fef4bfa03a7d9b8c2d6c402bc7"},
7-
"makeup": {:hex, :makeup, "1.1.0", "6b67c8bc2882a6b6a445859952a602afc1a41c2e08379ca057c0f525366fc3ca", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "0a45ed501f4a8897f580eabf99a2e5234ea3e75a4373c8a52824f6e873be57a6"},
8-
"makeup_elixir": {:hex, :makeup_elixir, "0.16.0", "f8c570a0d33f8039513fbccaf7108c5d750f47d8defd44088371191b76492b0b", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "28b2cbdc13960a46ae9a8858c4bebdec3c9a6d7b4b9e7f4ed1502f8159f338e7"},
9-
"makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"},
10-
"nimble_parsec": {:hex, :nimble_parsec, "1.2.3", "244836e6e3f1200c7f30cb56733fd808744eca61fd182f731eac4af635cc6d0b", [:mix], [], "hexpm", "c8d789e39b9131acf7b99291e93dae60ab48ef14a7ee9d58c6964f59efb570b0"},
11-
"sweet_xml": {:hex, :sweet_xml, "0.7.3", "debb256781c75ff6a8c5cbf7981146312b66f044a2898f453709a53e5031b45b", [:mix], [], "hexpm", "e110c867a1b3fe74bfc7dd9893aa851f0eed5518d0d7cad76d7baafd30e4f5ba"},
7+
"makeup": {:hex, :makeup, "1.2.1", "e90ac1c65589ef354378def3ba19d401e739ee7ee06fb47f94c687016e3713d1", [:mix], [{:nimble_parsec, "~> 1.4", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "d36484867b0bae0fea568d10131197a4c2e47056a6fbe84922bf6ba71c8d17ce"},
8+
"makeup_elixir": {:hex, :makeup_elixir, "1.0.1", "e928a4f984e795e41e3abd27bfc09f51db16ab8ba1aebdba2b3a575437efafc2", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "7284900d412a3e5cfd97fdaed4f5ed389b8f2b4cb49efc0eb3bd10e2febf9507"},
9+
"makeup_erlang": {:hex, :makeup_erlang, "1.0.2", "03e1804074b3aa64d5fad7aa64601ed0fb395337b982d9bcf04029d68d51b6a7", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "af33ff7ef368d5893e4a267933e7744e46ce3cf1f61e2dccf53a111ed3aa3727"},
10+
"nimble_parsec": {:hex, :nimble_parsec, "1.4.2", "8efba0122db06df95bfaa78f791344a89352ba04baedd3849593bfce4d0dc1c6", [:mix], [], "hexpm", "4b21398942dda052b403bbe1da991ccd03a053668d147d53fb8c4e0efe09c973"},
11+
"sweet_xml": {:hex, :sweet_xml, "0.7.5", "803a563113981aaac202a1dbd39771562d0ad31004ddbfc9b5090bdcd5605277", [:mix], [], "hexpm", "193b28a9b12891cae351d81a0cead165ffe67df1b73fe5866d10629f4faefb12"},
1212
"uncode_set": {:git, "https://github.com/elixir-unicode/unicode_set.git", "67920e8d743ff6576a53104a3fe084b69bfb9ec5", []},
13-
"unicode": {:hex, :unicode, "1.13.1", "4ebb6dc60e91f04e0dd46dca93576416473bc22bae69484122c11682af48e8df", [:mix], [], "hexpm", "fe4fcc8e15f444cf07bfbca30ad726be99c12d45da06d70b0e4f0c49b0d7d5ce"},
14-
"unicode_set": {:hex, :unicode_set, "1.1.0", "32971b9b8061f2b2f0c607ba588ad9a6202e78a0c577555df4da899fd4434f23", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}, {:unicode, "~> 1.13", [hex: :unicode, repo: "hexpm", optional: false]}], "hexpm", "b416e7b18d1297bbdc25c05e56506291e8fd3f339b8386d4e1f6ddb57c047918"},
13+
"unicode": {:hex, :unicode, "1.20.0", "10189cfe98b03ebb8be6efd00df0936c1c94d75bfbd62cba2bdf958fef3ee4a7", [:mix], [], "hexpm", "fa581cf80b3b1b7f42e4d24a69109dfac465cec27a62c661306c81f4ab35894c"},
14+
"unicode_set": {:hex, :unicode_set, "1.5.0", "f2dcc40b1e8daf1a04433c705d9a8fb8ccdfc8fd5763a92d414a3e0775414cfb", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}, {:unicode, "~> 1.13", [hex: :unicode, repo: "hexpm", optional: false]}], "hexpm", "6c7f200e52fb90434d6b783eaa4e0ea303cfc4844ea25b2fc1ba3eb8a6901b11"},
1515
}

test/transform_test.exs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,5 +26,9 @@ defmodule TransformTest do
2626

2727
assert Unicode.Transform.LatinAscii.transform(text) == after_transform
2828
end
29+
30+
test "that non-breaking space becomes ASCII space in Latin-ASCII" do
31+
assert "There and Back Again" = Unicode.Transform.LatinAscii.transform("There\u00a0and Back\u00a0Again")
32+
end
2933
end
3034
end

transforms/Any-Accents.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ For terms of use, see http://www.unicode.org/copyright.html
99
<version number="$Revision$"/>
1010
<transforms>
1111
<transform source="Any" target="Accents" direction="both" alias="und-t-d0-accents" backwardAlias="und-t-s0-accents">
12-
<tRule>
12+
<tRule><![CDATA[
1313
:: NFD (NFC) ;
1414
# to do: make reversible
1515
# define special conversion characters.
@@ -22,7 +22,7 @@ $pre \' $post ↔ ́ ; # COMBINING ACUTE ACCENT
2222
$pre \^ $post ↔ ̂ ; # COMBINING CIRCUMFLEX ACCENT
2323
$pre \~ $post ↔ ̃ ; # COMBINING TILDE
2424
$pre \- $post ↔ ̄ ; # COMBINING MACRON
25-
$pre \&quot; $post ↔ ̈ ; # COMBINING DIAERESIS
25+
$pre \" $post ↔ ̈ ; # COMBINING DIAERESIS
2626
$pre \* $post ↔ ̊ ; # COMBINING RING ABOVE
2727
$pre \, $post ↔ ̧ ; # COMBINING CEDILLA
2828
$pre '/' $post ↔ ̸ ; # COMBINING LONG SOLIDUS OVERLAY
@@ -278,7 +278,7 @@ $pre v $post ↔ ʌ ; # LATIN SMALL LETTER TURNED V
278278
# $pre YYY $post ↔ ẚ ; # LATIN SMALL LETTER A WITH RIGHT HALF RING
279279
# $pre YYY $post ↔ ⁿ ; # SUPERSCRIPT LATIN SMALL LETTER N
280280
:: NFC (NFD) ;
281-
</tRule>
281+
]]></tRule>
282282
</transform>
283283
</transforms>
284284
</supplementalData>

transforms/Arabic-Latin-BGN.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ For terms of use, see http://www.unicode.org/copyright.html
2727
# MINIMAL FILTER: Arabic-Latin
2828
#
2929

30-
:: [[:arabic:][:block=ARABIC:][ءآابةتثجحخدذرزسشصضطظعغفقكلمنهوىيًٌٍَُِّْ٠١٢٣٤٥٦٧٨٩ٱ]] ;
30+
:: [[:Arabic:][:Block=Arabic:][ءآابةتثجحخدذرزسشصضطظعغفقكلمنهوىيًٌٍَُِّْ٠١٢٣٤٥٦٧٨٩ٱ]] ;
3131
:: NFKD (NFC) ;
3232
#
3333
#

transforms/Arabic-Latin.xml

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ For terms of use, see http://www.unicode.org/copyright.html
2222
# Does *not* do assimilation of "al", nor hyphenation.
2323
# While it could be done, we need to determine whether a prefix "al" could
2424
# occur other than as the definite article (since no space is used).
25-
:: [[:Arabic:][:block=ARABIC:][‎ⁿ،؛؟ـً-ٕ٠-٬۰-۹﷼ښ]] ;
25+
:: [[:Arabic:][:Block=Arabic:][‎ⁿ،؛؟ـً-ٕ٠-٬۰-۹﷼ښ][\u0611\u0670]] ;
2626
:: NFKD (NFC);
2727
$disambig = ̱ ;
2828
$disambig2 = ̰ ;
@@ -61,6 +61,10 @@ $notAbove = [[:^ccc=0:] & [:^ccc=230:]];
6161
٨ ↔ 8 ; # ARABIC-INDIC DIGIT EIGHT
6262
٩ ↔ 9 ; # ARABIC-INDIC DIGIT NINE
6363
64+
؉ ↔ ‰ ; # U+0609 ARABIC-INDIC PER MILLE SIGN
65+
؊ ↔ ‱ ; # U+060A ARABIC-INDIC PER TEN THOUSAND SIGN
66+
‎۔‎ ↔ '.' ; # U+06D4 ARABIC FULL STOP
67+
6468
# letters
6569
# long vowels
6670
َا↔ ā ; # ARABIC FATHA, ARABIC LETTER ALEF
@@ -137,6 +141,58 @@ $notAbove = [[:^ccc=0:] & [:^ccc=230:]];
137141
# ڢ ↔ f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW
138142
گ ↔ g ; # ARABIC LETTER GAF
139143
144+
# fallbacks TODO roundtrip where possible, using diacritics to distinguish
145+
#https://en.wikipedia.org/wiki/Sindhi_transliteration
146+
‎ٺ‎→ṭh;
147+
‎ٿ‎→th;
148+
‎ٽ‎→ṭ;
149+
‎ڙ‎→ṛ;
150+
‎ڦ‎→ph;
151+
‎ڻ‎→ṇ;
152+
‎ڱ‎→ṅ;
153+
‎ڃ‎→ñ;
154+
‎ڪ‎→k;
155+
‎ڄ‎→j̈;
156+
‎ۃ‎→ẖ;
157+
‎ڳ‎→g̤;
158+
‎ڍ‎→ḍh;
159+
‎ڌ‎→dh;
160+
‎ڏ‎→d̤;
161+
‎ڊ‎→ḍ;
162+
‎ڇ‎→ch;
163+
‎ڀ‎→bh;
164+
‎ٻ‎→ḇ;
165+
‎۽‎→'&';
166+
‎۾‎→'mn';
167+
168+
#https://en.wiktionary.org/wiki/Wiktionary:Urdu_transliteration
169+
‎ھ‎ → ʱ ;
170+
‎ں‎ → ◌̃ ;
171+
‎ے‎ → ai ;
172+
‎ڈ‎ → ḍ ;
173+
‎ڑ‎ → ṛ ;
174+
‎ٹ‎ → ṭ ;
175+
176+
#https://www.eki.ee/wgrs/rom2_ps.htm
177+
#https://en.wikipedia.org/wiki/Pashto_alphabet
178+
‎ټ‎ → ṯ ;
179+
‎ځ‎ → dz ;
180+
‎څ‎ → ts ;
181+
‎ډ‎ → ḏ ;
182+
‎ړ‎ → ṟ ;
183+
‎ږ‎ → z͟h ;
184+
‎ګ‎ → g ;
185+
‎ڼ‎ → ṉ ;
186+
‎ۍ‎ → ạy ;
187+
‎ې‎ → e ;
188+
189+
#https://www.eki.ee/wgrs/rom1_ug.pdf
190+
‎ہ‎ → ḥ ;
191+
‎ە‎ → ĥ ;
192+
193+
# Delete marks without correspondents
194+
[\u0611\u0670] → ;
195+
140196
# fallbacks
141197
| s ← c } [eiy];
142198
| k ← c ;

transforms/Bengali-Latin.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ For terms of use, see http://www.unicode.org/copyright.html
1010
<transforms>
1111
<transform source="Beng" target="Latn" direction="forward" alias="Bengali-Latin und-Latn-t-und-beng">
1212
<tRule>
13-
::[[:script=bengali:][।-॥ঁ-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ়-ৄে-ৈো-্ৗড়-ঢ়য়-ৣ০-৺ৎ]];
13+
::[[:Script=Bengali:][।-॥ঁ-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ়-ৄে-ৈো-্ৗড়-ঢ়য়-ৣ০-৺ৎ]];
1414
::NFD;
1515
::Bengali-InterIndic;
1616
::InterIndic-Latin;

0 commit comments

Comments
 (0)