KasunUoM committed on
Commit
b6e493a
·
verified ·
1 Parent(s): 793cc00

Script to Romanize Sinhala Text

Browse files
Files changed (1) hide show
  1. romanizer.py +131 -0
romanizer.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Converts Sinhala text to standard romanized text
2
+
3
+ import re
4
+
5
+ # -- Specials (vowels, diacritics, standalone signs) --
6
+
7
# Standalone letters and signs as [sinhala, roman] pairs.
# NOTE(review): a few Sinhala keys appear twice with different roman values.
# Under a sequential replace only the first pair for a key can match, so the
# second looks like an alternate spelling kept for reference -- confirm.
ro_specials = [
    # two-letter and dotted roman values
    ['ඓ', 'ai'],
    ['ඖ', 'au'],
    ['ඍ', 'ṛ'],
    ['ඎ', 'ṝ'],
    ['ඐ', 'ḹ'],

    # a / æ
    ['අ', 'a'],
    ['ආ', 'ā'],
    ['ඇ', 'æ'],
    ['ඇ', 'Æ'],
    ['ඈ', 'ǣ'],

    # i / u
    ['ඉ', 'i'],
    ['ඊ', 'ī'],
    ['උ', 'u'],
    ['ඌ', 'ū'],

    # e / o
    ['එ', 'e'],
    ['ඒ', 'ē'],
    ['ඔ', 'o'],
    ['ඕ', 'ō'],

    # remaining signs
    ['ඞ්', 'ṅ'],
    ['ං', 'ṁ'],
    ['ං', 'ṃ'],
    ['ඃ', 'ḥ'],
    ['ඃ', 'Ḥ'],
]
29
+
30
+ # -- Consonants --
31
+
32
# Consonant letters as [sinhala, roman-stem] pairs; the roman stem carries no
# vowel (the vowel part is supplied separately when combinations are built).
ro_consonants = [
    # stems written with two roman letters
    ['ඛ', 'kh'], ['ඨ', 'ṭh'], ['ඝ', 'gh'], ['ඡ', 'ch'],
    ['ඣ', 'jh'], ['ඦ', 'ñj'], ['ඪ', 'ḍh'], ['ඬ', 'ṇḍ'],
    ['ථ', 'th'], ['ධ', 'dh'], ['ඵ', 'ph'], ['භ', 'bh'],
    ['ඹ', 'mb'], ['ඳ', 'ṉd'], ['ඟ', 'ṉg'], ['ඥ', 'gn'],

    # stems written with one roman letter
    ['ක', 'k'], ['ග', 'g'], ['ච', 'c'], ['ජ', 'j'], ['ඤ', 'ñ'],
    ['ට', 'ṭ'], ['ඩ', 'ḍ'], ['ණ', 'ṇ'],
    ['ත', 't'], ['ද', 'd'], ['න', 'n'],
    ['ප', 'p'], ['බ', 'b'], ['ම', 'm'],
    ['ය', 'y'], ['ර', 'r'], ['ල', 'l'], ['ව', 'v'],
    ['ශ', 'ś'],
    # same Sinhala key listed twice; the first pair is the one that can match
    ['ෂ', 'ş'], ['ෂ', 'ṣ'],
    ['ස', 's'], ['හ', 'h'], ['ළ', 'ḷ'], ['ෆ', 'f'],
]
74
+
75
+ # -- Combinations (consonant + vowel signs) --
76
+
77
# Vowel-sign table. Each entry is [roman prefix, roman suffix, sinhala sign]:
# the sign is appended to a consonant letter, and the prefix/suffix are placed
# around that consonant's roman stem.
ro_combinations = [
    # sign that yields the bare stem, then the sign-less (inherent 'a') form
    ['', '', '්'],
    ['', 'a', ''],

    # a / æ
    ['', 'ā', 'ා'],
    ['', 'æ', 'ැ'],
    ['', 'ǣ', 'ෑ'],

    # i / u
    ['', 'i', 'ි'],
    ['', 'ī', 'ී'],
    ['', 'u', 'ු'],
    ['', 'ū', 'ූ'],

    # e / ai / o
    ['', 'e', 'ෙ'],
    ['', 'ē', 'ේ'],
    ['', 'ai', 'ෛ'],
    ['', 'o', 'ො'],
    ['', 'ō', 'ෝ'],

    # remaining signs
    ['', 'ṛ', 'ෘ'],
    ['', 'ṝ', 'ෲ'],
    ['', 'au', 'ෞ'],
    ['', 'ḹ', 'ෳ'],
]
97
+
98
+ # -- Generate consonant+vowel combos --
99
+
100
def create_conso_combi(combinations, consonants):
    """Cross every vowel-sign entry with every consonant.

    Args:
        combinations: sequence of entries indexed as
            ``[roman prefix, roman suffix, sinhala sign]``.
        consonants: sequence of entries indexed as
            ``[sinhala letter, roman stem]``.

    Returns:
        A list of ``(sinhala, roman)`` tuples, ordered with ``combinations``
        as the outer loop and ``consonants`` as the inner loop.
    """
    return [
        (letter[0] + sign[2], sign[0] + letter[1] + sign[1])
        for sign in combinations
        for letter in consonants
    ]
108
+
109
# Every consonant crossed with every vowel-sign entry, built once when the
# module is loaded.
ro_conso_combi = create_conso_combi(
    combinations=ro_combinations,
    consonants=ro_consonants,
)
110
+
111
+ # -- Core replace function --
112
def replace_all(text, mapping):
    """Apply every ``(source, replacement)`` pair in ``mapping`` to ``text``.

    Pairs are applied one after another, longest ``source`` first, so that a
    multi-character key is consumed before any shorter key it contains.
    Pairs whose sources have equal length keep their order from ``mapping``
    (``sorted`` is stable), so the first of two duplicate keys wins.

    Matching is literal: ``source`` is never interpreted as a regular
    expression and ``replacement`` is never interpreted as a substitution
    template.

    Args:
        text: the string to transform.
        mapping: iterable of two-item entries ``(source, replacement)``.

    Returns:
        The transformed string.
    """
    longest_first = sorted(mapping, key=lambda pair: len(pair[0]), reverse=True)
    for source, replacement in longest_first:
        # str.replace instead of re.sub: with re.sub a key such as "." would
        # match every character, and a backslash in the replacement would be
        # read as a group reference or escape.
        text = text.replace(source, replacement)
    return text
119
+
120
+ # -- Main Sinhala → Roman Function --
121
def sinhala_to_roman(text):
    """Return ``text`` with Sinhala letters replaced by their romanization.

    Steps, in order:
      1. Remove zero-width joiners (U+200D).
      2. Normalise to NFC so that vowel signs stored as split sequences
         (for example U+0DD9 followed by U+0DCF) become the single code
         points used as keys in the lookup tables.
      3. Replace consonant + vowel-sign combinations (``ro_conso_combi``).
      4. Replace the remaining standalone letters and signs (``ro_specials``).

    Characters not covered by the tables are left as they are, apart from the
    NFC normalisation, which applies to the whole string.

    Args:
        text: input string.

    Returns:
        The romanized string.
    """
    # Function-scope import so this block does not depend on the file header.
    import unicodedata

    # remove ZWJ (zero-width joiner) so it cannot sit between a consonant
    # and the sign that follows it
    text = text.replace("\u200D", "")

    # Without this, a split vowel sign only half-matches a table key and the
    # trailing part is left behind as a stray Sinhala sign in the output.
    text = unicodedata.normalize("NFC", text)

    # consonant+vowel combos first, then specials
    text = replace_all(text, ro_conso_combi)
    return replace_all(text, ro_specials)