File size: 2,877 Bytes
b6e493a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
# Converts Sinhala text to standard romanized (Latin-script) text.
import re
# -- Specials (vowels, diacritics, standalone signs) --
# Each entry is a [sinhala, roman] pair.  A few Sinhala keys appear twice with
# alternative roman spellings; both entries are kept, in this order.  When the
# table is fed to replace_all, the first spelling listed for a key is the one
# that ends up in the output, because nothing is left for the second to match.
ro_specials = [
    # Vowel letters
    ["ඓ", "ai"],
    ["ඖ", "au"],
    ["ඍ", "ṛ"],
    ["ඎ", "ṝ"],
    ["ඐ", "ḹ"],
    ["අ", "a"],
    ["ආ", "ā"],
    ["ඇ", "æ"],
    ["ඇ", "Æ"],
    ["ඈ", "ǣ"],
    ["ඉ", "i"],
    ["ඊ", "ī"],
    ["උ", "u"],
    ["ඌ", "ū"],
    ["එ", "e"],
    ["ඒ", "ē"],
    ["ඔ", "o"],
    ["ඕ", "ō"],
    # ඞ followed by the hal sign (a two-code-point key)
    ["ඞ්", "ṅ"],
    # Standalone signs
    ["ං", "ṁ"],
    ["ං", "ṃ"],
    ["ඃ", "ḥ"],
    ["ඃ", "Ḥ"],
]
# -- Consonants --
# Each entry is a [sinhala, roman] pair giving the bare roman consonant with no
# vowel attached; vowels are added when the table is combined with
# ro_combinations.  "ෂ" is listed twice with two roman spellings, in this order.
ro_consonants = [
    # Consonants whose roman form is two letters
    ["ඛ", "kh"],
    ["ඨ", "ṭh"],
    ["ඝ", "gh"],
    ["ඡ", "ch"],
    ["ඣ", "jh"],
    ["ඦ", "ñj"],
    ["ඪ", "ḍh"],
    ["ඬ", "ṇḍ"],
    ["ථ", "th"],
    ["ධ", "dh"],
    ["ඵ", "ph"],
    ["භ", "bh"],
    ["ඹ", "mb"],
    ["ඳ", "ṉd"],
    ["ඟ", "ṉg"],
    ["ඥ", "gn"],
    # Consonants whose roman form is a single letter
    ["ක", "k"],
    ["ග", "g"],
    ["ච", "c"],
    ["ජ", "j"],
    ["ඤ", "ñ"],
    ["ට", "ṭ"],
    ["ඩ", "ḍ"],
    ["ණ", "ṇ"],
    ["ත", "t"],
    ["ද", "d"],
    ["න", "n"],
    ["ප", "p"],
    ["බ", "b"],
    ["ම", "m"],
    ["ය", "y"],
    ["ර", "r"],
    ["ල", "l"],
    ["ව", "v"],
    ["ශ", "ś"],
    ["ෂ", "ş"],
    ["ෂ", "ṣ"],
    ["ස", "s"],
    ["හ", "h"],
    ["ළ", "ḷ"],
    ["ෆ", "f"],
]
# -- Combinations (consonant + vowel signs) --
# Each entry is [roman_prefix, roman_vowel, sinhala_sign].  create_conso_combi
# appends sinhala_sign to a Sinhala consonant and wraps the consonant's roman
# form as roman_prefix + consonant + roman_vowel.  The prefix is empty in
# every entry here.
ro_combinations = [
    ["", "", "්"],  # hal sign: consonant with no vowel
    ["", "a", ""],  # no sign: consonant followed by "a"
    ["", "ā", "ා"],
    ["", "æ", "ැ"],
    ["", "ǣ", "ෑ"],
    ["", "i", "ි"],
    ["", "ī", "ී"],
    ["", "u", "ු"],
    ["", "ū", "ූ"],
    ["", "e", "ෙ"],
    ["", "ē", "ේ"],
    ["", "ai", "ෛ"],
    ["", "o", "ො"],
    ["", "ō", "ෝ"],
    ["", "ṛ", "ෘ"],
    ["", "ṝ", "ෲ"],
    ["", "au", "ෞ"],
    ["", "ḹ", "ෳ"],
]
# -- Generate consonant+vowel combos --
def create_conso_combi(combinations, consonants):
    """Pair every vowel-sign combination with every consonant.

    Args:
        combinations: sequence of ``[roman_prefix, roman_vowel, sinhala_sign]``
            entries.
        consonants: sequence of ``[sinhala, roman]`` entries.

    Returns:
        A list of ``(sinhala, roman)`` tuples.  The Sinhala side is the
        consonant followed by the sign; the roman side is
        ``roman_prefix + consonant_roman + roman_vowel``.  Results are grouped
        by combination first, then by consonant, following the input order.
    """
    return [
        (sinhala_letter + sinhala_sign, roman_prefix + roman_letter + roman_vowel)
        for roman_prefix, roman_vowel, sinhala_sign in (
            (entry[0], entry[1], entry[2]) for entry in combinations
        )
        for sinhala_letter, roman_letter in (
            (entry[0], entry[1]) for entry in consonants
        )
    ]
# Build the full consonant + vowel-sign table once, when the module is loaded,
# so sinhala_to_roman can reuse it on every call.
ro_conso_combi = create_conso_combi(ro_combinations, ro_consonants)
# -- Core replace function --
def replace_all(text, mapping):
    """Apply every ``(source, target)`` substitution in *mapping* to *text*.

    Substitutions are literal: neither side is interpreted as a regular
    expression, so characters such as ``.``, ``+`` or a backslash are safe in
    both the key and the replacement.

    Args:
        text: the string to transform.
        mapping: iterable of two-item ``(source, target)`` entries.

    Returns:
        The transformed string.
    """
    # Longest keys go first so a multi-character key is consumed before any
    # shorter key it contains.  sorted() is stable, so keys of equal length
    # keep the order they had in *mapping*.
    ordered = sorted(mapping, key=lambda pair: len(pair[0]), reverse=True)
    for source, target in ordered:
        if not source:
            # An empty key would insert the target between every character.
            continue
        text = text.replace(source, target)
    return text
# -- Main Sinhala → Roman Function --
def sinhala_to_roman(text):
    """Return *text* with its Sinhala characters replaced by roman text.

    Three steps run in order: zero-width joiners (U+200D) are removed, every
    consonant + vowel-sign sequence from ``ro_conso_combi`` is replaced, and
    finally the entries of ``ro_specials`` are applied to whatever is left.
    Characters that appear in neither table pass through unchanged.
    """
    # Drop ZWJ first so it cannot sit between a consonant and its sign.
    without_zwj = text.replace("\u200D", "")
    # Consonant + sign sequences go before the standalone specials.
    syllables_done = replace_all(without_zwj, ro_conso_combi)
    return replace_all(syllables_done, ro_specials)
|