forked from tibetan-nlp/tibetan-spellchecker
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfind-mingzhi.py
executable file
·133 lines (121 loc) · 4.79 KB
/
find-mingzhi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3
# This is a proof of concept code computing the mingzhi or the main stack for
# any valid Tibetan syllable. It uses the algorithm described in @Hildt2018.
#
# The main limitation is the completely unknown behavior (including crashes) fo
# a non-valid Tibetan syllable.
#
# Copyright (C) 2017 Elie Roux <[email protected]>
# Provided under the Creative Commons CC0 license
# http://creativecommons.org/publicdomain/zero/1.0/legalcode
import re
subjoined_to_letter = {
'\u0F90': 'ཀ',
'\u0F92': 'ག',
'\u0F94': 'ང',
'\u0F97': 'ཇ',
'\u0F99': 'ཉ',
'\u0F9F': 'ཏ',
'\u0FA1': 'ད',
'\u0FA3': 'ན',
'\u0FA6': 'བ',
'\u0FA8': 'མ',
'\u0FA9': 'ཙ',
'\u0FAB': 'ཛ',
'\u0F95': 'ཅ',
'\u0FA4': 'པ',
'\u0FB7': 'ཧ'
}
def get_mingzhi(syl):
mainstack = get_main_stack(syl)
if len(mainstack) < 2:
return mainstack
if mainstack[1] == '\u0FAD' or mainstack[1] == '\u0FB1' or mainstack[1] == '\u0FB2' or mainstack[1] == '\u0FB3':
return mainstack[0]
if mainstack[0] == 'ར' or mainstack[0] == 'ས' or mainstack[0] == 'ལ':
return subjoined_to_letter[mainstack[1]]
return mainstack[0]
def can_be_prefix_of_naked(prefix, main):
if prefix == 'ད':
return main in ['ཀ', 'ག', 'ང', 'པ', 'བ', 'མ']
if prefix == 'བ':
return main in ['ཀ', 'ག', 'ཅ', 'ཏ', 'ད', 'ཙ', 'ཞ', 'ཟ', 'ཤ', 'ས']
if prefix == 'མ':
return main in ['ཁ', 'ག', 'ང', 'ཆ', 'ཇ', 'ཉ', 'ཐ', 'ད', 'ན', 'ཚ', 'ཛ']
if prefix == 'འ':
return main in ['ཁ', 'ག', 'ཆ', 'ཇ', 'ཐ', 'ད', 'ཕ', 'བ', 'ཚ', 'ཛ']
if prefix == 'ག':
return main in ['ཅ', 'ཉ', 'ཏ', 'ད', 'ན', 'ཙ', 'ཞ', 'ཟ', 'ཡ', 'ཤ', 'ས']
return False
ambiguous_to_main = {
'མངས': 'མ',
'འབས': 'བ',
'མགས': 'མ',
'བགས': 'བ',
'འགས': 'ག',
'དབས': 'བ',
'དགས': 'ག',
'དངས': 'ད',
'དམས': 'མ'
}
def get_main_stack(syl):
# rule 1: if the syllable contains a subscript, superscript or wasur then the main stack is what contains it
m = re.search("[\u0F40-\u0F6C][\u0F90-\u0FBC]+", syl)
if (m is not None):
return m.group(0)
# rule 2: if a letter other than འ carries a vowel then it is the main stack
m = re.search("([\u0F40-\u0F5F\u0F61-\u0F6C])[\u0F71-\u0F81]", syl)
if (m is not None):
return m.group(1)
# rule 3: if a vowel is carried by འ and འ is the first letter then འ is the main stack
m = re.search("\u0F60[\u0F71-\u0F81]", syl)
if (m is not None and m.span()[0] == 0):
return "\u0F60"
# rule 4: if a vowel accent is carried by an འ which is not the first letter, the main stack is before the first འ with a vowel
elif (m is not None):
return syl[m.span()[0]-1]
# rule 5: if the syllable has three or four letters and ends with འང or འམ, then the main stack is right before འང or འམ
syllen = len(syl)
if syllen > 2 and (syl.endswith('འང') or syl.endswith('འམ')):
return syl[-3]
# rule 6: if the syllable is composed of only one letter this letter is the main stack
if syllen == 1:
return syl
# rule 7: if the syllable contains two letters, then the first is the main stack
if syllen == 2:
return syl[0]
# rule 8: if the syllable contains four letters, the main stack is the second letter
if syllen == 4:
return syl[1]
# rule 9: if the final letter is not ས, then the main stack is the second letter
if not syl[2] == 'ས':
return syl[1]
# rule 10: if the first letter cannot be a prefix to the second letter when it has no superscribed nor subscribedy, then the main stack is the first letter
if not can_be_prefix_of_naked(syl[0], syl[1]):
return syl[0]
# rule 11: if ས cannot be second suffix after the second letter if it was a first suffix (meaning the second letter is not ག, ང, བ nor མ), then the main stack is the second letter
if not syl[1] in ['ག', 'ང', 'བ', 'མ']:
return syl[1]
else:
return ambiguous_to_main[syl]
print(get_main_stack('དབྱངས') == 'བྱ')
print(get_main_stack('གཞི') == 'ཞ')
print(get_main_stack('འོད') == 'འ')
print(get_main_stack('མའིའོ') == 'མ')
print(get_main_stack('མའིའམ') == 'མ')
print(get_main_stack('བའམ') == 'བ')
print(get_main_stack('བ') == 'བ')
print(get_main_stack('བར') == 'བ')
print(get_main_stack('བཟབས') == 'ཟ')
print(get_main_stack('བཟའ') == 'ཟ')
print(get_main_stack('ཐགས') == 'ཐ')
print(get_main_stack('གནས') == 'ན')
print(get_main_stack('མགས') == 'མ')
print(get_mingzhi('རླ') == 'ར')
print(get_mingzhi('སྦྱ') == 'བ')
print(get_mingzhi('རྙ') == 'ཉ')
print(get_mingzhi('ལྔ') == 'ང')
print(get_mingzhi('སྲ') == 'ས')
print(get_mingzhi('གླ') == 'ག')
print(get_mingzhi('རྒྱ') == 'ག')
print(get_mingzhi('ར\u0FAD') == 'ར')