-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathservice_to_uzbek_text.py
46 lines (40 loc) · 1.26 KB
/
service_to_uzbek_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import re

import nltk.tokenize
from nltk.tokenize import RegexpTokenizer
# Apostrophe look-alikes that are folded to the single mark "‘" in one pass.
_APOSTROPHE_TABLE = str.maketrans({"'": "‘", "`": "‘", "’": "‘"})

# Words that take the solid sign "ʼ" instead of "‘". Matched
# case-insensitively because text_normalizer keeps the caller's casing.
_SOLID_SIGN_PATTERN = re.compile("sun‘iy|sur‘at|jur‘at|sa‘y", re.IGNORECASE)


def text_normalizer(text):
    """Normalise apostrophe-like characters in *text* without changing case.

    Every ``'``, ````` and ``’`` is replaced by ``‘``. Afterwards, each
    occurrence of the words "sun‘iy", "sur‘at", "jur‘at" and "sa‘y" (in any
    letter case, also as part of a longer word) has its ``‘`` replaced by
    the solid sign ``ʼ``.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; letter case is preserved.
    """
    text = text.translate(_APOSTROPHE_TABLE)
    # Swap only the mark inside the match so that the original casing of the
    # surrounding letters (e.g. "Sun‘iy" at a sentence start) is kept.
    return _SOLID_SIGN_PATTERN.sub(
        lambda match: match.group(0).replace("‘", "ʼ"), text
    )
def word_normalizer(word):
    """Lower-case a single word and normalise its apostrophe-like characters.

    The word is lower-cased and stripped of surrounding whitespace, then every
    ``'``, ````` and ``’`` is replaced by ``‘``. Finally, if the word contains
    "sun‘iy", "sur‘at", "jur‘at" or "sa‘y", the ``‘`` inside that part is
    replaced by the solid sign ``ʼ``.

    Args:
        word: The word to normalise.

    Returns:
        The normalised, lower-cased word.
    """
    word = word.lower().strip()
    word = word.translate(str.maketrans({"'": "‘", "`": "‘", "’": "‘"}))
    for stem in ("sun‘iy", "sur‘at", "jur‘at", "sa‘y"):
        # The corrected spelling must be written back into the word;
        # str.replace is a no-op when the stem is absent.
        word = word.replace(stem, stem.replace("‘", "ʼ"))
    return word
def word_tokenizer(text):
    """Split *text* into word tokens after apostrophe normalisation.

    The text is first passed through ``text_normalizer``; tokens are then the
    maximal runs of word characters, apostrophe-like marks and hyphens.

    Args:
        text: The string to tokenise.

    Returns:
        The list of tokens produced by ``RegexpTokenizer.tokenize``.
    """
    normalized = text_normalizer(text)
    # Raw string: "\w" in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    tokenizer = RegexpTokenizer(r"[\w`'‘‘‘’‘-]+")
    return tokenizer.tokenize(normalized)
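# NOTE: one-time setup, presumably required by sent_tokenizer below -- the
# NLTK "punkt" sentence model has to be downloaded before first use:
#     import nltk
#     nltk.download('punkt')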
def sent_tokenizer(text):
    """Split *text* into sentences.

    Thin wrapper that returns the result of ``nltk.tokenize.sent_tokenize``
    for *text* unchanged.
    """
    return nltk.tokenize.sent_tokenize(text)