-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathword_segmentation.py
105 lines (90 loc) · 3.36 KB
/
word_segmentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
'''
TODO:
* More testing!
* Change 'similarities' from N*N to linear, so that speed is sub-second.
* Write a tutorial, in Jupyter notebook, to show what I did, and how it is SO FRIGGIN COOL!!!
* Fix other errors
'''
# Sample unsegmented Devanagari string. No function in this file references
# it; presumably kept as a hand-test input for split_word (TODO confirm).
testWord = 'बन्दीप्रतिकोव्यवहारसम्बन्धीमापदण्डअनुकूलको'
def split_word(word):
    '''Return the splits of ``word`` produced by ``startWord``.

    Every vocabulary substring of ``word`` is collected first, in the order
    ``get_all_substrings`` yields them, and the resulting list is handed to
    ``startWord`` together with the word itself.
    '''
    candidate_tokens = [*get_all_substrings(word)]
    return startWord(word, candidate_tokens)
def get_all_substrings(string):
    '''Yield each substring of ``string`` that is a token of the global ``vocab``.

    Substrings are visited by start index and, for each start, by increasing
    length. A substring is yielded only when it is in ``vocab`` and its first
    character is alphabetic (``isalpha``), so tokens beginning with a marker
    are never produced. A token occurring at several positions is yielded
    once per position.
    '''
    stop_limit = len(string) + 1
    for start in range(len(string)):
        for stop in range(start + 1, stop_limit):
            piece = string[start:stop]
            if piece in vocab and piece[0].isalpha():
                yield piece
def genChild(word, token, startPosInWord, endPosInList, tokenList):
    '''Recursively extend ``token`` into '_'-joined splits. Used by startWord.

    word           -- the full string being segmented.
    token          -- the token currently being placed.
    startPosInWord -- index in ``word`` at which ``token`` is taken to start.
    endPosInList   -- index in ``tokenList`` from which the search for
                      follow-up tokens begins.
    tokenList      -- the candidate tokens supplied by the caller.

    Returns a list of strings. If ``token`` reaches or passes the end of
    ``word`` the list is ``[token]``. Otherwise it contains ``token`` joined
    with '_' to each continuation returned by the recursive calls, keeping
    only those whose underscore-free text occurs somewhere in ``word``.
    '''
    global parts  # declared as in the original; nothing here assigns it

    resumeAt = startPosInWord + len(token)
    if resumeAt >= len(word):
        # Nothing of the word is left after this token: it is the last one.
        return [token]

    # More of the word remains: find tokens beginning with the next character.
    followingChar = word[resumeAt]
    # Occurrences of that character inside the current token.
    occurrences = token.count(followingChar)
    candidates, consumed = wordsStartingIn(
        followingChar, tokenList[endPosInList:], repCount=occurrences)
    listCursor = consumed + endPosInList

    # Recurse on every candidate and keep the continuations that still fit.
    return [
        token + "_" + tail
        for candidate in candidates
        for tail in genChild(word, candidate, resumeAt, listCursor, tokenList)
        if token + tail.replace('_', '') in word
    ]
def startWord(word, tokenList):
    '''Build every split of ``word`` by seeding genChild with first tokens.

    word      -- the string to segment.
    tokenList -- candidate tokens for ``word``, as produced by
                 ``get_all_substrings``.

    The seeds are the tokens ``wordsStartingIn`` returns for the first
    character of ``word``. Each seed is expanded with ``genChild`` and the
    results are concatenated in seed order.

    Returns a list of '_'-joined splits. An empty ``word`` has no first
    character to seed from, so it yields an empty list.
    '''
    if not word:
        # Guard: word[0] below would raise IndexError on an empty string.
        return []

    seedTokens, lastPos = wordsStartingIn(word[0], tokenList)
    splits = []
    for seed in seedTokens:
        splits.extend(genChild(word, seed, 0, lastPos, tokenList))
    return splits
def sortMe(words):
    '''Return the splits in ``words`` ordered by ascending similarity score.

    Each split is broken on '_' into its tokens and scored by
    ``similarities`` using the global ``model``. A new list is returned; the
    input is left untouched.
    '''
    def score(split):
        return similarities(split.split('_'), model)

    ordered = list(words)
    ordered.sort(key=score)
    return ordered
def wordsStartingIn(startingChar, curTokenList, repCount = 0):
    '''Return one run of tokens starting with ``startingChar`` and its end index.

    ``curTokenList`` is scanned left to right. A "run" is a stretch of
    consecutive tokens whose first character is ``startingChar``. The first
    ``repCount`` runs are skipped and the tokens of the next run are
    collected.

    startingChar -- the character the wanted tokens begin with.
    curTokenList -- the tokens to scan.
    repCount     -- number of leading runs to skip; genChild passes the
                    number of occurrences of the character inside the token
                    it is extending.

    Returns ``(tokens, index)``: the collected tokens (possibly empty) and
    the index in ``curTokenList`` at which scanning stopped. An empty
    ``curTokenList`` gives ``([], 0)``; a negative ``repCount`` gives
    ``([], 0)`` as well instead of failing on an unbound local.

    Side effect: increments the module-level ``counter``.
    '''
    global counter
    # Nothing in this module initialises ``counter``; start from zero on
    # first use so the first call does not raise NameError.
    counter = globals().get('counter', 0) + 1

    total = len(curTokenList)
    matches = []
    pos = 0
    # Each pass consumes one run; only the last pass (no skips left) collects.
    for skipsLeft in range(repCount, -1, -1):
        # Advance to the start of the next run. ``[:1]`` instead of ``[0]``
        # keeps an empty token from raising IndexError.
        while pos < total and curTokenList[pos][:1] != startingChar:
            pos += 1
        runStart = pos
        while pos < total and curTokenList[pos][:1] == startingChar:
            pos += 1
        if skipsLeft == 0:
            matches = list(curTokenList[runStart:pos])
    return matches, pos
def similarities(wordList, model):
    '''Score how similar the tokens in ``wordList`` are to one another.

    Sums ``model.n_similarity(a, b)`` over every unordered pair of tokens
    (each pair once, earlier token first) and divides the sum by the number
    of tokens.

    wordList -- sequence of tokens; sortMe passes one split broken on '_'.
    model    -- object providing ``n_similarity(a, b)``.

    Returns the score. An empty ``wordList`` has no pairs and returns 0.0
    rather than dividing by zero.
    '''
    tokenCount = len(wordList)
    if tokenCount == 0:
        return 0.0

    totalScore = 0
    for index, first in enumerate(wordList):
        for second in wordList[index + 1:]:
            totalScore += model.n_similarity(first, second)
    return totalScore / tokenCount