-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_org_funcs.py
116 lines (89 loc) · 4.16 KB
/
find_org_funcs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pandas as pd
from collections import Counter
def extract_temperature(temperature_str):
    """Return the mean of the temperature value(s) encoded in a string.

    Accepted forms are a single number (``"37"``) or numbers separated by
    ``-`` (``"35-37"``). Unlike a plain ``split("-")``, a leading minus sign
    is treated as part of the number, so ``"-5-15"`` means "-5 to 15".
    Whitespace around numbers and separators is ignored.

    Args:
        temperature_str: Text describing the temperature. Non-string values
            are converted with ``str()`` first, so plain numbers also work.

    Returns:
        The arithmetic mean of the parsed values, as a float.

    Raises:
        ValueError: If the text is not a number or a "-"-separated list of
            numbers (this includes the empty string).
    """
    import re

    number = r"[+-]?(?:\d+(?:\.\d*)?|\.\d+)"
    text = str(temperature_str).strip()

    # Validate the overall shape first so stray characters are reported
    # instead of being skipped by the extraction pass below.
    if not re.fullmatch(rf"{number}(?:\s*-\s*{number})*", text):
        raise ValueError(f"invalid temperature value: {temperature_str!r}")

    # Each number either starts the string or follows a "-" separator; a sign
    # directly in front of the digits belongs to the number itself.
    values = [
        float(match.group(1))
        for match in re.finditer(rf"(?:^|-)\s*({number})", text)
    ]
    return sum(values) / len(values)
def find_most_probable_organisms(attributes, bacteria, inverted_counts, vaccination=None, gram=None, travelling=None, n=1):
    """Rank organisms by how well their recorded fields match ``attributes``.

    Each row of ``bacteria`` is reduced to the list of its values after the
    first column ("fields") and scored as follows:

    * Every attribute except ``attributes[0]`` is tested with ``in`` against
      every field; each hit adds ``inverted_counts[attribute]``, or 1 when no
      usable weight is available for that attribute.
    * An attribute containing ``"Temperature:"`` adds 1 when the number that
      follows the tag lies inside the range stored in field 11
      (``"low-high"`` or a single value; decimals are accepted).
    * ``travelling`` and ``vaccination``, when given, add 1 each if they
      equal field 20 and field 21 respectively.
    * When ``gram`` is given but is not an element of ``attributes``, every
      organism scores 0.

    Args:
        attributes: Sequence of observed attributes. The first element is
            never scored (NOTE(review): presumably a header/identifier entry;
            confirm against the caller).
        bacteria: DataFrame with an ``'organism'`` column; all columns after
            the first are used as fields (assumes ``'organism'`` is the first
            column; confirm against the data file).
        inverted_counts: Mapping from attribute to its score weight.
        vaccination: Optional value compared with field 21.
        gram: Optional value that must appear in ``attributes`` for any
            organism to score above 0.
        travelling: Optional value compared with field 20.
        n: Number of top-ranked organisms to return.

    Returns:
        A ``(scores, organisms)`` tuple of two parallel lists holding the
        ``n`` best matches, highest score first. Ties keep row order.

    Raises:
        IndexError: If a ``"Temperature:"`` attribute has no value after the
            tag, or a row is too short for a field that has to be read.
        ValueError: If the value after ``"Temperature:"`` is not a number.
    """
    # Positions inside a row *after* the first column has been dropped.
    temperature_field = 11
    travelling_field = 20
    vaccination_field = 21
    temperature_tag = "Temperature:"

    def field_contains(field, attribute):
        """Return ``attribute in field``, treating non-container fields as no match."""
        try:
            return attribute in field
        except TypeError:
            # Numeric or missing cells do not support the ``in`` operator.
            return False

    def attribute_weight(attribute):
        """Return the weight of ``attribute``, defaulting to 1 when unavailable."""
        try:
            return inverted_counts[attribute]
        except (KeyError, TypeError):
            return 1

    def temperature_bounds(cell):
        """Parse a temperature cell into ``(low, high)``; ``None`` if unparsable."""
        parts = cell.split("-") if isinstance(cell, str) else [cell]
        try:
            bounds = [float(part) for part in parts]
        except (TypeError, ValueError):
            return None
        if len(bounds) == 1:
            # A single value is a degenerate range, so "37" matches 37.
            return bounds[0], bounds[0]
        return bounds[0], bounds[1]

    # The first attribute is deliberately excluded from scoring.
    scored_attributes = list(attributes[1:])

    # Temperature readings do not depend on the organism: parse them once.
    temperature_readings = [
        float(attribute.partition(temperature_tag)[2].split()[0])
        for attribute in scored_attributes
        if isinstance(attribute, str) and temperature_tag in attribute
    ]

    # A requested gram value that is absent from the attributes zeroes every
    # score, regardless of the organism.
    gram_mismatch = gram is not None and gram not in attributes

    def similarity(organism):
        """Return the match score of one organism's field list."""
        score = 0
        for attribute in scored_attributes:
            hits = sum(field_contains(field, attribute) for field in organism)
            if hits:
                score += hits * attribute_weight(attribute)

        if temperature_readings:
            bounds = temperature_bounds(organism[temperature_field])
            if bounds is not None:
                low, high = bounds
                score += sum(1 for reading in temperature_readings if low <= reading <= high)

        if travelling is not None and organism[travelling_field] == travelling:
            score += 1
        if vaccination is not None and organism[vaccination_field] == vaccination:
            score += 1
        return score

    # Map each organism name to the list of its remaining column values.
    organism_dict = dict(zip(bacteria['organism'], bacteria.iloc[:, 1:].values.tolist()))

    scores = {
        name: 0 if gram_mismatch else similarity(fields)
        for name, fields in organism_dict.items()
    }

    # Highest score first; sorted() is stable, so ties keep row order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:n]
    found_scores = [score for _, score in ranked]
    found_organisms = [name for name, _ in ranked]
    return found_scores, found_organisms
# Build inverse-frequency weights for the words found in a dataframe
def count_frequency(df, max_count=15):
    """Return an inverse-frequency weight for every word in ``df``.

    Every cell of every column is converted to text with ``astype(str)`` and
    split on whitespace. Occurrences of each word are counted across the
    whole dataframe, and each word is mapped to ``max_count / count``, so
    words that occur less often receive a larger weight.

    Args:
        df: DataFrame whose cells are scanned for words.
        max_count: Numerator of the weight; a word that occurs exactly
            ``max_count`` times gets weight 1.0. Defaults to 15.

    Returns:
        A dict mapping each word to its weight, keyed in order of first
        appearance (columns left to right, rows top to bottom).
    """
    total_counts = Counter()
    # items() yields one Series per column, even when labels are duplicated.
    for _, column in df.items():
        for cell in column.astype(str):
            total_counts.update(cell.split())
    return {word: max_count / count for word, count in total_counts.items()}