# File: utils.py
import re
from typing import Dict, Any

from thefuzz import fuzz


def generate_uuid(entity_type: str, identifier: str) -> str:
    """
    Generates a UUID for a given entity type and identifier.

    Args:
        entity_type: The type of the entity (e.g., "agent", "object", "location").
        identifier: A string identifier unique to the entity within its type.

    Returns:
        A UUID string of the form "<entity_type>_<normalized_identifier>".
    """
    normalized_identifier = normalize_identifier(identifier)
    return f"{entity_type}_{normalized_identifier}"
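
# Illustrative usage (not executed): generate_uuid("agent", " Alice Smith ")
# yields "agent_alice_smith" after identifier normalization.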


def normalize_identifier(identifier: str) -> str:
    """
    Normalize an identifier for use in UUIDs (strip, lowercase, and replace spaces with underscores).

    This function is used for creating consistent, unique keys.
    """
    return identifier.strip().lower().replace(" ", "_") if identifier else ""


def normalize_name(name: str) -> str:
    """
    Normalize an entity name for display (strip whitespace, lowercase).

    This function is used for presenting names in a consistent format,
    but NOT for matching or UUID generation.
    """
    return name.strip().lower() if name else ""
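
# Illustrative usage (not executed):
#   normalize_identifier("  Great Hall ")  ->  "great_hall"
#   normalize_name("  Great Hall ")        ->  "great hall"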


def normalize_for_matching(text: str) -> str:
    """
    Normalize text for fuzzy matching (remove punctuation, lowercase, etc.).

    This function is used for comparing strings to find potential duplicates,
    and is more aggressive than normalize_identifier and normalize_name.
    """
    if not text:
        return ""
    text = text.strip().lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    # Consider adding stemming/lemmatization here if needed, e.g., using NLTK:
    # from nltk.stem import PorterStemmer
    # stemmer = PorterStemmer()
    # text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text
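
# Illustrative usage (not executed):
#   normalize_for_matching("Dr. O'Brien!")  ->  "dr obrien"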


def normalize_reference(text: str) -> str:
    """
    Normalizes a reference string by collapsing extra whitespace and converting to lowercase.

    Kept for backwards compatibility; consider consolidating with normalize_identifier.
    """
    return re.sub(r"\s+", " ", text.strip().lower())


def validate_reference(reference: str, valid_prefixes: list) -> bool:
    """
    Validates a reference string against a list of valid prefixes.

    Kept for backwards compatibility.
    """
    return any(reference.startswith(prefix) for prefix in valid_prefixes)
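
# Illustrative usage with hypothetical prefixes (not executed):
#   normalize_reference("  The   GREAT Hall ")             ->  "the great hall"
#   validate_reference("agent_alice", ["agent_", "object_"]) -> True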


def is_close_match(str1: str, str2: str, threshold: int = 90) -> bool:
    """
    Checks if two strings are a close match using fuzzy matching.
    Both inputs are passed through normalize_for_matching before comparison.

    Args:
        str1: The first string.
        str2: The second string.
        threshold: The minimum similarity score to be considered a match (0-100).

    Returns:
        True if the strings are a close match, False otherwise.
    """
    return fuzz.ratio(normalize_for_matching(str1), normalize_for_matching(str2)) >= threshold
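
# Illustrative usage (not executed):
#   is_close_match("The King's Hall", "the kings hall")  ->  True (ratio 100 after normalization)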


def format_scene_text(scene: Dict[str, Any]) -> str:
    """
    Combines dialogue and stage directions from a scene dictionary into a single text string.
    """
    parts = []
    if "Scene" in scene:
        parts.append(f"Scene: {scene['Scene']}")
    dialogue = scene.get("Dialogue", [])
    for item in dialogue:
        if "Stage Direction" in item:
            parts.append(f"Stage Direction: {item['Stage Direction']}")
        if "Character" in item and "Line" in item:
            parts.append(f"{item['Character']}: {item['Line']}")
    return "\n".join(parts)