Use Lazy Loaders (#1536)
* try lazy LoadHF first

Signed-off-by: dafnapension <[email protected]>
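
A sketch of the idea (names here are illustrative, not the actual LoadHF implementation): wrap the datasets call in a generator, so nothing is fetched until the first instance is actually consumed.

```python
from typing import Any, Dict, Iterator

from datasets import load_dataset


def lazy_hf_instances(path: str, split: str) -> Iterator[Dict[str, Any]]:
    # A generator body runs only on the first next(), so the download
    # starts when the caller begins consuming instances, not before.
    dataset = load_dataset(path, split=split, streaming=True)
    yield from dataset


instances = lazy_hf_instances("squad", split="train")  # returns instantly
first_instance = next(instances)  # data transfer begins here
```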

* reduce benchmark profiling to generating the dataset only: no inferring (that is done by mocking anyhow) and no evaluating (of the mocked results). Also add trust_remote to load_dataset_builder

Signed-off-by: dafnapension <[email protected]>
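
For the trust_remote part, a hedged example (in recent datasets releases the parameter is spelled trust_remote_code, and it is needed for datasets that ship their own loading script):

```python
from datasets import load_dataset_builder

# A builder exposes features and split metadata without downloading data;
# the trust flag must be passed here as well, not only to load_dataset.
builder = load_dataset_builder(
    "universalner/universal_ner", "en_ewt", trust_remote_code=True
)
print(builder.info.features)  # metadata only, no rows are read
```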

* try procrastination (lazy loading) for the CSV loader too

Signed-off-by: dafnapension <[email protected]>
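
The same deferral applied to CSV, as a sketch (illustrative, not the actual LoadCSV code): the file is opened and parsed only once iteration begins, one row at a time.

```python
import csv
from typing import Dict, Iterator


def lazy_csv_instances(file_path: str) -> Iterator[Dict[str, str]]:
    # open() and parsing are deferred to the first next() on the generator
    with open(file_path, newline="", encoding="utf-8") as f:
        yield from csv.DictReader(f)
```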

* add a per-split cache for the generators, log the loader limit only once per dataset, and increase the loader cache size

Signed-off-by: dafnapension <[email protected]>
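
Roughly the shape of these two changes, with hypothetical names (the real cache sizes and log wording differ):

```python
import logging
from typing import Any, Dict, Iterator, List

logger = logging.getLogger(__name__)


class CachedSplitLoader:
    """Illustrative loader: each split is materialized at most once, and
    the active loader limit is logged only the first time it applies."""

    _limit_logged = False

    def __init__(self, limit: int) -> None:
        self.limit = limit
        self._split_cache: Dict[str, List[Any]] = {}

    def load_split(self, split: str) -> Iterator[Any]:
        if not CachedSplitLoader._limit_logged:
            logger.info("loader limit is set to %d instances", self.limit)
            CachedSplitLoader._limit_logged = True
        if split not in self._split_cache:
            self._split_cache[split] = list(self._read_split(split))
        yield from self._split_cache[split]

    def _read_split(self, split: str) -> Iterator[Any]:
        # Stand-in for the real per-split read.
        yield from range(self.limit)
```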

* make the sklearn loader a lazy loader too

Signed-off-by: dafnapension <[email protected]>
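
A sketch of the lazy sklearn loader (function and field names are illustrative): even the sklearn import is deferred until iteration starts.

```python
from typing import Any, Dict, Iterator


def lazy_sklearn_instances(dataset_name: str, split: str) -> Iterator[Dict[str, Any]]:
    # Deferred import: sklearn is loaded only if someone iterates.
    from sklearn import datasets as sklearn_datasets

    fetcher = getattr(sklearn_datasets, f"fetch_{dataset_name}")
    bunch = fetcher(subset=split)
    for text, label in zip(bunch.data, bunch.target):
        yield {"text": text, "label": int(label)}


# Nothing is fetched until the first instance is consumed:
instances = lazy_sklearn_instances("20newsgroups", split="train")
```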

* adjust to the new CSV readers

Signed-off-by: dafnapension <[email protected]>

* Enhance LoadHF class to support optional splits and improve dataset loading logic

Signed-off-by: elronbandel <[email protected]>
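
A hedged sketch of the optional-splits behavior (not the actual method): splits named by the user are loaded as requested, and when none are given they are discovered from the dataset's metadata.

```python
from datasets import get_dataset_split_names, load_dataset


def iter_requested_splits(path, name=None, splits=None):
    # Discover the available splits only when the caller did not pick any.
    if splits is None:
        splits = get_dataset_split_names(path, name)
    for split in splits:
        yield split, load_dataset(path, name, split=split, streaming=True)
```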

* Refactor LoadHF class to improve dataset loading and implement limit on yielded instances

Signed-off-by: elronbandel <[email protected]>
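
The limit on yielded instances composes naturally with generators, e.g. via itertools.islice (a sketch, not the exact implementation):

```python
from itertools import islice


def limited(instances, limit=None):
    # islice stops pulling from the source after `limit` items, so the
    # rest of the split is never read at all.
    return islice(instances, limit) if limit is not None else iter(instances)


print(list(limited(range(10**9), limit=3)))  # [0, 1, 2], returns instantly
```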

* Refactor LoadHF class to streamline dataset loading and enhance split handling

Signed-off-by: elronbandel <[email protected]>

* Remove unused import and update line number in secrets baseline

Signed-off-by: elronbandel <[email protected]>

* Refactor load_data method to simplify error handling and remove unnecessary cache checks

Signed-off-by: elronbandel <[email protected]>

* Merge origin/main

Signed-off-by: elronbandel <[email protected]>

* Refactor loaders to implement LazyLoader class and update load_iterables method for improved streaming support

Signed-off-by: elronbandel <[email protected]>
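
Approximately the resulting structure (abridged; the real class carries more configuration): a LazyLoader subclass supplies a generator per split, and load_iterables hands those generators out without reading any data.

```python
from abc import ABC, abstractmethod
from typing import Any, Dict, Iterator, List


class LazyLoader(ABC):
    @abstractmethod
    def get_splits(self) -> List[str]:
        ...

    @abstractmethod
    def split_generator(self, split: str) -> Iterator[Dict[str, Any]]:
        ...

    def load_iterables(self) -> Dict[str, Iterator[Dict[str, Any]]]:
        # Returns generators, so no split is read until it is iterated.
        return {split: self.split_generator(split) for split in self.get_splits()}
```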

* Update exception handling in test_failed_load_csv to catch general exceptions

Signed-off-by: elronbandel <[email protected]>
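
Why the broader catch: with lazy loading, a bad file now fails on first consumption rather than at construction time, and the concrete exception type depends on the underlying reader. A self-contained illustration (not the actual unitxt test):

```python
import csv
import unittest


class TestLazyCsv(unittest.TestCase):
    def test_failed_load_csv(self):
        def lazy_read(path):
            # open() raises only when the generator is first consumed
            with open(path, newline="") as f:
                yield from csv.DictReader(f)

        gen = lazy_read("definitely_missing.csv")  # no error yet
        with self.assertRaises(Exception):
            next(gen)  # the failure surfaces here
```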

* Refactor LoadHF class to streamline data loading and enhance error handling

Signed-off-by: elronbandel <[email protected]>

---------

Signed-off-by: dafnapension <[email protected]>
Signed-off-by: elronbandel <[email protected]>
Co-authored-by: Elron Bandel <[email protected]>
dafnapension and elronbandel authored Feb 10, 2025
1 parent 8157230 commit 82a440f
Showing 29 changed files with 272 additions and 283 deletions.
28 changes: 3 additions & 25 deletions performance/bluebench_profiler.py
@@ -93,14 +93,7 @@ def profiler_do_the_profiling(self, dataset_query: str, split: str, **kwargs):
             benchmark_recipe=benchmark_recipe, split=split, **kwargs
         )
 
-        model = self.profiler_instantiate_model()
-
-        predictions = self.profiler_infer_predictions(model=model, dataset=dataset)
-
-        evaluation_result = self.profiler_evaluate_predictions(
-            predictions=predictions, dataset=dataset
-        )
-        logger.critical(f"length of evaluation_result: {len(evaluation_result)}")
+        logger.critical(f"length of bluebench generated dataset: {len(dataset)}")
 
 
 dataset_query = "benchmarks.bluebench[loader_limit=30,max_samples_per_subset=30]"
@@ -154,44 +147,29 @@ def main():
     pst.strip_dirs()
     pst.sort_stats("name")  # sort by function name
     pst.print_stats(
-        "profile_benchmark_blue_bench|profiler_instantiate_benchmark_recipe|profiler_generate_benchmark_dataset|profiler_instantiate_model|profiler_infer_predictions|profiler_evaluate_predictions|load_data|load_iterables"
+        "profile_benchmark_blue_bench|profiler_instantiate_benchmark_recipe|profiler_generate_benchmark_dataset|load_data|load_iterables"
     )
     s = f.getvalue()
     assert s.split("\n")[7].split()[3] == "cumtime"
     overall_tot_time = find_cummtime_of(
         "profile_benchmark_blue_bench", "bluebench_profiler.py", s
     )
     load_time = find_cummtime_of("load_data", "loaders.py", s)
-    just_load_no_initial_ms_time = find_cummtime_of(
-        "load_iterables", "loaders.py", s
-    )
 
     instantiate_benchmark_time = find_cummtime_of(
         "profiler_instantiate_benchmark_recipe", "bluebench_profiler.py", s
     )
     generate_benchmark_dataset_time = find_cummtime_of(
         "profiler_generate_benchmark_dataset", "bluebench_profiler.py", s
     )
-    instantiate_model_time = find_cummtime_of(
-        "profiler_instantiate_model", "bluebench_profiler.py", s
-    )
-    inference_time = find_cummtime_of(
-        "profiler_infer_predictions", "bluebench_profiler.py", s
-    )
-    evaluation_time = find_cummtime_of(
-        "profiler_evaluate_predictions", "bluebench_profiler.py", s
-    )
 
     # Data to be written
     dictionary = {
         "dataset_query": dataset_query,
         "total_time": overall_tot_time,
         "load_time": load_time,
-        "load_time_no_initial_ms": just_load_no_initial_ms_time,
         "instantiate_benchmark_time": instantiate_benchmark_time,
         "generate_benchmark_dataset_time": generate_benchmark_dataset_time,
-        "instantiate_model_time": instantiate_model_time,
-        "inference_time": inference_time,
-        "evaluation_time": evaluation_time,
         "used_eager_mode": settings.use_eager_execution,
         "performance.prof file": temp_prof_file_path,
     }
41 changes: 10 additions & 31 deletions performance/compare_benchmark_performance_results.py
@@ -1,6 +1,5 @@
 import argparse
 import json
-import os
 import sys
 
 # Argument parser to get file paths from the command line
@@ -23,24 +22,11 @@
 print(f'dataset_query = "{main_perf["dataset_query"]}"')
 print(f"used_eager_mode in main = {main_perf['used_eager_mode']}")
 print(f"used_eager_mode in PR = {pr_perf['used_eager_mode']}")
-print(f"use Mocked inference = {os.environ['UNITXT_MOCK_INFERENCE_MODE']}")
 
 ratio1 = (
-    (pr_perf["generate_benchmark_dataset_time"] - pr_perf["load_time_no_initial_ms"])
-    / (
-        main_perf["generate_benchmark_dataset_time"]
-        - main_perf["load_time_no_initial_ms"]
-    )
-    if (
-        main_perf["generate_benchmark_dataset_time"]
-        - main_perf["load_time_no_initial_ms"]
-    )
-    > 0
-    else 1
-)
-ratio2 = (
-    pr_perf["evaluation_time"] / main_perf["evaluation_time"]
-    if main_perf["evaluation_time"] > 0
+    (pr_perf["generate_benchmark_dataset_time"] - pr_perf["load_time"])
+    / (main_perf["generate_benchmark_dataset_time"] - main_perf["load_time"])
+    if (main_perf["generate_benchmark_dataset_time"] - main_perf["load_time"]) > 0
     else 1
 )
 # Markdown table formatting
@@ -49,26 +35,19 @@
 line2 = "--------------------|-------------|-------------|---------------\n"
 line3 = f" Total time | {main_perf['total_time']:>11} | {pr_perf['total_time']:>11} | {pr_perf['total_time'] / main_perf['total_time']:.2f}\n"
 ratio_line4 = (
-    pr_perf["load_time_no_initial_ms"] / main_perf["load_time_no_initial_ms"]
-    if main_perf["load_time_no_initial_ms"] > 0
-    else 1
+    pr_perf["load_time"] / main_perf["load_time"] if main_perf["load_time"] > 0 else 1
 )
-line4 = f" Load time | {main_perf['load_time_no_initial_ms']:>11} | {pr_perf['load_time_no_initial_ms']:>11} | {ratio_line4:.2f}\n"
+line4 = f" Load time | {main_perf['load_time']:>11} | {pr_perf['load_time']:>11} | {ratio_line4:.2f}\n"
 line5 = f" DS Gen. inc. Load | {main_perf['generate_benchmark_dataset_time']:>11} | {pr_perf['generate_benchmark_dataset_time']:>11} | {pr_perf['generate_benchmark_dataset_time'] / main_perf['generate_benchmark_dataset_time']:.2f}\n"
-line6 = f" DS Gen. exc. Load | {round(main_perf['generate_benchmark_dataset_time'] - main_perf['load_time_no_initial_ms'], 3):>11} | {round(pr_perf['generate_benchmark_dataset_time'] - pr_perf['load_time_no_initial_ms'], 3):>11} | {ratio1:.2f}\n"
-line7 = f" Inference time | {main_perf['inference_time']:>11} | {pr_perf['inference_time']:>11} | {pr_perf['inference_time'] / main_perf['inference_time']:.2f}\n"
-line8 = f" Evaluate time | {main_perf['evaluation_time']:>11} | {pr_perf['evaluation_time']:>11} | {ratio2:.2f}\n"
-line9 = f" Benchmark Instant. | {main_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time'] / main_perf['instantiate_benchmark_time']:.2f}\n"
-line10 = f" Model Instantiation| {main_perf['instantiate_model_time']:>11} | {pr_perf['instantiate_model_time']:>11} | {pr_perf['instantiate_model_time'] / main_perf['instantiate_model_time']:.2f}\n"
+line6 = f" DS Gen. exc. Load | {round(main_perf['generate_benchmark_dataset_time'] - main_perf['load_time'], 3):>11} | {round(pr_perf['generate_benchmark_dataset_time'] - pr_perf['load_time'], 3):>11} | {ratio1:.2f}\n"
+line7 = f" Benchmark Instant. | {main_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time'] / main_perf['instantiate_benchmark_time']:.2f}\n"
 
 print("### Performance Comparison Results, time expressed in seconds:\n")
-print(line1 + line2 + line3 + line4 + line5 + line6 + line7 + line8 + line9 + line10)
+print(line1 + line2 + line3 + line4 + line5 + line6 + line7)
 print("\n\n")
 # Performance degradation check (5% threshold)
-if ratio1 > 1.05 or ratio2 > 1.05:
-    print(
-        "\n**Warning**: Performance degradation in Dataset Generation and/or Evaluation exceeds 5%!"
-    )
+if ratio1 > 1.05:
+    print("\n**Warning**: Performance degradation in Dataset Generation exceeds 5%!")
 print(
     "Explore branch performance via 'python performance/bluebench_profiler.py --output_file=<path to json file>',"
     "followed by 'snakeviz <the performance.prof file specified in the output json file>'."
2 changes: 1 addition & 1 deletion prepare/cards/universal_ner.py
@@ -48,7 +48,7 @@
         loader=LoadHF(
             path="universalner/universal_ner",
             name=sub_task,
-            requirements_list=["conllu"],
+            requirements=["conllu"],
         ),
         preprocess_steps=[
             # The dataset is sorted by classes
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -190,7 +190,7 @@ keep-runtime-typing = true
"src/unitxt/metric.py" = ["F811", "F401"]
"src/unitxt/dataset.py" = ["F811", "F401"]
"src/unitxt/blocks.py" = ["F811", "F401"]
"tests/library/test_loaders.py" = ["N802", "N803"]
"tests/library/test_loaders.py" = ["N802", "N803", "RUF015"]
"tests/library/test_dataclass.py" = ["F811", "E731"]
"src/unitxt/validate.py" = ["B024"]
"src/unitxt/standard.py" = ["C901"]
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/ceb/gja.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "ceb_gja",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/da/ddt.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "da_ddt",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/de/pud.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "de_pud",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/en/ewt.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "en_ewt",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/en/pud.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "en_pud",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/hr/set.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "hr_set",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/pt/bosque.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "pt_bosque",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/pt/pud.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "pt_pud",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/ru/pud.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "ru_pud",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/sk/snk.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "sk_snk",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/sr/set.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "sr_set",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/sv/pud.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "sv_pud",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/sv/talbanken.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "sv_talbanken",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/tl/trg.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "tl_trg",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/tl/ugnayan.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "tl_ugnayan",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/zh/gsd.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "zh_gsd",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/zh/gsdsimp.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "zh_gsdsimp",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/zh/pud.json
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "zh_pud",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
6 changes: 1 addition & 5 deletions src/unitxt/fusion.py
@@ -34,11 +34,7 @@ def prepare_subsets(self):
             for i in range(len(self.subsets)):
                 self.named_subsets[i] = self.subsets[i]
         else:
-            for name, origin in self.subsets.items():
-                try:
-                    self.named_subsets[name] = origin
-                except Exception as e:
-                    raise RuntimeError(f"Exception in subset: {name}") from e
+            self.named_subsets = self.subsets
 
     def splits(self) -> List[str]:
         self.prepare_subsets()