From 07abc762164f0543f04c625d58b3bb989449578b Mon Sep 17 00:00:00 2001 From: strint Date: Wed, 17 Jul 2024 21:00:31 +0800 Subject: [PATCH 01/11] fix max mem used --- benchmarks/text_to_video_latte.py | 13 +++++----- .../examples/latte/README.md | 26 +++++++++---------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index 629cdccc0..463d452d1 100644 --- a/benchmarks/text_to_video_latte.py +++ b/benchmarks/text_to_video_latte.py @@ -38,8 +38,7 @@ from PIL import Image, ImageDraw import torch -import oneflow as flow -from onediffx import compile_pipe, OneflowCompileOptions +from onediffx import compile_pipe from diffusers.utils import load_image, export_to_video from diffusers.schedulers import DDIMScheduler from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder @@ -250,14 +249,16 @@ def get_kwarg_inputs(): videos = pipe(**kwarg_inputs).video end = time.time() + print("=======================================") print(f"Inference time: {end - begin:.3f}s") iter_per_sec = iter_profiler.get_iter_per_sec() if iter_per_sec is not None: print(f"Iterations per second: {iter_per_sec:.3f}") - cuda_mem_after_used = flow._oneflow_internal.GetCUDAMemoryUsed() - host_mem_after_used = flow._oneflow_internal.GetCPUMemoryUsed() - print(f"CUDA Mem after: {cuda_mem_after_used / 1024:.3f}GiB") - print(f"Host Mem after: {host_mem_after_used / 1024:.3f}GiB") + cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) + cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) + print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") + print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") + print("=======================================") if args.output_video is not None: # export_to_video(output_frames[0], args.output_video, fps=args.fps) diff --git a/onediff_diffusers_extensions/examples/latte/README.md b/onediff_diffusers_extensions/examples/latte/README.md index bbce3929c..6e203b2db 100644 --- a/onediff_diffusers_extensions/examples/latte/README.md +++ b/onediff_diffusers_extensions/examples/latte/README.md @@ -40,7 +40,7 @@ python3 ./benchmarks/text_to_video_latte.py \ --model maxin-cn/Latte-1 \ --steps 50 \ --compiler none \ -----output-video ./latte.mp4 \ +--output-video ./latte.mp4 \ --prompt "An epic tornado attacking above aglowing city at night." ``` @@ -50,7 +50,7 @@ python3 ./benchmarks/text_to_video_latte.py \ --model maxin-cn/Latte-1 \ --steps 50 \ --compiler nexfort \ -----output-video ./latte_compile.mp4 \ +--output-video ./latte_compile.mp4 \ --prompt "An epic tornado attacking above aglowing city at night." 
``` @@ -60,17 +60,17 @@ python3 ./benchmarks/text_to_video_latte.py \ #### On A100 | Metric | NVIDIA A100-PCIE-40GB (512 * 512) | -| ------------------------------------------------ | ----------------------------------- | -| Data update date(yyyy-mm-dd) | 2024-06-19 | -| PyTorch iteration speed | 1.60it/s | -| OneDiff iteration speed | 2.27it/s(+41.9%) | -| PyTorch E2E time | 32.618s | -| OneDiff E2E time | 22.601s(-30.7%) | -| PyTorch Max Mem Used | 28.208GiB | -| OneDiff Max Mem Used | 24.753GiB | -| PyTorch Warmup with Run time | 33.291s | -| OneDiff Warmup with Compilation time1 | 572.877s | -| OneDiff Warmup with Cache time | 148.068s | +| ------------------------------------------------ | --------------------------------- | +| Data update date(yyyy-mm-dd) | 2024-06-19 | +| PyTorch iteration speed | 1.60 it/s | +| OneDiff iteration speed | 2.27 it/s(+41.9%) | +| PyTorch E2E time | 32.618 s | +| OneDiff E2E time | 22.601 s(-30.7%) | +| PyTorch Max Mem Used | 19.9 GiB | +| OneDiff Max Mem Used | 19.9 GiB | +| PyTorch Warmup with Run time | 33.291 s | +| OneDiff Warmup with Compilation time1 | 572.877 s | +| OneDiff Warmup with Cache time | 148.068 s | 1 OneDiff Warmup with Compilation time is tested on Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz. Note this is just for reference, and it varies a lot on different CPU. From 8f658519b23df605d0fc66432b461a4db4bdc1da Mon Sep 17 00:00:00 2001 From: strint Date: Wed, 17 Jul 2024 21:24:47 +0800 Subject: [PATCH 02/11] fix --- benchmarks/text_to_video_latte.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index 463d452d1..6f7ea6328 100644 --- a/benchmarks/text_to_video_latte.py +++ b/benchmarks/text_to_video_latte.py @@ -39,7 +39,6 @@ import torch from onediffx import compile_pipe -from diffusers.utils import load_image, export_to_video from diffusers.schedulers import DDIMScheduler from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder from transformers import T5EncoderModel, T5Tokenizer @@ -267,7 +266,7 @@ def get_kwarg_inputs(): args.output_video, videos[0], fps=8, quality=9 ) # highest quality is 10, lowest is 0 except: - print("Error when saving {}".format(prompt)) + print("Error when saving {}".format(args.prompt)) else: print("Please set `--output-video` to save the output video") From 6245a6d13d0b322ad9f912bf99b6724d91cd48f8 Mon Sep 17 00:00:00 2001 From: strint Date: Wed, 17 Jul 2024 21:25:17 +0800 Subject: [PATCH 03/11] fix format --- onediff_diffusers_extensions/examples/latte/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onediff_diffusers_extensions/examples/latte/README.md b/onediff_diffusers_extensions/examples/latte/README.md index 6e203b2db..f1c46d5ef 100644 --- a/onediff_diffusers_extensions/examples/latte/README.md +++ b/onediff_diffusers_extensions/examples/latte/README.md @@ -67,7 +67,7 @@ python3 ./benchmarks/text_to_video_latte.py \ | PyTorch E2E time | 32.618 s | | OneDiff E2E time | 22.601 s(-30.7%) | | PyTorch Max Mem Used | 19.9 GiB | -| OneDiff Max Mem Used | 19.9 GiB | +| OneDiff Max Mem Used | 19.9 GiB | | PyTorch Warmup with Run time | 33.291 s | | OneDiff Warmup with Compilation time1 | 572.877 s | | OneDiff Warmup with Cache time | 148.068 s | From a3210c6bf67a09c39173714130d9d40f9921aada Mon Sep 17 00:00:00 2001 From: strint Date: Thu, 18 Jul 2024 12:18:16 +0800 Subject: [PATCH 04/11] prof latte --- benchmarks/text_to_video_latte.py | 87 ++++++++++++++++--------------- 1 file 
changed, 46 insertions(+), 41 deletions(-) diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index 6f7ea6328..7e179d510 100644 --- a/benchmarks/text_to_video_latte.py +++ b/benchmarks/text_to_video_latte.py @@ -225,50 +225,55 @@ def get_kwarg_inputs(): ) return kwarg_inputs - if args.warmups > 0: - print("=======================================") - print("Begin warmup") - begin = time.time() - for _ in range(args.warmups): - pipe(**get_kwarg_inputs()).video - end = time.time() - print("End warmup") - print(f"Warmup time: {end - begin:.3f}s") + with torch.profiler.profile() as prof: + with torch.profiler.record_function("latte warmup"): + if args.warmups > 0: + print("=======================================") + print("Begin warmup") + begin = time.time() + for _ in range(args.warmups): + pipe(**get_kwarg_inputs()).video + end = time.time() + print("End warmup") + print(f"Warmup time: {end - begin:.3f}s") + + print("=======================================") + + kwarg_inputs = get_kwarg_inputs() + iter_profiler = IterationProfiler() + if "callback_on_step_end" in inspect.signature(pipe).parameters: + kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end + elif "callback" in inspect.signature(pipe).parameters: + kwarg_inputs["callback"] = iter_profiler.callback_on_step_end + with torch.profiler.record_function("latte run"): + torch.manual_seed(args.seed) + begin = time.time() + videos = pipe(**kwarg_inputs).video + end = time.time() print("=======================================") + print(f"Inference time: {end - begin:.3f}s") + iter_per_sec = iter_profiler.get_iter_per_sec() + if iter_per_sec is not None: + print(f"Iterations per second: {iter_per_sec:.3f}") + cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) + cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) + print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") + print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") + print("=======================================") - kwarg_inputs = get_kwarg_inputs() - iter_profiler = IterationProfiler() - if "callback_on_step_end" in inspect.signature(pipe).parameters: - kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end - elif "callback" in inspect.signature(pipe).parameters: - kwarg_inputs["callback"] = iter_profiler.callback_on_step_end - torch.manual_seed(args.seed) - begin = time.time() - videos = pipe(**kwarg_inputs).video - end = time.time() - - print("=======================================") - print(f"Inference time: {end - begin:.3f}s") - iter_per_sec = iter_profiler.get_iter_per_sec() - if iter_per_sec is not None: - print(f"Iterations per second: {iter_per_sec:.3f}") - cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) - cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) - print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") - print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") - print("=======================================") - - if args.output_video is not None: - # export_to_video(output_frames[0], args.output_video, fps=args.fps) - try: - imageio.mimwrite( - args.output_video, videos[0], fps=8, quality=9 - ) # highest quality is 10, lowest is 0 - except: - print("Error when saving {}".format(args.prompt)) - else: - print("Please set `--output-video` to save the output video") + with torch.profiler.record_function("latte export"): + if args.output_video is not None: + # export_to_video(output_frames[0], 
args.output_video, fps=args.fps) + try: + imageio.mimwrite( + args.output_video, videos[0], fps=8, quality=9 + ) # highest quality is 10, lowest is 0 + except: + print("Error when saving {}".format(args.prompt)) + else: + print("Please set `--output-video` to save the output video") + prof.export_chrome_trace("latte_with_cache_prof.json") if __name__ == "__main__": From 5872b83eb2cec45a6fd61d4bd0588b08333e0f6f Mon Sep 17 00:00:00 2001 From: strint Date: Thu, 18 Jul 2024 19:19:14 +0800 Subject: [PATCH 05/11] add profile for latte --- benchmarks/text_to_video_latte.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index 7e179d510..e927e486d 100644 --- a/benchmarks/text_to_video_latte.py +++ b/benchmarks/text_to_video_latte.py @@ -50,8 +50,6 @@ def parse_args(): parser.add_argument("--model", type=str, default=MODEL) parser.add_argument("--ckpt", type=str, default=CKPT) parser.add_argument("--prompt", type=str, default=PROMPT) - parser.add_argument("--save_graph", action="store_true") - parser.add_argument("--load_graph", action="store_true") parser.add_argument("--variant", type=str, default=VARIANT) parser.add_argument("--custom-pipeline", type=str, default=CUSTOM_PIPELINE) # parser.add_argument("--sample-method", type=str, default=SAMPLE_METHOD) @@ -94,6 +92,7 @@ def parse_args(): type=int, default=ATTENTION_FP16_SCORE_ACCUM_MAX_M, ) + parser.add_argument("--profile", action="store_true") return parser.parse_args() @@ -122,6 +121,15 @@ def callback_on_step_end(self, pipe, i, t, callback_kwargs={}): self.num_iterations += 1 return callback_kwargs +from contextlib import contextmanager + +@contextmanager +def conditional_context(enabled, context_manager): + if enabled: + with context_manager as cm: + yield cm + else: + return None def main(): args = parse_args() @@ -225,8 +233,8 @@ def get_kwarg_inputs(): ) return kwarg_inputs - with torch.profiler.profile() as prof: - with torch.profiler.record_function("latte warmup"): + with conditional_context(args.profile, torch.profiler.profile()) as prof: + with conditional_context(args.profile, torch.profiler.record_function("latte warmup")): if args.warmups > 0: print("=======================================") print("Begin warmup") @@ -262,7 +270,7 @@ def get_kwarg_inputs(): print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") print("=======================================") - with torch.profiler.record_function("latte export"): + with conditional_context(args.profile, torch.profiler.record_function("latte export")): if args.output_video is not None: # export_to_video(output_frames[0], args.output_video, fps=args.fps) try: @@ -273,7 +281,8 @@ def get_kwarg_inputs(): print("Error when saving {}".format(args.prompt)) else: print("Please set `--output-video` to save the output video") - prof.export_chrome_trace("latte_with_cache_prof.json") + if prof: + prof.export_chrome_trace("latte_with_cache_prof.json") if __name__ == "__main__": From 9f1c44c55e553590a48941852f409e718ff77c1a Mon Sep 17 00:00:00 2001 From: strint Date: Tue, 23 Jul 2024 16:56:53 +0800 Subject: [PATCH 06/11] add diffusers latte pipeline --- benchmarks/text_to_video_latte.py | 77 ++++++++++++++++--- .../examples/latte/README.md | 10 +++ 2 files changed, 75 insertions(+), 12 deletions(-) diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index 5211edaa3..a7a3786e5 100644 --- a/benchmarks/text_to_video_latte.py +++ 
b/benchmarks/text_to_video_latte.py @@ -96,6 +96,7 @@ def parse_args(): default=ATTENTION_FP16_SCORE_ACCUM_MAX_M, ) parser.add_argument("--profile", action="store_true") + parser.add_argument("--from_hf", action="store_true") return parser.parse_args() @@ -137,19 +138,42 @@ def conditional_context(enabled, context_manager): yield None -def main(): - args = parse_args() +_is_form_hf = False - if os.path.exists(args.model): - model_path = args.model + +def get_pipeline(args, model_path, device): + global _is_form_hf + if args.from_hf: + # Has error for now + # File "python3.10/site-packages/diffusers/schedulers/scheduling_ddim.py", line 413, in step + # pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + # RuntimeError: The size of tensor a (4) must match the size of tensor b (8) at non-singleton dimension 1 + print("get pipeline from diffusers") + _is_form_hf = True + return get_pipeline_from_hf(args, model_path, device) else: - from huggingface_hub import snapshot_download + print("get pipeline from source") + _is_form_hf = False + return get_pipeline_from_source(args, model_path, device) - model_path = snapshot_download(repo_id=args.model) - torch.set_grad_enabled(False) - device = "cuda" if torch.cuda.is_available() else "cpu" +def get_pipeline_from_hf(args, model_path, device): + # Get pipeline from diffusers + # diffusers version >= 0.30 + from diffusers import LattePipeline + + pipe = LattePipeline.from_pretrained(args.model, torch_dtype=torch.float16).to( + device + ) + + # Convert to channels_last memory format + pipe.transformer.to(memory_format=torch.channels_last) + pipe.vae.to(memory_format=torch.channels_last) + return pipe + +def get_pipeline_from_source(args, model_path, device): + # Get pipeline from https://github.com/siliconflow/dit_latte/ from models.latte_t2v import LatteT2V from sample.pipeline_latte import LattePipeline @@ -193,6 +217,24 @@ def main(): transformer=transformer_model, ).to(device) + return pipe + + +def main(): + args = parse_args() + + if os.path.exists(args.model): + model_path = args.model + else: + from huggingface_hub import snapshot_download + + model_path = snapshot_download(repo_id=args.model) + + torch.set_grad_enabled(False) + device = "cuda" if torch.cuda.is_available() else "cpu" + + pipe = get_pipeline(args, model_path, device) + if args.compiler == "none": pass elif args.compiler == "nexfort": @@ -230,15 +272,19 @@ def get_kwarg_inputs(): enable_temporal_attentions=args.enable_temporal_attentions, num_images_per_prompt=1, mask_feature=True, - enable_vae_temporal_decoder=args.enable_vae_temporal_decoder, **( dict() if args.extra_call_kwargs is None else json.loads(args.extra_call_kwargs) ), ) + if not _is_form_hf: + kwarg_inputs[ + "enable_vae_temporal_decoder" + ] = args.enable_vae_temporal_decoder return kwarg_inputs + kwarg_inputs = get_kwarg_inputs() with conditional_context(args.profile, torch.profiler.profile()) as prof: with conditional_context( args.profile, torch.profiler.record_function("latte warmup") @@ -248,14 +294,17 @@ def get_kwarg_inputs(): print("Begin warmup") begin = time.time() for _ in range(args.warmups): - pipe(**get_kwarg_inputs()).video + out = pipe(**kwarg_inputs) + if _is_form_hf: + videos = out.frames[0] + else: + videos = out.video end = time.time() print("End warmup") print(f"Warmup time: {end - begin:.3f}s") print("=======================================") - kwarg_inputs = get_kwarg_inputs() iter_profiler = IterationProfiler() if "callback_on_step_end" in 
inspect.signature(pipe).parameters: kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end @@ -264,7 +313,11 @@ def get_kwarg_inputs(): with torch.profiler.record_function("latte run"): torch.manual_seed(args.seed) begin = time.time() - videos = pipe(**kwarg_inputs).video + out = pipe(**kwarg_inputs) + if _is_form_hf: + videos = out.frames[0] + else: + videos = out.video end = time.time() print("=======================================") diff --git a/onediff_diffusers_extensions/examples/latte/README.md b/onediff_diffusers_extensions/examples/latte/README.md index 1353cd844..49eff3e66 100644 --- a/onediff_diffusers_extensions/examples/latte/README.md +++ b/onediff_diffusers_extensions/examples/latte/README.md @@ -13,6 +13,16 @@ ## Environment setup ### Set up Latte + +#### From HF diffusers +Note: HF diffusers has bug on LattePipeline on 20240723 +Reference: https://huggingface.co/docs/diffusers/main/en/api/pipelines/latte +```bash +# make sure LattePipeline avaliable in HF diffusers(diffusers version >= 0.30) +pip install git+https://github.com/huggingface/diffusers.git@main +``` + +#### (Optional)From latte project HF model: https://huggingface.co/maxin-cn/Latte-1 ```bash git clone -b run https://github.com/siliconflow/dit_latte/ From 2102e128eeafd527d510b5e4696c278eb09d387f Mon Sep 17 00:00:00 2001 From: strint Date: Tue, 23 Jul 2024 22:02:42 +0800 Subject: [PATCH 07/11] add new test data ad base --- .../examples/latte/README.md | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/onediff_diffusers_extensions/examples/latte/README.md b/onediff_diffusers_extensions/examples/latte/README.md index 49eff3e66..02887a830 100644 --- a/onediff_diffusers_extensions/examples/latte/README.md +++ b/onediff_diffusers_extensions/examples/latte/README.md @@ -69,18 +69,18 @@ python3 ./benchmarks/text_to_video_latte.py \ ### Metric #### On A100 -| Metric | NVIDIA A100-PCIE-40GB (512 * 512) | -| ------------------------------------------------ | --------------------------------- | -| Data update date(yyyy-mm-dd) | 2024-06-19 | -| PyTorch iteration speed | 1.60 it/s | -| OneDiff iteration speed | 2.27 it/s(+41.9%) | -| PyTorch E2E time | 32.618 s | -| OneDiff E2E time | 22.601 s(-30.7%) | -| PyTorch Max Mem Used | 19.9 GiB | -| OneDiff Max Mem Used | 19.9 GiB | -| PyTorch Warmup with Run time | 33.291 s | -| OneDiff Warmup with Compilation time1 | 572.877 s | -| OneDiff Warmup with Cache time | 148.068 s | +| Metric | NVIDIA A100-PCIE-40GB (512 * 512) | NVIDIA A100-PCIE-40GB(512 * 512) by strint on ubuntu22 | +| ------------------------------------------------ | --------------------------------- | ------------------------------------------------------- | +| Data update date(yyyy-mm-dd) | 2024-06-19 | 2024-07-23 | +| PyTorch iteration speed | 1.60 it/s | 1.6 it/s | +| OneDiff iteration speed | 2.27 it/s(+41.9%) | 1.723 it/s | +| PyTorch E2E time | 32.618 s | 32.497 s | +| OneDiff E2E time | 22.601 s(-30.7%) | 29.64 s | +| PyTorch Max Mem Used | 19.9 GiB | 19.92 GiB | +| OneDiff Max Mem Used | 19.9 GiB | 19.9 GiB | +| PyTorch Warmup with Run time | 33.291 s | 33.129 s | +| OneDiff Warmup with Compilation time1 | 572.877 s | 737.6 s | +| OneDiff Warmup with Cache time | 148.068 s | 159 s | 1 OneDiff Warmup with Compilation time is tested on Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz. Note this is just for reference, and it varies a lot on different CPU. 
From f83f3873e42c86ca93eab29a6cb2e951112a3399 Mon Sep 17 00:00:00 2001 From: strint Date: Wed, 24 Jul 2024 11:03:55 +0800 Subject: [PATCH 08/11] rename --- benchmarks/text_to_video_latte.py | 179 ++++------- benchmarks/text_to_video_latte_profile.py | 352 ++++++++++++++++++++++ 2 files changed, 405 insertions(+), 126 deletions(-) create mode 100644 benchmarks/text_to_video_latte_profile.py diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index a7a3786e5..fdf538d58 100644 --- a/benchmarks/text_to_video_latte.py +++ b/benchmarks/text_to_video_latte.py @@ -51,6 +51,8 @@ def parse_args(): parser.add_argument("--model", type=str, default=MODEL) parser.add_argument("--ckpt", type=str, default=CKPT) parser.add_argument("--prompt", type=str, default=PROMPT) + parser.add_argument("--save_graph", action="store_true") + parser.add_argument("--load_graph", action="store_true") parser.add_argument("--variant", type=str, default=VARIANT) parser.add_argument("--custom-pipeline", type=str, default=CUSTOM_PIPELINE) # parser.add_argument("--sample-method", type=str, default=SAMPLE_METHOD) @@ -95,8 +97,6 @@ def parse_args(): type=int, default=ATTENTION_FP16_SCORE_ACCUM_MAX_M, ) - parser.add_argument("--profile", action="store_true") - parser.add_argument("--from_hf", action="store_true") return parser.parse_args() @@ -126,54 +126,19 @@ def callback_on_step_end(self, pipe, i, t, callback_kwargs={}): return callback_kwargs -from contextlib import contextmanager - - -@contextmanager -def conditional_context(enabled, context_manager): - if enabled: - with context_manager as cm: - yield cm - else: - yield None - - -_is_form_hf = False - +def main(): + args = parse_args() -def get_pipeline(args, model_path, device): - global _is_form_hf - if args.from_hf: - # Has error for now - # File "python3.10/site-packages/diffusers/schedulers/scheduling_ddim.py", line 413, in step - # pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - # RuntimeError: The size of tensor a (4) must match the size of tensor b (8) at non-singleton dimension 1 - print("get pipeline from diffusers") - _is_form_hf = True - return get_pipeline_from_hf(args, model_path, device) + if os.path.exists(args.model): + model_path = args.model else: - print("get pipeline from source") - _is_form_hf = False - return get_pipeline_from_source(args, model_path, device) - - -def get_pipeline_from_hf(args, model_path, device): - # Get pipeline from diffusers - # diffusers version >= 0.30 - from diffusers import LattePipeline - - pipe = LattePipeline.from_pretrained(args.model, torch_dtype=torch.float16).to( - device - ) + from huggingface_hub import snapshot_download - # Convert to channels_last memory format - pipe.transformer.to(memory_format=torch.channels_last) - pipe.vae.to(memory_format=torch.channels_last) - return pipe + model_path = snapshot_download(repo_id=args.model) + torch.set_grad_enabled(False) + device = "cuda" if torch.cuda.is_available() else "cpu" -def get_pipeline_from_source(args, model_path, device): - # Get pipeline from https://github.com/siliconflow/dit_latte/ from models.latte_t2v import LatteT2V from sample.pipeline_latte import LattePipeline @@ -217,24 +182,6 @@ def get_pipeline_from_source(args, model_path, device): transformer=transformer_model, ).to(device) - return pipe - - -def main(): - args = parse_args() - - if os.path.exists(args.model): - model_path = args.model - else: - from huggingface_hub import snapshot_download - - model_path = 
snapshot_download(repo_id=args.model) - - torch.set_grad_enabled(False) - device = "cuda" if torch.cuda.is_available() else "cpu" - - pipe = get_pipeline(args, model_path, device) - if args.compiler == "none": pass elif args.compiler == "nexfort": @@ -272,80 +219,60 @@ def get_kwarg_inputs(): enable_temporal_attentions=args.enable_temporal_attentions, num_images_per_prompt=1, mask_feature=True, + enable_vae_temporal_decoder=args.enable_vae_temporal_decoder, **( dict() if args.extra_call_kwargs is None else json.loads(args.extra_call_kwargs) ), ) - if not _is_form_hf: - kwarg_inputs[ - "enable_vae_temporal_decoder" - ] = args.enable_vae_temporal_decoder return kwarg_inputs - kwarg_inputs = get_kwarg_inputs() - with conditional_context(args.profile, torch.profiler.profile()) as prof: - with conditional_context( - args.profile, torch.profiler.record_function("latte warmup") - ): - if args.warmups > 0: - print("=======================================") - print("Begin warmup") - begin = time.time() - for _ in range(args.warmups): - out = pipe(**kwarg_inputs) - if _is_form_hf: - videos = out.frames[0] - else: - videos = out.video - end = time.time() - print("End warmup") - print(f"Warmup time: {end - begin:.3f}s") - - print("=======================================") - - iter_profiler = IterationProfiler() - if "callback_on_step_end" in inspect.signature(pipe).parameters: - kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end - elif "callback" in inspect.signature(pipe).parameters: - kwarg_inputs["callback"] = iter_profiler.callback_on_step_end - with torch.profiler.record_function("latte run"): - torch.manual_seed(args.seed) - begin = time.time() - out = pipe(**kwarg_inputs) - if _is_form_hf: - videos = out.frames[0] - else: - videos = out.video - end = time.time() - + if args.warmups > 0: print("=======================================") - print(f"Inference time: {end - begin:.3f}s") - iter_per_sec = iter_profiler.get_iter_per_sec() - if iter_per_sec is not None: - print(f"Iterations per second: {iter_per_sec:.3f}") - cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) - cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) - print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") - print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") + print("Begin warmup") + begin = time.time() + for _ in range(args.warmups): + pipe(**get_kwarg_inputs()).video + end = time.time() + print("End warmup") + print(f"Warmup time: {end - begin:.3f}s") + print("=======================================") - with conditional_context( - args.profile, torch.profiler.record_function("latte export") - ): - if args.output_video is not None: - # export_to_video(output_frames[0], args.output_video, fps=args.fps) - try: - imageio.mimwrite( - args.output_video, videos[0], fps=8, quality=9 - ) # highest quality is 10, lowest is 0 - except: - print("Error when saving {}".format(args.prompt)) - else: - print("Please set `--output-video` to save the output video") - if prof: - prof.export_chrome_trace("latte_with_cache_prof.json") + kwarg_inputs = get_kwarg_inputs() + iter_profiler = IterationProfiler() + if "callback_on_step_end" in inspect.signature(pipe).parameters: + kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end + elif "callback" in inspect.signature(pipe).parameters: + kwarg_inputs["callback"] = iter_profiler.callback_on_step_end + torch.manual_seed(args.seed) + begin = time.time() + videos = pipe(**kwarg_inputs).video + end = 
time.time() + + print("=======================================") + print(f"Inference time: {end - begin:.3f}s") + iter_per_sec = iter_profiler.get_iter_per_sec() + if iter_per_sec is not None: + print(f"Iterations per second: {iter_per_sec:.3f}") + cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) + cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) + print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") + print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") + print("=======================================") + + if args.output_video is not None: + # export_to_video(output_frames[0], args.output_video, fps=args.fps) + try: + imageio.mimwrite( + args.output_video, videos[0], fps=8, quality=9 + ) # highest quality is 10, lowest is 0 + except: + print("Error when saving {}".format(args.prompt)) + + else: + print("Please set `--output-video` to save the output video") if __name__ == "__main__": diff --git a/benchmarks/text_to_video_latte_profile.py b/benchmarks/text_to_video_latte_profile.py new file mode 100644 index 000000000..bdd4f5f11 --- /dev/null +++ b/benchmarks/text_to_video_latte_profile.py @@ -0,0 +1,352 @@ +MODEL = "maxin-cn/Latte-1" +CKPT = "t2v_v20240523.pt" +VARIANT = None +CUSTOM_PIPELINE = None +# SAMPLE_METHOD = "DDIM" +BETA_START = 0.0001 +BETA_END = 0.02 +BREA_SCHEDULE = "linear" +VARIANCE_TYPE = "learned_range" +STEPS = 50 +SEED = 25 +WARMUPS = 1 +BATCH = 1 +HEIGHT = 512 +WIDTH = 512 +VIDEO_LENGTH = 16 +FPS = 8 +GUIDANCE_SCALE = 7.5 +ENABLE_TEMPORAL_ATTENTIONS = "true" +ENABLE_VAE_TEMPORAL_DECODER = "true" +OUTPUT_VIDEO = "output.mp4" + +PROMPT = "An epic tornado attacking above aglowing city at night." + +EXTRA_CALL_KWARGS = None +ATTENTION_FP16_SCORE_ACCUM_MAX_M = 0 + +COMPILER_CONFIG = None + + +import argparse +import importlib +import inspect +import json +import os +import random +import time + +import imageio + +import torch +from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder +from diffusers.schedulers import DDIMScheduler +from onediffx import compile_pipe +from PIL import Image, ImageDraw +from transformers import T5EncoderModel, T5Tokenizer + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, default=MODEL) + parser.add_argument("--ckpt", type=str, default=CKPT) + parser.add_argument("--prompt", type=str, default=PROMPT) + parser.add_argument("--variant", type=str, default=VARIANT) + parser.add_argument("--custom-pipeline", type=str, default=CUSTOM_PIPELINE) + # parser.add_argument("--sample-method", type=str, default=SAMPLE_METHOD) + parser.add_argument("--beta-start", type=float, default=BETA_START) + parser.add_argument("--beta-end", type=float, default=BETA_END) + parser.add_argument("--beta-schedule", type=str, default=BREA_SCHEDULE) + parser.add_argument( + "--enable_temporal_attentions", + type=(lambda x: str(x).lower() in ["true", "1", "yes"]), + default=ENABLE_TEMPORAL_ATTENTIONS, + ) + parser.add_argument( + "--enable_vae_temporal_decoder", + type=(lambda x: str(x).lower() in ["true", "1", "yes"]), + default=ENABLE_VAE_TEMPORAL_DECODER, + ) + parser.add_argument("--guidance-scale", type=float, default=GUIDANCE_SCALE) + parser.add_argument("--variance-type", type=str, default=VARIANCE_TYPE) + parser.add_argument("--steps", type=int, default=STEPS) + parser.add_argument("--seed", type=int, default=SEED) + parser.add_argument("--warmups", type=int, default=WARMUPS) + parser.add_argument("--batch", type=int, default=BATCH) + 
parser.add_argument("--height", type=int, default=HEIGHT) + parser.add_argument("--width", type=int, default=WIDTH) + parser.add_argument("--video-length", type=int, default=VIDEO_LENGTH) + parser.add_argument("--fps", type=int, default=FPS) + parser.add_argument("--extra-call-kwargs", type=str, default=EXTRA_CALL_KWARGS) + parser.add_argument("--output-video", type=str, default=OUTPUT_VIDEO) + parser.add_argument( + "--compiler", + type=str, + default="nexfort", + choices=["none", "nexfort", "compile"], + ) + parser.add_argument( + "--compiler-config", + type=str, + default=COMPILER_CONFIG, + ) + parser.add_argument( + "--attention-fp16-score-accum-max-m", + type=int, + default=ATTENTION_FP16_SCORE_ACCUM_MAX_M, + ) + parser.add_argument("--profile", action="store_true") + parser.add_argument("--from_hf", action="store_true") + return parser.parse_args() + + +class IterationProfiler: + def __init__(self): + self.begin = None + self.end = None + self.num_iterations = 0 + + def get_iter_per_sec(self): + if self.begin is None or self.end is None: + return None + self.end.synchronize() + dur = self.begin.elapsed_time(self.end) + return self.num_iterations / dur * 1000.0 + + def callback_on_step_end(self, pipe, i, t, callback_kwargs={}): + if self.begin is None: + event = torch.cuda.Event(enable_timing=True) + event.record() + self.begin = event + else: + event = torch.cuda.Event(enable_timing=True) + event.record() + self.end = event + self.num_iterations += 1 + return callback_kwargs + + +from contextlib import contextmanager + + +@contextmanager +def conditional_context(enabled, context_manager): + if enabled: + with context_manager as cm: + yield cm + else: + yield None + + +_is_form_hf = False + + +def get_pipeline(args, model_path, device): + global _is_form_hf + if args.from_hf: + # Has error for now + # File "python3.10/site-packages/diffusers/schedulers/scheduling_ddim.py", line 413, in step + # pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + # RuntimeError: The size of tensor a (4) must match the size of tensor b (8) at non-singleton dimension 1 + print("get pipeline from diffusers") + _is_form_hf = True + return get_pipeline_from_hf(args, model_path, device) + else: + print("get pipeline from source") + _is_form_hf = False + return get_pipeline_from_source(args, model_path, device) + + +def get_pipeline_from_hf(args, model_path, device): + # Get pipeline from diffusers + # diffusers version >= 0.30 + from diffusers import LattePipeline + + pipe = LattePipeline.from_pretrained(args.model, torch_dtype=torch.float16).to( + device + ) + + # Convert to channels_last memory format + pipe.transformer.to(memory_format=torch.channels_last) + pipe.vae.to(memory_format=torch.channels_last) + return pipe + + +def get_pipeline_from_source(args, model_path, device): + # Get pipeline from https://github.com/siliconflow/dit_latte/ + from models.latte_t2v import LatteT2V + from sample.pipeline_latte import LattePipeline + + transformer_model = LatteT2V.from_pretrained( + model_path, subfolder="transformer", video_length=args.video_length + ).to(device, dtype=torch.float16) + + if args.enable_vae_temporal_decoder: + vae = AutoencoderKLTemporalDecoder.from_pretrained( + args.model, subfolder="vae_temporal_decoder", torch_dtype=torch.float16 + ).to(device) + else: + vae = AutoencoderKL.from_pretrained( + args.model, subfolder="vae", torch_dtype=torch.float16 + ).to(device) + tokenizer = T5Tokenizer.from_pretrained(args.model, subfolder="tokenizer") + 
text_encoder = T5EncoderModel.from_pretrained( + args.model, subfolder="text_encoder", torch_dtype=torch.float16 + ).to(device) + + # set eval mode + transformer_model.eval() + vae.eval() + text_encoder.eval() + + scheduler = DDIMScheduler.from_pretrained( + model_path, + subfolder="scheduler", + beta_start=args.beta_start, + beta_end=args.beta_end, + beta_schedule=args.beta_schedule, + variance_type=args.variance_type, + clip_sample=False, + ) + + pipe = LattePipeline( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + transformer=transformer_model, + ).to(device) + + return pipe + + +def main(): + args = parse_args() + + if os.path.exists(args.model): + model_path = args.model + else: + from huggingface_hub import snapshot_download + + model_path = snapshot_download(repo_id=args.model) + + torch.set_grad_enabled(False) + device = "cuda" if torch.cuda.is_available() else "cpu" + + pipe = get_pipeline(args, model_path, device) + + if args.compiler == "none": + pass + elif args.compiler == "nexfort": + print("Nexfort backend is now active...") + if args.compiler_config is not None: + # config with dict + options = json.loads(args.compiler_config) + else: + # config with string + options = '{"mode": "O2", \ + "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": false, \ + "triton.fuse_attention_allow_fp16_reduction": false}}' + pipe = compile_pipe( + pipe, backend="nexfort", options=options, fuse_qkv_projections=True + ) + elif args.compiler == "compile": + if hasattr(pipe, "unet"): + pipe.unet = torch.compile(pipe.unet) + if hasattr(pipe, "transformer"): + pipe.transformer = torch.compile(pipe.transformer) + if hasattr(pipe, "controlnet"): + pipe.controlnet = torch.compile(pipe.controlnet) + pipe.vae = torch.compile(pipe.vae) + else: + raise ValueError(f"Unknown compiler: {args.compiler}") + + def get_kwarg_inputs(): + kwarg_inputs = dict( + prompt=args.prompt, + video_length=args.video_length, + height=args.height, + width=args.width, + num_inference_steps=args.steps, + guidance_scale=args.guidance_scale, + enable_temporal_attentions=args.enable_temporal_attentions, + num_images_per_prompt=1, + mask_feature=True, + **( + dict() + if args.extra_call_kwargs is None + else json.loads(args.extra_call_kwargs) + ), + ) + if not _is_form_hf: + kwarg_inputs[ + "enable_vae_temporal_decoder" + ] = args.enable_vae_temporal_decoder + return kwarg_inputs + + kwarg_inputs = get_kwarg_inputs() + with conditional_context(args.profile, torch.profiler.profile()) as prof: + with conditional_context( + args.profile, torch.profiler.record_function("latte warmup") + ): + if args.warmups > 0: + print("=======================================") + print("Begin warmup") + begin = time.time() + for _ in range(args.warmups): + out = pipe(**kwarg_inputs) + if _is_form_hf: + videos = out.frames[0] + else: + videos = out.video + end = time.time() + print("End warmup") + print(f"Warmup time: {end - begin:.3f}s") + + print("=======================================") + + iter_profiler = IterationProfiler() + if "callback_on_step_end" in inspect.signature(pipe).parameters: + kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end + elif "callback" in inspect.signature(pipe).parameters: + kwarg_inputs["callback"] = iter_profiler.callback_on_step_end + with torch.profiler.record_function("latte run"): + torch.manual_seed(args.seed) + begin = time.time() + out = pipe(**kwarg_inputs) + if _is_form_hf: + videos = out.frames[0] + else: + videos = 
out.video + end = time.time() + + print("=======================================") + print(f"Inference time: {end - begin:.3f}s") + iter_per_sec = iter_profiler.get_iter_per_sec() + if iter_per_sec is not None: + print(f"Iterations per second: {iter_per_sec:.3f}") + cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) + cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) + print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") + print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") + print("=======================================") + + with conditional_context( + args.profile, torch.profiler.record_function("latte export") + ): + if args.output_video is not None: + # export_to_video(output_frames[0], args.output_video, fps=args.fps) + try: + imageio.mimwrite( + args.output_video, videos[0], fps=8, quality=9 + ) # highest quality is 10, lowest is 0 + except: + print("Error when saving {}".format(args.prompt)) + else: + print("Please set `--output-video` to save the output video") + if prof: + prof.export_chrome_trace("latte_with_cache_prof.json") + + +if __name__ == "__main__": + main() From 78df7d14d3ea76705cf8d9cc66af3e1d6bcb2edc Mon Sep 17 00:00:00 2001 From: strint Date: Wed, 24 Jul 2024 11:32:22 +0800 Subject: [PATCH 09/11] restore profile --- benchmarks/text_to_video_latte.py | 181 +++++++---- benchmarks/text_to_video_latte_profile.py | 352 ---------------------- 2 files changed, 127 insertions(+), 406 deletions(-) delete mode 100644 benchmarks/text_to_video_latte_profile.py diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index fdf538d58..bdd4f5f11 100644 --- a/benchmarks/text_to_video_latte.py +++ b/benchmarks/text_to_video_latte.py @@ -51,8 +51,6 @@ def parse_args(): parser.add_argument("--model", type=str, default=MODEL) parser.add_argument("--ckpt", type=str, default=CKPT) parser.add_argument("--prompt", type=str, default=PROMPT) - parser.add_argument("--save_graph", action="store_true") - parser.add_argument("--load_graph", action="store_true") parser.add_argument("--variant", type=str, default=VARIANT) parser.add_argument("--custom-pipeline", type=str, default=CUSTOM_PIPELINE) # parser.add_argument("--sample-method", type=str, default=SAMPLE_METHOD) @@ -97,6 +95,8 @@ def parse_args(): type=int, default=ATTENTION_FP16_SCORE_ACCUM_MAX_M, ) + parser.add_argument("--profile", action="store_true") + parser.add_argument("--from_hf", action="store_true") return parser.parse_args() @@ -126,19 +126,54 @@ def callback_on_step_end(self, pipe, i, t, callback_kwargs={}): return callback_kwargs -def main(): - args = parse_args() +from contextlib import contextmanager - if os.path.exists(args.model): - model_path = args.model + +@contextmanager +def conditional_context(enabled, context_manager): + if enabled: + with context_manager as cm: + yield cm else: - from huggingface_hub import snapshot_download + yield None - model_path = snapshot_download(repo_id=args.model) - torch.set_grad_enabled(False) - device = "cuda" if torch.cuda.is_available() else "cpu" +_is_form_hf = False + +def get_pipeline(args, model_path, device): + global _is_form_hf + if args.from_hf: + # Has error for now + # File "python3.10/site-packages/diffusers/schedulers/scheduling_ddim.py", line 413, in step + # pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + # RuntimeError: The size of tensor a (4) must match the size of tensor b (8) at non-singleton dimension 1 + print("get pipeline from 
diffusers") + _is_form_hf = True + return get_pipeline_from_hf(args, model_path, device) + else: + print("get pipeline from source") + _is_form_hf = False + return get_pipeline_from_source(args, model_path, device) + + +def get_pipeline_from_hf(args, model_path, device): + # Get pipeline from diffusers + # diffusers version >= 0.30 + from diffusers import LattePipeline + + pipe = LattePipeline.from_pretrained(args.model, torch_dtype=torch.float16).to( + device + ) + + # Convert to channels_last memory format + pipe.transformer.to(memory_format=torch.channels_last) + pipe.vae.to(memory_format=torch.channels_last) + return pipe + + +def get_pipeline_from_source(args, model_path, device): + # Get pipeline from https://github.com/siliconflow/dit_latte/ from models.latte_t2v import LatteT2V from sample.pipeline_latte import LattePipeline @@ -182,6 +217,24 @@ def main(): transformer=transformer_model, ).to(device) + return pipe + + +def main(): + args = parse_args() + + if os.path.exists(args.model): + model_path = args.model + else: + from huggingface_hub import snapshot_download + + model_path = snapshot_download(repo_id=args.model) + + torch.set_grad_enabled(False) + device = "cuda" if torch.cuda.is_available() else "cpu" + + pipe = get_pipeline(args, model_path, device) + if args.compiler == "none": pass elif args.compiler == "nexfort": @@ -191,7 +244,7 @@ def main(): options = json.loads(args.compiler_config) else: # config with string - options = '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision", \ + options = '{"mode": "O2", \ "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": false, \ "triton.fuse_attention_allow_fp16_reduction": false}}' pipe = compile_pipe( @@ -219,60 +272,80 @@ def get_kwarg_inputs(): enable_temporal_attentions=args.enable_temporal_attentions, num_images_per_prompt=1, mask_feature=True, - enable_vae_temporal_decoder=args.enable_vae_temporal_decoder, **( dict() if args.extra_call_kwargs is None else json.loads(args.extra_call_kwargs) ), ) + if not _is_form_hf: + kwarg_inputs[ + "enable_vae_temporal_decoder" + ] = args.enable_vae_temporal_decoder return kwarg_inputs - if args.warmups > 0: - print("=======================================") - print("Begin warmup") - begin = time.time() - for _ in range(args.warmups): - pipe(**get_kwarg_inputs()).video - end = time.time() - print("End warmup") - print(f"Warmup time: {end - begin:.3f}s") + kwarg_inputs = get_kwarg_inputs() + with conditional_context(args.profile, torch.profiler.profile()) as prof: + with conditional_context( + args.profile, torch.profiler.record_function("latte warmup") + ): + if args.warmups > 0: + print("=======================================") + print("Begin warmup") + begin = time.time() + for _ in range(args.warmups): + out = pipe(**kwarg_inputs) + if _is_form_hf: + videos = out.frames[0] + else: + videos = out.video + end = time.time() + print("End warmup") + print(f"Warmup time: {end - begin:.3f}s") + + print("=======================================") + + iter_profiler = IterationProfiler() + if "callback_on_step_end" in inspect.signature(pipe).parameters: + kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end + elif "callback" in inspect.signature(pipe).parameters: + kwarg_inputs["callback"] = iter_profiler.callback_on_step_end + with torch.profiler.record_function("latte run"): + torch.manual_seed(args.seed) + begin = time.time() + out = pipe(**kwarg_inputs) + if _is_form_hf: + videos = out.frames[0] + else: + videos = 
out.video + end = time.time() print("=======================================") + print(f"Inference time: {end - begin:.3f}s") + iter_per_sec = iter_profiler.get_iter_per_sec() + if iter_per_sec is not None: + print(f"Iterations per second: {iter_per_sec:.3f}") + cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) + cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) + print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") + print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") + print("=======================================") - kwarg_inputs = get_kwarg_inputs() - iter_profiler = IterationProfiler() - if "callback_on_step_end" in inspect.signature(pipe).parameters: - kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end - elif "callback" in inspect.signature(pipe).parameters: - kwarg_inputs["callback"] = iter_profiler.callback_on_step_end - torch.manual_seed(args.seed) - begin = time.time() - videos = pipe(**kwarg_inputs).video - end = time.time() - - print("=======================================") - print(f"Inference time: {end - begin:.3f}s") - iter_per_sec = iter_profiler.get_iter_per_sec() - if iter_per_sec is not None: - print(f"Iterations per second: {iter_per_sec:.3f}") - cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) - cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) - print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") - print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") - print("=======================================") - - if args.output_video is not None: - # export_to_video(output_frames[0], args.output_video, fps=args.fps) - try: - imageio.mimwrite( - args.output_video, videos[0], fps=8, quality=9 - ) # highest quality is 10, lowest is 0 - except: - print("Error when saving {}".format(args.prompt)) - - else: - print("Please set `--output-video` to save the output video") + with conditional_context( + args.profile, torch.profiler.record_function("latte export") + ): + if args.output_video is not None: + # export_to_video(output_frames[0], args.output_video, fps=args.fps) + try: + imageio.mimwrite( + args.output_video, videos[0], fps=8, quality=9 + ) # highest quality is 10, lowest is 0 + except: + print("Error when saving {}".format(args.prompt)) + else: + print("Please set `--output-video` to save the output video") + if prof: + prof.export_chrome_trace("latte_with_cache_prof.json") if __name__ == "__main__": diff --git a/benchmarks/text_to_video_latte_profile.py b/benchmarks/text_to_video_latte_profile.py deleted file mode 100644 index bdd4f5f11..000000000 --- a/benchmarks/text_to_video_latte_profile.py +++ /dev/null @@ -1,352 +0,0 @@ -MODEL = "maxin-cn/Latte-1" -CKPT = "t2v_v20240523.pt" -VARIANT = None -CUSTOM_PIPELINE = None -# SAMPLE_METHOD = "DDIM" -BETA_START = 0.0001 -BETA_END = 0.02 -BREA_SCHEDULE = "linear" -VARIANCE_TYPE = "learned_range" -STEPS = 50 -SEED = 25 -WARMUPS = 1 -BATCH = 1 -HEIGHT = 512 -WIDTH = 512 -VIDEO_LENGTH = 16 -FPS = 8 -GUIDANCE_SCALE = 7.5 -ENABLE_TEMPORAL_ATTENTIONS = "true" -ENABLE_VAE_TEMPORAL_DECODER = "true" -OUTPUT_VIDEO = "output.mp4" - -PROMPT = "An epic tornado attacking above aglowing city at night." 
- -EXTRA_CALL_KWARGS = None -ATTENTION_FP16_SCORE_ACCUM_MAX_M = 0 - -COMPILER_CONFIG = None - - -import argparse -import importlib -import inspect -import json -import os -import random -import time - -import imageio - -import torch -from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder -from diffusers.schedulers import DDIMScheduler -from onediffx import compile_pipe -from PIL import Image, ImageDraw -from transformers import T5EncoderModel, T5Tokenizer - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=str, default=MODEL) - parser.add_argument("--ckpt", type=str, default=CKPT) - parser.add_argument("--prompt", type=str, default=PROMPT) - parser.add_argument("--variant", type=str, default=VARIANT) - parser.add_argument("--custom-pipeline", type=str, default=CUSTOM_PIPELINE) - # parser.add_argument("--sample-method", type=str, default=SAMPLE_METHOD) - parser.add_argument("--beta-start", type=float, default=BETA_START) - parser.add_argument("--beta-end", type=float, default=BETA_END) - parser.add_argument("--beta-schedule", type=str, default=BREA_SCHEDULE) - parser.add_argument( - "--enable_temporal_attentions", - type=(lambda x: str(x).lower() in ["true", "1", "yes"]), - default=ENABLE_TEMPORAL_ATTENTIONS, - ) - parser.add_argument( - "--enable_vae_temporal_decoder", - type=(lambda x: str(x).lower() in ["true", "1", "yes"]), - default=ENABLE_VAE_TEMPORAL_DECODER, - ) - parser.add_argument("--guidance-scale", type=float, default=GUIDANCE_SCALE) - parser.add_argument("--variance-type", type=str, default=VARIANCE_TYPE) - parser.add_argument("--steps", type=int, default=STEPS) - parser.add_argument("--seed", type=int, default=SEED) - parser.add_argument("--warmups", type=int, default=WARMUPS) - parser.add_argument("--batch", type=int, default=BATCH) - parser.add_argument("--height", type=int, default=HEIGHT) - parser.add_argument("--width", type=int, default=WIDTH) - parser.add_argument("--video-length", type=int, default=VIDEO_LENGTH) - parser.add_argument("--fps", type=int, default=FPS) - parser.add_argument("--extra-call-kwargs", type=str, default=EXTRA_CALL_KWARGS) - parser.add_argument("--output-video", type=str, default=OUTPUT_VIDEO) - parser.add_argument( - "--compiler", - type=str, - default="nexfort", - choices=["none", "nexfort", "compile"], - ) - parser.add_argument( - "--compiler-config", - type=str, - default=COMPILER_CONFIG, - ) - parser.add_argument( - "--attention-fp16-score-accum-max-m", - type=int, - default=ATTENTION_FP16_SCORE_ACCUM_MAX_M, - ) - parser.add_argument("--profile", action="store_true") - parser.add_argument("--from_hf", action="store_true") - return parser.parse_args() - - -class IterationProfiler: - def __init__(self): - self.begin = None - self.end = None - self.num_iterations = 0 - - def get_iter_per_sec(self): - if self.begin is None or self.end is None: - return None - self.end.synchronize() - dur = self.begin.elapsed_time(self.end) - return self.num_iterations / dur * 1000.0 - - def callback_on_step_end(self, pipe, i, t, callback_kwargs={}): - if self.begin is None: - event = torch.cuda.Event(enable_timing=True) - event.record() - self.begin = event - else: - event = torch.cuda.Event(enable_timing=True) - event.record() - self.end = event - self.num_iterations += 1 - return callback_kwargs - - -from contextlib import contextmanager - - -@contextmanager -def conditional_context(enabled, context_manager): - if enabled: - with context_manager as cm: - yield cm - else: - yield None - - 
-_is_form_hf = False - - -def get_pipeline(args, model_path, device): - global _is_form_hf - if args.from_hf: - # Has error for now - # File "python3.10/site-packages/diffusers/schedulers/scheduling_ddim.py", line 413, in step - # pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - # RuntimeError: The size of tensor a (4) must match the size of tensor b (8) at non-singleton dimension 1 - print("get pipeline from diffusers") - _is_form_hf = True - return get_pipeline_from_hf(args, model_path, device) - else: - print("get pipeline from source") - _is_form_hf = False - return get_pipeline_from_source(args, model_path, device) - - -def get_pipeline_from_hf(args, model_path, device): - # Get pipeline from diffusers - # diffusers version >= 0.30 - from diffusers import LattePipeline - - pipe = LattePipeline.from_pretrained(args.model, torch_dtype=torch.float16).to( - device - ) - - # Convert to channels_last memory format - pipe.transformer.to(memory_format=torch.channels_last) - pipe.vae.to(memory_format=torch.channels_last) - return pipe - - -def get_pipeline_from_source(args, model_path, device): - # Get pipeline from https://github.com/siliconflow/dit_latte/ - from models.latte_t2v import LatteT2V - from sample.pipeline_latte import LattePipeline - - transformer_model = LatteT2V.from_pretrained( - model_path, subfolder="transformer", video_length=args.video_length - ).to(device, dtype=torch.float16) - - if args.enable_vae_temporal_decoder: - vae = AutoencoderKLTemporalDecoder.from_pretrained( - args.model, subfolder="vae_temporal_decoder", torch_dtype=torch.float16 - ).to(device) - else: - vae = AutoencoderKL.from_pretrained( - args.model, subfolder="vae", torch_dtype=torch.float16 - ).to(device) - tokenizer = T5Tokenizer.from_pretrained(args.model, subfolder="tokenizer") - text_encoder = T5EncoderModel.from_pretrained( - args.model, subfolder="text_encoder", torch_dtype=torch.float16 - ).to(device) - - # set eval mode - transformer_model.eval() - vae.eval() - text_encoder.eval() - - scheduler = DDIMScheduler.from_pretrained( - model_path, - subfolder="scheduler", - beta_start=args.beta_start, - beta_end=args.beta_end, - beta_schedule=args.beta_schedule, - variance_type=args.variance_type, - clip_sample=False, - ) - - pipe = LattePipeline( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - scheduler=scheduler, - transformer=transformer_model, - ).to(device) - - return pipe - - -def main(): - args = parse_args() - - if os.path.exists(args.model): - model_path = args.model - else: - from huggingface_hub import snapshot_download - - model_path = snapshot_download(repo_id=args.model) - - torch.set_grad_enabled(False) - device = "cuda" if torch.cuda.is_available() else "cpu" - - pipe = get_pipeline(args, model_path, device) - - if args.compiler == "none": - pass - elif args.compiler == "nexfort": - print("Nexfort backend is now active...") - if args.compiler_config is not None: - # config with dict - options = json.loads(args.compiler_config) - else: - # config with string - options = '{"mode": "O2", \ - "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": false, \ - "triton.fuse_attention_allow_fp16_reduction": false}}' - pipe = compile_pipe( - pipe, backend="nexfort", options=options, fuse_qkv_projections=True - ) - elif args.compiler == "compile": - if hasattr(pipe, "unet"): - pipe.unet = torch.compile(pipe.unet) - if hasattr(pipe, "transformer"): - pipe.transformer = torch.compile(pipe.transformer) - if 
hasattr(pipe, "controlnet"): - pipe.controlnet = torch.compile(pipe.controlnet) - pipe.vae = torch.compile(pipe.vae) - else: - raise ValueError(f"Unknown compiler: {args.compiler}") - - def get_kwarg_inputs(): - kwarg_inputs = dict( - prompt=args.prompt, - video_length=args.video_length, - height=args.height, - width=args.width, - num_inference_steps=args.steps, - guidance_scale=args.guidance_scale, - enable_temporal_attentions=args.enable_temporal_attentions, - num_images_per_prompt=1, - mask_feature=True, - **( - dict() - if args.extra_call_kwargs is None - else json.loads(args.extra_call_kwargs) - ), - ) - if not _is_form_hf: - kwarg_inputs[ - "enable_vae_temporal_decoder" - ] = args.enable_vae_temporal_decoder - return kwarg_inputs - - kwarg_inputs = get_kwarg_inputs() - with conditional_context(args.profile, torch.profiler.profile()) as prof: - with conditional_context( - args.profile, torch.profiler.record_function("latte warmup") - ): - if args.warmups > 0: - print("=======================================") - print("Begin warmup") - begin = time.time() - for _ in range(args.warmups): - out = pipe(**kwarg_inputs) - if _is_form_hf: - videos = out.frames[0] - else: - videos = out.video - end = time.time() - print("End warmup") - print(f"Warmup time: {end - begin:.3f}s") - - print("=======================================") - - iter_profiler = IterationProfiler() - if "callback_on_step_end" in inspect.signature(pipe).parameters: - kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end - elif "callback" in inspect.signature(pipe).parameters: - kwarg_inputs["callback"] = iter_profiler.callback_on_step_end - with torch.profiler.record_function("latte run"): - torch.manual_seed(args.seed) - begin = time.time() - out = pipe(**kwarg_inputs) - if _is_form_hf: - videos = out.frames[0] - else: - videos = out.video - end = time.time() - - print("=======================================") - print(f"Inference time: {end - begin:.3f}s") - iter_per_sec = iter_profiler.get_iter_per_sec() - if iter_per_sec is not None: - print(f"Iterations per second: {iter_per_sec:.3f}") - cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) - cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) - print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") - print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") - print("=======================================") - - with conditional_context( - args.profile, torch.profiler.record_function("latte export") - ): - if args.output_video is not None: - # export_to_video(output_frames[0], args.output_video, fps=args.fps) - try: - imageio.mimwrite( - args.output_video, videos[0], fps=8, quality=9 - ) # highest quality is 10, lowest is 0 - except: - print("Error when saving {}".format(args.prompt)) - else: - print("Please set `--output-video` to save the output video") - if prof: - prof.export_chrome_trace("latte_with_cache_prof.json") - - -if __name__ == "__main__": - main() From 7e3680889ba7a9fee11433816ecdcb792219fd1c Mon Sep 17 00:00:00 2001 From: strint Date: Thu, 25 Jul 2024 15:49:00 +0800 Subject: [PATCH 10/11] format --- benchmarks/text_to_video_latte.py | 25 ++++++++++++------- .../register_comfy/CrossAttentionPatch.py | 8 +++--- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index bdd4f5f11..90fef4e08 100644 --- a/benchmarks/text_to_video_latte.py +++ b/benchmarks/text_to_video_latte.py @@ -95,8 +95,9 @@ def parse_args(): 
type=int, default=ATTENTION_FP16_SCORE_ACCUM_MAX_M, ) - parser.add_argument("--profile", action="store_true") - parser.add_argument("--from_hf", action="store_true") + parser.add_argument("--profile_warmup", action="store_true") + parser.add_argument("--profile_run", action="store_true") + parser.add_argument("--from-hf", action="store_true") return parser.parse_args() @@ -285,9 +286,11 @@ def get_kwarg_inputs(): return kwarg_inputs kwarg_inputs = get_kwarg_inputs() - with conditional_context(args.profile, torch.profiler.profile()) as prof: + with conditional_context( + args.profile_warmup, torch.profiler.profile() + ) as prof_warmup: with conditional_context( - args.profile, torch.profiler.record_function("latte warmup") + args.profile_warmup, torch.profiler.record_function("latte warmup") ): if args.warmups > 0: print("=======================================") @@ -302,15 +305,19 @@ def get_kwarg_inputs(): end = time.time() print("End warmup") print(f"Warmup time: {end - begin:.3f}s") - print("=======================================") + if prof_warmup: + prof_warmup.export_chrome_trace("latte_prof_warmup.json") + with conditional_context(args.profile_run, torch.profiler.profile()) as prof_run: iter_profiler = IterationProfiler() if "callback_on_step_end" in inspect.signature(pipe).parameters: kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end elif "callback" in inspect.signature(pipe).parameters: kwarg_inputs["callback"] = iter_profiler.callback_on_step_end - with torch.profiler.record_function("latte run"): + with conditional_context( + args.profile_run, torch.profiler.record_function("latte run") + ): torch.manual_seed(args.seed) begin = time.time() out = pipe(**kwarg_inputs) @@ -332,7 +339,7 @@ def get_kwarg_inputs(): print("=======================================") with conditional_context( - args.profile, torch.profiler.record_function("latte export") + args.profile_run, torch.profiler.record_function("latte export") ): if args.output_video is not None: # export_to_video(output_frames[0], args.output_video, fps=args.fps) @@ -344,8 +351,8 @@ def get_kwarg_inputs(): print("Error when saving {}".format(args.prompt)) else: print("Please set `--output-video` to save the output video") - if prof: - prof.export_chrome_trace("latte_with_cache_prof.json") + if prof_run: + prof_run.export_chrome_trace("latte_prof_run.json") if __name__ == "__main__": diff --git a/onediff_comfy_nodes/modules/oneflow/infer_compiler_registry/register_comfy/CrossAttentionPatch.py b/onediff_comfy_nodes/modules/oneflow/infer_compiler_registry/register_comfy/CrossAttentionPatch.py index 749029178..cf9cf73e7 100644 --- a/onediff_comfy_nodes/modules/oneflow/infer_compiler_registry/register_comfy/CrossAttentionPatch.py +++ b/onediff_comfy_nodes/modules/oneflow/infer_compiler_registry/register_comfy/CrossAttentionPatch.py @@ -26,7 +26,7 @@ def tensor_to_size(source, dest_size): return source -def get_weight_subidxs(weight,ad_params,sub_idxs): +def get_weight_subidxs(weight, ad_params, sub_idxs): return weight[ad_params[sub_idxs]] @@ -167,7 +167,7 @@ def ipadapter_attention( if ad_params is not None and ad_params["sub_idxs"] is not None: if isinstance(weight, torch.Tensor) and weight.dim() != 0: weight = tensor_to_size(weight, ad_params["full_length"]) - weight = get_weight_subidxs(weight,ad_params,"sub_idxs") + weight = get_weight_subidxs(weight, ad_params, "sub_idxs") # if torch.all(weight == 0): # return 0 weight = weight.repeat( @@ -178,8 +178,8 @@ def ipadapter_attention( # if image length 
matches or exceeds full_length get sub_idx images if cond.shape[0] >= ad_params["full_length"]: - cond = get_weight_subidxs(cond,ad_params,"sub_idxs") - uncond = get_weight_subidxs(uncond,ad_params,"sub_idxs") + cond = get_weight_subidxs(cond, ad_params, "sub_idxs") + uncond = get_weight_subidxs(uncond, ad_params, "sub_idxs") # otherwise get sub_idxs images else: cond = tensor_to_size(cond, ad_params["full_length"]) From 5fb30e409b06806c01ceee2d9a0d552e73fc6bdf Mon Sep 17 00:00:00 2001 From: strint Date: Sun, 4 Aug 2024 23:41:51 +0800 Subject: [PATCH 11/11] print perf --- benchmarks/text_to_video_latte.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index 90fef4e08..3ea16d04a 100644 --- a/benchmarks/text_to_video_latte.py +++ b/benchmarks/text_to_video_latte.py @@ -352,6 +352,7 @@ def get_kwarg_inputs(): else: print("Please set `--output-video` to save the output video") if prof_run: + print(prof_run.key_averages().table(sort_by="cuda_time_total", row_limit=100)) prof_run.export_chrome_trace("latte_prof_run.json")
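For reference, a possible way to exercise the profiling path added in these patches is sketched below. The flag names (`--profile_run`, `--profile_warmup`) and the trace file name come from the diffs above; the model, step count, and other arguments simply mirror the README example, so treat the exact invocation as an assumption rather than a documented command.

```bash
# Hypothetical invocation: the README run command plus the --profile_run flag
# introduced in PATCH 10. When set, the script wraps the run in
# torch.profiler.profile(), prints key_averages() sorted by CUDA time,
# and exports a Chrome trace to latte_prof_run.json.
python3 ./benchmarks/text_to_video_latte.py \
  --model maxin-cn/Latte-1 \
  --steps 50 \
  --compiler nexfort \
  --output-video ./latte_compile.mp4 \
  --profile_run \
  --prompt "An epic tornado attacking above aglowing city at night."
```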