From 07abc762164f0543f04c625d58b3bb989449578b Mon Sep 17 00:00:00 2001 From: strint Date: Wed, 17 Jul 2024 21:00:31 +0800 Subject: [PATCH 01/11] fix max mem used --- benchmarks/text_to_video_latte.py | 13 +++++----- .../examples/latte/README.md | 26 +++++++++---------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index 629cdccc0..463d452d1 100644 --- a/benchmarks/text_to_video_latte.py +++ b/benchmarks/text_to_video_latte.py @@ -38,8 +38,7 @@ from PIL import Image, ImageDraw import torch -import oneflow as flow -from onediffx import compile_pipe, OneflowCompileOptions +from onediffx import compile_pipe from diffusers.utils import load_image, export_to_video from diffusers.schedulers import DDIMScheduler from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder @@ -250,14 +249,16 @@ def get_kwarg_inputs(): videos = pipe(**kwarg_inputs).video end = time.time() + print("=======================================") print(f"Inference time: {end - begin:.3f}s") iter_per_sec = iter_profiler.get_iter_per_sec() if iter_per_sec is not None: print(f"Iterations per second: {iter_per_sec:.3f}") - cuda_mem_after_used = flow._oneflow_internal.GetCUDAMemoryUsed() - host_mem_after_used = flow._oneflow_internal.GetCPUMemoryUsed() - print(f"CUDA Mem after: {cuda_mem_after_used / 1024:.3f}GiB") - print(f"Host Mem after: {host_mem_after_used / 1024:.3f}GiB") + cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) + cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) + print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") + print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") + print("=======================================") if args.output_video is not None: # export_to_video(output_frames[0], args.output_video, fps=args.fps) diff --git a/onediff_diffusers_extensions/examples/latte/README.md b/onediff_diffusers_extensions/examples/latte/README.md index bbce3929c..6e203b2db 100644 --- a/onediff_diffusers_extensions/examples/latte/README.md +++ b/onediff_diffusers_extensions/examples/latte/README.md @@ -40,7 +40,7 @@ python3 ./benchmarks/text_to_video_latte.py \ --model maxin-cn/Latte-1 \ --steps 50 \ --compiler none \ -----output-video ./latte.mp4 \ +--output-video ./latte.mp4 \ --prompt "An epic tornado attacking above aglowing city at night." ``` @@ -50,7 +50,7 @@ python3 ./benchmarks/text_to_video_latte.py \ --model maxin-cn/Latte-1 \ --steps 50 \ --compiler nexfort \ -----output-video ./latte_compile.mp4 \ +--output-video ./latte_compile.mp4 \ --prompt "An epic tornado attacking above aglowing city at night." 
``` @@ -60,17 +60,17 @@ python3 ./benchmarks/text_to_video_latte.py \ #### On A100 | Metric | NVIDIA A100-PCIE-40GB (512 * 512) | -| ------------------------------------------------ | ----------------------------------- | -| Data update date(yyyy-mm-dd) | 2024-06-19 | -| PyTorch iteration speed | 1.60it/s | -| OneDiff iteration speed | 2.27it/s(+41.9%) | -| PyTorch E2E time | 32.618s | -| OneDiff E2E time | 22.601s(-30.7%) | -| PyTorch Max Mem Used | 28.208GiB | -| OneDiff Max Mem Used | 24.753GiB | -| PyTorch Warmup with Run time | 33.291s | -| OneDiff Warmup with Compilation time1 | 572.877s | -| OneDiff Warmup with Cache time | 148.068s | +| ------------------------------------------------ | --------------------------------- | +| Data update date(yyyy-mm-dd) | 2024-06-19 | +| PyTorch iteration speed | 1.60 it/s | +| OneDiff iteration speed | 2.27 it/s(+41.9%) | +| PyTorch E2E time | 32.618 s | +| OneDiff E2E time | 22.601 s(-30.7%) | +| PyTorch Max Mem Used | 19.9 GiB | +| OneDiff Max Mem Used | 19.9 GiB | +| PyTorch Warmup with Run time | 33.291 s | +| OneDiff Warmup with Compilation time1 | 572.877 s | +| OneDiff Warmup with Cache time | 148.068 s | 1 OneDiff Warmup with Compilation time is tested on Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz. Note this is just for reference, and it varies a lot on different CPU. From 8f658519b23df605d0fc66432b461a4db4bdc1da Mon Sep 17 00:00:00 2001 From: strint Date: Wed, 17 Jul 2024 21:24:47 +0800 Subject: [PATCH 02/11] fix --- benchmarks/text_to_video_latte.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index 463d452d1..6f7ea6328 100644 --- a/benchmarks/text_to_video_latte.py +++ b/benchmarks/text_to_video_latte.py @@ -39,7 +39,6 @@ import torch from onediffx import compile_pipe -from diffusers.utils import load_image, export_to_video from diffusers.schedulers import DDIMScheduler from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder from transformers import T5EncoderModel, T5Tokenizer @@ -267,7 +266,7 @@ def get_kwarg_inputs(): args.output_video, videos[0], fps=8, quality=9 ) # highest quality is 10, lowest is 0 except: - print("Error when saving {}".format(prompt)) + print("Error when saving {}".format(args.prompt)) else: print("Please set `--output-video` to save the output video") From 6245a6d13d0b322ad9f912bf99b6724d91cd48f8 Mon Sep 17 00:00:00 2001 From: strint Date: Wed, 17 Jul 2024 21:25:17 +0800 Subject: [PATCH 03/11] fix format --- onediff_diffusers_extensions/examples/latte/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onediff_diffusers_extensions/examples/latte/README.md b/onediff_diffusers_extensions/examples/latte/README.md index 6e203b2db..f1c46d5ef 100644 --- a/onediff_diffusers_extensions/examples/latte/README.md +++ b/onediff_diffusers_extensions/examples/latte/README.md @@ -67,7 +67,7 @@ python3 ./benchmarks/text_to_video_latte.py \ | PyTorch E2E time | 32.618 s | | OneDiff E2E time | 22.601 s(-30.7%) | | PyTorch Max Mem Used | 19.9 GiB | -| OneDiff Max Mem Used | 19.9 GiB | +| OneDiff Max Mem Used | 19.9 GiB | | PyTorch Warmup with Run time | 33.291 s | | OneDiff Warmup with Compilation time1 | 572.877 s | | OneDiff Warmup with Cache time | 148.068 s | From a3210c6bf67a09c39173714130d9d40f9921aada Mon Sep 17 00:00:00 2001 From: strint Date: Thu, 18 Jul 2024 12:18:16 +0800 Subject: [PATCH 04/11] prof latte --- benchmarks/text_to_video_latte.py | 87 ++++++++++++++++--------------- 1 file 
changed, 46 insertions(+), 41 deletions(-) diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index 6f7ea6328..7e179d510 100644 --- a/benchmarks/text_to_video_latte.py +++ b/benchmarks/text_to_video_latte.py @@ -225,50 +225,55 @@ def get_kwarg_inputs(): ) return kwarg_inputs - if args.warmups > 0: - print("=======================================") - print("Begin warmup") - begin = time.time() - for _ in range(args.warmups): - pipe(**get_kwarg_inputs()).video - end = time.time() - print("End warmup") - print(f"Warmup time: {end - begin:.3f}s") + with torch.profiler.profile() as prof: + with torch.profiler.record_function("latte warmup"): + if args.warmups > 0: + print("=======================================") + print("Begin warmup") + begin = time.time() + for _ in range(args.warmups): + pipe(**get_kwarg_inputs()).video + end = time.time() + print("End warmup") + print(f"Warmup time: {end - begin:.3f}s") + + print("=======================================") + + kwarg_inputs = get_kwarg_inputs() + iter_profiler = IterationProfiler() + if "callback_on_step_end" in inspect.signature(pipe).parameters: + kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end + elif "callback" in inspect.signature(pipe).parameters: + kwarg_inputs["callback"] = iter_profiler.callback_on_step_end + with torch.profiler.record_function("latte run"): + torch.manual_seed(args.seed) + begin = time.time() + videos = pipe(**kwarg_inputs).video + end = time.time() print("=======================================") + print(f"Inference time: {end - begin:.3f}s") + iter_per_sec = iter_profiler.get_iter_per_sec() + if iter_per_sec is not None: + print(f"Iterations per second: {iter_per_sec:.3f}") + cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) + cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) + print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") + print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") + print("=======================================") - kwarg_inputs = get_kwarg_inputs() - iter_profiler = IterationProfiler() - if "callback_on_step_end" in inspect.signature(pipe).parameters: - kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end - elif "callback" in inspect.signature(pipe).parameters: - kwarg_inputs["callback"] = iter_profiler.callback_on_step_end - torch.manual_seed(args.seed) - begin = time.time() - videos = pipe(**kwarg_inputs).video - end = time.time() - - print("=======================================") - print(f"Inference time: {end - begin:.3f}s") - iter_per_sec = iter_profiler.get_iter_per_sec() - if iter_per_sec is not None: - print(f"Iterations per second: {iter_per_sec:.3f}") - cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) - cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) - print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") - print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") - print("=======================================") - - if args.output_video is not None: - # export_to_video(output_frames[0], args.output_video, fps=args.fps) - try: - imageio.mimwrite( - args.output_video, videos[0], fps=8, quality=9 - ) # highest quality is 10, lowest is 0 - except: - print("Error when saving {}".format(args.prompt)) - else: - print("Please set `--output-video` to save the output video") + with torch.profiler.record_function("latte export"): + if args.output_video is not None: + # export_to_video(output_frames[0], 
args.output_video, fps=args.fps) + try: + imageio.mimwrite( + args.output_video, videos[0], fps=8, quality=9 + ) # highest quality is 10, lowest is 0 + except: + print("Error when saving {}".format(args.prompt)) + else: + print("Please set `--output-video` to save the output video") + prof.export_chrome_trace("latte_with_cache_prof.json") if __name__ == "__main__": From 5872b83eb2cec45a6fd61d4bd0588b08333e0f6f Mon Sep 17 00:00:00 2001 From: strint Date: Thu, 18 Jul 2024 19:19:14 +0800 Subject: [PATCH 05/11] add profile for latte --- benchmarks/text_to_video_latte.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index 7e179d510..e927e486d 100644 --- a/benchmarks/text_to_video_latte.py +++ b/benchmarks/text_to_video_latte.py @@ -50,8 +50,6 @@ def parse_args(): parser.add_argument("--model", type=str, default=MODEL) parser.add_argument("--ckpt", type=str, default=CKPT) parser.add_argument("--prompt", type=str, default=PROMPT) - parser.add_argument("--save_graph", action="store_true") - parser.add_argument("--load_graph", action="store_true") parser.add_argument("--variant", type=str, default=VARIANT) parser.add_argument("--custom-pipeline", type=str, default=CUSTOM_PIPELINE) # parser.add_argument("--sample-method", type=str, default=SAMPLE_METHOD) @@ -94,6 +92,7 @@ def parse_args(): type=int, default=ATTENTION_FP16_SCORE_ACCUM_MAX_M, ) + parser.add_argument("--profile", action="store_true") return parser.parse_args() @@ -122,6 +121,15 @@ def callback_on_step_end(self, pipe, i, t, callback_kwargs={}): self.num_iterations += 1 return callback_kwargs +from contextlib import contextmanager + +@contextmanager +def conditional_context(enabled, context_manager): + if enabled: + with context_manager as cm: + yield cm + else: + return None def main(): args = parse_args() @@ -225,8 +233,8 @@ def get_kwarg_inputs(): ) return kwarg_inputs - with torch.profiler.profile() as prof: - with torch.profiler.record_function("latte warmup"): + with conditional_context(args.profile, torch.profiler.profile()) as prof: + with conditional_context(args.profile, torch.profiler.record_function("latte warmup")): if args.warmups > 0: print("=======================================") print("Begin warmup") @@ -262,7 +270,7 @@ def get_kwarg_inputs(): print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") print("=======================================") - with torch.profiler.record_function("latte export"): + with conditional_context(args.profile, torch.profiler.record_function("latte export")): if args.output_video is not None: # export_to_video(output_frames[0], args.output_video, fps=args.fps) try: @@ -273,7 +281,8 @@ def get_kwarg_inputs(): print("Error when saving {}".format(args.prompt)) else: print("Please set `--output-video` to save the output video") - prof.export_chrome_trace("latte_with_cache_prof.json") + if prof: + prof.export_chrome_trace("latte_with_cache_prof.json") if __name__ == "__main__": From 9f1c44c55e553590a48941852f409e718ff77c1a Mon Sep 17 00:00:00 2001 From: strint Date: Tue, 23 Jul 2024 16:56:53 +0800 Subject: [PATCH 06/11] add diffusers latte pipeline --- benchmarks/text_to_video_latte.py | 77 ++++++++++++++++--- .../examples/latte/README.md | 10 +++ 2 files changed, 75 insertions(+), 12 deletions(-) diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index 5211edaa3..a7a3786e5 100644 --- a/benchmarks/text_to_video_latte.py +++ 
b/benchmarks/text_to_video_latte.py @@ -96,6 +96,7 @@ def parse_args(): default=ATTENTION_FP16_SCORE_ACCUM_MAX_M, ) parser.add_argument("--profile", action="store_true") + parser.add_argument("--from_hf", action="store_true") return parser.parse_args() @@ -137,19 +138,42 @@ def conditional_context(enabled, context_manager): yield None -def main(): - args = parse_args() +_is_form_hf = False - if os.path.exists(args.model): - model_path = args.model + +def get_pipeline(args, model_path, device): + global _is_form_hf + if args.from_hf: + # Has error for now + # File "python3.10/site-packages/diffusers/schedulers/scheduling_ddim.py", line 413, in step + # pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + # RuntimeError: The size of tensor a (4) must match the size of tensor b (8) at non-singleton dimension 1 + print("get pipeline from diffusers") + _is_form_hf = True + return get_pipeline_from_hf(args, model_path, device) else: - from huggingface_hub import snapshot_download + print("get pipeline from source") + _is_form_hf = False + return get_pipeline_from_source(args, model_path, device) - model_path = snapshot_download(repo_id=args.model) - torch.set_grad_enabled(False) - device = "cuda" if torch.cuda.is_available() else "cpu" +def get_pipeline_from_hf(args, model_path, device): + # Get pipeline from diffusers + # diffusers version >= 0.30 + from diffusers import LattePipeline + + pipe = LattePipeline.from_pretrained(args.model, torch_dtype=torch.float16).to( + device + ) + + # Convert to channels_last memory format + pipe.transformer.to(memory_format=torch.channels_last) + pipe.vae.to(memory_format=torch.channels_last) + return pipe + +def get_pipeline_from_source(args, model_path, device): + # Get pipeline from https://github.com/siliconflow/dit_latte/ from models.latte_t2v import LatteT2V from sample.pipeline_latte import LattePipeline @@ -193,6 +217,24 @@ def main(): transformer=transformer_model, ).to(device) + return pipe + + +def main(): + args = parse_args() + + if os.path.exists(args.model): + model_path = args.model + else: + from huggingface_hub import snapshot_download + + model_path = snapshot_download(repo_id=args.model) + + torch.set_grad_enabled(False) + device = "cuda" if torch.cuda.is_available() else "cpu" + + pipe = get_pipeline(args, model_path, device) + if args.compiler == "none": pass elif args.compiler == "nexfort": @@ -230,15 +272,19 @@ def get_kwarg_inputs(): enable_temporal_attentions=args.enable_temporal_attentions, num_images_per_prompt=1, mask_feature=True, - enable_vae_temporal_decoder=args.enable_vae_temporal_decoder, **( dict() if args.extra_call_kwargs is None else json.loads(args.extra_call_kwargs) ), ) + if not _is_form_hf: + kwarg_inputs[ + "enable_vae_temporal_decoder" + ] = args.enable_vae_temporal_decoder return kwarg_inputs + kwarg_inputs = get_kwarg_inputs() with conditional_context(args.profile, torch.profiler.profile()) as prof: with conditional_context( args.profile, torch.profiler.record_function("latte warmup") @@ -248,14 +294,17 @@ def get_kwarg_inputs(): print("Begin warmup") begin = time.time() for _ in range(args.warmups): - pipe(**get_kwarg_inputs()).video + out = pipe(**kwarg_inputs) + if _is_form_hf: + videos = out.frames[0] + else: + videos = out.video end = time.time() print("End warmup") print(f"Warmup time: {end - begin:.3f}s") print("=======================================") - kwarg_inputs = get_kwarg_inputs() iter_profiler = IterationProfiler() if "callback_on_step_end" in 
inspect.signature(pipe).parameters: kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end @@ -264,7 +313,11 @@ def get_kwarg_inputs(): with torch.profiler.record_function("latte run"): torch.manual_seed(args.seed) begin = time.time() - videos = pipe(**kwarg_inputs).video + out = pipe(**kwarg_inputs) + if _is_form_hf: + videos = out.frames[0] + else: + videos = out.video end = time.time() print("=======================================") diff --git a/onediff_diffusers_extensions/examples/latte/README.md b/onediff_diffusers_extensions/examples/latte/README.md index 1353cd844..49eff3e66 100644 --- a/onediff_diffusers_extensions/examples/latte/README.md +++ b/onediff_diffusers_extensions/examples/latte/README.md @@ -13,6 +13,16 @@ ## Environment setup ### Set up Latte + +#### From HF diffusers +Note: HF diffusers has bug on LattePipeline on 20240723 +Reference: https://huggingface.co/docs/diffusers/main/en/api/pipelines/latte +```bash +# make sure LattePipeline avaliable in HF diffusers(diffusers version >= 0.30) +pip install git+https://github.com/huggingface/diffusers.git@main +``` + +#### (Optional)From latte project HF model: https://huggingface.co/maxin-cn/Latte-1 ```bash git clone -b run https://github.com/siliconflow/dit_latte/ From 2102e128eeafd527d510b5e4696c278eb09d387f Mon Sep 17 00:00:00 2001 From: strint Date: Tue, 23 Jul 2024 22:02:42 +0800 Subject: [PATCH 07/11] add new test data ad base --- .../examples/latte/README.md | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/onediff_diffusers_extensions/examples/latte/README.md b/onediff_diffusers_extensions/examples/latte/README.md index 49eff3e66..02887a830 100644 --- a/onediff_diffusers_extensions/examples/latte/README.md +++ b/onediff_diffusers_extensions/examples/latte/README.md @@ -69,18 +69,18 @@ python3 ./benchmarks/text_to_video_latte.py \ ### Metric #### On A100 -| Metric | NVIDIA A100-PCIE-40GB (512 * 512) | -| ------------------------------------------------ | --------------------------------- | -| Data update date(yyyy-mm-dd) | 2024-06-19 | -| PyTorch iteration speed | 1.60 it/s | -| OneDiff iteration speed | 2.27 it/s(+41.9%) | -| PyTorch E2E time | 32.618 s | -| OneDiff E2E time | 22.601 s(-30.7%) | -| PyTorch Max Mem Used | 19.9 GiB | -| OneDiff Max Mem Used | 19.9 GiB | -| PyTorch Warmup with Run time | 33.291 s | -| OneDiff Warmup with Compilation time1 | 572.877 s | -| OneDiff Warmup with Cache time | 148.068 s | +| Metric | NVIDIA A100-PCIE-40GB (512 * 512) | NVIDIA A100-PCIE-40GB(512 * 512) by strint on ubuntu22 | +| ------------------------------------------------ | --------------------------------- | ------------------------------------------------------- | +| Data update date(yyyy-mm-dd) | 2024-06-19 | 2024-07-23 | +| PyTorch iteration speed | 1.60 it/s | 1.6 it/s | +| OneDiff iteration speed | 2.27 it/s(+41.9%) | 1.723 it/s | +| PyTorch E2E time | 32.618 s | 32.497 s | +| OneDiff E2E time | 22.601 s(-30.7%) | 29.64 s | +| PyTorch Max Mem Used | 19.9 GiB | 19.92 GiB | +| OneDiff Max Mem Used | 19.9 GiB | 19.9 GiB | +| PyTorch Warmup with Run time | 33.291 s | 33.129 s | +| OneDiff Warmup with Compilation time1 | 572.877 s | 737.6 s | +| OneDiff Warmup with Cache time | 148.068 s | 159 s | 1 OneDiff Warmup with Compilation time is tested on Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz. Note this is just for reference, and it varies a lot on different CPU. 
From f83f3873e42c86ca93eab29a6cb2e951112a3399 Mon Sep 17 00:00:00 2001 From: strint Date: Wed, 24 Jul 2024 11:03:55 +0800 Subject: [PATCH 08/11] rename --- benchmarks/text_to_video_latte.py | 179 ++++------- benchmarks/text_to_video_latte_profile.py | 352 ++++++++++++++++++++++ 2 files changed, 405 insertions(+), 126 deletions(-) create mode 100644 benchmarks/text_to_video_latte_profile.py diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index a7a3786e5..fdf538d58 100644 --- a/benchmarks/text_to_video_latte.py +++ b/benchmarks/text_to_video_latte.py @@ -51,6 +51,8 @@ def parse_args(): parser.add_argument("--model", type=str, default=MODEL) parser.add_argument("--ckpt", type=str, default=CKPT) parser.add_argument("--prompt", type=str, default=PROMPT) + parser.add_argument("--save_graph", action="store_true") + parser.add_argument("--load_graph", action="store_true") parser.add_argument("--variant", type=str, default=VARIANT) parser.add_argument("--custom-pipeline", type=str, default=CUSTOM_PIPELINE) # parser.add_argument("--sample-method", type=str, default=SAMPLE_METHOD) @@ -95,8 +97,6 @@ def parse_args(): type=int, default=ATTENTION_FP16_SCORE_ACCUM_MAX_M, ) - parser.add_argument("--profile", action="store_true") - parser.add_argument("--from_hf", action="store_true") return parser.parse_args() @@ -126,54 +126,19 @@ def callback_on_step_end(self, pipe, i, t, callback_kwargs={}): return callback_kwargs -from contextlib import contextmanager - - -@contextmanager -def conditional_context(enabled, context_manager): - if enabled: - with context_manager as cm: - yield cm - else: - yield None - - -_is_form_hf = False - +def main(): + args = parse_args() -def get_pipeline(args, model_path, device): - global _is_form_hf - if args.from_hf: - # Has error for now - # File "python3.10/site-packages/diffusers/schedulers/scheduling_ddim.py", line 413, in step - # pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - # RuntimeError: The size of tensor a (4) must match the size of tensor b (8) at non-singleton dimension 1 - print("get pipeline from diffusers") - _is_form_hf = True - return get_pipeline_from_hf(args, model_path, device) + if os.path.exists(args.model): + model_path = args.model else: - print("get pipeline from source") - _is_form_hf = False - return get_pipeline_from_source(args, model_path, device) - - -def get_pipeline_from_hf(args, model_path, device): - # Get pipeline from diffusers - # diffusers version >= 0.30 - from diffusers import LattePipeline - - pipe = LattePipeline.from_pretrained(args.model, torch_dtype=torch.float16).to( - device - ) + from huggingface_hub import snapshot_download - # Convert to channels_last memory format - pipe.transformer.to(memory_format=torch.channels_last) - pipe.vae.to(memory_format=torch.channels_last) - return pipe + model_path = snapshot_download(repo_id=args.model) + torch.set_grad_enabled(False) + device = "cuda" if torch.cuda.is_available() else "cpu" -def get_pipeline_from_source(args, model_path, device): - # Get pipeline from https://github.com/siliconflow/dit_latte/ from models.latte_t2v import LatteT2V from sample.pipeline_latte import LattePipeline @@ -217,24 +182,6 @@ def get_pipeline_from_source(args, model_path, device): transformer=transformer_model, ).to(device) - return pipe - - -def main(): - args = parse_args() - - if os.path.exists(args.model): - model_path = args.model - else: - from huggingface_hub import snapshot_download - - model_path = 
snapshot_download(repo_id=args.model) - - torch.set_grad_enabled(False) - device = "cuda" if torch.cuda.is_available() else "cpu" - - pipe = get_pipeline(args, model_path, device) - if args.compiler == "none": pass elif args.compiler == "nexfort": @@ -272,80 +219,60 @@ def get_kwarg_inputs(): enable_temporal_attentions=args.enable_temporal_attentions, num_images_per_prompt=1, mask_feature=True, + enable_vae_temporal_decoder=args.enable_vae_temporal_decoder, **( dict() if args.extra_call_kwargs is None else json.loads(args.extra_call_kwargs) ), ) - if not _is_form_hf: - kwarg_inputs[ - "enable_vae_temporal_decoder" - ] = args.enable_vae_temporal_decoder return kwarg_inputs - kwarg_inputs = get_kwarg_inputs() - with conditional_context(args.profile, torch.profiler.profile()) as prof: - with conditional_context( - args.profile, torch.profiler.record_function("latte warmup") - ): - if args.warmups > 0: - print("=======================================") - print("Begin warmup") - begin = time.time() - for _ in range(args.warmups): - out = pipe(**kwarg_inputs) - if _is_form_hf: - videos = out.frames[0] - else: - videos = out.video - end = time.time() - print("End warmup") - print(f"Warmup time: {end - begin:.3f}s") - - print("=======================================") - - iter_profiler = IterationProfiler() - if "callback_on_step_end" in inspect.signature(pipe).parameters: - kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end - elif "callback" in inspect.signature(pipe).parameters: - kwarg_inputs["callback"] = iter_profiler.callback_on_step_end - with torch.profiler.record_function("latte run"): - torch.manual_seed(args.seed) - begin = time.time() - out = pipe(**kwarg_inputs) - if _is_form_hf: - videos = out.frames[0] - else: - videos = out.video - end = time.time() - + if args.warmups > 0: print("=======================================") - print(f"Inference time: {end - begin:.3f}s") - iter_per_sec = iter_profiler.get_iter_per_sec() - if iter_per_sec is not None: - print(f"Iterations per second: {iter_per_sec:.3f}") - cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) - cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) - print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") - print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") + print("Begin warmup") + begin = time.time() + for _ in range(args.warmups): + pipe(**get_kwarg_inputs()).video + end = time.time() + print("End warmup") + print(f"Warmup time: {end - begin:.3f}s") + print("=======================================") - with conditional_context( - args.profile, torch.profiler.record_function("latte export") - ): - if args.output_video is not None: - # export_to_video(output_frames[0], args.output_video, fps=args.fps) - try: - imageio.mimwrite( - args.output_video, videos[0], fps=8, quality=9 - ) # highest quality is 10, lowest is 0 - except: - print("Error when saving {}".format(args.prompt)) - else: - print("Please set `--output-video` to save the output video") - if prof: - prof.export_chrome_trace("latte_with_cache_prof.json") + kwarg_inputs = get_kwarg_inputs() + iter_profiler = IterationProfiler() + if "callback_on_step_end" in inspect.signature(pipe).parameters: + kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end + elif "callback" in inspect.signature(pipe).parameters: + kwarg_inputs["callback"] = iter_profiler.callback_on_step_end + torch.manual_seed(args.seed) + begin = time.time() + videos = pipe(**kwarg_inputs).video + end = 
time.time() + + print("=======================================") + print(f"Inference time: {end - begin:.3f}s") + iter_per_sec = iter_profiler.get_iter_per_sec() + if iter_per_sec is not None: + print(f"Iterations per second: {iter_per_sec:.3f}") + cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) + cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) + print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") + print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") + print("=======================================") + + if args.output_video is not None: + # export_to_video(output_frames[0], args.output_video, fps=args.fps) + try: + imageio.mimwrite( + args.output_video, videos[0], fps=8, quality=9 + ) # highest quality is 10, lowest is 0 + except: + print("Error when saving {}".format(args.prompt)) + + else: + print("Please set `--output-video` to save the output video") if __name__ == "__main__": diff --git a/benchmarks/text_to_video_latte_profile.py b/benchmarks/text_to_video_latte_profile.py new file mode 100644 index 000000000..bdd4f5f11 --- /dev/null +++ b/benchmarks/text_to_video_latte_profile.py @@ -0,0 +1,352 @@ +MODEL = "maxin-cn/Latte-1" +CKPT = "t2v_v20240523.pt" +VARIANT = None +CUSTOM_PIPELINE = None +# SAMPLE_METHOD = "DDIM" +BETA_START = 0.0001 +BETA_END = 0.02 +BREA_SCHEDULE = "linear" +VARIANCE_TYPE = "learned_range" +STEPS = 50 +SEED = 25 +WARMUPS = 1 +BATCH = 1 +HEIGHT = 512 +WIDTH = 512 +VIDEO_LENGTH = 16 +FPS = 8 +GUIDANCE_SCALE = 7.5 +ENABLE_TEMPORAL_ATTENTIONS = "true" +ENABLE_VAE_TEMPORAL_DECODER = "true" +OUTPUT_VIDEO = "output.mp4" + +PROMPT = "An epic tornado attacking above aglowing city at night." + +EXTRA_CALL_KWARGS = None +ATTENTION_FP16_SCORE_ACCUM_MAX_M = 0 + +COMPILER_CONFIG = None + + +import argparse +import importlib +import inspect +import json +import os +import random +import time + +import imageio + +import torch +from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder +from diffusers.schedulers import DDIMScheduler +from onediffx import compile_pipe +from PIL import Image, ImageDraw +from transformers import T5EncoderModel, T5Tokenizer + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, default=MODEL) + parser.add_argument("--ckpt", type=str, default=CKPT) + parser.add_argument("--prompt", type=str, default=PROMPT) + parser.add_argument("--variant", type=str, default=VARIANT) + parser.add_argument("--custom-pipeline", type=str, default=CUSTOM_PIPELINE) + # parser.add_argument("--sample-method", type=str, default=SAMPLE_METHOD) + parser.add_argument("--beta-start", type=float, default=BETA_START) + parser.add_argument("--beta-end", type=float, default=BETA_END) + parser.add_argument("--beta-schedule", type=str, default=BREA_SCHEDULE) + parser.add_argument( + "--enable_temporal_attentions", + type=(lambda x: str(x).lower() in ["true", "1", "yes"]), + default=ENABLE_TEMPORAL_ATTENTIONS, + ) + parser.add_argument( + "--enable_vae_temporal_decoder", + type=(lambda x: str(x).lower() in ["true", "1", "yes"]), + default=ENABLE_VAE_TEMPORAL_DECODER, + ) + parser.add_argument("--guidance-scale", type=float, default=GUIDANCE_SCALE) + parser.add_argument("--variance-type", type=str, default=VARIANCE_TYPE) + parser.add_argument("--steps", type=int, default=STEPS) + parser.add_argument("--seed", type=int, default=SEED) + parser.add_argument("--warmups", type=int, default=WARMUPS) + parser.add_argument("--batch", type=int, default=BATCH) + 
parser.add_argument("--height", type=int, default=HEIGHT) + parser.add_argument("--width", type=int, default=WIDTH) + parser.add_argument("--video-length", type=int, default=VIDEO_LENGTH) + parser.add_argument("--fps", type=int, default=FPS) + parser.add_argument("--extra-call-kwargs", type=str, default=EXTRA_CALL_KWARGS) + parser.add_argument("--output-video", type=str, default=OUTPUT_VIDEO) + parser.add_argument( + "--compiler", + type=str, + default="nexfort", + choices=["none", "nexfort", "compile"], + ) + parser.add_argument( + "--compiler-config", + type=str, + default=COMPILER_CONFIG, + ) + parser.add_argument( + "--attention-fp16-score-accum-max-m", + type=int, + default=ATTENTION_FP16_SCORE_ACCUM_MAX_M, + ) + parser.add_argument("--profile", action="store_true") + parser.add_argument("--from_hf", action="store_true") + return parser.parse_args() + + +class IterationProfiler: + def __init__(self): + self.begin = None + self.end = None + self.num_iterations = 0 + + def get_iter_per_sec(self): + if self.begin is None or self.end is None: + return None + self.end.synchronize() + dur = self.begin.elapsed_time(self.end) + return self.num_iterations / dur * 1000.0 + + def callback_on_step_end(self, pipe, i, t, callback_kwargs={}): + if self.begin is None: + event = torch.cuda.Event(enable_timing=True) + event.record() + self.begin = event + else: + event = torch.cuda.Event(enable_timing=True) + event.record() + self.end = event + self.num_iterations += 1 + return callback_kwargs + + +from contextlib import contextmanager + + +@contextmanager +def conditional_context(enabled, context_manager): + if enabled: + with context_manager as cm: + yield cm + else: + yield None + + +_is_form_hf = False + + +def get_pipeline(args, model_path, device): + global _is_form_hf + if args.from_hf: + # Has error for now + # File "python3.10/site-packages/diffusers/schedulers/scheduling_ddim.py", line 413, in step + # pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + # RuntimeError: The size of tensor a (4) must match the size of tensor b (8) at non-singleton dimension 1 + print("get pipeline from diffusers") + _is_form_hf = True + return get_pipeline_from_hf(args, model_path, device) + else: + print("get pipeline from source") + _is_form_hf = False + return get_pipeline_from_source(args, model_path, device) + + +def get_pipeline_from_hf(args, model_path, device): + # Get pipeline from diffusers + # diffusers version >= 0.30 + from diffusers import LattePipeline + + pipe = LattePipeline.from_pretrained(args.model, torch_dtype=torch.float16).to( + device + ) + + # Convert to channels_last memory format + pipe.transformer.to(memory_format=torch.channels_last) + pipe.vae.to(memory_format=torch.channels_last) + return pipe + + +def get_pipeline_from_source(args, model_path, device): + # Get pipeline from https://github.com/siliconflow/dit_latte/ + from models.latte_t2v import LatteT2V + from sample.pipeline_latte import LattePipeline + + transformer_model = LatteT2V.from_pretrained( + model_path, subfolder="transformer", video_length=args.video_length + ).to(device, dtype=torch.float16) + + if args.enable_vae_temporal_decoder: + vae = AutoencoderKLTemporalDecoder.from_pretrained( + args.model, subfolder="vae_temporal_decoder", torch_dtype=torch.float16 + ).to(device) + else: + vae = AutoencoderKL.from_pretrained( + args.model, subfolder="vae", torch_dtype=torch.float16 + ).to(device) + tokenizer = T5Tokenizer.from_pretrained(args.model, subfolder="tokenizer") + 
text_encoder = T5EncoderModel.from_pretrained( + args.model, subfolder="text_encoder", torch_dtype=torch.float16 + ).to(device) + + # set eval mode + transformer_model.eval() + vae.eval() + text_encoder.eval() + + scheduler = DDIMScheduler.from_pretrained( + model_path, + subfolder="scheduler", + beta_start=args.beta_start, + beta_end=args.beta_end, + beta_schedule=args.beta_schedule, + variance_type=args.variance_type, + clip_sample=False, + ) + + pipe = LattePipeline( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + transformer=transformer_model, + ).to(device) + + return pipe + + +def main(): + args = parse_args() + + if os.path.exists(args.model): + model_path = args.model + else: + from huggingface_hub import snapshot_download + + model_path = snapshot_download(repo_id=args.model) + + torch.set_grad_enabled(False) + device = "cuda" if torch.cuda.is_available() else "cpu" + + pipe = get_pipeline(args, model_path, device) + + if args.compiler == "none": + pass + elif args.compiler == "nexfort": + print("Nexfort backend is now active...") + if args.compiler_config is not None: + # config with dict + options = json.loads(args.compiler_config) + else: + # config with string + options = '{"mode": "O2", \ + "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": false, \ + "triton.fuse_attention_allow_fp16_reduction": false}}' + pipe = compile_pipe( + pipe, backend="nexfort", options=options, fuse_qkv_projections=True + ) + elif args.compiler == "compile": + if hasattr(pipe, "unet"): + pipe.unet = torch.compile(pipe.unet) + if hasattr(pipe, "transformer"): + pipe.transformer = torch.compile(pipe.transformer) + if hasattr(pipe, "controlnet"): + pipe.controlnet = torch.compile(pipe.controlnet) + pipe.vae = torch.compile(pipe.vae) + else: + raise ValueError(f"Unknown compiler: {args.compiler}") + + def get_kwarg_inputs(): + kwarg_inputs = dict( + prompt=args.prompt, + video_length=args.video_length, + height=args.height, + width=args.width, + num_inference_steps=args.steps, + guidance_scale=args.guidance_scale, + enable_temporal_attentions=args.enable_temporal_attentions, + num_images_per_prompt=1, + mask_feature=True, + **( + dict() + if args.extra_call_kwargs is None + else json.loads(args.extra_call_kwargs) + ), + ) + if not _is_form_hf: + kwarg_inputs[ + "enable_vae_temporal_decoder" + ] = args.enable_vae_temporal_decoder + return kwarg_inputs + + kwarg_inputs = get_kwarg_inputs() + with conditional_context(args.profile, torch.profiler.profile()) as prof: + with conditional_context( + args.profile, torch.profiler.record_function("latte warmup") + ): + if args.warmups > 0: + print("=======================================") + print("Begin warmup") + begin = time.time() + for _ in range(args.warmups): + out = pipe(**kwarg_inputs) + if _is_form_hf: + videos = out.frames[0] + else: + videos = out.video + end = time.time() + print("End warmup") + print(f"Warmup time: {end - begin:.3f}s") + + print("=======================================") + + iter_profiler = IterationProfiler() + if "callback_on_step_end" in inspect.signature(pipe).parameters: + kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end + elif "callback" in inspect.signature(pipe).parameters: + kwarg_inputs["callback"] = iter_profiler.callback_on_step_end + with torch.profiler.record_function("latte run"): + torch.manual_seed(args.seed) + begin = time.time() + out = pipe(**kwarg_inputs) + if _is_form_hf: + videos = out.frames[0] + else: + videos = 
out.video + end = time.time() + + print("=======================================") + print(f"Inference time: {end - begin:.3f}s") + iter_per_sec = iter_profiler.get_iter_per_sec() + if iter_per_sec is not None: + print(f"Iterations per second: {iter_per_sec:.3f}") + cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) + cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) + print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") + print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") + print("=======================================") + + with conditional_context( + args.profile, torch.profiler.record_function("latte export") + ): + if args.output_video is not None: + # export_to_video(output_frames[0], args.output_video, fps=args.fps) + try: + imageio.mimwrite( + args.output_video, videos[0], fps=8, quality=9 + ) # highest quality is 10, lowest is 0 + except: + print("Error when saving {}".format(args.prompt)) + else: + print("Please set `--output-video` to save the output video") + if prof: + prof.export_chrome_trace("latte_with_cache_prof.json") + + +if __name__ == "__main__": + main() From 78df7d14d3ea76705cf8d9cc66af3e1d6bcb2edc Mon Sep 17 00:00:00 2001 From: strint Date: Wed, 24 Jul 2024 11:32:22 +0800 Subject: [PATCH 09/11] restore profile --- benchmarks/text_to_video_latte.py | 181 +++++++---- benchmarks/text_to_video_latte_profile.py | 352 ---------------------- 2 files changed, 127 insertions(+), 406 deletions(-) delete mode 100644 benchmarks/text_to_video_latte_profile.py diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index fdf538d58..bdd4f5f11 100644 --- a/benchmarks/text_to_video_latte.py +++ b/benchmarks/text_to_video_latte.py @@ -51,8 +51,6 @@ def parse_args(): parser.add_argument("--model", type=str, default=MODEL) parser.add_argument("--ckpt", type=str, default=CKPT) parser.add_argument("--prompt", type=str, default=PROMPT) - parser.add_argument("--save_graph", action="store_true") - parser.add_argument("--load_graph", action="store_true") parser.add_argument("--variant", type=str, default=VARIANT) parser.add_argument("--custom-pipeline", type=str, default=CUSTOM_PIPELINE) # parser.add_argument("--sample-method", type=str, default=SAMPLE_METHOD) @@ -97,6 +95,8 @@ def parse_args(): type=int, default=ATTENTION_FP16_SCORE_ACCUM_MAX_M, ) + parser.add_argument("--profile", action="store_true") + parser.add_argument("--from_hf", action="store_true") return parser.parse_args() @@ -126,19 +126,54 @@ def callback_on_step_end(self, pipe, i, t, callback_kwargs={}): return callback_kwargs -def main(): - args = parse_args() +from contextlib import contextmanager - if os.path.exists(args.model): - model_path = args.model + +@contextmanager +def conditional_context(enabled, context_manager): + if enabled: + with context_manager as cm: + yield cm else: - from huggingface_hub import snapshot_download + yield None - model_path = snapshot_download(repo_id=args.model) - torch.set_grad_enabled(False) - device = "cuda" if torch.cuda.is_available() else "cpu" +_is_form_hf = False + +def get_pipeline(args, model_path, device): + global _is_form_hf + if args.from_hf: + # Has error for now + # File "python3.10/site-packages/diffusers/schedulers/scheduling_ddim.py", line 413, in step + # pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + # RuntimeError: The size of tensor a (4) must match the size of tensor b (8) at non-singleton dimension 1 + print("get pipeline from 
diffusers") + _is_form_hf = True + return get_pipeline_from_hf(args, model_path, device) + else: + print("get pipeline from source") + _is_form_hf = False + return get_pipeline_from_source(args, model_path, device) + + +def get_pipeline_from_hf(args, model_path, device): + # Get pipeline from diffusers + # diffusers version >= 0.30 + from diffusers import LattePipeline + + pipe = LattePipeline.from_pretrained(args.model, torch_dtype=torch.float16).to( + device + ) + + # Convert to channels_last memory format + pipe.transformer.to(memory_format=torch.channels_last) + pipe.vae.to(memory_format=torch.channels_last) + return pipe + + +def get_pipeline_from_source(args, model_path, device): + # Get pipeline from https://github.com/siliconflow/dit_latte/ from models.latte_t2v import LatteT2V from sample.pipeline_latte import LattePipeline @@ -182,6 +217,24 @@ def main(): transformer=transformer_model, ).to(device) + return pipe + + +def main(): + args = parse_args() + + if os.path.exists(args.model): + model_path = args.model + else: + from huggingface_hub import snapshot_download + + model_path = snapshot_download(repo_id=args.model) + + torch.set_grad_enabled(False) + device = "cuda" if torch.cuda.is_available() else "cpu" + + pipe = get_pipeline(args, model_path, device) + if args.compiler == "none": pass elif args.compiler == "nexfort": @@ -191,7 +244,7 @@ def main(): options = json.loads(args.compiler_config) else: # config with string - options = '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision", \ + options = '{"mode": "O2", \ "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": false, \ "triton.fuse_attention_allow_fp16_reduction": false}}' pipe = compile_pipe( @@ -219,60 +272,80 @@ def get_kwarg_inputs(): enable_temporal_attentions=args.enable_temporal_attentions, num_images_per_prompt=1, mask_feature=True, - enable_vae_temporal_decoder=args.enable_vae_temporal_decoder, **( dict() if args.extra_call_kwargs is None else json.loads(args.extra_call_kwargs) ), ) + if not _is_form_hf: + kwarg_inputs[ + "enable_vae_temporal_decoder" + ] = args.enable_vae_temporal_decoder return kwarg_inputs - if args.warmups > 0: - print("=======================================") - print("Begin warmup") - begin = time.time() - for _ in range(args.warmups): - pipe(**get_kwarg_inputs()).video - end = time.time() - print("End warmup") - print(f"Warmup time: {end - begin:.3f}s") + kwarg_inputs = get_kwarg_inputs() + with conditional_context(args.profile, torch.profiler.profile()) as prof: + with conditional_context( + args.profile, torch.profiler.record_function("latte warmup") + ): + if args.warmups > 0: + print("=======================================") + print("Begin warmup") + begin = time.time() + for _ in range(args.warmups): + out = pipe(**kwarg_inputs) + if _is_form_hf: + videos = out.frames[0] + else: + videos = out.video + end = time.time() + print("End warmup") + print(f"Warmup time: {end - begin:.3f}s") + + print("=======================================") + + iter_profiler = IterationProfiler() + if "callback_on_step_end" in inspect.signature(pipe).parameters: + kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end + elif "callback" in inspect.signature(pipe).parameters: + kwarg_inputs["callback"] = iter_profiler.callback_on_step_end + with torch.profiler.record_function("latte run"): + torch.manual_seed(args.seed) + begin = time.time() + out = pipe(**kwarg_inputs) + if _is_form_hf: + videos = out.frames[0] + else: + videos = 
out.video + end = time.time() print("=======================================") + print(f"Inference time: {end - begin:.3f}s") + iter_per_sec = iter_profiler.get_iter_per_sec() + if iter_per_sec is not None: + print(f"Iterations per second: {iter_per_sec:.3f}") + cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) + cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) + print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") + print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") + print("=======================================") - kwarg_inputs = get_kwarg_inputs() - iter_profiler = IterationProfiler() - if "callback_on_step_end" in inspect.signature(pipe).parameters: - kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end - elif "callback" in inspect.signature(pipe).parameters: - kwarg_inputs["callback"] = iter_profiler.callback_on_step_end - torch.manual_seed(args.seed) - begin = time.time() - videos = pipe(**kwarg_inputs).video - end = time.time() - - print("=======================================") - print(f"Inference time: {end - begin:.3f}s") - iter_per_sec = iter_profiler.get_iter_per_sec() - if iter_per_sec is not None: - print(f"Iterations per second: {iter_per_sec:.3f}") - cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) - cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) - print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") - print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") - print("=======================================") - - if args.output_video is not None: - # export_to_video(output_frames[0], args.output_video, fps=args.fps) - try: - imageio.mimwrite( - args.output_video, videos[0], fps=8, quality=9 - ) # highest quality is 10, lowest is 0 - except: - print("Error when saving {}".format(args.prompt)) - - else: - print("Please set `--output-video` to save the output video") + with conditional_context( + args.profile, torch.profiler.record_function("latte export") + ): + if args.output_video is not None: + # export_to_video(output_frames[0], args.output_video, fps=args.fps) + try: + imageio.mimwrite( + args.output_video, videos[0], fps=8, quality=9 + ) # highest quality is 10, lowest is 0 + except: + print("Error when saving {}".format(args.prompt)) + else: + print("Please set `--output-video` to save the output video") + if prof: + prof.export_chrome_trace("latte_with_cache_prof.json") if __name__ == "__main__": diff --git a/benchmarks/text_to_video_latte_profile.py b/benchmarks/text_to_video_latte_profile.py deleted file mode 100644 index bdd4f5f11..000000000 --- a/benchmarks/text_to_video_latte_profile.py +++ /dev/null @@ -1,352 +0,0 @@ -MODEL = "maxin-cn/Latte-1" -CKPT = "t2v_v20240523.pt" -VARIANT = None -CUSTOM_PIPELINE = None -# SAMPLE_METHOD = "DDIM" -BETA_START = 0.0001 -BETA_END = 0.02 -BREA_SCHEDULE = "linear" -VARIANCE_TYPE = "learned_range" -STEPS = 50 -SEED = 25 -WARMUPS = 1 -BATCH = 1 -HEIGHT = 512 -WIDTH = 512 -VIDEO_LENGTH = 16 -FPS = 8 -GUIDANCE_SCALE = 7.5 -ENABLE_TEMPORAL_ATTENTIONS = "true" -ENABLE_VAE_TEMPORAL_DECODER = "true" -OUTPUT_VIDEO = "output.mp4" - -PROMPT = "An epic tornado attacking above aglowing city at night." 
- -EXTRA_CALL_KWARGS = None -ATTENTION_FP16_SCORE_ACCUM_MAX_M = 0 - -COMPILER_CONFIG = None - - -import argparse -import importlib -import inspect -import json -import os -import random -import time - -import imageio - -import torch -from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder -from diffusers.schedulers import DDIMScheduler -from onediffx import compile_pipe -from PIL import Image, ImageDraw -from transformers import T5EncoderModel, T5Tokenizer - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=str, default=MODEL) - parser.add_argument("--ckpt", type=str, default=CKPT) - parser.add_argument("--prompt", type=str, default=PROMPT) - parser.add_argument("--variant", type=str, default=VARIANT) - parser.add_argument("--custom-pipeline", type=str, default=CUSTOM_PIPELINE) - # parser.add_argument("--sample-method", type=str, default=SAMPLE_METHOD) - parser.add_argument("--beta-start", type=float, default=BETA_START) - parser.add_argument("--beta-end", type=float, default=BETA_END) - parser.add_argument("--beta-schedule", type=str, default=BREA_SCHEDULE) - parser.add_argument( - "--enable_temporal_attentions", - type=(lambda x: str(x).lower() in ["true", "1", "yes"]), - default=ENABLE_TEMPORAL_ATTENTIONS, - ) - parser.add_argument( - "--enable_vae_temporal_decoder", - type=(lambda x: str(x).lower() in ["true", "1", "yes"]), - default=ENABLE_VAE_TEMPORAL_DECODER, - ) - parser.add_argument("--guidance-scale", type=float, default=GUIDANCE_SCALE) - parser.add_argument("--variance-type", type=str, default=VARIANCE_TYPE) - parser.add_argument("--steps", type=int, default=STEPS) - parser.add_argument("--seed", type=int, default=SEED) - parser.add_argument("--warmups", type=int, default=WARMUPS) - parser.add_argument("--batch", type=int, default=BATCH) - parser.add_argument("--height", type=int, default=HEIGHT) - parser.add_argument("--width", type=int, default=WIDTH) - parser.add_argument("--video-length", type=int, default=VIDEO_LENGTH) - parser.add_argument("--fps", type=int, default=FPS) - parser.add_argument("--extra-call-kwargs", type=str, default=EXTRA_CALL_KWARGS) - parser.add_argument("--output-video", type=str, default=OUTPUT_VIDEO) - parser.add_argument( - "--compiler", - type=str, - default="nexfort", - choices=["none", "nexfort", "compile"], - ) - parser.add_argument( - "--compiler-config", - type=str, - default=COMPILER_CONFIG, - ) - parser.add_argument( - "--attention-fp16-score-accum-max-m", - type=int, - default=ATTENTION_FP16_SCORE_ACCUM_MAX_M, - ) - parser.add_argument("--profile", action="store_true") - parser.add_argument("--from_hf", action="store_true") - return parser.parse_args() - - -class IterationProfiler: - def __init__(self): - self.begin = None - self.end = None - self.num_iterations = 0 - - def get_iter_per_sec(self): - if self.begin is None or self.end is None: - return None - self.end.synchronize() - dur = self.begin.elapsed_time(self.end) - return self.num_iterations / dur * 1000.0 - - def callback_on_step_end(self, pipe, i, t, callback_kwargs={}): - if self.begin is None: - event = torch.cuda.Event(enable_timing=True) - event.record() - self.begin = event - else: - event = torch.cuda.Event(enable_timing=True) - event.record() - self.end = event - self.num_iterations += 1 - return callback_kwargs - - -from contextlib import contextmanager - - -@contextmanager -def conditional_context(enabled, context_manager): - if enabled: - with context_manager as cm: - yield cm - else: - yield None - - 
-_is_form_hf = False - - -def get_pipeline(args, model_path, device): - global _is_form_hf - if args.from_hf: - # Has error for now - # File "python3.10/site-packages/diffusers/schedulers/scheduling_ddim.py", line 413, in step - # pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - # RuntimeError: The size of tensor a (4) must match the size of tensor b (8) at non-singleton dimension 1 - print("get pipeline from diffusers") - _is_form_hf = True - return get_pipeline_from_hf(args, model_path, device) - else: - print("get pipeline from source") - _is_form_hf = False - return get_pipeline_from_source(args, model_path, device) - - -def get_pipeline_from_hf(args, model_path, device): - # Get pipeline from diffusers - # diffusers version >= 0.30 - from diffusers import LattePipeline - - pipe = LattePipeline.from_pretrained(args.model, torch_dtype=torch.float16).to( - device - ) - - # Convert to channels_last memory format - pipe.transformer.to(memory_format=torch.channels_last) - pipe.vae.to(memory_format=torch.channels_last) - return pipe - - -def get_pipeline_from_source(args, model_path, device): - # Get pipeline from https://github.com/siliconflow/dit_latte/ - from models.latte_t2v import LatteT2V - from sample.pipeline_latte import LattePipeline - - transformer_model = LatteT2V.from_pretrained( - model_path, subfolder="transformer", video_length=args.video_length - ).to(device, dtype=torch.float16) - - if args.enable_vae_temporal_decoder: - vae = AutoencoderKLTemporalDecoder.from_pretrained( - args.model, subfolder="vae_temporal_decoder", torch_dtype=torch.float16 - ).to(device) - else: - vae = AutoencoderKL.from_pretrained( - args.model, subfolder="vae", torch_dtype=torch.float16 - ).to(device) - tokenizer = T5Tokenizer.from_pretrained(args.model, subfolder="tokenizer") - text_encoder = T5EncoderModel.from_pretrained( - args.model, subfolder="text_encoder", torch_dtype=torch.float16 - ).to(device) - - # set eval mode - transformer_model.eval() - vae.eval() - text_encoder.eval() - - scheduler = DDIMScheduler.from_pretrained( - model_path, - subfolder="scheduler", - beta_start=args.beta_start, - beta_end=args.beta_end, - beta_schedule=args.beta_schedule, - variance_type=args.variance_type, - clip_sample=False, - ) - - pipe = LattePipeline( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - scheduler=scheduler, - transformer=transformer_model, - ).to(device) - - return pipe - - -def main(): - args = parse_args() - - if os.path.exists(args.model): - model_path = args.model - else: - from huggingface_hub import snapshot_download - - model_path = snapshot_download(repo_id=args.model) - - torch.set_grad_enabled(False) - device = "cuda" if torch.cuda.is_available() else "cpu" - - pipe = get_pipeline(args, model_path, device) - - if args.compiler == "none": - pass - elif args.compiler == "nexfort": - print("Nexfort backend is now active...") - if args.compiler_config is not None: - # config with dict - options = json.loads(args.compiler_config) - else: - # config with string - options = '{"mode": "O2", \ - "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": false, \ - "triton.fuse_attention_allow_fp16_reduction": false}}' - pipe = compile_pipe( - pipe, backend="nexfort", options=options, fuse_qkv_projections=True - ) - elif args.compiler == "compile": - if hasattr(pipe, "unet"): - pipe.unet = torch.compile(pipe.unet) - if hasattr(pipe, "transformer"): - pipe.transformer = torch.compile(pipe.transformer) - if 
hasattr(pipe, "controlnet"): - pipe.controlnet = torch.compile(pipe.controlnet) - pipe.vae = torch.compile(pipe.vae) - else: - raise ValueError(f"Unknown compiler: {args.compiler}") - - def get_kwarg_inputs(): - kwarg_inputs = dict( - prompt=args.prompt, - video_length=args.video_length, - height=args.height, - width=args.width, - num_inference_steps=args.steps, - guidance_scale=args.guidance_scale, - enable_temporal_attentions=args.enable_temporal_attentions, - num_images_per_prompt=1, - mask_feature=True, - **( - dict() - if args.extra_call_kwargs is None - else json.loads(args.extra_call_kwargs) - ), - ) - if not _is_form_hf: - kwarg_inputs[ - "enable_vae_temporal_decoder" - ] = args.enable_vae_temporal_decoder - return kwarg_inputs - - kwarg_inputs = get_kwarg_inputs() - with conditional_context(args.profile, torch.profiler.profile()) as prof: - with conditional_context( - args.profile, torch.profiler.record_function("latte warmup") - ): - if args.warmups > 0: - print("=======================================") - print("Begin warmup") - begin = time.time() - for _ in range(args.warmups): - out = pipe(**kwarg_inputs) - if _is_form_hf: - videos = out.frames[0] - else: - videos = out.video - end = time.time() - print("End warmup") - print(f"Warmup time: {end - begin:.3f}s") - - print("=======================================") - - iter_profiler = IterationProfiler() - if "callback_on_step_end" in inspect.signature(pipe).parameters: - kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end - elif "callback" in inspect.signature(pipe).parameters: - kwarg_inputs["callback"] = iter_profiler.callback_on_step_end - with torch.profiler.record_function("latte run"): - torch.manual_seed(args.seed) - begin = time.time() - out = pipe(**kwarg_inputs) - if _is_form_hf: - videos = out.frames[0] - else: - videos = out.video - end = time.time() - - print("=======================================") - print(f"Inference time: {end - begin:.3f}s") - iter_per_sec = iter_profiler.get_iter_per_sec() - if iter_per_sec is not None: - print(f"Iterations per second: {iter_per_sec:.3f}") - cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3) - cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3) - print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB") - print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB") - print("=======================================") - - with conditional_context( - args.profile, torch.profiler.record_function("latte export") - ): - if args.output_video is not None: - # export_to_video(output_frames[0], args.output_video, fps=args.fps) - try: - imageio.mimwrite( - args.output_video, videos[0], fps=8, quality=9 - ) # highest quality is 10, lowest is 0 - except: - print("Error when saving {}".format(args.prompt)) - else: - print("Please set `--output-video` to save the output video") - if prof: - prof.export_chrome_trace("latte_with_cache_prof.json") - - -if __name__ == "__main__": - main() From 7e3680889ba7a9fee11433816ecdcb792219fd1c Mon Sep 17 00:00:00 2001 From: strint Date: Thu, 25 Jul 2024 15:49:00 +0800 Subject: [PATCH 10/11] format --- benchmarks/text_to_video_latte.py | 25 ++++++++++++------- .../register_comfy/CrossAttentionPatch.py | 8 +++--- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index bdd4f5f11..90fef4e08 100644 --- a/benchmarks/text_to_video_latte.py +++ b/benchmarks/text_to_video_latte.py @@ -95,8 +95,9 @@ def parse_args(): 
type=int, default=ATTENTION_FP16_SCORE_ACCUM_MAX_M, ) - parser.add_argument("--profile", action="store_true") - parser.add_argument("--from_hf", action="store_true") + parser.add_argument("--profile_warmup", action="store_true") + parser.add_argument("--profile_run", action="store_true") + parser.add_argument("--from-hf", action="store_true") return parser.parse_args() @@ -285,9 +286,11 @@ def get_kwarg_inputs(): return kwarg_inputs kwarg_inputs = get_kwarg_inputs() - with conditional_context(args.profile, torch.profiler.profile()) as prof: + with conditional_context( + args.profile_warmup, torch.profiler.profile() + ) as prof_warmup: with conditional_context( - args.profile, torch.profiler.record_function("latte warmup") + args.profile_warmup, torch.profiler.record_function("latte warmup") ): if args.warmups > 0: print("=======================================") @@ -302,15 +305,19 @@ def get_kwarg_inputs(): end = time.time() print("End warmup") print(f"Warmup time: {end - begin:.3f}s") - print("=======================================") + if prof_warmup: + prof_warmup.export_chrome_trace("latte_prof_warmup.json") + with conditional_context(args.profile_run, torch.profiler.profile()) as prof_run: iter_profiler = IterationProfiler() if "callback_on_step_end" in inspect.signature(pipe).parameters: kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end elif "callback" in inspect.signature(pipe).parameters: kwarg_inputs["callback"] = iter_profiler.callback_on_step_end - with torch.profiler.record_function("latte run"): + with conditional_context( + args.profile_run, torch.profiler.record_function("latte run") + ): torch.manual_seed(args.seed) begin = time.time() out = pipe(**kwarg_inputs) @@ -332,7 +339,7 @@ def get_kwarg_inputs(): print("=======================================") with conditional_context( - args.profile, torch.profiler.record_function("latte export") + args.profile_run, torch.profiler.record_function("latte export") ): if args.output_video is not None: # export_to_video(output_frames[0], args.output_video, fps=args.fps) @@ -344,8 +351,8 @@ def get_kwarg_inputs(): print("Error when saving {}".format(args.prompt)) else: print("Please set `--output-video` to save the output video") - if prof: - prof.export_chrome_trace("latte_with_cache_prof.json") + if prof_run: + prof_run.export_chrome_trace("latte_prof_run.json") if __name__ == "__main__": diff --git a/onediff_comfy_nodes/modules/oneflow/infer_compiler_registry/register_comfy/CrossAttentionPatch.py b/onediff_comfy_nodes/modules/oneflow/infer_compiler_registry/register_comfy/CrossAttentionPatch.py index 749029178..cf9cf73e7 100644 --- a/onediff_comfy_nodes/modules/oneflow/infer_compiler_registry/register_comfy/CrossAttentionPatch.py +++ b/onediff_comfy_nodes/modules/oneflow/infer_compiler_registry/register_comfy/CrossAttentionPatch.py @@ -26,7 +26,7 @@ def tensor_to_size(source, dest_size): return source -def get_weight_subidxs(weight,ad_params,sub_idxs): +def get_weight_subidxs(weight, ad_params, sub_idxs): return weight[ad_params[sub_idxs]] @@ -167,7 +167,7 @@ def ipadapter_attention( if ad_params is not None and ad_params["sub_idxs"] is not None: if isinstance(weight, torch.Tensor) and weight.dim() != 0: weight = tensor_to_size(weight, ad_params["full_length"]) - weight = get_weight_subidxs(weight,ad_params,"sub_idxs") + weight = get_weight_subidxs(weight, ad_params, "sub_idxs") # if torch.all(weight == 0): # return 0 weight = weight.repeat( @@ -178,8 +178,8 @@ def ipadapter_attention( # if image length 
matches or exceeds full_length get sub_idx images if cond.shape[0] >= ad_params["full_length"]: - cond = get_weight_subidxs(cond,ad_params,"sub_idxs") - uncond = get_weight_subidxs(uncond,ad_params,"sub_idxs") + cond = get_weight_subidxs(cond, ad_params, "sub_idxs") + uncond = get_weight_subidxs(uncond, ad_params, "sub_idxs") # otherwise get sub_idxs images else: cond = tensor_to_size(cond, ad_params["full_length"]) From 5fb30e409b06806c01ceee2d9a0d552e73fc6bdf Mon Sep 17 00:00:00 2001 From: strint Date: Sun, 4 Aug 2024 23:41:51 +0800 Subject: [PATCH 11/11] print perf --- benchmarks/text_to_video_latte.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py index 90fef4e08..3ea16d04a 100644 --- a/benchmarks/text_to_video_latte.py +++ b/benchmarks/text_to_video_latte.py @@ -352,6 +352,7 @@ def get_kwarg_inputs(): else: print("Please set `--output-video` to save the output video") if prof_run: + print(prof_run.key_averages().table(sort_by="cuda_time_total", row_limit=100)) prof_run.export_chrome_trace("latte_prof_run.json")
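For reference, a possible way to exercise the profiling path added in these patches is sketched below. The flag names (`--profile_run`, `--profile_warmup`) and the trace file name come from the diffs above; the model, step count, and other arguments simply mirror the README example, so treat the exact invocation as an assumption rather than a documented command.

```bash
# Hypothetical invocation: the README run command plus the --profile_run flag
# introduced in PATCH 10. When set, the script wraps the run in
# torch.profiler.profile(), prints key_averages() sorted by CUDA time,
# and exports a Chrome trace to latte_prof_run.json.
python3 ./benchmarks/text_to_video_latte.py \
  --model maxin-cn/Latte-1 \
  --steps 50 \
  --compiler nexfort \
  --output-video ./latte_compile.mp4 \
  --profile_run \
  --prompt "An epic tornado attacking above aglowing city at night."
```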