Add option to run External through a CUDA graph (#222)

* Put model forward into a CUDA graph
* Add option
* Add CUDA Graph test for External
  Fix pos grad
* Update docs
* Add warmup steps
* Fix race condition
* Small changes to test
* Remove spurious line
* Blacken
* Use intermediate tensors to please CUDA graphs when TN is run through torchmd
* Save some intermediates
* Modify model.py so the backwards graph is not retained in eval mode
* Allow External to take a path to a ckpt or a model instance
* Remove unnecessary change to TensorNet
* Remove spurious comment
* Small correction
torchmd · Oct 19, 2023 · 2cc5395 · 2cc5395
1 parent ad8a3fb
commit 2cc5395
Show file tree

Hide file tree

Showing 3 changed files with 116 additions and 13 deletions.
diff --git a/tests/test_calculator.py b/tests/test_calculator.py
@@ -1,10 +1,11 @@
 import torch
 from torch.testing import assert_allclose
+import pytest
 from pytest import mark
 from glob import glob
 from os.path import dirname, join
 from torchmdnet.calculators import External
-from torchmdnet.models.model import load_model
+from torchmdnet.models.model import load_model, create_model
 
 from utils import create_example_batch
 
@@ -21,6 +22,42 @@ def test_compare_forward():
     assert_allclose(e_calc, e_pred)
     assert_allclose(f_calc, f_pred.unsqueeze(0))
 
+def test_compare_forward_cuda_graph():
+    """Check that External gives the same energy and forces with and without CUDA graphs.
+
+    The test is skipped when no CUDA device is available.
+    """
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA not available")
+    checkpoint = join(dirname(dirname(__file__)), "tests", "example.ckpt")
+    # Hyperparameters of the model that both calculators will end up sharing.
+    args = {"model": "tensornet",
+            "embedding_dimension": 128,
+            "num_layers": 2,
+            "num_rbf": 32,
+            "rbf_type": "expnorm",
+            "trainable_rbf": False,
+            "activation": "silu",
+            "cutoff_lower": 0.0,
+            "cutoff_upper": 5.0,
+            "max_z": 100,
+            "max_num_neighbors": 128,
+            "equivariance_invariance_group": "O(3)",
+            "prior_model": None,
+            "atom_filter": -1,
+            "derivative": True,
+            "output_model": "Scalar",
+            "reduce_op": "sum",
+            "precision": 32 }
+    model = create_model(args).to(device="cuda")
+    z, pos, _ = create_example_batch(multiple_batches=False)
+    z = z.to("cuda")
+    pos = pos.to("cuda")
+    # Both calculators are first built from the checkpoint ...
+    calc = External(checkpoint, z.unsqueeze(0), use_cuda_graph=False, device="cuda")
+    calc_graph = External(checkpoint, z.unsqueeze(0), use_cuda_graph=True, device="cuda")
+    # ... and then pointed at the same freshly created model instance, so the
+    # eager path and the CUDA graph path evaluate identical weights.
+    calc.model = model
+    calc_graph.model = model
+    # Several iterations: the first call to calc_graph.calculate records the
+    # graph, the following calls only replay it.
+    for _ in range(10):
+        e_calc, f_calc = calc.calculate(pos, None)
+        e_pred, f_pred = calc_graph.calculate(pos, None)
+        assert_allclose(e_calc, e_pred)
+        assert_allclose(f_calc, f_pred)
+
 
 def test_compare_forward_multiple():
     checkpoint = join(dirname(dirname(__file__)), "tests", "example.ckpt")

diff --git a/torchmdnet/calculators.py b/torchmdnet/calculators.py
@@ -20,15 +20,43 @@
 
 class External:
     """
-    The External class is used to calculate the energy and forces of an external potential, such as a neural network. The class is initialized with the path to the neural network
-    ckpt, the embeddings, the device on which the neural network should be run and the output_transform argument. The output_transform is used to give a function that transform
-    the energy and the forces, this could be a preset transform or a custom function. In this way there is no constraint to the units of the neural network, the user can choose
-    the units of the simulation and the neural network will be automatically converted to the units of the simulation. The function should take two arguments, the energy and the
-    forces, and return the transformed energy and the transformed forces.
+    This is an adapter to use TorchMD-Net models in TorchMD.
+    Parameters
+    ----------
+    netfile : str or torch.nn.Module
+        Path to the checkpoint file of the model or the model itself.
+    embeddings : torch.Tensor
+        Embeddings of the atoms in the system.
+    device : str, optional
+        Device on which the model should be run. Default: "cpu"
+    output_transform : str or callable, optional
+        Transform to apply to the energy and forces.
+        If a string is given, it should be a key in the `transforms` dict.
+        If a callable is given, it should take two arguments (energy and forces) and return two tensors of the same shape.
+        Default: None
+    use_cuda_graph : bool, optional
+        Whether to use CUDA graphs to speed up the calculation. Default: False
+    cuda_graph_warmup_steps : int, optional
+        Number of steps to run as warmup before recording the CUDA graph. Default: 12
     """
 
-    def __init__(self, netfile, embeddings, device="cpu", output_transform=None):
-        self.model = load_model(netfile, device=device, derivative=True)
+    def __init__(
+        self,
+        netfile,
+        embeddings,
+        device="cpu",
+        output_transform=None,
+        use_cuda_graph=False,
+        cuda_graph_warmup_steps=12,
+    ):
+        """Set up the model, the flattened embeddings and the CUDA graph state.
+
+        Parameters are described in the class docstring.
+
+        Raises
+        ------
+        ValueError
+            If `netfile` is neither a str nor a torch.nn.Module, or if
+            `use_cuda_graph` is set while CUDA is not available.
+        """
+        if isinstance(netfile, str):
+            self.model = load_model(netfile, device=device, derivative=True)
+        elif isinstance(netfile, torch.nn.Module):
+            # NOTE(review): a model instance is stored as given; it is not
+            # moved to `device` here.
+            self.model = netfile
+        else:
+            raise ValueError(
+                f"Expected a path to a checkpoint file or a torch.nn.Module, got {type(netfile)}"
+            )
         self.device = device
         self.n_atoms = embeddings.size(1)
         self.embeddings = embeddings.reshape(-1).to(device)
@@ -46,11 +74,49 @@ def __init__(self, netfile, embeddings, device="cpu", output_transform=None):
             self.output_transformer = tranforms[output_transform]
         else:
+            # NOTE(review): eval() executes whatever string is passed as
+            # output_transform; only use it with trusted input.
             self.output_transformer = eval(output_transform)
+        if not torch.cuda.is_available() and use_cuda_graph:
+            raise ValueError("CUDA graphs are only available if CUDA is available")
+        self.use_cuda_graph = use_cuda_graph
+        self.cuda_graph_warmup_steps = cuda_graph_warmup_steps
+        # The graph and the tensors it is bound to are created on the first
+        # call to calculate().
+        self.cuda_graph = None
+        self.energy = None
+        self.forces = None
+        self.pos = None
+
+    def _init_cuda_graph(self):
+        """Warm up the model and record one forward pass into a CUDA graph.
+
+        The model is called with ``self.embeddings``, ``self.pos`` and
+        ``self.batch``; its outputs are stored in ``self.energy`` and
+        ``self.forces``. ``self.pos`` must already be set, because the
+        recorded graph is bound to the tensors used during capture.
+        """
+        stream = torch.cuda.Stream()
+        # Work already queued on the current stream (such as the creation of
+        # self.pos) must finish before the side stream starts reading it.
+        stream.wait_stream(torch.cuda.current_stream())
+        self.cuda_graph = torch.cuda.CUDAGraph()
+        with torch.cuda.stream(stream):
+            # Eager warmup iterations on the side stream before capturing.
+            for _ in range(self.cuda_graph_warmup_steps):
+                self.energy, self.forces = self.model(
+                    self.embeddings, self.pos, self.batch
+                )
+            with torch.cuda.graph(self.cuda_graph):
+                self.energy, self.forces = self.model(
+                    self.embeddings, self.pos, self.batch
+                )
+        # Order the caller's stream after warmup and capture, so that a later
+        # replay on that stream cannot overlap with them.
+        torch.cuda.current_stream().wait_stream(stream)
 
     def calculate(self, pos, box):
+        """Evaluate the model at `pos` and return the transformed energy and forces.
+
+        Parameters
+        ----------
+        pos : torch.Tensor
+            Positions; moved to ``self.device``, cast to float32 and reshaped to (-1, 3).
+        box : object
+            Not used by this method.
+
+        Returns
+        -------
+        The result of ``self.output_transformer`` applied to the detached
+        energy and to the detached forces reshaped to (-1, n_atoms, 3).
+        """
         pos = pos.to(self.device).type(torch.float32).reshape(-1, 3)
-        energy, forces = self.model(self.embeddings, pos, self.batch)
-
+        if self.use_cuda_graph:
+            if self.pos is None:
+                # Persistent input tensor: the graph is recorded against it
+                # and every later call copies new coordinates into it.
+                self.pos = (
+                    pos.clone()
+                    .to(self.device)
+                    .detach()
+                    .requires_grad_(pos.requires_grad)
+                )
+            if self.cuda_graph is None:
+                self._init_cuda_graph()
+            assert self.cuda_graph is not None, "CUDA graph is not initialized. This should not have happened."
+            with torch.no_grad():
+                # In-place update of the captured input, then replay; the
+                # results are read from self.energy and self.forces below.
+                self.pos.copy_(pos)
+                self.cuda_graph.replay()
+        else:
+            self.energy, self.forces = self.model(self.embeddings, pos, self.batch)
+        assert self.forces is not None, "The model is not returning forces"
+        assert self.energy is not None, "The model is not returning energy"
+        # Clones are returned so the caller does not hold references to
+        # self.energy / self.forces (presumably because a later replay
+        # overwrites them - confirm).
         return self.output_transformer(
-            energy.detach(), forces.reshape(-1, self.n_atoms, 3).detach()
+            self.energy.clone().detach(),
+            self.forces.clone().reshape(-1, self.n_atoms, 3).detach(),
         )
diff --git a/torchmdnet/models/model.py b/torchmdnet/models/model.py
@@ -289,8 +289,8 @@ def forward(
                 [y],
                 [pos],
                 grad_outputs=grad_outputs,
-                create_graph=True,
-                retain_graph=True,
+                create_graph=self.training,
+                retain_graph=self.training,
             )[0]
             if dy is None:
                 raise RuntimeError("Autograd returned None for the force prediction.")