From 78b360f6b5143a08f679788091d1abdf93aebf96 Mon Sep 17 00:00:00 2001
From: Minsoo Kim <48931129+mnskim@users.noreply.github.com>
Date: Mon, 27 Jan 2025 11:06:14 -0800
Subject: [PATCH] fix: always propagate visit counts, separately from rewards
 (#48)

Co-authored-by: sweagent <sweagent@pnlp.org>
---
 moatless/search_tree.py | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/moatless/search_tree.py b/moatless/search_tree.py
index 501ef8af..6318b564 100644
--- a/moatless/search_tree.py
+++ b/moatless/search_tree.py
@@ -361,24 +361,32 @@ def _simulate(self, node: Node):
                 raise  # Re-raise to abort the entire search
 
     def _backpropagate(self, node: Node):
-        """Backpropagate the reward up the tree."""
-
-        if not node.reward:
+        """Count a visit on node and every ancestor; also add node's reward, if any, to each."""
+        
+        # Visits are counted even for unevaluated nodes, independent of reward propagation
+        current = node
+        while current is not None:
+            current.visits += 1
+            current = current.parent
+
+        # Rewards are accumulated only when the node carries an evaluation
+        if node.reward:
+            current = node
+            reward = node.reward.value
+            while current is not None:
+                if not current.value:  # falsy value (e.g. unset): start from this reward
+                    current.value = reward
+                else:
+                    current.value += reward
+                current = current.parent
+        
+        else:
             self.log(
                 logger.info,
-                f"Node{node.node_id} has no evaluation. Skipping backpropagation.",
+                f"Node{node.node_id} has no evaluation. Skipping reward backpropagation.",
             )
             return
 
-        reward = node.reward.value
-        while node is not None:
-            node.visits += 1
-            if not node.value:
-                node.value = reward
-            else:
-                node.value += reward
-            node = node.parent
-
     def get_best_trajectory(self) -> Node | None:
         """
         Get the best finished trajectory to return