Don't grad through max subtract in softmax
seanmor5 committed Oct 18, 2021
1 parent 19e6ef3 commit 8916210
Showing 1 changed file with 15 additions and 1 deletion.
lib/axon/activations.ex: 16 changes (15 additions & 1 deletion)
@@ -600,7 +600,21 @@ defmodule Axon.Activations do
 end
 end)

-max_val = Nx.reduce_max(x, axes: [opts[:axis]], keep_axes: true)
+# This is a scaling term designed to prevent overflow/underflow when entries
+# of x have large magnitude. For large positive x the intermediate value e^x
+# tends towards infinity, and for large negative x it underflows to 0. Either
+# way this poisons the rest of the calculation, which would otherwise be
+# normalized by the division by sum(e^x). Subtracting the max value along the
+# reduction axis guarantees all shifted values are at most 0, so every
+# e^(x - C) lies in (0, 1]. The expression is essentially:
+#
+#     e^(x - C) / sum(e^(x - C))
+#
+# which equals e^x / sum(e^x) for any constant C. We therefore treat the max
+# value as the constant term C, so there is no need to differentiate through
+# the max. See https://github.com/google/jax/pull/2260 for a note on
+# performance.
+max_val = stop_grad(Nx.reduce_max(x, axes: [opts[:axis]], keep_axes: true))

 stable_exp =
   x
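For reference, below is a minimal, self-contained sketch of the stabilized softmax the comment describes, written against the public Nx API (Nx.reduce_max/2, Nx.exp/1, Nx.sum/2, and stop_grad/1 from Nx.Defn.Kernel). The module name, the hard-coded last axis, and the absence of option handling are illustrative simplifications, not Axon's actual implementation.

```elixir
defmodule StableSoftmaxSketch do
  import Nx.Defn

  # Numerically stable softmax along the last axis. Wrapping the max in
  # stop_grad/1 treats it as the constant C from the comment above, so
  # autodiff does not trace back through Nx.reduce_max/2.
  defn softmax(x) do
    max_val = stop_grad(Nx.reduce_max(x, axes: [-1], keep_axes: true))
    stable_exp = Nx.exp(x - max_val)
    stable_exp / Nx.sum(stable_exp, axes: [-1], keep_axes: true)
  end
end
```

Because softmax is invariant to a constant shift of its input, the gradient contribution that would otherwise flow through the max cancels out; marking it with stop_grad only prunes that branch of the backward pass, which is the performance point made in the linked JAX PR.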
