From a2edf4d929ae22079dd1a301b88593e46e316256 Mon Sep 17 00:00:00 2001
From: Yuantao Feng <yuantao.feng@opencv.org.cn>
Date: Wed, 6 Dec 2023 04:46:24 -0600
Subject: [PATCH] Merge pull request #24647 from fengyuentau:cuda_sub

dnn cuda: support Sub #24647

Related https://github.com/opencv/opencv/issues/24606#issuecomment-1837390257

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
---
 modules/dnn/src/cuda/eltwise_ops.cu              |  7 +++++++
 modules/dnn/src/cuda/functors.hpp                | 12 ++++++++++++
 modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp |  3 +++
 modules/dnn/src/cuda4dnn/primitives/eltwise.hpp  |  3 +++
 modules/dnn/src/layers/nary_eltwise_layers.cpp   | 11 ++++++++---
 5 files changed, 33 insertions(+), 3 deletions(-)
diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu
index f94bdb811346..16f6cccf6b16 100644
--- a/modules/dnn/src/cuda/eltwise_ops.cu
+++ b/modules/dnn/src/cuda/eltwise_ops.cu
@@ -319,7 +319,13 @@ void eltwise_div_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x,
     eltwise_op<T, DivFunctor<T>>(stream, output, x, y);
 }
 
+template <class T>
+void eltwise_sub_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
+    eltwise_op<T, SubFunctor<T>>(stream, output, x, y);
+}
+
 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+    template void eltwise_sub_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
     template void eltwise_div_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
     template void eltwise_prod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
     template void eltwise_sum_coeff_2(const Stream&, TensorSpan<__half>, __half, TensorView<__half>, __half, TensorView<__half>);
@@ -327,6 +333,7 @@ void eltwise_div_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x,
     template void eltwise_max_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
     template void eltwise_min_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
 #endif
+    template void eltwise_sub_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
     template void eltwise_div_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
     template void eltwise_prod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
     template void eltwise_sum_coeff_2(const Stream&, TensorSpan<float>, float, TensorView<float>, float, TensorView<float>);
diff --git a/modules/dnn/src/cuda/functors.hpp b/modules/dnn/src/cuda/functors.hpp
index 3e487cd98a50..2df32030f0e4 100644
--- a/modules/dnn/src/cuda/functors.hpp
+++ b/modules/dnn/src/cuda/functors.hpp
@@ -741,6 +741,18 @@ struct DivFunctor {
     CUDA4DNN_DEVICE T operator()(T x, T y) { return x / y; }
 };
 
+template <class T>
+struct SubFunctor {
+    struct Params {
+        CUDA4DNN_HOST_DEVICE Params() { }
+    };
+
+    CUDA4DNN_DEVICE SubFunctor() { }
+    CUDA4DNN_DEVICE SubFunctor(const Params& params) { }
+
+    CUDA4DNN_DEVICE T operator()(T x, T y) { return x - y; }
+};
+
 template <class T>
 struct SignFunctor {
     struct Params {
diff --git a/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp b/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
index 0e44372fee3d..3dc3355b3b8b 100644
--- a/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
+++ b/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
@@ -30,6 +30,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template <class T>
     void eltwise_div_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
 
+    template <class T>
+    void eltwise_sub_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
+
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
 
 #endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP */
diff --git a/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp b/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp
index b46f0d870f4d..05bca83820d1 100644
--- a/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp
@@ -27,6 +27,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
         PRODUCT,
         DIV,
         MIN,
+        SUB,
     };
 
     class EltwiseOpBase : public CUDABackendNode {
@@ -88,6 +89,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     else
                         kernels::eltwise_sum_coeff_2<T>(stream, output, coeffs[0], input_x, coeffs[1], input_y);
                     break;
+                case EltwiseOpType::SUB: kernels::eltwise_sub_2<T>(stream, output, input_x, input_y); break;
                 }
             }
             else
@@ -119,6 +121,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                             kernels::eltwise_sum_coeff_2<T>(stream, output, coeff_x, output, coeffs[i], input);
                         }
                         break;
+                    case EltwiseOpType::SUB: kernels::eltwise_sub_2<T>(stream, output, output, input); break;
                     }
                 }
             }
diff --git a/modules/dnn/src/layers/nary_eltwise_layers.cpp b/modules/dnn/src/layers/nary_eltwise_layers.cpp
index 8572eee995a7..c988ec69f299 100644
--- a/modules/dnn/src/layers/nary_eltwise_layers.cpp
+++ b/modules/dnn/src/layers/nary_eltwise_layers.cpp
@@ -114,9 +114,11 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
                     op == OPERATION::GREATER_EQUAL ||
                     op == OPERATION::LESS_EQUAL
             );
-        if (op == OPERATION::MAX || op == OPERATION::MIN || op == OPERATION::SUM ||
-            op == OPERATION::PROD || op == OPERATION::DIV || op == OPERATION::ADD)
-            return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
+        if (backendId == DNN_BACKEND_CUDA) {
+            return op == OPERATION::MAX || op == OPERATION::MIN || op == OPERATION::SUM ||
+                   op == OPERATION::PROD || op == OPERATION::DIV || op == OPERATION::ADD ||
+                   op == OPERATION::SUB;
+        }
         return backendId == DNN_BACKEND_OPENCV;
     }
 
@@ -828,6 +830,9 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
             case OPERATION::ADD:
                 op_ = cuda4dnn::EltwiseOpType::SUM;
                 break;
+            case OPERATION::SUB:
+                op_ = cuda4dnn::EltwiseOpType::SUB;
+                break;
             default: return Ptr<BackendNode>(); // return empty cuda_node if the EltwiseOpType is unsupported type.
         };