Merge pull request opencv#24058 from hanliutong:rewrite-imgporc
Rewrite Universal Intrinsic code by using new API: ImgProc module. opencv#24058

The goal of this series of PRs is to modify the SIMD code blocks guarded by the CV_SIMD macro in the `opencv/modules/imgproc` folder, rewriting them with the new Universal Intrinsic API.
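
The rewrite follows one mechanical pattern throughout: the preprocessor guard is widened to also cover scalable backends, compile-time lane counts (`nlanes`) become run-time queries via `VTraits<...>::vlanes()`, and vector operator overloads become named intrinsics. Here is a minimal before/after sketch; the `scale_*` functions are illustrative only and do not appear in this patch:

```cpp
#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

#if CV_SIMD  // old style: fixed-width backends only
void scale_old(const float* src, float* dst, int width, float s)
{
    const v_float32 v_s = vx_setall_f32(s);
    int x = 0;
    for (; x <= width - v_float32::nlanes; x += v_float32::nlanes)  // compile-time lane count
        v_store(dst + x, vx_load(src + x) * v_s);                   // operator overload
    for (; x < width; ++x)                                          // scalar tail
        dst[x] = src[x] * s;
}
#endif

#if (CV_SIMD || CV_SIMD_SCALABLE)  // new style: also enabled for scalable (e.g. RVV) backends
void scale_new(const float* src, float* dst, int width, float s)
{
    const v_float32 v_s = vx_setall_f32(s);
    int x = 0;
    const int step = VTraits<v_float32>::vlanes();                  // lane count queried at run time
    for (; x <= width - step; x += step)
        v_store(dst + x, v_mul(vx_load(src + x), v_s));             // named intrinsic instead of operator*
    for (; x < width; ++x)                                          // scalar tail
        dst[x] = src[x] * s;
}
#endif
```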

For easier review, this PR includes part of the rewritten code; the remaining part will follow in the next PR. I tested this patch on RVV (QEMU) and AVX devices, and `opencv_test_imgproc` passes.

The patch is partially auto-generated using the [rewriter](https://github.com/hanliutong/rewriter); see the related PRs opencv#23885 and opencv#23980.
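
The substitutions the tool performs are visible side by side in the hunks below. As a condensed illustration, simplified from the canny.cpp changes (the `above_threshold_*` helpers and the `mag`/`low` names are placeholders, not literal patch content):

```cpp
#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

#if CV_SIMD  // old API: overloaded comparison operators, compile-time lane counts
static inline v_int8 above_threshold_old(const int* mag, int low)
{
    const v_int32 v_low = vx_setall_s32(low);
    // compare four int32 vectors against the threshold, then narrow the masks to int8
    return v_pack(v_pack(vx_load(mag)                        > v_low,
                         vx_load(mag +     v_int32::nlanes)  > v_low),
                  v_pack(vx_load(mag + 2 * v_int32::nlanes)  > v_low,
                         vx_load(mag + 3 * v_int32::nlanes)  > v_low));
}
#endif

#if (CV_SIMD || CV_SIMD_SCALABLE)  // new API: named v_gt(), VTraits<...>::vlanes()
static inline v_int8 above_threshold_new(const int* mag, int low)
{
    const v_int32 v_low = vx_setall_s32(low);
    return v_pack(v_pack(v_gt(vx_load(mag),                                  v_low),
                         v_gt(vx_load(mag +     VTraits<v_int32>::vlanes()), v_low)),
                  v_pack(v_gt(vx_load(mag + 2 * VTraits<v_int32>::vlanes()), v_low),
                         v_gt(vx_load(mag + 3 * VTraits<v_int32>::vlanes()), v_low)));
}
#endif
```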



### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There are accuracy tests, performance tests and test data in the opencv_extra repository, if applicable.
      The patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
hanliutong authored Sep 14, 2023
1 parent 515f119 · commit 5e91915
Showing 11 changed files with 1,183 additions and 1,143 deletions.
640 changes: 320 additions & 320 deletions modules/imgproc/src/accum.simd.hpp

Large diffs are not rendered by default.

54 changes: 27 additions & 27 deletions modules/imgproc/src/blend.cpp
@@ -48,12 +48,12 @@
#include "opencv2/core/hal/intrin.hpp"

namespace cv {
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 blend(const v_float32& v_src1, const v_float32& v_src2, const v_float32& v_w1, const v_float32& v_w2)
{
const v_float32 v_eps = vx_setall_f32(1e-5f);
v_float32 v_denom = v_w1 + v_w2 + v_eps;
return (v_src1 * v_w1 + v_src2 * v_w2) / v_denom;
v_float32 v_denom = v_add(v_add(v_w1, v_w2), v_eps);
return v_div(v_add(v_mul(v_src1, v_w1), v_mul(v_src2, v_w2)), v_denom);
}
static inline v_float32 blend(const v_float32& v_src1, const v_float32& v_src2, const float* w_ptr1, const float* w_ptr2, int offset)
{
@@ -105,23 +105,23 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1,
switch(cn)
{
case 1:
for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_uint8::nlanes)
for(int weight_offset = 0 ; x <= width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes(), weight_offset += VTraits<v_uint8>::vlanes())
{
v_float32 v_src10, v_src11, v_src12, v_src13;
v_float32 v_src20, v_src21, v_src22, v_src23;
load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);

v_float32 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset);
v_float32 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + v_float32::nlanes);
v_float32 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 2*v_float32::nlanes);
v_float32 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 3*v_float32::nlanes);
v_float32 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + VTraits<v_float32>::vlanes());
v_float32 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 2*VTraits<v_float32>::vlanes());
v_float32 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 3*VTraits<v_float32>::vlanes());

store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
}
break;
case 2:
for(int weight_offset = 0 ; x <= width - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes, weight_offset += v_uint8::nlanes)
for(int weight_offset = 0 ; x <= width - 2*VTraits<v_uint8>::vlanes(); x += 2*VTraits<v_uint8>::vlanes(), weight_offset += VTraits<v_uint8>::vlanes())
{
v_uint8 v_src10, v_src11, v_src20, v_src21;
v_load_deinterleave(src1 + x, v_src10, v_src11);
@@ -135,20 +135,20 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1,

v_float32 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset);
v_float32 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset);
v_float32 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + v_float32::nlanes);
v_float32 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + v_float32::nlanes);
v_float32 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 2*v_float32::nlanes);
v_float32 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 2*v_float32::nlanes);
v_float32 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 3*v_float32::nlanes);
v_float32 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 3*v_float32::nlanes);
v_float32 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + VTraits<v_float32>::vlanes());
v_float32 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + VTraits<v_float32>::vlanes());
v_float32 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 2*VTraits<v_float32>::vlanes());
v_float32 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 2*VTraits<v_float32>::vlanes());
v_float32 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 3*VTraits<v_float32>::vlanes());
v_float32 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 3*VTraits<v_float32>::vlanes());

v_uint8 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6);
v_uint8 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7);
v_store_interleave(dst + x, v_dsta, v_dstb);
}
break;
case 3:
for(int weight_offset = 0 ; x <= width - 3*v_uint8::nlanes; x += 3*v_uint8::nlanes, weight_offset += v_uint8::nlanes)
for(int weight_offset = 0 ; x <= width - 3*VTraits<v_uint8>::vlanes(); x += 3*VTraits<v_uint8>::vlanes(), weight_offset += VTraits<v_uint8>::vlanes())
{
v_uint8 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
@@ -164,13 +164,13 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1,
expand_u8tof32(v_src22, v_src220, v_src221, v_src222, v_src223);

v_float32 v_w10 = vx_load(weights1 + weight_offset);
v_float32 v_w11 = vx_load(weights1 + weight_offset + v_float32::nlanes);
v_float32 v_w12 = vx_load(weights1 + weight_offset + 2*v_float32::nlanes);
v_float32 v_w13 = vx_load(weights1 + weight_offset + 3*v_float32::nlanes);
v_float32 v_w11 = vx_load(weights1 + weight_offset + VTraits<v_float32>::vlanes());
v_float32 v_w12 = vx_load(weights1 + weight_offset + 2*VTraits<v_float32>::vlanes());
v_float32 v_w13 = vx_load(weights1 + weight_offset + 3*VTraits<v_float32>::vlanes());
v_float32 v_w20 = vx_load(weights2 + weight_offset);
v_float32 v_w21 = vx_load(weights2 + weight_offset + v_float32::nlanes);
v_float32 v_w22 = vx_load(weights2 + weight_offset + 2*v_float32::nlanes);
v_float32 v_w23 = vx_load(weights2 + weight_offset + 3*v_float32::nlanes);
v_float32 v_w21 = vx_load(weights2 + weight_offset + VTraits<v_float32>::vlanes());
v_float32 v_w22 = vx_load(weights2 + weight_offset + 2*VTraits<v_float32>::vlanes());
v_float32 v_w23 = vx_load(weights2 + weight_offset + 3*VTraits<v_float32>::vlanes());
v_src100 = blend(v_src100, v_src200, v_w10, v_w20);
v_src110 = blend(v_src110, v_src210, v_w10, v_w20);
v_src120 = blend(v_src120, v_src220, v_w10, v_w20);
@@ -192,7 +192,7 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1,
}
break;
case 4:
for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_float32::nlanes)
for(int weight_offset = 0 ; x <= width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes(), weight_offset += VTraits<v_float32>::vlanes())
{
v_float32 v_src10, v_src11, v_src12, v_src13;
v_float32 v_src20, v_src21, v_src22, v_src23;
@@ -229,7 +229,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1,
switch(cn)
{
case 1:
for(int weight_offset = 0 ; x <= width - v_float32::nlanes; x += v_float32::nlanes, weight_offset += v_float32::nlanes)
for(int weight_offset = 0 ; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes(), weight_offset += VTraits<v_float32>::vlanes())
{
v_float32 v_src1 = vx_load(src1 + x);
v_float32 v_src2 = vx_load(src2 + x);
@@ -242,7 +242,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1,
}
break;
case 2:
for(int weight_offset = 0 ; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, weight_offset += v_float32::nlanes)
for(int weight_offset = 0 ; x <= width - 2*VTraits<v_float32>::vlanes(); x += 2*VTraits<v_float32>::vlanes(), weight_offset += VTraits<v_float32>::vlanes())
{
v_float32 v_src10, v_src11, v_src20, v_src21;
v_load_deinterleave(src1 + x, v_src10, v_src11);
@@ -257,7 +257,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1,
}
break;
case 3:
for(int weight_offset = 0 ; x <= width - 3*v_float32::nlanes; x += 3*v_float32::nlanes, weight_offset += v_float32::nlanes)
for(int weight_offset = 0 ; x <= width - 3*VTraits<v_float32>::vlanes(); x += 3*VTraits<v_float32>::vlanes(), weight_offset += VTraits<v_float32>::vlanes())
{
v_float32 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
@@ -273,7 +273,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1,
}
break;
case 4:
for(int weight_offset = 0 ; x <= width - 4*v_float32::nlanes; x += 4*v_float32::nlanes, weight_offset += v_float32::nlanes)
for(int weight_offset = 0 ; x <= width - 4*VTraits<v_float32>::vlanes(); x += 4*VTraits<v_float32>::vlanes(), weight_offset += VTraits<v_float32>::vlanes())
{
v_float32 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12, v_src13);
@@ -320,7 +320,7 @@ class BlendLinearInvoker :
T * const dst_row = dst->ptr<T>(y);

int x = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
x = blendLinearSimd(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn);
#endif

66 changes: 33 additions & 33 deletions modules/imgproc/src/canny.cpp
@@ -306,11 +306,11 @@ class parallelCanny : public ParallelLoopBody
src(_src), src2(_src), map(_map), _borderPeaksParallel(borderPeaksParallel),
low(_low), high(_high), aperture_size(_aperture_size), L2gradient(_L2gradient)
{
#if CV_SIMD
for(int i = 0; i < v_int8::nlanes; ++i)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for(int i = 0; i < VTraits<v_int8>::vlanes(); ++i)
{
smask[i] = 0;
smask[i + v_int8::nlanes] = (schar)-1;
smask[i + VTraits<v_int8>::vlanes()] = (schar)-1;
}
if (true)
_map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_SIMD_WIDTH + 1), CV_SIMD_WIDTH), CV_8UC1);
@@ -330,11 +330,11 @@ class parallelCanny : public ParallelLoopBody
src(_dx), src2(_dy), map(_map), _borderPeaksParallel(borderPeaksParallel),
low(_low), high(_high), aperture_size(0), L2gradient(_L2gradient)
{
#if CV_SIMD
for(int i = 0; i < v_int8::nlanes; ++i)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for(int i = 0; i < VTraits<v_int8>::vlanes(); ++i)
{
smask[i] = 0;
smask[i + v_int8::nlanes] = (schar)-1;
smask[i + VTraits<v_int8>::vlanes()] = (schar)-1;
}
if (true)
_map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_SIMD_WIDTH + 1), CV_SIMD_WIDTH), CV_8UC1);
@@ -396,7 +396,7 @@ class parallelCanny : public ParallelLoopBody
}

// _mag_p: previous row, _mag_a: actual row, _mag_n: next row
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
AutoBuffer<int> buffer(3 * (mapstep * cn + CV_SIMD_WIDTH));
_mag_p = alignPtr(buffer.data() + 1, CV_SIMD_WIDTH);
_mag_a = alignPtr(_mag_p + mapstep * cn, CV_SIMD_WIDTH);
@@ -436,8 +436,8 @@ class parallelCanny : public ParallelLoopBody
if (L2gradient)
{
int j = 0, width = src.cols * cn;
#if CV_SIMD
for ( ; j <= width - v_int16::nlanes; j += v_int16::nlanes)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for ( ; j <= width - VTraits<v_int16>::vlanes(); j += VTraits<v_int16>::vlanes())
{
v_int16 v_dx = vx_load((const short*)(_dx + j));
v_int16 v_dy = vx_load((const short*)(_dy + j));
@@ -447,8 +447,8 @@ class parallelCanny : public ParallelLoopBody
v_expand(v_dx, v_dxp_low, v_dxp_high);
v_expand(v_dy, v_dyp_low, v_dyp_high);

v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low);
v_store_aligned((int *)(_mag_n + j + v_int32::nlanes), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high);
v_store_aligned((int *)(_mag_n + j), v_add(v_mul(v_dxp_low, v_dxp_low), v_mul(v_dyp_low, v_dyp_low)));
v_store_aligned((int *)(_mag_n + j + VTraits<v_int32>::vlanes()), v_add(v_mul(v_dxp_high, v_dxp_high), v_mul(v_dyp_high, v_dyp_high)));
}
#endif
for ( ; j < width; ++j)
@@ -457,8 +457,8 @@ class parallelCanny : public ParallelLoopBody
else
{
int j = 0, width = src.cols * cn;
#if CV_SIMD
for(; j <= width - v_int16::nlanes; j += v_int16::nlanes)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for(; j <= width - VTraits<v_int16>::vlanes(); j += VTraits<v_int16>::vlanes())
{
v_int16 v_dx = vx_load((const short *)(_dx + j));
v_int16 v_dy = vx_load((const short *)(_dy + j));
@@ -470,8 +470,8 @@ class parallelCanny : public ParallelLoopBody
v_expand(v_dx, v_dx_ml, v_dx_mh);
v_expand(v_dy, v_dy_ml, v_dy_mh);

v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml);
v_store_aligned((int *)(_mag_n + j + v_int32::nlanes), v_dx_mh + v_dy_mh);
v_store_aligned((int *)(_mag_n + j), v_add(v_dx_ml, v_dy_ml));
v_store_aligned((int *)(_mag_n + j + VTraits<v_int32>::vlanes()), v_add(v_dx_mh, v_dy_mh));
}
#endif
for ( ; j < width; ++j)
@@ -515,7 +515,7 @@ class parallelCanny : public ParallelLoopBody

// From here actual src row is (i - 1)
// Set left and right border to 1
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
if (true)
_pmap = map.ptr<uchar>(i) + CV_SIMD_WIDTH;
else
@@ -537,22 +537,22 @@

const int TG22 = 13573;
int j = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
const v_int32 v_low = vx_setall_s32(low);
const v_int8 v_one = vx_setall_s8(1);

for (; j <= src.cols - v_int8::nlanes; j += v_int8::nlanes)
for (; j <= src.cols - VTraits<v_int8>::vlanes(); j += VTraits<v_int8>::vlanes())
{
v_store_aligned((signed char*)(_pmap + j), v_one);
v_int8 v_cmp = v_pack(v_pack(vx_load_aligned((const int*)(_mag_a + j )) > v_low,
vx_load_aligned((const int*)(_mag_a + j + v_int32::nlanes)) > v_low),
v_pack(vx_load_aligned((const int*)(_mag_a + j + 2*v_int32::nlanes)) > v_low,
vx_load_aligned((const int*)(_mag_a + j + 3*v_int32::nlanes)) > v_low));
v_int8 v_cmp = v_pack(v_pack(v_gt(vx_load_aligned((const int *)(_mag_a + j)), v_low),
v_gt(vx_load_aligned((const int *)(_mag_a + j + VTraits<v_int32>::vlanes())), v_low)),
v_pack(v_gt(vx_load_aligned((const int *)(_mag_a + j + 2 * VTraits<v_int32>::vlanes())), v_low),
v_gt(vx_load_aligned((const int *)(_mag_a + j + 3 * VTraits<v_int32>::vlanes())), v_low)));
while (v_check_any(v_cmp))
{
int l = v_scan_forward(v_cmp);
v_cmp &= vx_load(smask + v_int8::nlanes - 1 - l);
v_cmp = v_and(v_cmp, vx_load(smask + VTraits<v_int8>::vlanes() - 1 - l));
int k = j + l;

int m = _mag_a[k];
@@ -693,8 +693,8 @@ class parallelCanny : public ParallelLoopBody
ptrdiff_t mapstep;
int cn;
mutable Mutex mutex;
#if CV_SIMD
schar smask[2*v_int8::nlanes];
#if (CV_SIMD || CV_SIMD_SCALABLE)
schar smask[2*VTraits<v_int8>::max_nlanes];
#endif
};

@@ -718,31 +718,31 @@ class finalPass : public ParallelLoopBody
int j = 0;
uchar *pdst = dst.ptr<uchar>(i);
const uchar *pmap = map.ptr<uchar>(i + 1);
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
if (true)
pmap += CV_SIMD_WIDTH;
else
#endif
pmap += 1;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
const v_uint8 v_zero = vx_setzero_u8();
const v_uint8 v_ff = ~v_zero;
const v_uint8 v_ff = v_not(v_zero);
const v_uint8 v_two = vx_setall_u8(2);

for (; j <= dst.cols - v_uint8::nlanes; j += v_uint8::nlanes)
for (; j <= dst.cols - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
{
v_uint8 v_pmap = vx_load_aligned((const unsigned char*)(pmap + j));
v_pmap = v_select(v_pmap == v_two, v_ff, v_zero);
v_pmap = v_select(v_eq(v_pmap, v_two), v_ff, v_zero);
v_store((pdst + j), v_pmap);
}

if (j <= dst.cols - v_uint8::nlanes/2)
if (j <= dst.cols - VTraits<v_uint8>::vlanes()/2)
{
v_uint8 v_pmap = vx_load_low((const unsigned char*)(pmap + j));
v_pmap = v_select(v_pmap == v_two, v_ff, v_zero);
v_pmap = v_select(v_eq(v_pmap, v_two), v_ff, v_zero);
v_store_low((pdst + j), v_pmap);
j += v_uint8::nlanes/2;
j += VTraits<v_uint8>::vlanes()/2;
}
}
#endif