Merge pull request opencv#24058 from hanliutong:rewrite-imgporc
Rewrite Universal Intrinsic code by using new API: ImgProc module. opencv#24058

The goal of this series of PRs is to modify the SIMD code blocks guarded by the CV_SIMD macro in the `opencv/modules/imgproc` folder, rewriting them with the new Universal Intrinsic API.
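
The rewrite follows one mechanical pattern throughout: the preprocessor guard is widened to also cover scalable backends, compile-time lane counts (`nlanes`) become run-time queries via `VTraits<...>::vlanes()`, and vector operator overloads become named intrinsics. Here is a minimal before/after sketch; the `scale_*` functions are illustrative only and do not appear in this patch:

```cpp
#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

#if CV_SIMD  // old style: fixed-width backends only
void scale_old(const float* src, float* dst, int width, float s)
{
    const v_float32 v_s = vx_setall_f32(s);
    int x = 0;
    for (; x <= width - v_float32::nlanes; x += v_float32::nlanes)  // compile-time lane count
        v_store(dst + x, vx_load(src + x) * v_s);                   // operator overload
    for (; x < width; ++x)                                          // scalar tail
        dst[x] = src[x] * s;
}
#endif

#if (CV_SIMD || CV_SIMD_SCALABLE)  // new style: also enabled for scalable (e.g. RVV) backends
void scale_new(const float* src, float* dst, int width, float s)
{
    const v_float32 v_s = vx_setall_f32(s);
    int x = 0;
    const int step = VTraits<v_float32>::vlanes();                  // lane count queried at run time
    for (; x <= width - step; x += step)
        v_store(dst + x, v_mul(vx_load(src + x), v_s));             // named intrinsic instead of operator*
    for (; x < width; ++x)                                          // scalar tail
        dst[x] = src[x] * s;
}
#endif
```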

For easier review, this PR includes part of the rewritten code; the remaining part will follow in the next PR. I tested this patch on RVV (QEMU) and AVX devices, and `opencv_test_imgproc` passes.

The patch is partially auto-generated using the [rewriter](https://github.com/hanliutong/rewriter); see the related PRs opencv#23885 and opencv#23980.
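
The substitutions the tool performs are visible side by side in the hunks below. As a condensed illustration, simplified from the canny.cpp changes (the `above_threshold_*` helpers and the `mag`/`low` names are placeholders, not literal patch content):

```cpp
#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

#if CV_SIMD  // old API: overloaded comparison operators, compile-time lane counts
static inline v_int8 above_threshold_old(const int* mag, int low)
{
    const v_int32 v_low = vx_setall_s32(low);
    // compare four int32 vectors against the threshold, then narrow the masks to int8
    return v_pack(v_pack(vx_load(mag)                        > v_low,
                         vx_load(mag +     v_int32::nlanes)  > v_low),
                  v_pack(vx_load(mag + 2 * v_int32::nlanes)  > v_low,
                         vx_load(mag + 3 * v_int32::nlanes)  > v_low));
}
#endif

#if (CV_SIMD || CV_SIMD_SCALABLE)  // new API: named v_gt(), VTraits<...>::vlanes()
static inline v_int8 above_threshold_new(const int* mag, int low)
{
    const v_int32 v_low = vx_setall_s32(low);
    return v_pack(v_pack(v_gt(vx_load(mag),                                  v_low),
                         v_gt(vx_load(mag +     VTraits<v_int32>::vlanes()), v_low)),
                  v_pack(v_gt(vx_load(mag + 2 * VTraits<v_int32>::vlanes()), v_low),
                         v_gt(vx_load(mag + 3 * VTraits<v_int32>::vlanes()), v_low)));
}
#endif
```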



### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There are accuracy tests, performance tests and test data in the opencv_extra repository, if applicable.
      The patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
hanliutong authored Sep 14, 2023
1 parent 515f119 · commit 5e91915
Showing 11 changed files with 1,183 additions and 1,143 deletions.
640 changes: 320 additions & 320 deletions modules/imgproc/src/accum.simd.hpp

Large diffs are not rendered by default.

54 changes: 27 additions & 27 deletions modules/imgproc/src/blend.cpp
@@ -48,12 +48,12 @@
#include "opencv2/core/hal/intrin.hpp"

namespace cv {
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 blend(const v_float32& v_src1, const v_float32& v_src2, const v_float32& v_w1, const v_float32& v_w2)
{
const v_float32 v_eps = vx_setall_f32(1e-5f);
v_float32 v_denom = v_w1 + v_w2 + v_eps;
return (v_src1 * v_w1 + v_src2 * v_w2) / v_denom;
v_float32 v_denom = v_add(v_add(v_w1, v_w2), v_eps);
return v_div(v_add(v_mul(v_src1, v_w1), v_mul(v_src2, v_w2)), v_denom);
}
static inline v_float32 blend(const v_float32& v_src1, const v_float32& v_src2, const float* w_ptr1, const float* w_ptr2, int offset)
{
@@ -105,23 +105,23 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1,
switch(cn)
{
case 1:
for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_uint8::nlanes)
for(int weight_offset = 0 ; x <= width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes(), weight_offset += VTraits<v_uint8>::vlanes())
{
v_float32 v_src10, v_src11, v_src12, v_src13;
v_float32 v_src20, v_src21, v_src22, v_src23;
load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);

v_float32 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset);
v_float32 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + v_float32::nlanes);
v_float32 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 2*v_float32::nlanes);
v_float32 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 3*v_float32::nlanes);
v_float32 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + VTraits<v_float32>::vlanes());
v_float32 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 2*VTraits<v_float32>::vlanes());
v_float32 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 3*VTraits<v_float32>::vlanes());

store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
}
break;
case 2:
for(int weight_offset = 0 ; x <= width - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes, weight_offset += v_uint8::nlanes)
for(int weight_offset = 0 ; x <= width - 2*VTraits<v_uint8>::vlanes(); x += 2*VTraits<v_uint8>::vlanes(), weight_offset += VTraits<v_uint8>::vlanes())
{
v_uint8 v_src10, v_src11, v_src20, v_src21;
v_load_deinterleave(src1 + x, v_src10, v_src11);
@@ -135,20 +135,20 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1,

v_float32 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset);
v_float32 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset);
v_float32 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + v_float32::nlanes);
v_float32 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + v_float32::nlanes);
v_float32 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 2*v_float32::nlanes);
v_float32 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 2*v_float32::nlanes);
v_float32 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 3*v_float32::nlanes);
v_float32 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 3*v_float32::nlanes);
v_float32 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + VTraits<v_float32>::vlanes());
v_float32 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + VTraits<v_float32>::vlanes());
v_float32 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 2*VTraits<v_float32>::vlanes());
v_float32 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 2*VTraits<v_float32>::vlanes());
v_float32 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 3*VTraits<v_float32>::vlanes());
v_float32 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 3*VTraits<v_float32>::vlanes());

v_uint8 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6);
v_uint8 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7);
v_store_interleave(dst + x, v_dsta, v_dstb);
}
break;
case 3:
for(int weight_offset = 0 ; x <= width - 3*v_uint8::nlanes; x += 3*v_uint8::nlanes, weight_offset += v_uint8::nlanes)
for(int weight_offset = 0 ; x <= width - 3*VTraits<v_uint8>::vlanes(); x += 3*VTraits<v_uint8>::vlanes(), weight_offset += VTraits<v_uint8>::vlanes())
{
v_uint8 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
@@ -164,13 +164,13 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1,
expand_u8tof32(v_src22, v_src220, v_src221, v_src222, v_src223);

v_float32 v_w10 = vx_load(weights1 + weight_offset);
v_float32 v_w11 = vx_load(weights1 + weight_offset + v_float32::nlanes);
v_float32 v_w12 = vx_load(weights1 + weight_offset + 2*v_float32::nlanes);
v_float32 v_w13 = vx_load(weights1 + weight_offset + 3*v_float32::nlanes);
v_float32 v_w11 = vx_load(weights1 + weight_offset + VTraits<v_float32>::vlanes());
v_float32 v_w12 = vx_load(weights1 + weight_offset + 2*VTraits<v_float32>::vlanes());
v_float32 v_w13 = vx_load(weights1 + weight_offset + 3*VTraits<v_float32>::vlanes());
v_float32 v_w20 = vx_load(weights2 + weight_offset);
v_float32 v_w21 = vx_load(weights2 + weight_offset + v_float32::nlanes);
v_float32 v_w22 = vx_load(weights2 + weight_offset + 2*v_float32::nlanes);
v_float32 v_w23 = vx_load(weights2 + weight_offset + 3*v_float32::nlanes);
v_float32 v_w21 = vx_load(weights2 + weight_offset + VTraits<v_float32>::vlanes());
v_float32 v_w22 = vx_load(weights2 + weight_offset + 2*VTraits<v_float32>::vlanes());
v_float32 v_w23 = vx_load(weights2 + weight_offset + 3*VTraits<v_float32>::vlanes());
v_src100 = blend(v_src100, v_src200, v_w10, v_w20);
v_src110 = blend(v_src110, v_src210, v_w10, v_w20);
v_src120 = blend(v_src120, v_src220, v_w10, v_w20);
@@ -192,7 +192,7 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1,
}
break;
case 4:
for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_float32::nlanes)
for(int weight_offset = 0 ; x <= width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes(), weight_offset += VTraits<v_float32>::vlanes())
{
v_float32 v_src10, v_src11, v_src12, v_src13;
v_float32 v_src20, v_src21, v_src22, v_src23;
@@ -229,7 +229,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1,
switch(cn)
{
case 1:
for(int weight_offset = 0 ; x <= width - v_float32::nlanes; x += v_float32::nlanes, weight_offset += v_float32::nlanes)
for(int weight_offset = 0 ; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes(), weight_offset += VTraits<v_float32>::vlanes())
{
v_float32 v_src1 = vx_load(src1 + x);
v_float32 v_src2 = vx_load(src2 + x);
@@ -242,7 +242,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1,
}
break;
case 2:
for(int weight_offset = 0 ; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, weight_offset += v_float32::nlanes)
for(int weight_offset = 0 ; x <= width - 2*VTraits<v_float32>::vlanes(); x += 2*VTraits<v_float32>::vlanes(), weight_offset += VTraits<v_float32>::vlanes())
{
v_float32 v_src10, v_src11, v_src20, v_src21;
v_load_deinterleave(src1 + x, v_src10, v_src11);
@@ -257,7 +257,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1,
}
break;
case 3:
for(int weight_offset = 0 ; x <= width - 3*v_float32::nlanes; x += 3*v_float32::nlanes, weight_offset += v_float32::nlanes)
for(int weight_offset = 0 ; x <= width - 3*VTraits<v_float32>::vlanes(); x += 3*VTraits<v_float32>::vlanes(), weight_offset += VTraits<v_float32>::vlanes())
{
v_float32 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
@@ -273,7 +273,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1,
}
break;
case 4:
for(int weight_offset = 0 ; x <= width - 4*v_float32::nlanes; x += 4*v_float32::nlanes, weight_offset += v_float32::nlanes)
for(int weight_offset = 0 ; x <= width - 4*VTraits<v_float32>::vlanes(); x += 4*VTraits<v_float32>::vlanes(), weight_offset += VTraits<v_float32>::vlanes())
{
v_float32 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12, v_src13);
@@ -320,7 +320,7 @@ class BlendLinearInvoker :
T * const dst_row = dst->ptr<T>(y);

int x = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
x = blendLinearSimd(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn);
#endif

66 changes: 33 additions & 33 deletions modules/imgproc/src/canny.cpp
@@ -306,11 +306,11 @@ class parallelCanny : public ParallelLoopBody
src(_src), src2(_src), map(_map), _borderPeaksParallel(borderPeaksParallel),
low(_low), high(_high), aperture_size(_aperture_size), L2gradient(_L2gradient)
{
#if CV_SIMD
for(int i = 0; i < v_int8::nlanes; ++i)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for(int i = 0; i < VTraits<v_int8>::vlanes(); ++i)
{
smask[i] = 0;
smask[i + v_int8::nlanes] = (schar)-1;
smask[i + VTraits<v_int8>::vlanes()] = (schar)-1;
}
if (true)
_map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_SIMD_WIDTH + 1), CV_SIMD_WIDTH), CV_8UC1);
@@ -330,11 +330,11 @@ class parallelCanny : public ParallelLoopBody
src(_dx), src2(_dy), map(_map), _borderPeaksParallel(borderPeaksParallel),
low(_low), high(_high), aperture_size(0), L2gradient(_L2gradient)
{
#if CV_SIMD
for(int i = 0; i < v_int8::nlanes; ++i)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for(int i = 0; i < VTraits<v_int8>::vlanes(); ++i)
{
smask[i] = 0;
smask[i + v_int8::nlanes] = (schar)-1;
smask[i + VTraits<v_int8>::vlanes()] = (schar)-1;
}
if (true)
_map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_SIMD_WIDTH + 1), CV_SIMD_WIDTH), CV_8UC1);
@@ -396,7 +396,7 @@ class parallelCanny : public ParallelLoopBody
}

// _mag_p: previous row, _mag_a: actual row, _mag_n: next row
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
AutoBuffer<int> buffer(3 * (mapstep * cn + CV_SIMD_WIDTH));
_mag_p = alignPtr(buffer.data() + 1, CV_SIMD_WIDTH);
_mag_a = alignPtr(_mag_p + mapstep * cn, CV_SIMD_WIDTH);
@@ -436,8 +436,8 @@ class parallelCanny : public ParallelLoopBody
if (L2gradient)
{
int j = 0, width = src.cols * cn;
#if CV_SIMD
for ( ; j <= width - v_int16::nlanes; j += v_int16::nlanes)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for ( ; j <= width - VTraits<v_int16>::vlanes(); j += VTraits<v_int16>::vlanes())
{
v_int16 v_dx = vx_load((const short*)(_dx + j));
v_int16 v_dy = vx_load((const short*)(_dy + j));
@@ -447,8 +447,8 @@ class parallelCanny : public ParallelLoopBody
v_expand(v_dx, v_dxp_low, v_dxp_high);
v_expand(v_dy, v_dyp_low, v_dyp_high);

v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low);
v_store_aligned((int *)(_mag_n + j + v_int32::nlanes), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high);
v_store_aligned((int *)(_mag_n + j), v_add(v_mul(v_dxp_low, v_dxp_low), v_mul(v_dyp_low, v_dyp_low)));
v_store_aligned((int *)(_mag_n + j + VTraits<v_int32>::vlanes()), v_add(v_mul(v_dxp_high, v_dxp_high), v_mul(v_dyp_high, v_dyp_high)));
}
#endif
for ( ; j < width; ++j)
@@ -457,8 +457,8 @@ class parallelCanny : public ParallelLoopBody
else
{
int j = 0, width = src.cols * cn;
#if CV_SIMD
for(; j <= width - v_int16::nlanes; j += v_int16::nlanes)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for(; j <= width - VTraits<v_int16>::vlanes(); j += VTraits<v_int16>::vlanes())
{
v_int16 v_dx = vx_load((const short *)(_dx + j));
v_int16 v_dy = vx_load((const short *)(_dy + j));
@@ -470,8 +470,8 @@ class parallelCanny : public ParallelLoopBody
v_expand(v_dx, v_dx_ml, v_dx_mh);
v_expand(v_dy, v_dy_ml, v_dy_mh);

v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml);
v_store_aligned((int *)(_mag_n + j + v_int32::nlanes), v_dx_mh + v_dy_mh);
v_store_aligned((int *)(_mag_n + j), v_add(v_dx_ml, v_dy_ml));
v_store_aligned((int *)(_mag_n + j + VTraits<v_int32>::vlanes()), v_add(v_dx_mh, v_dy_mh));
}
#endif
for ( ; j < width; ++j)
@@ -515,7 +515,7 @@ class parallelCanny : public ParallelLoopBody

// From here actual src row is (i - 1)
// Set left and right border to 1
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
if (true)
_pmap = map.ptr<uchar>(i) + CV_SIMD_WIDTH;
else
@@ -537,22 +537,22 @@

const int TG22 = 13573;
int j = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
const v_int32 v_low = vx_setall_s32(low);
const v_int8 v_one = vx_setall_s8(1);

for (; j <= src.cols - v_int8::nlanes; j += v_int8::nlanes)
for (; j <= src.cols - VTraits<v_int8>::vlanes(); j += VTraits<v_int8>::vlanes())
{
v_store_aligned((signed char*)(_pmap + j), v_one);
v_int8 v_cmp = v_pack(v_pack(vx_load_aligned((const int*)(_mag_a + j )) > v_low,
vx_load_aligned((const int*)(_mag_a + j + v_int32::nlanes)) > v_low),
v_pack(vx_load_aligned((const int*)(_mag_a + j + 2*v_int32::nlanes)) > v_low,
vx_load_aligned((const int*)(_mag_a + j + 3*v_int32::nlanes)) > v_low));
v_int8 v_cmp = v_pack(v_pack(v_gt(vx_load_aligned((const int *)(_mag_a + j)), v_low),
v_gt(vx_load_aligned((const int *)(_mag_a + j + VTraits<v_int32>::vlanes())), v_low)),
v_pack(v_gt(vx_load_aligned((const int *)(_mag_a + j + 2 * VTraits<v_int32>::vlanes())), v_low),
v_gt(vx_load_aligned((const int *)(_mag_a + j + 3 * VTraits<v_int32>::vlanes())), v_low)));
while (v_check_any(v_cmp))
{
int l = v_scan_forward(v_cmp);
v_cmp &= vx_load(smask + v_int8::nlanes - 1 - l);
v_cmp = v_and(v_cmp, vx_load(smask + VTraits<v_int8>::vlanes() - 1 - l));
int k = j + l;

int m = _mag_a[k];
@@ -693,8 +693,8 @@ class parallelCanny : public ParallelLoopBody
ptrdiff_t mapstep;
int cn;
mutable Mutex mutex;
#if CV_SIMD
schar smask[2*v_int8::nlanes];
#if (CV_SIMD || CV_SIMD_SCALABLE)
schar smask[2*VTraits<v_int8>::max_nlanes];
#endif
};

@@ -718,31 +718,31 @@ class finalPass : public ParallelLoopBody
int j = 0;
uchar *pdst = dst.ptr<uchar>(i);
const uchar *pmap = map.ptr<uchar>(i + 1);
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
if (true)
pmap += CV_SIMD_WIDTH;
else
#endif
pmap += 1;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
const v_uint8 v_zero = vx_setzero_u8();
const v_uint8 v_ff = ~v_zero;
const v_uint8 v_ff = v_not(v_zero);
const v_uint8 v_two = vx_setall_u8(2);

for (; j <= dst.cols - v_uint8::nlanes; j += v_uint8::nlanes)
for (; j <= dst.cols - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
{
v_uint8 v_pmap = vx_load_aligned((const unsigned char*)(pmap + j));
v_pmap = v_select(v_pmap == v_two, v_ff, v_zero);
v_pmap = v_select(v_eq(v_pmap, v_two), v_ff, v_zero);
v_store((pdst + j), v_pmap);
}

if (j <= dst.cols - v_uint8::nlanes/2)
if (j <= dst.cols - VTraits<v_uint8>::vlanes()/2)
{
v_uint8 v_pmap = vx_load_low((const unsigned char*)(pmap + j));
v_pmap = v_select(v_pmap == v_two, v_ff, v_zero);
v_pmap = v_select(v_eq(v_pmap, v_two), v_ff, v_zero);
v_store_low((pdst + j), v_pmap);
j += v_uint8::nlanes/2;
j += VTraits<v_uint8>::vlanes()/2;
}
}
#endif