diff --git a/modules/calib3d/src/undistort.simd.hpp b/modules/calib3d/src/undistort.simd.hpp
index 7998a3b086ea..70bac4470245 100644
--- a/modules/calib3d/src/undistort.simd.hpp
+++ b/modules/calib3d/src/undistort.simd.hpp
@@ -89,8 +89,8 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
         s2(_s2), s3(_s3), s4(_s4)
     {
-#if CV_SIMD_64F
-        for (int i = 0; i < 2 * v_float64::nlanes; ++i)
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+        for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
         {
             s_x[i] = ir[0] * i;
             s_y[i] = ir[3] * i;
@@ -123,26 +123,26 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
         else
             CV_Assert(m1 != NULL);
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
         const v_float64 v_one = vx_setall_f64(1.0);
-        for (; j <= size.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes, _x += 2*v_float64::nlanes * ir[0], _y += 2*v_float64::nlanes * ir[3], _w += 2*v_float64::nlanes * ir[6])
+        for (; j <= size.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes(), _x += 2*VTraits<v_float64>::vlanes() * ir[0], _y += 2*VTraits<v_float64>::vlanes() * ir[3], _w += 2*VTraits<v_float64>::vlanes() * ir[6])
         {
             v_float64 m_0, m_1, m_2, m_3;
-            m_2 = v_one / (vx_setall_f64(_w) + vx_load(s_w));
-            m_3 = v_one / (vx_setall_f64(_w) + vx_load(s_w + v_float64::nlanes));
+            m_2 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w)));
+            m_3 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w + VTraits<v_float64>::vlanes())));
             m_0 = vx_setall_f64(_x);
             m_1 = vx_setall_f64(_y);
-            v_float64 x_0 = (m_0 + vx_load(s_x)) * m_2;
-            v_float64 x_1 = (m_0 + vx_load(s_x + v_float64::nlanes)) * m_3;
-            v_float64 y_0 = (m_1 + vx_load(s_y)) * m_2;
-            v_float64 y_1 = (m_1 + vx_load(s_y + v_float64::nlanes)) * m_3;
+            v_float64 x_0 = v_mul(v_add(m_0, vx_load(this->s_x)), m_2);
+            v_float64 x_1 = v_mul(v_add(m_0, vx_load(this->s_x + VTraits<v_float64>::vlanes())), m_3);
+            v_float64 y_0 = v_mul(v_add(m_1, vx_load(this->s_y)), m_2);
+            v_float64 y_1 = v_mul(v_add(m_1, vx_load(this->s_y + VTraits<v_float64>::vlanes())), m_3);

-            v_float64 xd_0 = x_0 * x_0;
-            v_float64 yd_0 = y_0 * y_0;
-            v_float64 xd_1 = x_1 * x_1;
-            v_float64 yd_1 = y_1 * y_1;
+            v_float64 xd_0 = v_mul(x_0, x_0);
+            v_float64 yd_0 = v_mul(y_0, y_0);
+            v_float64 xd_1 = v_mul(x_1, x_1);
+            v_float64 yd_1 = v_mul(y_1, y_1);

-            v_float64 r2_0 = xd_0 + yd_0;
-            v_float64 r2_1 = xd_1 + yd_1;
+            v_float64 r2_0 = v_add(xd_0, yd_0);
+            v_float64 r2_1 = v_add(xd_1, yd_1);

             m_1 = vx_setall_f64(k3);
             m_2 = vx_setall_f64(k2);
@@ -151,18 +151,18 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
             m_1 = v_muladd(v_muladd(v_muladd(m_1, r2_1, m_2), r2_1, m_3), r2_1, v_one);
             m_3 = vx_setall_f64(k6);
             m_2 = vx_setall_f64(k5);
-            m_0 /= v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(k4)), r2_0, v_one);
-            m_1 /= v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(k4)), r2_1, v_one);
+            m_0 = v_div(m_0, v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(this->k4)), r2_0, v_one));
+            m_1 = v_div(m_1, v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(this->k4)), r2_1, v_one));
             m_3 = vx_setall_f64(2.0);
             xd_0 = v_muladd(m_3, xd_0, r2_0);
             yd_0 = v_muladd(m_3, yd_0, r2_0);
             xd_1 = v_muladd(m_3, xd_1, r2_1);
             yd_1 = v_muladd(m_3, yd_1, r2_1);
-            m_2 = x_0 * y_0 * m_3;
-            m_3 = x_1 * y_1 * m_3;
+            m_2 = v_mul(v_mul(x_0, y_0), m_3);
+            m_3 = v_mul(v_mul(x_1, y_1), m_3);

-            x_0 *= m_0; y_0 *= m_0; x_1 *= m_1; y_1 *= m_1;
+            x_0 = v_mul(x_0, m_0); y_0 = v_mul(y_0, m_0); x_1 = v_mul(x_1, m_1); y_1 = v_mul(y_1, m_1);

             m_0 = vx_setall_f64(p1);
             m_1 = vx_setall_f64(p2);
@@ -176,8 +176,8 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
             xd_1 = v_muladd(m_0, m_3, xd_1);
             yd_1 = v_muladd(m_1, m_3, yd_1);

-            m_0 = r2_0 * r2_0;
-            m_1 = r2_1 * r2_1;
+            m_0 = v_mul(r2_0, r2_0);
+            m_1 = v_mul(r2_1, r2_1);
             m_2 = vx_setall_f64(s2);
             m_3 = vx_setall_f64(s1);
             xd_0 = v_muladd(m_3, r2_0, v_muladd(m_2, m_0, xd_0));
@@ -203,17 +203,17 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
             r2_0 = v_muladd(m_0, xd_0, v_muladd(m_1, yd_0, m_2));
             r2_1 = v_muladd(m_0, xd_1, v_muladd(m_1, yd_1, m_2));
             m_0 = vx_setzero_f64();
-            r2_0 = v_select(r2_0 == m_0, v_one, v_one / r2_0);
-            r2_1 = v_select(r2_1 == m_0, v_one, v_one / r2_1);
+            r2_0 = v_select(v_eq(r2_0, m_0), v_one, v_div(v_one, r2_0));
+            r2_1 = v_select(v_eq(r2_1, m_0), v_one, v_div(v_one, r2_1));

             m_0 = vx_setall_f64(fx);
             m_1 = vx_setall_f64(u0);
             m_2 = vx_setall_f64(fy);
             m_3 = vx_setall_f64(v0);
-            x_0 = v_muladd(m_0 * r2_0, x_0, m_1);
-            y_0 = v_muladd(m_2 * r2_0, y_0, m_3);
-            x_1 = v_muladd(m_0 * r2_1, x_1, m_1);
-            y_1 = v_muladd(m_2 * r2_1, y_1, m_3);
+            x_0 = v_muladd(v_mul(m_0, r2_0), x_0, m_1);
+            y_0 = v_muladd(v_mul(m_2, r2_0), y_0, m_3);
+            x_1 = v_muladd(v_mul(m_0, r2_1), x_1, m_1);
+            y_1 = v_muladd(v_mul(m_2, r2_1), y_1, m_3);

             if (m1type == CV_32FC1)
             {
@@ -225,20 +225,20 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
                 v_float32 mf0, mf1;
                 v_zip(v_cvt_f32(x_0, x_1), v_cvt_f32(y_0, y_1), mf0, mf1);
                 v_store(&m1f[j * 2], mf0);
-                v_store(&m1f[j * 2 + v_float32::nlanes], mf1);
+                v_store(&m1f[j * 2 + VTraits<v_float32>::vlanes()], mf1);
             }
             else // m1type == CV_16SC2
             {
                 m_0 = vx_setall_f64(INTER_TAB_SIZE);
-                x_0 *= m_0; x_1 *= m_0; y_0 *= m_0; y_1 *= m_0;
+                x_0 = v_mul(x_0, m_0); x_1 = v_mul(x_1, m_0); y_0 = v_mul(y_0, m_0); y_1 = v_mul(y_1, m_0);
                 v_int32 mask = vx_setall_s32(INTER_TAB_SIZE - 1);
                 v_int32 iu = v_round(x_0, x_1);
                 v_int32 iv = v_round(y_0, y_1);
-                v_pack_u_store(&m2[j], (iu & mask) + (iv & mask) * vx_setall_s32(INTER_TAB_SIZE));
+                v_pack_u_store(&m2[j], v_add(v_and(iu, mask), v_mul(v_and(iv, mask), vx_setall_s32(INTER_TAB_SIZE))));
                 v_int32 out0, out1;
-                v_zip(iu >> INTER_BITS, iv >> INTER_BITS, out0, out1);
+                v_zip(v_shr<INTER_BITS>(iu), v_shr<INTER_BITS>(iv), out0, out1);
                 v_store(&m1[j * 2], v_pack(out0, out1));
             }
         }
@@ -302,10 +302,10 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
     double s2;
     double s3;
     double s4;
-#if CV_SIMD_64F
-    double s_x[2*v_float64::nlanes];
-    double s_y[2*v_float64::nlanes];
-    double s_w[2*v_float64::nlanes];
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    double s_x[2*VTraits<v_float64>::max_nlanes];
+    double s_y[2*VTraits<v_float64>::max_nlanes];
+    double s_w[2*VTraits<v_float64>::max_nlanes];
 #endif
 };
 }
diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index 88a002145ae0..bf9a247054c1 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -972,6 +972,15 @@ namespace CV__SIMD_NAMESPACE {
     { \
         return a op b; \
     }
+    #define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) \
+    inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a == b; \
+    } \
+    inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a != b; \
+    }

     #define OPENCV_HAL_WRAP_CMP(_Tpvec) \
     OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
@@ -984,11 +993,11 @@ namespace CV__SIMD_NAMESPACE {
     OPENCV_HAL_WRAP_CMP(v_uint8)
     OPENCV_HAL_WRAP_CMP(v_uint16)
     OPENCV_HAL_WRAP_CMP(v_uint32)
-    // OPENCV_HAL_WRAP_CMP(v_uint64)
+    OPENCV_HAL_WRAP_EQ_OP(v_uint64)
    OPENCV_HAL_WRAP_CMP(v_int8)
    OPENCV_HAL_WRAP_CMP(v_int16)
OPENCV_HAL_WRAP_CMP(v_int32) - // OPENCV_HAL_WRAP_CMP(v_int64) + OPENCV_HAL_WRAP_EQ_OP(v_int64) OPENCV_HAL_WRAP_CMP(v_float32) #if CV_SIMD_64F OPENCV_HAL_WRAP_CMP(v_float64) @@ -997,9 +1006,11 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_CMP(v_uint8x16) OPENCV_HAL_WRAP_CMP(v_uint16x8) OPENCV_HAL_WRAP_CMP(v_uint32x4) + OPENCV_HAL_WRAP_EQ_OP(v_uint64x2) OPENCV_HAL_WRAP_CMP(v_int8x16) OPENCV_HAL_WRAP_CMP(v_int16x8) OPENCV_HAL_WRAP_CMP(v_int32x4) + OPENCV_HAL_WRAP_EQ_OP(v_int64x2) OPENCV_HAL_WRAP_CMP(v_float32x4) #if CV_SIMD_64F OPENCV_HAL_WRAP_CMP(v_float64x2) @@ -1009,9 +1020,11 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_CMP(v_uint8x32) OPENCV_HAL_WRAP_CMP(v_uint16x16) OPENCV_HAL_WRAP_CMP(v_uint32x8) + OPENCV_HAL_WRAP_EQ_OP(v_uint64x4) OPENCV_HAL_WRAP_CMP(v_int8x32) OPENCV_HAL_WRAP_CMP(v_int16x16) OPENCV_HAL_WRAP_CMP(v_int32x8) + OPENCV_HAL_WRAP_EQ_OP(v_int64x4) OPENCV_HAL_WRAP_CMP(v_float32x8) #if CV_SIMD_64F OPENCV_HAL_WRAP_CMP(v_float64x4) diff --git a/modules/core/src/arithm.simd.hpp b/modules/core/src/arithm.simd.hpp index 1c97e91fbe2e..2c26680b51cf 100644 --- a/modules/core/src/arithm.simd.hpp +++ b/modules/core/src/arithm.simd.hpp @@ -69,7 +69,7 @@ #define DEFINE_SIMD_F32(fun, ...) \ DEFINE_SIMD(__CV_CAT(fun, 32f), float, v_float32, __VA_ARGS__) -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) #define DEFINE_SIMD_F64(fun, ...) \ DEFINE_SIMD(__CV_CAT(fun, 64f), double, v_float64, __VA_ARGS__) #else @@ -104,7 +104,7 @@ namespace cv { namespace hal { #ifdef ARITHM_DEFINITIONS_ONLY -#if !CV_SIMD_64F +#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F) typedef int v_float64; // dummy #endif @@ -266,7 +266,7 @@ struct op_absdiff template<> struct op_absdiff { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_int8 r(const v_int8& a, const v_int8& b) { return v_absdiffs(a, b); } #endif @@ -276,7 +276,7 @@ struct op_absdiff template<> struct op_absdiff { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_int16 r(const v_int16& a, const v_int16& b) { return v_absdiffs(a, b); } #endif @@ -286,7 +286,7 @@ struct op_absdiff template<> struct op_absdiff { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_int32 r(const v_int32& a, const v_int32& b) { return v_reinterpret_as_s32(v_absdiff(a, b)); } #endif @@ -331,7 +331,7 @@ struct op_not //////////////////////////// Loaders ///////////////////////////////// -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) template< template class OP, typename T1, typename Tvec> struct bin_loader @@ -396,7 +396,7 @@ template class OP, typename T1, typename Tv static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height) { typedef OP op; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) typedef bin_loader ldr; const int wide_step = VTraits::vlanes(); #if !CV_NEON && CV_SIMD_WIDTH == 16 @@ -414,7 +414,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, { int x = 0; - #if CV_SIMD || CV_SIMD_SCALABLE + #if (CV_SIMD || CV_SIMD_SCALABLE) #if !CV_NEON && !CV_MSA if (is_aligned(src1, src2, dst)) { @@ -464,7 +464,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, vx_cleanup(); } -#if !CV_SIMD_64F +#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F) template class OP, typename T1, typename Tvec> static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, 
int width, int height) { @@ -496,7 +496,7 @@ static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t #define BIN_LOOP64F bin_loop_nosimd #else #define BIN_LOOP64F bin_loop -#endif //!CV_SIMD_64F +#endif //!(CV_SIMD_64F || CV_SIMD_SCALABLE_64F) #endif // ARITHM_DEFINITIONS_ONLY @@ -621,7 +621,7 @@ struct op_cmpne //////////////////////////// Loaders ///////////////////////////////// -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) // todo: add support for RW alignment & stream template class OP, typename T1, typename Tvec> struct cmp_loader_n @@ -701,7 +701,7 @@ template class OP, typename T1, typename Tv static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height) { typedef OP op; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) typedef cmp_loader_n ldr; const int wide_step = VTraits::vlanes() * sizeof(T1); #endif // CV_SIMD @@ -713,7 +713,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, { int x = 0; - #if CV_SIMD || CV_SIMD_SCALABLE + #if (CV_SIMD || CV_SIMD_SCALABLE) for (; x <= width - wide_step; x += wide_step) { ldr::l(src1 + x, src2 + x, dst + x); @@ -768,7 +768,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, } } -#if !CV_SIMD_64F +#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F) template< template class OP, typename T1> static void cmp_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height) { @@ -822,7 +822,7 @@ static void cmp_loop_nosimd(const double* src1, size_t step1, const double* src2 break; } } -#endif // !CV_SIMD_64F +#endif // !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F) #endif // ARITHM_DEFINITIONS_ONLY @@ -880,7 +880,7 @@ DEFINE_SIMD_ALL(cmp) //////////////////////////// Loaders /////////////////////////////// -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) // todo: add support for RW alignment & stream template class OP, typename T1, typename T2, typename Tvec> struct scalar_loader_n @@ -1099,16 +1099,16 @@ struct scalar_loader_n }; #endif // CV_SIMD -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) template class OP> struct scalar_loader_n { typedef OP op; typedef OP op64; - enum {step = v_int32::nlanes}; static inline void l(const int* src1, const int* src2, const double* scalar, int* dst) { + const int step = VTraits::vlanes(); v_int32 v_src1 = vx_load(src1); v_int32 v_src2 = vx_load(src2); v_int32 v_src1s = vx_load(src1 + step); @@ -1125,6 +1125,7 @@ struct scalar_loader_n } static inline void l(const int* src1, const double* scalar, int* dst) { + const int step = VTraits::vlanes(); v_int32 v_src1 = vx_load(src1); v_int32 v_src1s = vx_load(src1 + step); @@ -1169,10 +1170,10 @@ struct scalar_loader_n { typedef OP op; typedef OP op64; - enum {step = v_float32::nlanes}; static inline void l(const float* src1, const float* src2, const double* scalar, float* dst) { + const int step = VTraits::vlanes(); v_float32 v_src1 = vx_load(src1); v_float32 v_src2 = vx_load(src2); v_float32 v_src1s = vx_load(src1 + step); @@ -1186,6 +1187,7 @@ struct scalar_loader_n } static inline void l(const float* src1, const double* scalar, float* dst) { + const int step = VTraits::vlanes(); v_float32 v_src1 = vx_load(src1); v_float32 v_src1s = vx_load(src1 + step); @@ -1226,10 +1228,10 @@ template class OP> struct scalar_loader_n { typedef OP op; - enum {step = v_float64::nlanes}; static inline void l(const 
double* src1, const double* src2, const double* scalar, double* dst) { + const int step = VTraits::vlanes(); v_float64 v_src1 = vx_load(src1); v_float64 v_src2 = vx_load(src2); v_float64 v_src1s = vx_load(src1 + step); @@ -1243,6 +1245,7 @@ struct scalar_loader_n } static inline void l(const double* src1, const double* scalar, double* dst) { + const int step = VTraits::vlanes(); v_float64 v_src1 = vx_load(src1); v_float64 v_src1s = vx_load(src1 + step); @@ -1253,7 +1256,7 @@ struct scalar_loader_n v_store(dst + step, r1); } }; -#endif // CV_SIMD_64F +#endif // (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) //////////////////////////// Loops ///////////////////////////////// @@ -1263,7 +1266,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste T1* dst, size_t step, int width, int height, const T2* scalar) { typedef OP op; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) typedef scalar_loader_n ldr; const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits::vlanes() * 2 : sizeof(T1) == sizeof(uchar) ? VTraits::vlanes() / 2 : VTraits::vlanes(); @@ -1277,7 +1280,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste { int x = 0; - #if CV_SIMD || CV_SIMD_SCALABLE + #if (CV_SIMD || CV_SIMD_SCALABLE) for (; x <= width - wide_step; x += wide_step) { ldr::l(src1 + x, src2 + x, scalar, dst + x); @@ -1309,7 +1312,7 @@ template class OP, typename T1 static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar) { typedef OP op; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) typedef scalar_loader_n ldr; const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits::vlanes() * 2 : sizeof(T1) == sizeof(uchar) ? VTraits::vlanes() / 2 : VTraits::vlanes(); @@ -1322,7 +1325,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int { int x = 0; - #if CV_SIMD || CV_SIMD_SCALABLE + #if (CV_SIMD || CV_SIMD_SCALABLE) for (; x <= width - wide_step; x += wide_step) { ldr::l(src1 + x, scalar, dst + x); @@ -1349,7 +1352,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int vx_cleanup(); } -#if !CV_SIMD_64F +#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F) // dual source template class OP, typename T1, typename T2, typename Tvec> static void scalar_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, @@ -1413,7 +1416,7 @@ static void scalar_loop_nosimd(const T1* src1, size_t step1, T1* dst, size_t ste #define SCALAR_LOOP64F scalar_loop_nosimd #else #define SCALAR_LOOP64F scalar_loop -#endif // !CV_SIMD_64F +#endif // !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F) #endif // ARITHM_DEFINITIONS_ONLY @@ -1437,7 +1440,7 @@ struct op_mul template struct op_mul_scale { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); @@ -1453,7 +1456,7 @@ struct op_mul_scale template<> struct op_mul_scale { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar) { const v_float64 v_scalar = vx_setall_f64(*scalar); @@ -1578,7 +1581,7 @@ struct op_div_f template struct op_div_scale { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); @@ 
-1600,7 +1603,7 @@ struct op_div_scale template<> struct op_div_scale { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); @@ -1614,7 +1617,7 @@ struct op_div_scale template<> struct op_div_scale { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar) { const v_float64 v_scalar = vx_setall_f64(*scalar); @@ -1686,7 +1689,7 @@ DEFINE_SIMD_ALL(div, div_loop) template struct op_add_scale { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) { const v_float32 v_alpha = vx_setall_f32(*scalar); @@ -1702,7 +1705,7 @@ struct op_add_scale template<> struct op_add_scale { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar) { const v_float64 v_alpha = vx_setall_f64(*scalar); @@ -1719,7 +1722,7 @@ struct op_add_scale template struct op_add_weighted { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars) { const v_float32 v_alpha = vx_setall_f32(scalars[0]); @@ -1737,7 +1740,7 @@ struct op_add_weighted template<> struct op_add_weighted { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalars) { const v_float64 v_alpha = vx_setall_f64(scalars[0]); @@ -1836,7 +1839,7 @@ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d) template struct op_recip { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_float32 r(const v_float32& a, const T2* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); @@ -1858,7 +1861,7 @@ struct op_recip template<> struct op_recip { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_float32 r(const v_float32& a, const float* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); @@ -1872,7 +1875,7 @@ struct op_recip template<> struct op_recip { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) static inline v_float64 r(const v_float64& a, const double* scalar) { const v_float64 v_scalar = vx_setall_f64(*scalar); diff --git a/modules/core/src/has_non_zero.simd.hpp b/modules/core/src/has_non_zero.simd.hpp index 6ea8bcd7d2d1..e9f9b683d695 100644 --- a/modules/core/src/has_non_zero.simd.hpp +++ b/modules/core/src/has_non_zero.simd.hpp @@ -87,11 +87,11 @@ static bool hasNonZero8u( const uchar* src, size_t len ) { bool res = false; const uchar* srcEnd = src+len; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) typedef v_uint8 v_type; const v_type v_zero = vx_setzero_u8(); constexpr const int unrollCount = 2; - int step = v_type::nlanes * unrollCount; + int step = VTraits::vlanes() * unrollCount; int len0 = len & -step; const uchar* srcSimdEnd = src+len0; @@ -99,10 +99,10 @@ static bool hasNonZero8u( const uchar* src, size_t len ) while(!res && countSIMD--) { v_type v0 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v1 = vx_load(src); - src += v_type::nlanes; - res = v_check_any(((v0 | v1) != v_zero)); + src += VTraits::vlanes(); + res = v_check_any((v_ne(v_or(v0, v1), v_zero))); } v_cleanup(); @@ -114,11 +114,11 @@ static bool hasNonZero16u( const 
ushort* src, size_t len ) { bool res = false; const ushort* srcEnd = src+len; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) typedef v_uint16 v_type; const v_type v_zero = vx_setzero_u16(); constexpr const int unrollCount = 4; - int step = v_type::nlanes * unrollCount; + int step = VTraits::vlanes() * unrollCount; int len0 = len & -step; const ushort* srcSimdEnd = src+len0; @@ -126,16 +126,16 @@ static bool hasNonZero16u( const ushort* src, size_t len ) while(!res && countSIMD--) { v_type v0 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v1 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v2 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v3 = vx_load(src); - src += v_type::nlanes; - v0 |= v1; - v2 |= v3; - res = v_check_any(((v0 | v2) != v_zero)); + src += VTraits::vlanes(); + v0 = v_or(v0, v1); + v2 = v_or(v2, v3); + res = v_check_any((v_ne(v_or(v0, v2), v_zero))); } v_cleanup(); @@ -147,11 +147,11 @@ static bool hasNonZero32s( const int* src, size_t len ) { bool res = false; const int* srcEnd = src+len; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) typedef v_int32 v_type; const v_type v_zero = vx_setzero_s32(); constexpr const int unrollCount = 8; - int step = v_type::nlanes * unrollCount; + int step = VTraits::vlanes() * unrollCount; int len0 = len & -step; const int* srcSimdEnd = src+len0; @@ -159,29 +159,29 @@ static bool hasNonZero32s( const int* src, size_t len ) while(!res && countSIMD--) { v_type v0 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v1 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v2 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v3 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v4 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v5 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v6 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v7 = vx_load(src); - src += v_type::nlanes; - v0 |= v1; - v2 |= v3; - v4 |= v5; - v6 |= v7; - - v0 |= v2; - v4 |= v6; - res = v_check_any(((v0 | v4) != v_zero)); + src += VTraits::vlanes(); + v0 = v_or(v0, v1); + v2 = v_or(v2, v3); + v4 = v_or(v4, v5); + v6 = v_or(v6, v7); + + v0 = v_or(v0, v2); + v4 = v_or(v4, v6); + res = v_check_any((v_ne(v_or(v0, v4), v_zero))); } v_cleanup(); @@ -193,11 +193,11 @@ static bool hasNonZero32f( const float* src, size_t len ) { bool res = false; const float* srcEnd = src+len; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) typedef v_float32 v_type; const v_type v_zero = vx_setzero_f32(); constexpr const int unrollCount = 8; - int step = v_type::nlanes * unrollCount; + int step = VTraits::vlanes() * unrollCount; int len0 = len & -step; const float* srcSimdEnd = src+len0; @@ -205,30 +205,30 @@ static bool hasNonZero32f( const float* src, size_t len ) while(!res && countSIMD--) { v_type v0 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v1 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v2 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v3 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v4 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v5 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v6 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type 
v7 = vx_load(src); - src += v_type::nlanes; - v0 |= v1; - v2 |= v3; - v4 |= v5; - v6 |= v7; - - v0 |= v2; - v4 |= v6; + src += VTraits::vlanes(); + v0 = v_or(v0, v1); + v2 = v_or(v2, v3); + v4 = v_or(v4, v5); + v6 = v_or(v6, v7); + + v0 = v_or(v0, v2); + v4 = v_or(v4, v6); //res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ - res = !v_check_all(((v0 | v4) == v_zero)); + res = !v_check_all((v_eq(v_or(v0, v4), v_zero))); } v_cleanup(); @@ -240,11 +240,11 @@ static bool hasNonZero64f( const double* src, size_t len ) { bool res = false; const double* srcEnd = src+len; -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) typedef v_float64 v_type; const v_type v_zero = vx_setzero_f64(); constexpr const int unrollCount = 16; - int step = v_type::nlanes * unrollCount; + int step = VTraits::vlanes() * unrollCount; int len0 = len & -step; const double* srcSimdEnd = src+len0; @@ -252,55 +252,55 @@ static bool hasNonZero64f( const double* src, size_t len ) while(!res && countSIMD--) { v_type v0 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v1 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v2 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v3 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v4 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v5 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v6 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v7 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v8 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v9 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v10 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v11 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v12 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v13 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v14 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v15 = vx_load(src); - src += v_type::nlanes; - v0 |= v1; - v2 |= v3; - v4 |= v5; - v6 |= v7; - v8 |= v9; - v10 |= v11; - v12 |= v13; - v14 |= v15; - - v0 |= v2; - v4 |= v6; - v8 |= v10; - v12 |= v14; - - v0 |= v4; - v8 |= v12; + src += VTraits::vlanes(); + v0 = v_or(v0, v1); + v2 = v_or(v2, v3); + v4 = v_or(v4, v5); + v6 = v_or(v6, v7); + v8 = v_or(v8, v9); + v10 = v_or(v10, v11); + v12 = v_or(v12, v13); + v14 = v_or(v14, v15); + + v0 = v_or(v0, v2); + v4 = v_or(v4, v6); + v8 = v_or(v8, v10); + v12 = v_or(v12, v14); + + v0 = v_or(v0, v4); + v8 = v_or(v8, v12); //res = v_check_any(((v0 | v8) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ - res = !v_check_all(((v0 | v8) == v_zero)); + res = !v_check_all((v_eq(v_or(v0, v8), v_zero))); } v_cleanup(); diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp index 43c6d07d5897..12376c250841 100644 --- a/modules/core/src/lapack.cpp +++ b/modules/core/src/lapack.cpp @@ -276,7 +276,7 @@ template struct VBLAS int givens(T*, T*, int, T, T) const { return 0; } }; -#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE_64F +#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related template<> inline int VBLAS::dot(const float* a, const float* b, int n, 
float* result) const { if( n < 2*VTraits::vlanes() ) diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp index 62aacc0d63f5..3a9dbd9be800 100644 --- a/modules/core/src/matmul.simd.hpp +++ b/modules/core/src/matmul.simd.hpp @@ -2549,6 +2549,7 @@ double dotProd_16s(const short* src1, const short* src2, int len) double dotProd_32s(const int* src1, const int* src2, int len) { #if CV_SIMD_64F // TODO: enable for CV_SIMD_SCALABLE_64F +// Test failed on RVV(QEMU): Too big difference (=1.20209e-08 > 1.11022e-12) double r = .0; int i = 0; const int step = VTraits::vlanes(); diff --git a/modules/core/src/mean.simd.hpp b/modules/core/src/mean.simd.hpp index bb815adc1ccd..e34293ee385d 100644 --- a/modules/core/src/mean.simd.hpp +++ b/modules/core/src/mean.simd.hpp @@ -24,7 +24,7 @@ struct SumSqr_SIMD } }; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) template <> struct SumSqr_SIMD diff --git a/modules/dnn/src/int8layers/convolution_layer.cpp b/modules/dnn/src/int8layers/convolution_layer.cpp index 0b3c5480a071..60301a406c57 100644 --- a/modules/dnn/src/int8layers/convolution_layer.cpp +++ b/modules/dnn/src/int8layers/convolution_layer.cpp @@ -19,7 +19,7 @@ namespace cv namespace dnn { -#if CV_SIMD +#if CV_SIMD128 static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b, v_int32x4& out0, v_int32x4& out1, v_int32x4& out2, v_int32x4& out3) { @@ -1015,7 +1015,7 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl outptr[0] = std::min(std::max(out1, -128), 127); out_j = 1; } - #if CV_SIMD + #if CV_SIMD128 if( stride_w == 1 ) { const int out_delta = 16; diff --git a/modules/dnn/src/int8layers/fully_connected_layer.cpp b/modules/dnn/src/int8layers/fully_connected_layer.cpp index 826ed12ad955..b8e3bd6ee542 100644 --- a/modules/dnn/src/int8layers/fully_connected_layer.cpp +++ b/modules/dnn/src/int8layers/fully_connected_layer.cpp @@ -305,7 +305,7 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8 #endif { int i = 0; - #if CV_SIMD + #if CV_SIMD128 for( ; i <= nw - 4; i += 4, wptr += 4*wstep ) { v_int32x4 vs0 = v_setzero_s32(), vs1 = v_setzero_s32(), diff --git a/modules/imgproc/src/accum.simd.hpp b/modules/imgproc/src/accum.simd.hpp index 7fe7aabeaf82..13363026135d 100644 --- a/modules/imgproc/src/accum.simd.hpp +++ b/modules/imgproc/src/accum.simd.hpp @@ -475,9 +475,9 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -493,8 +493,8 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) #else for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_store(dst + x, vx_load(dst + x) + vx_load(src + x)); - v_store(dst + x + step, vx_load(dst + x + step) + vx_load(src + x + step)); + v_store(dst + x, v_add(vx_load(dst + x), vx_load(src + x))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), vx_load(src + x + step))); } #endif // CV_AVX && !CV_AVX2 } @@ -508,11 +508,11 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) v_uint16 v_masku16 = vx_load_expand(mask + x); v_uint32 v_masku320, v_masku321; v_expand(v_masku16, v_masku320, v_masku321); - v_float32 v_mask0 = 
v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0))); - v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0))); + v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_masku320, v_reinterpret_as_u32(v_0)))); + v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_masku321, v_reinterpret_as_u32(v_0)))); - v_store(dst + x, vx_load(dst + x) + (vx_load(src + x) & v_mask0)); - v_store(dst + x + step, vx_load(dst + x + step) + (vx_load(src + x + step) & v_mask1)); + v_store(dst + x, v_add(vx_load(dst + x), v_and(vx_load(src + x), v_mask0))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(vx_load(src + x + step), v_mask1))); } } else if (cn == 3) @@ -522,25 +522,25 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) v_uint16 v_masku16 = vx_load_expand(mask + x); v_uint32 v_masku320, v_masku321; v_expand(v_masku16, v_masku320, v_masku321); - v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0))); - v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0))); + v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_masku320, v_reinterpret_as_u32(v_0)))); + v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_masku321, v_reinterpret_as_u32(v_0)))); v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21); - v_src00 = v_src00 & v_mask0; - v_src01 = v_src01 & v_mask1; - v_src10 = v_src10 & v_mask0; - v_src11 = v_src11 & v_mask1; - v_src20 = v_src20 & v_mask0; - v_src21 = v_src21 & v_mask1; + v_src00 = v_and(v_src00, v_mask0); + v_src01 = v_and(v_src01, v_mask1); + v_src10 = v_and(v_src10, v_mask0); + v_src11 = v_and(v_src11, v_mask1); + v_src20 = v_and(v_src20, v_mask0); + v_src21 = v_and(v_src21, v_mask1); v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20)); + v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21)); } } } @@ -862,9 +862,9 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_float32::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -889,8 +889,8 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn v_float64 v_src0 = v_cvt_f64(v_src); v_float64 v_src1 = v_cvt_f64_high(v_src); - v_store(dst + x, vx_load(dst + x) + v_src0); - v_store(dst + x + step, vx_load(dst + x + step) + v_src1); + v_store(dst + x, v_add(vx_load(dst + x), v_src0)); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_src1)); } #endif // CV_AVX && !CV_AVX2 } @@ -904,15 +904,15 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn v_uint32 v_masku32 = 
vx_load_expand_q(mask + x); v_uint64 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); - v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0))); + v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0))); v_float32 v_src = vx_load(src + x); - v_float64 v_src0 = v_cvt_f64(v_src) & v_mask0; - v_float64 v_src1 = v_cvt_f64_high(v_src) & v_mask1; + v_float64 v_src0 = v_and(v_cvt_f64(v_src), v_mask0); + v_float64 v_src1 = v_and(v_cvt_f64_high(v_src), v_mask1); - v_store(dst + x, vx_load(dst + x) + v_src0); - v_store(dst + x + step, vx_load(dst + x + step) + v_src1); + v_store(dst + x, v_add(vx_load(dst + x), v_src0)); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_src1)); } } else if (cn == 3) @@ -922,24 +922,24 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn v_uint32 v_masku32 = vx_load_expand_q(mask + x); v_uint64 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); - v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0))); + v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0))); v_float32 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_float64 v_src00 = v_cvt_f64(v_src0) & v_mask0; - v_float64 v_src01 = v_cvt_f64_high(v_src0) & v_mask1; - v_float64 v_src10 = v_cvt_f64(v_src1) & v_mask0; - v_float64 v_src11 = v_cvt_f64_high(v_src1) & v_mask1; - v_float64 v_src20 = v_cvt_f64(v_src2) & v_mask0; - v_float64 v_src21 = v_cvt_f64_high(v_src2) & v_mask1; + v_float64 v_src00 = v_and(v_cvt_f64(v_src0), v_mask0); + v_float64 v_src01 = v_and(v_cvt_f64_high(v_src0), v_mask1); + v_float64 v_src10 = v_and(v_cvt_f64(v_src1), v_mask0); + v_float64 v_src11 = v_and(v_cvt_f64_high(v_src1), v_mask1); + v_float64 v_src20 = v_and(v_cvt_f64(v_src2), v_mask0); + v_float64 v_src21 = v_and(v_cvt_f64_high(v_src2), v_mask1); v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20)); + v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21)); } } } @@ -950,9 +950,9 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_float64::nlanes * 2; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes() * 2; + const int step = VTraits::vlanes(); if (!mask) { @@ -971,8 +971,8 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c v_float64 v_src0 = vx_load(src + x); v_float64 v_src1 = vx_load(src + x + step); - v_store(dst + x, vx_load(dst + x) + v_src0); - v_store(dst + x + step, vx_load(dst + x + step) + v_src1); + v_store(dst + x, 
v_add(vx_load(dst + x), v_src0)); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_src1)); } #endif // CV_AVX && !CV_AVX2 } @@ -986,14 +986,14 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c v_uint32 v_masku32 = vx_load_expand_q(mask + x); v_uint64 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); - v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0))); + v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0))); v_float64 v_src0 = vx_load(src + x); v_float64 v_src1 = vx_load(src + x + step); - v_store(dst + x, vx_load(dst + x) + (v_src0 & v_mask0)); - v_store(dst + x + step, vx_load(dst + x + step) + (v_src1 & v_mask1)); + v_store(dst + x, v_add(vx_load(dst + x), v_and(v_src0, v_mask0))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(v_src1, v_mask1))); } } else if (cn == 3) @@ -1003,25 +1003,25 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c v_uint32 v_masku32 = vx_load_expand_q(mask + x); v_uint64 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); - v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0))); + v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0))); v_float64 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21); - v_src00 = v_src00 & v_mask0; - v_src01 = v_src01 & v_mask1; - v_src10 = v_src10 & v_mask0; - v_src11 = v_src11 & v_mask1; - v_src20 = v_src20 & v_mask0; - v_src21 = v_src21 & v_mask1; + v_src00 = v_and(v_src00, v_mask0); + v_src01 = v_and(v_src01, v_mask1); + v_src10 = v_and(v_src10, v_mask0); + v_src11 = v_and(v_src11, v_mask1); + v_src20 = v_and(v_src20, v_mask0); + v_src21 = v_and(v_src21, v_mask1); v_float64 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20)); + v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21)); } } } @@ -1256,9 +1256,9 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -1293,12 +1293,12 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int v_uint16 v_mask16 = vx_load_expand(mask + x); v_uint32 v_mask_0, v_mask_1; v_expand(v_mask16, v_mask_0, v_mask_1); - v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0)); - v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0)); + 
v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask_0, v_0))); + v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask_1, v_0))); v_float32 v_src0 = vx_load(src + x); v_float32 v_src1 = vx_load(src + x + step); - v_src0 = v_src0 & v_mask0; - v_src1 = v_src1 & v_mask1; + v_src0 = v_and(v_src0, v_mask0); + v_src1 = v_and(v_src1, v_mask1); v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x))); v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step))); @@ -1311,18 +1311,18 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int v_uint16 v_mask16 = vx_load_expand(mask + x); v_uint32 v_mask_0, v_mask_1; v_expand(v_mask16, v_mask_0, v_mask_1); - v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0)); - v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0)); + v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask_0, v_0))); + v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask_1, v_0))); v_float32 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21); - v_src00 = v_src00 & v_mask0; - v_src01 = v_src01 & v_mask1; - v_src10 = v_src10 & v_mask0; - v_src11 = v_src11 & v_mask1; - v_src20 = v_src20 & v_mask0; - v_src21 = v_src21 & v_mask1; + v_src00 = v_and(v_src00, v_mask0); + v_src01 = v_and(v_src01, v_mask1); + v_src10 = v_and(v_src10, v_mask0); + v_src11 = v_and(v_src11, v_mask1); + v_src20 = v_and(v_src20, v_mask0); + v_src21 = v_and(v_src21, v_mask1); v_float32 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); @@ -1625,9 +1625,9 @@ void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, in void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_float32::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -1667,9 +1667,9 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint32 v_mask = vx_load_expand_q(mask + x);; - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_float32 v_src = vx_load(src + x); - v_src = v_src & v_reinterpret_as_f32(v_mask); + v_src = v_and(v_src, v_reinterpret_as_f32(v_mask)); v_float64 v_src0 = v_cvt_f64(v_src); v_float64 v_src1 = v_cvt_f64_high(v_src); @@ -1682,13 +1682,13 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint32 v_mask = vx_load_expand_q(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_float32 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_src0 = v_src0 & v_reinterpret_as_f32(v_mask); - v_src1 = v_src1 & v_reinterpret_as_f32(v_mask); - v_src2 = v_src2 & v_reinterpret_as_f32(v_mask); + v_src0 = v_and(v_src0, v_reinterpret_as_f32(v_mask)); + v_src1 = v_and(v_src1, v_reinterpret_as_f32(v_mask)); + v_src2 = v_and(v_src2, v_reinterpret_as_f32(v_mask)); v_float64 v_src00 = v_cvt_f64(v_src0); v_float64 v_src01 = v_cvt_f64_high(v_src0); @@ -1720,9 +1720,9 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int void accSqr_simd_(const double* src, double* dst, 
const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_float64::nlanes * 2; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes() * 2; + const int step = VTraits::vlanes(); if (!mask) { @@ -1756,12 +1756,12 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in v_uint32 v_mask32 = vx_load_expand_q(mask + x); v_uint64 v_masku640, v_masku641; v_expand(v_mask32, v_masku640, v_masku641); - v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0))); + v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0))); v_float64 v_src0 = vx_load(src + x); v_float64 v_src1 = vx_load(src + x + step); - v_src0 = v_src0 & v_mask0; - v_src1 = v_src1 & v_mask1; + v_src0 = v_and(v_src0, v_mask0); + v_src1 = v_and(v_src1, v_mask1); v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x))); v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step))); } @@ -1773,18 +1773,18 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in v_uint32 v_mask32 = vx_load_expand_q(mask + x); v_uint64 v_masku640, v_masku641; v_expand(v_mask32, v_masku640, v_masku641); - v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0))); + v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0))); v_float64 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21); - v_src00 = v_src00 & v_mask0; - v_src01 = v_src01 & v_mask1; - v_src10 = v_src10 & v_mask0; - v_src11 = v_src11 & v_mask1; - v_src20 = v_src20 & v_mask0; - v_src21 = v_src21 & v_mask1; + v_src00 = v_and(v_src00, v_mask0); + v_src01 = v_and(v_src01, v_mask1); + v_src10 = v_and(v_src10, v_mask0); + v_src11 = v_and(v_src11, v_mask1); + v_src20 = v_and(v_src20, v_mask0); + v_src21 = v_and(v_src21, v_mask1); v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); @@ -2035,9 +2035,9 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2069,11 +2069,11 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar { v_uint32 v_mask32_0 = vx_load_expand_q(mask + x); v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step); - v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0)); - v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0)); + v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_0, v_0))); + v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_1, v_0))); - v_store(dst + x, vx_load(dst + x) + ((vx_load(src1 + x) * vx_load(src2 + x)) & v_mask0)); - v_store(dst + x + step, vx_load(dst + x + step) + ((vx_load(src1 + x + step) * vx_load(src2 + x + step)) & v_mask1)); + 
v_store(dst + x, v_add(vx_load(dst + x), v_and(v_mul(vx_load(src1 + x), vx_load(src2 + x)), v_mask0))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(v_mul(vx_load(src1 + x + step), vx_load(src2 + x + step)), v_mask1))); } } else if (cn == 3) @@ -2082,8 +2082,8 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar { v_uint32 v_mask32_0 = vx_load_expand_q(mask + x); v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step); - v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0)); - v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0)); + v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_0, v_0))); + v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_1, v_0))); v_float32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21; v_float32 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; @@ -2096,8 +2096,8 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + ((v_1src00 * v_2src00) & v_mask0), v_dst10 + ((v_1src10 * v_2src10) & v_mask0), v_dst20 + ((v_1src20 * v_2src20) & v_mask0)); - v_store_interleave(dst + (x + step) * cn, v_dst01 + ((v_1src01 * v_2src01) & v_mask1), v_dst11 + ((v_1src11 * v_2src11) & v_mask1), v_dst21 + ((v_1src21 * v_2src21) & v_mask1)); + v_store_interleave(dst + x * cn, v_add(v_dst00, v_and(v_mul(v_1src00, v_2src00), v_mask0)), v_add(v_dst10, v_and(v_mul(v_1src10, v_2src10), v_mask0)), v_add(v_dst20, v_and(v_mul(v_1src20, v_2src20), v_mask0))); + v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_and(v_mul(v_1src01, v_2src01), v_mask1)), v_add(v_dst11, v_and(v_mul(v_1src11, v_2src11), v_mask1)), v_add(v_dst21, v_and(v_mul(v_1src21, v_2src21), v_mask1))); } } } @@ -2398,9 +2398,9 @@ void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uc void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_float32::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2447,11 +2447,11 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint32 v_mask = vx_load_expand_q(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_float32 v_1src = vx_load(src1 + x); v_float32 v_2src = vx_load(src2 + x); - v_1src = v_1src & v_reinterpret_as_f32(v_mask); - v_2src = v_2src & v_reinterpret_as_f32(v_mask); + v_1src = v_and(v_1src, v_reinterpret_as_f32(v_mask)); + v_2src = v_and(v_2src, v_reinterpret_as_f32(v_mask)); v_float64 v_1src0 = v_cvt_f64(v_1src); v_float64 v_1src1 = v_cvt_f64_high(v_1src); @@ -2467,16 +2467,16 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint32 v_mask = vx_load_expand_q(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_float32 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); - v_1src0 = v_1src0 & v_reinterpret_as_f32(v_mask); - v_1src1 = 
v_1src1 & v_reinterpret_as_f32(v_mask); - v_1src2 = v_1src2 & v_reinterpret_as_f32(v_mask); - v_2src0 = v_2src0 & v_reinterpret_as_f32(v_mask); - v_2src1 = v_2src1 & v_reinterpret_as_f32(v_mask); - v_2src2 = v_2src2 & v_reinterpret_as_f32(v_mask); + v_1src0 = v_and(v_1src0, v_reinterpret_as_f32(v_mask)); + v_1src1 = v_and(v_1src1, v_reinterpret_as_f32(v_mask)); + v_1src2 = v_and(v_1src2, v_reinterpret_as_f32(v_mask)); + v_2src0 = v_and(v_2src0, v_reinterpret_as_f32(v_mask)); + v_2src1 = v_and(v_2src1, v_reinterpret_as_f32(v_mask)); + v_2src2 = v_and(v_2src2, v_reinterpret_as_f32(v_mask)); v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); @@ -2501,9 +2501,9 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_float64::nlanes * 2; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes() * 2; + const int step = VTraits::vlanes(); if (!mask) { @@ -2542,16 +2542,16 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc v_uint32 v_mask32 = vx_load_expand_q(mask + x); v_uint64 v_masku640, v_masku641; v_expand(v_mask32, v_masku640, v_masku641); - v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0))); + v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0))); v_float64 v_src00 = vx_load(src1 + x); v_float64 v_src01 = vx_load(src1 + x + step); v_float64 v_src10 = vx_load(src2 + x); v_float64 v_src11 = vx_load(src2 + x + step); - v_store(dst + x, vx_load(dst + x) + ((v_src00 * v_src10) & v_mask0)); - v_store(dst + x + step, vx_load(dst + x + step) + ((v_src01 * v_src11) & v_mask1)); + v_store(dst + x, v_add(vx_load(dst + x), v_and(v_mul(v_src00, v_src10), v_mask0))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(v_mul(v_src01, v_src11), v_mask1))); } } else if (cn == 3) @@ -2561,8 +2561,8 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc v_uint32 v_mask32 = vx_load_expand_q(mask + x); v_uint64 v_masku640, v_masku641; v_expand(v_mask32, v_masku640, v_masku641); - v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0))); + v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0))); v_float64 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21; v_float64 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; @@ -2570,19 +2570,19 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21); v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20); v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21); - v_float64 v_src00 = (v_1src00 & v_mask0) * v_2src00; - v_float64 v_src01 = (v_1src01 & v_mask1) * v_2src01; - v_float64 v_src10 = (v_1src10 & v_mask0) * v_2src10; - v_float64 v_src11 = (v_1src11 & v_mask1) * v_2src11; - v_float64 v_src20 = (v_1src20 & v_mask0) * v_2src20; - v_float64 v_src21 = (v_1src21 & v_mask1) * v_2src21; + v_float64 
v_src00 = v_mul(v_and(v_1src00, v_mask0), v_2src00); + v_float64 v_src01 = v_mul(v_and(v_1src01, v_mask1), v_2src01); + v_float64 v_src10 = v_mul(v_and(v_1src10, v_mask0), v_2src10); + v_float64 v_src11 = v_mul(v_and(v_1src11, v_mask1), v_2src11); + v_float64 v_src20 = v_mul(v_and(v_1src20, v_mask0), v_2src20); + v_float64 v_src21 = v_mul(v_and(v_1src21, v_mask1), v_2src21); v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20)); + v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21)); } } } diff --git a/modules/imgproc/src/color_hsv.simd.hpp b/modules/imgproc/src/color_hsv.simd.hpp index bea1decc3ae6..bef9497760db 100644 --- a/modules/imgproc/src/color_hsv.simd.hpp +++ b/modules/imgproc/src/color_hsv.simd.hpp @@ -98,7 +98,7 @@ struct RGB2HSV_b int i = 0; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) const int vsize = VTraits::vlanes(); for ( ; i <= n - vsize; i += vsize, src += scn*vsize, dst += 3*vsize) @@ -274,7 +274,7 @@ struct RGB2HSV_f : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) { } - #if CV_SIMD || CV_SIMD_SCALABLE + #if (CV_SIMD || CV_SIMD_SCALABLE) inline void process(const v_float32& v_r, const v_float32& v_g, const v_float32& v_b, v_float32& v_h, v_float32& v_s, v_float32& v_v, float hscale) const @@ -308,7 +308,7 @@ struct RGB2HSV_f float hscale = hrange*(1.f/360.f); n *= 3; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) const int vsize = VTraits::vlanes(); for ( ; i <= n - 3*vsize; i += 3*vsize, src += scn * vsize) { @@ -368,7 +368,7 @@ struct RGB2HSV_f }; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) inline void HSV2RGB_simd(const v_float32& h, const v_float32& s, const v_float32& v, v_float32& b, v_float32& g, v_float32& r, float hscale) { @@ -473,7 +473,7 @@ struct HSV2RGB_f float hs = hscale; n *= 3; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) const int vsize = VTraits::vlanes(); v_float32 valpha = vx_setall_f32(alpha); for (; i <= n - vsize*3; i += vsize*3, dst += dcn * vsize) @@ -530,7 +530,7 @@ struct HSV2RGB_b int j = 0, dcn = dstcn; uchar alpha = ColorChannel::max(); -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) const int vsize = VTraits::vlanes(); for (j = 0; j <= (n - vsize*4) * 3; j += 3 * 4 * vsize, dst += dcn * 4 * vsize) @@ -679,7 +679,7 @@ struct RGB2HLS_f { } -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) inline void process(const v_float32& r, const v_float32& g, const v_float32& b, const v_float32& vhscale, v_float32& h, v_float32& l, v_float32& s) const @@ -718,7 +718,7 @@ struct RGB2HLS_f int i = 0, bidx = blueIdx, scn = srccn; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) const int vsize = VTraits::vlanes(); v_float32 vhscale = vx_setall_f32(hscale); @@ -802,13 +802,13 @@ struct RGB2HLS_b int scn = srccn; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[bufChannels*BLOCK_SIZE]; #else float CV_DECL_ALIGNED(16) buf[bufChannels*BLOCK_SIZE]; 
#endif -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static const int fsize = VTraits::vlanes(); //TODO: fix that when v_interleave is available float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits::max_nlanes*3]; @@ -823,7 +823,7 @@ struct RGB2HLS_b { int dn = std::min(n - i, (int)BLOCK_SIZE); -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 v255inv = vx_setall_f32(1.f/255.f); if (scn == 3) { @@ -902,7 +902,7 @@ struct RGB2HLS_b cvt(buf, buf, dn); int j = 0; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) for( ; j <= dn*3 - fsize*3*4; j += fsize*3*4) { v_float32 f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11; @@ -973,7 +973,7 @@ struct HLS2RGB_f : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) { } -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) inline void process(const v_float32& h, const v_float32& l, const v_float32& s, v_float32& b, v_float32& g, v_float32& r) const { @@ -1016,7 +1016,7 @@ struct HLS2RGB_f int i = 0, bidx = blueIdx, dcn = dstcn; float alpha = ColorChannel::max(); -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static const int vsize = VTraits::vlanes(); for (; i <= n - vsize; i += vsize, src += 3*vsize, dst += dcn*vsize) { @@ -1099,13 +1099,13 @@ struct HLS2RGB_b int i, j, dcn = dstcn; uchar alpha = ColorChannel::max(); -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[bufChannels*BLOCK_SIZE]; #else float CV_DECL_ALIGNED(16) buf[bufChannels*BLOCK_SIZE]; #endif -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static const int fsize = VTraits::vlanes(); //TODO: fix that when v_interleave is available float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits::max_nlanes*3]; @@ -1122,7 +1122,7 @@ struct HLS2RGB_b int dn = std::min(n - i, (int)BLOCK_SIZE); j = 0; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) for( ; j <= dn*3 - 3*4*fsize; j += 3*4*fsize) { // 3x uchar -> 3*4 float @@ -1179,7 +1179,7 @@ struct HLS2RGB_b } cvt(buf, buf, dn); -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 v255 = vx_setall_f32(255.f); if(dcn == 3) { diff --git a/modules/imgproc/src/median_blur.simd.hpp b/modules/imgproc/src/median_blur.simd.hpp index 90f0b2033021..7d8423d3224c 100644 --- a/modules/imgproc/src/median_blur.simd.hpp +++ b/modules/imgproc/src/median_blur.simd.hpp @@ -548,7 +548,7 @@ struct MinMax32f } }; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) struct MinMaxVec8u { @@ -688,7 +688,7 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) if( limit == size.width ) break; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) int nlanes = VTraits::vlanes(); #else int nlanes = 1; @@ -793,7 +793,7 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) if( limit == size.width ) break; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) int nlanes = VTraits::vlanes(); #else int nlanes = 1;
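
The whole patch applies one mechanical substitution: compile-time lane counts (`v_float32::nlanes`) become runtime `VTraits<v_float32>::vlanes()` calls, and overloaded operators (`+`, `*`, `/`, `==`, `&`, `~`, `>>`) become the named wrappers (`v_add`, `v_mul`, `v_div`, `v_eq`, `v_and`, `v_not`, `v_shr`), so the same code also builds against scalable backends (`CV_SIMD_SCALABLE` / `CV_SIMD_SCALABLE_64F`, e.g. RVV) whose vector types have no fixed size and no operator overloads. For reference, below is a minimal sketch, not part of the patch, of the same pattern applied to a masked-accumulation loop; the helper name `accumulate_masked` is hypothetical, but it uses only universal intrinsics that already appear in the diff above.

```cpp
// Hypothetical helper, not part of the patch: accumulate src into dst where mask != 0,
// written in the scalable-friendly style used throughout this PR.
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

static void accumulate_masked(const float* src, const uchar* mask, float* dst, int len)
{
    int x = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    // Runtime lane count (was the compile-time constant v_float32::nlanes).
    const int step = VTraits<v_float32>::vlanes();
    const v_uint32 v_zero = vx_setzero_u32();
    for (; x <= len - step; x += step)
    {
        // Expand 8-bit mask values to 32 bits and turn non-zero entries into all-ones lanes.
        v_uint32 m32 = vx_load_expand_q(mask + x);
        v_float32 vmask = v_reinterpret_as_f32(v_not(v_eq(m32, v_zero))); // was ~(m32 == v_zero)
        // Masked load/add/store in function-call style (was vx_load(dst+x) + (vx_load(src+x) & vmask)).
        v_float32 vsrc = v_and(vx_load(src + x), vmask);
        v_store(dst + x, v_add(vx_load(dst + x), vsrc));
    }
    vx_cleanup();
#endif
    for (; x < len; x++)   // scalar tail
        if (mask[x])
            dst[x] += src[x];
}
```

The same reasoning explains the two traits used in `initUndistortRectifyMapComputer`: `VTraits<T>::vlanes()` is a runtime lane count (on scalable backends the register width is only known at run time), while `VTraits<T>::max_nlanes` is a compile-time upper bound, which is what the fixed-size `s_x`/`s_y`/`s_w` member arrays need. Note also that the 64-bit integer vectors only gain `v_eq`/`v_ne` through the new `OPENCV_HAL_WRAP_EQ_OP` macro, not the full comparison set that `OPENCV_HAL_WRAP_CMP` provides for the narrower types.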