From 5e9191558d787b33e2f4d43833106b2e00c21cd1 Mon Sep 17 00:00:00 2001
From: HAN Liutong
Date: Fri, 15 Sep 2023 01:37:46 +0800
Subject: [PATCH] Merge pull request #24058 from hanliutong:rewrite-imgporc

Rewrite Universal Intrinsic code by using new API: ImgProc module. #24058

The goal of this series of PRs is to modify the SIMD code blocks guarded by the CV_SIMD macro in the `opencv/modules/imgproc` folder, rewriting them with the new Universal Intrinsic API. For easier review, this PR includes part of the rewritten code; the rest will follow in the next PR (coming soon). I tested this patch on RVV (QEMU) and AVX devices, and `opencv_test_imgproc` passes. The patch is partially auto-generated using the [rewriter](https://github.com/hanliutong/rewriter); see the related PRs https://github.com/opencv/opencv/pull/23885 and https://github.com/opencv/opencv/pull/23980.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
---
 modules/imgproc/src/accum.simd.hpp | 640 ++++++++++++------------
 modules/imgproc/src/blend.cpp | 54 +-
 modules/imgproc/src/canny.cpp | 66 +--
 modules/imgproc/src/color_rgb.simd.hpp | 174 ++++---
 modules/imgproc/src/contours.cpp | 20 +-
 modules/imgproc/src/corner.cpp | 72 +--
 modules/imgproc/src/histogram.cpp | 34 +-
 modules/imgproc/src/pyramids.cpp | 478 +++++++++---------
 modules/imgproc/src/resize.cpp | 413 ++++++++-------
 modules/imgproc/src/smooth.simd.hpp | 335 ++++++-------
 modules/imgproc/src/spatialgradient.cpp | 40 +-
 11 files changed, 1183 insertions(+), 1143 deletions(-)

diff --git a/modules/imgproc/src/accum.simd.hpp b/modules/imgproc/src/accum.simd.hpp index 6b0e6d6fbe73..7fe7aabeaf82 100644 --- a/modules/imgproc/src/accum.simd.hpp +++ b/modules/imgproc/src/accum.simd.hpp @@ -139,7 +139,7 @@ void acc_general_(const T* src, AT* dst, const uchar* mask, int len, int cn, int } #if CV_AVX && !CV_AVX2 _mm256_zeroupper(); -#elif CV_SIMD +#elif (CV_SIMD || CV_SIMD_SCALABLE) vx_cleanup(); #endif } @@ -187,7 +187,7 @@ accSqr_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, int } #if CV_AVX && !CV_AVX2 _mm256_zeroupper(); -#elif CV_SIMD +#elif (CV_SIMD || CV_SIMD_SCALABLE) vx_cleanup(); #endif } @@ -236,7 +236,7 @@ accProd_general_( const T* src1, const T* src2, AT* dst, const uchar* mask, int } #if CV_AVX && !CV_AVX2 _mm256_zeroupper(); -#elif CV_SIMD +#elif (CV_SIMD || CV_SIMD_SCALABLE) vx_cleanup(); #endif } @@ -285,16 +285,16 @@ accW_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, double } #if CV_AVX && !CV_AVX2 _mm256_zeroupper(); -#elif CV_SIMD +#elif (CV_SIMD || CV_SIMD_SCALABLE) vx_cleanup(); #endif } void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint8::nlanes; - const int step = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = 
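// Illustrative sketch only, not part of the applied diff: it shows, on a hypothetical
// accumulate-style helper (acc_f32_sketch is a made-up name), the mechanical mapping
// this patch applies throughout imgproc, assuming OpenCV's universal intrinsics from
// "opencv2/core/hal/intrin.hpp":
//   v_float32::nlanes        ->  VTraits<v_float32>::vlanes()
//   a + b, a & b, a * b      ->  v_add(a, b), v_and(a, b), v_mul(a, b)
//   ~(a == b), a != b        ->  v_not(v_eq(a, b)), v_ne(a, b)
//   #if CV_SIMD              ->  #if (CV_SIMD || CV_SIMD_SCALABLE)
#include "opencv2/core/hal/intrin.hpp"

namespace {
using namespace cv;

void acc_f32_sketch(const float* src, float* dst, int len)
{
    int x = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)                    // was: #if CV_SIMD
    const int step = VTraits<v_float32>::vlanes();   // was: v_float32::nlanes
    for (; x <= len - step; x += step)
    {
        v_float32 v_src = vx_load(src + x);
        v_float32 v_dst = vx_load(dst + x);
        v_store(dst + x, v_add(v_dst, v_src));       // was: v_dst + v_src
    }
    vx_cleanup();
#endif
    for (; x < len; x++)                             // scalar tail, unchanged by the rewrite
        dst[x] += src[x];
}
} // namespace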
VTraits::vlanes(); if (!mask) { @@ -309,10 +309,10 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00)))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01)))); + v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10)))); + v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11)))); } } else @@ -323,9 +323,9 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) for ( ; x <= len - cVectorWidth; x += cVectorWidth) { v_uint8 v_mask = vx_load(mask + x); - v_mask = ~(v_0 == v_mask); + v_mask = v_not(v_eq(v_0, v_mask)); v_uint8 v_src = vx_load(src + x); - v_src = v_src & v_mask; + v_src = v_and(v_src, v_mask); v_uint16 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); @@ -333,10 +333,10 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00)))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01)))); + v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10)))); + v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11)))); } } else if (cn == 3) @@ -344,12 +344,12 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) for ( ; x <= len - cVectorWidth; x += cVectorWidth) { v_uint8 v_mask = vx_load(mask + x); - v_mask = ~(v_0 == v_mask); + v_mask = v_not(v_eq(v_0, v_mask)); v_uint8 v_src0, v_src1, v_src2; v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2); - v_src0 = v_src0 & v_mask; - v_src1 = v_src1 & v_mask; - v_src2 = v_src2 & v_mask; + v_src0 = v_and(v_src0, v_mask); + v_src1 = v_and(v_src1, v_mask); + v_src2 = v_and(v_src2, v_mask); v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); @@ -373,18 +373,18 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) v_load_deinterleave(dst + ((x + step * 2) * cn), v_dst010, v_dst110, v_dst210); v_load_deinterleave(dst + ((x + step * 3) * cn), v_dst011, v_dst111, v_dst211); - v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000)); - v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100)); - v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200)); - v_dst001 += 
v_cvt_f32(v_reinterpret_as_s32(v_src001)); - v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101)); - v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201)); - v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010)); - v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110)); - v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210)); - v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011)); - v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111)); - v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211)); + v_dst000 = v_add(v_dst000, v_cvt_f32(v_reinterpret_as_s32(v_src000))); + v_dst100 = v_add(v_dst100, v_cvt_f32(v_reinterpret_as_s32(v_src100))); + v_dst200 = v_add(v_dst200, v_cvt_f32(v_reinterpret_as_s32(v_src200))); + v_dst001 = v_add(v_dst001, v_cvt_f32(v_reinterpret_as_s32(v_src001))); + v_dst101 = v_add(v_dst101, v_cvt_f32(v_reinterpret_as_s32(v_src101))); + v_dst201 = v_add(v_dst201, v_cvt_f32(v_reinterpret_as_s32(v_src201))); + v_dst010 = v_add(v_dst010, v_cvt_f32(v_reinterpret_as_s32(v_src010))); + v_dst110 = v_add(v_dst110, v_cvt_f32(v_reinterpret_as_s32(v_src110))); + v_dst210 = v_add(v_dst210, v_cvt_f32(v_reinterpret_as_s32(v_src210))); + v_dst011 = v_add(v_dst011, v_cvt_f32(v_reinterpret_as_s32(v_src011))); + v_dst111 = v_add(v_dst111, v_cvt_f32(v_reinterpret_as_s32(v_src111))); + v_dst211 = v_add(v_dst211, v_cvt_f32(v_reinterpret_as_s32(v_src211))); v_store_interleave(dst + (x * cn), v_dst000, v_dst100, v_dst200); v_store_interleave(dst + ((x + step) * cn), v_dst001, v_dst101, v_dst201); @@ -400,9 +400,9 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -413,8 +413,8 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn v_uint32 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0))); - v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1))); + v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src0)))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src1)))); } } else @@ -425,14 +425,14 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn for ( ; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_src = vx_load(src + x); - v_src = v_src & v_mask; + v_src = v_and(v_src, v_mask); v_uint32 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0))); - v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1))); + v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src0)))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src1)))); } } else if (cn == 3) @@ -441,12 +441,12 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn for ( ; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_src0, 
v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_src0 = v_src0 & v_mask; - v_src1 = v_src1 & v_mask; - v_src2 = v_src2 & v_mask; + v_src0 = v_and(v_src0, v_mask); + v_src1 = v_and(v_src1, v_mask); + v_src2 = v_and(v_src2, v_mask); v_uint32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); @@ -456,12 +456,12 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_dst00 += v_cvt_f32(v_reinterpret_as_s32(v_src00)); - v_dst01 += v_cvt_f32(v_reinterpret_as_s32(v_src01)); - v_dst10 += v_cvt_f32(v_reinterpret_as_s32(v_src10)); - v_dst11 += v_cvt_f32(v_reinterpret_as_s32(v_src11)); - v_dst20 += v_cvt_f32(v_reinterpret_as_s32(v_src20)); - v_dst21 += v_cvt_f32(v_reinterpret_as_s32(v_src21)); + v_dst00 = v_add(v_dst00, v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_dst01 = v_add(v_dst01, v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_dst10 = v_add(v_dst10, v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_dst11 = v_add(v_dst11, v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_dst20 = v_add(v_dst20, v_cvt_f32(v_reinterpret_as_s32(v_src20))); + v_dst21 = v_add(v_dst21, v_cvt_f32(v_reinterpret_as_s32(v_src21))); v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); @@ -551,9 +551,9 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_uint8::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -586,14 +586,14 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn v_float64 v_dst6 = vx_load(dst + x + step * 6); v_float64 v_dst7 = vx_load(dst + x + step * 7); - v_dst0 = v_dst0 + v_src0; - v_dst1 = v_dst1 + v_src1; - v_dst2 = v_dst2 + v_src2; - v_dst3 = v_dst3 + v_src3; - v_dst4 = v_dst4 + v_src4; - v_dst5 = v_dst5 + v_src5; - v_dst6 = v_dst6 + v_src6; - v_dst7 = v_dst7 + v_src7; + v_dst0 = v_add(v_dst0, v_src0); + v_dst1 = v_add(v_dst1, v_src1); + v_dst2 = v_add(v_dst2, v_src2); + v_dst3 = v_add(v_dst3, v_src3); + v_dst4 = v_add(v_dst4, v_src4); + v_dst5 = v_add(v_dst5, v_src5); + v_dst6 = v_add(v_dst6, v_src6); + v_dst7 = v_add(v_dst7, v_src7); v_store(dst + x, v_dst0); v_store(dst + x + step, v_dst1); @@ -613,9 +613,9 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn for ( ; x <= len - cVectorWidth; x += cVectorWidth) { v_uint8 v_mask = vx_load(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint8 v_src = vx_load(src + x); - v_src = v_src & v_mask; + v_src = v_and(v_src, v_mask); v_uint16 v_int0, v_int1; v_expand(v_src, v_int0, v_int1); @@ -641,14 +641,14 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn v_float64 v_dst6 = vx_load(dst + x + step * 6); v_float64 v_dst7 = vx_load(dst + x + step * 7); - v_dst0 = v_dst0 + v_src0; - v_dst1 = v_dst1 + v_src1; - v_dst2 = v_dst2 + v_src2; - v_dst3 = v_dst3 + v_src3; - v_dst4 = v_dst4 + v_src4; - v_dst5 = v_dst5 + v_src5; - v_dst6 = v_dst6 + v_src6; - v_dst7 = v_dst7 + v_src7; + v_dst0 = 
v_add(v_dst0, v_src0); + v_dst1 = v_add(v_dst1, v_src1); + v_dst2 = v_add(v_dst2, v_src2); + v_dst3 = v_add(v_dst3, v_src3); + v_dst4 = v_add(v_dst4, v_src4); + v_dst5 = v_add(v_dst5, v_src5); + v_dst6 = v_add(v_dst6, v_src6); + v_dst7 = v_add(v_dst7, v_src7); v_store(dst + x, v_dst0); v_store(dst + x + step, v_dst1); @@ -665,12 +665,12 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn for ( ; x <= len - cVectorWidth; x += cVectorWidth) { v_uint8 v_mask = vx_load(mask + x); - v_mask = ~(v_0 == v_mask); + v_mask = v_not(v_eq(v_0, v_mask)); v_uint8 v_src0, v_src1, v_src2; v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2); - v_src0 = v_src0 & v_mask; - v_src1 = v_src1 & v_mask; - v_src2 = v_src2 & v_mask; + v_src0 = v_and(v_src0, v_mask); + v_src1 = v_and(v_src1, v_mask); + v_src2 = v_and(v_src2, v_mask); v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); @@ -726,14 +726,14 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn v_load_deinterleave(dst + ((x + step * 6) * cn), v_dst0110, v_dst1110, v_dst2110); v_load_deinterleave(dst + ((x + step * 7) * cn), v_dst0111, v_dst1111, v_dst2111); - v_store_interleave(dst + (x * cn), v_dst0000 + v_src0000, v_dst1000 + v_src1000, v_dst2000 + v_src2000); - v_store_interleave(dst + ((x + step) * cn), v_dst0001 + v_src0001, v_dst1001 + v_src1001, v_dst2001 + v_src2001); - v_store_interleave(dst + ((x + step * 2) * cn), v_dst0010 + v_src0010, v_dst1010 + v_src1010, v_dst2010 + v_src2010); - v_store_interleave(dst + ((x + step * 3) * cn), v_dst0011 + v_src0011, v_dst1011 + v_src1011, v_dst2011 + v_src2011); - v_store_interleave(dst + ((x + step * 4) * cn), v_dst0100 + v_src0100, v_dst1100 + v_src1100, v_dst2100 + v_src2100); - v_store_interleave(dst + ((x + step * 5) * cn), v_dst0101 + v_src0101, v_dst1101 + v_src1101, v_dst2101 + v_src2101); - v_store_interleave(dst + ((x + step * 6) * cn), v_dst0110 + v_src0110, v_dst1110 + v_src1110, v_dst2110 + v_src2110); - v_store_interleave(dst + ((x + step * 7) * cn), v_dst0111 + v_src0111, v_dst1111 + v_src1111, v_dst2111 + v_src2111); + v_store_interleave(dst + (x * cn), v_add(v_dst0000, v_src0000), v_add(v_dst1000, v_src1000), v_add(v_dst2000, v_src2000)); + v_store_interleave(dst + ((x + step) * cn), v_add(v_dst0001, v_src0001), v_add(v_dst1001, v_src1001), v_add(v_dst2001, v_src2001)); + v_store_interleave(dst + ((x + step * 2) * cn), v_add(v_dst0010, v_src0010), v_add(v_dst1010, v_src1010), v_add(v_dst2010, v_src2010)); + v_store_interleave(dst + ((x + step * 3) * cn), v_add(v_dst0011, v_src0011), v_add(v_dst1011, v_src1011), v_add(v_dst2011, v_src2011)); + v_store_interleave(dst + ((x + step * 4) * cn), v_add(v_dst0100, v_src0100), v_add(v_dst1100, v_src1100), v_add(v_dst2100, v_src2100)); + v_store_interleave(dst + ((x + step * 5) * cn), v_add(v_dst0101, v_src0101), v_add(v_dst1101, v_src1101), v_add(v_dst2101, v_src2101)); + v_store_interleave(dst + ((x + step * 6) * cn), v_add(v_dst0110, v_src0110), v_add(v_dst1110, v_src1110), v_add(v_dst2110, v_src2110)); + v_store_interleave(dst + ((x + step * 7) * cn), v_add(v_dst0111, v_src0111), v_add(v_dst1111, v_src1111), v_add(v_dst2111, v_src2111)); } } } @@ -744,9 +744,9 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = 
v_uint16::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -767,10 +767,10 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c v_float64 v_dst2 = vx_load(dst + x + step * 2); v_float64 v_dst3 = vx_load(dst + x + step * 3); - v_dst0 = v_dst0 + v_src0; - v_dst1 = v_dst1 + v_src1; - v_dst2 = v_dst2 + v_src2; - v_dst3 = v_dst3 + v_src3; + v_dst0 = v_add(v_dst0, v_src0); + v_dst1 = v_add(v_dst1, v_src1); + v_dst2 = v_add(v_dst2, v_src2); + v_dst3 = v_add(v_dst3, v_src3); v_store(dst + x, v_dst0); v_store(dst + x + step, v_dst1); @@ -786,9 +786,9 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c for ( ; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_src = vx_load(src + x); - v_src = v_src & v_mask; + v_src = v_and(v_src, v_mask); v_uint32 v_int0, v_int1; v_expand(v_src, v_int0, v_int1); @@ -802,10 +802,10 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c v_float64 v_dst2 = vx_load(dst + x + step * 2); v_float64 v_dst3 = vx_load(dst + x + step * 3); - v_dst0 = v_dst0 + v_src0; - v_dst1 = v_dst1 + v_src1; - v_dst2 = v_dst2 + v_src2; - v_dst3 = v_dst3 + v_src3; + v_dst0 = v_add(v_dst0, v_src0); + v_dst1 = v_add(v_dst1, v_src1); + v_dst2 = v_add(v_dst2, v_src2); + v_dst3 = v_add(v_dst3, v_src3); v_store(dst + x, v_dst0); v_store(dst + x + step, v_dst1); @@ -818,12 +818,12 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c for ( ; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_src0 = v_src0 & v_mask; - v_src1 = v_src1 & v_mask; - v_src2 = v_src2 & v_mask; + v_src0 = v_and(v_src0, v_mask); + v_src1 = v_and(v_src1, v_mask); + v_src2 = v_and(v_src2, v_mask); v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; v_expand(v_src0, v_int00, v_int01); v_expand(v_src1, v_int10, v_int11); @@ -848,10 +848,10 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); - v_store_interleave(dst + (x + step * 2) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); - v_store_interleave(dst + (x + step * 3) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); + v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20)); + v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21)); + v_store_interleave(dst + (x + step * 2) * cn, v_add(v_dst02, v_src02), v_add(v_dst12, v_src12), v_add(v_dst22, v_src22)); + v_store_interleave(dst + (x + step * 3) * cn, v_add(v_dst03, v_src03), v_add(v_dst13, v_src13), v_add(v_dst23, v_src23)); } } } @@ -1033,9 +1033,9 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c 
void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint8::nlanes; - const int step = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -1052,10 +1052,10 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00)))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01)))); + v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10)))); + v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11)))); } } else @@ -1066,9 +1066,9 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { v_uint8 v_mask = vx_load(mask + x); - v_mask = ~(v_0 == v_mask); + v_mask = v_not(v_eq(v_0, v_mask)); v_uint8 v_src = vx_load(src + x); - v_src = v_src & v_mask; + v_src = v_and(v_src, v_mask); v_uint16 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); v_src0 = v_mul_wrap(v_src0, v_src0); @@ -1078,10 +1078,10 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00)))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01)))); + v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10)))); + v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11)))); } } else if (cn == 3) @@ -1089,13 +1089,13 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { v_uint8 v_mask = vx_load(mask + x); - v_mask = ~(v_0 == v_mask); + v_mask = v_not(v_eq(v_0, v_mask)); v_uint8 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_src0 = v_src0 & v_mask; - v_src1 = v_src1 & v_mask; - v_src2 = v_src2 & v_mask; + v_src0 = v_and(v_src0, v_mask); + v_src1 = v_and(v_src1, v_mask); + v_src2 = v_and(v_src2, v_mask); v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_src0, v_src00, v_src01); @@ -1126,20 +1126,20 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int v_load_deinterleave(dst + (x + step * 2) * cn, v_dst010, v_dst110, v_dst210); 
v_load_deinterleave(dst + (x + step * 3) * cn, v_dst011, v_dst111, v_dst211); - v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000)); - v_dst001 += v_cvt_f32(v_reinterpret_as_s32(v_src001)); - v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010)); - v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011)); + v_dst000 = v_add(v_dst000, v_cvt_f32(v_reinterpret_as_s32(v_src000))); + v_dst001 = v_add(v_dst001, v_cvt_f32(v_reinterpret_as_s32(v_src001))); + v_dst010 = v_add(v_dst010, v_cvt_f32(v_reinterpret_as_s32(v_src010))); + v_dst011 = v_add(v_dst011, v_cvt_f32(v_reinterpret_as_s32(v_src011))); - v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100)); - v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101)); - v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110)); - v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111)); + v_dst100 = v_add(v_dst100, v_cvt_f32(v_reinterpret_as_s32(v_src100))); + v_dst101 = v_add(v_dst101, v_cvt_f32(v_reinterpret_as_s32(v_src101))); + v_dst110 = v_add(v_dst110, v_cvt_f32(v_reinterpret_as_s32(v_src110))); + v_dst111 = v_add(v_dst111, v_cvt_f32(v_reinterpret_as_s32(v_src111))); - v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200)); - v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201)); - v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210)); - v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211)); + v_dst200 = v_add(v_dst200, v_cvt_f32(v_reinterpret_as_s32(v_src200))); + v_dst201 = v_add(v_dst201, v_cvt_f32(v_reinterpret_as_s32(v_src201))); + v_dst210 = v_add(v_dst210, v_cvt_f32(v_reinterpret_as_s32(v_src210))); + v_dst211 = v_add(v_dst211, v_cvt_f32(v_reinterpret_as_s32(v_src211))); v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200); v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201); @@ -1155,9 +1155,9 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -1186,13 +1186,13 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int v_uint16 v_mask16 = vx_load_expand(mask + x); v_uint32 v_mask0, v_mask1; v_expand(v_mask16, v_mask0, v_mask1); - v_mask0 = ~(v_mask0 == v_0); - v_mask1 = ~(v_mask1 == v_0); + v_mask0 = v_not(v_eq(v_mask0, v_0)); + v_mask1 = v_not(v_eq(v_mask1, v_0)); v_uint16 v_src = vx_load(src + x); v_uint32 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_src0 = v_src0 & v_mask0; - v_src1 = v_src1 & v_mask1; + v_src0 = v_and(v_src0, v_mask0); + v_src1 = v_and(v_src1, v_mask1); v_float32 v_float0, v_float1; v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0)); @@ -1209,8 +1209,8 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int v_uint16 v_mask16 = vx_load_expand(mask + x); v_uint32 v_mask0, v_mask1; v_expand(v_mask16, v_mask0, v_mask1); - v_mask0 = ~(v_mask0 == v_0); - v_mask1 = ~(v_mask1 == v_0); + v_mask0 = v_not(v_eq(v_mask0, v_0)); + v_mask1 = v_not(v_eq(v_mask1, v_0)); v_uint16 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); @@ -1218,12 +1218,12 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int v_expand(v_src0, v_int00, v_int01); v_expand(v_src1, v_int10, v_int11); v_expand(v_src2, v_int20, v_int21); - v_int00 = v_int00 & v_mask0; - 
v_int01 = v_int01 & v_mask1; - v_int10 = v_int10 & v_mask0; - v_int11 = v_int11 & v_mask1; - v_int20 = v_int20 & v_mask0; - v_int21 = v_int21 & v_mask1; + v_int00 = v_and(v_int00, v_mask0); + v_int01 = v_and(v_int01, v_mask1); + v_int10 = v_and(v_int10, v_mask0); + v_int11 = v_and(v_int11, v_mask1); + v_int20 = v_and(v_int20, v_mask0); + v_int21 = v_and(v_int21, v_mask1); v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_src00 = v_cvt_f32(v_reinterpret_as_s32(v_int00)); @@ -1347,9 +1347,9 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -1390,9 +1390,9 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_src = vx_load_expand(src + x); - v_uint16 v_int = v_src & v_mask; + v_uint16 v_int = v_and(v_src, v_mask); v_uint32 v_int0, v_int1; v_expand(v_int, v_int0, v_int1); @@ -1430,10 +1430,10 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int v_uint16 v_int2 = v_expand_low(v_src2); v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); - v_int0 = v_int0 & v_mask; - v_int1 = v_int1 & v_mask; - v_int2 = v_int2 & v_mask; + v_mask = v_not(v_eq(v_mask, v_0)); + v_int0 = v_and(v_int0, v_mask); + v_int1 = v_and(v_int1, v_mask); + v_int2 = v_and(v_int2, v_mask); v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; v_expand(v_int0, v_int00, v_int01); @@ -1486,9 +1486,9 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -1531,9 +1531,9 @@ void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, in for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_src = vx_load(src + x); - v_src = v_src & v_mask; + v_src = v_and(v_src, v_mask); v_uint32 v_int_0, v_int_1; v_expand(v_src, v_int_0, v_int_1); @@ -1566,12 +1566,12 @@ void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, in for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_src0 = v_src0 & v_mask; - v_src1 = v_src1 & v_mask; - v_src2 = v_src2 & v_mask; + v_src0 = v_and(v_src0, v_mask); + v_src1 = v_and(v_src1, v_mask); + v_src2 = v_and(v_src2, v_mask); v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; v_expand(v_src0, v_int00, v_int01); v_expand(v_src1, v_int10, v_int11); @@ -1810,9 +1810,9 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int 
len, in void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint8::nlanes; - const int step = v_uint32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -1829,10 +1829,10 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00)))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01)))); + v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10)))); + v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11)))); } } else @@ -1843,11 +1843,11 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint8 v_mask = vx_load(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint8 v_1src = vx_load(src1 + x); v_uint8 v_2src = vx_load(src2 + x); - v_1src = v_1src & v_mask; - v_2src = v_2src & v_mask; + v_1src = v_and(v_1src, v_mask); + v_2src = v_and(v_2src, v_mask); v_uint16 v_src0, v_src1; v_mul_expand(v_1src, v_2src, v_src0, v_src1); @@ -1856,10 +1856,10 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00)))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01)))); + v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10)))); + v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11)))); } } else if (cn == 3) @@ -1867,16 +1867,16 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint8 v_mask = vx_load(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); - v_1src0 = v_1src0 & v_mask; - v_1src1 = v_1src1 & v_mask; - v_1src2 = v_1src2 & v_mask; - v_2src0 = v_2src0 & v_mask; - v_2src1 = v_2src1 & v_mask; - v_2src2 = v_2src2 & v_mask; + v_1src0 = v_and(v_1src0, v_mask); + v_1src1 = v_and(v_1src1, v_mask); + v_1src2 = 
v_and(v_1src2, v_mask); + v_2src0 = v_and(v_2src0, v_mask); + v_2src1 = v_and(v_2src1, v_mask); + v_2src2 = v_and(v_2src2, v_mask); v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_mul_expand(v_1src0, v_2src0, v_src00, v_src01); @@ -1896,18 +1896,18 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar v_load_deinterleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201); v_load_deinterleave(dst + (x + step * 2) * cn, v_dst002, v_dst102, v_dst202); v_load_deinterleave(dst + (x + step * 3) * cn, v_dst003, v_dst103, v_dst203); - v_dst000 = v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000)); - v_dst001 = v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001)); - v_dst002 = v_dst002 + v_cvt_f32(v_reinterpret_as_s32(v_src002)); - v_dst003 = v_dst003 + v_cvt_f32(v_reinterpret_as_s32(v_src003)); - v_dst100 = v_dst100 + v_cvt_f32(v_reinterpret_as_s32(v_src100)); - v_dst101 = v_dst101 + v_cvt_f32(v_reinterpret_as_s32(v_src101)); - v_dst102 = v_dst102 + v_cvt_f32(v_reinterpret_as_s32(v_src102)); - v_dst103 = v_dst103 + v_cvt_f32(v_reinterpret_as_s32(v_src103)); - v_dst200 = v_dst200 + v_cvt_f32(v_reinterpret_as_s32(v_src200)); - v_dst201 = v_dst201 + v_cvt_f32(v_reinterpret_as_s32(v_src201)); - v_dst202 = v_dst202 + v_cvt_f32(v_reinterpret_as_s32(v_src202)); - v_dst203 = v_dst203 + v_cvt_f32(v_reinterpret_as_s32(v_src203)); + v_dst000 = v_add(v_dst000, v_cvt_f32(v_reinterpret_as_s32(v_src000))); + v_dst001 = v_add(v_dst001, v_cvt_f32(v_reinterpret_as_s32(v_src001))); + v_dst002 = v_add(v_dst002, v_cvt_f32(v_reinterpret_as_s32(v_src002))); + v_dst003 = v_add(v_dst003, v_cvt_f32(v_reinterpret_as_s32(v_src003))); + v_dst100 = v_add(v_dst100, v_cvt_f32(v_reinterpret_as_s32(v_src100))); + v_dst101 = v_add(v_dst101, v_cvt_f32(v_reinterpret_as_s32(v_src101))); + v_dst102 = v_add(v_dst102, v_cvt_f32(v_reinterpret_as_s32(v_src102))); + v_dst103 = v_add(v_dst103, v_cvt_f32(v_reinterpret_as_s32(v_src103))); + v_dst200 = v_add(v_dst200, v_cvt_f32(v_reinterpret_as_s32(v_src200))); + v_dst201 = v_add(v_dst201, v_cvt_f32(v_reinterpret_as_s32(v_src201))); + v_dst202 = v_add(v_dst202, v_cvt_f32(v_reinterpret_as_s32(v_src202))); + v_dst203 = v_add(v_dst203, v_cvt_f32(v_reinterpret_as_s32(v_src203))); v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200); v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201); @@ -1923,9 +1923,9 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -1956,10 +1956,10 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_0 == v_mask); + v_mask = v_not(v_eq(v_0, v_mask)); - v_uint16 v_1src = vx_load(src1 + x) & v_mask; - v_uint16 v_2src = vx_load(src2 + x) & v_mask; + v_uint16 v_1src = v_and(vx_load(src1 + x), v_mask); + v_uint16 v_2src = v_and(vx_load(src2 + x), v_mask); v_uint32 v_1src0, v_1src1, v_2src0, v_2src1; v_expand(v_1src, v_1src0, v_1src1); @@ -1979,17 +1979,17 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch for (; x <= len - cVectorWidth; x += 
cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_0 == v_mask); + v_mask = v_not(v_eq(v_0, v_mask)); v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); - v_1src0 = v_1src0 & v_mask; - v_1src1 = v_1src1 & v_mask; - v_1src2 = v_1src2 & v_mask; - v_2src0 = v_2src0 & v_mask; - v_2src1 = v_2src1 & v_mask; - v_2src2 = v_2src2 & v_mask; + v_1src0 = v_and(v_1src0, v_mask); + v_1src1 = v_and(v_1src1, v_mask); + v_1src2 = v_and(v_1src2, v_mask); + v_2src0 = v_and(v_2src0, v_mask); + v_2src1 = v_and(v_2src1, v_mask); + v_2src2 = v_and(v_2src2, v_mask); v_uint32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; v_expand(v_1src0, v_1src00, v_1src01); @@ -2108,9 +2108,9 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2153,9 +2153,9 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); - v_uint16 v_1int = vx_load_expand(src1 + x) & v_mask; - v_uint16 v_2int = vx_load_expand(src2 + x) & v_mask; + v_mask = v_not(v_eq(v_mask, v_0)); + v_uint16 v_1int = v_and(vx_load_expand(src1 + x), v_mask); + v_uint16 v_2int = v_and(vx_load_expand(src2 + x), v_mask); v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1; v_expand(v_1int, v_1int_0, v_1int_1); @@ -2198,13 +2198,13 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha v_uint16 v_2int2 = v_expand_low(v_2src2); v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); - v_1int0 = v_1int0 & v_mask; - v_1int1 = v_1int1 & v_mask; - v_1int2 = v_1int2 & v_mask; - v_2int0 = v_2int0 & v_mask; - v_2int1 = v_2int1 & v_mask; - v_2int2 = v_2int2 & v_mask; + v_mask = v_not(v_eq(v_mask, v_0)); + v_1int0 = v_and(v_1int0, v_mask); + v_1int1 = v_and(v_1int1, v_mask); + v_1int2 = v_and(v_1int2, v_mask); + v_2int0 = v_and(v_2int0, v_mask); + v_2int1 = v_and(v_2int1, v_mask); + v_2int2 = v_and(v_2int2, v_mask); v_uint32 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21; v_uint32 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21; @@ -2248,9 +2248,9 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2293,11 +2293,11 @@ void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uc for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_1src = vx_load(src1 + x); v_uint16 v_2src = vx_load(src2 + x); - v_1src = v_1src & v_mask; - 
v_2src = v_2src & v_mask; + v_1src = v_and(v_1src, v_mask); + v_2src = v_and(v_2src, v_mask); v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1; v_expand(v_1src, v_1int_0, v_1int_1); @@ -2329,16 +2329,16 @@ void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uc for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); - v_1src0 = v_1src0 & v_mask; - v_1src1 = v_1src1 & v_mask; - v_1src2 = v_1src2 & v_mask; - v_2src0 = v_2src0 & v_mask; - v_2src1 = v_2src1 & v_mask; - v_2src2 = v_2src2 & v_mask; + v_1src0 = v_and(v_1src0, v_mask); + v_1src1 = v_and(v_1src1, v_mask); + v_1src2 = v_and(v_1src2, v_mask); + v_2src0 = v_and(v_2src0, v_mask); + v_2src1 = v_and(v_2src1, v_mask); + v_2src2 = v_and(v_2src2, v_mask); v_uint32 v_1int_00, v_1int_01, v_2int_00, v_2int_01; v_uint32 v_1int_10, v_1int_11, v_2int_10, v_2int_11; @@ -2594,11 +2594,11 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn, double alpha) { int x = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) const v_float32 v_alpha = vx_setall_f32((float)alpha); const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha)); - const int cVectorWidth = v_uint8::nlanes; - const int step = v_float32::nlanes; + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2619,10 +2619,10 @@ void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn v_float32 v_dst10 = vx_load(dst + x + step * 2); v_float32 v_dst11 = vx_load(dst + x + step * 3); - v_dst00 = v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha); - v_dst01 = v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha); - v_dst10 = v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha); - v_dst11 = v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha); + v_dst00 = v_fma(v_dst00, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src00)), v_alpha)); + v_dst01 = v_fma(v_dst01, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src01)), v_alpha)); + v_dst10 = v_fma(v_dst10, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src10)), v_alpha)); + v_dst11 = v_fma(v_dst11, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src11)), v_alpha)); v_store(dst + x , v_dst00); v_store(dst + x + step , v_dst01); @@ -2663,15 +2663,15 @@ void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn v_float32 v_dst10 = vx_load(dst + x + step * 2); v_float32 v_dst11 = vx_load(dst + x + step * 3); - v_mf00 = v_mf00 != zero; - v_mf01 = v_mf01 != zero; - v_mf10 = v_mf10 != zero; - v_mf11 = v_mf11 != zero; + v_mf00 = v_ne(v_mf00, zero); + v_mf01 = v_ne(v_mf01, zero); + v_mf10 = v_ne(v_mf10, zero); + v_mf11 = v_ne(v_mf11, zero); - v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha), v_dst00); - v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha), v_dst01); - v_dst10 = v_select(v_mf10, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha), v_dst10); - v_dst11 = v_select(v_mf11, v_fma(v_dst11, v_beta, 
v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha), v_dst11); + v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src00)), v_alpha)), v_dst00); + v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src01)), v_alpha)), v_dst01); + v_dst10 = v_select(v_mf10, v_fma(v_dst10, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src10)), v_alpha)), v_dst10); + v_dst11 = v_select(v_mf11, v_fma(v_dst11, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src11)), v_alpha)), v_dst11); v_store(dst + x , v_dst00); v_store(dst + x + step , v_dst01); @@ -2719,25 +2719,25 @@ void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn v_mf10 = v_cvt_f32(v_reinterpret_as_s32(v_m10)); v_mf11 = v_cvt_f32(v_reinterpret_as_s32(v_m11)); - v_mf00 = v_mf00 != zero; - v_mf01 = v_mf01 != zero; - v_mf10 = v_mf10 != zero; - v_mf11 = v_mf11 != zero; + v_mf00 = v_ne(v_mf00, zero); + v_mf01 = v_ne(v_mf01, zero); + v_mf10 = v_ne(v_mf10, zero); + v_mf11 = v_ne(v_mf11, zero); - v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src000)) * v_alpha), v_dst00); - v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src001)) * v_alpha), v_dst01); - v_dst02 = v_select(v_mf10, v_fma(v_dst02, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src010)) * v_alpha), v_dst02); - v_dst03 = v_select(v_mf11, v_fma(v_dst03, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src011)) * v_alpha), v_dst03); + v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src000)), v_alpha)), v_dst00); + v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src001)), v_alpha)), v_dst01); + v_dst02 = v_select(v_mf10, v_fma(v_dst02, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src010)), v_alpha)), v_dst02); + v_dst03 = v_select(v_mf11, v_fma(v_dst03, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src011)), v_alpha)), v_dst03); - v_dst10 = v_select(v_mf00, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src100)) * v_alpha), v_dst10); - v_dst11 = v_select(v_mf01, v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src101)) * v_alpha), v_dst11); - v_dst12 = v_select(v_mf10, v_fma(v_dst12, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src110)) * v_alpha), v_dst12); - v_dst13 = v_select(v_mf11, v_fma(v_dst13, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src111)) * v_alpha), v_dst13); + v_dst10 = v_select(v_mf00, v_fma(v_dst10, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src100)), v_alpha)), v_dst10); + v_dst11 = v_select(v_mf01, v_fma(v_dst11, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src101)), v_alpha)), v_dst11); + v_dst12 = v_select(v_mf10, v_fma(v_dst12, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src110)), v_alpha)), v_dst12); + v_dst13 = v_select(v_mf11, v_fma(v_dst13, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src111)), v_alpha)), v_dst13); - v_dst20 = v_select(v_mf00, v_fma(v_dst20, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src200)) * v_alpha), v_dst20); - v_dst21 = v_select(v_mf01, v_fma(v_dst21, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src201)) * v_alpha), v_dst21); - v_dst22 = v_select(v_mf10, v_fma(v_dst22, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src210)) * v_alpha), v_dst22); - v_dst23 = v_select(v_mf11, v_fma(v_dst23, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src211)) * v_alpha), v_dst23); + v_dst20 = v_select(v_mf00, v_fma(v_dst20, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src200)), v_alpha)), v_dst20); + v_dst21 = 
v_select(v_mf01, v_fma(v_dst21, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src201)), v_alpha)), v_dst21); + v_dst22 = v_select(v_mf10, v_fma(v_dst22, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src210)), v_alpha)), v_dst22); + v_dst23 = v_select(v_mf11, v_fma(v_dst23, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src211)), v_alpha)), v_dst23); v_store_interleave(dst + x * cn , v_dst00, v_dst10, v_dst20); v_store_interleave(dst + ( x + step ) * cn, v_dst01, v_dst11, v_dst21); @@ -2753,11 +2753,11 @@ void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn, double alpha) { int x = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) const v_float32 v_alpha = vx_setall_f32((float)alpha); const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha)); - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float32::nlanes; + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2770,8 +2770,8 @@ void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int c v_float32 v_dst0 = vx_load(dst + x); v_float32 v_dst1 = vx_load(dst + x + step); - v_dst0 = v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int0)) * v_alpha); - v_dst1 = v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int1)) * v_alpha); + v_dst0 = v_fma(v_dst0, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_int0)), v_alpha)); + v_dst1 = v_fma(v_dst1, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_int1)), v_alpha)); v_store(dst + x , v_dst0); v_store(dst + x + step, v_dst1); @@ -2799,11 +2799,11 @@ void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int c v_float32 v_dst0 = vx_load(dst + x); v_float32 v_dst1 = vx_load(dst + x + step); - v_mf0 = v_mf0 != zero; - v_mf1 = v_mf1 != zero; + v_mf0 = v_ne(v_mf0, zero); + v_mf1 = v_ne(v_mf1, zero); - v_dst0 = v_select(v_mf0, v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src0)) * v_alpha), v_dst0); - v_dst1 = v_select(v_mf1, v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src1)) * v_alpha), v_dst1); + v_dst0 = v_select(v_mf0, v_fma(v_dst0, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src0)), v_alpha)), v_dst0); + v_dst1 = v_select(v_mf1, v_fma(v_dst1, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src1)), v_alpha)), v_dst1); v_store(dst + x , v_dst0); v_store(dst + x + step, v_dst1); @@ -2833,16 +2833,16 @@ void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int c v_mf0 = v_cvt_f32(v_reinterpret_as_s32(v_m0)); v_mf1 = v_cvt_f32(v_reinterpret_as_s32(v_m1)); - v_mf0 = v_mf0 != zero; - v_mf1 = v_mf1 != zero; + v_mf0 = v_ne(v_mf0, zero); + v_mf1 = v_ne(v_mf1, zero); - v_dst00 = v_select(v_mf0, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha), v_dst00); - v_dst10 = v_select(v_mf0, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha), v_dst10); - v_dst20 = v_select(v_mf0, v_fma(v_dst20, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src20)) * v_alpha), v_dst20); + v_dst00 = v_select(v_mf0, v_fma(v_dst00, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src00)), v_alpha)), v_dst00); + v_dst10 = v_select(v_mf0, v_fma(v_dst10, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src10)), v_alpha)), v_dst10); + v_dst20 = v_select(v_mf0, v_fma(v_dst20, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src20)), v_alpha)), v_dst20); - v_dst01 = v_select(v_mf1, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * 
v_alpha), v_dst01); - v_dst11 = v_select(v_mf1, v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha), v_dst11); - v_dst21 = v_select(v_mf1, v_fma(v_dst21, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src21)) * v_alpha), v_dst21); + v_dst01 = v_select(v_mf1, v_fma(v_dst01, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src01)), v_alpha)), v_dst01); + v_dst11 = v_select(v_mf1, v_fma(v_dst11, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src11)), v_alpha)), v_dst11); + v_dst21 = v_select(v_mf1, v_fma(v_dst21, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src21)), v_alpha)), v_dst21); v_store_interleave(dst + x * cn , v_dst00, v_dst10, v_dst20); v_store_interleave(dst + ( x + step ) * cn, v_dst01, v_dst11, v_dst21); @@ -2870,11 +2870,11 @@ void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn _mm256_storeu_ps(dst + x + 8, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha))); } } -#elif CV_SIMD +#elif (CV_SIMD || CV_SIMD_SCALABLE) const v_float32 v_alpha = vx_setall_f32((float)alpha); const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha)); - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float32::nlanes; + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2884,8 +2884,8 @@ void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn v_float32 v_dst0 = vx_load(dst + x); v_float32 v_dst1 = vx_load(dst + x + step); - v_dst0 = v_fma(v_dst0, v_beta, vx_load(src + x) * v_alpha); - v_dst1 = v_fma(v_dst1, v_beta, vx_load(src + x + step) * v_alpha); + v_dst0 = v_fma(v_dst0, v_beta, v_mul(vx_load(src + x), v_alpha)); + v_dst1 = v_fma(v_dst1, v_beta, v_mul(vx_load(src + x + step), v_alpha)); v_store(dst + x, v_dst0); v_store(dst + x + step, v_dst1); @@ -2898,11 +2898,11 @@ void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha) { int x = 0; -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) const v_float64 v_alpha = vx_setall_f64(alpha); const v_float64 v_beta = vx_setall_f64(1.0f - alpha); - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float64::nlanes; + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2927,10 +2927,10 @@ void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int c v_float64 v_dst2 = vx_load(dst + x + step * 2); v_float64 v_dst3 = vx_load(dst + x + step * 3); - v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha); - v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha); - v_dst2 = v_fma(v_dst2, v_beta, v_src2 * v_alpha); - v_dst3 = v_fma(v_dst3, v_beta, v_src3 * v_alpha); + v_dst0 = v_fma(v_dst0, v_beta, v_mul(v_src0, v_alpha)); + v_dst1 = v_fma(v_dst1, v_beta, v_mul(v_src1, v_alpha)); + v_dst2 = v_fma(v_dst2, v_beta, v_mul(v_src2, v_alpha)); + v_dst3 = v_fma(v_dst3, v_beta, v_mul(v_src3, v_alpha)); v_store(dst + x, v_dst0); v_store(dst + x + step, v_dst1); @@ -2945,11 +2945,11 @@ void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int c void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha) { int x = 0; -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) const v_float64 v_alpha = vx_setall_f64(alpha); const v_float64 v_beta = vx_setall_f64(1.0f - alpha); - const int cVectorWidth = v_uint16::nlanes; - 
const int step = v_float64::nlanes; + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2973,10 +2973,10 @@ void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int v_float64 v_dst10 = vx_load(dst + x + step * 2); v_float64 v_dst11 = vx_load(dst + x + step * 3); - v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha); - v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha); - v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha); - v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha); + v_dst00 = v_fma(v_dst00, v_beta, v_mul(v_src00, v_alpha)); + v_dst01 = v_fma(v_dst01, v_beta, v_mul(v_src01, v_alpha)); + v_dst10 = v_fma(v_dst10, v_beta, v_mul(v_src10, v_alpha)); + v_dst11 = v_fma(v_dst11, v_beta, v_mul(v_src11, v_alpha)); v_store(dst + x, v_dst00); v_store(dst + x + step, v_dst01); @@ -3014,11 +3014,11 @@ void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int c _mm256_storeu_pd(dst + x + 12, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 12), v_beta), _mm256_mul_pd(v_src11, v_alpha))); } } -#elif CV_SIMD_64F +#elif (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) const v_float64 v_alpha = vx_setall_f64(alpha); const v_float64 v_beta = vx_setall_f64(1.0f - alpha); - const int cVectorWidth = v_float32::nlanes * 2; - const int step = v_float64::nlanes; + const int cVectorWidth = VTraits::vlanes() * 2; + const int step = VTraits::vlanes(); if (!mask) { @@ -3026,7 +3026,7 @@ void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int c for (; x <= size - cVectorWidth; x += cVectorWidth) { v_float32 v_src0 = vx_load(src + x); - v_float32 v_src1 = vx_load(src + x + v_float32::nlanes); + v_float32 v_src1 = vx_load(src + x + VTraits::vlanes()); v_float64 v_src00 = v_cvt_f64(v_src0); v_float64 v_src01 = v_cvt_f64_high(v_src0); v_float64 v_src10 = v_cvt_f64(v_src1); @@ -3037,10 +3037,10 @@ void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int c v_float64 v_dst10 = vx_load(dst + x + step * 2); v_float64 v_dst11 = vx_load(dst + x + step * 3); - v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha); - v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha); - v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha); - v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha); + v_dst00 = v_fma(v_dst00, v_beta, v_mul(v_src00, v_alpha)); + v_dst01 = v_fma(v_dst01, v_beta, v_mul(v_src01, v_alpha)); + v_dst10 = v_fma(v_dst10, v_beta, v_mul(v_src10, v_alpha)); + v_dst11 = v_fma(v_dst11, v_beta, v_mul(v_src11, v_alpha)); v_store(dst + x, v_dst00); v_store(dst + x + step, v_dst01); @@ -3072,11 +3072,11 @@ void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src1, v_alpha))); } } -#elif CV_SIMD_64F +#elif (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) const v_float64 v_alpha = vx_setall_f64(alpha); const v_float64 v_beta = vx_setall_f64(1.0f - alpha); - const int cVectorWidth = v_float64::nlanes * 2; - const int step = v_float64::nlanes; + const int cVectorWidth = VTraits::vlanes() * 2; + const int step = VTraits::vlanes(); if (!mask) { @@ -3089,8 +3089,8 @@ void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int v_float64 v_dst0 = vx_load(dst + x); v_float64 v_dst1 = vx_load(dst + x + step); - v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha); - v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha); + v_dst0 = v_fma(v_dst0, v_beta, v_mul(v_src0, 
v_alpha)); + v_dst1 = v_fma(v_dst1, v_beta, v_mul(v_src1, v_alpha)); v_store(dst + x, v_dst0); v_store(dst + x + step, v_dst1); diff --git a/modules/imgproc/src/blend.cpp b/modules/imgproc/src/blend.cpp index 5a1296b50958..accb45e7ad87 100644 --- a/modules/imgproc/src/blend.cpp +++ b/modules/imgproc/src/blend.cpp @@ -48,12 +48,12 @@ #include "opencv2/core/hal/intrin.hpp" namespace cv { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_float32 blend(const v_float32& v_src1, const v_float32& v_src2, const v_float32& v_w1, const v_float32& v_w2) { const v_float32 v_eps = vx_setall_f32(1e-5f); - v_float32 v_denom = v_w1 + v_w2 + v_eps; - return (v_src1 * v_w1 + v_src2 * v_w2) / v_denom; + v_float32 v_denom = v_add(v_add(v_w1, v_w2), v_eps); + return v_div(v_add(v_mul(v_src1, v_w1), v_mul(v_src2, v_w2)), v_denom); } static inline v_float32 blend(const v_float32& v_src1, const v_float32& v_src2, const float* w_ptr1, const float* w_ptr2, int offset) { @@ -105,7 +105,7 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, switch(cn) { case 1: - for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_uint8::nlanes) + for(int weight_offset = 0 ; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), weight_offset += VTraits::vlanes()) { v_float32 v_src10, v_src11, v_src12, v_src13; v_float32 v_src20, v_src21, v_src22, v_src23; @@ -113,15 +113,15 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23); v_float32 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset); - v_float32 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + v_float32::nlanes); - v_float32 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 2*v_float32::nlanes); - v_float32 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 3*v_float32::nlanes); + v_float32 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + VTraits::vlanes()); + v_float32 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 2*VTraits::vlanes()); + v_float32 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 3*VTraits::vlanes()); store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3); } break; case 2: - for(int weight_offset = 0 ; x <= width - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes, weight_offset += v_uint8::nlanes) + for(int weight_offset = 0 ; x <= width - 2*VTraits::vlanes(); x += 2*VTraits::vlanes(), weight_offset += VTraits::vlanes()) { v_uint8 v_src10, v_src11, v_src20, v_src21; v_load_deinterleave(src1 + x, v_src10, v_src11); @@ -135,12 +135,12 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, v_float32 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset); v_float32 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset); - v_float32 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + v_float32::nlanes); - v_float32 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + v_float32::nlanes); - v_float32 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 2*v_float32::nlanes); - v_float32 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 2*v_float32::nlanes); - v_float32 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 3*v_float32::nlanes); - v_float32 v_dst7 = blend(v_src113, v_src213, weights1, 
weights2, weight_offset + 3*v_float32::nlanes); + v_float32 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + VTraits::vlanes()); + v_float32 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + VTraits::vlanes()); + v_float32 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 2*VTraits::vlanes()); + v_float32 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 2*VTraits::vlanes()); + v_float32 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 3*VTraits::vlanes()); + v_float32 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 3*VTraits::vlanes()); v_uint8 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6); v_uint8 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7); @@ -148,7 +148,7 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, } break; case 3: - for(int weight_offset = 0 ; x <= width - 3*v_uint8::nlanes; x += 3*v_uint8::nlanes, weight_offset += v_uint8::nlanes) + for(int weight_offset = 0 ; x <= width - 3*VTraits::vlanes(); x += 3*VTraits::vlanes(), weight_offset += VTraits::vlanes()) { v_uint8 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22; v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12); @@ -164,13 +164,13 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, expand_u8tof32(v_src22, v_src220, v_src221, v_src222, v_src223); v_float32 v_w10 = vx_load(weights1 + weight_offset); - v_float32 v_w11 = vx_load(weights1 + weight_offset + v_float32::nlanes); - v_float32 v_w12 = vx_load(weights1 + weight_offset + 2*v_float32::nlanes); - v_float32 v_w13 = vx_load(weights1 + weight_offset + 3*v_float32::nlanes); + v_float32 v_w11 = vx_load(weights1 + weight_offset + VTraits::vlanes()); + v_float32 v_w12 = vx_load(weights1 + weight_offset + 2*VTraits::vlanes()); + v_float32 v_w13 = vx_load(weights1 + weight_offset + 3*VTraits::vlanes()); v_float32 v_w20 = vx_load(weights2 + weight_offset); - v_float32 v_w21 = vx_load(weights2 + weight_offset + v_float32::nlanes); - v_float32 v_w22 = vx_load(weights2 + weight_offset + 2*v_float32::nlanes); - v_float32 v_w23 = vx_load(weights2 + weight_offset + 3*v_float32::nlanes); + v_float32 v_w21 = vx_load(weights2 + weight_offset + VTraits::vlanes()); + v_float32 v_w22 = vx_load(weights2 + weight_offset + 2*VTraits::vlanes()); + v_float32 v_w23 = vx_load(weights2 + weight_offset + 3*VTraits::vlanes()); v_src100 = blend(v_src100, v_src200, v_w10, v_w20); v_src110 = blend(v_src110, v_src210, v_w10, v_w20); v_src120 = blend(v_src120, v_src220, v_w10, v_w20); @@ -192,7 +192,7 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, } break; case 4: - for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_float32::nlanes) + for(int weight_offset = 0 ; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), weight_offset += VTraits::vlanes()) { v_float32 v_src10, v_src11, v_src12, v_src13; v_float32 v_src20, v_src21, v_src22, v_src23; @@ -229,7 +229,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1, switch(cn) { case 1: - for(int weight_offset = 0 ; x <= width - v_float32::nlanes; x += v_float32::nlanes, weight_offset += v_float32::nlanes) + for(int weight_offset = 0 ; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), weight_offset += VTraits::vlanes()) { v_float32 v_src1 = vx_load(src1 + x); v_float32 v_src2 = vx_load(src2 + x); @@ -242,7 +242,7 @@ int 
blendLinearSimd(const float* src1, const float* src2, const float* weights1, } break; case 2: - for(int weight_offset = 0 ; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, weight_offset += v_float32::nlanes) + for(int weight_offset = 0 ; x <= width - 2*VTraits::vlanes(); x += 2*VTraits::vlanes(), weight_offset += VTraits::vlanes()) { v_float32 v_src10, v_src11, v_src20, v_src21; v_load_deinterleave(src1 + x, v_src10, v_src11); @@ -257,7 +257,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1, } break; case 3: - for(int weight_offset = 0 ; x <= width - 3*v_float32::nlanes; x += 3*v_float32::nlanes, weight_offset += v_float32::nlanes) + for(int weight_offset = 0 ; x <= width - 3*VTraits::vlanes(); x += 3*VTraits::vlanes(), weight_offset += VTraits::vlanes()) { v_float32 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22; v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12); @@ -273,7 +273,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1, } break; case 4: - for(int weight_offset = 0 ; x <= width - 4*v_float32::nlanes; x += 4*v_float32::nlanes, weight_offset += v_float32::nlanes) + for(int weight_offset = 0 ; x <= width - 4*VTraits::vlanes(); x += 4*VTraits::vlanes(), weight_offset += VTraits::vlanes()) { v_float32 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23; v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12, v_src13); @@ -320,7 +320,7 @@ class BlendLinearInvoker : T * const dst_row = dst->ptr(y); int x = 0; - #if CV_SIMD + #if (CV_SIMD || CV_SIMD_SCALABLE) x = blendLinearSimd(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn); #endif diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp index 9c14929dc8aa..2fed0ba0c233 100644 --- a/modules/imgproc/src/canny.cpp +++ b/modules/imgproc/src/canny.cpp @@ -306,11 +306,11 @@ class parallelCanny : public ParallelLoopBody src(_src), src2(_src), map(_map), _borderPeaksParallel(borderPeaksParallel), low(_low), high(_high), aperture_size(_aperture_size), L2gradient(_L2gradient) { -#if CV_SIMD - for(int i = 0; i < v_int8::nlanes; ++i) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for(int i = 0; i < VTraits::vlanes(); ++i) { smask[i] = 0; - smask[i + v_int8::nlanes] = (schar)-1; + smask[i + VTraits::vlanes()] = (schar)-1; } if (true) _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_SIMD_WIDTH + 1), CV_SIMD_WIDTH), CV_8UC1); @@ -330,11 +330,11 @@ class parallelCanny : public ParallelLoopBody src(_dx), src2(_dy), map(_map), _borderPeaksParallel(borderPeaksParallel), low(_low), high(_high), aperture_size(0), L2gradient(_L2gradient) { -#if CV_SIMD - for(int i = 0; i < v_int8::nlanes; ++i) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for(int i = 0; i < VTraits::vlanes(); ++i) { smask[i] = 0; - smask[i + v_int8::nlanes] = (schar)-1; + smask[i + VTraits::vlanes()] = (schar)-1; } if (true) _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_SIMD_WIDTH + 1), CV_SIMD_WIDTH), CV_8UC1); @@ -396,7 +396,7 @@ class parallelCanny : public ParallelLoopBody } // _mag_p: previous row, _mag_a: actual row, _mag_n: next row -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) AutoBuffer buffer(3 * (mapstep * cn + CV_SIMD_WIDTH)); _mag_p = alignPtr(buffer.data() + 1, CV_SIMD_WIDTH); _mag_a = alignPtr(_mag_p + mapstep * cn, CV_SIMD_WIDTH); @@ -436,8 +436,8 @@ class parallelCanny : public ParallelLoopBody if (L2gradient) { int j = 0, width = src.cols * cn; -#if CV_SIMD - for ( ; j <= width - v_int16::nlanes; j += 
v_int16::nlanes) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for ( ; j <= width - VTraits::vlanes(); j += VTraits::vlanes()) { v_int16 v_dx = vx_load((const short*)(_dx + j)); v_int16 v_dy = vx_load((const short*)(_dy + j)); @@ -447,8 +447,8 @@ class parallelCanny : public ParallelLoopBody v_expand(v_dx, v_dxp_low, v_dxp_high); v_expand(v_dy, v_dyp_low, v_dyp_high); - v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low); - v_store_aligned((int *)(_mag_n + j + v_int32::nlanes), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high); + v_store_aligned((int *)(_mag_n + j), v_add(v_mul(v_dxp_low, v_dxp_low), v_mul(v_dyp_low, v_dyp_low))); + v_store_aligned((int *)(_mag_n + j + VTraits::vlanes()), v_add(v_mul(v_dxp_high, v_dxp_high), v_mul(v_dyp_high, v_dyp_high))); } #endif for ( ; j < width; ++j) @@ -457,8 +457,8 @@ class parallelCanny : public ParallelLoopBody else { int j = 0, width = src.cols * cn; -#if CV_SIMD - for(; j <= width - v_int16::nlanes; j += v_int16::nlanes) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for(; j <= width - VTraits::vlanes(); j += VTraits::vlanes()) { v_int16 v_dx = vx_load((const short *)(_dx + j)); v_int16 v_dy = vx_load((const short *)(_dy + j)); @@ -470,8 +470,8 @@ class parallelCanny : public ParallelLoopBody v_expand(v_dx, v_dx_ml, v_dx_mh); v_expand(v_dy, v_dy_ml, v_dy_mh); - v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml); - v_store_aligned((int *)(_mag_n + j + v_int32::nlanes), v_dx_mh + v_dy_mh); + v_store_aligned((int *)(_mag_n + j), v_add(v_dx_ml, v_dy_ml)); + v_store_aligned((int *)(_mag_n + j + VTraits::vlanes()), v_add(v_dx_mh, v_dy_mh)); } #endif for ( ; j < width; ++j) @@ -515,7 +515,7 @@ class parallelCanny : public ParallelLoopBody // From here actual src row is (i - 1) // Set left and right border to 1 -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) if (true) _pmap = map.ptr(i) + CV_SIMD_WIDTH; else @@ -537,22 +537,22 @@ class parallelCanny : public ParallelLoopBody const int TG22 = 13573; int j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) { const v_int32 v_low = vx_setall_s32(low); const v_int8 v_one = vx_setall_s8(1); - for (; j <= src.cols - v_int8::nlanes; j += v_int8::nlanes) + for (; j <= src.cols - VTraits::vlanes(); j += VTraits::vlanes()) { v_store_aligned((signed char*)(_pmap + j), v_one); - v_int8 v_cmp = v_pack(v_pack(vx_load_aligned((const int*)(_mag_a + j )) > v_low, - vx_load_aligned((const int*)(_mag_a + j + v_int32::nlanes)) > v_low), - v_pack(vx_load_aligned((const int*)(_mag_a + j + 2*v_int32::nlanes)) > v_low, - vx_load_aligned((const int*)(_mag_a + j + 3*v_int32::nlanes)) > v_low)); + v_int8 v_cmp = v_pack(v_pack(v_gt(vx_load_aligned((const int *)(_mag_a + j)), v_low), + v_gt(vx_load_aligned((const int *)(_mag_a + j + VTraits::vlanes())), v_low)), + v_pack(v_gt(vx_load_aligned((const int *)(_mag_a + j + 2 * VTraits::vlanes())), v_low), + v_gt(vx_load_aligned((const int *)(_mag_a + j + 3 * VTraits::vlanes())), v_low))); while (v_check_any(v_cmp)) { int l = v_scan_forward(v_cmp); - v_cmp &= vx_load(smask + v_int8::nlanes - 1 - l); + v_cmp = v_and(v_cmp, vx_load(smask + VTraits::vlanes() - 1 - l)); int k = j + l; int m = _mag_a[k]; @@ -693,8 +693,8 @@ class parallelCanny : public ParallelLoopBody ptrdiff_t mapstep; int cn; mutable Mutex mutex; -#if CV_SIMD - schar smask[2*v_int8::nlanes]; +#if (CV_SIMD || CV_SIMD_SCALABLE) + schar smask[2*VTraits::max_nlanes]; #endif }; @@ -718,31 +718,31 @@ class finalPass : public ParallelLoopBody int j = 0; uchar *pdst = dst.ptr(i); const uchar *pmap = map.ptr(i + 1); -#if 
CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) if (true) pmap += CV_SIMD_WIDTH; else #endif pmap += 1; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) { const v_uint8 v_zero = vx_setzero_u8(); - const v_uint8 v_ff = ~v_zero; + const v_uint8 v_ff = v_not(v_zero); const v_uint8 v_two = vx_setall_u8(2); - for (; j <= dst.cols - v_uint8::nlanes; j += v_uint8::nlanes) + for (; j <= dst.cols - VTraits::vlanes(); j += VTraits::vlanes()) { v_uint8 v_pmap = vx_load_aligned((const unsigned char*)(pmap + j)); - v_pmap = v_select(v_pmap == v_two, v_ff, v_zero); + v_pmap = v_select(v_eq(v_pmap, v_two), v_ff, v_zero); v_store((pdst + j), v_pmap); } - if (j <= dst.cols - v_uint8::nlanes/2) + if (j <= dst.cols - VTraits::vlanes()/2) { v_uint8 v_pmap = vx_load_low((const unsigned char*)(pmap + j)); - v_pmap = v_select(v_pmap == v_two, v_ff, v_zero); + v_pmap = v_select(v_eq(v_pmap, v_two), v_ff, v_zero); v_store_low((pdst + j), v_pmap); - j += v_uint8::nlanes/2; + j += VTraits::vlanes()/2; } } #endif diff --git a/modules/imgproc/src/color_rgb.simd.hpp b/modules/imgproc/src/color_rgb.simd.hpp index 6e1102019749..67e2febd5b7f 100644 --- a/modules/imgproc/src/color_rgb.simd.hpp +++ b/modules/imgproc/src/color_rgb.simd.hpp @@ -122,8 +122,8 @@ struct RGB2RGB int i = 0; _Tp alphav = ColorChannel<_Tp>::max(); -#if CV_SIMD - const int vsize = vt::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); for(; i <= n-vsize; i += vsize, src += vsize*scn, dst += vsize*dcn) @@ -138,8 +138,13 @@ struct RGB2RGB v_load_deinterleave(src, a, b, c); d = v_set<_Tp>::set(alphav); } - if(bi == 2) + if(bi == 2) { + #if CV_SIMD_SCALABLE + auto t = a; a = c; c = t; // swap(a, c); + #else swap(a, c); + #endif + } if(dcn == 4) { @@ -185,53 +190,57 @@ struct RGB5x52RGB int dcn = dstcn, bidx = blueIdx, gb = greenBits; int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_uint8 vz = vx_setzero_u8(), vn0 = vx_setall_u8(255); for(; i <= n-vsize; i += vsize, src += vsize*sizeof(ushort), dst += vsize*dcn) { v_uint16 t0 = v_reinterpret_as_u16(vx_load(src)); v_uint16 t1 = v_reinterpret_as_u16(vx_load(src + - sizeof(ushort)*v_uint16::nlanes)); + sizeof(ushort)*VTraits::vlanes())); //TODO: shorten registers use when v_interleave is available v_uint8 r, g, b, a; - v_uint16 b0 = (t0 << 11) >> 8; - v_uint16 b1 = (t1 << 11) >> 8; + v_uint16 b0 = v_shr<8>(v_shl<11>(t0)); + v_uint16 b1 = v_shr<8>(v_shl<11>(t1)); b = v_pack(b0, b1); v_uint16 g0, g1, r0, r1, a0, a1; if( gb == 6 ) { - g0 = ((t0 >> 5) << 10) >> 8; - g1 = ((t1 >> 5) << 10) >> 8; + g0 = v_shr<8>(v_shl<10>(v_shr<5>(t0))); + g1 = v_shr<8>(v_shl<10>(v_shr<5>(t1))); - r0 = (t0 >> 11) << 3; - r1 = (t1 >> 11) << 3; + r0 = v_shl<3>(v_shr<11>(t0)); + r1 = v_shl<3>(v_shr<11>(t1)); a = vn0; } else { - g0 = ((t0 >> 5) << 11) >> 8; - g1 = ((t1 >> 5) << 11) >> 8; + g0 = v_shr<8>(v_shl<11>(v_shr<5>(t0))); + g1 = v_shr<8>(v_shl<11>(v_shr<5>(t1))); - r0 = ((t0 >> 10) << 11) >> 8; - r1 = ((t1 >> 10) << 11) >> 8; + r0 = v_shr<8>(v_shl<11>(v_shr<10>(t0))); + r1 = v_shr<8>(v_shl<11>(v_shr<10>(t1))); - a0 = t0 >> 15; - a1 = t1 >> 15; + a0 = v_shr<15>(t0); + a1 = v_shr<15>(t1); a = v_pack(a0, a1); - a = a != vz; + a = v_ne(a, vz); } g = v_pack(g0, g1); r = v_pack(r0, r1); - if(bidx == 2) + if(bidx == 2) { + #if CV_SIMD_SCALABLE + auto t = r; r = b; b = t; // swap(b, r); + #else swap(b, r); - + #endif + } if(dcn == 4) { v_store_interleave(dst, b, g, r, a); @@ -289,8 +298,8 @@ struct RGB2RGB5x5 int scn = 
srccn, bidx = blueIdx, gb = greenBits; int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_uint16 vn3 = vx_setall_u16((ushort)(~3)); v_uint16 vn7 = vx_setall_u16((ushort)(~7)); v_uint16 vz = vx_setzero_u16(); @@ -308,10 +317,15 @@ struct RGB2RGB5x5 { v_load_deinterleave(src, b, g, r, a); } - if(bidx == 2) + if(bidx == 2){ + #if CV_SIMD_SCALABLE + auto t = r; r = b; b = t; // swap(b, r); + #else swap(b, r); + #endif + } - r = r & v7; + r = v_and(r, v7); //TODO: shorten registers use when v_deinterleave is available v_uint16 r0, r1, g0, g1, b0, b1, a0, a1; @@ -322,20 +336,20 @@ struct RGB2RGB5x5 v_uint16 d0, d1; - b0 = b0 >> 3; - b1 = b1 >> 3; - a0 = (a0 != vz) << 15; - a1 = (a1 != vz) << 15; + b0 = v_shr<3>(b0); + b1 = v_shr<3>(b1); + a0 = v_shl<15>(v_ne(a0, vz)); + a1 = v_shl<15>(v_ne(a1, vz)); if(gb == 6) { - d0 = b0 | ((g0 & vn3) << 3) | (r0 << 8); - d1 = b1 | ((g1 & vn3) << 3) | (r1 << 8); + d0 = v_or(v_or(b0, v_shl<3>(v_and(g0, vn3))), v_shl<8>(r0)); + d1 = v_or(v_or(b1, v_shl<3>(v_and(g1, vn3))), v_shl<8>(r1)); } else { - d0 = b0 | ((g0 & vn7) << 2) | (r0 << 7) | a0; - d1 = b1 | ((g1 & vn7) << 2) | (r1 << 7) | a1; + d0 = v_or(v_or(v_or(b0, v_shl<2>(v_and(g0, vn7))), v_shl<7>(r0)), a0); + d1 = v_or(v_or(v_or(b1, v_shl<2>(v_and(g1, vn7))), v_shl<7>(r1)), a1); } v_store((ushort*)dst, d0); @@ -382,8 +396,8 @@ struct Gray2RGB int i = 0; _Tp alpha = ColorChannel<_Tp>::max(); -#if CV_SIMD - const int vsize = vt::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); vt valpha = v_set<_Tp>::set(alpha); for(; i <= n-vsize; i += vsize, src += vsize, dst += vsize*dcn) @@ -424,8 +438,8 @@ struct Gray2RGB5x5 { int gb = greenBits; int i = 0; -#if CV_SIMD - const int vsize = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_uint16 v3 = vx_setall_u16((ushort)(~3)); for(; i <= n-vsize; i += vsize, src += vsize, dst += vsize*sizeof(ushort)) @@ -433,16 +447,16 @@ struct Gray2RGB5x5 v_uint8 t8 = vx_load_low(src); v_uint16 t = v_expand_low(t8); - v_uint16 t3 = t >> 3; + v_uint16 t3 = v_shr<3>(t); v_uint16 d = t3; if(gb == 6) { - d |= ((t & v3) << 3) | (t3 << 11); + d = v_or(d, v_or(v_shl<3>(v_and(t, v3)), v_shl<11>(t3))); } else { - d |= (t3 << 5) | (t3 << 10); + d = v_or(d, v_or(v_shl<5>(t3), v_shl<10>(t3))); } v_store((ushort*)dst, d); @@ -488,8 +502,8 @@ struct RGB5x52Gray { int gb = greenBits; int i = 0; -#if CV_SIMD - const int vsize = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_int16 bg2y; v_int16 r12y; @@ -504,17 +518,17 @@ struct RGB5x52Gray v_uint16 t = vx_load((ushort*)src); v_uint16 r, g, b; - b = (t << 11) >> 8; + b = v_shr<8>(v_shl<11>(t)); if(gb == 5) { - g = ((t >> 5) << 11) >> 8; - r = ((t >> 10) << 11) >> 8; + g = v_shr<8>(v_shl<11>(v_shr<5>(t))); + r = v_shr<8>(v_shl<11>(v_shr<10>(t))); } else { - g = ((t >> 5) << 10) >> 8; - r = (t >> 11) << 3; + g = v_shr<8>(v_shl<10>(v_shr<5>(t))); + r = v_shl<3>(v_shr<11>(t)); } v_uint8 d; @@ -530,11 +544,11 @@ struct RGB5x52Gray v_zip(sr, delta, rd0, rd1); v_uint32 d0, d1; - d0 = v_reinterpret_as_u32(v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)); - d1 = v_reinterpret_as_u32(v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)); + d0 = v_reinterpret_as_u32(v_add(v_dotprod(bg0, bg2y), v_dotprod(rd0, r12y))); + d1 = v_reinterpret_as_u32(v_add(v_dotprod(bg1, bg2y), v_dotprod(rd1, r12y))); - d0 = d0 >> shift; - d1 = d1 >> shift; + d0 = v_shr(d0); + d1 = 
v_shr(d1); dx = v_pack(d0, d1); // high part isn't used @@ -611,8 +625,8 @@ struct RGB2Gray int scn = srccn, i = 0; float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; -#if CV_SIMD - const int vsize = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_float32 rv = vx_setall_f32(cr), gv = vx_setall_f32(cg), bv = vx_setall_f32(cb); for(; i <= n-vsize; i += vsize, src += vsize*scn, dst += vsize) @@ -627,7 +641,7 @@ struct RGB2Gray v_load_deinterleave(src, b, g, r, a); } - v_float32 d = v_fma(r, rv, v_fma(g, gv, b*bv)); + v_float32 d = v_fma(r, rv, v_fma(g, gv, v_mul(b, bv))); v_store(dst, d); } @@ -669,8 +683,8 @@ struct RGB2Gray short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_int16 bg2y; v_int16 r12y; v_int16 dummy; @@ -706,10 +720,10 @@ struct RGB2Gray v_zip(v_reinterpret_as_s16(r1), delta, rd10, rd11); v_uint32 y00, y01, y10, y11; - y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift; - y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift; - y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift; - y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift; + y00 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg00, bg2y), v_dotprod(rd00, r12y)))); + y01 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg01, bg2y), v_dotprod(rd01, r12y)))); + y10 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg10, bg2y), v_dotprod(rd10, r12y)))); + y11 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg11, bg2y), v_dotprod(rd11, r12y)))); v_uint16 y0, y1; y0 = v_pack(y00, y01); @@ -762,8 +776,8 @@ struct RGB2Gray short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; int i = 0; -#if CV_SIMD - const int vsize = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_int16 b2y = vx_setall_s16(cb); v_int16 g2y = vx_setall_s16(cg); @@ -802,13 +816,13 @@ struct RGB2Gray // fixing 16bit signed multiplication v_int16 mr, mg, mb; - mr = (sr < z) & r2y; - mg = (sg < z) & g2y; - mb = (sb < z) & b2y; - v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift; + mr = v_and(v_lt(sr, z), r2y); + mg = v_and(v_lt(sg, z), g2y); + mb = v_and(v_lt(sb, z), b2y); + v_int16 fixmul = v_shl(v_add_wrap(mr, v_add_wrap(mg, mb))); - v_int32 sy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift; - v_int32 sy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift; + v_int32 sy0 = v_shr(v_add(v_dotprod(bg0, bg2y), v_dotprod(rd0, r12y))); + v_int32 sy1 = v_shr(v_add(v_dotprod(bg1, bg2y), v_dotprod(rd1, r12y))); v_int16 y = v_add_wrap(v_pack(sy0, sy1), fixmul); @@ -973,8 +987,8 @@ struct mRGBA2RGBA uchar max_val = ColorChannel::max(); int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000)); v_uint8 vmax = vx_setall_u8(max_val); @@ -989,9 +1003,9 @@ struct mRGBA2RGBA v_uint8 a; v_uint16 a16; v_uint32 a32; - a16 = v_reinterpret_as_u16(s & amask); - a32 = v_reinterpret_as_u32(a16 | (a16 >> 8)); - a = v_reinterpret_as_u8(a32 | (a32 >> 16)); + a16 = v_reinterpret_as_u16(v_and(s, amask)); + a32 = v_reinterpret_as_u32(v_or(a16, v_shr<8>(a16))); + a = v_reinterpret_as_u8(v_or(a32, v_shr<16>(a32))); // s *= max_val v_uint16 s0, s1; @@ -1000,7 +1014,7 @@ struct 
mRGBA2RGBA // s += a/2 v_uint16 ae0, ae1; v_expand(a, ae0, ae1); - s0 += ae0 >> 1; s1 += ae1 >> 1; + s0 = v_add(s0, v_shr<1>(ae0)); s1 = v_add(s1, v_shr<1>(ae1)); // s, a -> u32 -> float v_uint32 u00, u01, u10, u11; @@ -1035,10 +1049,10 @@ struct mRGBA2RGBA // float d = (float)s/(float)a v_float32 fd00, fd01, fd10, fd11; - fd00 = fs00/fa00; - fd01 = fs01/fa01; - fd10 = fs10/fa10; - fd11 = fs11/fa11; + fd00 = v_div(fs00, fa00); + fd01 = v_div(fs01, fa01); + fd10 = v_div(fs10, fa10); + fd11 = v_div(fs11, fa11); // d -> u32 -> u8 v_uint32 ud00, ud01, ud10, ud11; @@ -1054,8 +1068,8 @@ struct mRGBA2RGBA // if a == 0 then d = 0 v_uint8 am; - am = a != vx_setzero_u8(); - d = d & am; + am = v_ne(a, vx_setzero_u8()); + d = v_and(d, am); // put alpha values d = v_select(amask, a, d); diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index d8823206f29c..3e3096e7a57a 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -1080,7 +1080,7 @@ cvFindNextContour( CvContourScanner scanner ) } else { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) if ((p = img[x]) != prev) { goto _next_contour; @@ -1088,9 +1088,9 @@ cvFindNextContour( CvContourScanner scanner ) else { v_uint8 v_prev = vx_setall_u8((uchar)prev); - for (; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) + for (; x <= width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes()) { - v_uint8 vmask = (vx_load((uchar*)(img + x)) != v_prev); + v_uint8 vmask = (v_ne(vx_load((uchar *)(img + x)), v_prev)); if (v_check_any(vmask)) { p = img[(x += v_scan_forward(vmask))]; @@ -1105,7 +1105,7 @@ cvFindNextContour( CvContourScanner scanner ) if( x >= width ) break; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) _next_contour: #endif { @@ -1353,11 +1353,11 @@ CvLinkedRunPoint; inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_uint8 v_zero = vx_setzero_u8(); - for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes) + for (; j <= img_size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes()) { - v_uint8 vmask = (vx_load((uchar*)(src_data + j)) != v_zero); + v_uint8 vmask = (v_ne(vx_load((uchar *)(src_data + j)), v_zero)); if (v_check_any(vmask)) { j += v_scan_forward(vmask); @@ -1372,7 +1372,7 @@ inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j) inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) if (j < img_size.width && !src_data[j]) { return j; @@ -1380,9 +1380,9 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j) else { v_uint8 v_zero = vx_setzero_u8(); - for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes) + for (; j <= img_size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes()) { - v_uint8 vmask = (vx_load((uchar*)(src_data + j)) == v_zero); + v_uint8 vmask = (v_eq(vx_load((uchar *)(src_data + j)), v_zero)); if (v_check_any(vmask)) { j += v_scan_forward(vmask); diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp index f0ea0b5bb5e1..1d6ee1ac04d5 100644 --- a/modules/imgproc/src/corner.cpp +++ b/modules/imgproc/src/corner.cpp @@ -74,21 +74,21 @@ static void calcMinEigenVal( const Mat& _cov, Mat& _dst ) #endif // CV_TRY_AVX j = 0; -#if CV_SIMD128 +#if (CV_SIMD || CV_SIMD_SCALABLE) { - v_float32x4 half = v_setall_f32(0.5f); - for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes ) + v_float32 half = vx_setall_f32(0.5f); + for( ; j <= size.width - 
VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes() ) { - v_float32x4 v_a, v_b, v_c, v_t; + v_float32 v_a, v_b, v_c, v_t; v_load_deinterleave(cov + j*3, v_a, v_b, v_c); - v_a *= half; - v_c *= half; - v_t = v_a - v_c; - v_t = v_muladd(v_b, v_b, (v_t * v_t)); - v_store(dst + j, (v_a + v_c) - v_sqrt(v_t)); + v_a = v_mul(v_a, half); + v_c = v_mul(v_c, half); + v_t = v_sub(v_a, v_c); + v_t = v_muladd(v_b, v_b, (v_mul(v_t, v_t))); + v_store(dst + j, v_sub(v_add(v_a, v_c), v_sqrt(v_t))); } } -#endif // CV_SIMD128 +#endif // CV_SIMD for( ; j < size.width; j++ ) { @@ -127,18 +127,18 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k ) #endif // CV_TRY_AVX j = 0; -#if CV_SIMD128 +#if (CV_SIMD || CV_SIMD_SCALABLE) { - v_float32x4 v_k = v_setall_f32((float)k); + v_float32 v_k = vx_setall_f32((float)k); - for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes ) + for( ; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes() ) { - v_float32x4 v_a, v_b, v_c; + v_float32 v_a, v_b, v_c; v_load_deinterleave(cov + j * 3, v_a, v_b, v_c); - v_float32x4 v_ac_bb = v_a * v_c - v_b * v_b; - v_float32x4 v_ac = v_a + v_c; - v_float32x4 v_dst = v_ac_bb - v_k * v_ac * v_ac; + v_float32 v_ac_bb = v_sub(v_mul(v_a, v_c), v_mul(v_b, v_b)); + v_float32 v_ac = v_add(v_a, v_c); + v_float32 v_dst = v_sub(v_ac_bb, v_mul(v_mul(v_k, v_ac), v_ac)); v_store(dst + j, v_dst); } } @@ -282,22 +282,22 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size, #endif // CV_TRY_AVX j = 0; -#if CV_SIMD128 +#if (CV_SIMD || CV_SIMD_SCALABLE) { - for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes ) + for( ; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes() ) { - v_float32x4 v_dx = v_load(dxdata + j); - v_float32x4 v_dy = v_load(dydata + j); + v_float32 v_dx = vx_load(dxdata + j); + v_float32 v_dy = vx_load(dydata + j); - v_float32x4 v_dst0, v_dst1, v_dst2; - v_dst0 = v_dx * v_dx; - v_dst1 = v_dx * v_dy; - v_dst2 = v_dy * v_dy; + v_float32 v_dst0, v_dst1, v_dst2; + v_dst0 = v_mul(v_dx, v_dx); + v_dst1 = v_mul(v_dx, v_dy); + v_dst2 = v_mul(v_dy, v_dy); v_store_interleave(cov_data + j * 3, v_dst0, v_dst1, v_dst2); } } -#endif // CV_SIMD128 +#endif // CV_SIMD for( ; j < size.width; j++ ) { @@ -693,9 +693,9 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord if( src.depth() == CV_8U ) factor *= 255; factor = 1./(factor * factor * factor); -#if CV_SIMD128 +#if (CV_SIMD || CV_SIMD_SCALABLE) float factor_f = (float)factor; - v_float32x4 v_factor = v_setall_f32(factor_f), v_m2 = v_setall_f32(-2.0f); + v_float32 v_factor = vx_setall_f32(factor_f), v_m2 = vx_setall_f32(-2.0f); #endif Size size = src.size(); @@ -711,18 +711,18 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord j = 0; -#if CV_SIMD128 +#if (CV_SIMD || CV_SIMD_SCALABLE) { - for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes ) + for( ; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes() ) { - v_float32x4 v_dx = v_load(dxdata + j); - v_float32x4 v_dy = v_load(dydata + j); + v_float32 v_dx = vx_load(dxdata + j); + v_float32 v_dy = vx_load(dydata + j); - v_float32x4 v_s1 = (v_dx * v_dx) * v_load(d2ydata + j); - v_float32x4 v_s2 = v_muladd((v_dy * v_dy), v_load(d2xdata + j), v_s1); - v_float32x4 v_s3 = v_muladd((v_dy * v_dx) * v_load(dxydata + j), v_m2, v_s2); + v_float32 v_s1 = v_mul(v_mul(v_dx, v_dx), vx_load(d2ydata + j)); + v_float32 v_s2 = v_muladd((v_mul(v_dy, v_dy)), vx_load(d2xdata + j), v_s1); + v_float32 v_s3 = 
v_muladd(v_mul(v_mul(v_dy, v_dx), vx_load(dxydata + j)), v_m2, v_s2); - v_store(dstdata + j, v_s3 * v_factor); + v_store(dstdata + j, v_mul(v_s3, v_factor)); } } #endif diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index 068dfd3a2713..cbd60550e037 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -2053,13 +2053,13 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) } else if( method == CV_COMP_CORREL ) { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) v_float64 v_s1 = vx_setzero_f64(); v_float64 v_s2 = vx_setzero_f64(); v_float64 v_s11 = vx_setzero_f64(); v_float64 v_s12 = vx_setzero_f64(); v_float64 v_s22 = vx_setzero_f64(); - for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes) + for ( ; j <= len - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 v_a = vx_load(h1 + j); v_float32 v_b = vx_load(h2 + j); @@ -2070,8 +2070,8 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) v_s12 = v_muladd(v_ad, v_bd, v_s12); v_s11 = v_muladd(v_ad, v_ad, v_s11); v_s22 = v_muladd(v_bd, v_bd, v_s22); - v_s1 += v_ad; - v_s2 += v_bd; + v_s1 = v_add(v_s1, v_ad); + v_s2 = v_add(v_s2, v_bd); // 2-3 v_ad = v_cvt_f64_high(v_a); @@ -2079,8 +2079,8 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) v_s12 = v_muladd(v_ad, v_bd, v_s12); v_s11 = v_muladd(v_ad, v_ad, v_s11); v_s22 = v_muladd(v_bd, v_bd, v_s22); - v_s1 += v_ad; - v_s2 += v_bd; + v_s1 = v_add(v_s1, v_ad); + v_s2 = v_add(v_s2, v_bd); } s12 += v_reduce_sum(v_s12); s11 += v_reduce_sum(v_s11); @@ -2124,12 +2124,12 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) } else if( method == CV_COMP_INTERSECT ) { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) v_float64 v_result = vx_setzero_f64(); - for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes) + for ( ; j <= len - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 v_src = v_min(vx_load(h1 + j), vx_load(h2 + j)); - v_result += v_cvt_f64(v_src) + v_cvt_f64_high(v_src); + v_result = v_add(v_result, v_add(v_cvt_f64(v_src), v_cvt_f64_high(v_src))); } result += v_reduce_sum(v_result); #elif CV_SIMD @@ -2146,26 +2146,26 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) } else if( method == CV_COMP_BHATTACHARYYA ) { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) v_float64 v_s1 = vx_setzero_f64(); v_float64 v_s2 = vx_setzero_f64(); v_float64 v_result = vx_setzero_f64(); - for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes) + for ( ; j <= len - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 v_a = vx_load(h1 + j); v_float32 v_b = vx_load(h2 + j); v_float64 v_ad = v_cvt_f64(v_a); v_float64 v_bd = v_cvt_f64(v_b); - v_s1 += v_ad; - v_s2 += v_bd; - v_result += v_sqrt(v_ad * v_bd); + v_s1 = v_add(v_s1, v_ad); + v_s2 = v_add(v_s2, v_bd); + v_result = v_add(v_result, v_sqrt(v_mul(v_ad, v_bd))); v_ad = v_cvt_f64_high(v_a); v_bd = v_cvt_f64_high(v_b); - v_s1 += v_ad; - v_s2 += v_bd; - v_result += v_sqrt(v_ad * v_bd); + v_s1 = v_add(v_s1, v_ad); + v_s2 = v_add(v_s2, v_bd); + v_result = v_add(v_result, v_sqrt(v_mul(v_ad, v_bd))); } s1 += v_reduce_sum(v_s1); s2 += v_reduce_sum(v_s2); diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index c13354406968..dae09564d35f 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -84,7 +84,7 @@ template int PyrUpVecV(T1**, T2**, int) { return 0; } template int 
PyrUpVecVOneRow(T1**, T2*, int) { return 0; } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) template<> int PyrDownVecH(const uchar* src, int* row, int width) { @@ -93,10 +93,8 @@ template<> int PyrDownVecH(const uchar* src, int* row, int width) v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); - for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) - v_store(row, v_dotprod(v_reinterpret_as_s16(vx_load_expand(src01)), v_1_4) + - v_dotprod(v_reinterpret_as_s16(vx_load_expand(src23)), v_6_4) + - (v_reinterpret_as_s32(vx_load_expand(src4)) >> 16)); + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += VTraits::vlanes(), src23 += VTraits::vlanes(), src4 += VTraits::vlanes(), row += VTraits::vlanes()) + v_store(row, v_add(v_add(v_dotprod(v_reinterpret_as_s16(vx_load_expand(src01)), v_1_4), v_dotprod(v_reinterpret_as_s16(vx_load_expand(src23)), v_6_4)), v_shr<16>(v_reinterpret_as_s32(vx_load_expand(src4))))); vx_cleanup(); return x; @@ -108,42 +106,40 @@ template<> int PyrDownVecH(const uchar* src, int* row, int width) v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); - for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) - v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src01))), v_1_4) + - v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src23))), v_6_4) + - (v_reinterpret_as_s32(v_interleave_pairs(vx_load_expand(src4))) >> 16)); + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += VTraits::vlanes(), src23 += VTraits::vlanes(), src4 += VTraits::vlanes(), row += VTraits::vlanes()) + v_store(row, v_add(v_add(v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src01))), v_1_4), v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src23))), v_6_4)), v_shr<16>(v_reinterpret_as_s32(v_interleave_pairs(vx_load_expand(src4)))))); vx_cleanup(); return x; } template<> int PyrDownVecH(const uchar* src, int* row, int width) { - int idx[v_int8::nlanes/2 + 4]; - for (int i = 0; i < v_int8::nlanes/4 + 2; i++) + int idx[VTraits::max_nlanes/2 + 4]; + for (int i = 0; i < VTraits::vlanes()/4 + 2; i++) { idx[i] = 6*i; - idx[i + v_int8::nlanes/4 + 2] = 6*i + 3; + idx[i + VTraits::vlanes()/4 + 2] = 6*i + 3; } int x = 0; v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); - for (; x <= width - v_int8::nlanes; x += 3*v_int8::nlanes/4, src += 6*v_int8::nlanes/4, row += 3*v_int8::nlanes/4) + for (; x <= width - VTraits::vlanes(); x += 3*VTraits::vlanes()/4, src += 6*VTraits::vlanes()/4, row += 3*VTraits::vlanes()/4) { v_uint16 r0l, r0h, r1l, r1h, r2l, r2h, r3l, r3h, r4l, r4h; v_expand(vx_lut_quads(src, idx ), r0l, r0h); - v_expand(vx_lut_quads(src, idx + v_int8::nlanes/4 + 2), r1l, r1h); + v_expand(vx_lut_quads(src, idx + VTraits::vlanes()/4 + 2), r1l, r1h); v_expand(vx_lut_quads(src, idx + 1 ), r2l, r2h); - v_expand(vx_lut_quads(src, idx + v_int8::nlanes/4 + 3), r3l, r3h); + v_expand(vx_lut_quads(src, idx + VTraits::vlanes()/4 + 3), r3l, r3h); v_expand(vx_lut_quads(src, idx + 2 ), r4l, r4h); - v_zip(r2l, r1l + r3l, r1l, r3l); - v_zip(r2h, r1h + r3h, r1h, r3h); - r0l += r4l; r0h += r4h; + v_zip(r2l, v_add(r1l, r3l), r1l, r3l); + 
v_zip(r2h, v_add(r1h, r3h), r1h, r3h); + r0l = v_add(r0l, r4l); r0h = v_add(r0h, r4h); - v_store(row , v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r1l), v_6_4) + v_reinterpret_as_s32(v_expand_low( r0l)))); - v_store(row + 3*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r3l), v_6_4) + v_reinterpret_as_s32(v_expand_high(r0l)))); - v_store(row + 6*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r1h), v_6_4) + v_reinterpret_as_s32(v_expand_low( r0h)))); - v_store(row + 9*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r3h), v_6_4) + v_reinterpret_as_s32(v_expand_high(r0h)))); + v_store(row , v_pack_triplets(v_add(v_dotprod(v_reinterpret_as_s16(r1l), v_6_4), v_reinterpret_as_s32(v_expand_low(r0l))))); + v_store(row + 3*VTraits::vlanes()/4, v_pack_triplets(v_add(v_dotprod(v_reinterpret_as_s16(r3l), v_6_4), v_reinterpret_as_s32(v_expand_high(r0l))))); + v_store(row + 6*VTraits::vlanes()/4, v_pack_triplets(v_add(v_dotprod(v_reinterpret_as_s16(r1h), v_6_4), v_reinterpret_as_s32(v_expand_low(r0h))))); + v_store(row + 9*VTraits::vlanes()/4, v_pack_triplets(v_add(v_dotprod(v_reinterpret_as_s16(r3h), v_6_4), v_reinterpret_as_s32(v_expand_high(r0h))))); } vx_cleanup(); @@ -156,10 +152,8 @@ template<> int PyrDownVecH(const uchar* src, int* row, int width) v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); - for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) - v_store(row, v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src01))), v_1_4) + - v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src23))), v_6_4) + - (v_reinterpret_as_s32(v_interleave_quads(vx_load_expand(src4))) >> 16)); + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += VTraits::vlanes(), src23 += VTraits::vlanes(), src4 += VTraits::vlanes(), row += VTraits::vlanes()) + v_store(row, v_add(v_add(v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src01))), v_1_4), v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src23))), v_6_4)), v_shr<16>(v_reinterpret_as_s32(v_interleave_quads(vx_load_expand(src4)))))); vx_cleanup(); return x; @@ -172,10 +166,8 @@ template<> int PyrDownVecH(const short* src, int* row, int width) v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); - for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) - v_store(row, v_dotprod(vx_load(src01), v_1_4) + - v_dotprod(vx_load(src23), v_6_4) + - (v_reinterpret_as_s32(vx_load(src4)) >> 16)); + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += VTraits::vlanes(), src23 += VTraits::vlanes(), src4 += VTraits::vlanes(), row += VTraits::vlanes()) + v_store(row, v_add(v_add(v_dotprod(vx_load(src01), v_1_4), v_dotprod(vx_load(src23), v_6_4)), v_shr<16>(v_reinterpret_as_s32(vx_load(src4))))); vx_cleanup(); return x; @@ -187,34 +179,32 @@ template<> int PyrDownVecH(const short* src, int* row, int width) v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); - for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, 
row += v_int32::nlanes) - v_store(row, v_dotprod(v_interleave_pairs(vx_load(src01)), v_1_4) + - v_dotprod(v_interleave_pairs(vx_load(src23)), v_6_4) + - (v_reinterpret_as_s32(v_interleave_pairs(vx_load(src4))) >> 16)); + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += VTraits::vlanes(), src23 += VTraits::vlanes(), src4 += VTraits::vlanes(), row += VTraits::vlanes()) + v_store(row, v_add(v_add(v_dotprod(v_interleave_pairs(vx_load(src01)), v_1_4), v_dotprod(v_interleave_pairs(vx_load(src23)), v_6_4)), v_shr<16>(v_reinterpret_as_s32(v_interleave_pairs(vx_load(src4)))))); vx_cleanup(); return x; } template<> int PyrDownVecH(const short* src, int* row, int width) { - int idx[v_int16::nlanes/2 + 4]; - for (int i = 0; i < v_int16::nlanes/4 + 2; i++) + int idx[VTraits::max_nlanes/2 + 4]; + for (int i = 0; i < VTraits::vlanes()/4 + 2; i++) { idx[i] = 6*i; - idx[i + v_int16::nlanes/4 + 2] = 6*i + 3; + idx[i + VTraits::vlanes()/4 + 2] = 6*i + 3; } int x = 0; v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); - for (; x <= width - v_int16::nlanes; x += 3*v_int16::nlanes/4, src += 6*v_int16::nlanes/4, row += 3*v_int16::nlanes/4) + for (; x <= width - VTraits::vlanes(); x += 3*VTraits::vlanes()/4, src += 6*VTraits::vlanes()/4, row += 3*VTraits::vlanes()/4) { v_int16 r0, r1, r2, r3, r4; - v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1); - v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3); + v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + VTraits::vlanes()/4 + 2), r0, r1); + v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + VTraits::vlanes()/4 + 3), r2, r3); r4 = vx_lut_quads(src, idx + 2); - v_store(row, v_pack_triplets(v_dotprod(r0, v_1_4) + v_dotprod(r2, v_6_4) + v_expand_low(r4))); - v_store(row + 3*v_int32::nlanes/4, v_pack_triplets(v_dotprod(r1, v_1_4) + v_dotprod(r3, v_6_4) + v_expand_high(r4))); + v_store(row, v_pack_triplets(v_add(v_add(v_dotprod(r0, v_1_4), v_dotprod(r2, v_6_4)), v_expand_low(r4)))); + v_store(row + 3*VTraits::vlanes()/4, v_pack_triplets(v_add(v_add(v_dotprod(r1, v_1_4), v_dotprod(r3, v_6_4)), v_expand_high(r4)))); } vx_cleanup(); @@ -222,24 +212,24 @@ template<> int PyrDownVecH(const short* src, int* row, int width) } template<> int PyrDownVecH(const short* src, int* row, int width) { - int idx[v_int16::nlanes/2 + 4]; - for (int i = 0; i < v_int16::nlanes/4 + 2; i++) + int idx[VTraits::max_nlanes/2 + 4]; + for (int i = 0; i < VTraits::vlanes()/4 + 2; i++) { idx[i] = 8*i; - idx[i + v_int16::nlanes/4 + 2] = 8*i + 4; + idx[i + VTraits::vlanes()/4 + 2] = 8*i + 4; } int x = 0; v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); - for (; x <= width - v_int16::nlanes; x += v_int16::nlanes, src += 2*v_int16::nlanes, row += v_int16::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src += 2*VTraits::vlanes(), row += VTraits::vlanes()) { v_int16 r0, r1, r2, r3, r4; - v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1); - v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3); + v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + VTraits::vlanes()/4 + 2), r0, r1); + v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + VTraits::vlanes()/4 + 3), r2, r3); r4 = vx_lut_quads(src, idx + 2); - v_store(row, v_dotprod(r0, v_1_4) + 
v_dotprod(r2, v_6_4) + v_expand_low(r4)); - v_store(row + v_int32::nlanes, v_dotprod(r1, v_1_4) + v_dotprod(r3, v_6_4) + v_expand_high(r4)); + v_store(row, v_add(v_add(v_dotprod(r0, v_1_4), v_dotprod(r2, v_6_4)), v_expand_low(r4))); + v_store(row + VTraits::vlanes(), v_add(v_add(v_dotprod(r1, v_1_4), v_dotprod(r3, v_6_4)), v_expand_high(r4))); } vx_cleanup(); @@ -255,10 +245,8 @@ template<> int PyrDownVecH(const ushort* src, int* row, int widt v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); v_uint16 v_half = vx_setall_u16(0x8000); v_int32 v_half15 = vx_setall_s32(0x00078000); - for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) - v_store(row, v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src01), v_half)), v_1_4) + - v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src23), v_half)), v_6_4) + - v_reinterpret_as_s32(v_reinterpret_as_u32(vx_load(src4)) >> 16) + v_half15); + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += VTraits::vlanes(), src23 += VTraits::vlanes(), src4 += VTraits::vlanes(), row += VTraits::vlanes()) + v_store(row, v_add(v_add(v_add(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src01), v_half)), v_1_4), v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src23), v_half)), v_6_4)), v_reinterpret_as_s32(v_shr<16>(v_reinterpret_as_u32(vx_load(src4))))), v_half15)); vx_cleanup(); return x; @@ -272,21 +260,19 @@ template<> int PyrDownVecH(const ushort* src, int* row, int widt v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); v_uint16 v_half = vx_setall_u16(0x8000); v_int32 v_half15 = vx_setall_s32(0x00078000); - for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) - v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src01), v_half))), v_1_4) + - v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src23), v_half))), v_6_4) + - v_reinterpret_as_s32(v_reinterpret_as_u32(v_interleave_pairs(vx_load(src4))) >> 16) + v_half15); + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += VTraits::vlanes(), src23 += VTraits::vlanes(), src4 += VTraits::vlanes(), row += VTraits::vlanes()) + v_store(row, v_add(v_add(v_add(v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src01), v_half))), v_1_4), v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src23), v_half))), v_6_4)), v_reinterpret_as_s32(v_shr<16>(v_reinterpret_as_u32(v_interleave_pairs(vx_load(src4)))))), v_half15)); vx_cleanup(); return x; } template<> int PyrDownVecH(const ushort* src, int* row, int width) { - int idx[v_int16::nlanes/2 + 4]; - for (int i = 0; i < v_int16::nlanes/4 + 2; i++) + int idx[VTraits::max_nlanes/2 + 4]; + for (int i = 0; i < VTraits::vlanes()/4 + 2; i++) { idx[i] = 6*i; - idx[i + v_int16::nlanes/4 + 2] = 6*i + 3; + idx[i + VTraits::vlanes()/4 + 2] = 6*i + 3; } int x = 0; @@ -294,18 +280,14 @@ template<> int PyrDownVecH(const ushort* src, int* row, int widt v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); v_uint16 v_half = vx_setall_u16(0x8000); v_int32 v_half15 = vx_setall_s32(0x00078000); - for (; x <= width - v_int16::nlanes; x += 3*v_int16::nlanes/4, src += 6*v_int16::nlanes/4, row += 3*v_int16::nlanes/4) + for (; x <= width - VTraits::vlanes(); x += 3*VTraits::vlanes()/4, src += 6*VTraits::vlanes()/4, row += 
3*VTraits::vlanes()/4) { v_uint16 r0, r1, r2, r3, r4; - v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1); - v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3); + v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + VTraits::vlanes()/4 + 2), r0, r1); + v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + VTraits::vlanes()/4 + 3), r2, r3); r4 = vx_lut_quads(src, idx + 2); - v_store(row , v_pack_triplets(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r0, v_half)), v_1_4) + - v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r2, v_half)), v_6_4) + - v_reinterpret_as_s32(v_expand_low(r4)) + v_half15)); - v_store(row + 3*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r1, v_half)), v_1_4) + - v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r3, v_half)), v_6_4) + - v_reinterpret_as_s32(v_expand_high(r4)) + v_half15)); + v_store(row , v_pack_triplets(v_add(v_add(v_add(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r0, v_half)), v_1_4), v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r2, v_half)), v_6_4)), v_reinterpret_as_s32(v_expand_low(r4))), v_half15))); + v_store(row + 3*VTraits::vlanes()/4, v_pack_triplets(v_add(v_add(v_add(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r1, v_half)), v_1_4), v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r3, v_half)), v_6_4)), v_reinterpret_as_s32(v_expand_high(r4))), v_half15))); } vx_cleanup(); @@ -313,11 +295,11 @@ template<> int PyrDownVecH(const ushort* src, int* row, int widt } template<> int PyrDownVecH(const ushort* src, int* row, int width) { - int idx[v_int16::nlanes/2 + 4]; - for (int i = 0; i < v_int16::nlanes/4 + 2; i++) + int idx[VTraits::max_nlanes/2 + 4]; + for (int i = 0; i < VTraits::vlanes()/4 + 2; i++) { idx[i] = 8*i; - idx[i + v_int16::nlanes/4 + 2] = 8*i + 4; + idx[i + VTraits::vlanes()/4 + 2] = 8*i + 4; } int x = 0; @@ -325,18 +307,14 @@ template<> int PyrDownVecH(const ushort* src, int* row, int widt v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); v_uint16 v_half = vx_setall_u16(0x8000); v_int32 v_half15 = vx_setall_s32(0x00078000); - for (; x <= width - v_int16::nlanes; x += v_int16::nlanes, src += 2*v_int16::nlanes, row += v_int16::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src += 2*VTraits::vlanes(), row += VTraits::vlanes()) { v_uint16 r0, r1, r2, r3, r4; - v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1); - v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3); + v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + VTraits::vlanes()/4 + 2), r0, r1); + v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + VTraits::vlanes()/4 + 3), r2, r3); r4 = vx_lut_quads(src, idx + 2); - v_store(row , v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r0, v_half)), v_1_4) + - v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r2, v_half)), v_6_4) + - v_reinterpret_as_s32(v_expand_low(r4)) + v_half15); - v_store(row + v_int32::nlanes, v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r1, v_half)), v_1_4) + - v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r3, v_half)), v_6_4) + - v_reinterpret_as_s32(v_expand_high(r4)) + v_half15); + v_store(row , v_add(v_add(v_add(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r0, v_half)), v_1_4), v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r2, v_half)), v_6_4)), v_reinterpret_as_s32(v_expand_low(r4))), v_half15)); + v_store(row + VTraits::vlanes(), v_add(v_add(v_add(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r1, v_half)), v_1_4), 
v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r3, v_half)), v_6_4)), v_reinterpret_as_s32(v_expand_high(r4))), v_half15)); } vx_cleanup(); @@ -349,13 +327,13 @@ template<> int PyrDownVecH(const float* src, float* row, int wi const float *src01 = src, *src23 = src + 2, *src4 = src + 3; v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f); - for (; x <= width - v_float32::nlanes; x += v_float32::nlanes, src01 += 2*v_float32::nlanes, src23 += 2*v_float32::nlanes, src4 += 2*v_float32::nlanes, row+=v_float32::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += 2*VTraits::vlanes(), src23 += 2*VTraits::vlanes(), src4 += 2*VTraits::vlanes(), row+=VTraits::vlanes()) { v_float32 r0, r1, r2, r3, r4, rtmp; v_load_deinterleave(src01, r0, r1); v_load_deinterleave(src23, r2, r3); v_load_deinterleave(src4, rtmp, r4); - v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4))); + v_store(row, v_muladd(r2, _6, v_muladd(v_add(r1, r3), _4, v_add(r0, r4)))); } vx_cleanup(); @@ -367,13 +345,13 @@ template<> int PyrDownVecH(const float* src, float* row, int wi const float *src01 = src, *src23 = src + 4, *src4 = src + 6; v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f); - for (; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, src01 += 4*v_float32::nlanes, src23 += 4*v_float32::nlanes, src4 += 4*v_float32::nlanes, row += 2*v_float32::nlanes) + for (; x <= width - 2*VTraits::vlanes(); x += 2*VTraits::vlanes(), src01 += 4*VTraits::vlanes(), src23 += 4*VTraits::vlanes(), src4 += 4*VTraits::vlanes(), row += 2*VTraits::vlanes()) { v_float32 r0a, r0b, r1a, r1b, r2a, r2b, r3a, r3b, r4a, r4b, rtmpa, rtmpb; v_load_deinterleave(src01, r0a, r0b, r1a, r1b); v_load_deinterleave(src23, r2a, r2b, r3a, r3b); v_load_deinterleave(src4, rtmpa, rtmpb, r4a, r4b); - v_store_interleave(row, v_muladd(r2a, _6, v_muladd(r1a + r3a, _4, r0a + r4a)), v_muladd(r2b, _6, v_muladd(r1b + r3b, _4, r0b + r4b))); + v_store_interleave(row, v_muladd(r2a, _6, v_muladd(v_add(r1a, r3a), _4, v_add(r0a, r4a))), v_muladd(r2b, _6, v_muladd(v_add(r1b, r3b), _4, v_add(r0b, r4b)))); } vx_cleanup(); @@ -381,23 +359,23 @@ template<> int PyrDownVecH(const float* src, float* row, int wi } template<> int PyrDownVecH(const float* src, float* row, int width) { - int idx[v_float32::nlanes/2 + 4]; - for (int i = 0; i < v_float32::nlanes/4 + 2; i++) + int idx[VTraits::max_nlanes/2 + 4]; + for (int i = 0; i < VTraits::vlanes()/4 + 2; i++) { idx[i] = 6*i; - idx[i + v_float32::nlanes/4 + 2] = 6*i + 3; + idx[i + VTraits::vlanes()/4 + 2] = 6*i + 3; } int x = 0; v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f); - for (; x <= width - v_float32::nlanes; x += 3*v_float32::nlanes/4, src += 6*v_float32::nlanes/4, row += 3*v_float32::nlanes/4) + for (; x <= width - VTraits::vlanes(); x += 3*VTraits::vlanes()/4, src += 6*VTraits::vlanes()/4, row += 3*VTraits::vlanes()/4) { v_float32 r0 = vx_lut_quads(src, idx); - v_float32 r1 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 2); + v_float32 r1 = vx_lut_quads(src, idx + VTraits::vlanes()/4 + 2); v_float32 r2 = vx_lut_quads(src, idx + 1); - v_float32 r3 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 3); + v_float32 r3 = vx_lut_quads(src, idx + VTraits::vlanes()/4 + 3); v_float32 r4 = vx_lut_quads(src, idx + 2); - v_store(row, v_pack_triplets(v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4)))); + v_store(row, v_pack_triplets(v_muladd(r2, _6, v_muladd(v_add(r1, r3), _4, v_add(r0, r4))))); } vx_cleanup(); @@ -405,43 +383,43 @@ template<> int PyrDownVecH(const float* src, 
float* row, int wi } template<> int PyrDownVecH(const float* src, float* row, int width) { - int idx[v_float32::nlanes/2 + 4]; - for (int i = 0; i < v_float32::nlanes/4 + 2; i++) + int idx[VTraits::max_nlanes/2 + 4]; + for (int i = 0; i < VTraits::vlanes()/4 + 2; i++) { idx[i] = 8*i; - idx[i + v_float32::nlanes/4 + 2] = 8*i + 4; + idx[i + VTraits::vlanes()/4 + 2] = 8*i + 4; } int x = 0; v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f); - for (; x <= width - v_float32::nlanes; x += v_float32::nlanes, src += 2*v_float32::nlanes, row += v_float32::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src += 2*VTraits::vlanes(), row += VTraits::vlanes()) { v_float32 r0 = vx_lut_quads(src, idx); - v_float32 r1 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 2); + v_float32 r1 = vx_lut_quads(src, idx + VTraits::vlanes()/4 + 2); v_float32 r2 = vx_lut_quads(src, idx + 1); - v_float32 r3 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 3); + v_float32 r3 = vx_lut_quads(src, idx + VTraits::vlanes()/4 + 3); v_float32 r4 = vx_lut_quads(src, idx + 2); - v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4))); + v_store(row, v_muladd(r2, _6, v_muladd(v_add(r1, r3), _4, v_add(r0, r4)))); } vx_cleanup(); return x; } -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) template<> int PyrDownVecH(const double* src, double* row, int width) { int x = 0; const double *src01 = src, *src23 = src + 2, *src4 = src + 3; v_float64 _4 = vx_setall_f64(4.f), _6 = vx_setall_f64(6.f); - for (; x <= width - v_float64::nlanes; x += v_float64::nlanes, src01 += 2*v_float64::nlanes, src23 += 2*v_float64::nlanes, src4 += 2*v_float64::nlanes, row += v_float64::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += 2*VTraits::vlanes(), src23 += 2*VTraits::vlanes(), src4 += 2*VTraits::vlanes(), row += VTraits::vlanes()) { v_float64 r0, r1, r2, r3, r4, rtmp; v_load_deinterleave(src01, r0, r1); v_load_deinterleave(src23, r2, r3); v_load_deinterleave(src4, rtmp, r4); - v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4))); + v_store(row, v_muladd(r2, _6, v_muladd(v_add(r1, r3), _4, v_add(r0, r4)))); } vx_cleanup(); @@ -454,35 +432,36 @@ template<> int PyrDownVecV(int** src, uchar* dst, int width) int x = 0; const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes ) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes() ) { v_uint16 r0, r1, r2, r3, r4, t0, t1; - r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes))); - r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes))); - r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes))); - r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes))); - r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes))); - t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); - r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x + 2*v_int32::nlanes), vx_load(row0 + x + 3*v_int32::nlanes))); - r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x + 2*v_int32::nlanes), vx_load(row1 + x + 3*v_int32::nlanes))); - r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x + 2*v_int32::nlanes), vx_load(row2 + x + 3*v_int32::nlanes))); - r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x + 2*v_int32::nlanes), vx_load(row3 + x + 3*v_int32::nlanes))); - r4 = 
v_reinterpret_as_u16(v_pack(vx_load(row4 + x + 2*v_int32::nlanes), vx_load(row4 + x + 3*v_int32::nlanes))); - t1 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits::vlanes()))); + r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits::vlanes()))); + r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + VTraits::vlanes()))); + r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + VTraits::vlanes()))); + r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + VTraits::vlanes()))); + t0 = v_add(v_add(v_add(r0, r4), v_add(r2, r2)), v_shl<2>(v_add(v_add(r1, r3), r2))); + r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x + 2*VTraits::vlanes()), vx_load(row0 + x + 3*VTraits::vlanes()))); + r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x + 2*VTraits::vlanes()), vx_load(row1 + x + 3*VTraits::vlanes()))); + r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x + 2*VTraits::vlanes()), vx_load(row2 + x + 3*VTraits::vlanes()))); + r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x + 2*VTraits::vlanes()), vx_load(row3 + x + 3*VTraits::vlanes()))); + r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x + 2*VTraits::vlanes()), vx_load(row4 + x + 3*VTraits::vlanes()))); + t1 = v_add(v_add(v_add(r0, r4), v_add(r2, r2)), v_shl<2>(v_add(v_add(r1, r3), r2))); v_store(dst + x, v_rshr_pack<8>(t0, t1)); } - if (x <= width - v_int16::nlanes) + if (x <= width - VTraits::vlanes()) { v_uint16 r0, r1, r2, r3, r4, t0; - r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes))); - r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes))); - r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes))); - r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes))); - r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes))); - t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits::vlanes()))); + r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits::vlanes()))); + r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + VTraits::vlanes()))); + r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + VTraits::vlanes()))); + r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + VTraits::vlanes()))); + t0 = v_add(v_add(v_add(r0, r4), v_add(r2, r2)), v_shl<2>(v_add(v_add(r1, r3), r2))); v_rshr_pack_store<8>(dst + x, t0); - x += v_uint16::nlanes; + x += VTraits::vlanes(); } + #if CV_SIMD128 typedef int CV_DECL_ALIGNED(1) unaligned_int; for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes) { @@ -492,10 +471,23 @@ template<> int PyrDownVecV(int** src, uchar* dst, int width) r2 = v_load(row2 + x); r3 = v_load(row3 + x); r4 = v_load(row4 + x); - t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + t0 = v_add(v_add(v_add(r0, r4), v_add(r2, r2)), v_shl<2>(v_add(v_add(r1, r3), r2))); *((unaligned_int*) (dst + x)) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0(); } + #else + for (; x <= width - 1; x += 1) + { + int r0 = *(row0 + x); + int r1 = *(row1 + x); + int r2 = *(row2 + x); + int r3 = *(row3 + x); + int r4 = *(row4 + x); + int t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + // Similar to v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16()).get0() + 
*(dst + x) = (int)((((unsigned int)t0) + ((1 << (8 - 1)))) >> 8); + } + #endif //CV_SIMD128 vx_cleanup(); return x; @@ -508,7 +500,7 @@ int PyrDownVecV(float** src, float* dst, int width) const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; v_float32 _4 = vx_setall_f32(4.f), _scale = vx_setall_f32(1.f/256); - for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_float32 r0, r1, r2, r3, r4; r0 = vx_load(row0 + x); @@ -516,7 +508,7 @@ int PyrDownVecV(float** src, float* dst, int width) r2 = vx_load(row2 + x); r3 = vx_load(row3 + x); r4 = vx_load(row4 + x); - v_store(dst + x, v_muladd(r1 + r3 + r2, _4, r0 + r4 + (r2 + r2)) * _scale); + v_store(dst + x, v_mul(v_muladd(v_add(v_add(r1, r3), r2), _4, v_add(v_add(r0, r4), v_add(r2, r2))), _scale)); } vx_cleanup(); @@ -528,30 +520,30 @@ template <> int PyrDownVecV(int** src, ushort* dst, int width) int x = 0; const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_int32 r00 = vx_load(row0 + x), - r01 = vx_load(row0 + x + v_int32::nlanes), + r01 = vx_load(row0 + x + VTraits::vlanes()), r10 = vx_load(row1 + x), - r11 = vx_load(row1 + x + v_int32::nlanes), + r11 = vx_load(row1 + x + VTraits::vlanes()), r20 = vx_load(row2 + x), - r21 = vx_load(row2 + x + v_int32::nlanes), + r21 = vx_load(row2 + x + VTraits::vlanes()), r30 = vx_load(row3 + x), - r31 = vx_load(row3 + x + v_int32::nlanes), + r31 = vx_load(row3 + x + VTraits::vlanes()), r40 = vx_load(row4 + x), - r41 = vx_load(row4 + x + v_int32::nlanes); - v_store(dst + x, v_rshr_pack_u<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2), - r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2))); + r41 = vx_load(row4 + x + VTraits::vlanes()); + v_store(dst + x, v_rshr_pack_u<8>(v_add(v_add(v_add(r00, r40), v_add(r20, r20)), v_shl<2>(v_add(v_add(r10, r20), r30))), + v_add(v_add(v_add(r01, r41), v_add(r21, r21)), v_shl<2>(v_add(v_add(r11, r21), r31))))); } - if (x <= width - v_int32::nlanes) + if (x <= width - VTraits::vlanes()) { v_int32 r00 = vx_load(row0 + x), r10 = vx_load(row1 + x), r20 = vx_load(row2 + x), r30 = vx_load(row3 + x), r40 = vx_load(row4 + x); - v_rshr_pack_u_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2)); - x += v_int32::nlanes; + v_rshr_pack_u_store<8>(dst + x, v_add(v_add(v_add(r00, r40), v_add(r20, r20)), v_shl<2>(v_add(v_add(r10, r20), r30)))); + x += VTraits::vlanes(); } vx_cleanup(); @@ -563,30 +555,30 @@ template <> int PyrDownVecV(int** src, short* dst, int width) int x = 0; const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_int32 r00 = vx_load(row0 + x), - r01 = vx_load(row0 + x + v_int32::nlanes), + r01 = vx_load(row0 + x + VTraits::vlanes()), r10 = vx_load(row1 + x), - r11 = vx_load(row1 + x + v_int32::nlanes), + r11 = vx_load(row1 + x + VTraits::vlanes()), r20 = vx_load(row2 + x), - r21 = vx_load(row2 + x + v_int32::nlanes), + r21 = vx_load(row2 + x + VTraits::vlanes()), r30 = vx_load(row3 + x), - r31 = vx_load(row3 + x + v_int32::nlanes), + r31 = vx_load(row3 + x + VTraits::vlanes()), r40 = vx_load(row4 + x), - r41 = vx_load(row4 + x + v_int32::nlanes); - v_store(dst + x, v_rshr_pack<8>(r00 + r40 + (r20 + r20) 
+ ((r10 + r20 + r30) << 2), - r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2))); + r41 = vx_load(row4 + x + VTraits::vlanes()); + v_store(dst + x, v_rshr_pack<8>(v_add(v_add(v_add(r00, r40), v_add(r20, r20)), v_shl<2>(v_add(v_add(r10, r20), r30))), + v_add(v_add(v_add(r01, r41), v_add(r21, r21)), v_shl<2>(v_add(v_add(r11, r21), r31))))); } - if (x <= width - v_int32::nlanes) + if (x <= width - VTraits::vlanes()) { v_int32 r00 = vx_load(row0 + x), r10 = vx_load(row1 + x), r20 = vx_load(row2 + x), r30 = vx_load(row3 + x), r40 = vx_load(row4 + x); - v_rshr_pack_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2)); - x += v_int32::nlanes; + v_rshr_pack_store<8>(dst + x, v_add(v_add(v_add(r00, r40), v_add(r20, r20)), v_shl<2>(v_add(v_add(r10, r20), r30)))); + x += VTraits::vlanes(); } vx_cleanup(); @@ -599,39 +591,55 @@ template <> int PyrUpVecV(int** src, uchar** dst, int width) uchar *dst0 = dst[0], *dst1 = dst[1]; const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { - v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)), - v_r01 = v_pack(vx_load(row0 + x + 2 * v_int32::nlanes), vx_load(row0 + x + 3 * v_int32::nlanes)), - v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)), - v_r11 = v_pack(vx_load(row1 + x + 2 * v_int32::nlanes), vx_load(row1 + x + 3 * v_int32::nlanes)), - v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)), - v_r21 = v_pack(vx_load(row2 + x + 2 * v_int32::nlanes), vx_load(row2 + x + 3 * v_int32::nlanes)); - v_int16 v_2r10 = v_r10 + v_r10, v_2r11 = (v_r11 + v_r11); - v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), v_r01 + v_r21 + (v_2r11 + v_2r11 + v_2r11))); - v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); + v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits::vlanes())), + v_r01 = v_pack(vx_load(row0 + x + 2 * VTraits::vlanes()), vx_load(row0 + x + 3 * VTraits::vlanes())), + v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits::vlanes())), + v_r11 = v_pack(vx_load(row1 + x + 2 * VTraits::vlanes()), vx_load(row1 + x + 3 * VTraits::vlanes())), + v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + VTraits::vlanes())), + v_r21 = v_pack(vx_load(row2 + x + 2 * VTraits::vlanes()), vx_load(row2 + x + 3 * VTraits::vlanes())); + v_int16 v_2r10 = v_add(v_r10, v_r10), v_2r11 = (v_add(v_r11, v_r11)); + v_store(dst0 + x, v_rshr_pack_u<6>(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_add(v_add(v_r01, v_r21), v_add(v_add(v_2r11, v_2r11), v_2r11)))); + v_store(dst1 + x, v_rshr_pack_u<6>(v_shl<2>(v_add(v_r10, v_r20)), v_shl<2>(v_add(v_r11, v_r21)))); } - if(x <= width - v_uint16::nlanes) + if(x <= width - VTraits::vlanes()) { - v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)), - v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)), - v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)); - v_int16 v_2r10 = v_r10 + v_r10; - v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10)); - v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2); - x += v_uint16::nlanes; + v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits::vlanes())), + v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits::vlanes())), + v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + 
VTraits::vlanes())); + v_int16 v_2r10 = v_add(v_r10, v_r10); + v_rshr_pack_u_store<6>(dst0 + x, v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10))); + v_rshr_pack_u_store<6>(dst1 + x, v_shl<2>(v_add(v_r10, v_r20))); + x += VTraits::vlanes(); } + #if CV_SIMD128 typedef int CV_DECL_ALIGNED(1) unaligned_int; for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes) { v_int32 v_r00 = vx_load(row0 + x), v_r10 = vx_load(row1 + x), v_r20 = vx_load(row2 + x); - v_int32 v_2r10 = v_r10 + v_r10; - v_int16 d = v_pack(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), (v_r10 + v_r20) << 2); + v_int32 v_2r10 = v_add(v_r10, v_r10); + v_int16 d = v_pack(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_shl<2>(v_add(v_r10, v_r20))); *(unaligned_int*)(dst0 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0(); *(unaligned_int*)(dst1 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())).get0(); } + #else + for (; x <= width - 1; x += 1) + { + int r00 = *(row0 + x), + r10 = *(row1 + x), + r20 = *(row2 + x); + int _2r10 = r10 + r10; + int d = r00 + r20 + (_2r10 + _2r10 + _2r10); + int d_shifted = (r10 + r20) << 2; + // Similar to v_rshr_pack_u<6>(d, vx_setzero_s16()).get0() + *(dst0 + x) = (int)((((unsigned int)d) + ((1 << (6 - 1)))) >> 6); + // Similar to v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16()).get0() + *(dst1 + x) = (int)((((unsigned int)d_shifted) + ((1 << (6 - 1)))) >> 6); + } + #endif //CV_SIMD128 vx_cleanup(); return x; @@ -643,25 +651,25 @@ template <> int PyrUpVecV(int** src, short** dst, int width) short *dst0 = dst[0], *dst1 = dst[1]; const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_int32 v_r00 = vx_load(row0 + x), - v_r01 = vx_load(row0 + x + v_int32::nlanes), + v_r01 = vx_load(row0 + x + VTraits::vlanes()), v_r10 = vx_load(row1 + x), - v_r11 = vx_load(row1 + x + v_int32::nlanes), + v_r11 = vx_load(row1 + x + VTraits::vlanes()), v_r20 = vx_load(row2 + x), - v_r21 = vx_load(row2 + x + v_int32::nlanes); - v_store(dst0 + x, v_rshr_pack<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2)))); - v_store(dst1 + x, v_rshr_pack<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); + v_r21 = vx_load(row2 + x + VTraits::vlanes()); + v_store(dst0 + x, v_rshr_pack<6>(v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10))), v_add(v_add(v_r01, v_r21), v_add(v_shl<1>(v_r11), v_shl<2>(v_r11))))); + v_store(dst1 + x, v_rshr_pack<6>(v_shl<2>(v_add(v_r10, v_r20)), v_shl<2>(v_add(v_r11, v_r21)))); } - if(x <= width - v_int32::nlanes) + if(x <= width - VTraits::vlanes()) { v_int32 v_r00 = vx_load(row0 + x), v_r10 = vx_load(row1 + x), v_r20 = vx_load(row2 + x); - v_rshr_pack_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2))); - v_rshr_pack_store<6>(dst1 + x, (v_r10 + v_r20) << 2); - x += v_int32::nlanes; + v_rshr_pack_store<6>(dst0 + x, v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10)))); + v_rshr_pack_store<6>(dst1 + x, v_shl<2>(v_add(v_r10, v_r20))); + x += VTraits::vlanes(); } vx_cleanup(); @@ -674,25 +682,25 @@ template <> int PyrUpVecV(int** src, ushort** dst, int width) ushort *dst0 = dst[0], *dst1 = dst[1]; const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_int32 
v_r00 = vx_load(row0 + x), - v_r01 = vx_load(row0 + x + v_int32::nlanes), + v_r01 = vx_load(row0 + x + VTraits::vlanes()), v_r10 = vx_load(row1 + x), - v_r11 = vx_load(row1 + x + v_int32::nlanes), + v_r11 = vx_load(row1 + x + VTraits::vlanes()), v_r20 = vx_load(row2 + x), - v_r21 = vx_load(row2 + x + v_int32::nlanes); - v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2)))); - v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); + v_r21 = vx_load(row2 + x + VTraits::vlanes()); + v_store(dst0 + x, v_rshr_pack_u<6>(v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10))), v_add(v_add(v_r01, v_r21), v_add(v_shl<1>(v_r11), v_shl<2>(v_r11))))); + v_store(dst1 + x, v_rshr_pack_u<6>(v_shl<2>(v_add(v_r10, v_r20)), v_shl<2>(v_add(v_r11, v_r21)))); } - if(x <= width - v_int32::nlanes) + if(x <= width - VTraits::vlanes()) { v_int32 v_r00 = vx_load(row0 + x), v_r10 = vx_load(row1 + x), v_r20 = vx_load(row2 + x); - v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2))); - v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2); - x += v_int32::nlanes; + v_rshr_pack_u_store<6>(dst0 + x, v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10)))); + v_rshr_pack_u_store<6>(dst1 + x, v_shl<2>(v_add(v_r10, v_r20))); + x += VTraits::vlanes(); } vx_cleanup(); @@ -706,13 +714,13 @@ template <> int PyrUpVecV(float** src, float** dst, int width) float *dst0 = dst[0], *dst1 = dst[1]; v_float32 v_6 = vx_setall_f32(6.0f), v_scale = vx_setall_f32(1.f/64.f), v_scale4 = vx_setall_f32(1.f/16.f); - for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_float32 v_r0 = vx_load(row0 + x), v_r1 = vx_load(row1 + x), v_r2 = vx_load(row2 + x); - v_store(dst1 + x, v_scale4 * (v_r1 + v_r2)); - v_store(dst0 + x, v_scale * (v_muladd(v_6, v_r1, v_r0) + v_r2)); + v_store(dst1 + x, v_mul(v_scale4, v_add(v_r1, v_r2))); + v_store(dst0 + x, v_mul(v_scale, v_add(v_muladd(v_6, v_r1, v_r0), v_r2))); } vx_cleanup(); @@ -724,36 +732,50 @@ template <> int PyrUpVecVOneRow(int** src, uchar* dst, int width) int x = 0; const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { - v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)), - v_r01 = v_pack(vx_load(row0 + x + 2 * v_int32::nlanes), vx_load(row0 + x + 3 * v_int32::nlanes)), - v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)), - v_r11 = v_pack(vx_load(row1 + x + 2 * v_int32::nlanes), vx_load(row1 + x + 3 * v_int32::nlanes)), - v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)), - v_r21 = v_pack(vx_load(row2 + x + 2 * v_int32::nlanes), vx_load(row2 + x + 3 * v_int32::nlanes)); - v_int16 v_2r10 = v_r10 + v_r10, v_2r11 = (v_r11 + v_r11); - v_store(dst + x, v_rshr_pack_u<6>(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), v_r01 + v_r21 + (v_2r11 + v_2r11 + v_2r11))); + v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits::vlanes())), + v_r01 = v_pack(vx_load(row0 + x + 2 * VTraits::vlanes()), vx_load(row0 + x + 3 * VTraits::vlanes())), + v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits::vlanes())), + v_r11 = v_pack(vx_load(row1 + x + 2 * VTraits::vlanes()), vx_load(row1 + x + 3 * VTraits::vlanes())), + v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + 
VTraits::vlanes())), + v_r21 = v_pack(vx_load(row2 + x + 2 * VTraits::vlanes()), vx_load(row2 + x + 3 * VTraits::vlanes())); + v_int16 v_2r10 = v_add(v_r10, v_r10), v_2r11 = (v_add(v_r11, v_r11)); + v_store(dst + x, v_rshr_pack_u<6>(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_add(v_add(v_r01, v_r21), v_add(v_add(v_2r11, v_2r11), v_2r11)))); } - if(x <= width - v_uint16::nlanes) + if(x <= width - VTraits::vlanes()) { - v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)), - v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)), - v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)); - v_int16 v_2r10 = v_r10 + v_r10; - v_rshr_pack_u_store<6>(dst + x, v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10)); - x += v_uint16::nlanes; + v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits::vlanes())), + v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits::vlanes())), + v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + VTraits::vlanes())); + v_int16 v_2r10 = v_add(v_r10, v_r10); + v_rshr_pack_u_store<6>(dst + x, v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10))); + x += VTraits::vlanes(); } + #if CV_SIMD128 typedef int CV_DECL_ALIGNED(1) unaligned_int; for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes) { v_int32 v_r00 = vx_load(row0 + x), v_r10 = vx_load(row1 + x), v_r20 = vx_load(row2 + x); - v_int32 v_2r10 = v_r10 + v_r10; - v_int16 d = v_pack(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), (v_r10 + v_r20) << 2); + v_int32 v_2r10 = v_add(v_r10, v_r10); + v_int16 d = v_pack(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_shl<2>(v_add(v_r10, v_r20))); *(unaligned_int*)(dst + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0(); } + #else + for (; x <= width - 1; x += 1) + { + int r00 = *(row0 + x), + r10 = *(row1 + x), + r20 = *(row2 + x); + int _2r10 = r10 + r10; + int d = r00 + r20 + (_2r10 + _2r10 + _2r10); + int d_shifted = (r10 + r20) << 2; + // Similar to v_rshr_pack_u<6>(d, vx_setzero_s16()).get0() + *(dst + x) = (int)((((unsigned int)d) + ((1 << (6 - 1)))) >> 6); + } + #endif //CV_SIMD128 vx_cleanup(); return x; @@ -764,23 +786,23 @@ template <> int PyrUpVecVOneRow(int** src, short* dst, int width) int x = 0; const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_int32 v_r00 = vx_load(row0 + x), - v_r01 = vx_load(row0 + x + v_int32::nlanes), + v_r01 = vx_load(row0 + x + VTraits::vlanes()), v_r10 = vx_load(row1 + x), - v_r11 = vx_load(row1 + x + v_int32::nlanes), + v_r11 = vx_load(row1 + x + VTraits::vlanes()), v_r20 = vx_load(row2 + x), - v_r21 = vx_load(row2 + x + v_int32::nlanes); - v_store(dst + x, v_rshr_pack<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2)))); + v_r21 = vx_load(row2 + x + VTraits::vlanes()); + v_store(dst + x, v_rshr_pack<6>(v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10))), v_add(v_add(v_r01, v_r21), v_add(v_shl<1>(v_r11), v_shl<2>(v_r11))))); } - if(x <= width - v_int32::nlanes) + if(x <= width - VTraits::vlanes()) { v_int32 v_r00 = vx_load(row0 + x), v_r10 = vx_load(row1 + x), v_r20 = vx_load(row2 + x); - v_rshr_pack_store<6>(dst + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2))); - x += v_int32::nlanes; + v_rshr_pack_store<6>(dst + x, v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10)))); + x 
+= VTraits::vlanes(); } vx_cleanup(); @@ -792,23 +814,23 @@ template <> int PyrUpVecVOneRow(int** src, ushort* dst, int width) int x = 0; const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_int32 v_r00 = vx_load(row0 + x), - v_r01 = vx_load(row0 + x + v_int32::nlanes), + v_r01 = vx_load(row0 + x + VTraits::vlanes()), v_r10 = vx_load(row1 + x), - v_r11 = vx_load(row1 + x + v_int32::nlanes), + v_r11 = vx_load(row1 + x + VTraits::vlanes()), v_r20 = vx_load(row2 + x), - v_r21 = vx_load(row2 + x + v_int32::nlanes); - v_store(dst + x, v_rshr_pack_u<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2)))); + v_r21 = vx_load(row2 + x + VTraits::vlanes()); + v_store(dst + x, v_rshr_pack_u<6>(v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10))), v_add(v_add(v_r01, v_r21), v_add(v_shl<1>(v_r11), v_shl<2>(v_r11))))); } - if(x <= width - v_int32::nlanes) + if(x <= width - VTraits::vlanes()) { v_int32 v_r00 = vx_load(row0 + x), v_r10 = vx_load(row1 + x), v_r20 = vx_load(row2 + x); - v_rshr_pack_u_store<6>(dst + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2))); - x += v_int32::nlanes; + v_rshr_pack_u_store<6>(dst + x, v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10)))); + x += VTraits::vlanes(); } vx_cleanup(); @@ -821,12 +843,12 @@ template <> int PyrUpVecVOneRow(float** src, float* dst, int width const float *row0 = src[0], *row1 = src[1], *row2 = src[2]; v_float32 v_6 = vx_setall_f32(6.0f), v_scale = vx_setall_f32(1.f/64.f); - for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_float32 v_r0 = vx_load(row0 + x), v_r1 = vx_load(row1 + x), v_r2 = vx_load(row2 + x); - v_store(dst + x, v_scale * (v_muladd(v_6, v_r1, v_r0) + v_r2)); + v_store(dst + x, v_mul(v_scale, v_add(v_muladd(v_6, v_r1, v_r0), v_r2))); } vx_cleanup(); diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index 456cfc4af916..1ad8e8932deb 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -346,8 +346,8 @@ void hlineResizeCn(uint8_t* src, int, int *o { int i = 0; ufixedpoint16 src_0(src[0]); -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); v_uint16 v_src_0 = vx_setall_u16(*((uint16_t*)&src_0)); for (; i <= dst_min - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point { @@ -358,7 +358,7 @@ void hlineResizeCn(uint8_t* src, int, int *o { *(dst++) = src_0; } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) for (; i <= dst_max - 2*VECSZ; i += 2*VECSZ, m += 4*VECSZ, dst += 2*VECSZ) { v_uint16 v_src0, v_src1; @@ -384,7 +384,7 @@ void hlineResizeCn(uint8_t* src, int, int *o *(dst++) = m[0] * px[0] + m[1] * px[1]; } src_0 = (src + ofst[dst_width - 1])[0]; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_src_0 = vx_setall_u16(*((uint16_t*)&src_0)); for (; i <= dst_width - VECSZ; i += VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point { @@ -406,8 +406,8 @@ void hlineResizeCn(uint8_t* src, int, int *o } srccn; ((ufixedpoint16*)(srccn.w))[0] = src[0]; ((ufixedpoint16*)(srccn.w))[1] = src[1]; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); 
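
The hunks above and below all apply the same mechanical migration: the compile-time lane count `v_float32::nlanes` becomes the run-time query `VTraits<v_float32>::vlanes()`, the `CV_SIMD` guards gain `CV_SIMD_SCALABLE`, and overloaded operators on vector types become named intrinsics such as `v_add`, `v_mul`, `v_shl<N>` and `v_shr<N>`, since the scalable (RVV) backend does not provide the operator overloads. A minimal standalone sketch of the pattern, not taken from the patch and assuming an OpenCV build (4.8+) where the VTraits-based API is available; the function name `axpb4` is illustrative only:

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // row[i] = 4*a[i] + b[i], written once for both fixed-width and scalable SIMD.
    static void axpb4(const float* a, const float* b, float* row, int width)
    {
        int x = 0;
    #if (CV_SIMD || CV_SIMD_SCALABLE)
        v_float32 four = vx_setall_f32(4.f);
        // Old API: for (; x <= width - v_float32::nlanes; x += v_float32::nlanes)
        //              v_store(row + x, vx_load(a + x) * four + vx_load(b + x));
        const int vl = VTraits<v_float32>::vlanes();   // lane count queried at run time
        for (; x <= width - vl; x += vl)
            v_store(row + x, v_muladd(vx_load(a + x), four, vx_load(b + x)));
        vx_cleanup();
    #endif
        for (; x < width; x++)                         // scalar tail
            row[x] = 4.f * a[x] + b[x];
    }
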
v_uint16 v_srccn = v_reinterpret_as_u16(vx_setall_u32(srccn.d)); for (; i <= dst_min - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point { @@ -419,7 +419,7 @@ void hlineResizeCn(uint8_t* src, int, int *o *(dst++) = ((ufixedpoint16*)(srccn.w))[0]; *(dst++) = ((ufixedpoint16*)(srccn.w))[1]; } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += VECSZ) { v_uint16 v_src0, v_src1; @@ -440,7 +440,7 @@ void hlineResizeCn(uint8_t* src, int, int *o *(dst++) = m[0] * px[1] + m[1] * px[3]; } ((ufixedpoint16*)(srccn.w))[0] = (src + 2 * ofst[dst_width - 1])[0]; ((ufixedpoint16*)(srccn.w))[1] = (src + 2 * ofst[dst_width - 1])[1]; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_srccn = v_reinterpret_as_u16(vx_setall_u32(srccn.d)); for (; i <= dst_width - VECSZ/2; i += VECSZ/2, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point { @@ -465,8 +465,8 @@ void hlineResizeCn(uint8_t* src, int, int *o ((ufixedpoint16*)(srccn.w))[1] = src[1]; ((ufixedpoint16*)(srccn.w))[2] = src[2]; ((ufixedpoint16*)(srccn.w))[3] = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); v_uint16 v_srccn = v_pack_triplets(v_reinterpret_as_u16(vx_setall_u64(srccn.q))); for (; i <= dst_min - (VECSZ+2)/3; i += VECSZ/4, m += VECSZ/2, dst += 3*VECSZ/4) // Points that fall left from src image so became equal to leftmost src point { @@ -479,14 +479,14 @@ void hlineResizeCn(uint8_t* src, int, int *o *(dst++) = ((ufixedpoint16*)(srccn.w))[1]; *(dst++) = ((ufixedpoint16*)(srccn.w))[2]; } -#if CV_SIMD - CV_DECL_ALIGNED(CV_SIMD_WIDTH) int ofst3[VECSZ/2]; +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_DECL_ALIGNED(CV_SIMD_WIDTH) int ofst3[VTraits::max_nlanes/2]; for (; i <= dst_max - (3*VECSZ/4 + (VECSZ+2)/3); i += VECSZ/2, m += VECSZ, dst += 3*VECSZ/2) { - v_store(ofst3, vx_load(ofst + i) * vx_setall_s32(3)); + v_store(ofst3, v_mul(vx_load(ofst + i), vx_setall_s32(3))); v_uint8 v_src01, v_src23; v_uint16 v_src0, v_src1, v_src2, v_src3; - v_zip(vx_lut_quads(src, ofst3), v_reinterpret_as_u8(v_reinterpret_as_u32(vx_lut_quads(src+2, ofst3)) >> 8), v_src01, v_src23); + v_zip(vx_lut_quads(src, ofst3), v_reinterpret_as_u8(v_shr<8>(v_reinterpret_as_u32(vx_lut_quads(src+2, ofst3)))), v_src01, v_src23); v_expand(v_src01, v_src0, v_src1); v_expand(v_src23, v_src2, v_src3); @@ -514,7 +514,7 @@ void hlineResizeCn(uint8_t* src, int, int *o ((ufixedpoint16*)(srccn.w))[0] = (src + 3*ofst[dst_width - 1])[0]; ((ufixedpoint16*)(srccn.w))[1] = (src + 3*ofst[dst_width - 1])[1]; ((ufixedpoint16*)(srccn.w))[2] = (src + 3*ofst[dst_width - 1])[2]; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_srccn = v_pack_triplets(v_reinterpret_as_u16(vx_setall_u64(srccn.q))); for (; i <= dst_width - (VECSZ+2)/3; i += VECSZ/4, dst += 3*VECSZ/4) // Points that fall right from src image so became equal to rightmost src point { @@ -540,8 +540,8 @@ void hlineResizeCn(uint8_t* src, int, int *o ((ufixedpoint16*)(srccn.w))[1] = src[1]; ((ufixedpoint16*)(srccn.w))[2] = src[2]; ((ufixedpoint16*)(srccn.w))[3] = src[3]; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); v_uint16 v_srccn = v_reinterpret_as_u16(vx_setall_u64(srccn.q)); for (; i <= dst_min - VECSZ/4; i += VECSZ/4, m += VECSZ/2, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src 
point { @@ -555,7 +555,7 @@ void hlineResizeCn(uint8_t* src, int, int *o *(dst++) = ((ufixedpoint16*)(srccn.w))[2]; *(dst++) = ((ufixedpoint16*)(srccn.w))[3]; } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += 2*VECSZ) { v_uint16 v_src0, v_src1, v_src2, v_src3; @@ -586,7 +586,7 @@ void hlineResizeCn(uint8_t* src, int, int *o } ((ufixedpoint16*)(srccn.w))[0] = (src + 4 * ofst[dst_width - 1])[0]; ((ufixedpoint16*)(srccn.w))[1] = (src + 4 * ofst[dst_width - 1])[1]; ((ufixedpoint16*)(srccn.w))[2] = (src + 4 * ofst[dst_width - 1])[2]; ((ufixedpoint16*)(srccn.w))[3] = (src + 4 * ofst[dst_width - 1])[3]; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_srccn = v_reinterpret_as_u16(vx_setall_u64(srccn.q)); for (; i <= dst_width - VECSZ/4; i += VECSZ/4, dst += VECSZ) // Points that fall right from src image so became equal to rightmost src point { @@ -606,8 +606,8 @@ void hlineResizeCn(uint16_t* src, int, int { int i = 0; ufixedpoint32 src_0(src[0]); -#if CV_SIMD - const int VECSZ = v_uint32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); v_uint32 v_src_0 = vx_setall_u32(*((uint32_t*)&src_0)); for (; i <= dst_min - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point { @@ -618,16 +618,16 @@ void hlineResizeCn(uint16_t* src, int, int { *(dst++) = src_0; } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) for (; i <= dst_max - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) { v_uint32 v_src0, v_src1; v_expand(vx_lut_pairs(src, ofst + i), v_src0, v_src1); - v_uint64 v_res0 = v_reinterpret_as_u64(v_src0 * vx_load((uint32_t*)m)); - v_uint64 v_res1 = v_reinterpret_as_u64(v_src1 * vx_load((uint32_t*)m + VECSZ)); - v_store((uint32_t*)dst, v_pack((v_res0 & vx_setall_u64(0xFFFFFFFF)) + (v_res0 >> 32), - (v_res1 & vx_setall_u64(0xFFFFFFFF)) + (v_res1 >> 32))); + v_uint64 v_res0 = v_reinterpret_as_u64(v_mul(v_src0, vx_load((uint32_t *)m))); + v_uint64 v_res1 = v_reinterpret_as_u64(v_mul(v_src1, vx_load((uint32_t *)m + VECSZ))); + v_store((uint32_t*)dst, v_pack(v_add(v_and(v_res0, vx_setall_u64(0xFFFFFFFF)), v_shr<32>(v_res0)), + v_add(v_and(v_res1, vx_setall_u64(0xFFFFFFFF)), v_shr<32>(v_res1)))); } #endif for (; i < dst_max; i += 1, m += 2) @@ -636,7 +636,7 @@ void hlineResizeCn(uint16_t* src, int, int *(dst++) = m[0] * px[0] + m[1] * px[1]; } src_0 = (src + ofst[dst_width - 1])[0]; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_src_0 = vx_setall_u32(*((uint32_t*)&src_0)); for (; i <= dst_width - VECSZ; i += VECSZ, dst += VECSZ) { @@ -659,16 +659,16 @@ template <> void vlineSet(ufixedpoint16* src, uint8_t* dst, int dst_width) { int i = 0; -#if CV_SIMD - const int VECSZ = v_uint8::nlanes; - static const v_uint16 v_fixedRound = vx_setall_u16((uint16_t)((1U << 8) >> 1)); +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); + const v_uint16 v_fixedRound = vx_setall_u16((uint16_t)((1U << 8) >> 1)); for (; i <= dst_width - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) { v_uint16 v_src0 = vx_load((uint16_t*)src); v_uint16 v_src1 = vx_load((uint16_t*)src + VECSZ/2); - v_uint16 v_res0 = (v_src0 + v_fixedRound) >> 8; - v_uint16 v_res1 = (v_src1 + v_fixedRound) >> 8; + v_uint16 v_res0 = v_shr<8>(v_add(v_src0, v_fixedRound)); + v_uint16 v_res1 = v_shr<8>(v_add(v_src1, v_fixedRound)); v_store(dst, v_pack(v_res0, v_res1)); } @@ -693,11 +693,11 @@ void vlineResize(ufixedpoint16* src, size_t src_step, { int i = 0; ufixedpoint16* src1 = 
src + src_step; -#if CV_SIMD - const int VECSZ = v_uint8::nlanes; - static const v_int32 v_fixedRound = vx_setall_s32((int32_t)((1 << 16) >> 1)); - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1<<15)); - static const v_int8 v_128_16 = v_reinterpret_as_s8 (vx_setall_u8 ((uint8_t) 1<<7)); +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); + const v_int32 v_fixedRound = vx_setall_s32((int32_t)((1 << 16) >> 1)); + const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1<<15)); + const v_int8 v_128_16 = v_reinterpret_as_s8 (vx_setall_u8 ((uint8_t) 1<<7)); v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u32(((uint32_t*)m)[0])); for (; i <= dst_width - VECSZ; i += VECSZ, src += VECSZ, src1 += VECSZ, dst += VECSZ) @@ -716,10 +716,10 @@ void vlineResize(ufixedpoint16* src, size_t src_step, v_int32 v_res2 = v_dotprod(v_tmp0, v_mul); v_int32 v_res3 = v_dotprod(v_tmp1, v_mul); - v_int8 v_res = v_pack(v_pack((v_res0 + v_fixedRound) >> 16, - (v_res1 + v_fixedRound) >> 16), - v_pack((v_res2 + v_fixedRound) >> 16, - (v_res3 + v_fixedRound) >> 16)); + v_int8 v_res = v_pack(v_pack(v_shr<16>(v_add(v_res0, v_fixedRound)), + v_shr<16>(v_add(v_res1, v_fixedRound))), + v_pack(v_shr<16>(v_add(v_res2, v_fixedRound)), + v_shr<16>(v_add(v_res3, v_fixedRound)))); v_store(dst, v_reinterpret_as_u8(v_sub_wrap(v_res, v_128_16))); } @@ -828,7 +828,7 @@ class resize_bitExactInvoker : hResize((ET*)(src + (src_height - 1) * src_step), cn, xoffsets, xcoeffs, endline, min_x, max_x, dst_width); for (; dy < range.end; dy++) vlineSet(endline, (ET*)(dst + dst_step * dy), dst_width*cn); -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) vx_cleanup(); #endif } @@ -1136,16 +1136,16 @@ class resizeNN_bitexactInvoker : public ParallelLoopBody switch( pix_size ) { case 1: -#if CV_SIMD - for( ; x <= dsize.width - v_uint8::nlanes; x += v_uint8::nlanes ) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for( ; x <= dsize.width - VTraits::vlanes(); x += VTraits::vlanes() ) v_store(D + x, vx_lut(S, x_ofse + x)); #endif for( ; x < dsize.width; x++ ) D[x] = S[x_ofse[x]]; break; case 2: -#if CV_SIMD - for( ; x <= dsize.width - v_uint16::nlanes; x += v_uint16::nlanes ) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for( ; x <= dsize.width - VTraits::vlanes(); x += VTraits::vlanes() ) v_store((ushort*)D + x, vx_lut((ushort*)S, x_ofse + x)); #endif for( ; x < dsize.width; x++ ) @@ -1159,8 +1159,8 @@ class resizeNN_bitexactInvoker : public ParallelLoopBody } break; case 4: -#if CV_SIMD - for( ; x <= dsize.width - v_uint32::nlanes; x += v_uint32::nlanes ) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for( ; x <= dsize.width - VTraits::vlanes(); x += VTraits::vlanes() ) v_store((uint32_t*)D + x, vx_lut((uint32_t*)S, x_ofse + x)); #endif for( ; x < dsize.width; x++ ) @@ -1175,8 +1175,8 @@ class resizeNN_bitexactInvoker : public ParallelLoopBody } break; case 8: -#if CV_SIMD - for( ; x <= dsize.width - v_uint64::nlanes; x += v_uint64::nlanes ) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for( ; x <= dsize.width - VTraits::vlanes(); x += VTraits::vlanes() ) v_store((uint64_t*)D + x, vx_lut((uint64_t*)S, x_ofse + x)); #endif for( ; x < dsize.width; x++ ) @@ -1250,7 +1250,7 @@ struct HResizeNoVec } }; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) struct VResizeLinearVec_32s8u { @@ -1260,22 +1260,17 @@ struct VResizeLinearVec_32s8u int x = 0; v_int16 b0 = vx_setall_s16(beta[0]), b1 = vx_setall_s16(beta[1]); - if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) - for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) - 
v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load_aligned(S0 + x ) >> 4, vx_load_aligned(S0 + x + v_int32::nlanes) >> 4), b0) + - v_mul_hi(v_pack(vx_load_aligned(S1 + x ) >> 4, vx_load_aligned(S1 + x + v_int32::nlanes) >> 4), b1), - v_mul_hi(v_pack(vx_load_aligned(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S0 + x + 3 * v_int32::nlanes) >> 4), b0) + - v_mul_hi(v_pack(vx_load_aligned(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S1 + x + 3 * v_int32::nlanes) >> 4), b1))); + if( (((size_t)S0|(size_t)S1)&(VTraits::vlanes() - 1)) == 0 ) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) + v_store(dst + x, v_rshr_pack_u<2>(v_add(v_mul_hi(v_pack(v_shr<4>(vx_load_aligned(S0 + x)), v_shr<4>(vx_load_aligned(S0 + x + VTraits::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load_aligned(S1 + x)), v_shr<4>(vx_load_aligned(S1 + x + VTraits::vlanes()))), b1)), + v_add(v_mul_hi(v_pack(v_shr<4>(vx_load_aligned(S0 + x + 2 * VTraits::vlanes())), v_shr<4>(vx_load_aligned(S0 + x + 3 * VTraits::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load_aligned(S1 + x + 2 * VTraits::vlanes())), v_shr<4>(vx_load_aligned(S1 + x + 3 * VTraits::vlanes()))), b1)))); else - for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) - v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load(S0 + x ) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) + - v_mul_hi(v_pack(vx_load(S1 + x ) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1), - v_mul_hi(v_pack(vx_load(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load(S0 + x + 3 * v_int32::nlanes) >> 4), b0) + - v_mul_hi(v_pack(vx_load(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load(S1 + x + 3 * v_int32::nlanes) >> 4), b1))); + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) + v_store(dst + x, v_rshr_pack_u<2>(v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x)), v_shr<4>(vx_load(S0 + x + VTraits::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x)), v_shr<4>(vx_load(S1 + x + VTraits::vlanes()))), b1)), + v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x + 2 * VTraits::vlanes())), v_shr<4>(vx_load(S0 + x + 3 * VTraits::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x + 2 * VTraits::vlanes())), v_shr<4>(vx_load(S1 + x + 3 * VTraits::vlanes()))), b1)))); - for( ; x < width - v_int16::nlanes; x += v_int16::nlanes) - v_rshr_pack_u_store<2>(dst + x, v_mul_hi(v_pack(vx_load(S0 + x) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) + - v_mul_hi(v_pack(vx_load(S1 + x) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1)); + for( ; x < width - VTraits::vlanes(); x += VTraits::vlanes()) + v_rshr_pack_u_store<2>(dst + x, v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x)), v_shr<4>(vx_load(S0 + x + VTraits::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x)), v_shr<4>(vx_load(S1 + x + VTraits::vlanes()))), b1))); return x; } @@ -1290,17 +1285,17 @@ struct VResizeLinearVec_32f16u v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]); - if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) - for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) - v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, vx_load_aligned(S1 + x ) * b1)), - v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1)))); + if( (((size_t)S0|(size_t)S1)&(VTraits::vlanes() - 1)) == 0 ) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) + v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, v_mul(vx_load_aligned(S1 + x), b1))), + 
v_round(v_muladd(vx_load_aligned(S0 + x + VTraits::vlanes()), b0, v_mul(vx_load_aligned(S1 + x + VTraits::vlanes()), b1))))); else - for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) - v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, vx_load(S1 + x ) * b1)), - v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1)))); - for( ; x < width - v_float32::nlanes; x += v_float32::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) + v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, v_mul(vx_load(S1 + x), b1))), + v_round(v_muladd(vx_load(S0 + x + VTraits::vlanes()), b0, v_mul(vx_load(S1 + x + VTraits::vlanes()), b1))))); + for( ; x < width - VTraits::vlanes(); x += VTraits::vlanes()) { - v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1)); + v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, v_mul(vx_load(S1 + x), b1))); v_store_low(dst + x, v_pack_u(t0, t0)); } @@ -1317,17 +1312,17 @@ struct VResizeLinearVec_32f16s v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]); - if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) - for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) - v_store(dst + x, v_pack(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, vx_load_aligned(S1 + x ) * b1)), - v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1)))); + if( (((size_t)S0|(size_t)S1)&(VTraits::vlanes() - 1)) == 0 ) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) + v_store(dst + x, v_pack(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, v_mul(vx_load_aligned(S1 + x), b1))), + v_round(v_muladd(vx_load_aligned(S0 + x + VTraits::vlanes()), b0, v_mul(vx_load_aligned(S1 + x + VTraits::vlanes()), b1))))); else - for (; x <= width - v_int16::nlanes; x += v_int16::nlanes) - v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, vx_load(S1 + x ) * b1)), - v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1)))); - for( ; x < width - v_float32::nlanes; x += v_float32::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) + v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, v_mul(vx_load(S1 + x), b1))), + v_round(v_muladd(vx_load(S0 + x + VTraits::vlanes()), b0, v_mul(vx_load(S1 + x + VTraits::vlanes()), b1))))); + for( ; x < width - VTraits::vlanes(); x += VTraits::vlanes()) { - v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1)); + v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, v_mul(vx_load(S1 + x), b1))); v_store_low(dst + x, v_pack(t0, t0)); } @@ -1344,12 +1339,12 @@ struct VResizeLinearVec_32f v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]); - if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) - for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) - v_store(dst + x, v_muladd(vx_load_aligned(S0 + x), b0, vx_load_aligned(S1 + x) * b1)); + if( (((size_t)S0|(size_t)S1)&(VTraits::vlanes() - 1)) == 0 ) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) + v_store(dst + x, v_muladd(vx_load_aligned(S0 + x), b0, v_mul(vx_load_aligned(S1 + x), b1))); else - for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) - v_store(dst + x, v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1)); + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) + v_store(dst + x, v_muladd(vx_load(S0 + x), b0, v_mul(vx_load(S1 + x), b1))); return x; } 
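
A second pattern visible in these files (the idx[] tables in PyrDownVecH and the ofst3[] buffer in hlineResizeCn): stack arrays that used to be sized with `v_float32::nlanes` are now sized with the compile-time upper bound `VTraits<...>::max_nlanes`, because with scalable vectors the actual lane count is only known at run time; loop bounds and index arithmetic keep using the run-time `vlanes()`. A sketch under the same assumptions as above (the names `gather_triples` and `ofst` are illustrative, not from the patch):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // row[x] = src[3*ofst[x]]: gather the first channel of 3-channel pixels.
    static void gather_triples(const float* src, const int* ofst, float* row, int n)
    {
        int x = 0;
    #if (CV_SIMD || CV_SIMD_SCALABLE)
        // Compile-time upper bound gives a valid array size even for scalable ISAs.
        int idx3[VTraits<v_int32>::max_nlanes];
        const int vl = VTraits<v_int32>::vlanes();     // run-time lane count
        for (; x <= n - vl; x += vl)
        {
            for (int i = 0; i < vl; i++)
                idx3[i] = 3 * ofst[x + i];             // byte-free element offsets
            v_store(row + x, vx_lut(src, idx3));       // one gathered lane per index
        }
        vx_cleanup();
    #endif
        for (; x < n; x++)                             // scalar tail
            row[x] = src[3 * ofst[x]];
    }
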
@@ -1367,26 +1362,26 @@ struct VResizeCubicVec_32s8u v_float32 b0 = vx_setall_f32(beta[0] * scale), b1 = vx_setall_f32(beta[1] * scale), b2 = vx_setall_f32(beta[2] * scale), b3 = vx_setall_f32(beta[3] * scale); - if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&(CV_SIMD_WIDTH - 1)) == 0 ) - for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&(VTraits::vlanes() - 1)) == 0 ) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x )), b0, v_muladd(v_cvt_f32(vx_load_aligned(S1 + x )), b1, v_muladd(v_cvt_f32(vx_load_aligned(S2 + x )), b2, - v_cvt_f32(vx_load_aligned(S3 + x )) * b3)))), - v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x + v_float32::nlanes)), b0, - v_muladd(v_cvt_f32(vx_load_aligned(S1 + x + v_float32::nlanes)), b1, - v_muladd(v_cvt_f32(vx_load_aligned(S2 + x + v_float32::nlanes)), b2, - v_cvt_f32(vx_load_aligned(S3 + x + v_float32::nlanes)) * b3)))))); + v_mul(v_cvt_f32(vx_load_aligned(S3 + x)), b3))))), + v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x + VTraits::vlanes())), b0, + v_muladd(v_cvt_f32(vx_load_aligned(S1 + x + VTraits::vlanes())), b1, + v_muladd(v_cvt_f32(vx_load_aligned(S2 + x + VTraits::vlanes())), b2, + v_mul(v_cvt_f32(vx_load_aligned(S3 + x + VTraits::vlanes())), b3))))))); else - for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + x )), b0, v_muladd(v_cvt_f32(vx_load(S1 + x )), b1, v_muladd(v_cvt_f32(vx_load(S2 + x )), b2, - v_cvt_f32(vx_load(S3 + x )) * b3)))), - v_round(v_muladd(v_cvt_f32(vx_load(S0 + x + v_float32::nlanes)), b0, - v_muladd(v_cvt_f32(vx_load(S1 + x + v_float32::nlanes)), b1, - v_muladd(v_cvt_f32(vx_load(S2 + x + v_float32::nlanes)), b2, - v_cvt_f32(vx_load(S3 + x + v_float32::nlanes)) * b3)))))); + v_mul(v_cvt_f32(vx_load(S3 + x)), b3))))), + v_round(v_muladd(v_cvt_f32(vx_load(S0 + x + VTraits::vlanes())), b0, + v_muladd(v_cvt_f32(vx_load(S1 + x + VTraits::vlanes())), b1, + v_muladd(v_cvt_f32(vx_load(S2 + x + VTraits::vlanes())), b2, + v_mul(v_cvt_f32(vx_load(S3 + x + VTraits::vlanes())), b3))))))); return x; } }; @@ -1400,15 +1395,15 @@ struct VResizeCubicVec_32f16u v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]); - for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, v_muladd(vx_load(S1 + x ), b1, v_muladd(vx_load(S2 + x ), b2, - vx_load(S3 + x ) * b3)))), - v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, - v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, - v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, - vx_load(S3 + x + v_float32::nlanes) * b3)))))); + v_mul(vx_load(S3 + x), b3))))), + v_round(v_muladd(vx_load(S0 + x + VTraits::vlanes()), b0, + v_muladd(vx_load(S1 + x + VTraits::vlanes()), b1, + v_muladd(vx_load(S2 + x + VTraits::vlanes()), b2, + v_mul(vx_load(S3 + x + VTraits::vlanes()), b3))))))); return x; } @@ -1423,15 +1418,15 @@ struct VResizeCubicVec_32f16s v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]); - for (; x <= width - v_int16::nlanes; x += v_int16::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) 
v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, v_muladd(vx_load(S1 + x ), b1, v_muladd(vx_load(S2 + x ), b2, - vx_load(S3 + x ) * b3)))), - v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, - v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, - v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, - vx_load(S3 + x + v_float32::nlanes) * b3)))))); + v_mul(vx_load(S3 + x), b3))))), + v_round(v_muladd(vx_load(S0 + x + VTraits::vlanes()), b0, + v_muladd(vx_load(S1 + x + VTraits::vlanes()), b1, + v_muladd(vx_load(S2 + x + VTraits::vlanes()), b2, + v_mul(vx_load(S3 + x + VTraits::vlanes()), b3))))))); return x; } @@ -1446,11 +1441,11 @@ struct VResizeCubicVec_32f v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]); - for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) v_store(dst + x, v_muladd(vx_load(S0 + x), b0, v_muladd(vx_load(S1 + x), b1, v_muladd(vx_load(S2 + x), b2, - vx_load(S3 + x) * b3)))); + v_mul(vx_load(S3 + x), b3))))); return x; } @@ -1484,7 +1479,7 @@ struct VResizeLanczos4Vec_32f16u b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]), b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]); - for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, v_muladd(vx_load(S1 + x ), b1, v_muladd(vx_load(S2 + x ), b2, @@ -1492,15 +1487,15 @@ struct VResizeLanczos4Vec_32f16u v_muladd(vx_load(S4 + x ), b4, v_muladd(vx_load(S5 + x ), b5, v_muladd(vx_load(S6 + x ), b6, - vx_load(S7 + x ) * b7)))))))), - v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, - v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, - v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, - v_muladd(vx_load(S3 + x + v_float32::nlanes), b3, - v_muladd(vx_load(S4 + x + v_float32::nlanes), b4, - v_muladd(vx_load(S5 + x + v_float32::nlanes), b5, - v_muladd(vx_load(S6 + x + v_float32::nlanes), b6, - vx_load(S7 + x + v_float32::nlanes) * b7)))))))))); + v_mul(vx_load(S7 + x ), b7))))))))), + v_round(v_muladd(vx_load(S0 + x + VTraits::vlanes()), b0, + v_muladd(vx_load(S1 + x + VTraits::vlanes()), b1, + v_muladd(vx_load(S2 + x + VTraits::vlanes()), b2, + v_muladd(vx_load(S3 + x + VTraits::vlanes()), b3, + v_muladd(vx_load(S4 + x + VTraits::vlanes()), b4, + v_muladd(vx_load(S5 + x + VTraits::vlanes()), b5, + v_muladd(vx_load(S6 + x + VTraits::vlanes()), b6, + v_mul(vx_load(S7 + x + VTraits::vlanes()), b7))))))))))); return x; } @@ -1520,7 +1515,7 @@ struct VResizeLanczos4Vec_32f16s b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]), b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]); - for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, v_muladd(vx_load(S1 + x ), b1, v_muladd(vx_load(S2 + x ), b2, @@ -1528,15 +1523,15 @@ struct VResizeLanczos4Vec_32f16s v_muladd(vx_load(S4 + x ), b4, v_muladd(vx_load(S5 + x ), b5, v_muladd(vx_load(S6 + x ), b6, - vx_load(S7 + x ) * b7)))))))), - v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, - v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, - v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, - v_muladd(vx_load(S3 + x + v_float32::nlanes), b3, - v_muladd(vx_load(S4 + x + v_float32::nlanes), b4, - v_muladd(vx_load(S5 + x + 
v_float32::nlanes), b5, - v_muladd(vx_load(S6 + x + v_float32::nlanes), b6, - vx_load(S7 + x + v_float32::nlanes) * b7)))))))))); + v_mul(vx_load(S7 + x), b7))))))))), + v_round(v_muladd(vx_load(S0 + x + VTraits::vlanes()), b0, + v_muladd(vx_load(S1 + x + VTraits::vlanes()), b1, + v_muladd(vx_load(S2 + x + VTraits::vlanes()), b2, + v_muladd(vx_load(S3 + x + VTraits::vlanes()), b3, + v_muladd(vx_load(S4 + x + VTraits::vlanes()), b4, + v_muladd(vx_load(S5 + x + VTraits::vlanes()), b5, + v_muladd(vx_load(S6 + x + VTraits::vlanes()), b6, + v_mul(vx_load(S7 + x + VTraits::vlanes()), b7))))))))))); return x; } @@ -1555,7 +1550,7 @@ struct VResizeLanczos4Vec_32f b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]), b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]); - for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) v_store(dst + x, v_muladd(vx_load(S0 + x), b0, v_muladd(vx_load(S1 + x), b1, v_muladd(vx_load(S2 + x), b2, @@ -1563,7 +1558,7 @@ struct VResizeLanczos4Vec_32f v_muladd(vx_load(S4 + x), b4, v_muladd(vx_load(S5 + x), b5, v_muladd(vx_load(S6 + x), b6, - vx_load(S7 + x) * b7)))))))); + v_mul(vx_load(S7 + x), b7))))))))); return x; } @@ -1620,8 +1615,8 @@ struct HResizeLinearVec_X4 DVT s1(S0[sx0+cn], S0[sx1+cn], S0[sx2+cn], S0[sx3+cn]); DVT s0_u(S1[sx0], S1[sx1], S1[sx2], S1[sx3]); DVT s1_u(S1[sx0+cn], S1[sx1+cn], S1[sx2+cn], S1[sx3+cn]); - v_store(&D1[dx], s0_u * a_even + s1_u * a_odd); - v_store(&D0[dx], s0 * a_even + s1 * a_odd); + v_store(&D1[dx], v_add(v_mul(s0_u, a_even), v_mul(s1_u, a_odd))); + v_store(&D0[dx], v_add(v_mul(s0, a_even), v_mul(s1, a_odd))); } } for( ; k < count; k++ ) @@ -1640,7 +1635,7 @@ struct HResizeLinearVec_X4 v_load_deinterleave(&alpha[dx*2], a_even, a_odd); DVT s0(S[sx0], S[sx1], S[sx2], S[sx3]); DVT s1(S[sx0+cn], S[sx1+cn], S[sx2+cn], S[sx3+cn]); - v_store(&D[dx], s0 * a_even + s1 * a_odd); + v_store(&D[dx], v_add(v_mul(s0, a_even), v_mul(s1, a_odd))); } } return dx; @@ -1752,8 +1747,8 @@ struct HResizeLinearVecU8_X4 for( dx = 0; (xofs[dx] + cn) < smax; dx += cn ) { v_int16x8 a = v_load(alpha+dx*2); - v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S0+xofs[dx]) | (v_load_expand_q(S0+xofs[dx]+cn)<<16)), a)); - v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S1+xofs[dx]) | (v_load_expand_q(S1+xofs[dx]+cn)<<16)), a)); + v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_or(v_load_expand_q(S0 + xofs[dx]), v_shl<16>(v_load_expand_q(S0 + xofs[dx] + cn)))), a)); + v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_or(v_load_expand_q(S1 + xofs[dx]), v_shl<16>(v_load_expand_q(S1 + xofs[dx] + cn)))), a)); } } for( ; k < count; k++ ) @@ -1763,7 +1758,7 @@ struct HResizeLinearVecU8_X4 for( dx = 0; (xofs[dx] + cn) < smax; dx += cn ) { v_int16x8 a = v_load(alpha+dx*2); - v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S+xofs[dx]) | (v_load_expand_q(S+xofs[dx]+cn)<<16)), a)); + v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_or(v_load_expand_q(S + xofs[dx]), v_shl<16>(v_load_expand_q(S + xofs[dx] + cn)))), a)); } } /* Debug check to ensure truthiness that we never vector the final value. 
*/ @@ -2452,27 +2447,27 @@ class ResizeAreaFastVec_SIMD_8u if (cn == 1) { v_uint16 masklow = vx_setall_u16(0x00ff); - for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += v_uint8::nlanes, S1 += v_uint8::nlanes, D += v_uint16::nlanes) + for ( ; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += VTraits::vlanes(), S1 += VTraits::vlanes(), D += VTraits::vlanes()) { v_uint16 r0 = v_reinterpret_as_u16(vx_load(S0)); v_uint16 r1 = v_reinterpret_as_u16(vx_load(S1)); - v_rshr_pack_store<2>(D, (r0 >> 8) + (r0 & masklow) + (r1 >> 8) + (r1 & masklow)); + v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_shr<8>(r0), v_and(r0, masklow)), v_shr<8>(r1)), v_and(r1, masklow))); } } else if (cn == 3) { if (CV_SIMD_WIDTH > 64) return 0; - for ( ; dx <= w - 3*v_uint8::nlanes; dx += 3*v_uint8::nlanes, S0 += 6*v_uint8::nlanes, S1 += 6*v_uint8::nlanes, D += 3*v_uint8::nlanes) + for ( ; dx <= w - 3*VTraits::vlanes(); dx += 3*VTraits::vlanes(), S0 += 6*VTraits::vlanes(), S1 += 6*VTraits::vlanes(), D += 3*VTraits::vlanes()) { v_uint16 t0, t1, t2, t3, t4, t5; v_uint16 s0, s1, s2, s3, s4, s5; - s0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); - s1 = vx_load_expand(S0 + v_uint16::nlanes) + vx_load_expand(S1 + v_uint16::nlanes); - s2 = vx_load_expand(S0 + 2*v_uint16::nlanes) + vx_load_expand(S1 + 2*v_uint16::nlanes); - s3 = vx_load_expand(S0 + 3*v_uint16::nlanes) + vx_load_expand(S1 + 3*v_uint16::nlanes); - s4 = vx_load_expand(S0 + 4*v_uint16::nlanes) + vx_load_expand(S1 + 4*v_uint16::nlanes); - s5 = vx_load_expand(S0 + 5*v_uint16::nlanes) + vx_load_expand(S1 + 5*v_uint16::nlanes); + s0 = v_add(vx_load_expand(S0), vx_load_expand(S1)); + s1 = v_add(vx_load_expand(S0 + VTraits::vlanes()), vx_load_expand(S1 + VTraits::vlanes())); + s2 = v_add(vx_load_expand(S0 + 2 * VTraits::vlanes()), vx_load_expand(S1 + 2 * VTraits::vlanes())); + s3 = v_add(vx_load_expand(S0 + 3 * VTraits::vlanes()), vx_load_expand(S1 + 3 * VTraits::vlanes())); + s4 = v_add(vx_load_expand(S0 + 4 * VTraits::vlanes()), vx_load_expand(S1 + 4 * VTraits::vlanes())); + s5 = v_add(vx_load_expand(S0 + 5 * VTraits::vlanes()), vx_load_expand(S1 + 5 * VTraits::vlanes())); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); @@ -2481,18 +2476,18 @@ class ResizeAreaFastVec_SIMD_8u bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; #elif CV_SIMD_WIDTH == 32 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); - bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; + bl = v_add(s0, s3); gl = v_add(s1, s4); rl = v_add(s2, s5); #elif CV_SIMD_WIDTH == 64 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; #endif - s0 = vx_load_expand(S0 + 6*v_uint16::nlanes) + vx_load_expand(S1 + 6*v_uint16::nlanes); - s1 = vx_load_expand(S0 + 7*v_uint16::nlanes) + vx_load_expand(S1 + 7*v_uint16::nlanes); - s2 = vx_load_expand(S0 + 8*v_uint16::nlanes) + vx_load_expand(S1 + 8*v_uint16::nlanes); - s3 = vx_load_expand(S0 + 9*v_uint16::nlanes) + vx_load_expand(S1 + 9*v_uint16::nlanes); - s4 = vx_load_expand(S0 +10*v_uint16::nlanes) + vx_load_expand(S1 +10*v_uint16::nlanes); - s5 = vx_load_expand(S0 +11*v_uint16::nlanes) + vx_load_expand(S1 +11*v_uint16::nlanes); + s0 = v_add(vx_load_expand(S0 + 6 * VTraits::vlanes()), vx_load_expand(S1 + 6 * VTraits::vlanes())); + s1 = v_add(vx_load_expand(S0 + 7 * 
VTraits::vlanes()), vx_load_expand(S1 + 7 * VTraits::vlanes())); + s2 = v_add(vx_load_expand(S0 + 8 * VTraits::vlanes()), vx_load_expand(S1 + 8 * VTraits::vlanes())); + s3 = v_add(vx_load_expand(S0 + 9 * VTraits::vlanes()), vx_load_expand(S1 + 9 * VTraits::vlanes())); + s4 = v_add(vx_load_expand(S0 + 10 * VTraits::vlanes()), vx_load_expand(S1 + 10 * VTraits::vlanes())); + s5 = v_add(vx_load_expand(S0 + 11 * VTraits::vlanes()), vx_load_expand(S1 + 11 * VTraits::vlanes())); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); @@ -2501,7 +2496,7 @@ class ResizeAreaFastVec_SIMD_8u bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; #elif CV_SIMD_WIDTH == 32 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); - bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; + bh = v_add(s0, s3); gh = v_add(s1, s4); rh = v_add(s2, s5); #elif CV_SIMD_WIDTH == 64 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); @@ -2513,7 +2508,7 @@ class ResizeAreaFastVec_SIMD_8u else { CV_Assert(cn == 4); - for ( ; dx <= w - v_uint8::nlanes; dx += v_uint8::nlanes, S0 += 2*v_uint8::nlanes, S1 += 2*v_uint8::nlanes, D += v_uint8::nlanes) + for ( ; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += 2*VTraits::vlanes(), S1 += 2*VTraits::vlanes(), D += VTraits::vlanes()) { v_uint32 r00, r01, r10, r11; v_load_deinterleave((uint32_t*)S0, r00, r01); @@ -2524,7 +2519,7 @@ class ResizeAreaFastVec_SIMD_8u v_expand(v_reinterpret_as_u8(r01), r01l, r01h); v_expand(v_reinterpret_as_u8(r10), r10l, r10h); v_expand(v_reinterpret_as_u8(r11), r11l, r11h); - v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h)); + v_store(D, v_rshr_pack<2>(v_add(v_add(v_add(r00l, r01l), r10l), r11l), v_add(v_add(v_add(r00h, r01h), r10h), r11h))); } } @@ -2551,11 +2546,11 @@ class ResizeAreaFastVec_SIMD_16u if (cn == 1) { v_uint32 masklow = vx_setall_u32(0x0000ffff); - for (; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes) + for (; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += VTraits::vlanes(), S1 += VTraits::vlanes(), D += VTraits::vlanes()) { v_uint32 r0 = v_reinterpret_as_u32(vx_load(S0)); v_uint32 r1 = v_reinterpret_as_u32(vx_load(S1)); - v_rshr_pack_store<2>(D, (r0 >> 16) + (r0 & masklow) + (r1 >> 16) + (r1 & masklow)); + v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_shr<16>(r0), v_and(r0, masklow)), v_shr<16>(r1)), v_and(r1, masklow))); } } else if (cn == 3) @@ -2574,38 +2569,38 @@ class ResizeAreaFastVec_SIMD_16u v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3)); #endif #elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64 - for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes) + for ( ; dx <= w - 3*VTraits::vlanes(); dx += 3*VTraits::vlanes(), S0 += 6*VTraits::vlanes(), S1 += 6*VTraits::vlanes(), D += 3*VTraits::vlanes()) { v_uint32 t0, t1, t2, t3, t4, t5; v_uint32 s0, s1, s2, s3, s4, s5; - s0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); - s1 = vx_load_expand(S0 + v_uint32::nlanes) + vx_load_expand(S1 + v_uint32::nlanes); - s2 = vx_load_expand(S0 + 2*v_uint32::nlanes) + vx_load_expand(S1 + 2*v_uint32::nlanes); - s3 = vx_load_expand(S0 + 3*v_uint32::nlanes) + 
vx_load_expand(S1 + 3*v_uint32::nlanes); - s4 = vx_load_expand(S0 + 4*v_uint32::nlanes) + vx_load_expand(S1 + 4*v_uint32::nlanes); - s5 = vx_load_expand(S0 + 5*v_uint32::nlanes) + vx_load_expand(S1 + 5*v_uint32::nlanes); + s0 = v_add(vx_load_expand(S0), vx_load_expand(S1)); + s1 = v_add(vx_load_expand(S0 + VTraits::vlanes()), vx_load_expand(S1 + VTraits::vlanes())); + s2 = v_add(vx_load_expand(S0 + 2 * VTraits::vlanes()), vx_load_expand(S1 + 2 * VTraits::vlanes())); + s3 = v_add(vx_load_expand(S0 + 3 * VTraits::vlanes()), vx_load_expand(S1 + 3 * VTraits::vlanes())); + s4 = v_add(vx_load_expand(S0 + 4 * VTraits::vlanes()), vx_load_expand(S1 + 4 * VTraits::vlanes())); + s5 = v_add(vx_load_expand(S0 + 5 * VTraits::vlanes()), vx_load_expand(S1 + 5 * VTraits::vlanes())); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_uint32 bl, gl, rl; v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); #if CV_SIMD_WIDTH == 32 - bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; + bl = v_add(t0, t3); gl = v_add(t1, t4); rl = v_add(t2, t5); #else //CV_SIMD_WIDTH == 64 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; #endif - s0 = vx_load_expand(S0 + 6*v_uint32::nlanes) + vx_load_expand(S1 + 6*v_uint32::nlanes); - s1 = vx_load_expand(S0 + 7*v_uint32::nlanes) + vx_load_expand(S1 + 7*v_uint32::nlanes); - s2 = vx_load_expand(S0 + 8*v_uint32::nlanes) + vx_load_expand(S1 + 8*v_uint32::nlanes); - s3 = vx_load_expand(S0 + 9*v_uint32::nlanes) + vx_load_expand(S1 + 9*v_uint32::nlanes); - s4 = vx_load_expand(S0 +10*v_uint32::nlanes) + vx_load_expand(S1 +10*v_uint32::nlanes); - s5 = vx_load_expand(S0 +11*v_uint32::nlanes) + vx_load_expand(S1 +11*v_uint32::nlanes); + s0 = v_add(vx_load_expand(S0 + 6 * VTraits::vlanes()), vx_load_expand(S1 + 6 * VTraits::vlanes())); + s1 = v_add(vx_load_expand(S0 + 7 * VTraits::vlanes()), vx_load_expand(S1 + 7 * VTraits::vlanes())); + s2 = v_add(vx_load_expand(S0 + 8 * VTraits::vlanes()), vx_load_expand(S1 + 8 * VTraits::vlanes())); + s3 = v_add(vx_load_expand(S0 + 9 * VTraits::vlanes()), vx_load_expand(S1 + 9 * VTraits::vlanes())); + s4 = v_add(vx_load_expand(S0 + 10 * VTraits::vlanes()), vx_load_expand(S1 + 10 * VTraits::vlanes())); + s5 = v_add(vx_load_expand(S0 + 11 * VTraits::vlanes()), vx_load_expand(S1 + 11 * VTraits::vlanes())); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_uint32 bh, gh, rh; v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); #if CV_SIMD_WIDTH == 32 - bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; + bh = v_add(t0, t3); gh = v_add(t1, t4); rh = v_add(t2, t5); #else //CV_SIMD_WIDTH == 64 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; @@ -2649,19 +2644,19 @@ class ResizeAreaFastVec_SIMD_16u v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h)); } #else - for ( ; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes) + for ( ; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += VTraits::vlanes(), S1 += VTraits::vlanes(), D += VTraits::vlanes()) { v_uint32 r0, r1, r2, r3; v_expand(vx_load(S0), r0, r1); v_expand(vx_load(S1), r2, r3); - r0 += r2; r1 += r3; + r0 = v_add(r0, r2); r1 = v_add(r1, r3); v_uint32 v_d; #if CV_SIMD_WIDTH == 16 v_d = r0 + 
r1; #elif CV_SIMD_WIDTH == 32 v_uint32 t0, t1; v_recombine(r0, r1, t0, t1); - v_d = t0 + t1; + v_d = v_add(t0, t1); #endif v_rshr_pack_store<2>(D, v_d); } @@ -2691,11 +2686,11 @@ class ResizeAreaFastVec_SIMD_16s if (cn == 1) { v_int32 masklow = vx_setall_s32(0x0000ffff); - for (; dx <= w - v_int32::nlanes; dx += v_int32::nlanes, S0 += v_int16::nlanes, S1 += v_int16::nlanes, D += v_int32::nlanes) + for (; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += VTraits::vlanes(), S1 += VTraits::vlanes(), D += VTraits::vlanes()) { v_int32 r0 = v_reinterpret_as_s32(vx_load(S0)); v_int32 r1 = v_reinterpret_as_s32(vx_load(S1)); - v_rshr_pack_store<2>(D, (r0 >> 16) + (((r0 & masklow)<<16)>>16) + (r1 >> 16) + (((r1 & masklow)<<16)>>16)); + v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_shr<16>(r0), v_shr<16>(v_shl<16>(v_and(r0, masklow)))), v_shr<16>(r1)), v_shr<16>(v_shl<16>(v_and(r1, masklow))))); } } else if (cn == 3) @@ -2704,38 +2699,38 @@ class ResizeAreaFastVec_SIMD_16s for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3)); #elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64 - for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes) + for ( ; dx <= w - 3*VTraits::vlanes(); dx += 3*VTraits::vlanes(), S0 += 6*VTraits::vlanes(), S1 += 6*VTraits::vlanes(), D += 3*VTraits::vlanes()) { v_int32 t0, t1, t2, t3, t4, t5; v_int32 s0, s1, s2, s3, s4, s5; - s0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); - s1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes); - s2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes); - s3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes); - s4 = vx_load_expand(S0 + 4*v_int32::nlanes) + vx_load_expand(S1 + 4*v_int32::nlanes); - s5 = vx_load_expand(S0 + 5*v_int32::nlanes) + vx_load_expand(S1 + 5*v_int32::nlanes); + s0 = v_add(vx_load_expand(S0), vx_load_expand(S1)); + s1 = v_add(vx_load_expand(S0 + VTraits::vlanes()), vx_load_expand(S1 + VTraits::vlanes())); + s2 = v_add(vx_load_expand(S0 + 2 * VTraits::vlanes()), vx_load_expand(S1 + 2 * VTraits::vlanes())); + s3 = v_add(vx_load_expand(S0 + 3 * VTraits::vlanes()), vx_load_expand(S1 + 3 * VTraits::vlanes())); + s4 = v_add(vx_load_expand(S0 + 4 * VTraits::vlanes()), vx_load_expand(S1 + 4 * VTraits::vlanes())); + s5 = v_add(vx_load_expand(S0 + 5 * VTraits::vlanes()), vx_load_expand(S1 + 5 * VTraits::vlanes())); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_int32 bl, gl, rl; v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); #if CV_SIMD_WIDTH == 32 - bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; + bl = v_add(t0, t3); gl = v_add(t1, t4); rl = v_add(t2, t5); #else //CV_SIMD_WIDTH == 64 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; #endif - s0 = vx_load_expand(S0 + 6*v_int32::nlanes) + vx_load_expand(S1 + 6*v_int32::nlanes); - s1 = vx_load_expand(S0 + 7*v_int32::nlanes) + vx_load_expand(S1 + 7*v_int32::nlanes); - s2 = vx_load_expand(S0 + 8*v_int32::nlanes) + vx_load_expand(S1 + 8*v_int32::nlanes); - s3 = vx_load_expand(S0 + 9*v_int32::nlanes) + vx_load_expand(S1 + 9*v_int32::nlanes); - s4 = vx_load_expand(S0 +10*v_int32::nlanes) + vx_load_expand(S1 +10*v_int32::nlanes); - s5 = 
vx_load_expand(S0 +11*v_int32::nlanes) + vx_load_expand(S1 +11*v_int32::nlanes); + s0 = v_add(vx_load_expand(S0 + 6 * VTraits::vlanes()), vx_load_expand(S1 + 6 * VTraits::vlanes())); + s1 = v_add(vx_load_expand(S0 + 7 * VTraits::vlanes()), vx_load_expand(S1 + 7 * VTraits::vlanes())); + s2 = v_add(vx_load_expand(S0 + 8 * VTraits::vlanes()), vx_load_expand(S1 + 8 * VTraits::vlanes())); + s3 = v_add(vx_load_expand(S0 + 9 * VTraits::vlanes()), vx_load_expand(S1 + 9 * VTraits::vlanes())); + s4 = v_add(vx_load_expand(S0 + 10 * VTraits::vlanes()), vx_load_expand(S1 + 10 * VTraits::vlanes())); + s5 = v_add(vx_load_expand(S0 + 11 * VTraits::vlanes()), vx_load_expand(S1 + 11 * VTraits::vlanes())); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_int32 bh, gh, rh; v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); #if CV_SIMD_WIDTH == 32 - bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; + bh = v_add(t0, t3); gh = v_add(t1, t4); rh = v_add(t2, t5); #else //CV_SIMD_WIDTH == 64 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; @@ -2763,7 +2758,7 @@ class ResizeAreaFastVec_SIMD_16s else { CV_Assert(cn == 4); - for (; dx <= w - v_int16::nlanes; dx += v_int16::nlanes, S0 += 2 * v_int16::nlanes, S1 += 2 * v_int16::nlanes, D += v_int16::nlanes) + for (; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += 2 * VTraits::vlanes(), S1 += 2 * VTraits::vlanes(), D += VTraits::vlanes()) { #if CV_SIMD_WIDTH >= 64 v_int64 r00, r01, r10, r11; @@ -2778,17 +2773,17 @@ class ResizeAreaFastVec_SIMD_16s v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h)); #else v_int32 r0, r1, r2, r3; - r0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); - r1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes); - r2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes); - r3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes); + r0 = v_add(vx_load_expand(S0), vx_load_expand(S1)); + r1 = v_add(vx_load_expand(S0 + VTraits::vlanes()), vx_load_expand(S1 + VTraits::vlanes())); + r2 = v_add(vx_load_expand(S0 + 2 * VTraits::vlanes()), vx_load_expand(S1 + 2 * VTraits::vlanes())); + r3 = v_add(vx_load_expand(S0 + 3 * VTraits::vlanes()), vx_load_expand(S1 + 3 * VTraits::vlanes())); v_int32 dl, dh; #if CV_SIMD_WIDTH == 16 dl = r0 + r1; dh = r2 + r3; #elif CV_SIMD_WIDTH == 32 v_int32 t0, t1, t2, t3; v_recombine(r0, r1, t0, t1); v_recombine(r2, r3, t2, t3); - dl = t0 + t1; dh = t2 + t3; + dl = v_add(t0, t1); dh = v_add(t2, t3); #endif v_store(D, v_rshr_pack<2>(dl, dh)); #endif @@ -2822,12 +2817,12 @@ struct ResizeAreaFastVec_SIMD_32f if (cn == 1) { v_float32 v_025 = vx_setall_f32(0.25f); - for ( ; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes) + for ( ; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += 2*VTraits::vlanes(), S1 += 2*VTraits::vlanes(), D += VTraits::vlanes()) { v_float32 v_row00, v_row01, v_row10, v_row11; v_load_deinterleave(S0, v_row00, v_row01); v_load_deinterleave(S1, v_row10, v_row11); - v_store(D, ((v_row00 + v_row01) + (v_row10 + v_row11)) * v_025); + v_store(D, v_mul(v_add(v_add(v_row00, v_row01), v_add(v_row10, v_row11)), v_025)); } } else if (cn == 4) @@ -2841,8 +2836,8 @@ struct ResizeAreaFastVec_SIMD_32f for (; dx <= w - v_float32x8::nlanes; dx += 
v_float32x8::nlanes, S0 += 2*v_float32x8::nlanes, S1 += 2*v_float32x8::nlanes, D += v_float32x8::nlanes) { v_float32x8 dst0, dst1; - v_recombine(v256_load(S0) + v256_load(S1), v256_load(S0 + v_float32x8::nlanes) + v256_load(S1 + v_float32x8::nlanes), dst0, dst1); - v_store(D, (dst0 + dst1) * v_025); + v_recombine(v_add(v256_load(S0), v256_load(S1)), v_add(v256_load(S0 + v_float32x8::nlanes), v256_load(S1 + v_float32x8::nlanes)), dst0, dst1); + v_store(D, v_mul(v_add(dst0, dst1), v_025)); } #endif } diff --git a/modules/imgproc/src/smooth.simd.hpp b/modules/imgproc/src/smooth.simd.hpp index 62ff31ac940c..33e58d4e80b4 100644 --- a/modules/imgproc/src/smooth.simd.hpp +++ b/modules/imgproc/src/smooth.simd.hpp @@ -81,11 +81,11 @@ void hlineSmooth1N(const uint8_t* src, int cn, const ufi { int lencn = len*cn; int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - v_uint16 v_mul = vx_setall_u16(*((uint16_t*)m)); +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); + v_uint16 vmul = vx_setall_u16(*((uint16_t*)m)); for (; i <= lencn - VECSZ; i += VECSZ) - v_store((uint16_t*)dst + i, v_mul_wrap(v_mul, vx_load_expand(src + i))); + v_store((uint16_t*)dst + i, v_mul(vmul, vx_load_expand(src + i))); #endif for (; i < lencn; i++) dst[i] = m[0] * src[i]; @@ -101,8 +101,8 @@ void hlineSmooth1N1(const uint8_t* src, int cn, const uf { int lencn = len*cn; int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= lencn - VECSZ; i += VECSZ) v_store((uint16_t*)dst + i, v_shl<8>(vx_load_expand(src + i))); #endif @@ -168,16 +168,14 @@ void hlineSmooth3N(const uint8_t* src, int cn, const ufi src += cn; dst += cn; int i = cn, lencn = (len - 1)*cn; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) const uint16_t* _m = (const uint16_t*)m; - const int VECSZ = v_uint16::nlanes; + const int VECSZ = VTraits::vlanes(); v_uint16 v_mul0 = vx_setall_u16(_m[0]); v_uint16 v_mul1 = vx_setall_u16(_m[1]); v_uint16 v_mul2 = vx_setall_u16(_m[2]); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn), v_mul0) + - v_mul_wrap(vx_load_expand(src), v_mul1) + - v_mul_wrap(vx_load_expand(src + cn), v_mul2)); + v_store((uint16_t*)dst, v_add(v_add(v_mul(vx_load_expand(src - cn), v_mul0), v_mul(vx_load_expand(src), v_mul1)), v_mul(vx_load_expand(src + cn), v_mul2))); #endif for (; i < lencn; i++, src++, dst++) *dst = m[0] * src[-cn] + m[1] * src[0] + m[2] * src[cn]; @@ -220,10 +218,10 @@ void hlineSmooth3N121Impl(const ET* src, int cn, const FT*, int, FT* dst, int le src += cn; dst += cn; int i = cn, lencn = (len - 1)*cn; -#if CV_SIMD - const int VECSZ = VFT::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((typename FT::raw_t*)dst, (vx_load_expand(src - cn) + vx_load_expand(src + cn) + (vx_load_expand(src) << 1)) << (FT::fixedShift-2)); + v_store((typename FT::raw_t*)dst, v_shl<(FT::fixedShift-2)>(v_add(vx_load_expand(src - cn), vx_load_expand(src + cn), v_shl<1>((vx_load_expand(src)))))); #endif for (; i < lencn; i++, src++, dst++) *dst = (FT(src[-cn])>>2) + (FT(src[cn])>>2) + (FT(src[0])>>1); @@ -320,14 +318,13 @@ void hlineSmooth3Naba(const uint8_t* src, int cn, const src += cn; dst += cn; int i = cn, lencn = (len - 1)*cn; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) const uint16_t* _m = (const uint16_t*)m; - const int VECSZ = 
v_uint16::nlanes; + const int VECSZ = VTraits::vlanes(); v_uint16 v_mul0 = vx_setall_u16(_m[0]); v_uint16 v_mul1 = vx_setall_u16(_m[1]); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul0) + - v_mul_wrap(vx_load_expand(src), v_mul1)); + v_store((uint16_t*)dst, v_add(v_mul(v_add( vx_load_expand(src - cn), vx_load_expand(src + cn)), v_mul0), v_mul(vx_load_expand(src), v_mul1))); #endif for (; i < lencn; i++, src++, dst++) *((uint16_t*)dst) = saturate_cast(((uint16_t*)m)[1] * (uint32_t)(src[0]) + ((uint16_t*)m)[0] * ((uint32_t)(src[-cn]) + (uint32_t)(src[cn]))); @@ -514,20 +511,16 @@ void hlineSmooth5N(const uint8_t* src, int cn, const ufi src += 2 * cn; dst += 2 * cn; int i = 2*cn, lencn = (len - 2)*cn; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) const uint16_t* _m = (const uint16_t*)m; - const int VECSZ = v_uint16::nlanes; + const int VECSZ = VTraits::vlanes(); v_uint16 v_mul0 = vx_setall_u16(_m[0]); v_uint16 v_mul1 = vx_setall_u16(_m[1]); v_uint16 v_mul2 = vx_setall_u16(_m[2]); v_uint16 v_mul3 = vx_setall_u16(_m[3]); v_uint16 v_mul4 = vx_setall_u16(_m[4]); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn), v_mul0) + - v_mul_wrap(vx_load_expand(src - cn), v_mul1) + - v_mul_wrap(vx_load_expand(src), v_mul2) + - v_mul_wrap(vx_load_expand(src + cn), v_mul3) + - v_mul_wrap(vx_load_expand(src + 2 * cn), v_mul4)); + v_store((uint16_t*)dst, v_add(v_add(v_add(v_add(v_mul(vx_load_expand(src - 2 * cn), v_mul0), v_mul(vx_load_expand(src - cn), v_mul1)), v_mul(vx_load_expand(src), v_mul2)), v_mul(vx_load_expand(src + cn), v_mul3)), v_mul(vx_load_expand(src + 2 * cn), v_mul4))); #endif for (; i < lencn; i++, src++, dst++) *dst = m[0] * src[-2*cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2*cn]; @@ -726,11 +719,11 @@ void hlineSmooth5N14641(const uint8_t* src, int cn, cons src += 2 * cn; dst += 2 * cn; int i = 2 * cn, lencn = (len - 2)*cn; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); v_uint16 v_6 = vx_setall_u16(6); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, (v_mul_wrap(vx_load_expand(src), v_6) + ((vx_load_expand(src - cn) + vx_load_expand(src + cn)) << 2) + vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) << 4); + v_store((uint16_t*)dst, v_shl<4>(v_add(v_add(v_add(v_mul(vx_load_expand(src), v_6), v_shl<2>(v_add(vx_load_expand(src - cn), vx_load_expand(src + cn)))), vx_load_expand(src - 2 * cn)), vx_load_expand(src + 2 * cn)))); #endif for (; i < lencn; i++, src++, dst++) *((uint16_t*)dst) = (uint16_t(src[0]) * 6 + ((uint16_t(src[-cn]) + uint16_t(src[cn])) << 2) + uint16_t(src[-2 * cn]) + uint16_t(src[2 * cn])) << 4; @@ -924,16 +917,14 @@ void hlineSmooth5Nabcba(const uint8_t* src, int cn, cons src += 2 * cn; dst += 2 * cn; int i = 2 * cn, lencn = (len - 2)*cn; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) const uint16_t* _m = (const uint16_t*)m; - const int VECSZ = v_uint16::nlanes; + const int VECSZ = VTraits::vlanes(); v_uint16 v_mul0 = vx_setall_u16(_m[0]); v_uint16 v_mul1 = vx_setall_u16(_m[1]); v_uint16 v_mul2 = vx_setall_u16(_m[2]); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn), v_mul0) + - v_mul_wrap(vx_load_expand(src - 
cn) + vx_load_expand(src + cn), v_mul1) + - v_mul_wrap(vx_load_expand(src), v_mul2)); + v_store((uint16_t*)dst, v_add(v_add(v_mul(v_add(vx_load_expand(src - 2 * cn), vx_load_expand(src + 2 * cn)), v_mul0), v_mul(v_add(vx_load_expand(src - cn), vx_load_expand(src + cn)), v_mul1)), v_mul(vx_load_expand(src), v_mul2))); #endif for (; i < lencn; i++, src++, dst++) *((uint16_t*)dst) = saturate_cast(((uint16_t*)m)[0] * ((uint32_t)(src[-2 * cn]) + (uint32_t)(src[2 * cn])) + ((uint16_t*)m)[1] * ((uint32_t)(src[-cn]) + (uint32_t)(src[cn])) + ((uint16_t*)m)[2] * (uint32_t)(src[0])); @@ -1044,13 +1035,13 @@ void hlineSmooth(const uint8_t* src, int cn, const ufixe } i *= cn; int lencn = (len - post_shift + 1)*cn; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= lencn - VECSZ; i+=VECSZ, src+=VECSZ, dst+=VECSZ) { - v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src), vx_setall_u16(*((uint16_t*)m))); + v_uint16 v_res0 = v_mul(vx_load_expand(src), vx_setall_u16(*((uint16_t*)m))); for (int j = 1; j < n; j++) - v_res0 += v_mul_wrap(vx_load_expand(src + j * cn), vx_setall_u16(*((uint16_t*)(m + j)))); + v_res0 = v_add(v_res0, v_mul(vx_load_expand(src + j * cn), vx_setall_u16(*((uint16_t *)(m + j))))); v_store((uint16_t*)dst, v_res0); } #endif @@ -1163,13 +1154,13 @@ void hlineSmoothONa_yzy_a(const uint8_t* src, int cn, co } i *= cn; int lencn = (len - post_shift + 1)*cn; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) { - v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src + pre_shift * cn), vx_setall_u16(*((uint16_t*)(m + pre_shift)))); + v_uint16 v_res0 = v_mul(vx_load_expand(src + pre_shift * cn), vx_setall_u16(*((uint16_t*)(m + pre_shift)))); for (int j = 0; j < pre_shift; j ++) - v_res0 += v_mul_wrap(vx_load_expand(src + j * cn) + vx_load_expand(src + (n - 1 - j)*cn), vx_setall_u16(*((uint16_t*)(m + j)))); + v_res0 = v_add(v_res0, v_mul(v_add(vx_load_expand(src + j * cn), vx_load_expand(src + (n - 1 - j) * cn)), vx_setall_u16(*((uint16_t *)(m + j))))); v_store((uint16_t*)dst, v_res0); } #endif @@ -1228,8 +1219,8 @@ void hlineSmoothONa_yzy_a(const uint16_t* src, int cn, } i *= cn; int lencn = (len - post_shift + 1)*cn; -#if CV_SIMD - const int VECSZ = v_uint32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= lencn - VECSZ * 2; i += VECSZ * 2, src += VECSZ * 2, dst += VECSZ * 2) { v_uint32 v_res0, v_res1; @@ -1239,11 +1230,11 @@ void hlineSmoothONa_yzy_a(const uint16_t* src, int cn, v_uint16 v_weight = vx_setall_u16((uint16_t) *((uint32_t*)(m + j))); v_uint32 v_add0, v_add1; v_mul_expand(vx_load(src + j * cn), v_weight, v_add0, v_add1); - v_res0 += v_add0; - v_res1 += v_add1; + v_res0 = v_add(v_res0, v_add0); + v_res1 = v_add(v_res1, v_add1); v_mul_expand(vx_load(src + (n - 1 - j)*cn), v_weight, v_add0, v_add1); - v_res0 += v_add0; - v_res1 += v_add1; + v_res0 = v_add(v_res0, v_add0); + v_res1 = v_add(v_res1, v_add1); } v_store((uint32_t*)dst, v_res0); v_store((uint32_t*)dst + VECSZ, v_res1); @@ -1285,8 +1276,8 @@ void vlineSmooth1N(const ufixedpoint16* const * src, con { const ufixedpoint16* src0 = src[0]; int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); v_uint16 v_mul = vx_setall_u16(*((uint16_t*)m)<<1); for (; i <= len - VECSZ; i += VECSZ) 
v_rshr_pack_store<1>(dst + i, v_mul_hi(vx_load((uint16_t*)src0 + i), v_mul)); @@ -1306,8 +1297,8 @@ void vlineSmooth1N1(const ufixedpoint16* const * src, co { const ufixedpoint16* src0 = src[0]; int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= len - VECSZ; i += VECSZ) v_rshr_pack_store<8>(dst + i, vx_load((uint16_t*)(src0 + i))); #endif @@ -1324,10 +1315,10 @@ template <> void vlineSmooth3N(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len) { int i = 0; -#if CV_SIMD - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); +#if (CV_SIMD || CV_SIMD_SCALABLE) + const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); v_int32 v_128_4 = vx_setall_s32(128 << 16); - const int VECSZ = v_uint16::nlanes; + const int VECSZ = VTraits::vlanes(); if (len >= VECSZ) { ufixedpoint32 val[] = { (m[0] + m[1] + m[2]) * ufixedpoint16((uint8_t)128) }; @@ -1370,26 +1361,26 @@ void vlineSmooth3N(const ufixedpoint16* const * src, con v_src02 = vx_load(src2 + 2*VECSZ); v_src03 = vx_load(src2 + 3*VECSZ); v_mul_expand(v_add_wrap(v_src00, v_128), v_mul2, v_resj0, v_resj1); - v_res0 += v_resj0; - v_res1 += v_resj1; + v_res0 = v_add(v_res0, v_resj0); + v_res1 = v_add(v_res1, v_resj1); v_mul_expand(v_add_wrap(v_src01, v_128), v_mul2, v_resj0, v_resj1); - v_res2 += v_resj0; - v_res3 += v_resj1; + v_res2 = v_add(v_res2, v_resj0); + v_res3 = v_add(v_res3, v_resj1); v_mul_expand(v_add_wrap(v_src02, v_128), v_mul2, v_resj0, v_resj1); - v_res4 += v_resj0; - v_res5 += v_resj1; + v_res4 = v_add(v_res4, v_resj0); + v_res5 = v_add(v_res5, v_resj1); v_mul_expand(v_add_wrap(v_src03, v_128), v_mul2, v_resj0, v_resj1); - v_res6 += v_resj0; - v_res7 += v_resj1; - - v_res0 += v_128_4; - v_res1 += v_128_4; - v_res2 += v_128_4; - v_res3 += v_128_4; - v_res4 += v_128_4; - v_res5 += v_128_4; - v_res6 += v_128_4; - v_res7 += v_128_4; + v_res6 = v_add(v_res6, v_resj0); + v_res7 = v_add(v_res7, v_resj1); + + v_res0 = v_add(v_res0, v_128_4); + v_res1 = v_add(v_res1, v_128_4); + v_res2 = v_add(v_res2, v_128_4); + v_res3 = v_add(v_res3, v_128_4); + v_res4 = v_add(v_res4, v_128_4); + v_res5 = v_add(v_res5, v_128_4); + v_res6 = v_add(v_res6, v_128_4); + v_res7 = v_add(v_res7, v_128_4); v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); @@ -1410,8 +1401,8 @@ template <> void vlineSmooth3N121(const ufixedpoint16* const * src, const ufixedpoint16*, int, uint8_t* dst, int len) { int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= len - 2*VECSZ; i += 2*VECSZ) { v_uint32 v_src00, v_src01, v_src02, v_src03, v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23; @@ -1421,8 +1412,8 @@ void vlineSmooth3N121(const ufixedpoint16* const * src, v_expand(vx_load((uint16_t*)(src[1]) + i + VECSZ), v_src12, v_src13); v_expand(vx_load((uint16_t*)(src[2]) + i), v_src20, v_src21); v_expand(vx_load((uint16_t*)(src[2]) + i + VECSZ), v_src22, v_src23); - v_store(dst + i, v_pack(v_rshr_pack<10>(v_src00 + v_src20 + (v_src10 + v_src10), v_src01 + v_src21 + (v_src11 + v_src11)), - v_rshr_pack<10>(v_src02 + v_src22 + (v_src12 + v_src12), v_src03 + v_src23 + (v_src13 + v_src13)))); + v_store(dst + i, v_pack(v_rshr_pack<10>(v_add(v_add(v_src00, v_src20), v_add(v_src10, v_src10)), v_add(v_add(v_src01, 
v_src21), v_add(v_src11, v_src11))), + v_rshr_pack<10>(v_add(v_add(v_src02, v_src22), v_add(v_src12, v_src12)), v_add(v_add(v_src03, v_src23), v_add(v_src13, v_src13))))); } #endif for (; i < len; i++) @@ -1432,8 +1423,8 @@ template <> void vlineSmooth3N121(const ufixedpoint32* const * src, const ufixedpoint32*, int, uint16_t* dst, int len) { int i = 0; -#if CV_SIMD - const int VECSZ = v_uint32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= len - 2*VECSZ; i += 2*VECSZ) { v_uint64 v_src00, v_src01, v_src02, v_src03, v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23; @@ -1443,8 +1434,8 @@ void vlineSmooth3N121(const ufixedpoint32* const * src, v_expand(vx_load((uint32_t*)(src[1]) + i + VECSZ), v_src12, v_src13); v_expand(vx_load((uint32_t*)(src[2]) + i), v_src20, v_src21); v_expand(vx_load((uint32_t*)(src[2]) + i + VECSZ), v_src22, v_src23); - v_store(dst + i, v_pack(v_rshr_pack<18>(v_src00 + v_src20 + (v_src10 + v_src10), v_src01 + v_src21 + (v_src11 + v_src11)), - v_rshr_pack<18>(v_src02 + v_src22 + (v_src12 + v_src12), v_src03 + v_src23 + (v_src13 + v_src13)))); + v_store(dst + i, v_pack(v_rshr_pack<18>(v_add(v_add(v_src00, v_src20), v_add(v_src10, v_src10)), v_add(v_add(v_src01, v_src21), v_add(v_src11, v_src11))), + v_rshr_pack<18>(v_add(v_add(v_src02, v_src22), v_add(v_src12, v_src12)), v_add(v_add(v_src03, v_src23), v_add(v_src13, v_src13))))); } #endif for (; i < len; i++) @@ -1460,13 +1451,13 @@ template <> void vlineSmooth5N(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len) { int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); if (len >= 4 * VECSZ) { ufixedpoint32 val[] = { (m[0] + m[1] + m[2] + m[3] + m[4]) * ufixedpoint16((uint8_t)128) }; v_int32 v_128_4 = vx_setall_s32(*((int32_t*)val)); - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); + const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); v_int16 v_mul01 = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)m))); v_int16 v_mul23 = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)(m + 2)))); v_int16 v_mul4 = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + 4)))); @@ -1509,17 +1500,17 @@ void vlineSmooth5N(const ufixedpoint16* const * src, con v_src12 = vx_load(src3 + 2*VECSZ); v_src13 = vx_load(src3 + 3*VECSZ); v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); - v_res0 += v_dotprod(v_tmp0, v_mul23); - v_res1 += v_dotprod(v_tmp1, v_mul23); + v_res0 = v_add(v_res0, v_dotprod(v_tmp0, v_mul23)); + v_res1 = v_add(v_res1, v_dotprod(v_tmp1, v_mul23)); v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); - v_res2 += v_dotprod(v_tmp0, v_mul23); - v_res3 += v_dotprod(v_tmp1, v_mul23); + v_res2 = v_add(v_res2, v_dotprod(v_tmp0, v_mul23)); + v_res3 = v_add(v_res3, v_dotprod(v_tmp1, v_mul23)); v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); - v_res4 += v_dotprod(v_tmp0, v_mul23); - v_res5 += v_dotprod(v_tmp1, v_mul23); + v_res4 = v_add(v_res4, v_dotprod(v_tmp0, v_mul23)); + v_res5 = v_add(v_res5, v_dotprod(v_tmp1, v_mul23)); v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); - v_res6 += v_dotprod(v_tmp0, v_mul23); - v_res7 += v_dotprod(v_tmp1, v_mul23); + v_res6 = v_add(v_res6, v_dotprod(v_tmp0, v_mul23)); + v_res7 = v_add(v_res7, v_dotprod(v_tmp1, v_mul23)); v_int32 v_resj0, 
v_resj1; const int16_t* src4 = (const int16_t*)src[4] + i; @@ -1528,26 +1519,26 @@ void vlineSmooth5N(const ufixedpoint16* const * src, con v_src02 = vx_load(src4 + 2*VECSZ); v_src03 = vx_load(src4 + 3*VECSZ); v_mul_expand(v_add_wrap(v_src00, v_128), v_mul4, v_resj0, v_resj1); - v_res0 += v_resj0; - v_res1 += v_resj1; + v_res0 = v_add(v_res0, v_resj0); + v_res1 = v_add(v_res1, v_resj1); v_mul_expand(v_add_wrap(v_src01, v_128), v_mul4, v_resj0, v_resj1); - v_res2 += v_resj0; - v_res3 += v_resj1; + v_res2 = v_add(v_res2, v_resj0); + v_res3 = v_add(v_res3, v_resj1); v_mul_expand(v_add_wrap(v_src02, v_128), v_mul4, v_resj0, v_resj1); - v_res4 += v_resj0; - v_res5 += v_resj1; + v_res4 = v_add(v_res4, v_resj0); + v_res5 = v_add(v_res5, v_resj1); v_mul_expand(v_add_wrap(v_src03, v_128), v_mul4, v_resj0, v_resj1); - v_res6 += v_resj0; - v_res7 += v_resj1; - - v_res0 += v_128_4; - v_res1 += v_128_4; - v_res2 += v_128_4; - v_res3 += v_128_4; - v_res4 += v_128_4; - v_res5 += v_128_4; - v_res6 += v_128_4; - v_res7 += v_128_4; + v_res6 = v_add(v_res6, v_resj0); + v_res7 = v_add(v_res7, v_resj1); + + v_res0 = v_add(v_res0, v_128_4); + v_res1 = v_add(v_res1, v_128_4); + v_res2 = v_add(v_res2, v_128_4); + v_res3 = v_add(v_res3, v_128_4); + v_res4 = v_add(v_res4, v_128_4); + v_res5 = v_add(v_res5, v_128_4); + v_res6 = v_add(v_res6, v_128_4); + v_res7 = v_add(v_res7, v_128_4); v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); @@ -1569,9 +1560,9 @@ template <> void vlineSmooth5N14641(const ufixedpoint16* const * src, const ufixedpoint16*, int, uint8_t* dst, int len) { int i = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_uint32 v_6 = vx_setall_u32(6); - const int VECSZ = v_uint16::nlanes; + const int VECSZ = VTraits::vlanes(); for (; i <= len - 2*VECSZ; i += 2*VECSZ) { v_uint32 v_src00, v_src10, v_src20, v_src30, v_src40; @@ -1588,10 +1579,10 @@ void vlineSmooth5N14641(const ufixedpoint16* const * src v_expand(vx_load((uint16_t*)(src[3]) + i + VECSZ), v_src32, v_src33); v_expand(vx_load((uint16_t*)(src[4]) + i), v_src40, v_src41); v_expand(vx_load((uint16_t*)(src[4]) + i + VECSZ), v_src42, v_src43); - v_store(dst + i, v_pack(v_rshr_pack<12>(v_src20*v_6 + ((v_src10 + v_src30) << 2) + v_src00 + v_src40, - v_src21*v_6 + ((v_src11 + v_src31) << 2) + v_src01 + v_src41), - v_rshr_pack<12>(v_src22*v_6 + ((v_src12 + v_src32) << 2) + v_src02 + v_src42, - v_src23*v_6 + ((v_src13 + v_src33) << 2) + v_src03 + v_src43))); + v_store(dst + i, v_pack(v_rshr_pack<12>(v_add(v_add(v_add(v_mul(v_src20, v_6), v_shl<2>(v_add(v_src10, v_src30))), v_src00), v_src40), + v_add(v_add(v_add(v_mul(v_src21, v_6), v_shl<2>(v_add(v_src11, v_src31))), v_src01), v_src41)), + v_rshr_pack<12>(v_add(v_add(v_add(v_mul(v_src22, v_6), v_shl<2>(v_add(v_src12, v_src32))), v_src02), v_src42), + v_add(v_add(v_add(v_mul(v_src23, v_6), v_shl<2>(v_add(v_src13, v_src33))), v_src03), v_src43)))); } #endif for (; i < len; i++) @@ -1603,8 +1594,8 @@ template <> void vlineSmooth5N14641(const ufixedpoint32* const * src, const ufixedpoint32*, int, uint16_t* dst, int len) { int i = 0; -#if CV_SIMD - const int VECSZ = v_uint32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= len - 2*VECSZ; i += 2*VECSZ) { v_uint64 v_src00, v_src10, v_src20, v_src30, v_src40; @@ -1621,10 +1612,10 @@ void vlineSmooth5N14641(const ufixedpoint32* const * sr v_expand(vx_load((uint32_t*)(src[3]) + i + VECSZ), v_src32, v_src33); 
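// The 32-bit fixed-point rows are widened to 64-bit lanes (v_expand) so the
// 1-4-6-4-1 weighted sum below cannot overflow before the >>20 repack; the
// centre-tap multiply by 6 stays expressed with shifts, 6*x == (x << 2) + (x << 1),
// rather than a 64-bit lane multiply.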
v_expand(vx_load((uint32_t*)(src[4]) + i), v_src40, v_src41); v_expand(vx_load((uint32_t*)(src[4]) + i + VECSZ), v_src42, v_src43); - v_store(dst + i, v_pack(v_rshr_pack<20>((v_src20 << 2) + (v_src20 << 1) + ((v_src10 + v_src30) << 2) + v_src00 + v_src40, - (v_src21 << 2) + (v_src21 << 1) + ((v_src11 + v_src31) << 2) + v_src01 + v_src41), - v_rshr_pack<20>((v_src22 << 2) + (v_src22 << 1) + ((v_src12 + v_src32) << 2) + v_src02 + v_src42, - (v_src23 << 2) + (v_src23 << 1) + ((v_src13 + v_src33) << 2) + v_src03 + v_src43))); + v_store(dst + i, v_pack(v_rshr_pack<20>(v_add(v_add(v_add(v_add(v_shl<2>(v_src20), v_shl<1>(v_src20)), v_shl<2>(v_add(v_src10, v_src30))), v_src00), v_src40), + v_add(v_add(v_add(v_add(v_shl<2>(v_src21), v_shl<1>(v_src21)), v_shl<2>(v_add(v_src11, v_src31))), v_src01), v_src41)), + v_rshr_pack<20>(v_add(v_add(v_add(v_add(v_shl<2>(v_src22), v_shl<1>(v_src22)), v_shl<2>(v_add(v_src12, v_src32))), v_src02), v_src42), + v_add(v_add(v_add(v_add(v_shl<2>(v_src23), v_shl<1>(v_src23)), v_shl<2>(v_add(v_src13, v_src33))), v_src03), v_src43)))); } #endif for (; i < len; i++) @@ -1647,10 +1638,10 @@ template <> void vlineSmooth(const ufixedpoint16* const * src, const ufixedpoint16* m, int n, uint8_t* dst, int len) { int i = 0; -#if CV_SIMD - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); +#if (CV_SIMD || CV_SIMD_SCALABLE) + const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); v_int32 v_128_4 = vx_setall_s32(128 << 16); - const int VECSZ = v_uint16::nlanes; + const int VECSZ = VTraits::vlanes(); if (len >= VECSZ) { ufixedpoint16 msum = m[0] + m[1]; @@ -1705,17 +1696,17 @@ void vlineSmooth(const ufixedpoint16* const * src, const v_src12 = vx_load(srcj1 + 2*VECSZ); v_src13 = vx_load(srcj1 + 3*VECSZ); v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); - v_res0 += v_dotprod(v_tmp0, v_mul); - v_res1 += v_dotprod(v_tmp1, v_mul); + v_res0 = v_add(v_res0, v_dotprod(v_tmp0, v_mul)); + v_res1 = v_add(v_res1, v_dotprod(v_tmp1, v_mul)); v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); - v_res2 += v_dotprod(v_tmp0, v_mul); - v_res3 += v_dotprod(v_tmp1, v_mul); + v_res2 = v_add(v_res2, v_dotprod(v_tmp0, v_mul)); + v_res3 = v_add(v_res3, v_dotprod(v_tmp1, v_mul)); v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); - v_res4 += v_dotprod(v_tmp0, v_mul); - v_res5 += v_dotprod(v_tmp1, v_mul); + v_res4 = v_add(v_res4, v_dotprod(v_tmp0, v_mul)); + v_res5 = v_add(v_res5, v_dotprod(v_tmp1, v_mul)); v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); - v_res6 += v_dotprod(v_tmp0, v_mul); - v_res7 += v_dotprod(v_tmp1, v_mul); + v_res6 = v_add(v_res6, v_dotprod(v_tmp0, v_mul)); + v_res7 = v_add(v_res7, v_dotprod(v_tmp1, v_mul)); } if(j < n) { @@ -1727,26 +1718,26 @@ void vlineSmooth(const ufixedpoint16* const * src, const v_src02 = vx_load(srcj + 2*VECSZ); v_src03 = vx_load(srcj + 3*VECSZ); v_mul_expand(v_add_wrap(v_src00, v_128), v_mul, v_resj0, v_resj1); - v_res0 += v_resj0; - v_res1 += v_resj1; + v_res0 = v_add(v_res0, v_resj0); + v_res1 = v_add(v_res1, v_resj1); v_mul_expand(v_add_wrap(v_src01, v_128), v_mul, v_resj0, v_resj1); - v_res2 += v_resj0; - v_res3 += v_resj1; + v_res2 = v_add(v_res2, v_resj0); + v_res3 = v_add(v_res3, v_resj1); v_mul_expand(v_add_wrap(v_src02, v_128), v_mul, v_resj0, v_resj1); - v_res4 += v_resj0; - v_res5 += v_resj1; + v_res4 = v_add(v_res4, v_resj0); + v_res5 = v_add(v_res5, v_resj1); 
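The accumulator updates being rewritten in this hunk (v_res0 += v_dotprod(...) becoming v_res0 = v_add(v_res0, v_dotprod(...))) are the same mechanical transformation applied throughout the patch: compile-time lane counts such as v_uint16::nlanes become VTraits<v_uint16>::vlanes() calls, and the overloaded vector operators (+, *, <<, >>, &, ==, ~) become the named intrinsics (v_add, v_mul, v_shl, v_shr, v_and, v_eq, v_not), so the same source also builds for CV_SIMD_SCALABLE backends such as RVV, where the vector types carry no compile-time size. A minimal sketch of the pattern, assuming only OpenCV's universal intrinsics header; the helper name add_rows_u16 is illustrative and not part of the patch:

#include "opencv2/core/hal/intrin.hpp"
#include <cstdint>
using namespace cv;

// Sum two rows of 16-bit pixels; the guarded body uses the name-based API only.
static void add_rows_u16(const uint16_t* a, const uint16_t* b, uint16_t* dst, int len)
{
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int VECSZ = VTraits<v_uint16>::vlanes();                // was: v_uint16::nlanes
    for (; i <= len - VECSZ; i += VECSZ)
        v_store(dst + i, v_add(vx_load(a + i), vx_load(b + i)));  // was: vx_load(a+i) + vx_load(b+i)
#endif
    for (; i < len; i++)                 // scalar tail, untouched by the rewrite
        dst[i] = (uint16_t)(a[i] + b[i]);
}

The SIMD block compiles away when neither macro is defined, leaving only the scalar loop, which is why every rewritten function in this patch keeps its scalar tail unchanged.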
v_mul_expand(v_add_wrap(v_src03, v_128), v_mul, v_resj0, v_resj1); - v_res6 += v_resj0; - v_res7 += v_resj1; - } - v_res0 += v_128_4; - v_res1 += v_128_4; - v_res2 += v_128_4; - v_res3 += v_128_4; - v_res4 += v_128_4; - v_res5 += v_128_4; - v_res6 += v_128_4; - v_res7 += v_128_4; + v_res6 = v_add(v_res6, v_resj0); + v_res7 = v_add(v_res7, v_resj1); + } + v_res0 = v_add(v_res0, v_128_4); + v_res1 = v_add(v_res1, v_128_4); + v_res2 = v_add(v_res2, v_128_4); + v_res3 = v_add(v_res3, v_128_4); + v_res4 = v_add(v_res4, v_128_4); + v_res5 = v_add(v_res5, v_128_4); + v_res6 = v_add(v_res6, v_128_4); + v_res7 = v_add(v_res7, v_128_4); v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); @@ -1780,11 +1771,11 @@ template <> void vlineSmoothONa_yzy_a(const ufixedpoint16* const * src, const ufixedpoint16* m, int n, uint8_t* dst, int len) { int i = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int pre_shift = n / 2; - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); + const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); v_int32 v_128_4 = vx_setall_s32(128 << 16); - const int VECSZ = v_uint16::nlanes; + const int VECSZ = VTraits::vlanes(); if (len >= VECSZ) { ufixedpoint16 msum = m[0] + m[pre_shift] + m[n - 1]; @@ -1826,27 +1817,27 @@ void vlineSmoothONa_yzy_a(const ufixedpoint16* const * s v_src21 = vx_load(srcj1 + 2*VECSZ); v_src31 = vx_load(srcj1 + 3*VECSZ); v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src01, v_128), v_tmp0, v_tmp1); - v_res0 += v_dotprod(v_tmp0, v_mul); - v_res1 += v_dotprod(v_tmp1, v_mul); + v_res0 = v_add(v_res0, v_dotprod(v_tmp0, v_mul)); + v_res1 = v_add(v_res1, v_dotprod(v_tmp1, v_mul)); v_zip(v_add_wrap(v_src10, v_128), v_add_wrap(v_src11, v_128), v_tmp2, v_tmp3); - v_res2 += v_dotprod(v_tmp2, v_mul); - v_res3 += v_dotprod(v_tmp3, v_mul); + v_res2 = v_add(v_res2, v_dotprod(v_tmp2, v_mul)); + v_res3 = v_add(v_res3, v_dotprod(v_tmp3, v_mul)); v_zip(v_add_wrap(v_src20, v_128), v_add_wrap(v_src21, v_128), v_tmp4, v_tmp5); - v_res4 += v_dotprod(v_tmp4, v_mul); - v_res5 += v_dotprod(v_tmp5, v_mul); + v_res4 = v_add(v_res4, v_dotprod(v_tmp4, v_mul)); + v_res5 = v_add(v_res5, v_dotprod(v_tmp5, v_mul)); v_zip(v_add_wrap(v_src30, v_128), v_add_wrap(v_src31, v_128), v_tmp6, v_tmp7); - v_res6 += v_dotprod(v_tmp6, v_mul); - v_res7 += v_dotprod(v_tmp7, v_mul); + v_res6 = v_add(v_res6, v_dotprod(v_tmp6, v_mul)); + v_res7 = v_add(v_res7, v_dotprod(v_tmp7, v_mul)); } - v_res0 += v_128_4; - v_res1 += v_128_4; - v_res2 += v_128_4; - v_res3 += v_128_4; - v_res4 += v_128_4; - v_res5 += v_128_4; - v_res6 += v_128_4; - v_res7 += v_128_4; + v_res0 = v_add(v_res0, v_128_4); + v_res1 = v_add(v_res1, v_128_4); + v_res2 = v_add(v_res2, v_128_4); + v_res3 = v_add(v_res3, v_128_4); + v_res4 = v_add(v_res4, v_128_4); + v_res5 = v_add(v_res5, v_128_4); + v_res6 = v_add(v_res6, v_128_4); + v_res7 = v_add(v_res7, v_128_4); v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); @@ -1868,9 +1859,9 @@ template <> void vlineSmoothONa_yzy_a(const ufixedpoint32* const * src, const ufixedpoint32* m, int n, uint16_t* dst, int len) { int i = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int pre_shift = n / 2; - const int VECSZ = v_uint32::nlanes; + const int VECSZ = VTraits::vlanes(); for (; i <= len - 2*VECSZ; i += 2*VECSZ) { v_uint32 v_src00, v_src10, v_src01, v_src11; @@ -1895,15 
+1886,15 @@ void vlineSmoothONa_yzy_a(const ufixedpoint32* const * v_src01 = vx_load(srcj1); v_mul_expand(v_src00, v_mul, v_tmp0, v_tmp1); v_mul_expand(v_src01, v_mul, v_tmp2, v_tmp3); - v_res0 += v_tmp0 + v_tmp2; - v_res1 += v_tmp1 + v_tmp3; + v_res0 = v_add(v_res0, v_add(v_tmp0, v_tmp2)); + v_res1 = v_add(v_res1, v_add(v_tmp1, v_tmp3)); v_src10 = vx_load(srcj0 + VECSZ); v_src11 = vx_load(srcj1 + VECSZ); v_mul_expand(v_src10, v_mul, v_tmp4, v_tmp5); v_mul_expand(v_src11, v_mul, v_tmp6, v_tmp7); - v_res2 += v_tmp4 + v_tmp6; - v_res3 += v_tmp5 + v_tmp7; + v_res2 = v_add(v_res2, v_add(v_tmp4, v_tmp6)); + v_res3 = v_add(v_res3, v_add(v_tmp5, v_tmp7)); } v_store(dst + i, v_pack(v_rshr_pack<32>(v_res0, v_res1), diff --git a/modules/imgproc/src/spatialgradient.cpp b/modules/imgproc/src/spatialgradient.cpp index 1aed1fa03166..f422609c40f6 100644 --- a/modules/imgproc/src/spatialgradient.cpp +++ b/modules/imgproc/src/spatialgradient.cpp @@ -57,15 +57,33 @@ namespace cv * 0 0 0 * 1 2 1 */ +#if (CV_SIMD || CV_SIMD_SCALABLE) template -static inline void spatialGradientKernel( T& vx, T& vy, +static inline void spatialGradientKernel_vec( T& vx, T& vy, const T& v00, const T& v01, const T& v02, const T& v10, const T& v12, const T& v20, const T& v21, const T& v22 ) { // vx = (v22 - v00) + (v02 - v20) + 2 * (v12 - v10) // vy = (v22 - v00) + (v20 - v02) + 2 * (v21 - v01) + T tmp_add = v_sub(v22, v00), + tmp_sub = v_sub(v02, v20), + tmp_x = v_sub(v12, v10), + tmp_y = v_sub(v21, v01); + + vx = v_add(v_add(v_add(tmp_add, tmp_sub), tmp_x), tmp_x); + vy = v_add(v_add(v_sub(tmp_add, tmp_sub), tmp_y), tmp_y); +} +#endif +template +static inline void spatialGradientKernel( T& vx, T& vy, + const T& v00, const T& v01, const T& v02, + const T& v10, const T& v12, + const T& v20, const T& v21, const T& v22 ) +{ + // vx = (v22 - v00) + (v02 - v20) + 2 * (v12 - v10) + // vy = (v22 - v00) + (v20 - v02) + 2 * (v21 - v01) T tmp_add = v22 - v00, tmp_sub = v02 - v20, tmp_x = v12 - v10, @@ -125,7 +143,7 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, int i_start = 0; int j_start = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) // Characters in variable names have the following meanings: // u: unsigned char // s: signed int @@ -148,7 +166,7 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, short *n_dy = dy.ptr(i+1); // Process rest of columns 16-column chunks at a time - for ( j = 1; j < W - v_uint8::nlanes; j += v_uint8::nlanes) + for ( j = 1; j < W - VTraits::vlanes(); j += VTraits::vlanes()) { // Load top row for 3x3 Sobel filter v_uint8 v_um = vx_load(&p_src[j-1]); @@ -195,22 +213,22 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, // dx & dy for rows 1, 2, 3 v_int16 v_sdx1, v_sdy1; - spatialGradientKernel( v_sdx1, v_sdy1, + spatialGradientKernel_vec( v_sdx1, v_sdy1, v_s1m1, v_s1n1, v_s1p1, v_s2m1, v_s2p1, v_s3m1, v_s3n1, v_s3p1 ); v_int16 v_sdx2, v_sdy2; - spatialGradientKernel( v_sdx2, v_sdy2, + spatialGradientKernel_vec( v_sdx2, v_sdy2, v_s1m2, v_s1n2, v_s1p2, v_s2m2, v_s2p2, v_s3m2, v_s3n2, v_s3p2 ); // Store v_store(&c_dx[j], v_sdx1); - v_store(&c_dx[j+v_int16::nlanes], v_sdx2); + v_store(&c_dx[j+VTraits::vlanes()], v_sdx2); v_store(&c_dy[j], v_sdy1); - v_store(&c_dy[j+v_int16::nlanes], v_sdy2); + v_store(&c_dy[j+VTraits::vlanes()], v_sdy2); // Load fourth row for 3x3 Sobel filter v_um = vx_load(&m_src[j-1]); @@ -227,21 +245,21 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, v_int16 v_s4p2 = 
v_reinterpret_as_s16(v_up2); // dx & dy for rows 2, 3, 4 - spatialGradientKernel( v_sdx1, v_sdy1, + spatialGradientKernel_vec( v_sdx1, v_sdy1, v_s2m1, v_s2n1, v_s2p1, v_s3m1, v_s3p1, v_s4m1, v_s4n1, v_s4p1 ); - spatialGradientKernel( v_sdx2, v_sdy2, + spatialGradientKernel_vec( v_sdx2, v_sdy2, v_s2m2, v_s2n2, v_s2p2, v_s3m2, v_s3p2, v_s4m2, v_s4n2, v_s4p2 ); // Store v_store(&n_dx[j], v_sdx1); - v_store(&n_dx[j+v_int16::nlanes], v_sdx2); + v_store(&n_dx[j+VTraits<v_int16>::vlanes()], v_sdx2); v_store(&n_dy[j], v_sdy1); - v_store(&n_dy[j+v_int16::nlanes], v_sdy2); + v_store(&n_dy[j+VTraits<v_int16>::vlanes()], v_sdy2); } } i_start = i;
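The section ends inside the spatialgradient.cpp change, where the patch splits the 3x3 Sobel helper in two: spatialGradientKernel_vec, built from the named v_sub/v_add intrinsics and guarded by (CV_SIMD || CV_SIMD_SCALABLE), is used on the vector path above, while the original operator-based scalar template is kept for the leftover columns. A self-contained sketch of the vector kernel for v_int16 lanes, mirroring the helper added above (the wrapper name sobel3x3_pair is illustrative; only the universal intrinsics header is assumed):

#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

#if (CV_SIMD || CV_SIMD_SCALABLE)
// vx = (v22 - v00) + (v02 - v20) + 2 * (v12 - v10)
// vy = (v22 - v00) + (v20 - v02) + 2 * (v21 - v01)
static inline void sobel3x3_pair(v_int16& vx, v_int16& vy,
                                 const v_int16& v00, const v_int16& v01, const v_int16& v02,
                                 const v_int16& v10,                     const v_int16& v12,
                                 const v_int16& v20, const v_int16& v21, const v_int16& v22)
{
    v_int16 tmp_add = v_sub(v22, v00),  // term shared by vx and vy
            tmp_sub = v_sub(v02, v20),  // added for vx, subtracted for vy
            tmp_x   = v_sub(v12, v10),  // horizontal difference, counted twice
            tmp_y   = v_sub(v21, v01);  // vertical difference, counted twice
    vx = v_add(v_add(v_add(tmp_add, tmp_sub), tmp_x), tmp_x);
    vy = v_add(v_add(v_sub(tmp_add, tmp_sub), tmp_y), tmp_y);
}
#endif

Each iteration of the rewritten loop processes two v_int16 halves of a v_uint8 load, so the second result is stored at an offset of VTraits<v_int16>::vlanes(), as in the stores just above.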