From 5e9191558d787b33e2f4d43833106b2e00c21cd1 Mon Sep 17 00:00:00 2001
From: HAN Liutong
Date: Fri, 15 Sep 2023 01:37:46 +0800
Subject: [PATCH] Merge pull request #24058 from hanliutong:rewrite-imgporc

Rewrite Universal Intrinsic code by using new API: ImgProc module. #24058

The goal of this series of PRs is to modify the SIMD code blocks guarded by the CV_SIMD macro in the `opencv/modules/imgproc` folder, rewriting them with the new Universal Intrinsic API. For easier review, this PR includes part of the rewritten code; the rest will follow in the next PR (coming soon). I tested this patch on RVV (QEMU) and AVX devices, and `opencv_test_imgproc` passes. The patch is partially auto-generated using the [rewriter](https://github.com/hanliutong/rewriter); see the related PRs https://github.com/opencv/opencv/pull/23885 and https://github.com/opencv/opencv/pull/23980.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
---
 modules/imgproc/src/accum.simd.hpp | 640 ++++++++++++------------
 modules/imgproc/src/blend.cpp | 54 +-
 modules/imgproc/src/canny.cpp | 66 +--
 modules/imgproc/src/color_rgb.simd.hpp | 174 ++++---
 modules/imgproc/src/contours.cpp | 20 +-
 modules/imgproc/src/corner.cpp | 72 +--
 modules/imgproc/src/histogram.cpp | 34 +-
 modules/imgproc/src/pyramids.cpp | 478 +++++++++---------
 modules/imgproc/src/resize.cpp | 413 ++++++++-------
 modules/imgproc/src/smooth.simd.hpp | 335 ++++++-------
 modules/imgproc/src/spatialgradient.cpp | 40 +-
 11 files changed, 1183 insertions(+), 1143 deletions(-)

diff --git a/modules/imgproc/src/accum.simd.hpp b/modules/imgproc/src/accum.simd.hpp index 6b0e6d6fbe73..7fe7aabeaf82 100644 --- a/modules/imgproc/src/accum.simd.hpp +++ b/modules/imgproc/src/accum.simd.hpp @@ -139,7 +139,7 @@ void acc_general_(const T* src, AT* dst, const uchar* mask, int len, int cn, int } #if CV_AVX && !CV_AVX2 _mm256_zeroupper(); -#elif CV_SIMD +#elif (CV_SIMD || CV_SIMD_SCALABLE) vx_cleanup(); #endif } @@ -187,7 +187,7 @@ accSqr_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, int } #if CV_AVX && !CV_AVX2 _mm256_zeroupper(); -#elif CV_SIMD +#elif (CV_SIMD || CV_SIMD_SCALABLE) vx_cleanup(); #endif } @@ -236,7 +236,7 @@ accProd_general_( const T* src1, const T* src2, AT* dst, const uchar* mask, int } #if CV_AVX && !CV_AVX2 _mm256_zeroupper(); -#elif CV_SIMD +#elif (CV_SIMD || CV_SIMD_SCALABLE) vx_cleanup(); #endif } @@ -285,16 +285,16 @@ accW_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, double } #if CV_AVX && !CV_AVX2 _mm256_zeroupper(); -#elif CV_SIMD +#elif (CV_SIMD || CV_SIMD_SCALABLE) vx_cleanup(); #endif } void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint8::nlanes; - const int step = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = 
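// Illustrative sketch only, not part of the applied diff: it shows, on a hypothetical
// accumulate-style helper (acc_f32_sketch is a made-up name), the mechanical mapping
// this patch applies throughout imgproc, assuming OpenCV's universal intrinsics from
// "opencv2/core/hal/intrin.hpp":
//   v_float32::nlanes        ->  VTraits<v_float32>::vlanes()
//   a + b, a & b, a * b      ->  v_add(a, b), v_and(a, b), v_mul(a, b)
//   ~(a == b), a != b        ->  v_not(v_eq(a, b)), v_ne(a, b)
//   #if CV_SIMD              ->  #if (CV_SIMD || CV_SIMD_SCALABLE)
#include "opencv2/core/hal/intrin.hpp"

namespace {
using namespace cv;

void acc_f32_sketch(const float* src, float* dst, int len)
{
    int x = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)                    // was: #if CV_SIMD
    const int step = VTraits<v_float32>::vlanes();   // was: v_float32::nlanes
    for (; x <= len - step; x += step)
    {
        v_float32 v_src = vx_load(src + x);
        v_float32 v_dst = vx_load(dst + x);
        v_store(dst + x, v_add(v_dst, v_src));       // was: v_dst + v_src
    }
    vx_cleanup();
#endif
    for (; x < len; x++)                             // scalar tail, unchanged by the rewrite
        dst[x] += src[x];
}
} // namespace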
VTraits::vlanes(); if (!mask) { @@ -309,10 +309,10 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00)))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01)))); + v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10)))); + v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11)))); } } else @@ -323,9 +323,9 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) for ( ; x <= len - cVectorWidth; x += cVectorWidth) { v_uint8 v_mask = vx_load(mask + x); - v_mask = ~(v_0 == v_mask); + v_mask = v_not(v_eq(v_0, v_mask)); v_uint8 v_src = vx_load(src + x); - v_src = v_src & v_mask; + v_src = v_and(v_src, v_mask); v_uint16 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); @@ -333,10 +333,10 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00)))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01)))); + v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10)))); + v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11)))); } } else if (cn == 3) @@ -344,12 +344,12 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) for ( ; x <= len - cVectorWidth; x += cVectorWidth) { v_uint8 v_mask = vx_load(mask + x); - v_mask = ~(v_0 == v_mask); + v_mask = v_not(v_eq(v_0, v_mask)); v_uint8 v_src0, v_src1, v_src2; v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2); - v_src0 = v_src0 & v_mask; - v_src1 = v_src1 & v_mask; - v_src2 = v_src2 & v_mask; + v_src0 = v_and(v_src0, v_mask); + v_src1 = v_and(v_src1, v_mask); + v_src2 = v_and(v_src2, v_mask); v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); @@ -373,18 +373,18 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) v_load_deinterleave(dst + ((x + step * 2) * cn), v_dst010, v_dst110, v_dst210); v_load_deinterleave(dst + ((x + step * 3) * cn), v_dst011, v_dst111, v_dst211); - v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000)); - v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100)); - v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200)); - v_dst001 += 
v_cvt_f32(v_reinterpret_as_s32(v_src001)); - v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101)); - v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201)); - v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010)); - v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110)); - v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210)); - v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011)); - v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111)); - v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211)); + v_dst000 = v_add(v_dst000, v_cvt_f32(v_reinterpret_as_s32(v_src000))); + v_dst100 = v_add(v_dst100, v_cvt_f32(v_reinterpret_as_s32(v_src100))); + v_dst200 = v_add(v_dst200, v_cvt_f32(v_reinterpret_as_s32(v_src200))); + v_dst001 = v_add(v_dst001, v_cvt_f32(v_reinterpret_as_s32(v_src001))); + v_dst101 = v_add(v_dst101, v_cvt_f32(v_reinterpret_as_s32(v_src101))); + v_dst201 = v_add(v_dst201, v_cvt_f32(v_reinterpret_as_s32(v_src201))); + v_dst010 = v_add(v_dst010, v_cvt_f32(v_reinterpret_as_s32(v_src010))); + v_dst110 = v_add(v_dst110, v_cvt_f32(v_reinterpret_as_s32(v_src110))); + v_dst210 = v_add(v_dst210, v_cvt_f32(v_reinterpret_as_s32(v_src210))); + v_dst011 = v_add(v_dst011, v_cvt_f32(v_reinterpret_as_s32(v_src011))); + v_dst111 = v_add(v_dst111, v_cvt_f32(v_reinterpret_as_s32(v_src111))); + v_dst211 = v_add(v_dst211, v_cvt_f32(v_reinterpret_as_s32(v_src211))); v_store_interleave(dst + (x * cn), v_dst000, v_dst100, v_dst200); v_store_interleave(dst + ((x + step) * cn), v_dst001, v_dst101, v_dst201); @@ -400,9 +400,9 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -413,8 +413,8 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn v_uint32 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0))); - v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1))); + v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src0)))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src1)))); } } else @@ -425,14 +425,14 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn for ( ; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_src = vx_load(src + x); - v_src = v_src & v_mask; + v_src = v_and(v_src, v_mask); v_uint32 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0))); - v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1))); + v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src0)))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src1)))); } } else if (cn == 3) @@ -441,12 +441,12 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn for ( ; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_src0, 
v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_src0 = v_src0 & v_mask; - v_src1 = v_src1 & v_mask; - v_src2 = v_src2 & v_mask; + v_src0 = v_and(v_src0, v_mask); + v_src1 = v_and(v_src1, v_mask); + v_src2 = v_and(v_src2, v_mask); v_uint32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); @@ -456,12 +456,12 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_dst00 += v_cvt_f32(v_reinterpret_as_s32(v_src00)); - v_dst01 += v_cvt_f32(v_reinterpret_as_s32(v_src01)); - v_dst10 += v_cvt_f32(v_reinterpret_as_s32(v_src10)); - v_dst11 += v_cvt_f32(v_reinterpret_as_s32(v_src11)); - v_dst20 += v_cvt_f32(v_reinterpret_as_s32(v_src20)); - v_dst21 += v_cvt_f32(v_reinterpret_as_s32(v_src21)); + v_dst00 = v_add(v_dst00, v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_dst01 = v_add(v_dst01, v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_dst10 = v_add(v_dst10, v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_dst11 = v_add(v_dst11, v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_dst20 = v_add(v_dst20, v_cvt_f32(v_reinterpret_as_s32(v_src20))); + v_dst21 = v_add(v_dst21, v_cvt_f32(v_reinterpret_as_s32(v_src21))); v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); @@ -551,9 +551,9 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_uint8::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -586,14 +586,14 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn v_float64 v_dst6 = vx_load(dst + x + step * 6); v_float64 v_dst7 = vx_load(dst + x + step * 7); - v_dst0 = v_dst0 + v_src0; - v_dst1 = v_dst1 + v_src1; - v_dst2 = v_dst2 + v_src2; - v_dst3 = v_dst3 + v_src3; - v_dst4 = v_dst4 + v_src4; - v_dst5 = v_dst5 + v_src5; - v_dst6 = v_dst6 + v_src6; - v_dst7 = v_dst7 + v_src7; + v_dst0 = v_add(v_dst0, v_src0); + v_dst1 = v_add(v_dst1, v_src1); + v_dst2 = v_add(v_dst2, v_src2); + v_dst3 = v_add(v_dst3, v_src3); + v_dst4 = v_add(v_dst4, v_src4); + v_dst5 = v_add(v_dst5, v_src5); + v_dst6 = v_add(v_dst6, v_src6); + v_dst7 = v_add(v_dst7, v_src7); v_store(dst + x, v_dst0); v_store(dst + x + step, v_dst1); @@ -613,9 +613,9 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn for ( ; x <= len - cVectorWidth; x += cVectorWidth) { v_uint8 v_mask = vx_load(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint8 v_src = vx_load(src + x); - v_src = v_src & v_mask; + v_src = v_and(v_src, v_mask); v_uint16 v_int0, v_int1; v_expand(v_src, v_int0, v_int1); @@ -641,14 +641,14 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn v_float64 v_dst6 = vx_load(dst + x + step * 6); v_float64 v_dst7 = vx_load(dst + x + step * 7); - v_dst0 = v_dst0 + v_src0; - v_dst1 = v_dst1 + v_src1; - v_dst2 = v_dst2 + v_src2; - v_dst3 = v_dst3 + v_src3; - v_dst4 = v_dst4 + v_src4; - v_dst5 = v_dst5 + v_src5; - v_dst6 = v_dst6 + v_src6; - v_dst7 = v_dst7 + v_src7; + v_dst0 = 
v_add(v_dst0, v_src0); + v_dst1 = v_add(v_dst1, v_src1); + v_dst2 = v_add(v_dst2, v_src2); + v_dst3 = v_add(v_dst3, v_src3); + v_dst4 = v_add(v_dst4, v_src4); + v_dst5 = v_add(v_dst5, v_src5); + v_dst6 = v_add(v_dst6, v_src6); + v_dst7 = v_add(v_dst7, v_src7); v_store(dst + x, v_dst0); v_store(dst + x + step, v_dst1); @@ -665,12 +665,12 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn for ( ; x <= len - cVectorWidth; x += cVectorWidth) { v_uint8 v_mask = vx_load(mask + x); - v_mask = ~(v_0 == v_mask); + v_mask = v_not(v_eq(v_0, v_mask)); v_uint8 v_src0, v_src1, v_src2; v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2); - v_src0 = v_src0 & v_mask; - v_src1 = v_src1 & v_mask; - v_src2 = v_src2 & v_mask; + v_src0 = v_and(v_src0, v_mask); + v_src1 = v_and(v_src1, v_mask); + v_src2 = v_and(v_src2, v_mask); v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); @@ -726,14 +726,14 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn v_load_deinterleave(dst + ((x + step * 6) * cn), v_dst0110, v_dst1110, v_dst2110); v_load_deinterleave(dst + ((x + step * 7) * cn), v_dst0111, v_dst1111, v_dst2111); - v_store_interleave(dst + (x * cn), v_dst0000 + v_src0000, v_dst1000 + v_src1000, v_dst2000 + v_src2000); - v_store_interleave(dst + ((x + step) * cn), v_dst0001 + v_src0001, v_dst1001 + v_src1001, v_dst2001 + v_src2001); - v_store_interleave(dst + ((x + step * 2) * cn), v_dst0010 + v_src0010, v_dst1010 + v_src1010, v_dst2010 + v_src2010); - v_store_interleave(dst + ((x + step * 3) * cn), v_dst0011 + v_src0011, v_dst1011 + v_src1011, v_dst2011 + v_src2011); - v_store_interleave(dst + ((x + step * 4) * cn), v_dst0100 + v_src0100, v_dst1100 + v_src1100, v_dst2100 + v_src2100); - v_store_interleave(dst + ((x + step * 5) * cn), v_dst0101 + v_src0101, v_dst1101 + v_src1101, v_dst2101 + v_src2101); - v_store_interleave(dst + ((x + step * 6) * cn), v_dst0110 + v_src0110, v_dst1110 + v_src1110, v_dst2110 + v_src2110); - v_store_interleave(dst + ((x + step * 7) * cn), v_dst0111 + v_src0111, v_dst1111 + v_src1111, v_dst2111 + v_src2111); + v_store_interleave(dst + (x * cn), v_add(v_dst0000, v_src0000), v_add(v_dst1000, v_src1000), v_add(v_dst2000, v_src2000)); + v_store_interleave(dst + ((x + step) * cn), v_add(v_dst0001, v_src0001), v_add(v_dst1001, v_src1001), v_add(v_dst2001, v_src2001)); + v_store_interleave(dst + ((x + step * 2) * cn), v_add(v_dst0010, v_src0010), v_add(v_dst1010, v_src1010), v_add(v_dst2010, v_src2010)); + v_store_interleave(dst + ((x + step * 3) * cn), v_add(v_dst0011, v_src0011), v_add(v_dst1011, v_src1011), v_add(v_dst2011, v_src2011)); + v_store_interleave(dst + ((x + step * 4) * cn), v_add(v_dst0100, v_src0100), v_add(v_dst1100, v_src1100), v_add(v_dst2100, v_src2100)); + v_store_interleave(dst + ((x + step * 5) * cn), v_add(v_dst0101, v_src0101), v_add(v_dst1101, v_src1101), v_add(v_dst2101, v_src2101)); + v_store_interleave(dst + ((x + step * 6) * cn), v_add(v_dst0110, v_src0110), v_add(v_dst1110, v_src1110), v_add(v_dst2110, v_src2110)); + v_store_interleave(dst + ((x + step * 7) * cn), v_add(v_dst0111, v_src0111), v_add(v_dst1111, v_src1111), v_add(v_dst2111, v_src2111)); } } } @@ -744,9 +744,9 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = 
v_uint16::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -767,10 +767,10 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c v_float64 v_dst2 = vx_load(dst + x + step * 2); v_float64 v_dst3 = vx_load(dst + x + step * 3); - v_dst0 = v_dst0 + v_src0; - v_dst1 = v_dst1 + v_src1; - v_dst2 = v_dst2 + v_src2; - v_dst3 = v_dst3 + v_src3; + v_dst0 = v_add(v_dst0, v_src0); + v_dst1 = v_add(v_dst1, v_src1); + v_dst2 = v_add(v_dst2, v_src2); + v_dst3 = v_add(v_dst3, v_src3); v_store(dst + x, v_dst0); v_store(dst + x + step, v_dst1); @@ -786,9 +786,9 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c for ( ; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_src = vx_load(src + x); - v_src = v_src & v_mask; + v_src = v_and(v_src, v_mask); v_uint32 v_int0, v_int1; v_expand(v_src, v_int0, v_int1); @@ -802,10 +802,10 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c v_float64 v_dst2 = vx_load(dst + x + step * 2); v_float64 v_dst3 = vx_load(dst + x + step * 3); - v_dst0 = v_dst0 + v_src0; - v_dst1 = v_dst1 + v_src1; - v_dst2 = v_dst2 + v_src2; - v_dst3 = v_dst3 + v_src3; + v_dst0 = v_add(v_dst0, v_src0); + v_dst1 = v_add(v_dst1, v_src1); + v_dst2 = v_add(v_dst2, v_src2); + v_dst3 = v_add(v_dst3, v_src3); v_store(dst + x, v_dst0); v_store(dst + x + step, v_dst1); @@ -818,12 +818,12 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c for ( ; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_src0 = v_src0 & v_mask; - v_src1 = v_src1 & v_mask; - v_src2 = v_src2 & v_mask; + v_src0 = v_and(v_src0, v_mask); + v_src1 = v_and(v_src1, v_mask); + v_src2 = v_and(v_src2, v_mask); v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; v_expand(v_src0, v_int00, v_int01); v_expand(v_src1, v_int10, v_int11); @@ -848,10 +848,10 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); - v_store_interleave(dst + (x + step * 2) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); - v_store_interleave(dst + (x + step * 3) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); + v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20)); + v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21)); + v_store_interleave(dst + (x + step * 2) * cn, v_add(v_dst02, v_src02), v_add(v_dst12, v_src12), v_add(v_dst22, v_src22)); + v_store_interleave(dst + (x + step * 3) * cn, v_add(v_dst03, v_src03), v_add(v_dst13, v_src13), v_add(v_dst23, v_src23)); } } } @@ -1033,9 +1033,9 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c 
void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint8::nlanes; - const int step = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -1052,10 +1052,10 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00)))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01)))); + v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10)))); + v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11)))); } } else @@ -1066,9 +1066,9 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { v_uint8 v_mask = vx_load(mask + x); - v_mask = ~(v_0 == v_mask); + v_mask = v_not(v_eq(v_0, v_mask)); v_uint8 v_src = vx_load(src + x); - v_src = v_src & v_mask; + v_src = v_and(v_src, v_mask); v_uint16 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); v_src0 = v_mul_wrap(v_src0, v_src0); @@ -1078,10 +1078,10 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00)))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01)))); + v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10)))); + v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11)))); } } else if (cn == 3) @@ -1089,13 +1089,13 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { v_uint8 v_mask = vx_load(mask + x); - v_mask = ~(v_0 == v_mask); + v_mask = v_not(v_eq(v_0, v_mask)); v_uint8 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_src0 = v_src0 & v_mask; - v_src1 = v_src1 & v_mask; - v_src2 = v_src2 & v_mask; + v_src0 = v_and(v_src0, v_mask); + v_src1 = v_and(v_src1, v_mask); + v_src2 = v_and(v_src2, v_mask); v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_src0, v_src00, v_src01); @@ -1126,20 +1126,20 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int v_load_deinterleave(dst + (x + step * 2) * cn, v_dst010, v_dst110, v_dst210); 
v_load_deinterleave(dst + (x + step * 3) * cn, v_dst011, v_dst111, v_dst211); - v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000)); - v_dst001 += v_cvt_f32(v_reinterpret_as_s32(v_src001)); - v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010)); - v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011)); + v_dst000 = v_add(v_dst000, v_cvt_f32(v_reinterpret_as_s32(v_src000))); + v_dst001 = v_add(v_dst001, v_cvt_f32(v_reinterpret_as_s32(v_src001))); + v_dst010 = v_add(v_dst010, v_cvt_f32(v_reinterpret_as_s32(v_src010))); + v_dst011 = v_add(v_dst011, v_cvt_f32(v_reinterpret_as_s32(v_src011))); - v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100)); - v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101)); - v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110)); - v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111)); + v_dst100 = v_add(v_dst100, v_cvt_f32(v_reinterpret_as_s32(v_src100))); + v_dst101 = v_add(v_dst101, v_cvt_f32(v_reinterpret_as_s32(v_src101))); + v_dst110 = v_add(v_dst110, v_cvt_f32(v_reinterpret_as_s32(v_src110))); + v_dst111 = v_add(v_dst111, v_cvt_f32(v_reinterpret_as_s32(v_src111))); - v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200)); - v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201)); - v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210)); - v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211)); + v_dst200 = v_add(v_dst200, v_cvt_f32(v_reinterpret_as_s32(v_src200))); + v_dst201 = v_add(v_dst201, v_cvt_f32(v_reinterpret_as_s32(v_src201))); + v_dst210 = v_add(v_dst210, v_cvt_f32(v_reinterpret_as_s32(v_src210))); + v_dst211 = v_add(v_dst211, v_cvt_f32(v_reinterpret_as_s32(v_src211))); v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200); v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201); @@ -1155,9 +1155,9 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -1186,13 +1186,13 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int v_uint16 v_mask16 = vx_load_expand(mask + x); v_uint32 v_mask0, v_mask1; v_expand(v_mask16, v_mask0, v_mask1); - v_mask0 = ~(v_mask0 == v_0); - v_mask1 = ~(v_mask1 == v_0); + v_mask0 = v_not(v_eq(v_mask0, v_0)); + v_mask1 = v_not(v_eq(v_mask1, v_0)); v_uint16 v_src = vx_load(src + x); v_uint32 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_src0 = v_src0 & v_mask0; - v_src1 = v_src1 & v_mask1; + v_src0 = v_and(v_src0, v_mask0); + v_src1 = v_and(v_src1, v_mask1); v_float32 v_float0, v_float1; v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0)); @@ -1209,8 +1209,8 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int v_uint16 v_mask16 = vx_load_expand(mask + x); v_uint32 v_mask0, v_mask1; v_expand(v_mask16, v_mask0, v_mask1); - v_mask0 = ~(v_mask0 == v_0); - v_mask1 = ~(v_mask1 == v_0); + v_mask0 = v_not(v_eq(v_mask0, v_0)); + v_mask1 = v_not(v_eq(v_mask1, v_0)); v_uint16 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); @@ -1218,12 +1218,12 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int v_expand(v_src0, v_int00, v_int01); v_expand(v_src1, v_int10, v_int11); v_expand(v_src2, v_int20, v_int21); - v_int00 = v_int00 & v_mask0; - 
v_int01 = v_int01 & v_mask1; - v_int10 = v_int10 & v_mask0; - v_int11 = v_int11 & v_mask1; - v_int20 = v_int20 & v_mask0; - v_int21 = v_int21 & v_mask1; + v_int00 = v_and(v_int00, v_mask0); + v_int01 = v_and(v_int01, v_mask1); + v_int10 = v_and(v_int10, v_mask0); + v_int11 = v_and(v_int11, v_mask1); + v_int20 = v_and(v_int20, v_mask0); + v_int21 = v_and(v_int21, v_mask1); v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_src00 = v_cvt_f32(v_reinterpret_as_s32(v_int00)); @@ -1347,9 +1347,9 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -1390,9 +1390,9 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_src = vx_load_expand(src + x); - v_uint16 v_int = v_src & v_mask; + v_uint16 v_int = v_and(v_src, v_mask); v_uint32 v_int0, v_int1; v_expand(v_int, v_int0, v_int1); @@ -1430,10 +1430,10 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int v_uint16 v_int2 = v_expand_low(v_src2); v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); - v_int0 = v_int0 & v_mask; - v_int1 = v_int1 & v_mask; - v_int2 = v_int2 & v_mask; + v_mask = v_not(v_eq(v_mask, v_0)); + v_int0 = v_and(v_int0, v_mask); + v_int1 = v_and(v_int1, v_mask); + v_int2 = v_and(v_int2, v_mask); v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; v_expand(v_int0, v_int00, v_int01); @@ -1486,9 +1486,9 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -1531,9 +1531,9 @@ void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, in for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_src = vx_load(src + x); - v_src = v_src & v_mask; + v_src = v_and(v_src, v_mask); v_uint32 v_int_0, v_int_1; v_expand(v_src, v_int_0, v_int_1); @@ -1566,12 +1566,12 @@ void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, in for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_src0 = v_src0 & v_mask; - v_src1 = v_src1 & v_mask; - v_src2 = v_src2 & v_mask; + v_src0 = v_and(v_src0, v_mask); + v_src1 = v_and(v_src1, v_mask); + v_src2 = v_and(v_src2, v_mask); v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; v_expand(v_src0, v_int00, v_int01); v_expand(v_src1, v_int10, v_int11); @@ -1810,9 +1810,9 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int 
len, in void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint8::nlanes; - const int step = v_uint32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -1829,10 +1829,10 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00)))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01)))); + v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10)))); + v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11)))); } } else @@ -1843,11 +1843,11 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint8 v_mask = vx_load(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint8 v_1src = vx_load(src1 + x); v_uint8 v_2src = vx_load(src2 + x); - v_1src = v_1src & v_mask; - v_2src = v_2src & v_mask; + v_1src = v_and(v_1src, v_mask); + v_2src = v_and(v_2src, v_mask); v_uint16 v_src0, v_src1; v_mul_expand(v_1src, v_2src, v_src0, v_src1); @@ -1856,10 +1856,10 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00)))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01)))); + v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10)))); + v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11)))); } } else if (cn == 3) @@ -1867,16 +1867,16 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint8 v_mask = vx_load(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); - v_1src0 = v_1src0 & v_mask; - v_1src1 = v_1src1 & v_mask; - v_1src2 = v_1src2 & v_mask; - v_2src0 = v_2src0 & v_mask; - v_2src1 = v_2src1 & v_mask; - v_2src2 = v_2src2 & v_mask; + v_1src0 = v_and(v_1src0, v_mask); + v_1src1 = v_and(v_1src1, v_mask); + v_1src2 = 
v_and(v_1src2, v_mask); + v_2src0 = v_and(v_2src0, v_mask); + v_2src1 = v_and(v_2src1, v_mask); + v_2src2 = v_and(v_2src2, v_mask); v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_mul_expand(v_1src0, v_2src0, v_src00, v_src01); @@ -1896,18 +1896,18 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar v_load_deinterleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201); v_load_deinterleave(dst + (x + step * 2) * cn, v_dst002, v_dst102, v_dst202); v_load_deinterleave(dst + (x + step * 3) * cn, v_dst003, v_dst103, v_dst203); - v_dst000 = v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000)); - v_dst001 = v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001)); - v_dst002 = v_dst002 + v_cvt_f32(v_reinterpret_as_s32(v_src002)); - v_dst003 = v_dst003 + v_cvt_f32(v_reinterpret_as_s32(v_src003)); - v_dst100 = v_dst100 + v_cvt_f32(v_reinterpret_as_s32(v_src100)); - v_dst101 = v_dst101 + v_cvt_f32(v_reinterpret_as_s32(v_src101)); - v_dst102 = v_dst102 + v_cvt_f32(v_reinterpret_as_s32(v_src102)); - v_dst103 = v_dst103 + v_cvt_f32(v_reinterpret_as_s32(v_src103)); - v_dst200 = v_dst200 + v_cvt_f32(v_reinterpret_as_s32(v_src200)); - v_dst201 = v_dst201 + v_cvt_f32(v_reinterpret_as_s32(v_src201)); - v_dst202 = v_dst202 + v_cvt_f32(v_reinterpret_as_s32(v_src202)); - v_dst203 = v_dst203 + v_cvt_f32(v_reinterpret_as_s32(v_src203)); + v_dst000 = v_add(v_dst000, v_cvt_f32(v_reinterpret_as_s32(v_src000))); + v_dst001 = v_add(v_dst001, v_cvt_f32(v_reinterpret_as_s32(v_src001))); + v_dst002 = v_add(v_dst002, v_cvt_f32(v_reinterpret_as_s32(v_src002))); + v_dst003 = v_add(v_dst003, v_cvt_f32(v_reinterpret_as_s32(v_src003))); + v_dst100 = v_add(v_dst100, v_cvt_f32(v_reinterpret_as_s32(v_src100))); + v_dst101 = v_add(v_dst101, v_cvt_f32(v_reinterpret_as_s32(v_src101))); + v_dst102 = v_add(v_dst102, v_cvt_f32(v_reinterpret_as_s32(v_src102))); + v_dst103 = v_add(v_dst103, v_cvt_f32(v_reinterpret_as_s32(v_src103))); + v_dst200 = v_add(v_dst200, v_cvt_f32(v_reinterpret_as_s32(v_src200))); + v_dst201 = v_add(v_dst201, v_cvt_f32(v_reinterpret_as_s32(v_src201))); + v_dst202 = v_add(v_dst202, v_cvt_f32(v_reinterpret_as_s32(v_src202))); + v_dst203 = v_add(v_dst203, v_cvt_f32(v_reinterpret_as_s32(v_src203))); v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200); v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201); @@ -1923,9 +1923,9 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -1956,10 +1956,10 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_0 == v_mask); + v_mask = v_not(v_eq(v_0, v_mask)); - v_uint16 v_1src = vx_load(src1 + x) & v_mask; - v_uint16 v_2src = vx_load(src2 + x) & v_mask; + v_uint16 v_1src = v_and(vx_load(src1 + x), v_mask); + v_uint16 v_2src = v_and(vx_load(src2 + x), v_mask); v_uint32 v_1src0, v_1src1, v_2src0, v_2src1; v_expand(v_1src, v_1src0, v_1src1); @@ -1979,17 +1979,17 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch for (; x <= len - cVectorWidth; x += 
cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_0 == v_mask); + v_mask = v_not(v_eq(v_0, v_mask)); v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); - v_1src0 = v_1src0 & v_mask; - v_1src1 = v_1src1 & v_mask; - v_1src2 = v_1src2 & v_mask; - v_2src0 = v_2src0 & v_mask; - v_2src1 = v_2src1 & v_mask; - v_2src2 = v_2src2 & v_mask; + v_1src0 = v_and(v_1src0, v_mask); + v_1src1 = v_and(v_1src1, v_mask); + v_1src2 = v_and(v_1src2, v_mask); + v_2src0 = v_and(v_2src0, v_mask); + v_2src1 = v_and(v_2src1, v_mask); + v_2src2 = v_and(v_2src2, v_mask); v_uint32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; v_expand(v_1src0, v_1src00, v_1src01); @@ -2108,9 +2108,9 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2153,9 +2153,9 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); - v_uint16 v_1int = vx_load_expand(src1 + x) & v_mask; - v_uint16 v_2int = vx_load_expand(src2 + x) & v_mask; + v_mask = v_not(v_eq(v_mask, v_0)); + v_uint16 v_1int = v_and(vx_load_expand(src1 + x), v_mask); + v_uint16 v_2int = v_and(vx_load_expand(src2 + x), v_mask); v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1; v_expand(v_1int, v_1int_0, v_1int_1); @@ -2198,13 +2198,13 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha v_uint16 v_2int2 = v_expand_low(v_2src2); v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); - v_1int0 = v_1int0 & v_mask; - v_1int1 = v_1int1 & v_mask; - v_1int2 = v_1int2 & v_mask; - v_2int0 = v_2int0 & v_mask; - v_2int1 = v_2int1 & v_mask; - v_2int2 = v_2int2 & v_mask; + v_mask = v_not(v_eq(v_mask, v_0)); + v_1int0 = v_and(v_1int0, v_mask); + v_1int1 = v_and(v_1int1, v_mask); + v_1int2 = v_and(v_1int2, v_mask); + v_2int0 = v_and(v_2int0, v_mask); + v_2int1 = v_and(v_2int1, v_mask); + v_2int2 = v_and(v_2int2, v_mask); v_uint32 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21; v_uint32 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21; @@ -2248,9 +2248,9 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2293,11 +2293,11 @@ void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uc for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_1src = vx_load(src1 + x); v_uint16 v_2src = vx_load(src2 + x); - v_1src = v_1src & v_mask; - 
v_2src = v_2src & v_mask; + v_1src = v_and(v_1src, v_mask); + v_2src = v_and(v_2src, v_mask); v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1; v_expand(v_1src, v_1int_0, v_1int_1); @@ -2329,16 +2329,16 @@ void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uc for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint16 v_mask = vx_load_expand(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); - v_1src0 = v_1src0 & v_mask; - v_1src1 = v_1src1 & v_mask; - v_1src2 = v_1src2 & v_mask; - v_2src0 = v_2src0 & v_mask; - v_2src1 = v_2src1 & v_mask; - v_2src2 = v_2src2 & v_mask; + v_1src0 = v_and(v_1src0, v_mask); + v_1src1 = v_and(v_1src1, v_mask); + v_1src2 = v_and(v_1src2, v_mask); + v_2src0 = v_and(v_2src0, v_mask); + v_2src1 = v_and(v_2src1, v_mask); + v_2src2 = v_and(v_2src2, v_mask); v_uint32 v_1int_00, v_1int_01, v_2int_00, v_2int_01; v_uint32 v_1int_10, v_1int_11, v_2int_10, v_2int_11; @@ -2594,11 +2594,11 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn, double alpha) { int x = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) const v_float32 v_alpha = vx_setall_f32((float)alpha); const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha)); - const int cVectorWidth = v_uint8::nlanes; - const int step = v_float32::nlanes; + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2619,10 +2619,10 @@ void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn v_float32 v_dst10 = vx_load(dst + x + step * 2); v_float32 v_dst11 = vx_load(dst + x + step * 3); - v_dst00 = v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha); - v_dst01 = v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha); - v_dst10 = v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha); - v_dst11 = v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha); + v_dst00 = v_fma(v_dst00, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src00)), v_alpha)); + v_dst01 = v_fma(v_dst01, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src01)), v_alpha)); + v_dst10 = v_fma(v_dst10, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src10)), v_alpha)); + v_dst11 = v_fma(v_dst11, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src11)), v_alpha)); v_store(dst + x , v_dst00); v_store(dst + x + step , v_dst01); @@ -2663,15 +2663,15 @@ void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn v_float32 v_dst10 = vx_load(dst + x + step * 2); v_float32 v_dst11 = vx_load(dst + x + step * 3); - v_mf00 = v_mf00 != zero; - v_mf01 = v_mf01 != zero; - v_mf10 = v_mf10 != zero; - v_mf11 = v_mf11 != zero; + v_mf00 = v_ne(v_mf00, zero); + v_mf01 = v_ne(v_mf01, zero); + v_mf10 = v_ne(v_mf10, zero); + v_mf11 = v_ne(v_mf11, zero); - v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha), v_dst00); - v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha), v_dst01); - v_dst10 = v_select(v_mf10, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha), v_dst10); - v_dst11 = v_select(v_mf11, v_fma(v_dst11, v_beta, 
v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha), v_dst11); + v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src00)), v_alpha)), v_dst00); + v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src01)), v_alpha)), v_dst01); + v_dst10 = v_select(v_mf10, v_fma(v_dst10, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src10)), v_alpha)), v_dst10); + v_dst11 = v_select(v_mf11, v_fma(v_dst11, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src11)), v_alpha)), v_dst11); v_store(dst + x , v_dst00); v_store(dst + x + step , v_dst01); @@ -2719,25 +2719,25 @@ void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn v_mf10 = v_cvt_f32(v_reinterpret_as_s32(v_m10)); v_mf11 = v_cvt_f32(v_reinterpret_as_s32(v_m11)); - v_mf00 = v_mf00 != zero; - v_mf01 = v_mf01 != zero; - v_mf10 = v_mf10 != zero; - v_mf11 = v_mf11 != zero; + v_mf00 = v_ne(v_mf00, zero); + v_mf01 = v_ne(v_mf01, zero); + v_mf10 = v_ne(v_mf10, zero); + v_mf11 = v_ne(v_mf11, zero); - v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src000)) * v_alpha), v_dst00); - v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src001)) * v_alpha), v_dst01); - v_dst02 = v_select(v_mf10, v_fma(v_dst02, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src010)) * v_alpha), v_dst02); - v_dst03 = v_select(v_mf11, v_fma(v_dst03, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src011)) * v_alpha), v_dst03); + v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src000)), v_alpha)), v_dst00); + v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src001)), v_alpha)), v_dst01); + v_dst02 = v_select(v_mf10, v_fma(v_dst02, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src010)), v_alpha)), v_dst02); + v_dst03 = v_select(v_mf11, v_fma(v_dst03, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src011)), v_alpha)), v_dst03); - v_dst10 = v_select(v_mf00, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src100)) * v_alpha), v_dst10); - v_dst11 = v_select(v_mf01, v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src101)) * v_alpha), v_dst11); - v_dst12 = v_select(v_mf10, v_fma(v_dst12, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src110)) * v_alpha), v_dst12); - v_dst13 = v_select(v_mf11, v_fma(v_dst13, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src111)) * v_alpha), v_dst13); + v_dst10 = v_select(v_mf00, v_fma(v_dst10, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src100)), v_alpha)), v_dst10); + v_dst11 = v_select(v_mf01, v_fma(v_dst11, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src101)), v_alpha)), v_dst11); + v_dst12 = v_select(v_mf10, v_fma(v_dst12, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src110)), v_alpha)), v_dst12); + v_dst13 = v_select(v_mf11, v_fma(v_dst13, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src111)), v_alpha)), v_dst13); - v_dst20 = v_select(v_mf00, v_fma(v_dst20, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src200)) * v_alpha), v_dst20); - v_dst21 = v_select(v_mf01, v_fma(v_dst21, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src201)) * v_alpha), v_dst21); - v_dst22 = v_select(v_mf10, v_fma(v_dst22, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src210)) * v_alpha), v_dst22); - v_dst23 = v_select(v_mf11, v_fma(v_dst23, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src211)) * v_alpha), v_dst23); + v_dst20 = v_select(v_mf00, v_fma(v_dst20, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src200)), v_alpha)), v_dst20); + v_dst21 = 
v_select(v_mf01, v_fma(v_dst21, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src201)), v_alpha)), v_dst21); + v_dst22 = v_select(v_mf10, v_fma(v_dst22, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src210)), v_alpha)), v_dst22); + v_dst23 = v_select(v_mf11, v_fma(v_dst23, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src211)), v_alpha)), v_dst23); v_store_interleave(dst + x * cn , v_dst00, v_dst10, v_dst20); v_store_interleave(dst + ( x + step ) * cn, v_dst01, v_dst11, v_dst21); @@ -2753,11 +2753,11 @@ void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn, double alpha) { int x = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) const v_float32 v_alpha = vx_setall_f32((float)alpha); const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha)); - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float32::nlanes; + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2770,8 +2770,8 @@ void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int c v_float32 v_dst0 = vx_load(dst + x); v_float32 v_dst1 = vx_load(dst + x + step); - v_dst0 = v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int0)) * v_alpha); - v_dst1 = v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int1)) * v_alpha); + v_dst0 = v_fma(v_dst0, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_int0)), v_alpha)); + v_dst1 = v_fma(v_dst1, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_int1)), v_alpha)); v_store(dst + x , v_dst0); v_store(dst + x + step, v_dst1); @@ -2799,11 +2799,11 @@ void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int c v_float32 v_dst0 = vx_load(dst + x); v_float32 v_dst1 = vx_load(dst + x + step); - v_mf0 = v_mf0 != zero; - v_mf1 = v_mf1 != zero; + v_mf0 = v_ne(v_mf0, zero); + v_mf1 = v_ne(v_mf1, zero); - v_dst0 = v_select(v_mf0, v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src0)) * v_alpha), v_dst0); - v_dst1 = v_select(v_mf1, v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src1)) * v_alpha), v_dst1); + v_dst0 = v_select(v_mf0, v_fma(v_dst0, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src0)), v_alpha)), v_dst0); + v_dst1 = v_select(v_mf1, v_fma(v_dst1, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src1)), v_alpha)), v_dst1); v_store(dst + x , v_dst0); v_store(dst + x + step, v_dst1); @@ -2833,16 +2833,16 @@ void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int c v_mf0 = v_cvt_f32(v_reinterpret_as_s32(v_m0)); v_mf1 = v_cvt_f32(v_reinterpret_as_s32(v_m1)); - v_mf0 = v_mf0 != zero; - v_mf1 = v_mf1 != zero; + v_mf0 = v_ne(v_mf0, zero); + v_mf1 = v_ne(v_mf1, zero); - v_dst00 = v_select(v_mf0, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha), v_dst00); - v_dst10 = v_select(v_mf0, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha), v_dst10); - v_dst20 = v_select(v_mf0, v_fma(v_dst20, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src20)) * v_alpha), v_dst20); + v_dst00 = v_select(v_mf0, v_fma(v_dst00, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src00)), v_alpha)), v_dst00); + v_dst10 = v_select(v_mf0, v_fma(v_dst10, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src10)), v_alpha)), v_dst10); + v_dst20 = v_select(v_mf0, v_fma(v_dst20, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src20)), v_alpha)), v_dst20); - v_dst01 = v_select(v_mf1, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * 
v_alpha), v_dst01); - v_dst11 = v_select(v_mf1, v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha), v_dst11); - v_dst21 = v_select(v_mf1, v_fma(v_dst21, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src21)) * v_alpha), v_dst21); + v_dst01 = v_select(v_mf1, v_fma(v_dst01, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src01)), v_alpha)), v_dst01); + v_dst11 = v_select(v_mf1, v_fma(v_dst11, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src11)), v_alpha)), v_dst11); + v_dst21 = v_select(v_mf1, v_fma(v_dst21, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src21)), v_alpha)), v_dst21); v_store_interleave(dst + x * cn , v_dst00, v_dst10, v_dst20); v_store_interleave(dst + ( x + step ) * cn, v_dst01, v_dst11, v_dst21); @@ -2870,11 +2870,11 @@ void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn _mm256_storeu_ps(dst + x + 8, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha))); } } -#elif CV_SIMD +#elif (CV_SIMD || CV_SIMD_SCALABLE) const v_float32 v_alpha = vx_setall_f32((float)alpha); const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha)); - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float32::nlanes; + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2884,8 +2884,8 @@ void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn v_float32 v_dst0 = vx_load(dst + x); v_float32 v_dst1 = vx_load(dst + x + step); - v_dst0 = v_fma(v_dst0, v_beta, vx_load(src + x) * v_alpha); - v_dst1 = v_fma(v_dst1, v_beta, vx_load(src + x + step) * v_alpha); + v_dst0 = v_fma(v_dst0, v_beta, v_mul(vx_load(src + x), v_alpha)); + v_dst1 = v_fma(v_dst1, v_beta, v_mul(vx_load(src + x + step), v_alpha)); v_store(dst + x, v_dst0); v_store(dst + x + step, v_dst1); @@ -2898,11 +2898,11 @@ void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha) { int x = 0; -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) const v_float64 v_alpha = vx_setall_f64(alpha); const v_float64 v_beta = vx_setall_f64(1.0f - alpha); - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float64::nlanes; + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2927,10 +2927,10 @@ void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int c v_float64 v_dst2 = vx_load(dst + x + step * 2); v_float64 v_dst3 = vx_load(dst + x + step * 3); - v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha); - v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha); - v_dst2 = v_fma(v_dst2, v_beta, v_src2 * v_alpha); - v_dst3 = v_fma(v_dst3, v_beta, v_src3 * v_alpha); + v_dst0 = v_fma(v_dst0, v_beta, v_mul(v_src0, v_alpha)); + v_dst1 = v_fma(v_dst1, v_beta, v_mul(v_src1, v_alpha)); + v_dst2 = v_fma(v_dst2, v_beta, v_mul(v_src2, v_alpha)); + v_dst3 = v_fma(v_dst3, v_beta, v_mul(v_src3, v_alpha)); v_store(dst + x, v_dst0); v_store(dst + x + step, v_dst1); @@ -2945,11 +2945,11 @@ void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int c void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha) { int x = 0; -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) const v_float64 v_alpha = vx_setall_f64(alpha); const v_float64 v_beta = vx_setall_f64(1.0f - alpha); - const int cVectorWidth = v_uint16::nlanes; - 
const int step = v_float64::nlanes; + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2973,10 +2973,10 @@ void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int v_float64 v_dst10 = vx_load(dst + x + step * 2); v_float64 v_dst11 = vx_load(dst + x + step * 3); - v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha); - v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha); - v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha); - v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha); + v_dst00 = v_fma(v_dst00, v_beta, v_mul(v_src00, v_alpha)); + v_dst01 = v_fma(v_dst01, v_beta, v_mul(v_src01, v_alpha)); + v_dst10 = v_fma(v_dst10, v_beta, v_mul(v_src10, v_alpha)); + v_dst11 = v_fma(v_dst11, v_beta, v_mul(v_src11, v_alpha)); v_store(dst + x, v_dst00); v_store(dst + x + step, v_dst01); @@ -3014,11 +3014,11 @@ void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int c _mm256_storeu_pd(dst + x + 12, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 12), v_beta), _mm256_mul_pd(v_src11, v_alpha))); } } -#elif CV_SIMD_64F +#elif (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) const v_float64 v_alpha = vx_setall_f64(alpha); const v_float64 v_beta = vx_setall_f64(1.0f - alpha); - const int cVectorWidth = v_float32::nlanes * 2; - const int step = v_float64::nlanes; + const int cVectorWidth = VTraits::vlanes() * 2; + const int step = VTraits::vlanes(); if (!mask) { @@ -3026,7 +3026,7 @@ void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int c for (; x <= size - cVectorWidth; x += cVectorWidth) { v_float32 v_src0 = vx_load(src + x); - v_float32 v_src1 = vx_load(src + x + v_float32::nlanes); + v_float32 v_src1 = vx_load(src + x + VTraits::vlanes()); v_float64 v_src00 = v_cvt_f64(v_src0); v_float64 v_src01 = v_cvt_f64_high(v_src0); v_float64 v_src10 = v_cvt_f64(v_src1); @@ -3037,10 +3037,10 @@ void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int c v_float64 v_dst10 = vx_load(dst + x + step * 2); v_float64 v_dst11 = vx_load(dst + x + step * 3); - v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha); - v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha); - v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha); - v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha); + v_dst00 = v_fma(v_dst00, v_beta, v_mul(v_src00, v_alpha)); + v_dst01 = v_fma(v_dst01, v_beta, v_mul(v_src01, v_alpha)); + v_dst10 = v_fma(v_dst10, v_beta, v_mul(v_src10, v_alpha)); + v_dst11 = v_fma(v_dst11, v_beta, v_mul(v_src11, v_alpha)); v_store(dst + x, v_dst00); v_store(dst + x + step, v_dst01); @@ -3072,11 +3072,11 @@ void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src1, v_alpha))); } } -#elif CV_SIMD_64F +#elif (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) const v_float64 v_alpha = vx_setall_f64(alpha); const v_float64 v_beta = vx_setall_f64(1.0f - alpha); - const int cVectorWidth = v_float64::nlanes * 2; - const int step = v_float64::nlanes; + const int cVectorWidth = VTraits::vlanes() * 2; + const int step = VTraits::vlanes(); if (!mask) { @@ -3089,8 +3089,8 @@ void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int v_float64 v_dst0 = vx_load(dst + x); v_float64 v_dst1 = vx_load(dst + x + step); - v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha); - v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha); + v_dst0 = v_fma(v_dst0, v_beta, v_mul(v_src0, 
v_alpha)); + v_dst1 = v_fma(v_dst1, v_beta, v_mul(v_src1, v_alpha)); v_store(dst + x, v_dst0); v_store(dst + x + step, v_dst1); diff --git a/modules/imgproc/src/blend.cpp b/modules/imgproc/src/blend.cpp index 5a1296b50958..accb45e7ad87 100644 --- a/modules/imgproc/src/blend.cpp +++ b/modules/imgproc/src/blend.cpp @@ -48,12 +48,12 @@ #include "opencv2/core/hal/intrin.hpp" namespace cv { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_float32 blend(const v_float32& v_src1, const v_float32& v_src2, const v_float32& v_w1, const v_float32& v_w2) { const v_float32 v_eps = vx_setall_f32(1e-5f); - v_float32 v_denom = v_w1 + v_w2 + v_eps; - return (v_src1 * v_w1 + v_src2 * v_w2) / v_denom; + v_float32 v_denom = v_add(v_add(v_w1, v_w2), v_eps); + return v_div(v_add(v_mul(v_src1, v_w1), v_mul(v_src2, v_w2)), v_denom); } static inline v_float32 blend(const v_float32& v_src1, const v_float32& v_src2, const float* w_ptr1, const float* w_ptr2, int offset) { @@ -105,7 +105,7 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, switch(cn) { case 1: - for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_uint8::nlanes) + for(int weight_offset = 0 ; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), weight_offset += VTraits::vlanes()) { v_float32 v_src10, v_src11, v_src12, v_src13; v_float32 v_src20, v_src21, v_src22, v_src23; @@ -113,15 +113,15 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23); v_float32 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset); - v_float32 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + v_float32::nlanes); - v_float32 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 2*v_float32::nlanes); - v_float32 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 3*v_float32::nlanes); + v_float32 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + VTraits::vlanes()); + v_float32 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 2*VTraits::vlanes()); + v_float32 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 3*VTraits::vlanes()); store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3); } break; case 2: - for(int weight_offset = 0 ; x <= width - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes, weight_offset += v_uint8::nlanes) + for(int weight_offset = 0 ; x <= width - 2*VTraits::vlanes(); x += 2*VTraits::vlanes(), weight_offset += VTraits::vlanes()) { v_uint8 v_src10, v_src11, v_src20, v_src21; v_load_deinterleave(src1 + x, v_src10, v_src11); @@ -135,12 +135,12 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, v_float32 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset); v_float32 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset); - v_float32 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + v_float32::nlanes); - v_float32 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + v_float32::nlanes); - v_float32 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 2*v_float32::nlanes); - v_float32 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 2*v_float32::nlanes); - v_float32 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 3*v_float32::nlanes); - v_float32 v_dst7 = blend(v_src113, v_src213, weights1, 
weights2, weight_offset + 3*v_float32::nlanes); + v_float32 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + VTraits::vlanes()); + v_float32 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + VTraits::vlanes()); + v_float32 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 2*VTraits::vlanes()); + v_float32 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 2*VTraits::vlanes()); + v_float32 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 3*VTraits::vlanes()); + v_float32 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 3*VTraits::vlanes()); v_uint8 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6); v_uint8 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7); @@ -148,7 +148,7 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, } break; case 3: - for(int weight_offset = 0 ; x <= width - 3*v_uint8::nlanes; x += 3*v_uint8::nlanes, weight_offset += v_uint8::nlanes) + for(int weight_offset = 0 ; x <= width - 3*VTraits::vlanes(); x += 3*VTraits::vlanes(), weight_offset += VTraits::vlanes()) { v_uint8 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22; v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12); @@ -164,13 +164,13 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, expand_u8tof32(v_src22, v_src220, v_src221, v_src222, v_src223); v_float32 v_w10 = vx_load(weights1 + weight_offset); - v_float32 v_w11 = vx_load(weights1 + weight_offset + v_float32::nlanes); - v_float32 v_w12 = vx_load(weights1 + weight_offset + 2*v_float32::nlanes); - v_float32 v_w13 = vx_load(weights1 + weight_offset + 3*v_float32::nlanes); + v_float32 v_w11 = vx_load(weights1 + weight_offset + VTraits::vlanes()); + v_float32 v_w12 = vx_load(weights1 + weight_offset + 2*VTraits::vlanes()); + v_float32 v_w13 = vx_load(weights1 + weight_offset + 3*VTraits::vlanes()); v_float32 v_w20 = vx_load(weights2 + weight_offset); - v_float32 v_w21 = vx_load(weights2 + weight_offset + v_float32::nlanes); - v_float32 v_w22 = vx_load(weights2 + weight_offset + 2*v_float32::nlanes); - v_float32 v_w23 = vx_load(weights2 + weight_offset + 3*v_float32::nlanes); + v_float32 v_w21 = vx_load(weights2 + weight_offset + VTraits::vlanes()); + v_float32 v_w22 = vx_load(weights2 + weight_offset + 2*VTraits::vlanes()); + v_float32 v_w23 = vx_load(weights2 + weight_offset + 3*VTraits::vlanes()); v_src100 = blend(v_src100, v_src200, v_w10, v_w20); v_src110 = blend(v_src110, v_src210, v_w10, v_w20); v_src120 = blend(v_src120, v_src220, v_w10, v_w20); @@ -192,7 +192,7 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, } break; case 4: - for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_float32::nlanes) + for(int weight_offset = 0 ; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), weight_offset += VTraits::vlanes()) { v_float32 v_src10, v_src11, v_src12, v_src13; v_float32 v_src20, v_src21, v_src22, v_src23; @@ -229,7 +229,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1, switch(cn) { case 1: - for(int weight_offset = 0 ; x <= width - v_float32::nlanes; x += v_float32::nlanes, weight_offset += v_float32::nlanes) + for(int weight_offset = 0 ; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), weight_offset += VTraits::vlanes()) { v_float32 v_src1 = vx_load(src1 + x); v_float32 v_src2 = vx_load(src2 + x); @@ -242,7 +242,7 @@ int 
blendLinearSimd(const float* src1, const float* src2, const float* weights1, } break; case 2: - for(int weight_offset = 0 ; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, weight_offset += v_float32::nlanes) + for(int weight_offset = 0 ; x <= width - 2*VTraits::vlanes(); x += 2*VTraits::vlanes(), weight_offset += VTraits::vlanes()) { v_float32 v_src10, v_src11, v_src20, v_src21; v_load_deinterleave(src1 + x, v_src10, v_src11); @@ -257,7 +257,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1, } break; case 3: - for(int weight_offset = 0 ; x <= width - 3*v_float32::nlanes; x += 3*v_float32::nlanes, weight_offset += v_float32::nlanes) + for(int weight_offset = 0 ; x <= width - 3*VTraits::vlanes(); x += 3*VTraits::vlanes(), weight_offset += VTraits::vlanes()) { v_float32 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22; v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12); @@ -273,7 +273,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1, } break; case 4: - for(int weight_offset = 0 ; x <= width - 4*v_float32::nlanes; x += 4*v_float32::nlanes, weight_offset += v_float32::nlanes) + for(int weight_offset = 0 ; x <= width - 4*VTraits::vlanes(); x += 4*VTraits::vlanes(), weight_offset += VTraits::vlanes()) { v_float32 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23; v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12, v_src13); @@ -320,7 +320,7 @@ class BlendLinearInvoker : T * const dst_row = dst->ptr(y); int x = 0; - #if CV_SIMD + #if (CV_SIMD || CV_SIMD_SCALABLE) x = blendLinearSimd(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn); #endif diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp index 9c14929dc8aa..2fed0ba0c233 100644 --- a/modules/imgproc/src/canny.cpp +++ b/modules/imgproc/src/canny.cpp @@ -306,11 +306,11 @@ class parallelCanny : public ParallelLoopBody src(_src), src2(_src), map(_map), _borderPeaksParallel(borderPeaksParallel), low(_low), high(_high), aperture_size(_aperture_size), L2gradient(_L2gradient) { -#if CV_SIMD - for(int i = 0; i < v_int8::nlanes; ++i) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for(int i = 0; i < VTraits::vlanes(); ++i) { smask[i] = 0; - smask[i + v_int8::nlanes] = (schar)-1; + smask[i + VTraits::vlanes()] = (schar)-1; } if (true) _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_SIMD_WIDTH + 1), CV_SIMD_WIDTH), CV_8UC1); @@ -330,11 +330,11 @@ class parallelCanny : public ParallelLoopBody src(_dx), src2(_dy), map(_map), _borderPeaksParallel(borderPeaksParallel), low(_low), high(_high), aperture_size(0), L2gradient(_L2gradient) { -#if CV_SIMD - for(int i = 0; i < v_int8::nlanes; ++i) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for(int i = 0; i < VTraits::vlanes(); ++i) { smask[i] = 0; - smask[i + v_int8::nlanes] = (schar)-1; + smask[i + VTraits::vlanes()] = (schar)-1; } if (true) _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_SIMD_WIDTH + 1), CV_SIMD_WIDTH), CV_8UC1); @@ -396,7 +396,7 @@ class parallelCanny : public ParallelLoopBody } // _mag_p: previous row, _mag_a: actual row, _mag_n: next row -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) AutoBuffer buffer(3 * (mapstep * cn + CV_SIMD_WIDTH)); _mag_p = alignPtr(buffer.data() + 1, CV_SIMD_WIDTH); _mag_a = alignPtr(_mag_p + mapstep * cn, CV_SIMD_WIDTH); @@ -436,8 +436,8 @@ class parallelCanny : public ParallelLoopBody if (L2gradient) { int j = 0, width = src.cols * cn; -#if CV_SIMD - for ( ; j <= width - v_int16::nlanes; j += 
v_int16::nlanes) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for ( ; j <= width - VTraits::vlanes(); j += VTraits::vlanes()) { v_int16 v_dx = vx_load((const short*)(_dx + j)); v_int16 v_dy = vx_load((const short*)(_dy + j)); @@ -447,8 +447,8 @@ class parallelCanny : public ParallelLoopBody v_expand(v_dx, v_dxp_low, v_dxp_high); v_expand(v_dy, v_dyp_low, v_dyp_high); - v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low); - v_store_aligned((int *)(_mag_n + j + v_int32::nlanes), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high); + v_store_aligned((int *)(_mag_n + j), v_add(v_mul(v_dxp_low, v_dxp_low), v_mul(v_dyp_low, v_dyp_low))); + v_store_aligned((int *)(_mag_n + j + VTraits::vlanes()), v_add(v_mul(v_dxp_high, v_dxp_high), v_mul(v_dyp_high, v_dyp_high))); } #endif for ( ; j < width; ++j) @@ -457,8 +457,8 @@ class parallelCanny : public ParallelLoopBody else { int j = 0, width = src.cols * cn; -#if CV_SIMD - for(; j <= width - v_int16::nlanes; j += v_int16::nlanes) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for(; j <= width - VTraits::vlanes(); j += VTraits::vlanes()) { v_int16 v_dx = vx_load((const short *)(_dx + j)); v_int16 v_dy = vx_load((const short *)(_dy + j)); @@ -470,8 +470,8 @@ class parallelCanny : public ParallelLoopBody v_expand(v_dx, v_dx_ml, v_dx_mh); v_expand(v_dy, v_dy_ml, v_dy_mh); - v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml); - v_store_aligned((int *)(_mag_n + j + v_int32::nlanes), v_dx_mh + v_dy_mh); + v_store_aligned((int *)(_mag_n + j), v_add(v_dx_ml, v_dy_ml)); + v_store_aligned((int *)(_mag_n + j + VTraits::vlanes()), v_add(v_dx_mh, v_dy_mh)); } #endif for ( ; j < width; ++j) @@ -515,7 +515,7 @@ class parallelCanny : public ParallelLoopBody // From here actual src row is (i - 1) // Set left and right border to 1 -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) if (true) _pmap = map.ptr(i) + CV_SIMD_WIDTH; else @@ -537,22 +537,22 @@ class parallelCanny : public ParallelLoopBody const int TG22 = 13573; int j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) { const v_int32 v_low = vx_setall_s32(low); const v_int8 v_one = vx_setall_s8(1); - for (; j <= src.cols - v_int8::nlanes; j += v_int8::nlanes) + for (; j <= src.cols - VTraits::vlanes(); j += VTraits::vlanes()) { v_store_aligned((signed char*)(_pmap + j), v_one); - v_int8 v_cmp = v_pack(v_pack(vx_load_aligned((const int*)(_mag_a + j )) > v_low, - vx_load_aligned((const int*)(_mag_a + j + v_int32::nlanes)) > v_low), - v_pack(vx_load_aligned((const int*)(_mag_a + j + 2*v_int32::nlanes)) > v_low, - vx_load_aligned((const int*)(_mag_a + j + 3*v_int32::nlanes)) > v_low)); + v_int8 v_cmp = v_pack(v_pack(v_gt(vx_load_aligned((const int *)(_mag_a + j)), v_low), + v_gt(vx_load_aligned((const int *)(_mag_a + j + VTraits::vlanes())), v_low)), + v_pack(v_gt(vx_load_aligned((const int *)(_mag_a + j + 2 * VTraits::vlanes())), v_low), + v_gt(vx_load_aligned((const int *)(_mag_a + j + 3 * VTraits::vlanes())), v_low))); while (v_check_any(v_cmp)) { int l = v_scan_forward(v_cmp); - v_cmp &= vx_load(smask + v_int8::nlanes - 1 - l); + v_cmp = v_and(v_cmp, vx_load(smask + VTraits::vlanes() - 1 - l)); int k = j + l; int m = _mag_a[k]; @@ -693,8 +693,8 @@ class parallelCanny : public ParallelLoopBody ptrdiff_t mapstep; int cn; mutable Mutex mutex; -#if CV_SIMD - schar smask[2*v_int8::nlanes]; +#if (CV_SIMD || CV_SIMD_SCALABLE) + schar smask[2*VTraits::max_nlanes]; #endif }; @@ -718,31 +718,31 @@ class finalPass : public ParallelLoopBody int j = 0; uchar *pdst = dst.ptr(i); const uchar *pmap = map.ptr(i + 1); -#if 
CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) if (true) pmap += CV_SIMD_WIDTH; else #endif pmap += 1; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) { const v_uint8 v_zero = vx_setzero_u8(); - const v_uint8 v_ff = ~v_zero; + const v_uint8 v_ff = v_not(v_zero); const v_uint8 v_two = vx_setall_u8(2); - for (; j <= dst.cols - v_uint8::nlanes; j += v_uint8::nlanes) + for (; j <= dst.cols - VTraits::vlanes(); j += VTraits::vlanes()) { v_uint8 v_pmap = vx_load_aligned((const unsigned char*)(pmap + j)); - v_pmap = v_select(v_pmap == v_two, v_ff, v_zero); + v_pmap = v_select(v_eq(v_pmap, v_two), v_ff, v_zero); v_store((pdst + j), v_pmap); } - if (j <= dst.cols - v_uint8::nlanes/2) + if (j <= dst.cols - VTraits::vlanes()/2) { v_uint8 v_pmap = vx_load_low((const unsigned char*)(pmap + j)); - v_pmap = v_select(v_pmap == v_two, v_ff, v_zero); + v_pmap = v_select(v_eq(v_pmap, v_two), v_ff, v_zero); v_store_low((pdst + j), v_pmap); - j += v_uint8::nlanes/2; + j += VTraits::vlanes()/2; } } #endif diff --git a/modules/imgproc/src/color_rgb.simd.hpp b/modules/imgproc/src/color_rgb.simd.hpp index 6e1102019749..67e2febd5b7f 100644 --- a/modules/imgproc/src/color_rgb.simd.hpp +++ b/modules/imgproc/src/color_rgb.simd.hpp @@ -122,8 +122,8 @@ struct RGB2RGB int i = 0; _Tp alphav = ColorChannel<_Tp>::max(); -#if CV_SIMD - const int vsize = vt::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); for(; i <= n-vsize; i += vsize, src += vsize*scn, dst += vsize*dcn) @@ -138,8 +138,13 @@ struct RGB2RGB v_load_deinterleave(src, a, b, c); d = v_set<_Tp>::set(alphav); } - if(bi == 2) + if(bi == 2) { + #if CV_SIMD_SCALABLE + auto t = a; a = c; c = t; // swap(a, c); + #else swap(a, c); + #endif + } if(dcn == 4) { @@ -185,53 +190,57 @@ struct RGB5x52RGB int dcn = dstcn, bidx = blueIdx, gb = greenBits; int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_uint8 vz = vx_setzero_u8(), vn0 = vx_setall_u8(255); for(; i <= n-vsize; i += vsize, src += vsize*sizeof(ushort), dst += vsize*dcn) { v_uint16 t0 = v_reinterpret_as_u16(vx_load(src)); v_uint16 t1 = v_reinterpret_as_u16(vx_load(src + - sizeof(ushort)*v_uint16::nlanes)); + sizeof(ushort)*VTraits::vlanes())); //TODO: shorten registers use when v_interleave is available v_uint8 r, g, b, a; - v_uint16 b0 = (t0 << 11) >> 8; - v_uint16 b1 = (t1 << 11) >> 8; + v_uint16 b0 = v_shr<8>(v_shl<11>(t0)); + v_uint16 b1 = v_shr<8>(v_shl<11>(t1)); b = v_pack(b0, b1); v_uint16 g0, g1, r0, r1, a0, a1; if( gb == 6 ) { - g0 = ((t0 >> 5) << 10) >> 8; - g1 = ((t1 >> 5) << 10) >> 8; + g0 = v_shr<8>(v_shl<10>(v_shr<5>(t0))); + g1 = v_shr<8>(v_shl<10>(v_shr<5>(t1))); - r0 = (t0 >> 11) << 3; - r1 = (t1 >> 11) << 3; + r0 = v_shl<3>(v_shr<11>(t0)); + r1 = v_shl<3>(v_shr<11>(t1)); a = vn0; } else { - g0 = ((t0 >> 5) << 11) >> 8; - g1 = ((t1 >> 5) << 11) >> 8; + g0 = v_shr<8>(v_shl<11>(v_shr<5>(t0))); + g1 = v_shr<8>(v_shl<11>(v_shr<5>(t1))); - r0 = ((t0 >> 10) << 11) >> 8; - r1 = ((t1 >> 10) << 11) >> 8; + r0 = v_shr<8>(v_shl<11>(v_shr<10>(t0))); + r1 = v_shr<8>(v_shl<11>(v_shr<10>(t1))); - a0 = t0 >> 15; - a1 = t1 >> 15; + a0 = v_shr<15>(t0); + a1 = v_shr<15>(t1); a = v_pack(a0, a1); - a = a != vz; + a = v_ne(a, vz); } g = v_pack(g0, g1); r = v_pack(r0, r1); - if(bidx == 2) + if(bidx == 2) { + #if CV_SIMD_SCALABLE + auto t = r; r = b; b = t; // swap(b, r); + #else swap(b, r); - + #endif + } if(dcn == 4) { v_store_interleave(dst, b, g, r, a); @@ -289,8 +298,8 @@ struct RGB2RGB5x5 int scn = 
srccn, bidx = blueIdx, gb = greenBits; int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_uint16 vn3 = vx_setall_u16((ushort)(~3)); v_uint16 vn7 = vx_setall_u16((ushort)(~7)); v_uint16 vz = vx_setzero_u16(); @@ -308,10 +317,15 @@ struct RGB2RGB5x5 { v_load_deinterleave(src, b, g, r, a); } - if(bidx == 2) + if(bidx == 2){ + #if CV_SIMD_SCALABLE + auto t = r; r = b; b = t; // swap(b, r); + #else swap(b, r); + #endif + } - r = r & v7; + r = v_and(r, v7); //TODO: shorten registers use when v_deinterleave is available v_uint16 r0, r1, g0, g1, b0, b1, a0, a1; @@ -322,20 +336,20 @@ struct RGB2RGB5x5 v_uint16 d0, d1; - b0 = b0 >> 3; - b1 = b1 >> 3; - a0 = (a0 != vz) << 15; - a1 = (a1 != vz) << 15; + b0 = v_shr<3>(b0); + b1 = v_shr<3>(b1); + a0 = v_shl<15>(v_ne(a0, vz)); + a1 = v_shl<15>(v_ne(a1, vz)); if(gb == 6) { - d0 = b0 | ((g0 & vn3) << 3) | (r0 << 8); - d1 = b1 | ((g1 & vn3) << 3) | (r1 << 8); + d0 = v_or(v_or(b0, v_shl<3>(v_and(g0, vn3))), v_shl<8>(r0)); + d1 = v_or(v_or(b1, v_shl<3>(v_and(g1, vn3))), v_shl<8>(r1)); } else { - d0 = b0 | ((g0 & vn7) << 2) | (r0 << 7) | a0; - d1 = b1 | ((g1 & vn7) << 2) | (r1 << 7) | a1; + d0 = v_or(v_or(v_or(b0, v_shl<2>(v_and(g0, vn7))), v_shl<7>(r0)), a0); + d1 = v_or(v_or(v_or(b1, v_shl<2>(v_and(g1, vn7))), v_shl<7>(r1)), a1); } v_store((ushort*)dst, d0); @@ -382,8 +396,8 @@ struct Gray2RGB int i = 0; _Tp alpha = ColorChannel<_Tp>::max(); -#if CV_SIMD - const int vsize = vt::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); vt valpha = v_set<_Tp>::set(alpha); for(; i <= n-vsize; i += vsize, src += vsize, dst += vsize*dcn) @@ -424,8 +438,8 @@ struct Gray2RGB5x5 { int gb = greenBits; int i = 0; -#if CV_SIMD - const int vsize = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_uint16 v3 = vx_setall_u16((ushort)(~3)); for(; i <= n-vsize; i += vsize, src += vsize, dst += vsize*sizeof(ushort)) @@ -433,16 +447,16 @@ struct Gray2RGB5x5 v_uint8 t8 = vx_load_low(src); v_uint16 t = v_expand_low(t8); - v_uint16 t3 = t >> 3; + v_uint16 t3 = v_shr<3>(t); v_uint16 d = t3; if(gb == 6) { - d |= ((t & v3) << 3) | (t3 << 11); + d = v_or(d, v_or(v_shl<3>(v_and(t, v3)), v_shl<11>(t3))); } else { - d |= (t3 << 5) | (t3 << 10); + d = v_or(d, v_or(v_shl<5>(t3), v_shl<10>(t3))); } v_store((ushort*)dst, d); @@ -488,8 +502,8 @@ struct RGB5x52Gray { int gb = greenBits; int i = 0; -#if CV_SIMD - const int vsize = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_int16 bg2y; v_int16 r12y; @@ -504,17 +518,17 @@ struct RGB5x52Gray v_uint16 t = vx_load((ushort*)src); v_uint16 r, g, b; - b = (t << 11) >> 8; + b = v_shr<8>(v_shl<11>(t)); if(gb == 5) { - g = ((t >> 5) << 11) >> 8; - r = ((t >> 10) << 11) >> 8; + g = v_shr<8>(v_shl<11>(v_shr<5>(t))); + r = v_shr<8>(v_shl<11>(v_shr<10>(t))); } else { - g = ((t >> 5) << 10) >> 8; - r = (t >> 11) << 3; + g = v_shr<8>(v_shl<10>(v_shr<5>(t))); + r = v_shl<3>(v_shr<11>(t)); } v_uint8 d; @@ -530,11 +544,11 @@ struct RGB5x52Gray v_zip(sr, delta, rd0, rd1); v_uint32 d0, d1; - d0 = v_reinterpret_as_u32(v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)); - d1 = v_reinterpret_as_u32(v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)); + d0 = v_reinterpret_as_u32(v_add(v_dotprod(bg0, bg2y), v_dotprod(rd0, r12y))); + d1 = v_reinterpret_as_u32(v_add(v_dotprod(bg1, bg2y), v_dotprod(rd1, r12y))); - d0 = d0 >> shift; - d1 = d1 >> shift; + d0 = v_shr(d0); + d1 = 
v_shr(d1); dx = v_pack(d0, d1); // high part isn't used @@ -611,8 +625,8 @@ struct RGB2Gray int scn = srccn, i = 0; float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; -#if CV_SIMD - const int vsize = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_float32 rv = vx_setall_f32(cr), gv = vx_setall_f32(cg), bv = vx_setall_f32(cb); for(; i <= n-vsize; i += vsize, src += vsize*scn, dst += vsize) @@ -627,7 +641,7 @@ struct RGB2Gray v_load_deinterleave(src, b, g, r, a); } - v_float32 d = v_fma(r, rv, v_fma(g, gv, b*bv)); + v_float32 d = v_fma(r, rv, v_fma(g, gv, v_mul(b, bv))); v_store(dst, d); } @@ -669,8 +683,8 @@ struct RGB2Gray short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_int16 bg2y; v_int16 r12y; v_int16 dummy; @@ -706,10 +720,10 @@ struct RGB2Gray v_zip(v_reinterpret_as_s16(r1), delta, rd10, rd11); v_uint32 y00, y01, y10, y11; - y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift; - y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift; - y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift; - y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift; + y00 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg00, bg2y), v_dotprod(rd00, r12y)))); + y01 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg01, bg2y), v_dotprod(rd01, r12y)))); + y10 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg10, bg2y), v_dotprod(rd10, r12y)))); + y11 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg11, bg2y), v_dotprod(rd11, r12y)))); v_uint16 y0, y1; y0 = v_pack(y00, y01); @@ -762,8 +776,8 @@ struct RGB2Gray short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; int i = 0; -#if CV_SIMD - const int vsize = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_int16 b2y = vx_setall_s16(cb); v_int16 g2y = vx_setall_s16(cg); @@ -802,13 +816,13 @@ struct RGB2Gray // fixing 16bit signed multiplication v_int16 mr, mg, mb; - mr = (sr < z) & r2y; - mg = (sg < z) & g2y; - mb = (sb < z) & b2y; - v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift; + mr = v_and(v_lt(sr, z), r2y); + mg = v_and(v_lt(sg, z), g2y); + mb = v_and(v_lt(sb, z), b2y); + v_int16 fixmul = v_shl(v_add_wrap(mr, v_add_wrap(mg, mb))); - v_int32 sy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift; - v_int32 sy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift; + v_int32 sy0 = v_shr(v_add(v_dotprod(bg0, bg2y), v_dotprod(rd0, r12y))); + v_int32 sy1 = v_shr(v_add(v_dotprod(bg1, bg2y), v_dotprod(rd1, r12y))); v_int16 y = v_add_wrap(v_pack(sy0, sy1), fixmul); @@ -973,8 +987,8 @@ struct mRGBA2RGBA uchar max_val = ColorChannel::max(); int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000)); v_uint8 vmax = vx_setall_u8(max_val); @@ -989,9 +1003,9 @@ struct mRGBA2RGBA v_uint8 a; v_uint16 a16; v_uint32 a32; - a16 = v_reinterpret_as_u16(s & amask); - a32 = v_reinterpret_as_u32(a16 | (a16 >> 8)); - a = v_reinterpret_as_u8(a32 | (a32 >> 16)); + a16 = v_reinterpret_as_u16(v_and(s, amask)); + a32 = v_reinterpret_as_u32(v_or(a16, v_shr<8>(a16))); + a = v_reinterpret_as_u8(v_or(a32, v_shr<16>(a32))); // s *= max_val v_uint16 s0, s1; @@ -1000,7 +1014,7 @@ struct 
mRGBA2RGBA // s += a/2 v_uint16 ae0, ae1; v_expand(a, ae0, ae1); - s0 += ae0 >> 1; s1 += ae1 >> 1; + s0 = v_add(s0, v_shr<1>(ae0)); s1 = v_add(s1, v_shr<1>(ae1)); // s, a -> u32 -> float v_uint32 u00, u01, u10, u11; @@ -1035,10 +1049,10 @@ struct mRGBA2RGBA // float d = (float)s/(float)a v_float32 fd00, fd01, fd10, fd11; - fd00 = fs00/fa00; - fd01 = fs01/fa01; - fd10 = fs10/fa10; - fd11 = fs11/fa11; + fd00 = v_div(fs00, fa00); + fd01 = v_div(fs01, fa01); + fd10 = v_div(fs10, fa10); + fd11 = v_div(fs11, fa11); // d -> u32 -> u8 v_uint32 ud00, ud01, ud10, ud11; @@ -1054,8 +1068,8 @@ struct mRGBA2RGBA // if a == 0 then d = 0 v_uint8 am; - am = a != vx_setzero_u8(); - d = d & am; + am = v_ne(a, vx_setzero_u8()); + d = v_and(d, am); // put alpha values d = v_select(amask, a, d); diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index d8823206f29c..3e3096e7a57a 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -1080,7 +1080,7 @@ cvFindNextContour( CvContourScanner scanner ) } else { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) if ((p = img[x]) != prev) { goto _next_contour; @@ -1088,9 +1088,9 @@ cvFindNextContour( CvContourScanner scanner ) else { v_uint8 v_prev = vx_setall_u8((uchar)prev); - for (; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) + for (; x <= width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes()) { - v_uint8 vmask = (vx_load((uchar*)(img + x)) != v_prev); + v_uint8 vmask = (v_ne(vx_load((uchar *)(img + x)), v_prev)); if (v_check_any(vmask)) { p = img[(x += v_scan_forward(vmask))]; @@ -1105,7 +1105,7 @@ cvFindNextContour( CvContourScanner scanner ) if( x >= width ) break; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) _next_contour: #endif { @@ -1353,11 +1353,11 @@ CvLinkedRunPoint; inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_uint8 v_zero = vx_setzero_u8(); - for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes) + for (; j <= img_size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes()) { - v_uint8 vmask = (vx_load((uchar*)(src_data + j)) != v_zero); + v_uint8 vmask = (v_ne(vx_load((uchar *)(src_data + j)), v_zero)); if (v_check_any(vmask)) { j += v_scan_forward(vmask); @@ -1372,7 +1372,7 @@ inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j) inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) if (j < img_size.width && !src_data[j]) { return j; @@ -1380,9 +1380,9 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j) else { v_uint8 v_zero = vx_setzero_u8(); - for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes) + for (; j <= img_size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes()) { - v_uint8 vmask = (vx_load((uchar*)(src_data + j)) == v_zero); + v_uint8 vmask = (v_eq(vx_load((uchar *)(src_data + j)), v_zero)); if (v_check_any(vmask)) { j += v_scan_forward(vmask); diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp index f0ea0b5bb5e1..1d6ee1ac04d5 100644 --- a/modules/imgproc/src/corner.cpp +++ b/modules/imgproc/src/corner.cpp @@ -74,21 +74,21 @@ static void calcMinEigenVal( const Mat& _cov, Mat& _dst ) #endif // CV_TRY_AVX j = 0; -#if CV_SIMD128 +#if (CV_SIMD || CV_SIMD_SCALABLE) { - v_float32x4 half = v_setall_f32(0.5f); - for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes ) + v_float32 half = vx_setall_f32(0.5f); + for( ; j <= size.width - 
VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes() ) { - v_float32x4 v_a, v_b, v_c, v_t; + v_float32 v_a, v_b, v_c, v_t; v_load_deinterleave(cov + j*3, v_a, v_b, v_c); - v_a *= half; - v_c *= half; - v_t = v_a - v_c; - v_t = v_muladd(v_b, v_b, (v_t * v_t)); - v_store(dst + j, (v_a + v_c) - v_sqrt(v_t)); + v_a = v_mul(v_a, half); + v_c = v_mul(v_c, half); + v_t = v_sub(v_a, v_c); + v_t = v_muladd(v_b, v_b, (v_mul(v_t, v_t))); + v_store(dst + j, v_sub(v_add(v_a, v_c), v_sqrt(v_t))); } } -#endif // CV_SIMD128 +#endif // CV_SIMD for( ; j < size.width; j++ ) { @@ -127,18 +127,18 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k ) #endif // CV_TRY_AVX j = 0; -#if CV_SIMD128 +#if (CV_SIMD || CV_SIMD_SCALABLE) { - v_float32x4 v_k = v_setall_f32((float)k); + v_float32 v_k = vx_setall_f32((float)k); - for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes ) + for( ; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes() ) { - v_float32x4 v_a, v_b, v_c; + v_float32 v_a, v_b, v_c; v_load_deinterleave(cov + j * 3, v_a, v_b, v_c); - v_float32x4 v_ac_bb = v_a * v_c - v_b * v_b; - v_float32x4 v_ac = v_a + v_c; - v_float32x4 v_dst = v_ac_bb - v_k * v_ac * v_ac; + v_float32 v_ac_bb = v_sub(v_mul(v_a, v_c), v_mul(v_b, v_b)); + v_float32 v_ac = v_add(v_a, v_c); + v_float32 v_dst = v_sub(v_ac_bb, v_mul(v_mul(v_k, v_ac), v_ac)); v_store(dst + j, v_dst); } } @@ -282,22 +282,22 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size, #endif // CV_TRY_AVX j = 0; -#if CV_SIMD128 +#if (CV_SIMD || CV_SIMD_SCALABLE) { - for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes ) + for( ; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes() ) { - v_float32x4 v_dx = v_load(dxdata + j); - v_float32x4 v_dy = v_load(dydata + j); + v_float32 v_dx = vx_load(dxdata + j); + v_float32 v_dy = vx_load(dydata + j); - v_float32x4 v_dst0, v_dst1, v_dst2; - v_dst0 = v_dx * v_dx; - v_dst1 = v_dx * v_dy; - v_dst2 = v_dy * v_dy; + v_float32 v_dst0, v_dst1, v_dst2; + v_dst0 = v_mul(v_dx, v_dx); + v_dst1 = v_mul(v_dx, v_dy); + v_dst2 = v_mul(v_dy, v_dy); v_store_interleave(cov_data + j * 3, v_dst0, v_dst1, v_dst2); } } -#endif // CV_SIMD128 +#endif // CV_SIMD for( ; j < size.width; j++ ) { @@ -693,9 +693,9 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord if( src.depth() == CV_8U ) factor *= 255; factor = 1./(factor * factor * factor); -#if CV_SIMD128 +#if (CV_SIMD || CV_SIMD_SCALABLE) float factor_f = (float)factor; - v_float32x4 v_factor = v_setall_f32(factor_f), v_m2 = v_setall_f32(-2.0f); + v_float32 v_factor = vx_setall_f32(factor_f), v_m2 = vx_setall_f32(-2.0f); #endif Size size = src.size(); @@ -711,18 +711,18 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord j = 0; -#if CV_SIMD128 +#if (CV_SIMD || CV_SIMD_SCALABLE) { - for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes ) + for( ; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes() ) { - v_float32x4 v_dx = v_load(dxdata + j); - v_float32x4 v_dy = v_load(dydata + j); + v_float32 v_dx = vx_load(dxdata + j); + v_float32 v_dy = vx_load(dydata + j); - v_float32x4 v_s1 = (v_dx * v_dx) * v_load(d2ydata + j); - v_float32x4 v_s2 = v_muladd((v_dy * v_dy), v_load(d2xdata + j), v_s1); - v_float32x4 v_s3 = v_muladd((v_dy * v_dx) * v_load(dxydata + j), v_m2, v_s2); + v_float32 v_s1 = v_mul(v_mul(v_dx, v_dx), vx_load(d2ydata + j)); + v_float32 v_s2 = v_muladd((v_mul(v_dy, v_dy)), vx_load(d2xdata + j), v_s1); + v_float32 v_s3 = 
v_muladd(v_mul(v_mul(v_dy, v_dx), vx_load(dxydata + j)), v_m2, v_s2); - v_store(dstdata + j, v_s3 * v_factor); + v_store(dstdata + j, v_mul(v_s3, v_factor)); } } #endif diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index 068dfd3a2713..cbd60550e037 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -2053,13 +2053,13 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) } else if( method == CV_COMP_CORREL ) { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) v_float64 v_s1 = vx_setzero_f64(); v_float64 v_s2 = vx_setzero_f64(); v_float64 v_s11 = vx_setzero_f64(); v_float64 v_s12 = vx_setzero_f64(); v_float64 v_s22 = vx_setzero_f64(); - for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes) + for ( ; j <= len - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 v_a = vx_load(h1 + j); v_float32 v_b = vx_load(h2 + j); @@ -2070,8 +2070,8 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) v_s12 = v_muladd(v_ad, v_bd, v_s12); v_s11 = v_muladd(v_ad, v_ad, v_s11); v_s22 = v_muladd(v_bd, v_bd, v_s22); - v_s1 += v_ad; - v_s2 += v_bd; + v_s1 = v_add(v_s1, v_ad); + v_s2 = v_add(v_s2, v_bd); // 2-3 v_ad = v_cvt_f64_high(v_a); @@ -2079,8 +2079,8 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) v_s12 = v_muladd(v_ad, v_bd, v_s12); v_s11 = v_muladd(v_ad, v_ad, v_s11); v_s22 = v_muladd(v_bd, v_bd, v_s22); - v_s1 += v_ad; - v_s2 += v_bd; + v_s1 = v_add(v_s1, v_ad); + v_s2 = v_add(v_s2, v_bd); } s12 += v_reduce_sum(v_s12); s11 += v_reduce_sum(v_s11); @@ -2124,12 +2124,12 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) } else if( method == CV_COMP_INTERSECT ) { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) v_float64 v_result = vx_setzero_f64(); - for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes) + for ( ; j <= len - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 v_src = v_min(vx_load(h1 + j), vx_load(h2 + j)); - v_result += v_cvt_f64(v_src) + v_cvt_f64_high(v_src); + v_result = v_add(v_result, v_add(v_cvt_f64(v_src), v_cvt_f64_high(v_src))); } result += v_reduce_sum(v_result); #elif CV_SIMD @@ -2146,26 +2146,26 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) } else if( method == CV_COMP_BHATTACHARYYA ) { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) v_float64 v_s1 = vx_setzero_f64(); v_float64 v_s2 = vx_setzero_f64(); v_float64 v_result = vx_setzero_f64(); - for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes) + for ( ; j <= len - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 v_a = vx_load(h1 + j); v_float32 v_b = vx_load(h2 + j); v_float64 v_ad = v_cvt_f64(v_a); v_float64 v_bd = v_cvt_f64(v_b); - v_s1 += v_ad; - v_s2 += v_bd; - v_result += v_sqrt(v_ad * v_bd); + v_s1 = v_add(v_s1, v_ad); + v_s2 = v_add(v_s2, v_bd); + v_result = v_add(v_result, v_sqrt(v_mul(v_ad, v_bd))); v_ad = v_cvt_f64_high(v_a); v_bd = v_cvt_f64_high(v_b); - v_s1 += v_ad; - v_s2 += v_bd; - v_result += v_sqrt(v_ad * v_bd); + v_s1 = v_add(v_s1, v_ad); + v_s2 = v_add(v_s2, v_bd); + v_result = v_add(v_result, v_sqrt(v_mul(v_ad, v_bd))); } s1 += v_reduce_sum(v_s1); s2 += v_reduce_sum(v_s2); diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index c13354406968..dae09564d35f 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -84,7 +84,7 @@ template int PyrUpVecV(T1**, T2**, int) { return 0; } template int 
PyrUpVecVOneRow(T1**, T2*, int) { return 0; } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) template<> int PyrDownVecH(const uchar* src, int* row, int width) { @@ -93,10 +93,8 @@ template<> int PyrDownVecH(const uchar* src, int* row, int width) v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); - for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) - v_store(row, v_dotprod(v_reinterpret_as_s16(vx_load_expand(src01)), v_1_4) + - v_dotprod(v_reinterpret_as_s16(vx_load_expand(src23)), v_6_4) + - (v_reinterpret_as_s32(vx_load_expand(src4)) >> 16)); + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += VTraits::vlanes(), src23 += VTraits::vlanes(), src4 += VTraits::vlanes(), row += VTraits::vlanes()) + v_store(row, v_add(v_add(v_dotprod(v_reinterpret_as_s16(vx_load_expand(src01)), v_1_4), v_dotprod(v_reinterpret_as_s16(vx_load_expand(src23)), v_6_4)), v_shr<16>(v_reinterpret_as_s32(vx_load_expand(src4))))); vx_cleanup(); return x; @@ -108,42 +106,40 @@ template<> int PyrDownVecH(const uchar* src, int* row, int width) v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); - for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) - v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src01))), v_1_4) + - v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src23))), v_6_4) + - (v_reinterpret_as_s32(v_interleave_pairs(vx_load_expand(src4))) >> 16)); + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += VTraits::vlanes(), src23 += VTraits::vlanes(), src4 += VTraits::vlanes(), row += VTraits::vlanes()) + v_store(row, v_add(v_add(v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src01))), v_1_4), v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src23))), v_6_4)), v_shr<16>(v_reinterpret_as_s32(v_interleave_pairs(vx_load_expand(src4)))))); vx_cleanup(); return x; } template<> int PyrDownVecH(const uchar* src, int* row, int width) { - int idx[v_int8::nlanes/2 + 4]; - for (int i = 0; i < v_int8::nlanes/4 + 2; i++) + int idx[VTraits::max_nlanes/2 + 4]; + for (int i = 0; i < VTraits::vlanes()/4 + 2; i++) { idx[i] = 6*i; - idx[i + v_int8::nlanes/4 + 2] = 6*i + 3; + idx[i + VTraits::vlanes()/4 + 2] = 6*i + 3; } int x = 0; v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); - for (; x <= width - v_int8::nlanes; x += 3*v_int8::nlanes/4, src += 6*v_int8::nlanes/4, row += 3*v_int8::nlanes/4) + for (; x <= width - VTraits::vlanes(); x += 3*VTraits::vlanes()/4, src += 6*VTraits::vlanes()/4, row += 3*VTraits::vlanes()/4) { v_uint16 r0l, r0h, r1l, r1h, r2l, r2h, r3l, r3h, r4l, r4h; v_expand(vx_lut_quads(src, idx ), r0l, r0h); - v_expand(vx_lut_quads(src, idx + v_int8::nlanes/4 + 2), r1l, r1h); + v_expand(vx_lut_quads(src, idx + VTraits::vlanes()/4 + 2), r1l, r1h); v_expand(vx_lut_quads(src, idx + 1 ), r2l, r2h); - v_expand(vx_lut_quads(src, idx + v_int8::nlanes/4 + 3), r3l, r3h); + v_expand(vx_lut_quads(src, idx + VTraits::vlanes()/4 + 3), r3l, r3h); v_expand(vx_lut_quads(src, idx + 2 ), r4l, r4h); - v_zip(r2l, r1l + r3l, r1l, r3l); - v_zip(r2h, r1h + r3h, r1h, r3h); - r0l += r4l; r0h += r4h; + v_zip(r2l, v_add(r1l, r3l), r1l, r3l); + 
v_zip(r2h, v_add(r1h, r3h), r1h, r3h); + r0l = v_add(r0l, r4l); r0h = v_add(r0h, r4h); - v_store(row , v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r1l), v_6_4) + v_reinterpret_as_s32(v_expand_low( r0l)))); - v_store(row + 3*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r3l), v_6_4) + v_reinterpret_as_s32(v_expand_high(r0l)))); - v_store(row + 6*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r1h), v_6_4) + v_reinterpret_as_s32(v_expand_low( r0h)))); - v_store(row + 9*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r3h), v_6_4) + v_reinterpret_as_s32(v_expand_high(r0h)))); + v_store(row , v_pack_triplets(v_add(v_dotprod(v_reinterpret_as_s16(r1l), v_6_4), v_reinterpret_as_s32(v_expand_low(r0l))))); + v_store(row + 3*VTraits::vlanes()/4, v_pack_triplets(v_add(v_dotprod(v_reinterpret_as_s16(r3l), v_6_4), v_reinterpret_as_s32(v_expand_high(r0l))))); + v_store(row + 6*VTraits::vlanes()/4, v_pack_triplets(v_add(v_dotprod(v_reinterpret_as_s16(r1h), v_6_4), v_reinterpret_as_s32(v_expand_low(r0h))))); + v_store(row + 9*VTraits::vlanes()/4, v_pack_triplets(v_add(v_dotprod(v_reinterpret_as_s16(r3h), v_6_4), v_reinterpret_as_s32(v_expand_high(r0h))))); } vx_cleanup(); @@ -156,10 +152,8 @@ template<> int PyrDownVecH(const uchar* src, int* row, int width) v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); - for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) - v_store(row, v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src01))), v_1_4) + - v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src23))), v_6_4) + - (v_reinterpret_as_s32(v_interleave_quads(vx_load_expand(src4))) >> 16)); + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += VTraits::vlanes(), src23 += VTraits::vlanes(), src4 += VTraits::vlanes(), row += VTraits::vlanes()) + v_store(row, v_add(v_add(v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src01))), v_1_4), v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src23))), v_6_4)), v_shr<16>(v_reinterpret_as_s32(v_interleave_quads(vx_load_expand(src4)))))); vx_cleanup(); return x; @@ -172,10 +166,8 @@ template<> int PyrDownVecH(const short* src, int* row, int width) v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); - for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) - v_store(row, v_dotprod(vx_load(src01), v_1_4) + - v_dotprod(vx_load(src23), v_6_4) + - (v_reinterpret_as_s32(vx_load(src4)) >> 16)); + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += VTraits::vlanes(), src23 += VTraits::vlanes(), src4 += VTraits::vlanes(), row += VTraits::vlanes()) + v_store(row, v_add(v_add(v_dotprod(vx_load(src01), v_1_4), v_dotprod(vx_load(src23), v_6_4)), v_shr<16>(v_reinterpret_as_s32(vx_load(src4))))); vx_cleanup(); return x; @@ -187,34 +179,32 @@ template<> int PyrDownVecH(const short* src, int* row, int width) v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); - for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, 
row += v_int32::nlanes) - v_store(row, v_dotprod(v_interleave_pairs(vx_load(src01)), v_1_4) + - v_dotprod(v_interleave_pairs(vx_load(src23)), v_6_4) + - (v_reinterpret_as_s32(v_interleave_pairs(vx_load(src4))) >> 16)); + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += VTraits::vlanes(), src23 += VTraits::vlanes(), src4 += VTraits::vlanes(), row += VTraits::vlanes()) + v_store(row, v_add(v_add(v_dotprod(v_interleave_pairs(vx_load(src01)), v_1_4), v_dotprod(v_interleave_pairs(vx_load(src23)), v_6_4)), v_shr<16>(v_reinterpret_as_s32(v_interleave_pairs(vx_load(src4)))))); vx_cleanup(); return x; } template<> int PyrDownVecH(const short* src, int* row, int width) { - int idx[v_int16::nlanes/2 + 4]; - for (int i = 0; i < v_int16::nlanes/4 + 2; i++) + int idx[VTraits::max_nlanes/2 + 4]; + for (int i = 0; i < VTraits::vlanes()/4 + 2; i++) { idx[i] = 6*i; - idx[i + v_int16::nlanes/4 + 2] = 6*i + 3; + idx[i + VTraits::vlanes()/4 + 2] = 6*i + 3; } int x = 0; v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); - for (; x <= width - v_int16::nlanes; x += 3*v_int16::nlanes/4, src += 6*v_int16::nlanes/4, row += 3*v_int16::nlanes/4) + for (; x <= width - VTraits::vlanes(); x += 3*VTraits::vlanes()/4, src += 6*VTraits::vlanes()/4, row += 3*VTraits::vlanes()/4) { v_int16 r0, r1, r2, r3, r4; - v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1); - v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3); + v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + VTraits::vlanes()/4 + 2), r0, r1); + v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + VTraits::vlanes()/4 + 3), r2, r3); r4 = vx_lut_quads(src, idx + 2); - v_store(row, v_pack_triplets(v_dotprod(r0, v_1_4) + v_dotprod(r2, v_6_4) + v_expand_low(r4))); - v_store(row + 3*v_int32::nlanes/4, v_pack_triplets(v_dotprod(r1, v_1_4) + v_dotprod(r3, v_6_4) + v_expand_high(r4))); + v_store(row, v_pack_triplets(v_add(v_add(v_dotprod(r0, v_1_4), v_dotprod(r2, v_6_4)), v_expand_low(r4)))); + v_store(row + 3*VTraits::vlanes()/4, v_pack_triplets(v_add(v_add(v_dotprod(r1, v_1_4), v_dotprod(r3, v_6_4)), v_expand_high(r4)))); } vx_cleanup(); @@ -222,24 +212,24 @@ template<> int PyrDownVecH(const short* src, int* row, int width) } template<> int PyrDownVecH(const short* src, int* row, int width) { - int idx[v_int16::nlanes/2 + 4]; - for (int i = 0; i < v_int16::nlanes/4 + 2; i++) + int idx[VTraits::max_nlanes/2 + 4]; + for (int i = 0; i < VTraits::vlanes()/4 + 2; i++) { idx[i] = 8*i; - idx[i + v_int16::nlanes/4 + 2] = 8*i + 4; + idx[i + VTraits::vlanes()/4 + 2] = 8*i + 4; } int x = 0; v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001)); v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); - for (; x <= width - v_int16::nlanes; x += v_int16::nlanes, src += 2*v_int16::nlanes, row += v_int16::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src += 2*VTraits::vlanes(), row += VTraits::vlanes()) { v_int16 r0, r1, r2, r3, r4; - v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1); - v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3); + v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + VTraits::vlanes()/4 + 2), r0, r1); + v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + VTraits::vlanes()/4 + 3), r2, r3); r4 = vx_lut_quads(src, idx + 2); - v_store(row, v_dotprod(r0, v_1_4) + 
v_dotprod(r2, v_6_4) + v_expand_low(r4)); - v_store(row + v_int32::nlanes, v_dotprod(r1, v_1_4) + v_dotprod(r3, v_6_4) + v_expand_high(r4)); + v_store(row, v_add(v_add(v_dotprod(r0, v_1_4), v_dotprod(r2, v_6_4)), v_expand_low(r4))); + v_store(row + VTraits::vlanes(), v_add(v_add(v_dotprod(r1, v_1_4), v_dotprod(r3, v_6_4)), v_expand_high(r4))); } vx_cleanup(); @@ -255,10 +245,8 @@ template<> int PyrDownVecH(const ushort* src, int* row, int widt v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); v_uint16 v_half = vx_setall_u16(0x8000); v_int32 v_half15 = vx_setall_s32(0x00078000); - for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) - v_store(row, v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src01), v_half)), v_1_4) + - v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src23), v_half)), v_6_4) + - v_reinterpret_as_s32(v_reinterpret_as_u32(vx_load(src4)) >> 16) + v_half15); + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += VTraits::vlanes(), src23 += VTraits::vlanes(), src4 += VTraits::vlanes(), row += VTraits::vlanes()) + v_store(row, v_add(v_add(v_add(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src01), v_half)), v_1_4), v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src23), v_half)), v_6_4)), v_reinterpret_as_s32(v_shr<16>(v_reinterpret_as_u32(vx_load(src4))))), v_half15)); vx_cleanup(); return x; @@ -272,21 +260,19 @@ template<> int PyrDownVecH(const ushort* src, int* row, int widt v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); v_uint16 v_half = vx_setall_u16(0x8000); v_int32 v_half15 = vx_setall_s32(0x00078000); - for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes) - v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src01), v_half))), v_1_4) + - v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src23), v_half))), v_6_4) + - v_reinterpret_as_s32(v_reinterpret_as_u32(v_interleave_pairs(vx_load(src4))) >> 16) + v_half15); + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += VTraits::vlanes(), src23 += VTraits::vlanes(), src4 += VTraits::vlanes(), row += VTraits::vlanes()) + v_store(row, v_add(v_add(v_add(v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src01), v_half))), v_1_4), v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src23), v_half))), v_6_4)), v_reinterpret_as_s32(v_shr<16>(v_reinterpret_as_u32(v_interleave_pairs(vx_load(src4)))))), v_half15)); vx_cleanup(); return x; } template<> int PyrDownVecH(const ushort* src, int* row, int width) { - int idx[v_int16::nlanes/2 + 4]; - for (int i = 0; i < v_int16::nlanes/4 + 2; i++) + int idx[VTraits::max_nlanes/2 + 4]; + for (int i = 0; i < VTraits::vlanes()/4 + 2; i++) { idx[i] = 6*i; - idx[i + v_int16::nlanes/4 + 2] = 6*i + 3; + idx[i + VTraits::vlanes()/4 + 2] = 6*i + 3; } int x = 0; @@ -294,18 +280,14 @@ template<> int PyrDownVecH(const ushort* src, int* row, int widt v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); v_uint16 v_half = vx_setall_u16(0x8000); v_int32 v_half15 = vx_setall_s32(0x00078000); - for (; x <= width - v_int16::nlanes; x += 3*v_int16::nlanes/4, src += 6*v_int16::nlanes/4, row += 3*v_int16::nlanes/4) + for (; x <= width - VTraits::vlanes(); x += 3*VTraits::vlanes()/4, src += 6*VTraits::vlanes()/4, row += 
3*VTraits::vlanes()/4) { v_uint16 r0, r1, r2, r3, r4; - v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1); - v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3); + v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + VTraits::vlanes()/4 + 2), r0, r1); + v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + VTraits::vlanes()/4 + 3), r2, r3); r4 = vx_lut_quads(src, idx + 2); - v_store(row , v_pack_triplets(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r0, v_half)), v_1_4) + - v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r2, v_half)), v_6_4) + - v_reinterpret_as_s32(v_expand_low(r4)) + v_half15)); - v_store(row + 3*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r1, v_half)), v_1_4) + - v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r3, v_half)), v_6_4) + - v_reinterpret_as_s32(v_expand_high(r4)) + v_half15)); + v_store(row , v_pack_triplets(v_add(v_add(v_add(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r0, v_half)), v_1_4), v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r2, v_half)), v_6_4)), v_reinterpret_as_s32(v_expand_low(r4))), v_half15))); + v_store(row + 3*VTraits::vlanes()/4, v_pack_triplets(v_add(v_add(v_add(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r1, v_half)), v_1_4), v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r3, v_half)), v_6_4)), v_reinterpret_as_s32(v_expand_high(r4))), v_half15))); } vx_cleanup(); @@ -313,11 +295,11 @@ template<> int PyrDownVecH(const ushort* src, int* row, int widt } template<> int PyrDownVecH(const ushort* src, int* row, int width) { - int idx[v_int16::nlanes/2 + 4]; - for (int i = 0; i < v_int16::nlanes/4 + 2; i++) + int idx[VTraits::max_nlanes/2 + 4]; + for (int i = 0; i < VTraits::vlanes()/4 + 2; i++) { idx[i] = 8*i; - idx[i + v_int16::nlanes/4 + 2] = 8*i + 4; + idx[i + VTraits::vlanes()/4 + 2] = 8*i + 4; } int x = 0; @@ -325,18 +307,14 @@ template<> int PyrDownVecH(const ushort* src, int* row, int widt v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006)); v_uint16 v_half = vx_setall_u16(0x8000); v_int32 v_half15 = vx_setall_s32(0x00078000); - for (; x <= width - v_int16::nlanes; x += v_int16::nlanes, src += 2*v_int16::nlanes, row += v_int16::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src += 2*VTraits::vlanes(), row += VTraits::vlanes()) { v_uint16 r0, r1, r2, r3, r4; - v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1); - v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3); + v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + VTraits::vlanes()/4 + 2), r0, r1); + v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + VTraits::vlanes()/4 + 3), r2, r3); r4 = vx_lut_quads(src, idx + 2); - v_store(row , v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r0, v_half)), v_1_4) + - v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r2, v_half)), v_6_4) + - v_reinterpret_as_s32(v_expand_low(r4)) + v_half15); - v_store(row + v_int32::nlanes, v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r1, v_half)), v_1_4) + - v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r3, v_half)), v_6_4) + - v_reinterpret_as_s32(v_expand_high(r4)) + v_half15); + v_store(row , v_add(v_add(v_add(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r0, v_half)), v_1_4), v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r2, v_half)), v_6_4)), v_reinterpret_as_s32(v_expand_low(r4))), v_half15)); + v_store(row + VTraits::vlanes(), v_add(v_add(v_add(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r1, v_half)), v_1_4), 
v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r3, v_half)), v_6_4)), v_reinterpret_as_s32(v_expand_high(r4))), v_half15)); } vx_cleanup(); @@ -349,13 +327,13 @@ template<> int PyrDownVecH(const float* src, float* row, int wi const float *src01 = src, *src23 = src + 2, *src4 = src + 3; v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f); - for (; x <= width - v_float32::nlanes; x += v_float32::nlanes, src01 += 2*v_float32::nlanes, src23 += 2*v_float32::nlanes, src4 += 2*v_float32::nlanes, row+=v_float32::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += 2*VTraits::vlanes(), src23 += 2*VTraits::vlanes(), src4 += 2*VTraits::vlanes(), row+=VTraits::vlanes()) { v_float32 r0, r1, r2, r3, r4, rtmp; v_load_deinterleave(src01, r0, r1); v_load_deinterleave(src23, r2, r3); v_load_deinterleave(src4, rtmp, r4); - v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4))); + v_store(row, v_muladd(r2, _6, v_muladd(v_add(r1, r3), _4, v_add(r0, r4)))); } vx_cleanup(); @@ -367,13 +345,13 @@ template<> int PyrDownVecH(const float* src, float* row, int wi const float *src01 = src, *src23 = src + 4, *src4 = src + 6; v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f); - for (; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, src01 += 4*v_float32::nlanes, src23 += 4*v_float32::nlanes, src4 += 4*v_float32::nlanes, row += 2*v_float32::nlanes) + for (; x <= width - 2*VTraits::vlanes(); x += 2*VTraits::vlanes(), src01 += 4*VTraits::vlanes(), src23 += 4*VTraits::vlanes(), src4 += 4*VTraits::vlanes(), row += 2*VTraits::vlanes()) { v_float32 r0a, r0b, r1a, r1b, r2a, r2b, r3a, r3b, r4a, r4b, rtmpa, rtmpb; v_load_deinterleave(src01, r0a, r0b, r1a, r1b); v_load_deinterleave(src23, r2a, r2b, r3a, r3b); v_load_deinterleave(src4, rtmpa, rtmpb, r4a, r4b); - v_store_interleave(row, v_muladd(r2a, _6, v_muladd(r1a + r3a, _4, r0a + r4a)), v_muladd(r2b, _6, v_muladd(r1b + r3b, _4, r0b + r4b))); + v_store_interleave(row, v_muladd(r2a, _6, v_muladd(v_add(r1a, r3a), _4, v_add(r0a, r4a))), v_muladd(r2b, _6, v_muladd(v_add(r1b, r3b), _4, v_add(r0b, r4b)))); } vx_cleanup(); @@ -381,23 +359,23 @@ template<> int PyrDownVecH(const float* src, float* row, int wi } template<> int PyrDownVecH(const float* src, float* row, int width) { - int idx[v_float32::nlanes/2 + 4]; - for (int i = 0; i < v_float32::nlanes/4 + 2; i++) + int idx[VTraits::max_nlanes/2 + 4]; + for (int i = 0; i < VTraits::vlanes()/4 + 2; i++) { idx[i] = 6*i; - idx[i + v_float32::nlanes/4 + 2] = 6*i + 3; + idx[i + VTraits::vlanes()/4 + 2] = 6*i + 3; } int x = 0; v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f); - for (; x <= width - v_float32::nlanes; x += 3*v_float32::nlanes/4, src += 6*v_float32::nlanes/4, row += 3*v_float32::nlanes/4) + for (; x <= width - VTraits::vlanes(); x += 3*VTraits::vlanes()/4, src += 6*VTraits::vlanes()/4, row += 3*VTraits::vlanes()/4) { v_float32 r0 = vx_lut_quads(src, idx); - v_float32 r1 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 2); + v_float32 r1 = vx_lut_quads(src, idx + VTraits::vlanes()/4 + 2); v_float32 r2 = vx_lut_quads(src, idx + 1); - v_float32 r3 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 3); + v_float32 r3 = vx_lut_quads(src, idx + VTraits::vlanes()/4 + 3); v_float32 r4 = vx_lut_quads(src, idx + 2); - v_store(row, v_pack_triplets(v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4)))); + v_store(row, v_pack_triplets(v_muladd(r2, _6, v_muladd(v_add(r1, r3), _4, v_add(r0, r4))))); } vx_cleanup(); @@ -405,43 +383,43 @@ template<> int PyrDownVecH(const float* src, 
float* row, int wi } template<> int PyrDownVecH(const float* src, float* row, int width) { - int idx[v_float32::nlanes/2 + 4]; - for (int i = 0; i < v_float32::nlanes/4 + 2; i++) + int idx[VTraits::max_nlanes/2 + 4]; + for (int i = 0; i < VTraits::vlanes()/4 + 2; i++) { idx[i] = 8*i; - idx[i + v_float32::nlanes/4 + 2] = 8*i + 4; + idx[i + VTraits::vlanes()/4 + 2] = 8*i + 4; } int x = 0; v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f); - for (; x <= width - v_float32::nlanes; x += v_float32::nlanes, src += 2*v_float32::nlanes, row += v_float32::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src += 2*VTraits::vlanes(), row += VTraits::vlanes()) { v_float32 r0 = vx_lut_quads(src, idx); - v_float32 r1 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 2); + v_float32 r1 = vx_lut_quads(src, idx + VTraits::vlanes()/4 + 2); v_float32 r2 = vx_lut_quads(src, idx + 1); - v_float32 r3 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 3); + v_float32 r3 = vx_lut_quads(src, idx + VTraits::vlanes()/4 + 3); v_float32 r4 = vx_lut_quads(src, idx + 2); - v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4))); + v_store(row, v_muladd(r2, _6, v_muladd(v_add(r1, r3), _4, v_add(r0, r4)))); } vx_cleanup(); return x; } -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) template<> int PyrDownVecH(const double* src, double* row, int width) { int x = 0; const double *src01 = src, *src23 = src + 2, *src4 = src + 3; v_float64 _4 = vx_setall_f64(4.f), _6 = vx_setall_f64(6.f); - for (; x <= width - v_float64::nlanes; x += v_float64::nlanes, src01 += 2*v_float64::nlanes, src23 += 2*v_float64::nlanes, src4 += 2*v_float64::nlanes, row += v_float64::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes(), src01 += 2*VTraits::vlanes(), src23 += 2*VTraits::vlanes(), src4 += 2*VTraits::vlanes(), row += VTraits::vlanes()) { v_float64 r0, r1, r2, r3, r4, rtmp; v_load_deinterleave(src01, r0, r1); v_load_deinterleave(src23, r2, r3); v_load_deinterleave(src4, rtmp, r4); - v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4))); + v_store(row, v_muladd(r2, _6, v_muladd(v_add(r1, r3), _4, v_add(r0, r4)))); } vx_cleanup(); @@ -454,35 +432,36 @@ template<> int PyrDownVecV(int** src, uchar* dst, int width) int x = 0; const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes ) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes() ) { v_uint16 r0, r1, r2, r3, r4, t0, t1; - r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes))); - r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes))); - r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes))); - r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes))); - r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes))); - t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); - r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x + 2*v_int32::nlanes), vx_load(row0 + x + 3*v_int32::nlanes))); - r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x + 2*v_int32::nlanes), vx_load(row1 + x + 3*v_int32::nlanes))); - r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x + 2*v_int32::nlanes), vx_load(row2 + x + 3*v_int32::nlanes))); - r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x + 2*v_int32::nlanes), vx_load(row3 + x + 3*v_int32::nlanes))); - r4 = 
v_reinterpret_as_u16(v_pack(vx_load(row4 + x + 2*v_int32::nlanes), vx_load(row4 + x + 3*v_int32::nlanes))); - t1 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits::vlanes()))); + r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits::vlanes()))); + r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + VTraits::vlanes()))); + r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + VTraits::vlanes()))); + r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + VTraits::vlanes()))); + t0 = v_add(v_add(v_add(r0, r4), v_add(r2, r2)), v_shl<2>(v_add(v_add(r1, r3), r2))); + r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x + 2*VTraits::vlanes()), vx_load(row0 + x + 3*VTraits::vlanes()))); + r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x + 2*VTraits::vlanes()), vx_load(row1 + x + 3*VTraits::vlanes()))); + r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x + 2*VTraits::vlanes()), vx_load(row2 + x + 3*VTraits::vlanes()))); + r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x + 2*VTraits::vlanes()), vx_load(row3 + x + 3*VTraits::vlanes()))); + r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x + 2*VTraits::vlanes()), vx_load(row4 + x + 3*VTraits::vlanes()))); + t1 = v_add(v_add(v_add(r0, r4), v_add(r2, r2)), v_shl<2>(v_add(v_add(r1, r3), r2))); v_store(dst + x, v_rshr_pack<8>(t0, t1)); } - if (x <= width - v_int16::nlanes) + if (x <= width - VTraits::vlanes()) { v_uint16 r0, r1, r2, r3, r4, t0; - r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes))); - r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes))); - r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes))); - r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes))); - r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes))); - t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits::vlanes()))); + r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits::vlanes()))); + r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + VTraits::vlanes()))); + r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + VTraits::vlanes()))); + r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + VTraits::vlanes()))); + t0 = v_add(v_add(v_add(r0, r4), v_add(r2, r2)), v_shl<2>(v_add(v_add(r1, r3), r2))); v_rshr_pack_store<8>(dst + x, t0); - x += v_uint16::nlanes; + x += VTraits::vlanes(); } + #if CV_SIMD128 typedef int CV_DECL_ALIGNED(1) unaligned_int; for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes) { @@ -492,10 +471,23 @@ template<> int PyrDownVecV(int** src, uchar* dst, int width) r2 = v_load(row2 + x); r3 = v_load(row3 + x); r4 = v_load(row4 + x); - t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + t0 = v_add(v_add(v_add(r0, r4), v_add(r2, r2)), v_shl<2>(v_add(v_add(r1, r3), r2))); *((unaligned_int*) (dst + x)) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0(); } + #else + for (; x <= width - 1; x += 1) + { + int r0 = *(row0 + x); + int r1 = *(row1 + x); + int r2 = *(row2 + x); + int r3 = *(row3 + x); + int r4 = *(row4 + x); + int t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + // Similar to v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16()).get0() + 
*(dst + x) = (int)((((unsigned int)t0) + ((1 << (8 - 1)))) >> 8); + } + #endif //CV_SIMD128 vx_cleanup(); return x; @@ -508,7 +500,7 @@ int PyrDownVecV(float** src, float* dst, int width) const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; v_float32 _4 = vx_setall_f32(4.f), _scale = vx_setall_f32(1.f/256); - for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_float32 r0, r1, r2, r3, r4; r0 = vx_load(row0 + x); @@ -516,7 +508,7 @@ int PyrDownVecV(float** src, float* dst, int width) r2 = vx_load(row2 + x); r3 = vx_load(row3 + x); r4 = vx_load(row4 + x); - v_store(dst + x, v_muladd(r1 + r3 + r2, _4, r0 + r4 + (r2 + r2)) * _scale); + v_store(dst + x, v_mul(v_muladd(v_add(v_add(r1, r3), r2), _4, v_add(v_add(r0, r4), v_add(r2, r2))), _scale)); } vx_cleanup(); @@ -528,30 +520,30 @@ template <> int PyrDownVecV(int** src, ushort* dst, int width) int x = 0; const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_int32 r00 = vx_load(row0 + x), - r01 = vx_load(row0 + x + v_int32::nlanes), + r01 = vx_load(row0 + x + VTraits::vlanes()), r10 = vx_load(row1 + x), - r11 = vx_load(row1 + x + v_int32::nlanes), + r11 = vx_load(row1 + x + VTraits::vlanes()), r20 = vx_load(row2 + x), - r21 = vx_load(row2 + x + v_int32::nlanes), + r21 = vx_load(row2 + x + VTraits::vlanes()), r30 = vx_load(row3 + x), - r31 = vx_load(row3 + x + v_int32::nlanes), + r31 = vx_load(row3 + x + VTraits::vlanes()), r40 = vx_load(row4 + x), - r41 = vx_load(row4 + x + v_int32::nlanes); - v_store(dst + x, v_rshr_pack_u<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2), - r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2))); + r41 = vx_load(row4 + x + VTraits::vlanes()); + v_store(dst + x, v_rshr_pack_u<8>(v_add(v_add(v_add(r00, r40), v_add(r20, r20)), v_shl<2>(v_add(v_add(r10, r20), r30))), + v_add(v_add(v_add(r01, r41), v_add(r21, r21)), v_shl<2>(v_add(v_add(r11, r21), r31))))); } - if (x <= width - v_int32::nlanes) + if (x <= width - VTraits::vlanes()) { v_int32 r00 = vx_load(row0 + x), r10 = vx_load(row1 + x), r20 = vx_load(row2 + x), r30 = vx_load(row3 + x), r40 = vx_load(row4 + x); - v_rshr_pack_u_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2)); - x += v_int32::nlanes; + v_rshr_pack_u_store<8>(dst + x, v_add(v_add(v_add(r00, r40), v_add(r20, r20)), v_shl<2>(v_add(v_add(r10, r20), r30)))); + x += VTraits::vlanes(); } vx_cleanup(); @@ -563,30 +555,30 @@ template <> int PyrDownVecV(int** src, short* dst, int width) int x = 0; const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_int32 r00 = vx_load(row0 + x), - r01 = vx_load(row0 + x + v_int32::nlanes), + r01 = vx_load(row0 + x + VTraits::vlanes()), r10 = vx_load(row1 + x), - r11 = vx_load(row1 + x + v_int32::nlanes), + r11 = vx_load(row1 + x + VTraits::vlanes()), r20 = vx_load(row2 + x), - r21 = vx_load(row2 + x + v_int32::nlanes), + r21 = vx_load(row2 + x + VTraits::vlanes()), r30 = vx_load(row3 + x), - r31 = vx_load(row3 + x + v_int32::nlanes), + r31 = vx_load(row3 + x + VTraits::vlanes()), r40 = vx_load(row4 + x), - r41 = vx_load(row4 + x + v_int32::nlanes); - v_store(dst + x, v_rshr_pack<8>(r00 + r40 + (r20 + r20) 
+ ((r10 + r20 + r30) << 2), - r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2))); + r41 = vx_load(row4 + x + VTraits::vlanes()); + v_store(dst + x, v_rshr_pack<8>(v_add(v_add(v_add(r00, r40), v_add(r20, r20)), v_shl<2>(v_add(v_add(r10, r20), r30))), + v_add(v_add(v_add(r01, r41), v_add(r21, r21)), v_shl<2>(v_add(v_add(r11, r21), r31))))); } - if (x <= width - v_int32::nlanes) + if (x <= width - VTraits::vlanes()) { v_int32 r00 = vx_load(row0 + x), r10 = vx_load(row1 + x), r20 = vx_load(row2 + x), r30 = vx_load(row3 + x), r40 = vx_load(row4 + x); - v_rshr_pack_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2)); - x += v_int32::nlanes; + v_rshr_pack_store<8>(dst + x, v_add(v_add(v_add(r00, r40), v_add(r20, r20)), v_shl<2>(v_add(v_add(r10, r20), r30)))); + x += VTraits::vlanes(); } vx_cleanup(); @@ -599,39 +591,55 @@ template <> int PyrUpVecV(int** src, uchar** dst, int width) uchar *dst0 = dst[0], *dst1 = dst[1]; const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { - v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)), - v_r01 = v_pack(vx_load(row0 + x + 2 * v_int32::nlanes), vx_load(row0 + x + 3 * v_int32::nlanes)), - v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)), - v_r11 = v_pack(vx_load(row1 + x + 2 * v_int32::nlanes), vx_load(row1 + x + 3 * v_int32::nlanes)), - v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)), - v_r21 = v_pack(vx_load(row2 + x + 2 * v_int32::nlanes), vx_load(row2 + x + 3 * v_int32::nlanes)); - v_int16 v_2r10 = v_r10 + v_r10, v_2r11 = (v_r11 + v_r11); - v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), v_r01 + v_r21 + (v_2r11 + v_2r11 + v_2r11))); - v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); + v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits::vlanes())), + v_r01 = v_pack(vx_load(row0 + x + 2 * VTraits::vlanes()), vx_load(row0 + x + 3 * VTraits::vlanes())), + v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits::vlanes())), + v_r11 = v_pack(vx_load(row1 + x + 2 * VTraits::vlanes()), vx_load(row1 + x + 3 * VTraits::vlanes())), + v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + VTraits::vlanes())), + v_r21 = v_pack(vx_load(row2 + x + 2 * VTraits::vlanes()), vx_load(row2 + x + 3 * VTraits::vlanes())); + v_int16 v_2r10 = v_add(v_r10, v_r10), v_2r11 = (v_add(v_r11, v_r11)); + v_store(dst0 + x, v_rshr_pack_u<6>(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_add(v_add(v_r01, v_r21), v_add(v_add(v_2r11, v_2r11), v_2r11)))); + v_store(dst1 + x, v_rshr_pack_u<6>(v_shl<2>(v_add(v_r10, v_r20)), v_shl<2>(v_add(v_r11, v_r21)))); } - if(x <= width - v_uint16::nlanes) + if(x <= width - VTraits::vlanes()) { - v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)), - v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)), - v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)); - v_int16 v_2r10 = v_r10 + v_r10; - v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10)); - v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2); - x += v_uint16::nlanes; + v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits::vlanes())), + v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits::vlanes())), + v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + 
VTraits::vlanes())); + v_int16 v_2r10 = v_add(v_r10, v_r10); + v_rshr_pack_u_store<6>(dst0 + x, v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10))); + v_rshr_pack_u_store<6>(dst1 + x, v_shl<2>(v_add(v_r10, v_r20))); + x += VTraits::vlanes(); } + #if CV_SIMD128 typedef int CV_DECL_ALIGNED(1) unaligned_int; for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes) { v_int32 v_r00 = vx_load(row0 + x), v_r10 = vx_load(row1 + x), v_r20 = vx_load(row2 + x); - v_int32 v_2r10 = v_r10 + v_r10; - v_int16 d = v_pack(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), (v_r10 + v_r20) << 2); + v_int32 v_2r10 = v_add(v_r10, v_r10); + v_int16 d = v_pack(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_shl<2>(v_add(v_r10, v_r20))); *(unaligned_int*)(dst0 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0(); *(unaligned_int*)(dst1 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())).get0(); } + #else + for (; x <= width - 1; x += 1) + { + int r00 = *(row0 + x), + r10 = *(row1 + x), + r20 = *(row2 + x); + int _2r10 = r10 + r10; + int d = r00 + r20 + (_2r10 + _2r10 + _2r10); + int d_shifted = (r10 + r20) << 2; + // Similar to v_rshr_pack_u<6>(d, vx_setzero_s16()).get0() + *(dst0 + x) = (int)((((unsigned int)d) + ((1 << (6 - 1)))) >> 6); + // Similar to v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16()).get0() + *(dst1 + x) = (int)((((unsigned int)d_shifted) + ((1 << (6 - 1)))) >> 6); + } + #endif //CV_SIMD128 vx_cleanup(); return x; @@ -643,25 +651,25 @@ template <> int PyrUpVecV(int** src, short** dst, int width) short *dst0 = dst[0], *dst1 = dst[1]; const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_int32 v_r00 = vx_load(row0 + x), - v_r01 = vx_load(row0 + x + v_int32::nlanes), + v_r01 = vx_load(row0 + x + VTraits::vlanes()), v_r10 = vx_load(row1 + x), - v_r11 = vx_load(row1 + x + v_int32::nlanes), + v_r11 = vx_load(row1 + x + VTraits::vlanes()), v_r20 = vx_load(row2 + x), - v_r21 = vx_load(row2 + x + v_int32::nlanes); - v_store(dst0 + x, v_rshr_pack<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2)))); - v_store(dst1 + x, v_rshr_pack<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); + v_r21 = vx_load(row2 + x + VTraits::vlanes()); + v_store(dst0 + x, v_rshr_pack<6>(v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10))), v_add(v_add(v_r01, v_r21), v_add(v_shl<1>(v_r11), v_shl<2>(v_r11))))); + v_store(dst1 + x, v_rshr_pack<6>(v_shl<2>(v_add(v_r10, v_r20)), v_shl<2>(v_add(v_r11, v_r21)))); } - if(x <= width - v_int32::nlanes) + if(x <= width - VTraits::vlanes()) { v_int32 v_r00 = vx_load(row0 + x), v_r10 = vx_load(row1 + x), v_r20 = vx_load(row2 + x); - v_rshr_pack_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2))); - v_rshr_pack_store<6>(dst1 + x, (v_r10 + v_r20) << 2); - x += v_int32::nlanes; + v_rshr_pack_store<6>(dst0 + x, v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10)))); + v_rshr_pack_store<6>(dst1 + x, v_shl<2>(v_add(v_r10, v_r20))); + x += VTraits::vlanes(); } vx_cleanup(); @@ -674,25 +682,25 @@ template <> int PyrUpVecV(int** src, ushort** dst, int width) ushort *dst0 = dst[0], *dst1 = dst[1]; const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_int32 
v_r00 = vx_load(row0 + x), - v_r01 = vx_load(row0 + x + v_int32::nlanes), + v_r01 = vx_load(row0 + x + VTraits::vlanes()), v_r10 = vx_load(row1 + x), - v_r11 = vx_load(row1 + x + v_int32::nlanes), + v_r11 = vx_load(row1 + x + VTraits::vlanes()), v_r20 = vx_load(row2 + x), - v_r21 = vx_load(row2 + x + v_int32::nlanes); - v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2)))); - v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); + v_r21 = vx_load(row2 + x + VTraits::vlanes()); + v_store(dst0 + x, v_rshr_pack_u<6>(v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10))), v_add(v_add(v_r01, v_r21), v_add(v_shl<1>(v_r11), v_shl<2>(v_r11))))); + v_store(dst1 + x, v_rshr_pack_u<6>(v_shl<2>(v_add(v_r10, v_r20)), v_shl<2>(v_add(v_r11, v_r21)))); } - if(x <= width - v_int32::nlanes) + if(x <= width - VTraits::vlanes()) { v_int32 v_r00 = vx_load(row0 + x), v_r10 = vx_load(row1 + x), v_r20 = vx_load(row2 + x); - v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2))); - v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2); - x += v_int32::nlanes; + v_rshr_pack_u_store<6>(dst0 + x, v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10)))); + v_rshr_pack_u_store<6>(dst1 + x, v_shl<2>(v_add(v_r10, v_r20))); + x += VTraits::vlanes(); } vx_cleanup(); @@ -706,13 +714,13 @@ template <> int PyrUpVecV(float** src, float** dst, int width) float *dst0 = dst[0], *dst1 = dst[1]; v_float32 v_6 = vx_setall_f32(6.0f), v_scale = vx_setall_f32(1.f/64.f), v_scale4 = vx_setall_f32(1.f/16.f); - for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_float32 v_r0 = vx_load(row0 + x), v_r1 = vx_load(row1 + x), v_r2 = vx_load(row2 + x); - v_store(dst1 + x, v_scale4 * (v_r1 + v_r2)); - v_store(dst0 + x, v_scale * (v_muladd(v_6, v_r1, v_r0) + v_r2)); + v_store(dst1 + x, v_mul(v_scale4, v_add(v_r1, v_r2))); + v_store(dst0 + x, v_mul(v_scale, v_add(v_muladd(v_6, v_r1, v_r0), v_r2))); } vx_cleanup(); @@ -724,36 +732,50 @@ template <> int PyrUpVecVOneRow(int** src, uchar* dst, int width) int x = 0; const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { - v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)), - v_r01 = v_pack(vx_load(row0 + x + 2 * v_int32::nlanes), vx_load(row0 + x + 3 * v_int32::nlanes)), - v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)), - v_r11 = v_pack(vx_load(row1 + x + 2 * v_int32::nlanes), vx_load(row1 + x + 3 * v_int32::nlanes)), - v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)), - v_r21 = v_pack(vx_load(row2 + x + 2 * v_int32::nlanes), vx_load(row2 + x + 3 * v_int32::nlanes)); - v_int16 v_2r10 = v_r10 + v_r10, v_2r11 = (v_r11 + v_r11); - v_store(dst + x, v_rshr_pack_u<6>(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), v_r01 + v_r21 + (v_2r11 + v_2r11 + v_2r11))); + v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits::vlanes())), + v_r01 = v_pack(vx_load(row0 + x + 2 * VTraits::vlanes()), vx_load(row0 + x + 3 * VTraits::vlanes())), + v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits::vlanes())), + v_r11 = v_pack(vx_load(row1 + x + 2 * VTraits::vlanes()), vx_load(row1 + x + 3 * VTraits::vlanes())), + v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + 
VTraits::vlanes())), + v_r21 = v_pack(vx_load(row2 + x + 2 * VTraits::vlanes()), vx_load(row2 + x + 3 * VTraits::vlanes())); + v_int16 v_2r10 = v_add(v_r10, v_r10), v_2r11 = (v_add(v_r11, v_r11)); + v_store(dst + x, v_rshr_pack_u<6>(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_add(v_add(v_r01, v_r21), v_add(v_add(v_2r11, v_2r11), v_2r11)))); } - if(x <= width - v_uint16::nlanes) + if(x <= width - VTraits::vlanes()) { - v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)), - v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)), - v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)); - v_int16 v_2r10 = v_r10 + v_r10; - v_rshr_pack_u_store<6>(dst + x, v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10)); - x += v_uint16::nlanes; + v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits::vlanes())), + v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits::vlanes())), + v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + VTraits::vlanes())); + v_int16 v_2r10 = v_add(v_r10, v_r10); + v_rshr_pack_u_store<6>(dst + x, v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10))); + x += VTraits::vlanes(); } + #if CV_SIMD128 typedef int CV_DECL_ALIGNED(1) unaligned_int; for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes) { v_int32 v_r00 = vx_load(row0 + x), v_r10 = vx_load(row1 + x), v_r20 = vx_load(row2 + x); - v_int32 v_2r10 = v_r10 + v_r10; - v_int16 d = v_pack(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), (v_r10 + v_r20) << 2); + v_int32 v_2r10 = v_add(v_r10, v_r10); + v_int16 d = v_pack(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_shl<2>(v_add(v_r10, v_r20))); *(unaligned_int*)(dst + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0(); } + #else + for (; x <= width - 1; x += 1) + { + int r00 = *(row0 + x), + r10 = *(row1 + x), + r20 = *(row2 + x); + int _2r10 = r10 + r10; + int d = r00 + r20 + (_2r10 + _2r10 + _2r10); + int d_shifted = (r10 + r20) << 2; + // Similar to v_rshr_pack_u<6>(d, vx_setzero_s16()).get0() + *(dst + x) = (int)((((unsigned int)d) + ((1 << (6 - 1)))) >> 6); + } + #endif //CV_SIMD128 vx_cleanup(); return x; @@ -764,23 +786,23 @@ template <> int PyrUpVecVOneRow(int** src, short* dst, int width) int x = 0; const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_int32 v_r00 = vx_load(row0 + x), - v_r01 = vx_load(row0 + x + v_int32::nlanes), + v_r01 = vx_load(row0 + x + VTraits::vlanes()), v_r10 = vx_load(row1 + x), - v_r11 = vx_load(row1 + x + v_int32::nlanes), + v_r11 = vx_load(row1 + x + VTraits::vlanes()), v_r20 = vx_load(row2 + x), - v_r21 = vx_load(row2 + x + v_int32::nlanes); - v_store(dst + x, v_rshr_pack<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2)))); + v_r21 = vx_load(row2 + x + VTraits::vlanes()); + v_store(dst + x, v_rshr_pack<6>(v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10))), v_add(v_add(v_r01, v_r21), v_add(v_shl<1>(v_r11), v_shl<2>(v_r11))))); } - if(x <= width - v_int32::nlanes) + if(x <= width - VTraits::vlanes()) { v_int32 v_r00 = vx_load(row0 + x), v_r10 = vx_load(row1 + x), v_r20 = vx_load(row2 + x); - v_rshr_pack_store<6>(dst + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2))); - x += v_int32::nlanes; + v_rshr_pack_store<6>(dst + x, v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10)))); + x 
+= VTraits::vlanes(); } vx_cleanup(); @@ -792,23 +814,23 @@ template <> int PyrUpVecVOneRow(int** src, ushort* dst, int width) int x = 0; const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_int32 v_r00 = vx_load(row0 + x), - v_r01 = vx_load(row0 + x + v_int32::nlanes), + v_r01 = vx_load(row0 + x + VTraits::vlanes()), v_r10 = vx_load(row1 + x), - v_r11 = vx_load(row1 + x + v_int32::nlanes), + v_r11 = vx_load(row1 + x + VTraits::vlanes()), v_r20 = vx_load(row2 + x), - v_r21 = vx_load(row2 + x + v_int32::nlanes); - v_store(dst + x, v_rshr_pack_u<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2)))); + v_r21 = vx_load(row2 + x + VTraits::vlanes()); + v_store(dst + x, v_rshr_pack_u<6>(v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10))), v_add(v_add(v_r01, v_r21), v_add(v_shl<1>(v_r11), v_shl<2>(v_r11))))); } - if(x <= width - v_int32::nlanes) + if(x <= width - VTraits::vlanes()) { v_int32 v_r00 = vx_load(row0 + x), v_r10 = vx_load(row1 + x), v_r20 = vx_load(row2 + x); - v_rshr_pack_u_store<6>(dst + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2))); - x += v_int32::nlanes; + v_rshr_pack_u_store<6>(dst + x, v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10)))); + x += VTraits::vlanes(); } vx_cleanup(); @@ -821,12 +843,12 @@ template <> int PyrUpVecVOneRow(float** src, float* dst, int width const float *row0 = src[0], *row1 = src[1], *row2 = src[2]; v_float32 v_6 = vx_setall_f32(6.0f), v_scale = vx_setall_f32(1.f/64.f); - for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) { v_float32 v_r0 = vx_load(row0 + x), v_r1 = vx_load(row1 + x), v_r2 = vx_load(row2 + x); - v_store(dst + x, v_scale * (v_muladd(v_6, v_r1, v_r0) + v_r2)); + v_store(dst + x, v_mul(v_scale, v_add(v_muladd(v_6, v_r1, v_r0), v_r2))); } vx_cleanup(); diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index 456cfc4af916..1ad8e8932deb 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -346,8 +346,8 @@ void hlineResizeCn(uint8_t* src, int, int *o { int i = 0; ufixedpoint16 src_0(src[0]); -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); v_uint16 v_src_0 = vx_setall_u16(*((uint16_t*)&src_0)); for (; i <= dst_min - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point { @@ -358,7 +358,7 @@ void hlineResizeCn(uint8_t* src, int, int *o { *(dst++) = src_0; } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) for (; i <= dst_max - 2*VECSZ; i += 2*VECSZ, m += 4*VECSZ, dst += 2*VECSZ) { v_uint16 v_src0, v_src1; @@ -384,7 +384,7 @@ void hlineResizeCn(uint8_t* src, int, int *o *(dst++) = m[0] * px[0] + m[1] * px[1]; } src_0 = (src + ofst[dst_width - 1])[0]; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_src_0 = vx_setall_u16(*((uint16_t*)&src_0)); for (; i <= dst_width - VECSZ; i += VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point { @@ -406,8 +406,8 @@ void hlineResizeCn(uint8_t* src, int, int *o } srccn; ((ufixedpoint16*)(srccn.w))[0] = src[0]; ((ufixedpoint16*)(srccn.w))[1] = src[1]; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); 
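
The hunks above and below all apply the same mechanical migration: the compile-time lane count `v_float32::nlanes` becomes the run-time query `VTraits<v_float32>::vlanes()`, the `CV_SIMD` guards gain `CV_SIMD_SCALABLE`, and overloaded operators on vector types become named intrinsics such as `v_add`, `v_mul`, `v_shl<N>` and `v_shr<N>`, since the scalable (RVV) backend does not provide the operator overloads. A minimal standalone sketch of the pattern, not taken from the patch and assuming an OpenCV build (4.8+) where the VTraits-based API is available; the function name `axpb4` is illustrative only:

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // row[i] = 4*a[i] + b[i], written once for both fixed-width and scalable SIMD.
    static void axpb4(const float* a, const float* b, float* row, int width)
    {
        int x = 0;
    #if (CV_SIMD || CV_SIMD_SCALABLE)
        v_float32 four = vx_setall_f32(4.f);
        // Old API: for (; x <= width - v_float32::nlanes; x += v_float32::nlanes)
        //              v_store(row + x, vx_load(a + x) * four + vx_load(b + x));
        const int vl = VTraits<v_float32>::vlanes();   // lane count queried at run time
        for (; x <= width - vl; x += vl)
            v_store(row + x, v_muladd(vx_load(a + x), four, vx_load(b + x)));
        vx_cleanup();
    #endif
        for (; x < width; x++)                         // scalar tail
            row[x] = 4.f * a[x] + b[x];
    }
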
v_uint16 v_srccn = v_reinterpret_as_u16(vx_setall_u32(srccn.d)); for (; i <= dst_min - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point { @@ -419,7 +419,7 @@ void hlineResizeCn(uint8_t* src, int, int *o *(dst++) = ((ufixedpoint16*)(srccn.w))[0]; *(dst++) = ((ufixedpoint16*)(srccn.w))[1]; } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += VECSZ) { v_uint16 v_src0, v_src1; @@ -440,7 +440,7 @@ void hlineResizeCn(uint8_t* src, int, int *o *(dst++) = m[0] * px[1] + m[1] * px[3]; } ((ufixedpoint16*)(srccn.w))[0] = (src + 2 * ofst[dst_width - 1])[0]; ((ufixedpoint16*)(srccn.w))[1] = (src + 2 * ofst[dst_width - 1])[1]; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_srccn = v_reinterpret_as_u16(vx_setall_u32(srccn.d)); for (; i <= dst_width - VECSZ/2; i += VECSZ/2, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point { @@ -465,8 +465,8 @@ void hlineResizeCn(uint8_t* src, int, int *o ((ufixedpoint16*)(srccn.w))[1] = src[1]; ((ufixedpoint16*)(srccn.w))[2] = src[2]; ((ufixedpoint16*)(srccn.w))[3] = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); v_uint16 v_srccn = v_pack_triplets(v_reinterpret_as_u16(vx_setall_u64(srccn.q))); for (; i <= dst_min - (VECSZ+2)/3; i += VECSZ/4, m += VECSZ/2, dst += 3*VECSZ/4) // Points that fall left from src image so became equal to leftmost src point { @@ -479,14 +479,14 @@ void hlineResizeCn(uint8_t* src, int, int *o *(dst++) = ((ufixedpoint16*)(srccn.w))[1]; *(dst++) = ((ufixedpoint16*)(srccn.w))[2]; } -#if CV_SIMD - CV_DECL_ALIGNED(CV_SIMD_WIDTH) int ofst3[VECSZ/2]; +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_DECL_ALIGNED(CV_SIMD_WIDTH) int ofst3[VTraits::max_nlanes/2]; for (; i <= dst_max - (3*VECSZ/4 + (VECSZ+2)/3); i += VECSZ/2, m += VECSZ, dst += 3*VECSZ/2) { - v_store(ofst3, vx_load(ofst + i) * vx_setall_s32(3)); + v_store(ofst3, v_mul(vx_load(ofst + i), vx_setall_s32(3))); v_uint8 v_src01, v_src23; v_uint16 v_src0, v_src1, v_src2, v_src3; - v_zip(vx_lut_quads(src, ofst3), v_reinterpret_as_u8(v_reinterpret_as_u32(vx_lut_quads(src+2, ofst3)) >> 8), v_src01, v_src23); + v_zip(vx_lut_quads(src, ofst3), v_reinterpret_as_u8(v_shr<8>(v_reinterpret_as_u32(vx_lut_quads(src+2, ofst3)))), v_src01, v_src23); v_expand(v_src01, v_src0, v_src1); v_expand(v_src23, v_src2, v_src3); @@ -514,7 +514,7 @@ void hlineResizeCn(uint8_t* src, int, int *o ((ufixedpoint16*)(srccn.w))[0] = (src + 3*ofst[dst_width - 1])[0]; ((ufixedpoint16*)(srccn.w))[1] = (src + 3*ofst[dst_width - 1])[1]; ((ufixedpoint16*)(srccn.w))[2] = (src + 3*ofst[dst_width - 1])[2]; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_srccn = v_pack_triplets(v_reinterpret_as_u16(vx_setall_u64(srccn.q))); for (; i <= dst_width - (VECSZ+2)/3; i += VECSZ/4, dst += 3*VECSZ/4) // Points that fall right from src image so became equal to rightmost src point { @@ -540,8 +540,8 @@ void hlineResizeCn(uint8_t* src, int, int *o ((ufixedpoint16*)(srccn.w))[1] = src[1]; ((ufixedpoint16*)(srccn.w))[2] = src[2]; ((ufixedpoint16*)(srccn.w))[3] = src[3]; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); v_uint16 v_srccn = v_reinterpret_as_u16(vx_setall_u64(srccn.q)); for (; i <= dst_min - VECSZ/4; i += VECSZ/4, m += VECSZ/2, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src 
point { @@ -555,7 +555,7 @@ void hlineResizeCn(uint8_t* src, int, int *o *(dst++) = ((ufixedpoint16*)(srccn.w))[2]; *(dst++) = ((ufixedpoint16*)(srccn.w))[3]; } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += 2*VECSZ) { v_uint16 v_src0, v_src1, v_src2, v_src3; @@ -586,7 +586,7 @@ void hlineResizeCn(uint8_t* src, int, int *o } ((ufixedpoint16*)(srccn.w))[0] = (src + 4 * ofst[dst_width - 1])[0]; ((ufixedpoint16*)(srccn.w))[1] = (src + 4 * ofst[dst_width - 1])[1]; ((ufixedpoint16*)(srccn.w))[2] = (src + 4 * ofst[dst_width - 1])[2]; ((ufixedpoint16*)(srccn.w))[3] = (src + 4 * ofst[dst_width - 1])[3]; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_srccn = v_reinterpret_as_u16(vx_setall_u64(srccn.q)); for (; i <= dst_width - VECSZ/4; i += VECSZ/4, dst += VECSZ) // Points that fall right from src image so became equal to rightmost src point { @@ -606,8 +606,8 @@ void hlineResizeCn(uint16_t* src, int, int { int i = 0; ufixedpoint32 src_0(src[0]); -#if CV_SIMD - const int VECSZ = v_uint32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); v_uint32 v_src_0 = vx_setall_u32(*((uint32_t*)&src_0)); for (; i <= dst_min - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point { @@ -618,16 +618,16 @@ void hlineResizeCn(uint16_t* src, int, int { *(dst++) = src_0; } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) for (; i <= dst_max - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) { v_uint32 v_src0, v_src1; v_expand(vx_lut_pairs(src, ofst + i), v_src0, v_src1); - v_uint64 v_res0 = v_reinterpret_as_u64(v_src0 * vx_load((uint32_t*)m)); - v_uint64 v_res1 = v_reinterpret_as_u64(v_src1 * vx_load((uint32_t*)m + VECSZ)); - v_store((uint32_t*)dst, v_pack((v_res0 & vx_setall_u64(0xFFFFFFFF)) + (v_res0 >> 32), - (v_res1 & vx_setall_u64(0xFFFFFFFF)) + (v_res1 >> 32))); + v_uint64 v_res0 = v_reinterpret_as_u64(v_mul(v_src0, vx_load((uint32_t *)m))); + v_uint64 v_res1 = v_reinterpret_as_u64(v_mul(v_src1, vx_load((uint32_t *)m + VECSZ))); + v_store((uint32_t*)dst, v_pack(v_add(v_and(v_res0, vx_setall_u64(0xFFFFFFFF)), v_shr<32>(v_res0)), + v_add(v_and(v_res1, vx_setall_u64(0xFFFFFFFF)), v_shr<32>(v_res1)))); } #endif for (; i < dst_max; i += 1, m += 2) @@ -636,7 +636,7 @@ void hlineResizeCn(uint16_t* src, int, int *(dst++) = m[0] * px[0] + m[1] * px[1]; } src_0 = (src + ofst[dst_width - 1])[0]; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_src_0 = vx_setall_u32(*((uint32_t*)&src_0)); for (; i <= dst_width - VECSZ; i += VECSZ, dst += VECSZ) { @@ -659,16 +659,16 @@ template <> void vlineSet(ufixedpoint16* src, uint8_t* dst, int dst_width) { int i = 0; -#if CV_SIMD - const int VECSZ = v_uint8::nlanes; - static const v_uint16 v_fixedRound = vx_setall_u16((uint16_t)((1U << 8) >> 1)); +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); + const v_uint16 v_fixedRound = vx_setall_u16((uint16_t)((1U << 8) >> 1)); for (; i <= dst_width - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) { v_uint16 v_src0 = vx_load((uint16_t*)src); v_uint16 v_src1 = vx_load((uint16_t*)src + VECSZ/2); - v_uint16 v_res0 = (v_src0 + v_fixedRound) >> 8; - v_uint16 v_res1 = (v_src1 + v_fixedRound) >> 8; + v_uint16 v_res0 = v_shr<8>(v_add(v_src0, v_fixedRound)); + v_uint16 v_res1 = v_shr<8>(v_add(v_src1, v_fixedRound)); v_store(dst, v_pack(v_res0, v_res1)); } @@ -693,11 +693,11 @@ void vlineResize(ufixedpoint16* src, size_t src_step, { int i = 0; ufixedpoint16* src1 = 
src + src_step; -#if CV_SIMD - const int VECSZ = v_uint8::nlanes; - static const v_int32 v_fixedRound = vx_setall_s32((int32_t)((1 << 16) >> 1)); - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1<<15)); - static const v_int8 v_128_16 = v_reinterpret_as_s8 (vx_setall_u8 ((uint8_t) 1<<7)); +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); + const v_int32 v_fixedRound = vx_setall_s32((int32_t)((1 << 16) >> 1)); + const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1<<15)); + const v_int8 v_128_16 = v_reinterpret_as_s8 (vx_setall_u8 ((uint8_t) 1<<7)); v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u32(((uint32_t*)m)[0])); for (; i <= dst_width - VECSZ; i += VECSZ, src += VECSZ, src1 += VECSZ, dst += VECSZ) @@ -716,10 +716,10 @@ void vlineResize(ufixedpoint16* src, size_t src_step, v_int32 v_res2 = v_dotprod(v_tmp0, v_mul); v_int32 v_res3 = v_dotprod(v_tmp1, v_mul); - v_int8 v_res = v_pack(v_pack((v_res0 + v_fixedRound) >> 16, - (v_res1 + v_fixedRound) >> 16), - v_pack((v_res2 + v_fixedRound) >> 16, - (v_res3 + v_fixedRound) >> 16)); + v_int8 v_res = v_pack(v_pack(v_shr<16>(v_add(v_res0, v_fixedRound)), + v_shr<16>(v_add(v_res1, v_fixedRound))), + v_pack(v_shr<16>(v_add(v_res2, v_fixedRound)), + v_shr<16>(v_add(v_res3, v_fixedRound)))); v_store(dst, v_reinterpret_as_u8(v_sub_wrap(v_res, v_128_16))); } @@ -828,7 +828,7 @@ class resize_bitExactInvoker : hResize((ET*)(src + (src_height - 1) * src_step), cn, xoffsets, xcoeffs, endline, min_x, max_x, dst_width); for (; dy < range.end; dy++) vlineSet(endline, (ET*)(dst + dst_step * dy), dst_width*cn); -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) vx_cleanup(); #endif } @@ -1136,16 +1136,16 @@ class resizeNN_bitexactInvoker : public ParallelLoopBody switch( pix_size ) { case 1: -#if CV_SIMD - for( ; x <= dsize.width - v_uint8::nlanes; x += v_uint8::nlanes ) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for( ; x <= dsize.width - VTraits::vlanes(); x += VTraits::vlanes() ) v_store(D + x, vx_lut(S, x_ofse + x)); #endif for( ; x < dsize.width; x++ ) D[x] = S[x_ofse[x]]; break; case 2: -#if CV_SIMD - for( ; x <= dsize.width - v_uint16::nlanes; x += v_uint16::nlanes ) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for( ; x <= dsize.width - VTraits::vlanes(); x += VTraits::vlanes() ) v_store((ushort*)D + x, vx_lut((ushort*)S, x_ofse + x)); #endif for( ; x < dsize.width; x++ ) @@ -1159,8 +1159,8 @@ class resizeNN_bitexactInvoker : public ParallelLoopBody } break; case 4: -#if CV_SIMD - for( ; x <= dsize.width - v_uint32::nlanes; x += v_uint32::nlanes ) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for( ; x <= dsize.width - VTraits::vlanes(); x += VTraits::vlanes() ) v_store((uint32_t*)D + x, vx_lut((uint32_t*)S, x_ofse + x)); #endif for( ; x < dsize.width; x++ ) @@ -1175,8 +1175,8 @@ class resizeNN_bitexactInvoker : public ParallelLoopBody } break; case 8: -#if CV_SIMD - for( ; x <= dsize.width - v_uint64::nlanes; x += v_uint64::nlanes ) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for( ; x <= dsize.width - VTraits::vlanes(); x += VTraits::vlanes() ) v_store((uint64_t*)D + x, vx_lut((uint64_t*)S, x_ofse + x)); #endif for( ; x < dsize.width; x++ ) @@ -1250,7 +1250,7 @@ struct HResizeNoVec } }; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) struct VResizeLinearVec_32s8u { @@ -1260,22 +1260,17 @@ struct VResizeLinearVec_32s8u int x = 0; v_int16 b0 = vx_setall_s16(beta[0]), b1 = vx_setall_s16(beta[1]); - if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) - for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) - 
v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load_aligned(S0 + x ) >> 4, vx_load_aligned(S0 + x + v_int32::nlanes) >> 4), b0) + - v_mul_hi(v_pack(vx_load_aligned(S1 + x ) >> 4, vx_load_aligned(S1 + x + v_int32::nlanes) >> 4), b1), - v_mul_hi(v_pack(vx_load_aligned(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S0 + x + 3 * v_int32::nlanes) >> 4), b0) + - v_mul_hi(v_pack(vx_load_aligned(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S1 + x + 3 * v_int32::nlanes) >> 4), b1))); + if( (((size_t)S0|(size_t)S1)&(VTraits::vlanes() - 1)) == 0 ) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) + v_store(dst + x, v_rshr_pack_u<2>(v_add(v_mul_hi(v_pack(v_shr<4>(vx_load_aligned(S0 + x)), v_shr<4>(vx_load_aligned(S0 + x + VTraits::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load_aligned(S1 + x)), v_shr<4>(vx_load_aligned(S1 + x + VTraits::vlanes()))), b1)), + v_add(v_mul_hi(v_pack(v_shr<4>(vx_load_aligned(S0 + x + 2 * VTraits::vlanes())), v_shr<4>(vx_load_aligned(S0 + x + 3 * VTraits::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load_aligned(S1 + x + 2 * VTraits::vlanes())), v_shr<4>(vx_load_aligned(S1 + x + 3 * VTraits::vlanes()))), b1)))); else - for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) - v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load(S0 + x ) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) + - v_mul_hi(v_pack(vx_load(S1 + x ) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1), - v_mul_hi(v_pack(vx_load(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load(S0 + x + 3 * v_int32::nlanes) >> 4), b0) + - v_mul_hi(v_pack(vx_load(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load(S1 + x + 3 * v_int32::nlanes) >> 4), b1))); + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) + v_store(dst + x, v_rshr_pack_u<2>(v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x)), v_shr<4>(vx_load(S0 + x + VTraits::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x)), v_shr<4>(vx_load(S1 + x + VTraits::vlanes()))), b1)), + v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x + 2 * VTraits::vlanes())), v_shr<4>(vx_load(S0 + x + 3 * VTraits::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x + 2 * VTraits::vlanes())), v_shr<4>(vx_load(S1 + x + 3 * VTraits::vlanes()))), b1)))); - for( ; x < width - v_int16::nlanes; x += v_int16::nlanes) - v_rshr_pack_u_store<2>(dst + x, v_mul_hi(v_pack(vx_load(S0 + x) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) + - v_mul_hi(v_pack(vx_load(S1 + x) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1)); + for( ; x < width - VTraits::vlanes(); x += VTraits::vlanes()) + v_rshr_pack_u_store<2>(dst + x, v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x)), v_shr<4>(vx_load(S0 + x + VTraits::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x)), v_shr<4>(vx_load(S1 + x + VTraits::vlanes()))), b1))); return x; } @@ -1290,17 +1285,17 @@ struct VResizeLinearVec_32f16u v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]); - if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) - for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) - v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, vx_load_aligned(S1 + x ) * b1)), - v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1)))); + if( (((size_t)S0|(size_t)S1)&(VTraits::vlanes() - 1)) == 0 ) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) + v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, v_mul(vx_load_aligned(S1 + x), b1))), + 
v_round(v_muladd(vx_load_aligned(S0 + x + VTraits::vlanes()), b0, v_mul(vx_load_aligned(S1 + x + VTraits::vlanes()), b1))))); else - for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) - v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, vx_load(S1 + x ) * b1)), - v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1)))); - for( ; x < width - v_float32::nlanes; x += v_float32::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) + v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, v_mul(vx_load(S1 + x), b1))), + v_round(v_muladd(vx_load(S0 + x + VTraits::vlanes()), b0, v_mul(vx_load(S1 + x + VTraits::vlanes()), b1))))); + for( ; x < width - VTraits::vlanes(); x += VTraits::vlanes()) { - v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1)); + v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, v_mul(vx_load(S1 + x), b1))); v_store_low(dst + x, v_pack_u(t0, t0)); } @@ -1317,17 +1312,17 @@ struct VResizeLinearVec_32f16s v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]); - if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) - for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) - v_store(dst + x, v_pack(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, vx_load_aligned(S1 + x ) * b1)), - v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1)))); + if( (((size_t)S0|(size_t)S1)&(VTraits::vlanes() - 1)) == 0 ) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) + v_store(dst + x, v_pack(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, v_mul(vx_load_aligned(S1 + x), b1))), + v_round(v_muladd(vx_load_aligned(S0 + x + VTraits::vlanes()), b0, v_mul(vx_load_aligned(S1 + x + VTraits::vlanes()), b1))))); else - for (; x <= width - v_int16::nlanes; x += v_int16::nlanes) - v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, vx_load(S1 + x ) * b1)), - v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1)))); - for( ; x < width - v_float32::nlanes; x += v_float32::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) + v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, v_mul(vx_load(S1 + x), b1))), + v_round(v_muladd(vx_load(S0 + x + VTraits::vlanes()), b0, v_mul(vx_load(S1 + x + VTraits::vlanes()), b1))))); + for( ; x < width - VTraits::vlanes(); x += VTraits::vlanes()) { - v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1)); + v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, v_mul(vx_load(S1 + x), b1))); v_store_low(dst + x, v_pack(t0, t0)); } @@ -1344,12 +1339,12 @@ struct VResizeLinearVec_32f v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]); - if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) - for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) - v_store(dst + x, v_muladd(vx_load_aligned(S0 + x), b0, vx_load_aligned(S1 + x) * b1)); + if( (((size_t)S0|(size_t)S1)&(VTraits::vlanes() - 1)) == 0 ) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) + v_store(dst + x, v_muladd(vx_load_aligned(S0 + x), b0, v_mul(vx_load_aligned(S1 + x), b1))); else - for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) - v_store(dst + x, v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1)); + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) + v_store(dst + x, v_muladd(vx_load(S0 + x), b0, v_mul(vx_load(S1 + x), b1))); return x; } 
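
A second pattern visible in these files (the idx[] tables in PyrDownVecH and the ofst3[] buffer in hlineResizeCn): stack arrays that used to be sized with `v_float32::nlanes` are now sized with the compile-time upper bound `VTraits<...>::max_nlanes`, because with scalable vectors the actual lane count is only known at run time; loop bounds and index arithmetic keep using the run-time `vlanes()`. A sketch under the same assumptions as above (the names `gather_triples` and `ofst` are illustrative, not from the patch):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // row[x] = src[3*ofst[x]]: gather the first channel of 3-channel pixels.
    static void gather_triples(const float* src, const int* ofst, float* row, int n)
    {
        int x = 0;
    #if (CV_SIMD || CV_SIMD_SCALABLE)
        // Compile-time upper bound gives a valid array size even for scalable ISAs.
        int idx3[VTraits<v_int32>::max_nlanes];
        const int vl = VTraits<v_int32>::vlanes();     // run-time lane count
        for (; x <= n - vl; x += vl)
        {
            for (int i = 0; i < vl; i++)
                idx3[i] = 3 * ofst[x + i];             // byte-free element offsets
            v_store(row + x, vx_lut(src, idx3));       // one gathered lane per index
        }
        vx_cleanup();
    #endif
        for (; x < n; x++)                             // scalar tail
            row[x] = src[3 * ofst[x]];
    }
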
@@ -1367,26 +1362,26 @@ struct VResizeCubicVec_32s8u v_float32 b0 = vx_setall_f32(beta[0] * scale), b1 = vx_setall_f32(beta[1] * scale), b2 = vx_setall_f32(beta[2] * scale), b3 = vx_setall_f32(beta[3] * scale); - if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&(CV_SIMD_WIDTH - 1)) == 0 ) - for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&(VTraits::vlanes() - 1)) == 0 ) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x )), b0, v_muladd(v_cvt_f32(vx_load_aligned(S1 + x )), b1, v_muladd(v_cvt_f32(vx_load_aligned(S2 + x )), b2, - v_cvt_f32(vx_load_aligned(S3 + x )) * b3)))), - v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x + v_float32::nlanes)), b0, - v_muladd(v_cvt_f32(vx_load_aligned(S1 + x + v_float32::nlanes)), b1, - v_muladd(v_cvt_f32(vx_load_aligned(S2 + x + v_float32::nlanes)), b2, - v_cvt_f32(vx_load_aligned(S3 + x + v_float32::nlanes)) * b3)))))); + v_mul(v_cvt_f32(vx_load_aligned(S3 + x)), b3))))), + v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x + VTraits::vlanes())), b0, + v_muladd(v_cvt_f32(vx_load_aligned(S1 + x + VTraits::vlanes())), b1, + v_muladd(v_cvt_f32(vx_load_aligned(S2 + x + VTraits::vlanes())), b2, + v_mul(v_cvt_f32(vx_load_aligned(S3 + x + VTraits::vlanes())), b3))))))); else - for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + x )), b0, v_muladd(v_cvt_f32(vx_load(S1 + x )), b1, v_muladd(v_cvt_f32(vx_load(S2 + x )), b2, - v_cvt_f32(vx_load(S3 + x )) * b3)))), - v_round(v_muladd(v_cvt_f32(vx_load(S0 + x + v_float32::nlanes)), b0, - v_muladd(v_cvt_f32(vx_load(S1 + x + v_float32::nlanes)), b1, - v_muladd(v_cvt_f32(vx_load(S2 + x + v_float32::nlanes)), b2, - v_cvt_f32(vx_load(S3 + x + v_float32::nlanes)) * b3)))))); + v_mul(v_cvt_f32(vx_load(S3 + x)), b3))))), + v_round(v_muladd(v_cvt_f32(vx_load(S0 + x + VTraits::vlanes())), b0, + v_muladd(v_cvt_f32(vx_load(S1 + x + VTraits::vlanes())), b1, + v_muladd(v_cvt_f32(vx_load(S2 + x + VTraits::vlanes())), b2, + v_mul(v_cvt_f32(vx_load(S3 + x + VTraits::vlanes())), b3))))))); return x; } }; @@ -1400,15 +1395,15 @@ struct VResizeCubicVec_32f16u v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]); - for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, v_muladd(vx_load(S1 + x ), b1, v_muladd(vx_load(S2 + x ), b2, - vx_load(S3 + x ) * b3)))), - v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, - v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, - v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, - vx_load(S3 + x + v_float32::nlanes) * b3)))))); + v_mul(vx_load(S3 + x), b3))))), + v_round(v_muladd(vx_load(S0 + x + VTraits::vlanes()), b0, + v_muladd(vx_load(S1 + x + VTraits::vlanes()), b1, + v_muladd(vx_load(S2 + x + VTraits::vlanes()), b2, + v_mul(vx_load(S3 + x + VTraits::vlanes()), b3))))))); return x; } @@ -1423,15 +1418,15 @@ struct VResizeCubicVec_32f16s v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]); - for (; x <= width - v_int16::nlanes; x += v_int16::nlanes) + for (; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) 
v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, v_muladd(vx_load(S1 + x ), b1, v_muladd(vx_load(S2 + x ), b2, - vx_load(S3 + x ) * b3)))), - v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, - v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, - v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, - vx_load(S3 + x + v_float32::nlanes) * b3)))))); + v_mul(vx_load(S3 + x), b3))))), + v_round(v_muladd(vx_load(S0 + x + VTraits::vlanes()), b0, + v_muladd(vx_load(S1 + x + VTraits::vlanes()), b1, + v_muladd(vx_load(S2 + x + VTraits::vlanes()), b2, + v_mul(vx_load(S3 + x + VTraits::vlanes()), b3))))))); return x; } @@ -1446,11 +1441,11 @@ struct VResizeCubicVec_32f v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]); - for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) v_store(dst + x, v_muladd(vx_load(S0 + x), b0, v_muladd(vx_load(S1 + x), b1, v_muladd(vx_load(S2 + x), b2, - vx_load(S3 + x) * b3)))); + v_mul(vx_load(S3 + x), b3))))); return x; } @@ -1484,7 +1479,7 @@ struct VResizeLanczos4Vec_32f16u b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]), b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]); - for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, v_muladd(vx_load(S1 + x ), b1, v_muladd(vx_load(S2 + x ), b2, @@ -1492,15 +1487,15 @@ struct VResizeLanczos4Vec_32f16u v_muladd(vx_load(S4 + x ), b4, v_muladd(vx_load(S5 + x ), b5, v_muladd(vx_load(S6 + x ), b6, - vx_load(S7 + x ) * b7)))))))), - v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, - v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, - v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, - v_muladd(vx_load(S3 + x + v_float32::nlanes), b3, - v_muladd(vx_load(S4 + x + v_float32::nlanes), b4, - v_muladd(vx_load(S5 + x + v_float32::nlanes), b5, - v_muladd(vx_load(S6 + x + v_float32::nlanes), b6, - vx_load(S7 + x + v_float32::nlanes) * b7)))))))))); + v_mul(vx_load(S7 + x ), b7))))))))), + v_round(v_muladd(vx_load(S0 + x + VTraits::vlanes()), b0, + v_muladd(vx_load(S1 + x + VTraits::vlanes()), b1, + v_muladd(vx_load(S2 + x + VTraits::vlanes()), b2, + v_muladd(vx_load(S3 + x + VTraits::vlanes()), b3, + v_muladd(vx_load(S4 + x + VTraits::vlanes()), b4, + v_muladd(vx_load(S5 + x + VTraits::vlanes()), b5, + v_muladd(vx_load(S6 + x + VTraits::vlanes()), b6, + v_mul(vx_load(S7 + x + VTraits::vlanes()), b7))))))))))); return x; } @@ -1520,7 +1515,7 @@ struct VResizeLanczos4Vec_32f16s b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]), b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]); - for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, v_muladd(vx_load(S1 + x ), b1, v_muladd(vx_load(S2 + x ), b2, @@ -1528,15 +1523,15 @@ struct VResizeLanczos4Vec_32f16s v_muladd(vx_load(S4 + x ), b4, v_muladd(vx_load(S5 + x ), b5, v_muladd(vx_load(S6 + x ), b6, - vx_load(S7 + x ) * b7)))))))), - v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, - v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, - v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, - v_muladd(vx_load(S3 + x + v_float32::nlanes), b3, - v_muladd(vx_load(S4 + x + v_float32::nlanes), b4, - v_muladd(vx_load(S5 + x + 
v_float32::nlanes), b5, - v_muladd(vx_load(S6 + x + v_float32::nlanes), b6, - vx_load(S7 + x + v_float32::nlanes) * b7)))))))))); + v_mul(vx_load(S7 + x), b7))))))))), + v_round(v_muladd(vx_load(S0 + x + VTraits::vlanes()), b0, + v_muladd(vx_load(S1 + x + VTraits::vlanes()), b1, + v_muladd(vx_load(S2 + x + VTraits::vlanes()), b2, + v_muladd(vx_load(S3 + x + VTraits::vlanes()), b3, + v_muladd(vx_load(S4 + x + VTraits::vlanes()), b4, + v_muladd(vx_load(S5 + x + VTraits::vlanes()), b5, + v_muladd(vx_load(S6 + x + VTraits::vlanes()), b6, + v_mul(vx_load(S7 + x + VTraits::vlanes()), b7))))))))))); return x; } @@ -1555,7 +1550,7 @@ struct VResizeLanczos4Vec_32f b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]), b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]); - for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + for( ; x <= width - VTraits::vlanes(); x += VTraits::vlanes()) v_store(dst + x, v_muladd(vx_load(S0 + x), b0, v_muladd(vx_load(S1 + x), b1, v_muladd(vx_load(S2 + x), b2, @@ -1563,7 +1558,7 @@ struct VResizeLanczos4Vec_32f v_muladd(vx_load(S4 + x), b4, v_muladd(vx_load(S5 + x), b5, v_muladd(vx_load(S6 + x), b6, - vx_load(S7 + x) * b7)))))))); + v_mul(vx_load(S7 + x), b7))))))))); return x; } @@ -1620,8 +1615,8 @@ struct HResizeLinearVec_X4 DVT s1(S0[sx0+cn], S0[sx1+cn], S0[sx2+cn], S0[sx3+cn]); DVT s0_u(S1[sx0], S1[sx1], S1[sx2], S1[sx3]); DVT s1_u(S1[sx0+cn], S1[sx1+cn], S1[sx2+cn], S1[sx3+cn]); - v_store(&D1[dx], s0_u * a_even + s1_u * a_odd); - v_store(&D0[dx], s0 * a_even + s1 * a_odd); + v_store(&D1[dx], v_add(v_mul(s0_u, a_even), v_mul(s1_u, a_odd))); + v_store(&D0[dx], v_add(v_mul(s0, a_even), v_mul(s1, a_odd))); } } for( ; k < count; k++ ) @@ -1640,7 +1635,7 @@ struct HResizeLinearVec_X4 v_load_deinterleave(&alpha[dx*2], a_even, a_odd); DVT s0(S[sx0], S[sx1], S[sx2], S[sx3]); DVT s1(S[sx0+cn], S[sx1+cn], S[sx2+cn], S[sx3+cn]); - v_store(&D[dx], s0 * a_even + s1 * a_odd); + v_store(&D[dx], v_add(v_mul(s0, a_even), v_mul(s1, a_odd))); } } return dx; @@ -1752,8 +1747,8 @@ struct HResizeLinearVecU8_X4 for( dx = 0; (xofs[dx] + cn) < smax; dx += cn ) { v_int16x8 a = v_load(alpha+dx*2); - v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S0+xofs[dx]) | (v_load_expand_q(S0+xofs[dx]+cn)<<16)), a)); - v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S1+xofs[dx]) | (v_load_expand_q(S1+xofs[dx]+cn)<<16)), a)); + v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_or(v_load_expand_q(S0 + xofs[dx]), v_shl<16>(v_load_expand_q(S0 + xofs[dx] + cn)))), a)); + v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_or(v_load_expand_q(S1 + xofs[dx]), v_shl<16>(v_load_expand_q(S1 + xofs[dx] + cn)))), a)); } } for( ; k < count; k++ ) @@ -1763,7 +1758,7 @@ struct HResizeLinearVecU8_X4 for( dx = 0; (xofs[dx] + cn) < smax; dx += cn ) { v_int16x8 a = v_load(alpha+dx*2); - v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S+xofs[dx]) | (v_load_expand_q(S+xofs[dx]+cn)<<16)), a)); + v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_or(v_load_expand_q(S + xofs[dx]), v_shl<16>(v_load_expand_q(S + xofs[dx] + cn)))), a)); } } /* Debug check to ensure truthiness that we never vector the final value. 
*/ @@ -2452,27 +2447,27 @@ class ResizeAreaFastVec_SIMD_8u if (cn == 1) { v_uint16 masklow = vx_setall_u16(0x00ff); - for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += v_uint8::nlanes, S1 += v_uint8::nlanes, D += v_uint16::nlanes) + for ( ; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += VTraits::vlanes(), S1 += VTraits::vlanes(), D += VTraits::vlanes()) { v_uint16 r0 = v_reinterpret_as_u16(vx_load(S0)); v_uint16 r1 = v_reinterpret_as_u16(vx_load(S1)); - v_rshr_pack_store<2>(D, (r0 >> 8) + (r0 & masklow) + (r1 >> 8) + (r1 & masklow)); + v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_shr<8>(r0), v_and(r0, masklow)), v_shr<8>(r1)), v_and(r1, masklow))); } } else if (cn == 3) { if (CV_SIMD_WIDTH > 64) return 0; - for ( ; dx <= w - 3*v_uint8::nlanes; dx += 3*v_uint8::nlanes, S0 += 6*v_uint8::nlanes, S1 += 6*v_uint8::nlanes, D += 3*v_uint8::nlanes) + for ( ; dx <= w - 3*VTraits::vlanes(); dx += 3*VTraits::vlanes(), S0 += 6*VTraits::vlanes(), S1 += 6*VTraits::vlanes(), D += 3*VTraits::vlanes()) { v_uint16 t0, t1, t2, t3, t4, t5; v_uint16 s0, s1, s2, s3, s4, s5; - s0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); - s1 = vx_load_expand(S0 + v_uint16::nlanes) + vx_load_expand(S1 + v_uint16::nlanes); - s2 = vx_load_expand(S0 + 2*v_uint16::nlanes) + vx_load_expand(S1 + 2*v_uint16::nlanes); - s3 = vx_load_expand(S0 + 3*v_uint16::nlanes) + vx_load_expand(S1 + 3*v_uint16::nlanes); - s4 = vx_load_expand(S0 + 4*v_uint16::nlanes) + vx_load_expand(S1 + 4*v_uint16::nlanes); - s5 = vx_load_expand(S0 + 5*v_uint16::nlanes) + vx_load_expand(S1 + 5*v_uint16::nlanes); + s0 = v_add(vx_load_expand(S0), vx_load_expand(S1)); + s1 = v_add(vx_load_expand(S0 + VTraits::vlanes()), vx_load_expand(S1 + VTraits::vlanes())); + s2 = v_add(vx_load_expand(S0 + 2 * VTraits::vlanes()), vx_load_expand(S1 + 2 * VTraits::vlanes())); + s3 = v_add(vx_load_expand(S0 + 3 * VTraits::vlanes()), vx_load_expand(S1 + 3 * VTraits::vlanes())); + s4 = v_add(vx_load_expand(S0 + 4 * VTraits::vlanes()), vx_load_expand(S1 + 4 * VTraits::vlanes())); + s5 = v_add(vx_load_expand(S0 + 5 * VTraits::vlanes()), vx_load_expand(S1 + 5 * VTraits::vlanes())); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); @@ -2481,18 +2476,18 @@ class ResizeAreaFastVec_SIMD_8u bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; #elif CV_SIMD_WIDTH == 32 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); - bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; + bl = v_add(s0, s3); gl = v_add(s1, s4); rl = v_add(s2, s5); #elif CV_SIMD_WIDTH == 64 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; #endif - s0 = vx_load_expand(S0 + 6*v_uint16::nlanes) + vx_load_expand(S1 + 6*v_uint16::nlanes); - s1 = vx_load_expand(S0 + 7*v_uint16::nlanes) + vx_load_expand(S1 + 7*v_uint16::nlanes); - s2 = vx_load_expand(S0 + 8*v_uint16::nlanes) + vx_load_expand(S1 + 8*v_uint16::nlanes); - s3 = vx_load_expand(S0 + 9*v_uint16::nlanes) + vx_load_expand(S1 + 9*v_uint16::nlanes); - s4 = vx_load_expand(S0 +10*v_uint16::nlanes) + vx_load_expand(S1 +10*v_uint16::nlanes); - s5 = vx_load_expand(S0 +11*v_uint16::nlanes) + vx_load_expand(S1 +11*v_uint16::nlanes); + s0 = v_add(vx_load_expand(S0 + 6 * VTraits::vlanes()), vx_load_expand(S1 + 6 * VTraits::vlanes())); + s1 = v_add(vx_load_expand(S0 + 7 * 
VTraits::vlanes()), vx_load_expand(S1 + 7 * VTraits::vlanes())); + s2 = v_add(vx_load_expand(S0 + 8 * VTraits::vlanes()), vx_load_expand(S1 + 8 * VTraits::vlanes())); + s3 = v_add(vx_load_expand(S0 + 9 * VTraits::vlanes()), vx_load_expand(S1 + 9 * VTraits::vlanes())); + s4 = v_add(vx_load_expand(S0 + 10 * VTraits::vlanes()), vx_load_expand(S1 + 10 * VTraits::vlanes())); + s5 = v_add(vx_load_expand(S0 + 11 * VTraits::vlanes()), vx_load_expand(S1 + 11 * VTraits::vlanes())); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); @@ -2501,7 +2496,7 @@ class ResizeAreaFastVec_SIMD_8u bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; #elif CV_SIMD_WIDTH == 32 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); - bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; + bh = v_add(s0, s3); gh = v_add(s1, s4); rh = v_add(s2, s5); #elif CV_SIMD_WIDTH == 64 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); @@ -2513,7 +2508,7 @@ class ResizeAreaFastVec_SIMD_8u else { CV_Assert(cn == 4); - for ( ; dx <= w - v_uint8::nlanes; dx += v_uint8::nlanes, S0 += 2*v_uint8::nlanes, S1 += 2*v_uint8::nlanes, D += v_uint8::nlanes) + for ( ; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += 2*VTraits::vlanes(), S1 += 2*VTraits::vlanes(), D += VTraits::vlanes()) { v_uint32 r00, r01, r10, r11; v_load_deinterleave((uint32_t*)S0, r00, r01); @@ -2524,7 +2519,7 @@ class ResizeAreaFastVec_SIMD_8u v_expand(v_reinterpret_as_u8(r01), r01l, r01h); v_expand(v_reinterpret_as_u8(r10), r10l, r10h); v_expand(v_reinterpret_as_u8(r11), r11l, r11h); - v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h)); + v_store(D, v_rshr_pack<2>(v_add(v_add(v_add(r00l, r01l), r10l), r11l), v_add(v_add(v_add(r00h, r01h), r10h), r11h))); } } @@ -2551,11 +2546,11 @@ class ResizeAreaFastVec_SIMD_16u if (cn == 1) { v_uint32 masklow = vx_setall_u32(0x0000ffff); - for (; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes) + for (; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += VTraits::vlanes(), S1 += VTraits::vlanes(), D += VTraits::vlanes()) { v_uint32 r0 = v_reinterpret_as_u32(vx_load(S0)); v_uint32 r1 = v_reinterpret_as_u32(vx_load(S1)); - v_rshr_pack_store<2>(D, (r0 >> 16) + (r0 & masklow) + (r1 >> 16) + (r1 & masklow)); + v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_shr<16>(r0), v_and(r0, masklow)), v_shr<16>(r1)), v_and(r1, masklow))); } } else if (cn == 3) @@ -2574,38 +2569,38 @@ class ResizeAreaFastVec_SIMD_16u v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3)); #endif #elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64 - for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes) + for ( ; dx <= w - 3*VTraits::vlanes(); dx += 3*VTraits::vlanes(), S0 += 6*VTraits::vlanes(), S1 += 6*VTraits::vlanes(), D += 3*VTraits::vlanes()) { v_uint32 t0, t1, t2, t3, t4, t5; v_uint32 s0, s1, s2, s3, s4, s5; - s0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); - s1 = vx_load_expand(S0 + v_uint32::nlanes) + vx_load_expand(S1 + v_uint32::nlanes); - s2 = vx_load_expand(S0 + 2*v_uint32::nlanes) + vx_load_expand(S1 + 2*v_uint32::nlanes); - s3 = vx_load_expand(S0 + 3*v_uint32::nlanes) + 
vx_load_expand(S1 + 3*v_uint32::nlanes); - s4 = vx_load_expand(S0 + 4*v_uint32::nlanes) + vx_load_expand(S1 + 4*v_uint32::nlanes); - s5 = vx_load_expand(S0 + 5*v_uint32::nlanes) + vx_load_expand(S1 + 5*v_uint32::nlanes); + s0 = v_add(vx_load_expand(S0), vx_load_expand(S1)); + s1 = v_add(vx_load_expand(S0 + VTraits::vlanes()), vx_load_expand(S1 + VTraits::vlanes())); + s2 = v_add(vx_load_expand(S0 + 2 * VTraits::vlanes()), vx_load_expand(S1 + 2 * VTraits::vlanes())); + s3 = v_add(vx_load_expand(S0 + 3 * VTraits::vlanes()), vx_load_expand(S1 + 3 * VTraits::vlanes())); + s4 = v_add(vx_load_expand(S0 + 4 * VTraits::vlanes()), vx_load_expand(S1 + 4 * VTraits::vlanes())); + s5 = v_add(vx_load_expand(S0 + 5 * VTraits::vlanes()), vx_load_expand(S1 + 5 * VTraits::vlanes())); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_uint32 bl, gl, rl; v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); #if CV_SIMD_WIDTH == 32 - bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; + bl = v_add(t0, t3); gl = v_add(t1, t4); rl = v_add(t2, t5); #else //CV_SIMD_WIDTH == 64 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; #endif - s0 = vx_load_expand(S0 + 6*v_uint32::nlanes) + vx_load_expand(S1 + 6*v_uint32::nlanes); - s1 = vx_load_expand(S0 + 7*v_uint32::nlanes) + vx_load_expand(S1 + 7*v_uint32::nlanes); - s2 = vx_load_expand(S0 + 8*v_uint32::nlanes) + vx_load_expand(S1 + 8*v_uint32::nlanes); - s3 = vx_load_expand(S0 + 9*v_uint32::nlanes) + vx_load_expand(S1 + 9*v_uint32::nlanes); - s4 = vx_load_expand(S0 +10*v_uint32::nlanes) + vx_load_expand(S1 +10*v_uint32::nlanes); - s5 = vx_load_expand(S0 +11*v_uint32::nlanes) + vx_load_expand(S1 +11*v_uint32::nlanes); + s0 = v_add(vx_load_expand(S0 + 6 * VTraits::vlanes()), vx_load_expand(S1 + 6 * VTraits::vlanes())); + s1 = v_add(vx_load_expand(S0 + 7 * VTraits::vlanes()), vx_load_expand(S1 + 7 * VTraits::vlanes())); + s2 = v_add(vx_load_expand(S0 + 8 * VTraits::vlanes()), vx_load_expand(S1 + 8 * VTraits::vlanes())); + s3 = v_add(vx_load_expand(S0 + 9 * VTraits::vlanes()), vx_load_expand(S1 + 9 * VTraits::vlanes())); + s4 = v_add(vx_load_expand(S0 + 10 * VTraits::vlanes()), vx_load_expand(S1 + 10 * VTraits::vlanes())); + s5 = v_add(vx_load_expand(S0 + 11 * VTraits::vlanes()), vx_load_expand(S1 + 11 * VTraits::vlanes())); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_uint32 bh, gh, rh; v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); #if CV_SIMD_WIDTH == 32 - bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; + bh = v_add(t0, t3); gh = v_add(t1, t4); rh = v_add(t2, t5); #else //CV_SIMD_WIDTH == 64 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; @@ -2649,19 +2644,19 @@ class ResizeAreaFastVec_SIMD_16u v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h)); } #else - for ( ; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes) + for ( ; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += VTraits::vlanes(), S1 += VTraits::vlanes(), D += VTraits::vlanes()) { v_uint32 r0, r1, r2, r3; v_expand(vx_load(S0), r0, r1); v_expand(vx_load(S1), r2, r3); - r0 += r2; r1 += r3; + r0 = v_add(r0, r2); r1 = v_add(r1, r3); v_uint32 v_d; #if CV_SIMD_WIDTH == 16 v_d = r0 + 
r1; #elif CV_SIMD_WIDTH == 32 v_uint32 t0, t1; v_recombine(r0, r1, t0, t1); - v_d = t0 + t1; + v_d = v_add(t0, t1); #endif v_rshr_pack_store<2>(D, v_d); } @@ -2691,11 +2686,11 @@ class ResizeAreaFastVec_SIMD_16s if (cn == 1) { v_int32 masklow = vx_setall_s32(0x0000ffff); - for (; dx <= w - v_int32::nlanes; dx += v_int32::nlanes, S0 += v_int16::nlanes, S1 += v_int16::nlanes, D += v_int32::nlanes) + for (; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += VTraits::vlanes(), S1 += VTraits::vlanes(), D += VTraits::vlanes()) { v_int32 r0 = v_reinterpret_as_s32(vx_load(S0)); v_int32 r1 = v_reinterpret_as_s32(vx_load(S1)); - v_rshr_pack_store<2>(D, (r0 >> 16) + (((r0 & masklow)<<16)>>16) + (r1 >> 16) + (((r1 & masklow)<<16)>>16)); + v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_shr<16>(r0), v_shr<16>(v_shl<16>(v_and(r0, masklow)))), v_shr<16>(r1)), v_shr<16>(v_shl<16>(v_and(r1, masklow))))); } } else if (cn == 3) @@ -2704,38 +2699,38 @@ class ResizeAreaFastVec_SIMD_16s for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3)); #elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64 - for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes) + for ( ; dx <= w - 3*VTraits::vlanes(); dx += 3*VTraits::vlanes(), S0 += 6*VTraits::vlanes(), S1 += 6*VTraits::vlanes(), D += 3*VTraits::vlanes()) { v_int32 t0, t1, t2, t3, t4, t5; v_int32 s0, s1, s2, s3, s4, s5; - s0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); - s1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes); - s2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes); - s3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes); - s4 = vx_load_expand(S0 + 4*v_int32::nlanes) + vx_load_expand(S1 + 4*v_int32::nlanes); - s5 = vx_load_expand(S0 + 5*v_int32::nlanes) + vx_load_expand(S1 + 5*v_int32::nlanes); + s0 = v_add(vx_load_expand(S0), vx_load_expand(S1)); + s1 = v_add(vx_load_expand(S0 + VTraits::vlanes()), vx_load_expand(S1 + VTraits::vlanes())); + s2 = v_add(vx_load_expand(S0 + 2 * VTraits::vlanes()), vx_load_expand(S1 + 2 * VTraits::vlanes())); + s3 = v_add(vx_load_expand(S0 + 3 * VTraits::vlanes()), vx_load_expand(S1 + 3 * VTraits::vlanes())); + s4 = v_add(vx_load_expand(S0 + 4 * VTraits::vlanes()), vx_load_expand(S1 + 4 * VTraits::vlanes())); + s5 = v_add(vx_load_expand(S0 + 5 * VTraits::vlanes()), vx_load_expand(S1 + 5 * VTraits::vlanes())); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_int32 bl, gl, rl; v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); #if CV_SIMD_WIDTH == 32 - bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; + bl = v_add(t0, t3); gl = v_add(t1, t4); rl = v_add(t2, t5); #else //CV_SIMD_WIDTH == 64 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; #endif - s0 = vx_load_expand(S0 + 6*v_int32::nlanes) + vx_load_expand(S1 + 6*v_int32::nlanes); - s1 = vx_load_expand(S0 + 7*v_int32::nlanes) + vx_load_expand(S1 + 7*v_int32::nlanes); - s2 = vx_load_expand(S0 + 8*v_int32::nlanes) + vx_load_expand(S1 + 8*v_int32::nlanes); - s3 = vx_load_expand(S0 + 9*v_int32::nlanes) + vx_load_expand(S1 + 9*v_int32::nlanes); - s4 = vx_load_expand(S0 +10*v_int32::nlanes) + vx_load_expand(S1 +10*v_int32::nlanes); - s5 = 
vx_load_expand(S0 +11*v_int32::nlanes) + vx_load_expand(S1 +11*v_int32::nlanes); + s0 = v_add(vx_load_expand(S0 + 6 * VTraits::vlanes()), vx_load_expand(S1 + 6 * VTraits::vlanes())); + s1 = v_add(vx_load_expand(S0 + 7 * VTraits::vlanes()), vx_load_expand(S1 + 7 * VTraits::vlanes())); + s2 = v_add(vx_load_expand(S0 + 8 * VTraits::vlanes()), vx_load_expand(S1 + 8 * VTraits::vlanes())); + s3 = v_add(vx_load_expand(S0 + 9 * VTraits::vlanes()), vx_load_expand(S1 + 9 * VTraits::vlanes())); + s4 = v_add(vx_load_expand(S0 + 10 * VTraits::vlanes()), vx_load_expand(S1 + 10 * VTraits::vlanes())); + s5 = v_add(vx_load_expand(S0 + 11 * VTraits::vlanes()), vx_load_expand(S1 + 11 * VTraits::vlanes())); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_int32 bh, gh, rh; v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); #if CV_SIMD_WIDTH == 32 - bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; + bh = v_add(t0, t3); gh = v_add(t1, t4); rh = v_add(t2, t5); #else //CV_SIMD_WIDTH == 64 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; @@ -2763,7 +2758,7 @@ class ResizeAreaFastVec_SIMD_16s else { CV_Assert(cn == 4); - for (; dx <= w - v_int16::nlanes; dx += v_int16::nlanes, S0 += 2 * v_int16::nlanes, S1 += 2 * v_int16::nlanes, D += v_int16::nlanes) + for (; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += 2 * VTraits::vlanes(), S1 += 2 * VTraits::vlanes(), D += VTraits::vlanes()) { #if CV_SIMD_WIDTH >= 64 v_int64 r00, r01, r10, r11; @@ -2778,17 +2773,17 @@ class ResizeAreaFastVec_SIMD_16s v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h)); #else v_int32 r0, r1, r2, r3; - r0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); - r1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes); - r2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes); - r3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes); + r0 = v_add(vx_load_expand(S0), vx_load_expand(S1)); + r1 = v_add(vx_load_expand(S0 + VTraits::vlanes()), vx_load_expand(S1 + VTraits::vlanes())); + r2 = v_add(vx_load_expand(S0 + 2 * VTraits::vlanes()), vx_load_expand(S1 + 2 * VTraits::vlanes())); + r3 = v_add(vx_load_expand(S0 + 3 * VTraits::vlanes()), vx_load_expand(S1 + 3 * VTraits::vlanes())); v_int32 dl, dh; #if CV_SIMD_WIDTH == 16 dl = r0 + r1; dh = r2 + r3; #elif CV_SIMD_WIDTH == 32 v_int32 t0, t1, t2, t3; v_recombine(r0, r1, t0, t1); v_recombine(r2, r3, t2, t3); - dl = t0 + t1; dh = t2 + t3; + dl = v_add(t0, t1); dh = v_add(t2, t3); #endif v_store(D, v_rshr_pack<2>(dl, dh)); #endif @@ -2822,12 +2817,12 @@ struct ResizeAreaFastVec_SIMD_32f if (cn == 1) { v_float32 v_025 = vx_setall_f32(0.25f); - for ( ; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes) + for ( ; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += 2*VTraits::vlanes(), S1 += 2*VTraits::vlanes(), D += VTraits::vlanes()) { v_float32 v_row00, v_row01, v_row10, v_row11; v_load_deinterleave(S0, v_row00, v_row01); v_load_deinterleave(S1, v_row10, v_row11); - v_store(D, ((v_row00 + v_row01) + (v_row10 + v_row11)) * v_025); + v_store(D, v_mul(v_add(v_add(v_row00, v_row01), v_add(v_row10, v_row11)), v_025)); } } else if (cn == 4) @@ -2841,8 +2836,8 @@ struct ResizeAreaFastVec_SIMD_32f for (; dx <= w - v_float32x8::nlanes; dx += 
v_float32x8::nlanes, S0 += 2*v_float32x8::nlanes, S1 += 2*v_float32x8::nlanes, D += v_float32x8::nlanes) { v_float32x8 dst0, dst1; - v_recombine(v256_load(S0) + v256_load(S1), v256_load(S0 + v_float32x8::nlanes) + v256_load(S1 + v_float32x8::nlanes), dst0, dst1); - v_store(D, (dst0 + dst1) * v_025); + v_recombine(v_add(v256_load(S0), v256_load(S1)), v_add(v256_load(S0 + v_float32x8::nlanes), v256_load(S1 + v_float32x8::nlanes)), dst0, dst1); + v_store(D, v_mul(v_add(dst0, dst1), v_025)); } #endif } diff --git a/modules/imgproc/src/smooth.simd.hpp b/modules/imgproc/src/smooth.simd.hpp index 62ff31ac940c..33e58d4e80b4 100644 --- a/modules/imgproc/src/smooth.simd.hpp +++ b/modules/imgproc/src/smooth.simd.hpp @@ -81,11 +81,11 @@ void hlineSmooth1N(const uint8_t* src, int cn, const ufi { int lencn = len*cn; int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - v_uint16 v_mul = vx_setall_u16(*((uint16_t*)m)); +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); + v_uint16 vmul = vx_setall_u16(*((uint16_t*)m)); for (; i <= lencn - VECSZ; i += VECSZ) - v_store((uint16_t*)dst + i, v_mul_wrap(v_mul, vx_load_expand(src + i))); + v_store((uint16_t*)dst + i, v_mul(vmul, vx_load_expand(src + i))); #endif for (; i < lencn; i++) dst[i] = m[0] * src[i]; @@ -101,8 +101,8 @@ void hlineSmooth1N1(const uint8_t* src, int cn, const uf { int lencn = len*cn; int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= lencn - VECSZ; i += VECSZ) v_store((uint16_t*)dst + i, v_shl<8>(vx_load_expand(src + i))); #endif @@ -168,16 +168,14 @@ void hlineSmooth3N(const uint8_t* src, int cn, const ufi src += cn; dst += cn; int i = cn, lencn = (len - 1)*cn; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) const uint16_t* _m = (const uint16_t*)m; - const int VECSZ = v_uint16::nlanes; + const int VECSZ = VTraits::vlanes(); v_uint16 v_mul0 = vx_setall_u16(_m[0]); v_uint16 v_mul1 = vx_setall_u16(_m[1]); v_uint16 v_mul2 = vx_setall_u16(_m[2]); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn), v_mul0) + - v_mul_wrap(vx_load_expand(src), v_mul1) + - v_mul_wrap(vx_load_expand(src + cn), v_mul2)); + v_store((uint16_t*)dst, v_add(v_add(v_mul(vx_load_expand(src - cn), v_mul0), v_mul(vx_load_expand(src), v_mul1)), v_mul(vx_load_expand(src + cn), v_mul2))); #endif for (; i < lencn; i++, src++, dst++) *dst = m[0] * src[-cn] + m[1] * src[0] + m[2] * src[cn]; @@ -220,10 +218,10 @@ void hlineSmooth3N121Impl(const ET* src, int cn, const FT*, int, FT* dst, int le src += cn; dst += cn; int i = cn, lencn = (len - 1)*cn; -#if CV_SIMD - const int VECSZ = VFT::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((typename FT::raw_t*)dst, (vx_load_expand(src - cn) + vx_load_expand(src + cn) + (vx_load_expand(src) << 1)) << (FT::fixedShift-2)); + v_store((typename FT::raw_t*)dst, v_shl<(FT::fixedShift-2)>(v_add(vx_load_expand(src - cn), vx_load_expand(src + cn), v_shl<1>((vx_load_expand(src)))))); #endif for (; i < lencn; i++, src++, dst++) *dst = (FT(src[-cn])>>2) + (FT(src[cn])>>2) + (FT(src[0])>>1); @@ -320,14 +318,13 @@ void hlineSmooth3Naba(const uint8_t* src, int cn, const src += cn; dst += cn; int i = cn, lencn = (len - 1)*cn; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) const uint16_t* _m = (const uint16_t*)m; - const int VECSZ = 
v_uint16::nlanes; + const int VECSZ = VTraits::vlanes(); v_uint16 v_mul0 = vx_setall_u16(_m[0]); v_uint16 v_mul1 = vx_setall_u16(_m[1]); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul0) + - v_mul_wrap(vx_load_expand(src), v_mul1)); + v_store((uint16_t*)dst, v_add(v_mul(v_add( vx_load_expand(src - cn), vx_load_expand(src + cn)), v_mul0), v_mul(vx_load_expand(src), v_mul1))); #endif for (; i < lencn; i++, src++, dst++) *((uint16_t*)dst) = saturate_cast(((uint16_t*)m)[1] * (uint32_t)(src[0]) + ((uint16_t*)m)[0] * ((uint32_t)(src[-cn]) + (uint32_t)(src[cn]))); @@ -514,20 +511,16 @@ void hlineSmooth5N(const uint8_t* src, int cn, const ufi src += 2 * cn; dst += 2 * cn; int i = 2*cn, lencn = (len - 2)*cn; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) const uint16_t* _m = (const uint16_t*)m; - const int VECSZ = v_uint16::nlanes; + const int VECSZ = VTraits::vlanes(); v_uint16 v_mul0 = vx_setall_u16(_m[0]); v_uint16 v_mul1 = vx_setall_u16(_m[1]); v_uint16 v_mul2 = vx_setall_u16(_m[2]); v_uint16 v_mul3 = vx_setall_u16(_m[3]); v_uint16 v_mul4 = vx_setall_u16(_m[4]); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn), v_mul0) + - v_mul_wrap(vx_load_expand(src - cn), v_mul1) + - v_mul_wrap(vx_load_expand(src), v_mul2) + - v_mul_wrap(vx_load_expand(src + cn), v_mul3) + - v_mul_wrap(vx_load_expand(src + 2 * cn), v_mul4)); + v_store((uint16_t*)dst, v_add(v_add(v_add(v_add(v_mul(vx_load_expand(src - 2 * cn), v_mul0), v_mul(vx_load_expand(src - cn), v_mul1)), v_mul(vx_load_expand(src), v_mul2)), v_mul(vx_load_expand(src + cn), v_mul3)), v_mul(vx_load_expand(src + 2 * cn), v_mul4))); #endif for (; i < lencn; i++, src++, dst++) *dst = m[0] * src[-2*cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2*cn]; @@ -726,11 +719,11 @@ void hlineSmooth5N14641(const uint8_t* src, int cn, cons src += 2 * cn; dst += 2 * cn; int i = 2 * cn, lencn = (len - 2)*cn; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); v_uint16 v_6 = vx_setall_u16(6); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, (v_mul_wrap(vx_load_expand(src), v_6) + ((vx_load_expand(src - cn) + vx_load_expand(src + cn)) << 2) + vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) << 4); + v_store((uint16_t*)dst, v_shl<4>(v_add(v_add(v_add(v_mul(vx_load_expand(src), v_6), v_shl<2>(v_add(vx_load_expand(src - cn), vx_load_expand(src + cn)))), vx_load_expand(src - 2 * cn)), vx_load_expand(src + 2 * cn)))); #endif for (; i < lencn; i++, src++, dst++) *((uint16_t*)dst) = (uint16_t(src[0]) * 6 + ((uint16_t(src[-cn]) + uint16_t(src[cn])) << 2) + uint16_t(src[-2 * cn]) + uint16_t(src[2 * cn])) << 4; @@ -924,16 +917,14 @@ void hlineSmooth5Nabcba(const uint8_t* src, int cn, cons src += 2 * cn; dst += 2 * cn; int i = 2 * cn, lencn = (len - 2)*cn; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) const uint16_t* _m = (const uint16_t*)m; - const int VECSZ = v_uint16::nlanes; + const int VECSZ = VTraits::vlanes(); v_uint16 v_mul0 = vx_setall_u16(_m[0]); v_uint16 v_mul1 = vx_setall_u16(_m[1]); v_uint16 v_mul2 = vx_setall_u16(_m[2]); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn), v_mul0) + - v_mul_wrap(vx_load_expand(src - 
cn) + vx_load_expand(src + cn), v_mul1) + - v_mul_wrap(vx_load_expand(src), v_mul2)); + v_store((uint16_t*)dst, v_add(v_add(v_mul(v_add(vx_load_expand(src - 2 * cn), vx_load_expand(src + 2 * cn)), v_mul0), v_mul(v_add(vx_load_expand(src - cn), vx_load_expand(src + cn)), v_mul1)), v_mul(vx_load_expand(src), v_mul2))); #endif for (; i < lencn; i++, src++, dst++) *((uint16_t*)dst) = saturate_cast(((uint16_t*)m)[0] * ((uint32_t)(src[-2 * cn]) + (uint32_t)(src[2 * cn])) + ((uint16_t*)m)[1] * ((uint32_t)(src[-cn]) + (uint32_t)(src[cn])) + ((uint16_t*)m)[2] * (uint32_t)(src[0])); @@ -1044,13 +1035,13 @@ void hlineSmooth(const uint8_t* src, int cn, const ufixe } i *= cn; int lencn = (len - post_shift + 1)*cn; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= lencn - VECSZ; i+=VECSZ, src+=VECSZ, dst+=VECSZ) { - v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src), vx_setall_u16(*((uint16_t*)m))); + v_uint16 v_res0 = v_mul(vx_load_expand(src), vx_setall_u16(*((uint16_t*)m))); for (int j = 1; j < n; j++) - v_res0 += v_mul_wrap(vx_load_expand(src + j * cn), vx_setall_u16(*((uint16_t*)(m + j)))); + v_res0 = v_add(v_res0, v_mul(vx_load_expand(src + j * cn), vx_setall_u16(*((uint16_t *)(m + j))))); v_store((uint16_t*)dst, v_res0); } #endif @@ -1163,13 +1154,13 @@ void hlineSmoothONa_yzy_a(const uint8_t* src, int cn, co } i *= cn; int lencn = (len - post_shift + 1)*cn; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) { - v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src + pre_shift * cn), vx_setall_u16(*((uint16_t*)(m + pre_shift)))); + v_uint16 v_res0 = v_mul(vx_load_expand(src + pre_shift * cn), vx_setall_u16(*((uint16_t*)(m + pre_shift)))); for (int j = 0; j < pre_shift; j ++) - v_res0 += v_mul_wrap(vx_load_expand(src + j * cn) + vx_load_expand(src + (n - 1 - j)*cn), vx_setall_u16(*((uint16_t*)(m + j)))); + v_res0 = v_add(v_res0, v_mul(v_add(vx_load_expand(src + j * cn), vx_load_expand(src + (n - 1 - j) * cn)), vx_setall_u16(*((uint16_t *)(m + j))))); v_store((uint16_t*)dst, v_res0); } #endif @@ -1228,8 +1219,8 @@ void hlineSmoothONa_yzy_a(const uint16_t* src, int cn, } i *= cn; int lencn = (len - post_shift + 1)*cn; -#if CV_SIMD - const int VECSZ = v_uint32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= lencn - VECSZ * 2; i += VECSZ * 2, src += VECSZ * 2, dst += VECSZ * 2) { v_uint32 v_res0, v_res1; @@ -1239,11 +1230,11 @@ void hlineSmoothONa_yzy_a(const uint16_t* src, int cn, v_uint16 v_weight = vx_setall_u16((uint16_t) *((uint32_t*)(m + j))); v_uint32 v_add0, v_add1; v_mul_expand(vx_load(src + j * cn), v_weight, v_add0, v_add1); - v_res0 += v_add0; - v_res1 += v_add1; + v_res0 = v_add(v_res0, v_add0); + v_res1 = v_add(v_res1, v_add1); v_mul_expand(vx_load(src + (n - 1 - j)*cn), v_weight, v_add0, v_add1); - v_res0 += v_add0; - v_res1 += v_add1; + v_res0 = v_add(v_res0, v_add0); + v_res1 = v_add(v_res1, v_add1); } v_store((uint32_t*)dst, v_res0); v_store((uint32_t*)dst + VECSZ, v_res1); @@ -1285,8 +1276,8 @@ void vlineSmooth1N(const ufixedpoint16* const * src, con { const ufixedpoint16* src0 = src[0]; int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); v_uint16 v_mul = vx_setall_u16(*((uint16_t*)m)<<1); for (; i <= len - VECSZ; i += VECSZ) 
v_rshr_pack_store<1>(dst + i, v_mul_hi(vx_load((uint16_t*)src0 + i), v_mul)); @@ -1306,8 +1297,8 @@ void vlineSmooth1N1(const ufixedpoint16* const * src, co { const ufixedpoint16* src0 = src[0]; int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= len - VECSZ; i += VECSZ) v_rshr_pack_store<8>(dst + i, vx_load((uint16_t*)(src0 + i))); #endif @@ -1324,10 +1315,10 @@ template <> void vlineSmooth3N(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len) { int i = 0; -#if CV_SIMD - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); +#if (CV_SIMD || CV_SIMD_SCALABLE) + const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); v_int32 v_128_4 = vx_setall_s32(128 << 16); - const int VECSZ = v_uint16::nlanes; + const int VECSZ = VTraits::vlanes(); if (len >= VECSZ) { ufixedpoint32 val[] = { (m[0] + m[1] + m[2]) * ufixedpoint16((uint8_t)128) }; @@ -1370,26 +1361,26 @@ void vlineSmooth3N(const ufixedpoint16* const * src, con v_src02 = vx_load(src2 + 2*VECSZ); v_src03 = vx_load(src2 + 3*VECSZ); v_mul_expand(v_add_wrap(v_src00, v_128), v_mul2, v_resj0, v_resj1); - v_res0 += v_resj0; - v_res1 += v_resj1; + v_res0 = v_add(v_res0, v_resj0); + v_res1 = v_add(v_res1, v_resj1); v_mul_expand(v_add_wrap(v_src01, v_128), v_mul2, v_resj0, v_resj1); - v_res2 += v_resj0; - v_res3 += v_resj1; + v_res2 = v_add(v_res2, v_resj0); + v_res3 = v_add(v_res3, v_resj1); v_mul_expand(v_add_wrap(v_src02, v_128), v_mul2, v_resj0, v_resj1); - v_res4 += v_resj0; - v_res5 += v_resj1; + v_res4 = v_add(v_res4, v_resj0); + v_res5 = v_add(v_res5, v_resj1); v_mul_expand(v_add_wrap(v_src03, v_128), v_mul2, v_resj0, v_resj1); - v_res6 += v_resj0; - v_res7 += v_resj1; - - v_res0 += v_128_4; - v_res1 += v_128_4; - v_res2 += v_128_4; - v_res3 += v_128_4; - v_res4 += v_128_4; - v_res5 += v_128_4; - v_res6 += v_128_4; - v_res7 += v_128_4; + v_res6 = v_add(v_res6, v_resj0); + v_res7 = v_add(v_res7, v_resj1); + + v_res0 = v_add(v_res0, v_128_4); + v_res1 = v_add(v_res1, v_128_4); + v_res2 = v_add(v_res2, v_128_4); + v_res3 = v_add(v_res3, v_128_4); + v_res4 = v_add(v_res4, v_128_4); + v_res5 = v_add(v_res5, v_128_4); + v_res6 = v_add(v_res6, v_128_4); + v_res7 = v_add(v_res7, v_128_4); v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); @@ -1410,8 +1401,8 @@ template <> void vlineSmooth3N121(const ufixedpoint16* const * src, const ufixedpoint16*, int, uint8_t* dst, int len) { int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= len - 2*VECSZ; i += 2*VECSZ) { v_uint32 v_src00, v_src01, v_src02, v_src03, v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23; @@ -1421,8 +1412,8 @@ void vlineSmooth3N121(const ufixedpoint16* const * src, v_expand(vx_load((uint16_t*)(src[1]) + i + VECSZ), v_src12, v_src13); v_expand(vx_load((uint16_t*)(src[2]) + i), v_src20, v_src21); v_expand(vx_load((uint16_t*)(src[2]) + i + VECSZ), v_src22, v_src23); - v_store(dst + i, v_pack(v_rshr_pack<10>(v_src00 + v_src20 + (v_src10 + v_src10), v_src01 + v_src21 + (v_src11 + v_src11)), - v_rshr_pack<10>(v_src02 + v_src22 + (v_src12 + v_src12), v_src03 + v_src23 + (v_src13 + v_src13)))); + v_store(dst + i, v_pack(v_rshr_pack<10>(v_add(v_add(v_src00, v_src20), v_add(v_src10, v_src10)), v_add(v_add(v_src01, 
v_src21), v_add(v_src11, v_src11))), + v_rshr_pack<10>(v_add(v_add(v_src02, v_src22), v_add(v_src12, v_src12)), v_add(v_add(v_src03, v_src23), v_add(v_src13, v_src13))))); } #endif for (; i < len; i++) @@ -1432,8 +1423,8 @@ template <> void vlineSmooth3N121(const ufixedpoint32* const * src, const ufixedpoint32*, int, uint16_t* dst, int len) { int i = 0; -#if CV_SIMD - const int VECSZ = v_uint32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= len - 2*VECSZ; i += 2*VECSZ) { v_uint64 v_src00, v_src01, v_src02, v_src03, v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23; @@ -1443,8 +1434,8 @@ void vlineSmooth3N121(const ufixedpoint32* const * src, v_expand(vx_load((uint32_t*)(src[1]) + i + VECSZ), v_src12, v_src13); v_expand(vx_load((uint32_t*)(src[2]) + i), v_src20, v_src21); v_expand(vx_load((uint32_t*)(src[2]) + i + VECSZ), v_src22, v_src23); - v_store(dst + i, v_pack(v_rshr_pack<18>(v_src00 + v_src20 + (v_src10 + v_src10), v_src01 + v_src21 + (v_src11 + v_src11)), - v_rshr_pack<18>(v_src02 + v_src22 + (v_src12 + v_src12), v_src03 + v_src23 + (v_src13 + v_src13)))); + v_store(dst + i, v_pack(v_rshr_pack<18>(v_add(v_add(v_src00, v_src20), v_add(v_src10, v_src10)), v_add(v_add(v_src01, v_src21), v_add(v_src11, v_src11))), + v_rshr_pack<18>(v_add(v_add(v_src02, v_src22), v_add(v_src12, v_src12)), v_add(v_add(v_src03, v_src23), v_add(v_src13, v_src13))))); } #endif for (; i < len; i++) @@ -1460,13 +1451,13 @@ template <> void vlineSmooth5N(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len) { int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); if (len >= 4 * VECSZ) { ufixedpoint32 val[] = { (m[0] + m[1] + m[2] + m[3] + m[4]) * ufixedpoint16((uint8_t)128) }; v_int32 v_128_4 = vx_setall_s32(*((int32_t*)val)); - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); + const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); v_int16 v_mul01 = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)m))); v_int16 v_mul23 = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)(m + 2)))); v_int16 v_mul4 = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + 4)))); @@ -1509,17 +1500,17 @@ void vlineSmooth5N(const ufixedpoint16* const * src, con v_src12 = vx_load(src3 + 2*VECSZ); v_src13 = vx_load(src3 + 3*VECSZ); v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); - v_res0 += v_dotprod(v_tmp0, v_mul23); - v_res1 += v_dotprod(v_tmp1, v_mul23); + v_res0 = v_add(v_res0, v_dotprod(v_tmp0, v_mul23)); + v_res1 = v_add(v_res1, v_dotprod(v_tmp1, v_mul23)); v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); - v_res2 += v_dotprod(v_tmp0, v_mul23); - v_res3 += v_dotprod(v_tmp1, v_mul23); + v_res2 = v_add(v_res2, v_dotprod(v_tmp0, v_mul23)); + v_res3 = v_add(v_res3, v_dotprod(v_tmp1, v_mul23)); v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); - v_res4 += v_dotprod(v_tmp0, v_mul23); - v_res5 += v_dotprod(v_tmp1, v_mul23); + v_res4 = v_add(v_res4, v_dotprod(v_tmp0, v_mul23)); + v_res5 = v_add(v_res5, v_dotprod(v_tmp1, v_mul23)); v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); - v_res6 += v_dotprod(v_tmp0, v_mul23); - v_res7 += v_dotprod(v_tmp1, v_mul23); + v_res6 = v_add(v_res6, v_dotprod(v_tmp0, v_mul23)); + v_res7 = v_add(v_res7, v_dotprod(v_tmp1, v_mul23)); v_int32 v_resj0, 
v_resj1; const int16_t* src4 = (const int16_t*)src[4] + i; @@ -1528,26 +1519,26 @@ void vlineSmooth5N(const ufixedpoint16* const * src, con v_src02 = vx_load(src4 + 2*VECSZ); v_src03 = vx_load(src4 + 3*VECSZ); v_mul_expand(v_add_wrap(v_src00, v_128), v_mul4, v_resj0, v_resj1); - v_res0 += v_resj0; - v_res1 += v_resj1; + v_res0 = v_add(v_res0, v_resj0); + v_res1 = v_add(v_res1, v_resj1); v_mul_expand(v_add_wrap(v_src01, v_128), v_mul4, v_resj0, v_resj1); - v_res2 += v_resj0; - v_res3 += v_resj1; + v_res2 = v_add(v_res2, v_resj0); + v_res3 = v_add(v_res3, v_resj1); v_mul_expand(v_add_wrap(v_src02, v_128), v_mul4, v_resj0, v_resj1); - v_res4 += v_resj0; - v_res5 += v_resj1; + v_res4 = v_add(v_res4, v_resj0); + v_res5 = v_add(v_res5, v_resj1); v_mul_expand(v_add_wrap(v_src03, v_128), v_mul4, v_resj0, v_resj1); - v_res6 += v_resj0; - v_res7 += v_resj1; - - v_res0 += v_128_4; - v_res1 += v_128_4; - v_res2 += v_128_4; - v_res3 += v_128_4; - v_res4 += v_128_4; - v_res5 += v_128_4; - v_res6 += v_128_4; - v_res7 += v_128_4; + v_res6 = v_add(v_res6, v_resj0); + v_res7 = v_add(v_res7, v_resj1); + + v_res0 = v_add(v_res0, v_128_4); + v_res1 = v_add(v_res1, v_128_4); + v_res2 = v_add(v_res2, v_128_4); + v_res3 = v_add(v_res3, v_128_4); + v_res4 = v_add(v_res4, v_128_4); + v_res5 = v_add(v_res5, v_128_4); + v_res6 = v_add(v_res6, v_128_4); + v_res7 = v_add(v_res7, v_128_4); v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); @@ -1569,9 +1560,9 @@ template <> void vlineSmooth5N14641(const ufixedpoint16* const * src, const ufixedpoint16*, int, uint8_t* dst, int len) { int i = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_uint32 v_6 = vx_setall_u32(6); - const int VECSZ = v_uint16::nlanes; + const int VECSZ = VTraits::vlanes(); for (; i <= len - 2*VECSZ; i += 2*VECSZ) { v_uint32 v_src00, v_src10, v_src20, v_src30, v_src40; @@ -1588,10 +1579,10 @@ void vlineSmooth5N14641(const ufixedpoint16* const * src v_expand(vx_load((uint16_t*)(src[3]) + i + VECSZ), v_src32, v_src33); v_expand(vx_load((uint16_t*)(src[4]) + i), v_src40, v_src41); v_expand(vx_load((uint16_t*)(src[4]) + i + VECSZ), v_src42, v_src43); - v_store(dst + i, v_pack(v_rshr_pack<12>(v_src20*v_6 + ((v_src10 + v_src30) << 2) + v_src00 + v_src40, - v_src21*v_6 + ((v_src11 + v_src31) << 2) + v_src01 + v_src41), - v_rshr_pack<12>(v_src22*v_6 + ((v_src12 + v_src32) << 2) + v_src02 + v_src42, - v_src23*v_6 + ((v_src13 + v_src33) << 2) + v_src03 + v_src43))); + v_store(dst + i, v_pack(v_rshr_pack<12>(v_add(v_add(v_add(v_mul(v_src20, v_6), v_shl<2>(v_add(v_src10, v_src30))), v_src00), v_src40), + v_add(v_add(v_add(v_mul(v_src21, v_6), v_shl<2>(v_add(v_src11, v_src31))), v_src01), v_src41)), + v_rshr_pack<12>(v_add(v_add(v_add(v_mul(v_src22, v_6), v_shl<2>(v_add(v_src12, v_src32))), v_src02), v_src42), + v_add(v_add(v_add(v_mul(v_src23, v_6), v_shl<2>(v_add(v_src13, v_src33))), v_src03), v_src43)))); } #endif for (; i < len; i++) @@ -1603,8 +1594,8 @@ template <> void vlineSmooth5N14641(const ufixedpoint32* const * src, const ufixedpoint32*, int, uint16_t* dst, int len) { int i = 0; -#if CV_SIMD - const int VECSZ = v_uint32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for (; i <= len - 2*VECSZ; i += 2*VECSZ) { v_uint64 v_src00, v_src10, v_src20, v_src30, v_src40; @@ -1621,10 +1612,10 @@ void vlineSmooth5N14641(const ufixedpoint32* const * sr v_expand(vx_load((uint32_t*)(src[3]) + i + VECSZ), v_src32, v_src33); 
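// The 32-bit fixed-point rows are widened to 64-bit lanes (v_expand) so the
// 1-4-6-4-1 weighted sum below cannot overflow before the >>20 repack; the
// centre-tap multiply by 6 stays expressed with shifts, 6*x == (x << 2) + (x << 1),
// rather than a 64-bit lane multiply.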
v_expand(vx_load((uint32_t*)(src[4]) + i), v_src40, v_src41); v_expand(vx_load((uint32_t*)(src[4]) + i + VECSZ), v_src42, v_src43); - v_store(dst + i, v_pack(v_rshr_pack<20>((v_src20 << 2) + (v_src20 << 1) + ((v_src10 + v_src30) << 2) + v_src00 + v_src40, - (v_src21 << 2) + (v_src21 << 1) + ((v_src11 + v_src31) << 2) + v_src01 + v_src41), - v_rshr_pack<20>((v_src22 << 2) + (v_src22 << 1) + ((v_src12 + v_src32) << 2) + v_src02 + v_src42, - (v_src23 << 2) + (v_src23 << 1) + ((v_src13 + v_src33) << 2) + v_src03 + v_src43))); + v_store(dst + i, v_pack(v_rshr_pack<20>(v_add(v_add(v_add(v_add(v_shl<2>(v_src20), v_shl<1>(v_src20)), v_shl<2>(v_add(v_src10, v_src30))), v_src00), v_src40), + v_add(v_add(v_add(v_add(v_shl<2>(v_src21), v_shl<1>(v_src21)), v_shl<2>(v_add(v_src11, v_src31))), v_src01), v_src41)), + v_rshr_pack<20>(v_add(v_add(v_add(v_add(v_shl<2>(v_src22), v_shl<1>(v_src22)), v_shl<2>(v_add(v_src12, v_src32))), v_src02), v_src42), + v_add(v_add(v_add(v_add(v_shl<2>(v_src23), v_shl<1>(v_src23)), v_shl<2>(v_add(v_src13, v_src33))), v_src03), v_src43)))); } #endif for (; i < len; i++) @@ -1647,10 +1638,10 @@ template <> void vlineSmooth(const ufixedpoint16* const * src, const ufixedpoint16* m, int n, uint8_t* dst, int len) { int i = 0; -#if CV_SIMD - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); +#if (CV_SIMD || CV_SIMD_SCALABLE) + const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); v_int32 v_128_4 = vx_setall_s32(128 << 16); - const int VECSZ = v_uint16::nlanes; + const int VECSZ = VTraits::vlanes(); if (len >= VECSZ) { ufixedpoint16 msum = m[0] + m[1]; @@ -1705,17 +1696,17 @@ void vlineSmooth(const ufixedpoint16* const * src, const v_src12 = vx_load(srcj1 + 2*VECSZ); v_src13 = vx_load(srcj1 + 3*VECSZ); v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); - v_res0 += v_dotprod(v_tmp0, v_mul); - v_res1 += v_dotprod(v_tmp1, v_mul); + v_res0 = v_add(v_res0, v_dotprod(v_tmp0, v_mul)); + v_res1 = v_add(v_res1, v_dotprod(v_tmp1, v_mul)); v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); - v_res2 += v_dotprod(v_tmp0, v_mul); - v_res3 += v_dotprod(v_tmp1, v_mul); + v_res2 = v_add(v_res2, v_dotprod(v_tmp0, v_mul)); + v_res3 = v_add(v_res3, v_dotprod(v_tmp1, v_mul)); v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); - v_res4 += v_dotprod(v_tmp0, v_mul); - v_res5 += v_dotprod(v_tmp1, v_mul); + v_res4 = v_add(v_res4, v_dotprod(v_tmp0, v_mul)); + v_res5 = v_add(v_res5, v_dotprod(v_tmp1, v_mul)); v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); - v_res6 += v_dotprod(v_tmp0, v_mul); - v_res7 += v_dotprod(v_tmp1, v_mul); + v_res6 = v_add(v_res6, v_dotprod(v_tmp0, v_mul)); + v_res7 = v_add(v_res7, v_dotprod(v_tmp1, v_mul)); } if(j < n) { @@ -1727,26 +1718,26 @@ void vlineSmooth(const ufixedpoint16* const * src, const v_src02 = vx_load(srcj + 2*VECSZ); v_src03 = vx_load(srcj + 3*VECSZ); v_mul_expand(v_add_wrap(v_src00, v_128), v_mul, v_resj0, v_resj1); - v_res0 += v_resj0; - v_res1 += v_resj1; + v_res0 = v_add(v_res0, v_resj0); + v_res1 = v_add(v_res1, v_resj1); v_mul_expand(v_add_wrap(v_src01, v_128), v_mul, v_resj0, v_resj1); - v_res2 += v_resj0; - v_res3 += v_resj1; + v_res2 = v_add(v_res2, v_resj0); + v_res3 = v_add(v_res3, v_resj1); v_mul_expand(v_add_wrap(v_src02, v_128), v_mul, v_resj0, v_resj1); - v_res4 += v_resj0; - v_res5 += v_resj1; + v_res4 = v_add(v_res4, v_resj0); + v_res5 = v_add(v_res5, v_resj1); 
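The accumulator updates being rewritten in this hunk (v_res0 += v_dotprod(...) becoming v_res0 = v_add(v_res0, v_dotprod(...))) are the same mechanical transformation applied throughout the patch: compile-time lane counts such as v_uint16::nlanes become VTraits<v_uint16>::vlanes() calls, and the overloaded vector operators (+, *, <<, >>, &, ==, ~) become the named intrinsics (v_add, v_mul, v_shl, v_shr, v_and, v_eq, v_not), so the same source also builds for CV_SIMD_SCALABLE backends such as RVV, where the vector types carry no compile-time size. A minimal sketch of the pattern, assuming only OpenCV's universal intrinsics header; the helper name add_rows_u16 is illustrative and not part of the patch:

#include "opencv2/core/hal/intrin.hpp"
#include <cstdint>
using namespace cv;

// Sum two rows of 16-bit pixels; the guarded body uses the name-based API only.
static void add_rows_u16(const uint16_t* a, const uint16_t* b, uint16_t* dst, int len)
{
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int VECSZ = VTraits<v_uint16>::vlanes();                // was: v_uint16::nlanes
    for (; i <= len - VECSZ; i += VECSZ)
        v_store(dst + i, v_add(vx_load(a + i), vx_load(b + i)));  // was: vx_load(a+i) + vx_load(b+i)
#endif
    for (; i < len; i++)                 // scalar tail, untouched by the rewrite
        dst[i] = (uint16_t)(a[i] + b[i]);
}

The SIMD block compiles away when neither macro is defined, leaving only the scalar loop, which is why every rewritten function in this patch keeps its scalar tail unchanged.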
v_mul_expand(v_add_wrap(v_src03, v_128), v_mul, v_resj0, v_resj1); - v_res6 += v_resj0; - v_res7 += v_resj1; - } - v_res0 += v_128_4; - v_res1 += v_128_4; - v_res2 += v_128_4; - v_res3 += v_128_4; - v_res4 += v_128_4; - v_res5 += v_128_4; - v_res6 += v_128_4; - v_res7 += v_128_4; + v_res6 = v_add(v_res6, v_resj0); + v_res7 = v_add(v_res7, v_resj1); + } + v_res0 = v_add(v_res0, v_128_4); + v_res1 = v_add(v_res1, v_128_4); + v_res2 = v_add(v_res2, v_128_4); + v_res3 = v_add(v_res3, v_128_4); + v_res4 = v_add(v_res4, v_128_4); + v_res5 = v_add(v_res5, v_128_4); + v_res6 = v_add(v_res6, v_128_4); + v_res7 = v_add(v_res7, v_128_4); v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); @@ -1780,11 +1771,11 @@ template <> void vlineSmoothONa_yzy_a(const ufixedpoint16* const * src, const ufixedpoint16* m, int n, uint8_t* dst, int len) { int i = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int pre_shift = n / 2; - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); + const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); v_int32 v_128_4 = vx_setall_s32(128 << 16); - const int VECSZ = v_uint16::nlanes; + const int VECSZ = VTraits::vlanes(); if (len >= VECSZ) { ufixedpoint16 msum = m[0] + m[pre_shift] + m[n - 1]; @@ -1826,27 +1817,27 @@ void vlineSmoothONa_yzy_a(const ufixedpoint16* const * s v_src21 = vx_load(srcj1 + 2*VECSZ); v_src31 = vx_load(srcj1 + 3*VECSZ); v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src01, v_128), v_tmp0, v_tmp1); - v_res0 += v_dotprod(v_tmp0, v_mul); - v_res1 += v_dotprod(v_tmp1, v_mul); + v_res0 = v_add(v_res0, v_dotprod(v_tmp0, v_mul)); + v_res1 = v_add(v_res1, v_dotprod(v_tmp1, v_mul)); v_zip(v_add_wrap(v_src10, v_128), v_add_wrap(v_src11, v_128), v_tmp2, v_tmp3); - v_res2 += v_dotprod(v_tmp2, v_mul); - v_res3 += v_dotprod(v_tmp3, v_mul); + v_res2 = v_add(v_res2, v_dotprod(v_tmp2, v_mul)); + v_res3 = v_add(v_res3, v_dotprod(v_tmp3, v_mul)); v_zip(v_add_wrap(v_src20, v_128), v_add_wrap(v_src21, v_128), v_tmp4, v_tmp5); - v_res4 += v_dotprod(v_tmp4, v_mul); - v_res5 += v_dotprod(v_tmp5, v_mul); + v_res4 = v_add(v_res4, v_dotprod(v_tmp4, v_mul)); + v_res5 = v_add(v_res5, v_dotprod(v_tmp5, v_mul)); v_zip(v_add_wrap(v_src30, v_128), v_add_wrap(v_src31, v_128), v_tmp6, v_tmp7); - v_res6 += v_dotprod(v_tmp6, v_mul); - v_res7 += v_dotprod(v_tmp7, v_mul); + v_res6 = v_add(v_res6, v_dotprod(v_tmp6, v_mul)); + v_res7 = v_add(v_res7, v_dotprod(v_tmp7, v_mul)); } - v_res0 += v_128_4; - v_res1 += v_128_4; - v_res2 += v_128_4; - v_res3 += v_128_4; - v_res4 += v_128_4; - v_res5 += v_128_4; - v_res6 += v_128_4; - v_res7 += v_128_4; + v_res0 = v_add(v_res0, v_128_4); + v_res1 = v_add(v_res1, v_128_4); + v_res2 = v_add(v_res2, v_128_4); + v_res3 = v_add(v_res3, v_128_4); + v_res4 = v_add(v_res4, v_128_4); + v_res5 = v_add(v_res5, v_128_4); + v_res6 = v_add(v_res6, v_128_4); + v_res7 = v_add(v_res7, v_128_4); v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); @@ -1868,9 +1859,9 @@ template <> void vlineSmoothONa_yzy_a(const ufixedpoint32* const * src, const ufixedpoint32* m, int n, uint16_t* dst, int len) { int i = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int pre_shift = n / 2; - const int VECSZ = v_uint32::nlanes; + const int VECSZ = VTraits::vlanes(); for (; i <= len - 2*VECSZ; i += 2*VECSZ) { v_uint32 v_src00, v_src10, v_src01, v_src11; @@ -1895,15 
+1886,15 @@ void vlineSmoothONa_yzy_a(const ufixedpoint32* const * v_src01 = vx_load(srcj1); v_mul_expand(v_src00, v_mul, v_tmp0, v_tmp1); v_mul_expand(v_src01, v_mul, v_tmp2, v_tmp3); - v_res0 += v_tmp0 + v_tmp2; - v_res1 += v_tmp1 + v_tmp3; + v_res0 = v_add(v_res0, v_add(v_tmp0, v_tmp2)); + v_res1 = v_add(v_res1, v_add(v_tmp1, v_tmp3)); v_src10 = vx_load(srcj0 + VECSZ); v_src11 = vx_load(srcj1 + VECSZ); v_mul_expand(v_src10, v_mul, v_tmp4, v_tmp5); v_mul_expand(v_src11, v_mul, v_tmp6, v_tmp7); - v_res2 += v_tmp4 + v_tmp6; - v_res3 += v_tmp5 + v_tmp7; + v_res2 = v_add(v_res2, v_add(v_tmp4, v_tmp6)); + v_res3 = v_add(v_res3, v_add(v_tmp5, v_tmp7)); } v_store(dst + i, v_pack(v_rshr_pack<32>(v_res0, v_res1), diff --git a/modules/imgproc/src/spatialgradient.cpp b/modules/imgproc/src/spatialgradient.cpp index 1aed1fa03166..f422609c40f6 100644 --- a/modules/imgproc/src/spatialgradient.cpp +++ b/modules/imgproc/src/spatialgradient.cpp @@ -57,15 +57,33 @@ namespace cv * 0 0 0 * 1 2 1 */ +#if (CV_SIMD || CV_SIMD_SCALABLE) template -static inline void spatialGradientKernel( T& vx, T& vy, +static inline void spatialGradientKernel_vec( T& vx, T& vy, const T& v00, const T& v01, const T& v02, const T& v10, const T& v12, const T& v20, const T& v21, const T& v22 ) { // vx = (v22 - v00) + (v02 - v20) + 2 * (v12 - v10) // vy = (v22 - v00) + (v20 - v02) + 2 * (v21 - v01) + T tmp_add = v_sub(v22, v00), + tmp_sub = v_sub(v02, v20), + tmp_x = v_sub(v12, v10), + tmp_y = v_sub(v21, v01); + + vx = v_add(v_add(v_add(tmp_add, tmp_sub), tmp_x), tmp_x); + vy = v_add(v_add(v_sub(tmp_add, tmp_sub), tmp_y), tmp_y); +} +#endif +template +static inline void spatialGradientKernel( T& vx, T& vy, + const T& v00, const T& v01, const T& v02, + const T& v10, const T& v12, + const T& v20, const T& v21, const T& v22 ) +{ + // vx = (v22 - v00) + (v02 - v20) + 2 * (v12 - v10) + // vy = (v22 - v00) + (v20 - v02) + 2 * (v21 - v01) T tmp_add = v22 - v00, tmp_sub = v02 - v20, tmp_x = v12 - v10, @@ -125,7 +143,7 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, int i_start = 0; int j_start = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) // Characters in variable names have the following meanings: // u: unsigned char // s: signed int @@ -148,7 +166,7 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, short *n_dy = dy.ptr(i+1); // Process rest of columns 16-column chunks at a time - for ( j = 1; j < W - v_uint8::nlanes; j += v_uint8::nlanes) + for ( j = 1; j < W - VTraits::vlanes(); j += VTraits::vlanes()) { // Load top row for 3x3 Sobel filter v_uint8 v_um = vx_load(&p_src[j-1]); @@ -195,22 +213,22 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, // dx & dy for rows 1, 2, 3 v_int16 v_sdx1, v_sdy1; - spatialGradientKernel( v_sdx1, v_sdy1, + spatialGradientKernel_vec( v_sdx1, v_sdy1, v_s1m1, v_s1n1, v_s1p1, v_s2m1, v_s2p1, v_s3m1, v_s3n1, v_s3p1 ); v_int16 v_sdx2, v_sdy2; - spatialGradientKernel( v_sdx2, v_sdy2, + spatialGradientKernel_vec( v_sdx2, v_sdy2, v_s1m2, v_s1n2, v_s1p2, v_s2m2, v_s2p2, v_s3m2, v_s3n2, v_s3p2 ); // Store v_store(&c_dx[j], v_sdx1); - v_store(&c_dx[j+v_int16::nlanes], v_sdx2); + v_store(&c_dx[j+VTraits::vlanes()], v_sdx2); v_store(&c_dy[j], v_sdy1); - v_store(&c_dy[j+v_int16::nlanes], v_sdy2); + v_store(&c_dy[j+VTraits::vlanes()], v_sdy2); // Load fourth row for 3x3 Sobel filter v_um = vx_load(&m_src[j-1]); @@ -227,21 +245,21 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, v_int16 v_s4p2 = 
v_reinterpret_as_s16(v_up2); // dx & dy for rows 2, 3, 4 - spatialGradientKernel( v_sdx1, v_sdy1, + spatialGradientKernel_vec( v_sdx1, v_sdy1, v_s2m1, v_s2n1, v_s2p1, v_s3m1, v_s3p1, v_s4m1, v_s4n1, v_s4p1 ); - spatialGradientKernel( v_sdx2, v_sdy2, + spatialGradientKernel_vec( v_sdx2, v_sdy2, v_s2m2, v_s2n2, v_s2p2, v_s3m2, v_s3p2, v_s4m2, v_s4n2, v_s4p2 ); // Store v_store(&n_dx[j], v_sdx1); - v_store(&n_dx[j+v_int16::nlanes], v_sdx2); + v_store(&n_dx[j+VTraits<v_int16>::vlanes()], v_sdx2); v_store(&n_dy[j], v_sdy1); - v_store(&n_dy[j+v_int16::nlanes], v_sdy2); + v_store(&n_dy[j+VTraits<v_int16>::vlanes()], v_sdy2); } } i_start = i;
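The section ends inside the spatialgradient.cpp change, where the patch splits the 3x3 Sobel helper in two: spatialGradientKernel_vec, built from the named v_sub/v_add intrinsics and guarded by (CV_SIMD || CV_SIMD_SCALABLE), is used on the vector path above, while the original operator-based scalar template is kept for the leftover columns. A self-contained sketch of the vector kernel for v_int16 lanes, mirroring the helper added above (the wrapper name sobel3x3_pair is illustrative; only the universal intrinsics header is assumed):

#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

#if (CV_SIMD || CV_SIMD_SCALABLE)
// vx = (v22 - v00) + (v02 - v20) + 2 * (v12 - v10)
// vy = (v22 - v00) + (v20 - v02) + 2 * (v21 - v01)
static inline void sobel3x3_pair(v_int16& vx, v_int16& vy,
                                 const v_int16& v00, const v_int16& v01, const v_int16& v02,
                                 const v_int16& v10,                     const v_int16& v12,
                                 const v_int16& v20, const v_int16& v21, const v_int16& v22)
{
    v_int16 tmp_add = v_sub(v22, v00),  // term shared by vx and vy
            tmp_sub = v_sub(v02, v20),  // added for vx, subtracted for vy
            tmp_x   = v_sub(v12, v10),  // horizontal difference, counted twice
            tmp_y   = v_sub(v21, v01);  // vertical difference, counted twice
    vx = v_add(v_add(v_add(tmp_add, tmp_sub), tmp_x), tmp_x);
    vy = v_add(v_add(v_sub(tmp_add, tmp_sub), tmp_y), tmp_y);
}
#endif

Each iteration of the rewritten loop processes two v_int16 halves of a v_uint8 load, so the second result is stored at an offset of VTraits<v_int16>::vlanes(), as in the stores just above.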