From 55f06b76f91ad663ad39a5396923bc7718d288bb Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sat, 26 Dec 2020 09:40:29 +0000
Subject: [PATCH 01/11] dnn: improve debugging of TensorFlow parsing errors

---
 modules/dnn/src/tensorflow/tf_importer.cpp | 478 +++++++++++++--------
 1 file changed, 307 insertions(+), 171 deletions(-)
diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp
index 493f0ec3055d..2484c8493c2b 100644
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@@ -11,6 +11,11 @@ Implementation of Tensorflow models parser
 
 #include "../precomp.hpp"
 
+#include <opencv2/core/utils/logger.defines.hpp>
+#undef CV_LOG_STRIP_LEVEL
+#define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
+#include <opencv2/core/utils/logger.hpp>
+
 #ifdef HAVE_PROTOBUF
 #include "tf_io.hpp"
 
@@ -93,7 +98,7 @@ void blobShapeFromTensor(const tensorflow::TensorProto &tensor, MatShape& shape)
                 shape[i] = (int)_shape.dim(i).size();
         }
         else
-            shape.resize(1, 1);  // Scalar.
+            shape.resize(1, 1);  // Scalar. // FIXIT: should be empty
     }
     else
     {
@@ -258,7 +263,7 @@ const tensorflow::AttrValue& getLayerAttr(const tensorflow::NodeDef &layer, cons
     return layer.attr().at(name);
 }
 
-static int getDataLayout(const tensorflow::NodeDef& layer)
+static DataLayout getDataLayout(const tensorflow::NodeDef& layer)
 {
     if (hasLayerAttr(layer, "data_format"))
     {
@@ -280,10 +285,13 @@ static inline std::string getNodeName(const std::string& tensorName)
     return tensorName.substr(0, tensorName.rfind(':'));
 }
 
-static inline int getDataLayout(const std::string& layerName,
-                                const std::map<String, int>& data_layouts)
+static inline
+DataLayout getDataLayout(
+        const std::string& layerName,
+        const std::map<String, DataLayout>& data_layouts
+)
 {
-    std::map<String, int>::const_iterator it = data_layouts.find(getNodeName(layerName));
+    std::map<String, DataLayout>::const_iterator it = data_layouts.find(getNodeName(layerName));
     return it != data_layouts.end() ? it->second : DATA_LAYOUT_UNKNOWN;
 }
 
@@ -439,15 +447,20 @@ void ExcludeLayer(tensorflow::GraphDef& net, const int layer_index, const int in
         net.mutable_node()->DeleteSubrange(layer_index, 1);
 }
 
-class TFImporter {
+class TFImporter
+{
 public:
-    TFImporter(const char *model, const char *config = NULL);
-    TFImporter(const char *dataModel, size_t lenModel,
+    TFImporter(Net& net, const char *model, const char *config = NULL);
+    TFImporter(Net& net, const char *dataModel, size_t lenModel,
                const char *dataConfig = NULL, size_t lenConfig = 0);
+protected:
+    Net& dstNet;
+    void populateNet();
 
-    void populateNet(Net dstNet);
+    void parseNode(const tensorflow::NodeDef& layer);
+
+    DataLayout predictOutputDataLayout(const tensorflow::NodeDef& layer);
 
-private:
     void kernelFromTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob);
 
     void connect(const std::map<String, int>& layers_name_id_map, Net& network, const Pin& outPin,
@@ -467,23 +480,53 @@ class TFImporter {
 
     std::vector<String> netInputsNames;
     std::vector<MatShape> netInputShapes;
+
+    std::set<String> layers_to_ignore;
+    std::map<String, DataLayout> data_layouts;
+
+    // find all Const layers for params
+    std::map<String, int> value_id;
+    // A map with constant blobs which are shared between multiple layers.
+    std::map<String, Mat> sharedWeights;
+
+    std::map<String, int> layer_id;
 };
 
-TFImporter::TFImporter(const char *model, const char *config)
+TFImporter::TFImporter(Net& net, const char *model, const char *config)
+    : dstNet(net)
 {
     if (model && model[0])
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow model from file: " << model);
         ReadTFNetParamsFromBinaryFileOrDie(model, &netBin);
+    }
     if (config && config[0])
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow config from file: " << config);
         ReadTFNetParamsFromTextFileOrDie(config, &netTxt);
+    }
+
+    populateNet();
 }
 
-TFImporter::TFImporter(const char *dataModel, size_t lenModel,
-                       const char *dataConfig, size_t lenConfig)
+TFImporter::TFImporter(
+        Net& net,
+        const char *dataModel, size_t lenModel,
+        const char *dataConfig, size_t lenConfig
+)
+    : dstNet(net)
 {
     if (dataModel != NULL && lenModel > 0)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow model from memory (" << lenModel << " bytes)");
         ReadTFNetParamsFromBinaryBufferOrDie(dataModel, lenModel, &netBin);
+    }
     if (dataConfig != NULL && lenConfig > 0)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow config from memory (" << lenConfig << " bytes)");
         ReadTFNetParamsFromTextBufferOrDie(dataConfig, lenConfig, &netTxt);
+    }
+    populateNet();
 }
 
 void TFImporter::kernelFromTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob)
@@ -612,84 +655,98 @@ const tensorflow::TensorProto& TFImporter::getConstBlob(const tensorflow::NodeDe
 static void addConstNodes(tensorflow::GraphDef& net, std::map<String, int>& const_layers,
                           std::set<String>& layers_to_ignore)
 {
+    CV_LOG_DEBUG(NULL, "DNN/TF: addConstNodes(): handling " << net.node_size() << " nodes...");
     for (int li = 0; li < net.node_size(); li++)
     {
         const tensorflow::NodeDef &layer = net.node(li);
         String name = layer.name();
         String type = layer.op();
 
-        if (type == "Dequantize")
+        //CV_LOG_DEBUG(NULL, "DNN/TF: layer_id=" << li << " - '" << name << "' @ " << type);
+
+        try
         {
-            // Example of Dequantize node:
-            //   name: "conv2d_1/bias"
-            //   op: "Dequantize"
-            //   input: "conv2d_1/bias_quantized_const" (tensor of dtype DT_QUINT8)
-            //   input: "conv2d_1/bias_quantized_min"
-            //   input: "conv2d_1/bias_quantized_max"
-            //   attr { key: "T" value { type: DT_QUINT8 } }   (quantized type)
-            //   attr { key: "mode" value { s: "MIN_FIRST" } } (quantization technique)
-            CV_Assert(layer.input_size() == 3);
-            for (int i = 0; i < 3; ++i)
-                CV_Assert(const_layers.find(layer.input(i)) != const_layers.end());
-            CV_Assert(hasLayerAttr(layer, "mode") &&
-                      getLayerAttr(layer, "mode").s() == "MIN_FIRST");
-
-            int tensorId = const_layers[layer.input(0)];
-            int minId = const_layers[layer.input(1)];
-            int maxId = const_layers[layer.input(2)];
-
-            tensorflow::TensorProto* tensor = net.mutable_node(tensorId)
-                                                ->mutable_attr()->at("value")
-                                                 .mutable_tensor();
-            CV_Assert(tensor->dtype() == tensorflow::DT_QUINT8);
-
-            Mat qMin = getTensorContent(net.node(minId).attr().at("value").tensor());
-            Mat qMax = getTensorContent(net.node(maxId).attr().at("value").tensor());
-            CV_Assert_N(qMin.total() == 1, qMin.type() == CV_32FC1,
-                        qMax.total() == 1, qMax.type() == CV_32FC1);
-
-            Mat content = getTensorContent(*tensor);
-
-            float minVal = qMin.at<float>(0);
-            float rangeScale = (qMax.at<float>(0) - minVal) / 255;
-            CV_Assert(rangeScale >= 0);
-            content.convertTo(content, CV_32FC1, rangeScale,
-                              rangeScale * cvRound(minVal / rangeScale));
-
-            tensor->set_dtype(tensorflow::DT_FLOAT);
-            tensor->set_tensor_content(content.data, content.total() * content.elemSize1());
-
-            net.mutable_node(tensorId)->set_name(name);
-            CV_Assert(const_layers.insert(std::make_pair(name, tensorId)).second);
+            if (type == "Dequantize")
+            {
+                // Example of Dequantize node:
+                //   name: "conv2d_1/bias"
+                //   op: "Dequantize"
+                //   input: "conv2d_1/bias_quantized_const" (tensor of dtype DT_QUINT8)
+                //   input: "conv2d_1/bias_quantized_min"
+                //   input: "conv2d_1/bias_quantized_max"
+                //   attr { key: "T" value { type: DT_QUINT8 } }   (quantized type)
+                //   attr { key: "mode" value { s: "MIN_FIRST" } } (quantization technique)
+                CV_CheckEQ(layer.input_size(), 3, "Dequantize: 3 inputs is supported only");
+                for (int i = 0; i < 3; ++i)
+                    CV_Assert(const_layers.find(layer.input(i)) != const_layers.end());
+                CV_Assert(hasLayerAttr(layer, "mode") &&
+                          getLayerAttr(layer, "mode").s() == "MIN_FIRST");
+
+                int tensorId = const_layers[layer.input(0)];
+                int minId = const_layers[layer.input(1)];
+                int maxId = const_layers[layer.input(2)];
+
+                tensorflow::TensorProto* tensor = net.mutable_node(tensorId)
+                                                    ->mutable_attr()->at("value")
+                                                     .mutable_tensor();
+                CV_CheckEQ((int)tensor->dtype(), (int)tensorflow::DT_QUINT8, "");
+
+                Mat qMin = getTensorContent(net.node(minId).attr().at("value").tensor());
+                Mat qMax = getTensorContent(net.node(maxId).attr().at("value").tensor());
+                CV_CheckEQ(qMin.total(), (size_t)1, "");
+                CV_CheckTypeEQ(qMin.type(), CV_32FC1, "");
+                CV_CheckEQ(qMax.total(), (size_t)1, "");
+                CV_CheckTypeEQ(qMax.type(), CV_32FC1, "");
+
+                Mat content = getTensorContent(*tensor);
+
+                float minVal = qMin.at<float>(0);
+                float rangeScale = (qMax.at<float>(0) - minVal) / 255;
+                CV_Assert(rangeScale >= 0);
+                content.convertTo(content, CV_32FC1, rangeScale,
+                                  rangeScale * cvRound(minVal / rangeScale));
+
+                tensor->set_dtype(tensorflow::DT_FLOAT);
+                tensor->set_tensor_content(content.data, content.total() * content.elemSize1());
+
+                net.mutable_node(tensorId)->set_name(name);
+                CV_Assert(const_layers.insert(std::make_pair(name, tensorId)).second);
+                layers_to_ignore.insert(name);
+                continue;
+            }
+            else if (type != "Const")
+                continue;  // only Const parameters are supported
+
+            if (layer.attr().find("value") != layer.attr().end())
+            {
+                CV_Assert(const_layers.insert(std::make_pair(name, li)).second);
+            }
             layers_to_ignore.insert(name);
-            continue;
         }
-        else if (type != "Const")
-            continue;  // only Const parameters are supported
-
-        if (layer.attr().find("value") != layer.attr().end())
+        catch (const std::exception& e)
         {
-            CV_Assert(const_layers.insert(std::make_pair(name, li)).second);
+            CV_LOG_ERROR(NULL, "DNN/TF: Can't handle node='" << name << "'. Exception: " << e.what());
+            throw;
         }
-        layers_to_ignore.insert(name);
     }
+    CV_LOG_DEBUG(NULL, "DNN/TF: layers_to_ignore.size() = " << layers_to_ignore.size());
 }
 
 // If all inputs of specific layer have the same data layout we can say that
 // this layer's output has this data layout too. Returns DATA_LAYOUT_UNKNOWN otherwise.
-static int predictOutputDataLayout(const tensorflow::GraphDef& net,
-                                   const tensorflow::NodeDef& layer,
-                                   const std::map<String, int>& data_layouts)
+DataLayout TFImporter::predictOutputDataLayout(const tensorflow::NodeDef& layer)
 {
-    int layout = getDataLayout(layer);
+    DataLayout layout = getDataLayout(layer);
     if (layout != DATA_LAYOUT_UNKNOWN)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: predictOutputDataLayout(" << layer.name() << " @ " << layer.op() << ") => " << (int)layout << " (from attrs)");
         return layout;
+    }
 
     // Determine layout by layer's inputs
-    std::map<String, int>::const_iterator it;
     for (int i = 0, n = layer.input_size(); i < n; ++i)
     {
-        it = data_layouts.find(getNodeName(layer.input(i)));
+        std::map<String, DataLayout>::const_iterator it = data_layouts.find(getNodeName(layer.input(i)));
         if (it != data_layouts.end())
         {
             if (layout != DATA_LAYOUT_UNKNOWN)
@@ -703,71 +760,72 @@ static int predictOutputDataLayout(const tensorflow::GraphDef& net,
     }
 
     if (layout != DATA_LAYOUT_UNKNOWN)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: predictOutputDataLayout(" << layer.name() << " @ " << layer.op() << ") => " << (int)layout << " (from inputs)");
         return layout;
+    }
 
     // Determine layout by layer's consumers recursively.
-    it = data_layouts.find(layer.name());
+    std::map<String, DataLayout>::const_iterator it = data_layouts.find(layer.name());
     CV_Assert(it != data_layouts.end());
     return it->second;
 }
 
-void TFImporter::populateNet(Net dstNet)
+void TFImporter::populateNet()
 {
-    if (!netTxt.ByteSize())
-        removePhaseSwitches(netBin);
+    CV_Assert(netBin.ByteSize() || netTxt.ByteSize());
 
-    RemoveIdentityOps(netBin);
-    RemoveIdentityOps(netTxt);
+    CV_LOG_INFO(NULL, "DNN/TF: parsing model"
+        << (netBin.has_versions() ? cv::format(" produced by TF v%d (min_consumer=%d)", (int)netBin.versions().producer(), (int)netBin.versions().min_consumer()) : cv::String(" (N/A version info)"))
+        << ". Number of nodes = " << netBin.node_size()
+    );
 
-    if (!netTxt.ByteSize())
+    if (netTxt.ByteSize())
     {
-        simplifySubgraphs(netBin);
-        sortByExecutionOrder(netBin);
+        CV_LOG_INFO(NULL, "DNN/TF: parsing config"
+            << (netTxt.has_versions() ? cv::format(" produced by TF v%d (min_consumer=%d)", (int)netTxt.versions().producer(), (int)netTxt.versions().min_consumer()) : cv::String(" (N/A version info)"))
+            << ". Number of nodes = " << netTxt.node_size()
+        );
+
+        RemoveIdentityOps(netBin);
+        CV_LOG_DEBUG(NULL, "DNN/TF: RemoveIdentityOps(model) => " << netBin.node_size() << " nodes");
+        RemoveIdentityOps(netTxt);
+        CV_LOG_DEBUG(NULL, "DNN/TF: RemoveIdentityOps(config) => " << netTxt.node_size() << " nodes");
+
+        sortByExecutionOrder(netTxt);
+        CV_LOG_DEBUG(NULL, "DNN/TF: sortByExecutionOrder(config) => " << netTxt.node_size() << " nodes");
     }
     else
     {
-        sortByExecutionOrder(netTxt);
-    }
+        removePhaseSwitches(netBin);
+        CV_LOG_DEBUG(NULL, "DNN/TF: removePhaseSwitches(model) => " << netBin.node_size() << " nodes");
 
-    std::set<String> layers_to_ignore;
+        RemoveIdentityOps(netBin);
+        CV_LOG_DEBUG(NULL, "DNN/TF: RemoveIdentityOps(model) => " << netBin.node_size() << " nodes");
+
+        simplifySubgraphs(netBin);
+        CV_LOG_DEBUG(NULL, "DNN/TF: simplifySubgraphs(model) => " << netBin.node_size() << " nodes");
+        sortByExecutionOrder(netBin);
+        CV_LOG_DEBUG(NULL, "DNN/TF: sortByExecutionOrder(model) => " << netBin.node_size() << " nodes");
+    }
 
     tensorflow::GraphDef& net = netTxt.ByteSize() != 0 ? netTxt : netBin;
 
     int layersSize = net.node_size();
 
-    std::map<String, int> data_layouts;
     // Pre-fill data layouts where they are set explicitly.
     // Assuming that nodes are in topological order
-    for (int i = net.node_size() - 1; i >= 0; --i)
+    for (int i = layersSize - 1; i >= 0; --i)
     {
         const tensorflow::NodeDef& layer = net.node(i);
         std::string name = layer.name();
 
-        int layout = getDataLayout(layer);
-        std::map<String, int>::iterator it = data_layouts.find(name);
-        if (it != data_layouts.end())
-        {
-            if (layout != DATA_LAYOUT_UNKNOWN)
-            {
-                if (it->second == DATA_LAYOUT_UNKNOWN)
-                    it->second = layout;
-                else if (it->second != layout)
-                {
-                    it->second = DATA_LAYOUT_UNKNOWN;
-                    layout = DATA_LAYOUT_UNKNOWN;
-                }
-            }
-            else
-                layout = it->second;
-        }
-        else
-            data_layouts[name] = layout;
+        CV_LOG_DEBUG(NULL, "DNN/TF: node(" << i << " - '" << name << "') propagating layout...");
 
-        // Specify input layers to have the same data layout.
-        for (int j = 0; j < layer.input_size(); ++j)
+        try
         {
-            name = getNodeName(layer.input(j));
-            it = data_layouts.find(name);
+            DataLayout layout = getDataLayout(layer);
+            std::map<String, DataLayout>::iterator it = data_layouts.find(name);
             if (it != data_layouts.end())
             {
                 if (layout != DATA_LAYOUT_UNKNOWN)
@@ -775,38 +833,94 @@ void TFImporter::populateNet(Net dstNet)
                     if (it->second == DATA_LAYOUT_UNKNOWN)
                         it->second = layout;
                     else if (it->second != layout)
+                    {
                         it->second = DATA_LAYOUT_UNKNOWN;
+                        layout = DATA_LAYOUT_UNKNOWN;
+                    }
                 }
+                else
+                    layout = it->second;
             }
             else
                 data_layouts[name] = layout;
+
+            // Specify input layers to have the same data layout.
+            for (int j = 0; j < layer.input_size(); ++j)
+            {
+                name = getNodeName(layer.input(j));
+                it = data_layouts.find(name);
+                if (it != data_layouts.end())
+                {
+                    if (layout != DATA_LAYOUT_UNKNOWN)
+                    {
+                        if (it->second == DATA_LAYOUT_UNKNOWN)
+                            it->second = layout;
+                        else if (it->second != layout)
+                            it->second = DATA_LAYOUT_UNKNOWN;
+                    }
+                }
+                else
+                    data_layouts[name] = layout;
+            }
+        }
+        catch (const std::exception& e)
+        {
+            CV_LOG_ERROR(NULL, "DNN/TF: Can't propagate layout for node='" << name << "'. Exception: " << e.what());
+            throw;
         }
     }
 
-    // find all Const layers for params
-    std::map<String, int> value_id;
-    // A map with constant blobs which are shared between multiple layers.
-    std::map<String, Mat> sharedWeights;
     addConstNodes(netBin, value_id, layers_to_ignore);
     addConstNodes(netTxt, value_id, layers_to_ignore);
 
-    std::map<String, int> layer_id;
 
     for (int li = 0; li < layersSize; li++)
     {
-        tensorflow::NodeDef layer = net.node(li);
-        String name = layer.name();
-        String type = layer.op();
+        const tensorflow::NodeDef& layer = net.node(li);
+
+        const std::string name = layer.name();
+        const std::string type = layer.op();
+        const int ninputs = layer.input_size();
+        CV_LOG_DEBUG(NULL, "DNN/TF: (" << li << "/" << layersSize << ") Parse layer " << name << " @ " << type << " with " << ninputs << " inputs");
+
+        parseNode(layer);
+    }
+
+    for (size_t i = 0; i < netInputsNames.size(); i++)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: Model input: " << i << " - '" << netInputsNames[i] << "'");
+        CV_Assert(!netInputsNames[i].empty());
+    }
+    dstNet.setInputsNames(netInputsNames);
+    CV_LOG_DEBUG(NULL, "DNN/TF: ===================== Import completed =====================");
+}
+
+void TFImporter::parseNode(const tensorflow::NodeDef& layer_)
+{
+    tensorflow::NodeDef layer = layer_;
+
+    tensorflow::GraphDef& net = netTxt.ByteSize() != 0 ? netTxt : netBin;
+
+    /*const*/ std::string name = layer.name();
+    /*const*/ std::string type = layer.op();
+    /*const*/ int num_inputs = layer.input_size();
+
+    try
+    {
         LayerParams layerParams;
 
-        if(layers_to_ignore.find(name) != layers_to_ignore.end())
-            continue;
+        if (layers_to_ignore.find(name) != layers_to_ignore.end())
+        {
+            CV_LOG_DEBUG(NULL, "DNN/TF:     ignored");
+            return;
+        }
 
-        int predictedLayout = predictOutputDataLayout(net, layer, data_layouts);
+        DataLayout predictedLayout = predictOutputDataLayout(layer);
         data_layouts[name] = predictedLayout;
 
         if (type == "Conv2D" || type == "SpaceToBatchND" || type == "DepthwiseConv2dNative" || type == "Pad" || type == "MirrorPad" || type == "Conv3D")
         {
+            CV_CheckGT(num_inputs, 0, "");
             // The first node of dilated convolution subgraph.
             // Extract input node, dilation rate and paddings.
             std::string input = layer.input(0);
@@ -824,7 +938,7 @@ void TFImporter::populateNet(Net dstNet)
                 // input: "input"
                 // input: "SpaceToBatchND/block_shape"
                 // input: "SpaceToBatchND/paddings"
-                CV_Assert(layer.input_size() == 3);
+                CV_CheckEQ(num_inputs, 3, "");
 
                 DictValue dilation = parseDims(getConstBlob(layer, value_id, 1));
                 CV_Assert(dilation.size() == 2);
@@ -839,10 +953,14 @@ void TFImporter::populateNet(Net dstNet)
                 layerParams.set("pad_w", paddings.at<float>(2));
 
                 CV_Assert(next_layers.size() == 1);
-                layer = net.node(next_layers[0].second);
                 layers_to_ignore.insert(next_layers[0].first);
+
+                // FIXIT don't override, rewrite this code
+                layer = net.node(next_layers[0].second);
                 name = layer.name();
                 type = layer.op();
+                num_inputs = layer.input_size();
+                CV_LOG_DEBUG(NULL, "DNN/TF:     switched to layer " << name << " @ " << type << ") with " << num_inputs << " inputs");
             }
             else if (type == "Pad" || type == "MirrorPad")
             {
@@ -876,7 +994,7 @@ void TFImporter::populateNet(Net dstNet)
                     layer_id[name] = id;
 
                     connect(layer_id, dstNet, parsePin(input), id, 0);
-                    continue;
+                    return;
                 }
                 else
                 {
@@ -886,10 +1004,14 @@ void TFImporter::populateNet(Net dstNet)
                     layerParams.set("pad_h", paddings.at<int32_t>(4));
                     layerParams.set("pad_w", paddings.at<int32_t>(6));
 
-                    layer = net.node(next_layers[0].second);
                     layers_to_ignore.insert(next_layers[0].first);
+
+                    // FIXIT don't override, rewrite this code
+                    layer = net.node(next_layers[0].second);
                     name = layer.name();
                     type = layer.op();
+                    num_inputs = layer.input_size();
+                    CV_LOG_DEBUG(NULL, "DNN/TF:     switched to layer " << name << " @ " << type << ") with " << num_inputs << " inputs");
                 }
             }
 
@@ -1011,13 +1133,14 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "BiasAdd" || type == "Add" || type == "AddV2" || type == "Sub" || type=="AddN")
         {
+            CV_CheckGT(num_inputs, 0, "");
             bool haveConst = false;
-            for(int ii = 0; !haveConst && ii < layer.input_size(); ++ii)
+            for(int ii = 0; !haveConst && ii < num_inputs; ++ii)
             {
                 Pin input = parsePin(layer.input(ii));
                 haveConst = value_id.find(input.name) != value_id.end();
             }
-            CV_Assert(!haveConst || layer.input_size() == 2);
+            CV_Assert(!haveConst || num_inputs == 2);
 
             if (haveConst)
             {
@@ -1054,7 +1177,7 @@ void TFImporter::populateNet(Net dstNet)
                 int id = dstNet.addLayer(name, "Eltwise", layerParams);
                 layer_id[name] = id;
 
-                for (int ii = 0; ii < layer.input_size(); ii++)
+                for (int ii = 0; ii < num_inputs; ii++)
                 {
                     Pin inp = parsePin(layer.input(ii));
                     if (layer_id.find(inp.name) == layer_id.end())
@@ -1065,7 +1188,7 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "MatMul")
         {
-            CV_Assert(layer.input_size() == 2);
+            CV_CheckEQ(num_inputs, 2, "");
 
             // For the object detection networks, TensorFlow Object Detection API
             // predicts deltas for bounding boxes in yxYX (ymin, xmin, ymax, xmax)
@@ -1077,7 +1200,7 @@ void TFImporter::populateNet(Net dstNet)
             layerParams.set("bias_term", false);
             layerParams.blobs.resize(1);
 
-            StrIntVector next_layers = getNextLayers(net, name, "BiasAdd");
+            StrIntVector next_layers = getNextLayers(net, name, "BiasAdd");  // FIXIT Use layers fusion instead
             if (next_layers.empty())
             {
                 next_layers = getNextLayers(net, name, "Add");
@@ -1135,8 +1258,9 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "Reshape")
         {
+            CV_CheckGT(num_inputs, 0, "");
             Pin inpId = parsePin(layer.input(0));
-            int inpLayout = getDataLayout(layer.input(0), data_layouts);
+            DataLayout inpLayout = getDataLayout(layer.input(0), data_layouts);
             // There are two possible implementations: reshape an input using
             // predefined sizes or use a second input blob as a source of new shape.
             if (value_id.find(layer.input(1)) != value_id.end())
@@ -1185,6 +1309,7 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "Flatten" || type == "Squeeze")
         {
+            CV_CheckGT(num_inputs, 0, "");
             Pin inpId = parsePin(layer.input(0));
             int inpLayout = getDataLayout(layer.input(0), data_layouts);
             if (type == "Squeeze")
@@ -1231,6 +1356,7 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "Transpose")
         {
+            CV_CheckGT(num_inputs, 0, "");
             Mat perm = getTensorContent(getConstBlob(layer, value_id, 1));
             CV_Assert(perm.type() == CV_32SC1);
             int* permData = (int*)perm.data;
@@ -1304,6 +1430,7 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "LRN")
         {
+            CV_CheckGT(num_inputs, 0, "");
             if(hasLayerAttr(layer, "alpha")) {
                 layerParams.set("alpha", getLayerAttr(layer, "alpha").f());
             }
@@ -1322,11 +1449,12 @@ void TFImporter::populateNet(Net dstNet)
             int id = dstNet.addLayer(name, "LRN", layerParams);
             layer_id[name] = id;
 
-            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size());
+            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
         }
         else if (type == "Concat" || type == "ConcatV2")
         {
-            int axisId = (type == "Concat" ? 0 : layer.input_size() - 1);
+            CV_CheckGT(num_inputs, 0, "");
+            int axisId = (type == "Concat" ? 0 : num_inputs - 1);
             int axis = getConstBlob(layer, value_id, axisId).int_val().Get(0);
 
             if (getDataLayout(name, data_layouts) == DATA_LAYOUT_NHWC)
@@ -1337,7 +1465,7 @@ void TFImporter::populateNet(Net dstNet)
 
             // input(0) or input(n-1) is concat_dim
             int from = (type == "Concat" ? 1 : 0);
-            int to = (type == "Concat" ? layer.input_size() : layer.input_size() - 1);
+            int to = (type == "Concat" ? num_inputs : num_inputs - 1);
 
             for (int ii = from; ii < to; ii++)
             {
@@ -1370,6 +1498,7 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "MaxPool" || type == "MaxPool3D")
         {
+            CV_CheckGT(num_inputs, 0, "");
             layerParams.set("pool", "max");
 
             setKSize(layerParams, layer);
@@ -1381,10 +1510,11 @@ void TFImporter::populateNet(Net dstNet)
             int id = dstNet.addLayer(name, "Pooling", layerParams);
             layer_id[name] = id;
 
-            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size());
+            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
         }
         else if (type == "AvgPool" || type == "AvgPool3D")
         {
+            CV_CheckGT(num_inputs, 0, "");
             layerParams.set("pool", "ave");
             layerParams.set("ave_pool_padded_area", false);
             setKSize(layerParams, layer);
@@ -1394,11 +1524,11 @@ void TFImporter::populateNet(Net dstNet)
             int id = dstNet.addLayer(name, "Pooling", layerParams);
             layer_id[name] = id;
 
-            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size());
+            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
         }
         else if (type == "MaxPoolGrad")
         {
-            CV_Assert(layer.input_size() == 3);
+            CV_CheckEQ(num_inputs, 3, "");
 
             layerParams.set("pool_k_h", 0);
             layerParams.set("pool_k_w", 0);
@@ -1457,7 +1587,7 @@ void TFImporter::populateNet(Net dstNet)
             // TODO: slicing input may be Const op
             // TODO: slicing kernels for convolutions - in current implementation it is impossible
             // TODO: add parsing num of slices parameter
-            CV_Assert(layer.input_size() == 2);
+            CV_CheckEQ(num_inputs, 2, "");
             // num_split
             // 1st blob is dims tensor
             int axis = getConstBlob(layer, value_id, 0).int_val().Get(0);
@@ -1480,7 +1610,7 @@ void TFImporter::populateNet(Net dstNet)
             // input: "input_node"
             // input: "Slice/begin"
             // input: "Slice/size"
-            CV_Assert(layer.input_size() == 3);
+            CV_CheckEQ(num_inputs, 3, "");
             Mat begins = getTensorContent(getConstBlob(layer, value_id, 1));
             Mat sizes = getTensorContent(getConstBlob(layer, value_id, 2));
             CV_Assert_N(!begins.empty(), !sizes.empty());
@@ -1505,7 +1635,7 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "StridedSlice")
         {
-            CV_Assert(layer.input_size() == 4);
+            CV_CheckEQ(num_inputs, 4, "");
             Mat begins = getTensorContent(getConstBlob(layer, value_id, 1));
             Mat ends = getTensorContent(getConstBlob(layer, value_id, 2));
             Mat strides = getTensorContent(getConstBlob(layer, value_id, 3));
@@ -1544,8 +1674,9 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "Mul" || type == "RealDiv")
         {
+            CV_CheckGT(num_inputs, 0, "");
             int constId = -1;
-            for(int ii = 0; ii < layer.input_size(); ++ii)
+            for(int ii = 0; ii < num_inputs; ++ii)
             {
                 Pin input = parsePin(layer.input(ii));
                 if (value_id.find(input.name) != value_id.end())
@@ -1554,12 +1685,12 @@ void TFImporter::populateNet(Net dstNet)
                     break;
                 }
             }
-            CV_Assert((constId != -1) || (layer.input_size() == 2));
+            CV_Assert((constId != -1) || (num_inputs == 2));
 
             if (constId != -1)
             {
                 // Multiplication by constant.
-                CV_Assert(layer.input_size() == 2);
+                CV_CheckEQ(num_inputs, 2, "");
                 Mat scaleMat = getTensorContent(getConstBlob(layer, value_id));
                 CV_Assert(scaleMat.type() == CV_32FC1);
                 if (type == "RealDiv")
@@ -1643,7 +1774,7 @@ void TFImporter::populateNet(Net dstNet)
                 // Check if all the inputs have the same shape.
                 bool equalInpShapes = true;
                 MatShape outShape0;
-                for (int ii = 0; ii < layer.input_size() && !netInputShapes.empty(); ii++)
+                for (int ii = 0; ii < num_inputs && !netInputShapes.empty(); ii++)
                 {
                     Pin pin = parsePin(layer.input(ii));
                     int inpId = layer_id.find(pin.name)->second;
@@ -1681,7 +1812,7 @@ void TFImporter::populateNet(Net dstNet)
 
                 layer_id[name] = id;
 
-                for (int ii = 0; ii < layer.input_size(); ii++)
+                for (int ii = 0; ii < num_inputs; ii++)
                 {
                     Pin inp = parsePin(layer.input(ii));
                     if (layer_id.find(inp.name) == layer_id.end())
@@ -1698,9 +1829,7 @@ void TFImporter::populateNet(Net dstNet)
             // input: "BatchNorm/beta"
             // input: "BatchNorm/moving_mean"
             // input: "BatchNorm/moving_variance"
-            if (layer.input_size() != 5)
-                CV_Error(Error::StsNotImplemented,
-                         "Expected gamma, beta, mean and std");
+            CV_CheckEQ(num_inputs, 5, "Expected gamma, beta, mean and std");
             Pin inpId = parsePin(layer.input(0));
 
             bool isTraining = hasLayerAttr(layer, "is_training") && getLayerAttr(layer, "is_training").b();
@@ -1768,9 +1897,7 @@ void TFImporter::populateNet(Net dstNet)
             // input: "conv2d_transpose/output_shape"
             // input: "weights"
             // input: "input"
-            if (layer.input_size() != 3)
-                CV_Error(Error::StsNotImplemented,
-                         "Expected output shape, weights and input nodes");
+            CV_CheckEQ(num_inputs, 3, "Expected output shape, weights and input nodes");
 
             layerParams.set("bias_term", false);
             layerParams.blobs.resize(1);
@@ -1845,8 +1972,7 @@ void TFImporter::populateNet(Net dstNet)
             // input: "lstm_block_wrapper/w_f_diag"
             // input: "lstm_block_wrapper/w_o_diag"
             // input: "lstm_block_wrapper/bias"
-            if (layer.input_size() != 9)
-                CV_Error(Error::StsNotImplemented, "Unexpected number of input nodes");
+            CV_CheckEQ(num_inputs, 9, "Unexpected number of input nodes");
 
             if (hasLayerAttr(layer, "forget_bias"))
                 layerParams.set("forget_bias", getLayerAttr(layer, "forget_bias").f());
@@ -1912,6 +2038,7 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "ResizeNearestNeighbor" || type == "ResizeBilinear" || type == "FusedResizeAndPadConv2D")
         {
+            CV_CheckGT(num_inputs, 0, "");
             std::string convWeights = "";
             if (type == "FusedResizeAndPadConv2D")
             {
@@ -1919,30 +2046,32 @@ void TFImporter::populateNet(Net dstNet)
                 // input: "decoder/ResizeBilinear/size"
                 // input: "decoder/decoder_conv0/Conv2D_dummy_paddings"
                 // input: "decoder/decoder_conv0/weights"
-                CV_CheckEQ(layer.input_size(), 4, "Number of input for FusedResizeAndPadConv2D");
+                CV_CheckEQ(num_inputs, 4, "Number of input for FusedResizeAndPadConv2D");
 
                 Mat paddings = getTensorContent(getConstBlob(layer, value_id, 2));
                 CV_CheckEQ(countNonZero(paddings), 0, "Unsupported mode");
 
                 convWeights = layer.input(3);
-                layer.mutable_input()->DeleteSubrange(2, 2);
+                layer.mutable_input()->DeleteSubrange(2, 2);  // FIXIT do NOT modify input model
+                num_inputs = layer.input_size();
                 name = name + "/resize";
 
                 if (hasLayerAttr(layer, "resize_align_corners"))
                 {
+                    // FIXIT do NOT modify input model
                     layer.mutable_attr()->insert(
                         ::google::protobuf::MapPair<std::string, tensorflow::AttrValue>("align_corners",
                                                                                         getLayerAttr(layer, "resize_align_corners")));
                 }
             }
-            if (layer.input_size() == 2)
+            if (num_inputs == 2)
             {
                 Mat outSize = getTensorContent(getConstBlob(layer, value_id, 1));
                 CV_CheckTypeEQ(outSize.type(), CV_32SC1, ""); CV_CheckEQ(outSize.total(), (size_t)2, "");
                 layerParams.set("height", outSize.at<int>(0, 0));
                 layerParams.set("width", outSize.at<int>(0, 1));
             }
-            else if (layer.input_size() == 3)
+            else if (num_inputs == 3)
             {
                 Mat factorHeight = getTensorContent(getConstBlob(layer, value_id, 1));
                 Mat factorWidth = getTensorContent(getConstBlob(layer, value_id, 2));
@@ -1952,7 +2081,7 @@ void TFImporter::populateNet(Net dstNet)
                 layerParams.set("zoom_factor_y", factorHeight.at<float>(0));
             }
             else
-                CV_Assert(layer.input_size() == 2 || layer.input_size() == 3);
+                CV_Check(num_inputs, num_inputs == 2 || num_inputs == 3, "");
 
             if (type == "ResizeNearestNeighbor")
                 layerParams.set("interpolation", "nearest");
@@ -1973,12 +2102,12 @@ void TFImporter::populateNet(Net dstNet)
             // Step back to add convolution
             if (type == "FusedResizeAndPadConv2D")
             {
-                tensorflow::NodeDef* conv = net.mutable_node(li);
-                conv->clear_input();
-                conv->add_input(name);
-                conv->add_input(convWeights);
-                conv->set_op("Conv2D");
-                li -= 1;
+                tensorflow::NodeDef conv = layer_;
+                conv.clear_input();
+                conv.add_input(name);
+                conv.add_input(convWeights);
+                conv.set_op("Conv2D");
+                parseNode(conv);
             }
         }
         else if (type == "L2Normalize")
@@ -1986,7 +2115,7 @@ void TFImporter::populateNet(Net dstNet)
             // op: "L2Normalize"
             // input: "input"
             // input: "reduction_indices" (axis)
-            CV_Assert(layer.input_size() == 2);
+            CV_CheckEQ(num_inputs, 2, "");
             Mat reductionIndices = getTensorContent(getConstBlob(layer, value_id, 1));
             CV_Assert(reductionIndices.type() == CV_32SC1);
 
@@ -2011,6 +2140,7 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "PriorBox")
         {
+            CV_CheckEQ(num_inputs, 2, "");
             if (hasLayerAttr(layer, "min_size"))
                 layerParams.set("min_size", getLayerAttr(layer, "min_size").i());
             if (hasLayerAttr(layer, "max_size"))
@@ -2043,12 +2173,13 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "Softmax")
         {
+            CV_CheckGT(num_inputs, 0, "");
             if (hasLayerAttr(layer, "axis"))
                 layerParams.set("axis", getLayerAttr(layer, "axis").i());
 
             int id = dstNet.addLayer(name, "Softmax", layerParams);
             layer_id[name] = id;
-            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size());
+            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
         }
         else if (type == "CropAndResize")
         {
@@ -2056,7 +2187,7 @@ void TFImporter::populateNet(Net dstNet)
             // input: "input"
             // input: "boxes"
             // input: "sizes"
-            CV_Assert(layer.input_size() == 3);
+            CV_CheckEQ(num_inputs, 3, "");
 
             Mat cropSize = getTensorContent(getConstBlob(layer, value_id, 2));
             CV_CheckTypeEQ(cropSize.type(), CV_32SC1, ""); CV_CheckEQ(cropSize.total(), (size_t)2, "");
@@ -2084,6 +2215,7 @@ void TFImporter::populateNet(Net dstNet)
             // determine out shape: NxCxHxW --Slice--> 1xCxHxW
             //                      out_shape = 1xCxHxW if keepDims else (1xCxHxW --Flatten--> CxHxW)
             // global pool: NxCxHxW --Flatten--> Nx(C*H*W) --Reshape--> 1x1xNx(C*H*W) --Pooling--> 1x1x1x(C*H*W) --Reshape--> out_shape
+            CV_CheckGT(num_inputs, 0, "");
 
             Mat indices = getTensorContent(getConstBlob(layer, value_id, 1));
             CV_Assert(indices.type() == CV_32SC1);
@@ -2218,6 +2350,7 @@ void TFImporter::populateNet(Net dstNet)
             // Example: given a list with "N" tensors of shape (C, H, W):
             // if axis == 0 then the output tensor will have the shape (N, C, H, W),
             // if axis == 1 then the output tensor will have the shape (C, N, H, W).
+            CV_CheckGT(num_inputs, 0, "");
             CV_Assert(hasLayerAttr(layer, "axis"));
             int dim = (int)getLayerAttr(layer, "axis").i();
             if (dim != 0)
@@ -2225,7 +2358,7 @@ void TFImporter::populateNet(Net dstNet)
 
             CV_Assert(hasLayerAttr(layer, "N"));
             int num = (int)getLayerAttr(layer, "N").i();
-            CV_Assert(layer.input_size() == num);
+            CV_CheckEQ(num_inputs, num, "");
             std::string base_name = name + "/reshape_";
             std::vector<int> reshape_ids;
             for (int i = 0; i < num; i++) {
@@ -2256,7 +2389,7 @@ void TFImporter::populateNet(Net dstNet)
             // input: "input"
             // input: "mix"
             // input: "max"
-            CV_Assert(layer.input_size() == 3);
+            CV_CheckEQ(num_inputs, 3, "");
 
             Mat minValue = getTensorContent(getConstBlob(layer, value_id, 1));
             Mat maxValue = getTensorContent(getConstBlob(layer, value_id, 2));
@@ -2275,6 +2408,7 @@ void TFImporter::populateNet(Net dstNet)
                  type == "Relu" || type == "Elu" ||
                  type == "Identity" || type == "Relu6")
         {
+            CV_CheckGT(num_inputs, 0, "");
             std::string dnnType = type;
             if (type == "Abs") dnnType = "AbsVal";
             else if (type == "Tanh") dnnType = "TanH";
@@ -2284,7 +2418,7 @@ void TFImporter::populateNet(Net dstNet)
 
             int id = dstNet.addLayer(name, dnnType, layerParams);
             layer_id[name] = id;
-            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size());
+            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
         }
         else
         {
@@ -2308,7 +2442,7 @@ void TFImporter::populateNet(Net dstNet)
 
             // All the Const input nodes are added to layer's blobs.
             std::vector<std::string> inputsNames;
-            for (int i = 0; i < layer.input_size(); ++i)
+            for (int i = 0; i < num_inputs; ++i)
             {
                 // Check if input is a Const node.
                 if (value_id.find(layer.input(i)) != value_id.end())
@@ -2328,7 +2462,11 @@ void TFImporter::populateNet(Net dstNet)
             }
         }
     }
-    dstNet.setInputsNames(netInputsNames);
+    catch (const std::exception& e)
+    {
+        CV_LOG_ERROR(NULL, "DNN/TF: Can't parse layer for node='" << name << "'. Exception: " << e.what());
+        throw;
+    }
 }
 
 } // namespace
@@ -2337,18 +2475,16 @@ void TFImporter::populateNet(Net dstNet)
 
 Net readNetFromTensorflow(const String &model, const String &config)
 {
-    TFImporter importer(model.c_str(), config.c_str());
     Net net;
-    importer.populateNet(net);
+    TFImporter importer(net, model.c_str(), config.c_str());
     return net;
 }
 
 Net readNetFromTensorflow(const char* bufferModel, size_t lenModel,
                           const char* bufferConfig, size_t lenConfig)
 {
-    TFImporter importer(bufferModel, lenModel, bufferConfig, lenConfig);
     Net net;
-    importer.populateNet(net);
+    TFImporter importer(net, bufferModel, lenModel, bufferConfig, lenConfig);
     return net;
 }
 

From 8f66dad65b8ffef1e6aea59159f38258da480318 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Wed, 30 Dec 2020 16:20:27 -0500
Subject: [PATCH 02/11] Fix typo in hough_lines tutorial

Argument `minLinLength` -> `minLineLength`
---
 doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown b/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown
index d9687e2a1d82..9a08d0ccff6c 100644
--- a/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown
+++ b/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown
@@ -217,7 +217,7 @@ First you apply the transform:
         -   *theta*: The resolution of the parameter \f$\theta\f$ in radians. We use **1 degree**
             (CV_PI/180)
         -   *threshold*: The minimum number of intersections to "*detect*" a line
-        -   *minLinLength*: The minimum number of points that can form a line. Lines with less than
+        -   *minLineLength*: The minimum number of points that can form a line. Lines with less than
             this number of points are disregarded.
         -   *maxLineGap*: The maximum gap between two points to be considered in the same line.
 

From 4f9cb3871a5d6552ef25229c2f81d2e1531c7df9 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Fri, 1 Jan 2021 13:38:35 +0000
Subject: [PATCH 03/11] copyright: 2021

---
 LICENSE | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/LICENSE b/LICENSE
index 5e42451a18f0..b6c62961250f 100644
--- a/LICENSE
+++ b/LICENSE
@@ -7,13 +7,13 @@ copy or use the software.
                For Open Source Computer Vision Library
                        (3-clause BSD License)
 
-Copyright (C) 2000-2020, Intel Corporation, all rights reserved.
+Copyright (C) 2000-2021, Intel Corporation, all rights reserved.
 Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
 Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved.
 Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved.
+Copyright (C) 2015-2021, OpenCV Foundation, all rights reserved.
 Copyright (C) 2015-2016, Itseez Inc., all rights reserved.
-Copyright (C) 2019-2020, Xperience AI, all rights reserved.
+Copyright (C) 2019-2021, Xperience AI, all rights reserved.
 Third party copyrights are property of their respective owners.
 
 Redistribution and use in source and binary forms, with or without modification,

From 2d2d72afbb6a4f1936e2fa5ba28fc1744a8047d2 Mon Sep 17 00:00:00 2001
From: notmatthancock <mhancock743@gmail.com>
Date: Tue, 5 Jan 2021 05:33:57 -0600
Subject: [PATCH 04/11] Merge pull request #19258 from
 notmatthancock:doc/update-sift-patent-blurb

* Remove sentences about SIFT patent and code location

* Add note about patent in opencv-specific section
---
 .../py_sift_intro/py_sift_intro.markdown        | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown b/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown
index 656f5423c5ac..dee4df774ae8 100644
--- a/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown
+++ b/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown
@@ -20,10 +20,10 @@ scale invariant.
 
 ![image](images/sift_scale_invariant.jpg)
 
-So, in 2004, **D.Lowe**, University of British Columbia, came up with a new algorithm, Scale
+In 2004, **D.Lowe**, University of British Columbia, came up with a new algorithm, Scale
 Invariant Feature Transform (SIFT) in his paper, **Distinctive Image Features from Scale-Invariant
 Keypoints**, which extract keypoints and compute its descriptors. *(This paper is easy to understand
-and considered to be best material available on SIFT. So this explanation is just a short summary of
+and considered to be best material available on SIFT. This explanation is just a short summary of
 this paper)*.
 
 There are mainly four steps involved in SIFT algorithm. We will see them one-by-one.
@@ -102,16 +102,17 @@ reasons. In that case, ratio of closest-distance to second-closest distance is t
 greater than 0.8, they are rejected. It eliminates around 90% of false matches while discards only
 5% correct matches, as per the paper.
 
-So this is a summary of SIFT algorithm. For more details and understanding, reading the original
-paper is highly recommended. Remember one thing, this algorithm is patented. So this algorithm is
-included in [the opencv contrib repo](https://github.com/opencv/opencv_contrib)
+This is a summary of SIFT algorithm. For more details and understanding, reading the original
+paper is highly recommended.
 
 SIFT in OpenCV
 --------------
 
-So now let's see SIFT functionalities available in OpenCV. Let's start with keypoint detection and
-draw them. First we have to construct a SIFT object. We can pass different parameters to it which
-are optional and they are well explained in docs.
+Now let's see SIFT functionalities available in OpenCV. Note that these were previously only
+available in [the opencv contrib repo](https://github.com/opencv/opencv_contrib), but the patent
+expired in the year 2020. So they are now included in the main repo. Let's start with keypoint
+detection and draw them. First we have to construct a SIFT object. We can pass different
+parameters to it which are optional and they are well explained in docs.
 @code{.py}
 import numpy as np
 import cv2 as cv

From 0043d9bf13ddeb3c4de49461fd6b6f5b3a0589ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20Piffault?= <greg.piffault@gmail.com>
Date: Fri, 8 Jan 2021 11:48:57 +0100
Subject: [PATCH 05/11] docs(tutorial_js_usage): fix imshow usage

---
 doc/js_tutorials/js_setup/js_usage/js_usage.markdown | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/js_tutorials/js_setup/js_usage/js_usage.markdown b/doc/js_tutorials/js_setup/js_usage/js_usage.markdown
index e2191e6d4139..5a8c3b87facf 100644
--- a/doc/js_tutorials/js_setup/js_usage/js_usage.markdown
+++ b/doc/js_tutorials/js_setup/js_usage/js_usage.markdown
@@ -82,7 +82,7 @@ In this tutorial, we just show a cv.Mat on screen. To show a cv.Mat, you need a
 
 You can use cv.imshow to show cv.Mat on the canvas.
 @code{.js}
-cv.imshow(mat, "outputCanvas");
+cv.imshow("outputCanvas", mat);
 @endcode
 
 Putting all of the steps together, the final index.html is shown below.

From 87b3b48fe099bee244fb5677fb0e52b873020c7f Mon Sep 17 00:00:00 2001
From: krush11 <krushnalpatel11@gmail.com>
Date: Sun, 10 Jan 2021 11:11:23 +0530
Subject: [PATCH 06/11] added multi-config to regexp

---
 CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fcb4a1d88f94..6ae169d56aba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1051,7 +1051,9 @@ endif()
 if(CMAKE_GENERATOR MATCHES Xcode)
   status("    Xcode:"          ${XCODE_VERSION})
 endif()
-if(NOT CMAKE_GENERATOR MATCHES "Xcode|Visual Studio")
+if(CMAKE_GENERATOR MATCHES "Xcode|Visual Studio|Multi-Config")
+  status("    Configuration:"  ${CMAKE_CONFIGURATION_TYPES})
+else()
   status("    Configuration:"  ${CMAKE_BUILD_TYPE})
 endif()
 

From 8f653ba8decc76f896e4a5e578302c8443eaacc7 Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov <vitaly.tuzov@intel.com>
Date: Fri, 25 Dec 2020 15:36:07 +0300
Subject: [PATCH 07/11] Inlined WASM fallback intrinsics to avoid using of
 V_TypeTraits

---
 .../include/opencv2/core/hal/intrin_wasm.hpp  | 1578 ++---------------
 1 file changed, 138 insertions(+), 1440 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
index 22c4a34e52c6..2f835bb9f80f 100644
--- a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
@@ -257,221 +257,20 @@ struct v_float64x2
     v128_t val;
 };
 
-namespace fallback
-{
-
-template<typename _Tp, int n> struct v_reg
-{
-    typedef _Tp lane_type;
-    enum { nlanes = n };
-
-    explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
-
-    v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
-
-    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
-
-    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
-           _Tp s4, _Tp s5, _Tp s6, _Tp s7)
-    {
-        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
-        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
-    }
-
-    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
-           _Tp s4, _Tp s5, _Tp s6, _Tp s7,
-           _Tp s8, _Tp s9, _Tp s10, _Tp s11,
-           _Tp s12, _Tp s13, _Tp s14, _Tp s15)
-    {
-        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
-        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
-        s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
-        s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
-    }
-
-    v_reg() {}
-
-    v_reg(const v_reg<_Tp, n> & r)
-    {
-        for( int i = 0; i < n; i++ )
-            s[i] = r.s[i];
-    }
-
-    _Tp get0() const { return s[0]; }
-
-    _Tp get(const int i) const { return s[i]; }
-    v_reg<_Tp, n> high() const
-    {
-        v_reg<_Tp, n> c;
-        int i;
-        for( i = 0; i < n/2; i++ )
-        {
-            c.s[i] = s[i+(n/2)];
-            c.s[i+(n/2)] = 0;
-        }
-        return c;
-    }
-
-    static v_reg<_Tp, n> zero()
-    {
-        v_reg<_Tp, n> c;
-        for( int i = 0; i < n; i++ )
-            c.s[i] = (_Tp)0;
-        return c;
-    }
-
-    static v_reg<_Tp, n> all(_Tp s)
-    {
-        v_reg<_Tp, n> c;
-        for( int i = 0; i < n; i++ )
-            c.s[i] = s;
-        return c;
-    }
-
-    template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
-    {
-        size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
-        v_reg<_Tp2, n2> c;
-        std::memcpy(&c.s[0], &s[0], bytes);
-        return c;
-    }
-
-    v_reg(const cv::v_uint8x16& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_int8x16& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_uint16x8& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_int16x8& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_uint32x4& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_int32x4& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_float32x4& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_float64x2& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_uint64x2& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_int64x2& v) { wasm_v128_store(&s, v.val); }
-
-    operator cv::v_uint8x16() const { return cv::v_uint8x16(wasm_v128_load(&s)); }
-    operator cv::v_int8x16() const { return cv::v_int8x16(wasm_v128_load(&s)); }
-    operator cv::v_uint16x8() const { return cv::v_uint16x8(wasm_v128_load(&s)); }
-    operator cv::v_int16x8() const { return cv::v_int16x8(wasm_v128_load(&s)); }
-    operator cv::v_uint32x4() const { return cv::v_uint32x4(wasm_v128_load(&s)); }
-    operator cv::v_int32x4() const { return cv::v_int32x4(wasm_v128_load(&s)); }
-    operator cv::v_float32x4() const { return cv::v_float32x4(wasm_v128_load(&s)); }
-    operator cv::v_float64x2() const { return cv::v_float64x2(wasm_v128_load(&s)); }
-    operator cv::v_uint64x2() const { return cv::v_uint64x2(wasm_v128_load(&s)); }
-    operator cv::v_int64x2() const { return cv::v_int64x2(wasm_v128_load(&s)); }
-
-    _Tp s[n];
-};
-
-typedef v_reg<uchar, 16> v_uint8x16;
-typedef v_reg<schar, 16> v_int8x16;
-typedef v_reg<ushort, 8> v_uint16x8;
-typedef v_reg<short, 8> v_int16x8;
-typedef v_reg<unsigned, 4> v_uint32x4;
-typedef v_reg<int, 4> v_int32x4;
-typedef v_reg<float, 4> v_float32x4;
-typedef v_reg<double, 2> v_float64x2;
-typedef v_reg<uint64, 2> v_uint64x2;
-typedef v_reg<int64, 2> v_int64x2;
-
-#define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
-template<typename _Tp, int n> inline v_reg<_Tp, n> \
-    operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    v_reg<_Tp, n> c; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
-    return c; \
-} \
-template<typename _Tp, int n> inline v_reg<_Tp, n>& \
-    operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    for( int i = 0; i < n; i++ ) \
-        a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
-    return a; \
-}
-
-OPENCV_HAL_IMPL_BIN_OP(+)
-OPENCV_HAL_IMPL_BIN_OP(-)
-OPENCV_HAL_IMPL_BIN_OP(*)
-OPENCV_HAL_IMPL_BIN_OP(/)
-
-#define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
-template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
-    (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    v_reg<_Tp, n> c; \
-    typedef typename V_TypeTraits<_Tp>::int_type itype; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
-                                                        V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
-    return c; \
-} \
-template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
-    bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    typedef typename V_TypeTraits<_Tp>::int_type itype; \
-    for( int i = 0; i < n; i++ ) \
-        a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
-                                                        V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
-    return a; \
-}
-
-OPENCV_HAL_IMPL_BIT_OP(&)
-OPENCV_HAL_IMPL_BIT_OP(|)
-OPENCV_HAL_IMPL_BIT_OP(^)
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i]));
-    }
-    return c;
-}
-
-#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
-template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
-{ \
-    v_reg<_Tp2, n> c; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = cfunc(a.s[i]); \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
-OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
-OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
-OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
-OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
-OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
-                          typename V_TypeTraits<_Tp>::abs_type)
-OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
-OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
-OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
-OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
-
-#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
-template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    v_reg<_Tp, n> c; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = cfunc(a.s[i], b.s[i]); \
-    return c; \
-}
-
-#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
-template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
-{ \
-    _Tp c = a.s[0]; \
-    for( int i = 1; i < n; i++ ) \
-        c = cfunc(c, a.s[i]); \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)
-OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)
-OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)
-OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
+namespace
+{
+#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \
+inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; }
+OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
+OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
+OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
+OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
+OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
+OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
+OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)
 
 static const unsigned char popCountTable[] =
 {
@@ -492,1184 +291,7 @@ static const unsigned char popCountTable[] =
     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
 };
-
-template<typename _Tp, int n>
-inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_popcount(const v_reg<_Tp, n>& a)
-{
-    v_reg<typename V_TypeTraits<_Tp>::abs_type, n> b = v_reg<typename V_TypeTraits<_Tp>::abs_type, n>::zero();
-    for (int i = 0; i < (int)(n*sizeof(_Tp)); i++)
-        b.s[i/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[i]];
-    return b;
-}
-
-template<typename _Tp, int n>
-inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                      v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
-{
-    for( int i = 0; i < n; i++ )
-    {
-        minval.s[i] = std::min(a.s[i], b.s[i]);
-        maxval.s[i] = std::max(a.s[i], b.s[i]);
-    }
-}
-
-#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
-template<typename _Tp, int n> \
-inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    typedef typename V_TypeTraits<_Tp>::int_type itype; \
-    v_reg<_Tp, n> c; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_CMP_OP(<)
-OPENCV_HAL_IMPL_CMP_OP(>)
-OPENCV_HAL_IMPL_CMP_OP(<=)
-OPENCV_HAL_IMPL_CMP_OP(>=)
-OPENCV_HAL_IMPL_CMP_OP(==)
-OPENCV_HAL_IMPL_CMP_OP(!=)
-
-template<int n>
-inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
-{
-    typedef typename V_TypeTraits<float>::int_type itype;
-    v_reg<float, n> c;
-    for (int i = 0; i < n; i++)
-        c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
-    return c;
-}
-template<int n>
-inline v_reg<double, n> v_not_nan(const v_reg<double, n>& a)
-{
-    typedef typename V_TypeTraits<double>::int_type itype;
-    v_reg<double, n> c;
-    for (int i = 0; i < n; i++)
-        c.s[i] = V_TypeTraits<double>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
-    return c;
-}
-
-#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
-template<typename _Tp, int n> \
-inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    typedef _Tp2 rtype; \
-    v_reg<rtype, n> c; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp)
-OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp)
-OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp)
-
-template<typename T> inline T _absdiff(T a, T b)
-{
-    return a > b ? a - b : b - a;
-}
-
-template<typename _Tp, int n>
-inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
-{
-    typedef typename V_TypeTraits<_Tp>::abs_type rtype;
-    v_reg<rtype, n> c;
-    const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0);
-    for( int i = 0; i < n; i++ )
-    {
-        rtype ua = a.s[i] ^ mask;
-        rtype ub = b.s[i] ^ mask;
-        c.s[i] = _absdiff(ua, ub);
-    }
-    return c;
-}
-
-inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
-{
-    v_float32x4 c;
-    for( int i = 0; i < c.nlanes; i++ )
-        c.s[i] = _absdiff(a.s[i], b.s[i]);
-    return c;
-}
-
-inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
-{
-    v_float64x2 c;
-    for( int i = 0; i < c.nlanes; i++ )
-        c.s[i] = _absdiff(a.s[i], b.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++)
-        c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i]));
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = 1.f/std::sqrt(a.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                           const v_reg<_Tp, n>& c)
-{
-    v_reg<_Tp, n> d;
-    for( int i = 0; i < n; i++ )
-        d.s[i] = a.s[i]*b.s[i] + c.s[i];
-    return d;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                              const v_reg<_Tp, n>& c)
-{
-    return v_fma(a, b, c);
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
-    v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    v_reg<w_type, n/2> c;
-    for( int i = 0; i < (n/2); i++ )
-        c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
-    return c;
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
-    v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    v_reg<w_type, n/2> s;
-    for( int i = 0; i < (n/2); i++ )
-        s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i];
-    return s;
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
-    v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    typedef typename V_TypeTraits<_Tp>::q_type q_type;
-    v_reg<q_type, n/4> s;
-    for( int i = 0; i < (n/4); i++ )
-        s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
-                 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3];
-    return s;
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
-    v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                     const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
-{
-    typedef typename V_TypeTraits<_Tp>::q_type q_type;
-    v_reg<q_type, n/4> s;
-    for( int i = 0; i < (n/4); i++ )
-        s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
-                 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i];
-    return s;
-}
-
-template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                                                       v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
-                                                       v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    for( int i = 0; i < (n/2); i++ )
-    {
-        c.s[i] = (w_type)a.s[i]*b.s[i];
-        d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
-    }
-}
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    v_reg<_Tp, n> c;
-    for (int i = 0; i < n; i++)
-        c.s[i] = (_Tp)(((w_type)a.s[i] * b.s[i]) >> sizeof(_Tp)*8);
-    return c;
-}
-
-template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
-                                                 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    for( int i = 0; i < (n/2); i++ )
-    {
-        c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
-    }
-}
-
-#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
-template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
-{ \
-    v_reg<_Tp, n> c; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = (_Tp)(a.s[i] shift_op imm); \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_SHIFT_OP(<< )
-OPENCV_HAL_IMPL_SHIFT_OP(>> )
-
-#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
-template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
-{ \
-    v_reg<_Tp, n> b; \
-    for (int i = 0; i < n; i++) \
-    { \
-        int sIndex = i opA imm; \
-        if (0 <= sIndex && sIndex < n) \
-        { \
-            b.s[i] = a.s[sIndex]; \
-        } \
-        else \
-        { \
-            b.s[i] = 0; \
-        } \
-    } \
-    return b; \
-} \
-template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    v_reg<_Tp, n> c; \
-    for (int i = 0; i < n; i++) \
-    { \
-        int aIndex = i opA imm; \
-        int bIndex = i opA imm opB n; \
-        if (0 <= bIndex && bIndex < n) \
-        { \
-            c.s[i] = b.s[bIndex]; \
-        } \
-        else if (0 <= aIndex && aIndex < n) \
-        { \
-            c.s[i] = a.s[aIndex]; \
-        } \
-        else \
-        { \
-            c.s[i] = 0; \
-        } \
-    } \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left,  -, +)
-OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)
-
-template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
-{
-    typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
-    for( int i = 1; i < n; i++ )
-        c += a.s[i];
-    return c;
-}
-
-inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
-                                 const v_float32x4& c, const v_float32x4& d)
-{
-    v_float32x4 r;
-    r.s[0] = a.s[0] + a.s[1] + a.s[2] + a.s[3];
-    r.s[1] = b.s[0] + b.s[1] + b.s[2] + b.s[3];
-    r.s[2] = c.s[0] + c.s[1] + c.s[2] + c.s[3];
-    r.s[3] = d.s[0] + d.s[1] + d.s[2] + d.s[3];
-    return r;
-}
-
-template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]);
-    for (int i = 1; i < n; i++)
-        c += _absdiff(a.s[i], b.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
-{
-    int mask = 0;
-    for( int i = 0; i < n; i++ )
-        mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
-    return mask;
-}
-
-template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
-{
-    for( int i = 0; i < n; i++ )
-        if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
-            return false;
-    return true;
-}
-
-template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
-{
-    for( int i = 0; i < n; i++ )
-        if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
-            return true;
-    return false;
-}
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
-                                                           const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    typedef V_TypeTraits<_Tp> Traits;
-    typedef typename Traits::int_type int_type;
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++ )
-    {
-        int_type m = Traits::reinterpret_int(mask.s[i]);
-        CV_DbgAssert(m == 0 || m == (~(int_type)0));  // restrict mask values: 0 or 0xff/0xffff/etc
-        c.s[i] = m ? a.s[i] : b.s[i];
-    }
-    return c;
-}
-
-template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
-                            v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
-                            v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
-{
-    for( int i = 0; i < (n/2); i++ )
-    {
-        b0.s[i] = a.s[i];
-        b1.s[i] = a.s[i+(n/2)];
-    }
-}
-
-template<typename _Tp, int n>
-inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
-v_expand_low(const v_reg<_Tp, n>& a)
-{
-    v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
-    for( int i = 0; i < (n/2); i++ )
-        b.s[i] = a.s[i];
-    return b;
-}
-
-template<typename _Tp, int n>
-inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
-v_expand_high(const v_reg<_Tp, n>& a)
-{
-    v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
-    for( int i = 0; i < (n/2); i++ )
-        b.s[i] = a.s[i+(n/2)];
-    return b;
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
-    v_reinterpret_as_int(const v_reg<_Tp, n>& a)
-{
-    v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
-    v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
-{
-    v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
-                                               v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
-{
-    int i;
-    for( i = 0; i < n/2; i++ )
-    {
-        b0.s[i*2] = a0.s[i];
-        b0.s[i*2+1] = a1.s[i];
-    }
-    for( ; i < n; i++ )
-    {
-        b1.s[i*2-n] = a0.s[i];
-        b1.s[i*2-n+1] = a1.s[i];
-    }
-}
-
-template<typename _Tp>
-inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr)
-{
-    return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
-}
-
-template<typename _Tp>
-inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr)
-{
-    return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
-}
-
-template<typename _Tp>
-inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr)
-{
-    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
-    for( int i = 0; i < c.nlanes/2; i++ )
-    {
-        c.s[i] = ptr[i];
-    }
-    return c;
-}
-
-template<typename _Tp>
-inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
-{
-    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
-    for( int i = 0; i < c.nlanes/2; i++ )
-    {
-        c.s[i] = loptr[i];
-        c.s[i+c.nlanes/2] = hiptr[i];
-    }
-    return c;
-}
-
-template<typename _Tp>
-inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_TypeTraits<_Tp>::nlanes128 / 2>
-v_load_expand(const _Tp* ptr)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    v_reg<w_type, V_TypeTraits<w_type>::nlanes128> c;
-    for( int i = 0; i < c.nlanes; i++ )
-    {
-        c.s[i] = ptr[i];
-    }
-    return c;
-}
-
-template<typename _Tp>
-inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_TypeTraits<_Tp>::nlanes128 / 4>
-v_load_expand_q(const _Tp* ptr)
-{
-    typedef typename V_TypeTraits<_Tp>::q_type q_type;
-    v_reg<q_type, V_TypeTraits<q_type>::nlanes128> c;
-    for( int i = 0; i < c.nlanes; i++ )
-    {
-        c.s[i] = ptr[i];
-    }
-    return c;
-}
-
-template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
-                                                            v_reg<_Tp, n>& b)
-{
-    int i, i2;
-    for( i = i2 = 0; i < n; i++, i2 += 2 )
-    {
-        a.s[i] = ptr[i2];
-        b.s[i] = ptr[i2+1];
-    }
-}
-
-template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
-                                                            v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
-{
-    int i, i3;
-    for( i = i3 = 0; i < n; i++, i3 += 3 )
-    {
-        a.s[i] = ptr[i3];
-        b.s[i] = ptr[i3+1];
-        c.s[i] = ptr[i3+2];
-    }
-}
-
-template<typename _Tp, int n>
-inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
-                                v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
-                                v_reg<_Tp, n>& d)
-{
-    int i, i4;
-    for( i = i4 = 0; i < n; i++, i4 += 4 )
-    {
-        a.s[i] = ptr[i4];
-        b.s[i] = ptr[i4+1];
-        c.s[i] = ptr[i4+2];
-        d.s[i] = ptr[i4+3];
-    }
-}
-
-template<typename _Tp, int n>
-inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
-                               const v_reg<_Tp, n>& b,
-                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
-{
-    int i, i2;
-    for( i = i2 = 0; i < n; i++, i2 += 2 )
-    {
-        ptr[i2] = a.s[i];
-        ptr[i2+1] = b.s[i];
-    }
-}
-
-template<typename _Tp, int n>
-inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
-                                const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
-                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
-{
-    int i, i3;
-    for( i = i3 = 0; i < n; i++, i3 += 3 )
-    {
-        ptr[i3] = a.s[i];
-        ptr[i3+1] = b.s[i];
-        ptr[i3+2] = c.s[i];
-    }
-}
-
-template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
-                                                            const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
-                                                            const v_reg<_Tp, n>& d,
-                                                            hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
-{
-    int i, i4;
-    for( i = i4 = 0; i < n; i++, i4 += 4 )
-    {
-        ptr[i4] = a.s[i];
-        ptr[i4+1] = b.s[i];
-        ptr[i4+2] = c.s[i];
-        ptr[i4+3] = d.s[i];
-    }
-}
-
-template<typename _Tp, int n>
-inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
-{
-    for( int i = 0; i < n; i++ )
-        ptr[i] = a.s[i];
-}
-
-template<typename _Tp, int n>
-inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
-{
-    for( int i = 0; i < (n/2); i++ )
-        ptr[i] = a.s[i];
-}
-
-template<typename _Tp, int n>
-inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
-{
-    for( int i = 0; i < (n/2); i++ )
-        ptr[i] = a.s[i+(n/2)];
-}
-
-template<typename _Tp, int n>
-inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
-{
-    for( int i = 0; i < n; i++ )
-        ptr[i] = a.s[i];
-}
-
-template<typename _Tp, int n>
-inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
-{
-    for( int i = 0; i < n; i++ )
-        ptr[i] = a.s[i];
-}
-
-template<typename _Tp, int n>
-inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
-{
-    for( int i = 0; i < n; i++ )
-        ptr[i] = a.s[i];
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < (n/2); i++ )
-    {
-        c.s[i] = a.s[i];
-        c.s[i+(n/2)] = b.s[i];
-    }
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < (n/2); i++ )
-    {
-        c.s[i] = a.s[i+(n/2)];
-        c.s[i+(n/2)] = b.s[i+(n/2)];
-    }
-    return c;
-}
-
-template<typename _Tp, int n>
-inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                        v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
-{
-    for( int i = 0; i < (n/2); i++ )
-    {
-        low.s[i] = a.s[i];
-        low.s[i+(n/2)] = b.s[i];
-        high.s[i] = a.s[i+(n/2)];
-        high.s[i+(n/2)] = b.s[i+(n/2)];
-    }
-}
-
-template<int s, typename _Tp, int n>
-inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    v_reg<_Tp, n> r;
-    const int shift = n - s;
-    int i = 0;
-    for (; i < shift; ++i)
-        r.s[i] = a.s[i+s];
-    for (; i < n; ++i)
-        r.s[i] = b.s[i-shift];
-    return r;
-}
-
-template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
-{
-    v_reg<int, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = cvRound(a.s[i]);
-    return c;
-}
-
-template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a, const v_reg<double, n>& b)
-{
-    v_reg<int, n*2> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = cvRound(a.s[i]);
-        c.s[i+n] = cvRound(b.s[i]);
-    }
-    return c;
-}
-
-template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
-{
-    v_reg<int, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = cvFloor(a.s[i]);
-    return c;
-}
-
-template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
-{
-    v_reg<int, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = cvCeil(a.s[i]);
-    return c;
-}
-
-template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
-{
-    v_reg<int, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = (int)(a.s[i]);
-    return c;
-}
-
-template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
-{
-    v_reg<int, n*2> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = cvRound(a.s[i]);
-        c.s[i+n] = 0;
-    }
-    return c;
-}
-
-template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
-{
-    v_reg<int, n*2> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = cvFloor(a.s[i]);
-        c.s[i+n] = 0;
-    }
-    return c;
-}
-
-template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
-{
-    v_reg<int, n*2> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = cvCeil(a.s[i]);
-        c.s[i+n] = 0;
-    }
-    return c;
-}
-
-template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
-{
-    v_reg<int, n*2> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = (int)(a.s[i]);
-        c.s[i+n] = 0;
-    }
-    return c;
-}
-
-template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
-{
-    v_reg<float, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = (float)a.s[i];
-    return c;
-}
-
-template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a)
-{
-    v_reg<float, n*2> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = (float)a.s[i];
-        c.s[i+n] = 0;
-    }
-    return c;
-}
-
-template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
-{
-    v_reg<float, n*2> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = (float)a.s[i];
-        c.s[i+n] = (float)b.s[i];
-    }
-    return c;
-}
-
-inline v_float64x2 v_cvt_f64(const v_int32x4& a)
-{
-    v_float64x2 c;
-    for( int i = 0; i < 2; i++ )
-        c.s[i] = (double)a.s[i];
-    return c;
-}
-
-inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
-{
-    v_float64x2 c;
-    for( int i = 0; i < 2; i++ )
-        c.s[i] = (double)a.s[i+2];
-    return c;
-}
-
-inline v_float64x2 v_cvt_f64(const v_float32x4& a)
-{
-    v_float64x2 c;
-    for( int i = 0; i < 2; i++ )
-        c.s[i] = (double)a.s[i];
-    return c;
-}
-
-inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
-{
-    v_float64x2 c;
-    for( int i = 0; i < 2; i++ )
-        c.s[i] = (double)a.s[i+2];
-    return c;
-}
-
-inline v_float64x2 v_cvt_f64(const v_int64x2& a)
-{
-    v_float64x2 c;
-    for( int i = 0; i < 2; i++ )
-        c.s[i] = (double)a.s[i];
-    return c;
-}
-
-template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx)
-{
-    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
-    for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
-        c.s[i] = tab[idx[i]];
-    return c;
-}
-template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_pairs(const _Tp* tab, const int* idx)
-{
-    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
-    for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
-        c.s[i] = tab[idx[i / 2] + i % 2];
-    return c;
-}
-template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_quads(const _Tp* tab, const int* idx)
-{
-    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
-    for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
-        c.s[i] = tab[idx[i / 4] + i % 4];
-    return c;
-}
-
-template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
-{
-    v_reg<int, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = tab[idx.s[i]];
-    return c;
-}
-
-template<int n> inline v_reg<unsigned, n> v_lut(const unsigned* tab, const v_reg<int, n>& idx)
-{
-    v_reg<int, n> c;
-    for (int i = 0; i < n; i++)
-        c.s[i] = tab[idx.s[i]];
-    return c;
-}
-
-template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
-{
-    v_reg<float, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = tab[idx.s[i]];
-    return c;
-}
-
-template<int n> inline v_reg<double, n> v_lut(const double* tab, const v_reg<int, n*2>& idx)
-{
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = tab[idx.s[i]];
-    return c;
-}
-
-template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
-                                               v_reg<float, n>& x, v_reg<float, n>& y)
-{
-    for( int i = 0; i < n; i++ )
-    {
-        int j = idx.s[i];
-        x.s[i] = tab[j];
-        y.s[i] = tab[j+1];
-    }
-}
-
-template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
-                                               v_reg<double, n>& x, v_reg<double, n>& y)
-{
-    for( int i = 0; i < n; i++ )
-    {
-        int j = idx.s[i];
-        x.s[i] = tab[j];
-        y.s[i] = tab[j+1];
-    }
-}
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec)
-{
-    v_reg<_Tp, n> c;
-    for (int i = 0; i < n/4; i++)
-    {
-        c.s[4*i  ] = vec.s[4*i  ];
-        c.s[4*i+1] = vec.s[4*i+2];
-        c.s[4*i+2] = vec.s[4*i+1];
-        c.s[4*i+3] = vec.s[4*i+3];
-    }
-    return c;
-}
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec)
-{
-    v_reg<_Tp, n> c;
-    for (int i = 0; i < n/8; i++)
-    {
-        c.s[8*i  ] = vec.s[8*i  ];
-        c.s[8*i+1] = vec.s[8*i+4];
-        c.s[8*i+2] = vec.s[8*i+1];
-        c.s[8*i+3] = vec.s[8*i+5];
-        c.s[8*i+4] = vec.s[8*i+2];
-        c.s[8*i+5] = vec.s[8*i+6];
-        c.s[8*i+6] = vec.s[8*i+3];
-        c.s[8*i+7] = vec.s[8*i+7];
-    }
-    return c;
-}
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
-{
-    v_reg<_Tp, n> c;
-    for (int i = 0; i < n/4; i++)
-    {
-        c.s[3*i  ] = vec.s[4*i  ];
-        c.s[3*i+1] = vec.s[4*i+1];
-        c.s[3*i+2] = vec.s[4*i+2];
-    }
-    return c;
-}
-
-template<typename _Tp>
-inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
-                            const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
-                            v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
-                            v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
-{
-    b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
-    b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
-    b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
-    b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
-}
-
-#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \
-inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); }
-
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64)
-
-#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \
-inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
-
-OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64)
-
-#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
-template<typename _Tp0, int n0> inline _Tpvec \
-    v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
-{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }
-
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64)
-
-#define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
-template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
-{ return a << n; }
-
-OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short)
-OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int)
-OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64)
-OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64)
-
-#define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
-template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
-{ return a >> n; }
-
-OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short)
-OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int)
-OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64)
-OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64)
-
-#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
-template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
-{ \
-    _Tpvec c; \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-        c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64)
-
-#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix, cast) \
-inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
-{ \
-    _Tpnvec c; \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-    { \
-        c.s[i] = cast<_Tpn>(a.s[i]); \
-        c.s[i+_Tpvec::nlanes] = cast<_Tpn>(b.s[i]); \
-    } \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u, saturate_cast)
-
-#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
-template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
-{ \
-    _Tpnvec c; \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-    { \
-        c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
-        c.s[i+_Tpvec::nlanes] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
-    } \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
-
-#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
-inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
-{ \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-        ptr[i] = cast<_Tpn>(a.s[i]); \
-}
-
-OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
-
-#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
-template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
-{ \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-        ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
-}
-
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
-
-template<typename _Tpm, typename _Tp, int n>
-inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    for (int i = 0; i < n; ++i)
-    {
-        mptr[i] = (_Tpm)a.s[i];
-        mptr[i + n] = (_Tpm)b.s[i];
-    }
-}
-
-inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
-{
-    v_uint8x16 mask;
-    _pack_b(mask.s, a, b);
-    return mask;
-}
-
-inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
-                           const v_uint32x4& c, const v_uint32x4& d)
-{
-    v_uint8x16 mask;
-    _pack_b(mask.s, a, b);
-    _pack_b(mask.s + 8, c, d);
-    return mask;
-}
-
-inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
-                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
-                           const v_uint64x2& g, const v_uint64x2& h)
-{
-    v_uint8x16 mask;
-    _pack_b(mask.s, a, b);
-    _pack_b(mask.s + 4, c, d);
-    _pack_b(mask.s + 8, e, f);
-    _pack_b(mask.s + 12, g, h);
-    return mask;
-}
-
-inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
-                            const v_float32x4& m1, const v_float32x4& m2,
-                            const v_float32x4& m3)
-{
-    return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
-                       v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
-                       v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
-                       v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
-}
-
-inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
-                               const v_float32x4& m1, const v_float32x4& m2,
-                               const v_float32x4& m3)
-{
-    return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + m3.s[0],
-                       v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + m3.s[1],
-                       v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + m3.s[2],
-                       v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
-}
-
-inline v_reg<float, V_TypeTraits<float>::nlanes128>
-v_load_expand(const float16_t* ptr)
-{
-    v_reg<float, V_TypeTraits<float>::nlanes128> v;
-    for( int i = 0; i < v.nlanes; i++ )
-    {
-        v.s[i] = ptr[i];
-    }
-    return v;
-}
-
-inline void
-v_pack_store(float16_t* ptr, const v_reg<float, V_TypeTraits<float>::nlanes128>& v)
-{
-    for( int i = 0; i < v.nlanes; i++ )
-    {
-        ptr[i] = float16_t(v.s[i]);
-    }
-}
-
-inline void v_cleanup() {}
-}  // namespace fallback
+}  // namespace
 
 static v128_t wasm_unpacklo_i8x16(v128_t a, v128_t b) {
     return wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23);
@@ -2554,14 +1176,6 @@ inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
     return _Tpvec(intrin(a.val, b.val)); \
 }
 
-#define OPENCV_HAL_IMPL_WASM_BIN_FUNC_FALLBACK(_Tpvec, func, intrin) \
-inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
-{ \
-    fallback::_Tpvec a_(a); \
-    fallback::_Tpvec b_(b); \
-    return _Tpvec(fallback::func(a_, b_)); \
-}
-
 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_min, wasm_f32x4_min)
 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_max, wasm_f32x4_max)
 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_min, wasm_f64x2_min)
@@ -2654,8 +1268,24 @@ OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_sub_wrap, wasm_i16x8_sub)
 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_sub_wrap, wasm_i16x8_sub)
 #if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) >= (2000000)
 // details: https://github.com/opencv/opencv/issues/18097 ( https://github.com/emscripten-core/emscripten/issues/12018 )
-OPENCV_HAL_IMPL_WASM_BIN_FUNC_FALLBACK(v_uint8x16, v_mul_wrap, wasm_i8x16_mul)
-OPENCV_HAL_IMPL_WASM_BIN_FUNC_FALLBACK(v_int8x16, v_mul_wrap, wasm_i8x16_mul)
+inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
+{
+    uchar a_[16], b_[16];
+    wasm_v128_store(a_, a.val);
+    wasm_v128_store(b_, b.val);
+    for (int i = 0; i < 16; i++)
+        a_[i] = (uchar)(a_[i] * b_[i]);
+    return wasm_v128_load(a_);
+}
+inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
+{
+    schar a_[16], b_[16];
+    wasm_v128_store(a_, a.val);
+    wasm_v128_store(b_, b.val);
+    for (int i = 0; i < 16; i++)
+        a_[i] = (schar)(a_[i] * b_[i]);
+    return wasm_v128_load(a_);
+}
 #else
 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_mul_wrap, wasm_i8x16_mul)
 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_mul_wrap, wasm_i8x16_mul)
@@ -2919,13 +1549,17 @@ inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
 } \
 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
 { \
-    fallback::_Tpvec a_(a); \
-    fallback::v_store_low(ptr, a_); \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
+        ptr[i] = a_[i]; \
 } \
 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
 { \
-    fallback::_Tpvec a_(a); \
-    fallback::v_store_high(ptr, a_); \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
+        ptr[i] = a_[i + (_Tpvec::nlanes / 2)]; \
 }
 
 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint8x16, uchar)
@@ -2991,8 +1625,12 @@ OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_float32x4, float, v128_t, f32x4, f32x4)
 #define OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(_Tpvec, scalartype) \
 inline scalartype v_reduce_sum(const _Tpvec& a) \
 { \
-    fallback::_Tpvec a_(a); \
-    return fallback::v_reduce_sum(a_); \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    scalartype c = a_[0]; \
+    for (int i = 1; i < _Tpvec::nlanes; i++) \
+        c += a_[i]; \
+    return c; \
 }
 
 OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint8x16, unsigned)
@@ -3116,8 +1754,11 @@ inline v_uint32x4 v_popcount(const v_uint32x4& a)
 }
 inline v_uint64x2 v_popcount(const v_uint64x2& a)
 {
-    fallback::v_uint64x2 a_(a);
-    return fallback::v_popcount(a_);
+    uint64 a_[2], b_[2] = { 0 };
+    wasm_v128_store(a_, a.val);
+    for (int i = 0; i < 16; i++)
+        b_[i / 8] += popCountTable[((uint8*)a_)[i]];
+    return wasm_v128_load(b_);
 }
 inline v_uint8x16 v_popcount(const v_int8x16& a)
 { return v_popcount(v_reinterpret_as_u8(a)); }
@@ -3131,8 +1772,12 @@ inline v_uint64x2 v_popcount(const v_int64x2& a)
 #define OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(_Tpvec, suffix, scalarType) \
 inline int v_signmask(const _Tpvec& a) \
 { \
-    fallback::_Tpvec a_(a); \
-    return fallback::v_signmask(a_); \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    int mask = 0; \
+    for (int i = 0; i < _Tpvec::nlanes; i++) \
+        mask |= (reinterpret_int(a_[i]) < 0) << i; \
+    return mask; \
 } \
 inline bool v_check_all(const _Tpvec& a) \
 { return wasm_i8x16_all_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); } \
@@ -3287,22 +1932,35 @@ inline v_int32x4 v_ceil(const v_float32x4& a)
 inline v_int32x4 v_trunc(const v_float32x4& a)
 { return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(a.val)); }
 
-#define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc, _Tpvec, _Tpnvec, _Tp, _Tpn) \
-inline _Tpnvec func(const _Tpvec& a) \
+#define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc) \
+inline v_int32x4 func(const v_float64x2& a) \
 { \
-    fallback::_Tpvec a_(a); \
-    return fallback::func(a_); \
+    double a_[2]; \
+    wasm_v128_store(a_, a.val); \
+    int c_[4]; \
+    c_[0] = cfunc(a_[i]); \
+    c_[1] = cfunc(a_[i]); \
+    c_[2] = 0; \
+    c_[3] = 0; \
+    return wasm_v128_load(c_); \
 }
 
-OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_round, cvRound, v_float64x2, v_int32x4, double, int)
-OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_floor, cvFloor, v_float64x2, v_int32x4, double, int)
-OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_ceil, cvCeil, v_float64x2, v_int32x4, double, int)
-OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_trunc, int, v_float64x2, v_int32x4, double, int)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_round, cvRound)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_floor, cvFloor)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_ceil, cvCeil)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_trunc, int)
 
 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
 {
-    fallback::v_float64x2 a_(a), b_(b);
-    return fallback::v_round(a_, b_);
+    double a_[2], b_[2];
+    wasm_v128_store(a_, a.val);
+    wasm_v128_store(b_, b.val);
+    int c_[4];
+    c_[0] = cvRound(a_[0]);
+    c_[1] = cvRound(a_[1]);
+    c_[2] = cvRound(b_[0]);
+    c_[3] = cvRound(b_[1]);
+    return wasm_v128_load(c_);
 }
 
 #define OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(_Tpvec, suffix) \
@@ -3796,14 +2454,27 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a)
 
 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
 {
-    fallback::v_float64x2 a_(a);
-    return fallback::v_cvt_f32(a_);
+    double a_[2];
+    wasm_v128_store(a_, a.val);
+    float c_[4];
+    c_[0] = (float)(a_[0]);
+    c_[1] = (float)(a_[1]);
+    c_[2] = 0;
+    c_[3] = 0;
+    return wasm_v128_load(c_);
 }
 
 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
 {
-    fallback::v_float64x2 a_(a), b_(b);
-    return fallback::v_cvt_f32(a_, b_);
+    double a_[2], b_[2];
+    wasm_v128_store(a_, a.val);
+    wasm_v128_store(b_, b.val);
+    float c_[4];
+    c_[0] = (float)(a_[0]);
+    c_[1] = (float)(a_[1]);
+    c_[2] = (float)(b_[0]);
+    c_[3] = (float)(b_[1]);
+    return wasm_v128_load(c_);
 }
 
 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
@@ -3812,8 +2483,12 @@ inline v_float64x2 v_cvt_f64(const v_int32x4& a)
     v128_t p = v128_cvti32x4_i64x2(a.val);
     return v_float64x2(wasm_f64x2_convert_i64x2(p));
 #else
-    fallback::v_int32x4 a_(a);
-    return fallback::v_cvt_f64(a_);
+    int a_[4];
+    wasm_v128_store(a_, a.val);
+    double c_[2];
+    c_[0] = (double)(a_[0]);
+    c_[1] = (double)(a_[1]);
+    return wasm_v128_load(c_);
 #endif
 }
 
@@ -3823,21 +2498,33 @@ inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
     v128_t p = v128_cvti32x4_i64x2_high(a.val);
     return v_float64x2(wasm_f64x2_convert_i64x2(p));
 #else
-    fallback::v_int32x4 a_(a);
-    return fallback::v_cvt_f64_high(a_);
+    int a_[4];
+    wasm_v128_store(a_, a.val);
+    double c_[2];
+    c_[0] = (double)(a_[2]);
+    c_[1] = (double)(a_[3]);
+    return wasm_v128_load(c_);
 #endif
 }
 
 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
 {
-    fallback::v_float32x4 a_(a);
-    return fallback::v_cvt_f64(a_);
+    float a_[4];
+    wasm_v128_store(a_, a.val);
+    double c_[2];
+    c_[0] = (double)(a_[0]);
+    c_[1] = (double)(a_[1]);
+    return wasm_v128_load(c_);
 }
 
 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
 {
-    fallback::v_float32x4 a_(a);
-    return fallback::v_cvt_f64_high(a_);
+    float a_[4];
+    wasm_v128_store(a_, a.val);
+    double c_[2];
+    c_[0] = (double)(a_[2]);
+    c_[1] = (double)(a_[3]);
+    return wasm_v128_load(c_);
 }
 
 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
@@ -3845,8 +2532,12 @@ inline v_float64x2 v_cvt_f64(const v_int64x2& a)
 #ifdef __wasm_unimplemented_simd128__
     return v_float64x2(wasm_f64x2_convert_i64x2(a.val));
 #else
-    fallback::v_int64x2 a_(a);
-    return fallback::v_cvt_f64(a_);
+    int64 a_[2];
+    wasm_v128_store(a_, a.val);
+    double c_[2];
+    c_[0] = (double)(a_[0]);
+    c_[1] = (double)(a_[1]);
+    return wasm_v128_load(c_);
 #endif
 }
 
@@ -4063,13 +2754,20 @@ inline v_float32x4 v_broadcast_element(const v_float32x4& a)
 
 inline v_float32x4 v_load_expand(const float16_t* ptr)
 {
-    return fallback::v_load_expand(ptr);
+    float a[4];
+    for (int i = 0; i < 4; i++)
+        a[i] = ptr[i];
+    return wasm_v128_load(a);
 }
 
 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
 {
-    fallback::v_float32x4 v_(v);
-    fallback::v_pack_store(ptr, v_);
+    double v_[4];
+    wasm_v128_store(v_, v.val);
+    ptr[0] = float16_t(v_[0]);
+    ptr[1] = float16_t(v_[1]);
+    ptr[2] = float16_t(v_[2]);
+    ptr[3] = float16_t(v_[3]);
 }
 
 inline void v_cleanup() {}

From 9fb48d5f44c48103b26d3fe1860c97c612883c05 Mon Sep 17 00:00:00 2001
From: LaurentBerger <laurent.berger@univ-lemans.fr>
Date: Sun, 3 Jan 2021 10:31:58 +0100
Subject: [PATCH 08/11] update forum link

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 66c497e4a01d..7f423a4e21e7 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,8 @@
 
 * Homepage: <http://opencv.org>
 * Docs: <http://docs.opencv.org/3.4/>
-* Q&A forum: <http://answers.opencv.org>
+* Q&A forum: <https://forum.opencv.org>
+  * previous forum (read only): <http://answers.opencv.org>
 * Issue tracking: <https://github.com/opencv/opencv/issues>
 
 ### Contributing

From f98e567f49518878481a3ca7d98bf35aa14fcc41 Mon Sep 17 00:00:00 2001
From: Rachel A <aldridge.r.a@gmail.com>
Date: Thu, 7 Jan 2021 16:42:13 -0800
Subject: [PATCH 09/11] Solution to documentation issue
 https://github.com/opencv/opencv/issues/18836

---
 modules/imgproc/include/opencv2/imgproc.hpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 2739d28ff9fa..601c2e46879f 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -587,7 +587,7 @@ enum ColorConversionCodes {
     COLOR_YCrCb2BGR    = 38,
     COLOR_YCrCb2RGB    = 39,
 
-    COLOR_BGR2HSV      = 40, //!< convert RGB/BGR to HSV (hue saturation value), @ref color_convert_rgb_hsv "color conversions"
+    COLOR_BGR2HSV      = 40, //!< convert RGB/BGR to HSV (hue saturation value) with H range 0..180 if 8 bit image, @ref color_convert_rgb_hsv "color conversions"
     COLOR_RGB2HSV      = 41,
 
     COLOR_BGR2Lab      = 44, //!< convert RGB/BGR to CIE Lab, @ref color_convert_rgb_lab "color conversions"
@@ -595,27 +595,27 @@ enum ColorConversionCodes {
 
     COLOR_BGR2Luv      = 50, //!< convert RGB/BGR to CIE Luv, @ref color_convert_rgb_luv "color conversions"
     COLOR_RGB2Luv      = 51,
-    COLOR_BGR2HLS      = 52, //!< convert RGB/BGR to HLS (hue lightness saturation), @ref color_convert_rgb_hls "color conversions"
+    COLOR_BGR2HLS      = 52, //!< convert RGB/BGR to HLS (hue lightness saturation) with H range 0..180 if 8 bit image, @ref color_convert_rgb_hls "color conversions"
     COLOR_RGB2HLS      = 53,
 
-    COLOR_HSV2BGR      = 54, //!< backward conversions to RGB/BGR
+    COLOR_HSV2BGR      = 54, //!< backward conversions HSV to RGB/BGR with H range 0..180 if 8 bit image
     COLOR_HSV2RGB      = 55,
 
     COLOR_Lab2BGR      = 56,
     COLOR_Lab2RGB      = 57,
     COLOR_Luv2BGR      = 58,
     COLOR_Luv2RGB      = 59,
-    COLOR_HLS2BGR      = 60,
+    COLOR_HLS2BGR      = 60, //!< backward conversions HLS to RGB/BGR with H range 0..180 if 8 bit image
     COLOR_HLS2RGB      = 61,
 
-    COLOR_BGR2HSV_FULL = 66,
+    COLOR_BGR2HSV_FULL = 66, //!< convert RGB/BGR to HSV (hue saturation value) with H range 0..255 if 8 bit image, @ref color_convert_rgb_hsv "color conversions"
     COLOR_RGB2HSV_FULL = 67,
-    COLOR_BGR2HLS_FULL = 68,
+    COLOR_BGR2HLS_FULL = 68, //!< convert RGB/BGR to HLS (hue lightness saturation) with H range 0..255 if 8 bit image, @ref color_convert_rgb_hls "color conversions"
     COLOR_RGB2HLS_FULL = 69,
 
-    COLOR_HSV2BGR_FULL = 70,
+    COLOR_HSV2BGR_FULL = 70, //!< backward conversions HSV to RGB/BGR with H range 0..255 if 8 bit image
     COLOR_HSV2RGB_FULL = 71,
-    COLOR_HLS2BGR_FULL = 72,
+    COLOR_HLS2BGR_FULL = 72, //!< backward conversions HLS to RGB/BGR with H range 0..255 if 8 bit image
     COLOR_HLS2RGB_FULL = 73,
 
     COLOR_LBGR2Lab     = 74,

From 380009ac587de365623f07100af558c2e0f2b080 Mon Sep 17 00:00:00 2001
From: Rachel A <aldridge.r.a@gmail.com>
Date: Fri, 8 Jan 2021 12:14:35 -0800
Subject: [PATCH 10/11] Template Matching Tutorial issue fix
 https://github.com/opencv/opencv/issues/17739

---
 .../js_assets/js_template_matching_matchTemplate.html          | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/js_tutorials/js_assets/js_template_matching_matchTemplate.html b/doc/js_tutorials/js_assets/js_template_matching_matchTemplate.html
index ad2bb54c4887..b9f6871ec093 100644
--- a/doc/js_tutorials/js_assets/js_template_matching_matchTemplate.html
+++ b/doc/js_tutorials/js_assets/js_template_matching_matchTemplate.html
@@ -74,7 +74,8 @@ <h2>Template Match Example</h2>
 utils.loadCode('codeSnippet', 'codeEditor');
 utils.loadImageToCanvas('lena.jpg', 'imageCanvasInput');
 utils.loadImageToCanvas('lenaFace.png', 'templateCanvasInput');
-utils.addFileInputHandler('fileInput', 'canvasInput');
+utils.addFileInputHandler('fileInput', 'imageCanvasInput');
+utils.addFileInputHandler('templateFileInput', 'templateCanvasInput');
 
 let tryIt = document.getElementById('tryIt');
 tryIt.addEventListener('click', () => {

From e3856dfe5df6331a6d571890fe2872c7a04c0370 Mon Sep 17 00:00:00 2001
From: Alexander Reynolds <ar@reynoldsalexander.com>
Date: Sat, 9 Jan 2021 01:44:54 -0800
Subject: [PATCH 11/11] hotfix: call isOpened() in python bg sub tutorial

https://github.com/opencv/opencv/issues/19299
---
 .../python/tutorial_code/video/background_subtraction/bg_sub.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/python/tutorial_code/video/background_subtraction/bg_sub.py b/samples/python/tutorial_code/video/background_subtraction/bg_sub.py
index 15330fc8b00f..1bf3d2fdd8dc 100644
--- a/samples/python/tutorial_code/video/background_subtraction/bg_sub.py
+++ b/samples/python/tutorial_code/video/background_subtraction/bg_sub.py
@@ -18,7 +18,7 @@
 
 ## [capture]
 capture = cv.VideoCapture(cv.samples.findFileOrKeep(args.input))
-if not capture.isOpened:
+if not capture.isOpened():
     print('Unable to open: ' + args.input)
     exit(0)
 ## [capture]