From 72a987ef5a71d24c4737bdad15171a4eb818a4d7 Mon Sep 17 00:00:00 2001
From: Peter Johnson
Date: Sun, 10 Dec 2023 08:59:20 -0800
Subject: [PATCH] Merge pull request #24205 from PeterJohnson:fix-msvc-arm64

ht_dec.c: Improve MSVC arm64 popcount performance #24205

Use NEON instructions for ARM64 (implementation based on https://github.com/microsoft/STL/pull/2127, which is Apache licensed).

Godbolt output here: https://godbolt.org/z/q7GPTqT14

Related patch to openjpeg: https://github.com/uclouvain/openjpeg/pull/1479

### Pull Request Readiness Checklist

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
---
 3rdparty/openjpeg/openjp2/ht_dec.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/3rdparty/openjpeg/openjp2/ht_dec.c b/3rdparty/openjpeg/openjp2/ht_dec.c
index e2f3afd6a3af..85e7266919a3 100644
--- a/3rdparty/openjpeg/openjp2/ht_dec.c
+++ b/3rdparty/openjpeg/openjp2/ht_dec.c
@@ -55,6 +55,10 @@
 #define OPJ_COMPILER_GNUC
 #endif
 
+#if defined(OPJ_COMPILER_MSVC) && defined(_M_ARM64)
+#include <arm64_neon.h>
+#endif
+
 //************************************************************************/
 /** @brief Displays the error message for disabling the decoding of SPP and
   * MRP passes
@@ -71,6 +75,9 @@ OPJ_UINT32 population_count(OPJ_UINT32 val)
 {
 #if defined(OPJ_COMPILER_MSVC) && (defined(_M_IX86) || defined(_M_AMD64))
     return (OPJ_UINT32)__popcnt(val);
+#elif defined(OPJ_COMPILER_MSVC) && defined(_M_ARM64)
+    const __n64 temp = neon_cnt(__uint64ToN64_v(val));
+    return neon_addv8(temp).n8_i8[0];
 #elif (defined OPJ_COMPILER_GNUC)
     return (OPJ_UINT32)__builtin_popcount(val);
 #else