Merge pull request #3980 from pratham-mcw:opt-arm64-adaptive-manifold-unroll

pratham-mcw · web-flow · commit 06fc7ad743ac · 2025-10-13T11:33:42.000+03:00
ximgproc: optimize Adaptive Manifold function for ARM64 #3980 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - This PR introduces an ARM64-specific performance optimization in AdaptiveManifoldFilterN::h_filter by unrolling the forward and backward recursive passes by a factor of two. - The optimization is guarded with #if CV_ENABLE_UNROLLED && defined(_M_ARM64), so it only affects builds where unrolling is enabled and the compiler defines _M_ARM64 (i.e. MSVC-style Windows ARM64 builds); all other builds keep the original scalar loops. - The optimization does not affect accuracy and maintains the same numerical behavior as the original scalar implementation, because each unrolled iteration performs the same operations in the same order. Performance improvements: - The optimization significantly improves the performance of the Adaptive Manifold filter on Windows ARM64 targets. - The table below shows timing comparisons before and after the optimization: <img width="1098" height="219" alt="image" src="https://github.com/user-attachments/assets/6cab9147-8ba0-4582-bdc6-e1f57989da86" />
diff --git a/modules/ximgproc/src/adaptive_manifold_filter_n.cpp b/modules/ximgproc/src/adaptive_manifold_filter_n.cpp
@@ -520,11 +520,29 @@ void AdaptiveManifoldFilterN::h_filter(const Mat1f& src, Mat& dst, float sigma)
         float* dst_row = dst.ptr<float>(y);
 
         dst_row[0] = src_row[0];
-        for (int x = 1; x < src.cols; ++x)
+        int x = 1;
+    #if CV_ENABLE_UNROLLED && defined(_M_ARM64)
+        for ( ; x + 1 < src.cols; x += 2 )
         {
             dst_row[x] = src_row[x] + a * (dst_row[x - 1] - src_row[x]);
+            dst_row[x + 1] = src_row[x + 1] + a * (dst_row[x] - src_row[x + 1]);
         }
-        for (int x = src.cols - 2; x >= 0; --x)
+    #endif
+        for ( ; x < src.cols; ++x )
+        {
+            dst_row[x] = src_row[x] + a * (dst_row[x - 1] - src_row[x]);
+        }
+
+        x = src.cols - 2;
+
+    #if CV_ENABLE_UNROLLED && defined(_M_ARM64)
+        for ( ; x - 1 >= 0; x -= 2 )
+        {
+            dst_row[x] = dst_row[x] + a * (dst_row[x + 1] - dst_row[x]);
+            dst_row[x - 1] = dst_row[x - 1] + a * (dst_row[x] - dst_row[x - 1]);
+        }
+    #endif
+        for ( ; x >= 0; --x )
         {
             dst_row[x] = dst_row[x] + a * (dst_row[x + 1] - dst_row[x]);
         }