Update base for Update on "[Executorch] Use temp allocator for allocating scratch memory"

kimishpatel · kimishpatel · commit 79bcbc54a6c5 · 2025-11-20T12:49:46.000-08:00
This allows us to leverage the temp memory allocator; if that allocator is a caching allocator, it reduces the allocation overhead. Differential Revision: [D85532076](https://our.internmc.facebook.com/intern/diff/D85532076/) [ghstack-poisoned]
diff --git a/extension/llm/custom_ops/op_sdpa_impl.h b/extension/llm/custom_ops/op_sdpa_impl.h
@@ -775,10 +775,10 @@ void cpu_flash_attention(
   // at::Tensor buf_reduced = at::empty(
   //    {num_thread, qSplitSize, is_reduced_type ? kvSplitSize : 0},
   //    query.options());
-  int64_t size_per_thread_qdq_vec = qSplitSize * kvSplitSize * headSize;
+  int64_t size_per_thread_qdq_vec = kvSplitSize * headSize;
   // Lets align size_per_thread_qdq_vec to 64 bytes, for coalesced cache reads,
   // by padding with right number of per thread elements
-  constexpr int64_t kAlignment = 32;
+  constexpr int64_t kAlignment = 64;
   size_per_thread_qdq_vec =
       (size_per_thread_qdq_vec + kAlignment - 1) & (-(kAlignment - 1));
   int64_t size_per_thread_qdq_bytes = size_per_thread_qdq_vec * sizeof(accum_t);