@@ -35,6 +35,7 @@ enum class SeqDim { ONE = 1, TWO };
 
 namespace sdpa::impl {
 
+static std::vector<char> scratch_for_quant_dequant_vec;
 struct MaybeQuantizedMatrixData {
   const void* data{nullptr};
   const int8_t* zero_points{nullptr};
@@ -543,6 +544,7 @@ TODO: Just handle conversion of bool mask to float
  */
 template <typename scalar_t, int64_t q_split_size, int64_t kv_split_size>
 void cpu_flash_attention(
+    RuntimeContext& ctx,
     Tensor& output,
     const Tensor& query,
     const Tensor& key,
@@ -763,18 +765,17 @@ void cpu_flash_attention(
 
   // Since all intermediate compute is accum_t, we need to
   // allocate a buffer accordingly.
-  int64_t size_of_intermediate_precision = sizeof(accum_t);
-  int64_t size_bytes = size_per_thread * num_thread * query.element_size() *
-      size_of_intermediate_precision;
-  std::vector<char> buf_vec(size_bytes);
-  void* buf = reinterpret_cast<void*>(buf_vec.data());
-  // Need to double check the following
-  size_bytes = num_thread * qSplitSize * kvSplitSize * query.element_size();
-  std::vector<char> buf_reduced_vec(size_bytes);
-  void* buf_reduced = reinterpret_cast<void*>(buf_reduced_vec.data());
-  // at::Tensor buf_reduced = at::empty(
-  //     {num_thread, qSplitSize, is_reduced_type ? kvSplitSize : 0},
-  //     query.options());
+  int64_t size_bytes = size_per_thread * num_thread * query.element_size();
+  Result<void*> buff_res = ctx.allocate_temp(size_bytes);
+  std::unique_ptr<char[]> allocated_buf;
+  void* buf;
+  if (!buff_res.ok()) {
+    allocated_buf = std::make_unique<char[]>(size_bytes);
+    buf = reinterpret_cast<void*>(allocated_buf.get());
+  } else {
+    buf = buff_res.get();
+  }
+  void* buf_reduced = nullptr;
   int64_t size_per_thread_qdq_vec = kvSplitSize * headSize;
   // Let's align size_per_thread_qdq_vec to 64 bytes, for coalesced cache
   // reads, by padding with the right number of per-thread elements.
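
The allocate-from-`ctx`-or-fall-back-to-heap sequence above reappears almost verbatim for the quant/dequant scratch in the next hunk. A minimal sketch of how the pattern could be factored out, assuming only the `RuntimeContext::allocate_temp()` / `Result<void*>` API already used in this diff (the helper name `temp_or_heap_alloc` is hypothetical, not part of this PR):

static void* temp_or_heap_alloc(
    RuntimeContext& ctx,
    int64_t size_bytes,
    std::unique_ptr<char[]>& heap_fallback) {
  // Prefer the runtime's temp allocator, which may be backed by a
  // caller-provided arena, avoiding a heap allocation per call.
  Result<void*> res = ctx.allocate_temp(size_bytes);
  if (res.ok()) {
    return res.get();
  }
  // No temp allocator available (or allocation failed): fall back to
  // the heap and let the caller own the storage.
  heap_fallback = std::make_unique<char[]>(size_bytes);
  return heap_fallback.get();
}

With such a helper, the two call sites would each reduce to a single line, e.g. `void* buf = temp_or_heap_alloc(ctx, size_bytes, allocated_buf);`.
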
@@ -783,9 +784,18 @@ void cpu_flash_attention(
       (size_per_thread_qdq_vec + kAlignment - 1) & (-(kAlignment - 1));
   int64_t size_per_thread_qdq_bytes = size_per_thread_qdq_vec * sizeof(accum_t);
   int64_t size_qdq_bytes = size_per_thread_qdq_bytes * num_thread;
-  std::vector<char> scratch_for_quant_dequant_vec(size_qdq_bytes);
-  accum_t* scratch_for_quant_dequant =
-      reinterpret_cast<accum_t*>(scratch_for_quant_dequant_vec.data());
+  std::unique_ptr<char[]> allocated_buf_for_qdq;
+  Result<void*> scratch_for_quant_dequant_res =
+      ctx.allocate_temp(size_qdq_bytes);
+  accum_t* scratch_for_quant_dequant;
+  if (!scratch_for_quant_dequant_res.ok()) {
+    allocated_buf_for_qdq = std::make_unique<char[]>(size_qdq_bytes);
+    scratch_for_quant_dequant =
+        reinterpret_cast<accum_t*>(allocated_buf_for_qdq.get());
+  } else {
+    scratch_for_quant_dequant =
+        reinterpret_cast<accum_t*>(scratch_for_quant_dequant_res.get());
+  }
 
   // Data ptrs
   const scalar_t* q_data = query.const_data_ptr<scalar_t>();
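
A note on the alignment arithmetic kept as context at the top of this hunk: the conventional idiom for rounding up to a power-of-two multiple masks with `~(kAlignment - 1)` (equivalently `-kAlignment`), not `-(kAlignment - 1)`. A standalone sketch of the idiom, using an illustrative alignment of 16 elements (64 bytes of 4-byte `accum_t`); the real definition of `kAlignment` is outside the hunks shown here:

#include <cassert>
#include <cstdint>

constexpr int64_t kAlignment = 16; // illustrative: 64 bytes / sizeof(float)

// Round n up to the next multiple of kAlignment (which must be a
// power of two for the mask trick to work).
constexpr int64_t round_up(int64_t n) {
  return (n + kAlignment - 1) & ~(kAlignment - 1);
}

int main() {
  assert(round_up(1) == 16);  // pads a single element up to one cache line
  assert(round_up(16) == 16); // already aligned, unchanged
  assert(round_up(17) == 32); // spills into the next 64-byte line
  return 0;
}
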