Skip to content

Commit e21670f

Browse files
committed
[Executorch][LLM] Use caching allocator for runner
Pull Request resolved: #15730. We observed that on iOS this improves performance by 6%, because the SDPA op makes temporary allocations; there was no significant difference on Android. ghstack-source-id: 327186823. Exported using ghexport. Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/)
1 parent 4b1913d commit e21670f

File tree

4 files changed

+22
-3
lines changed

4 files changed

+22
-3
lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -932,6 +932,10 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
932932
endif()
933933

934934
if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
935+
add_subdirectory(
936+
${CMAKE_CURRENT_SOURCE_DIR}/extension/memory_allocator
937+
)
938+
list(APPEND _executorch_extensions extension_memory_allocator)
935939
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner)
936940
list(APPEND _executorch_extensions extension_llm_runner)
937941
endif()

extension/llm/runner/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ list(TRANSFORM _extension_llm_runner__srcs PREPEND "${EXECUTORCH_ROOT}/")
3434
add_library(extension_llm_runner STATIC ${_extension_llm_runner__srcs})
3535

3636
set(runner_deps executorch_core extension_module extension_tensor
37-
tokenizers::tokenizers
37+
extension_memory_allocator tokenizers::tokenizers
3838
)
3939

4040
# depend on arange_utils

extension/llm/runner/llm_runner_helper.cpp

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <executorch/extension/llm/runner/text_llm_runner.h>
1818
#include <executorch/extension/llm/runner/text_prefiller.h>
1919
#include <executorch/extension/llm/runner/text_token_generator.h>
20+
#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
2021
#include <executorch/runtime/core/result.h>
2122
#include <executorch/runtime/platform/runtime.h>
2223
#include <pytorch/tokenizers/hf_tokenizer.h>
@@ -210,15 +211,28 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
210211

211212
// Create the Module
212213
std::unique_ptr<Module> module;
214+
uint32_t max_cached_memory_size_bytes_ = 1024 * 1024 * 10; // 10MB
213215
if (data_files.size() > 0) {
214216
module = std::make_unique<Module>(
215217
model_path,
216218
data_files,
217219
Module::LoadMode::File,
218-
std::move(event_tracer));
220+
std::move(event_tracer),
221+
nullptr, // memory allocator
222+
std::make_unique<
223+
executorch::extension::CPUCachingAllocator>( // temp memory
224+
// allocator
225+
max_cached_memory_size_bytes_));
219226
} else {
220227
module = std::make_unique<Module>(
221-
model_path, Module::LoadMode::File, std::move(event_tracer));
228+
model_path,
229+
Module::LoadMode::File,
230+
std::move(event_tracer), // event tracer
231+
nullptr, // memory allocator
232+
std::make_unique<
233+
executorch::extension::CPUCachingAllocator>( // temp memory
234+
// allocator
235+
max_cached_memory_size_bytes_));
222236
}
223237

224238
// Get metadata from Module

extension/llm/runner/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ def define_common_targets():
148148
":text_prefiller" + aten_suffix,
149149
":text_token_generator" + aten_suffix,
150150
"//executorch/extension/llm/runner/io_manager:io_manager" + aten_suffix,
151+
"//executorch/extension/memory_allocator:cpu_caching_allocator",
151152
"//pytorch/tokenizers:hf_tokenizer",
152153
"//pytorch/tokenizers:llama2c_tokenizer",
153154
"//pytorch/tokenizers:sentencepiece",

0 commit comments

Comments (0)