Skip to content

Commit c96ead0

Browse files
committed
[Executorch] Introduce caching cpu memory allocator
Pull Request resolved: #15611 Meant to use this for temp allocator for kernels. Specifically for sdpa, it seems that on iOS there is a significant overhead coming from allocations ghstack-source-id: 325371834 @exported-using-ghexport Differential Revision: [D85532079](https://our.internmc.facebook.com/intern/diff/D85532079/)
1 parent 8b54288 commit c96ead0

File tree

8 files changed

+630
-0
lines changed

8 files changed

+630
-0
lines changed
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
# Please keep this file formatted by running:
8+
# ~~~
9+
# cmake-format -i CMakeLists.txt
10+
# ~~~
11+
12+
cmake_minimum_required(VERSION 3.19)
13+
14+
# Source root directory for executorch.
15+
if(NOT EXECUTORCH_ROOT)
16+
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
17+
endif()
18+
19+
list(TRANSFORM _extension_memory_allocator__srcs PREPEND "${EXECUTORCH_ROOT}/")
20+
if(CMAKE_TOOLCHAIN_IOS
21+
OR CMAKE_TOOLCHAIN_ANDROID
22+
OR APPLE
23+
)
24+
# Building a shared library on iOS requires code signing. On Android we see
25+
# duplicated registration when using shared lib
26+
add_library(
27+
extension_memory_allocator STATIC ${_extension_memory_allocator__srcs}
28+
)
29+
else()
30+
add_library(extension_memory_allocator ${_extension_memory_allocator__srcs})
31+
endif()
32+
target_link_libraries(extension_memory_allocator PRIVATE executorch_core)
33+
target_include_directories(
34+
extension_memory_allocator PUBLIC ${_common_include_directories}
35+
)
36+
target_compile_options(
37+
extension_memory_allocator
38+
PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
39+
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations -fPIC>
40+
)
41+
42+
# Install libraries
43+
install(
44+
TARGETS extension_memory_allocator
45+
EXPORT ExecuTorchTargets
46+
DESTINATION ${CMAKE_INSTALL_LIBDIR}
47+
INCLUDES
48+
DESTINATION ${_common_include_directories}
49+
)
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
#include <algorithm>
#include <cstdint>
#include <cstdlib>

#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
4+
5+
namespace executorch::extension {
6+
7+
namespace {
8+
// Returns `size` rounded up to a multiple of the effective alignment, where
// the effective alignment is the larger of `alignment` and
// kCachingAllocatorDefaultAlignment. A size that is already a multiple is
// returned unchanged. The rounding is required because aligned_alloc expects
// the size to be a multiple of the alignment. The bit mask below assumes the
// effective alignment is a power of two.
size_t get_alignment_adjusted_size(size_t size, size_t alignment) {
  const size_t effective_alignment =
      std::max(alignment, kCachingAllocatorDefaultAlignment);
  const size_t remainder = size % effective_alignment;
  if (remainder == 0) {
    return size;
  }
  // Step past the next boundary, then clear the low bits to land on it.
  const size_t low_bits_mask = effective_alignment - 1;
  return (size + effective_alignment) & ~low_bits_mask;
}
18+
} // namespace
19+
20+
// Builds an allocator with an empty cache. The MemoryAllocator base is given
// a zero-sized, null buffer because every block handed out by this class is
// obtained from the heap on demand. `max_size` is the byte threshold that
// reset() compares against when deciding whether to keep blocks cached.
CPUCachingAllocator::CPUCachingAllocator(uint32_t max_size)
    : MemoryAllocator(0, nullptr), max_size_(max_size), current_size_(0) {}
25+
26+
void* CPUCachingAllocator::allocate(size_t size, size_t alignment) {
27+
EXECUTORCH_TRACK_ALLOCATION(prof_id(), size);
28+
29+
if (!isPowerOf2(alignment)) {
30+
ET_LOG(Error, "Alignment %zu is not a power of 2", alignment);
31+
return nullptr;
32+
}
33+
size = get_alignment_adjusted_size(size, alignment);
34+
35+
std::lock_guard<std::mutex> guard(mutex_);
36+
const auto& it = available_map_.find(size);
37+
// Two choices here.
38+
// 1. Return cached memory
39+
// 2. Allocate new memory
40+
// 2 can lead to current_size > max_size_
41+
if (it == available_map_.end() || it->second.empty()) {
42+
void* ptr = std::aligned_alloc(alignment, size);
43+
if (ptr == nullptr) {
44+
ET_LOG(Error, "Failed to allocate memory");
45+
return nullptr;
46+
}
47+
current_size_ += size;
48+
allocation_map_[ptr] = size;
49+
return ptr;
50+
}
51+
void* ptr = it->second.back();
52+
it->second.pop_back();
53+
allocation_map_[ptr] = size;
54+
return ptr;
55+
}
56+
57+
// Releases every block this allocator knows about, both cached blocks and
// blocks currently handed out, and resets the byte counter.
//
// mutex_ is intentionally NOT acquired here: reset() calls this function
// while already holding mutex_, and locking again would deadlock. A
// recursive_mutex would also work, but since free_everything() is private the
// locking is left to the callers instead.
void CPUCachingAllocator::free_everything() {
  // Blocks sitting in the cache, grouped by size.
  for (const auto& bucket : available_map_) {
    for (const auto cached_ptr : bucket.second) {
      std::free(cached_ptr);
    }
  }
  available_map_.clear();

  // Blocks that were handed out and not yet returned to the cache.
  for (const auto& live : allocation_map_) {
    void* live_ptr = live.first;
    std::free(live_ptr);
  }
  allocation_map_.clear();

  // Both maps are now empty, so no allocated bytes remain.
  current_size_ = 0;
}
76+
77+
void CPUCachingAllocator::reset() {
78+
std::lock_guard<std::mutex> guard(mutex_);
79+
// We make the default allocations, via allcate to be either
80+
// a. gotten via cached memory OR
81+
// b. allocated via malloced and not yet cached
82+
// So if current_size_ (allocated) is larger than the max_size_
83+
// for now we simply deallocate everything.
84+
if (current_size_ > max_size_) {
85+
free_everything();
86+
} else {
87+
for (auto& it : allocation_map_) {
88+
void* ptr = it.first;
89+
size_t alloc_size = it.second;
90+
// Cache the memory
91+
available_map_[alloc_size].push_back(ptr);
92+
}
93+
allocation_map_.clear();
94+
}
95+
}
96+
97+
// Releases all memory owned by the allocator. The caller is responsible for
// making sure no other thread is still using the allocator when it is
// destroyed.
CPUCachingAllocator::~CPUCachingAllocator() {
  // Inside a destructor the call resolves to this class's reset() anyway;
  // the qualification just makes that explicit.
  CPUCachingAllocator::reset();
  // reset() may only have moved blocks into the cache; free whatever is left.
  free_everything();
}
102+
103+
} // namespace executorch::extension
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#pragma once
2+
3+
#include <cstddef>
4+
#include <mutex>
5+
6+
#include <executorch/runtime/core/memory_allocator.h>
7+
8+
#ifdef USE_C10_SMALL_VECTOR
9+
#include <c10/util/SmallVector.h>
10+
#else
11+
#include <vector>
12+
#endif
13+
14+
#ifdef USE_C10_FLAT_HASH_MAP
15+
#include <c10/util/flat_hash_map.h>
16+
#else
17+
#include <unordered_map>
18+
#endif
19+
20+
/*
21+
* CPUCachingAllocator:
22+
* This file is copied over from c10/mobile/CPUCachingAllocator.h
23+
* It is a thread safe caching allocator.
24+
*/
25+
26+
namespace executorch::extension {
27+
28+
// Container aliases used by CPUCachingAllocator. Defining
// USE_C10_SMALL_VECTOR / USE_C10_FLAT_HASH_MAP selects the c10 containers;
// otherwise the standard-library equivalents are used.

#ifdef USE_C10_SMALL_VECTOR
template <typename T, unsigned N>
using SmallVector = c10::SmallVector<T, N>;
#else
// Fallback: N is accepted so both configurations share one spelling, but
// std::vector has no inline storage and ignores it.
template <typename T, unsigned N>
using SmallVector = std::vector<T>;
#endif

#ifdef USE_C10_FLAT_HASH_MAP
template <typename KeyType, typename ValueType>
using FlatHashMap = ska::flat_hash_map<KeyType, ValueType>;
#else
template <typename KeyType, typename ValueType>
using FlatHashMap = std::unordered_map<KeyType, ValueType>;
#endif

// Minimum alignment, in bytes, used when rounding allocation sizes and the
// default for the `alignment` argument of CPUCachingAllocator::allocate().
// Declared `inline` so that every translation unit including this header
// refers to the same entity instead of getting its own internal-linkage copy.
inline constexpr size_t kCachingAllocatorDefaultAlignment = 64;
45+
class CPUCachingAllocator : public executorch::runtime::MemoryAllocator {
46+
/*
47+
* What it does:
48+
* Caches all the allocations carried out by this allocator.
49+
* Cache key is the size of the allocation.
50+
* If requested size is found in the cache returns the cached pointer.
51+
* What it does not do:
52+
* No speculative allocation for any future allocations.
53+
*/
54+
private:
55+
void free_everything();
56+
57+
protected:
58+
// Invariants.
59+
// New invariants must be written.
60+
FlatHashMap<size_t, SmallVector<void*, 16>> available_map_;
61+
FlatHashMap<void*, size_t> allocation_map_;
62+
// Since allocation_map_ and other member variables are mutated/read via
63+
// all public APIs, we need a mutex to protect concurrent access to these
64+
// instance members.
65+
std::mutex mutex_;
66+
size_t max_size_;
67+
size_t current_size_;
68+
69+
public:
70+
/*
71+
max_size: Maximum size of memory to cache. Never cache more than that.
72+
*/
73+
CPUCachingAllocator(uint32_t max_size);
74+
// Checks the cache to see if allocation of size bytes can be found.
75+
// If so return cached memory, else
76+
// allocates memory, records it for caching and returns.
77+
void* allocate(
78+
size_t size,
79+
size_t alignment = kCachingAllocatorDefaultAlignment) override;
80+
void reset() override;
81+
~CPUCachingAllocator();
82+
};
83+
84+
} // namespace executorch::extension

extension/memory_allocator/targets.bzl

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,20 @@ def define_common_targets():
2020
"@EXECUTORCH_CLIENTS",
2121
],
2222
)
23+
24+
runtime.cxx_library(
25+
name = "cpu_caching_allocator",
26+
srcs = [
27+
"cpu_caching_malloc_allocator.cpp",
28+
],
29+
exported_headers = [
30+
"cpu_caching_malloc_allocator.h",
31+
],
32+
exported_deps = [
33+
"//executorch/runtime/core:memory_allocator",
34+
],
35+
visibility = [
36+
"//executorch/extension/memory_allocator/test/...",
37+
"@EXECUTORCH_CLIENTS",
38+
],
39+
)

0 commit comments

Comments
 (0)