Skip to content

Commit 9574817

Browse files
committed
[Executorch] Introduce caching cpu memory allocator
Pull Request resolved: #15611 Meant to use this for temp allocator for kernels. Specifically for sdpa, it seems that on iOS there is a significant overhead coming from allocations ghstack-source-id: 327518047 @exported-using-ghexport Differential Revision: [D85532079](https://our.internmc.facebook.com/intern/diff/D85532079/)
1 parent 6f4cd3d commit 9574817

File tree

10 files changed

+680
-13
lines changed

10 files changed

+680
-13
lines changed
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
# Please keep this file formatted by running:
8+
# ~~~
9+
# cmake-format -i CMakeLists.txt
10+
# ~~~
11+
12+
cmake_minimum_required(VERSION 3.19)
13+
14+
# Source root directory for executorch. Falls back to two levels above this
# directory unless EXECUTORCH_ROOT has already been set.
15+
if(NOT EXECUTORCH_ROOT)
16+
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
17+
endif()
18+
19+
# Prefix every source path with EXECUTORCH_ROOT.
# NOTE(review): _extension_memory_allocator__srcs is not defined in this file;
# presumably the parent build populates it before this file is processed.
list(TRANSFORM _extension_memory_allocator__srcs PREPEND "${EXECUTORCH_ROOT}/")
20+
if(CMAKE_TOOLCHAIN_IOS
21+
OR CMAKE_TOOLCHAIN_ANDROID
22+
OR APPLE
23+
)
24+
# Building a shared library on iOS requires code signing. On Android we see
25+
# duplicated registration when using a shared lib, so always build STATIC here.
26+
add_library(
27+
extension_memory_allocator STATIC ${_extension_memory_allocator__srcs}
28+
)
29+
else()
# No explicit library type: CMake's BUILD_SHARED_LIBS setting decides.
30+
add_library(extension_memory_allocator ${_extension_memory_allocator__srcs})
31+
endif()
32+
target_link_libraries(extension_memory_allocator PRIVATE executorch_core)
33+
target_include_directories(
34+
extension_memory_allocator PUBLIC ${_common_include_directories}
35+
)
# Suppress deprecation warnings (C4996 on MSVC, -Wno-deprecated-declarations
# elsewhere); non-MSVC compilers additionally get -fPIC.
36+
target_compile_options(
37+
extension_memory_allocator
38+
PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
39+
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations -fPIC>
40+
)
41+
42+
# Install the library and register it in the ExecuTorchTargets export set.
43+
install(
44+
TARGETS extension_memory_allocator
45+
EXPORT ExecuTorchTargets
46+
DESTINATION ${CMAKE_INSTALL_LIBDIR}
47+
INCLUDES
48+
DESTINATION ${_common_include_directories}
49+
)
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
#include <cstdlib>
2+
3+
#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
4+
#include <executorch/extension/memory_allocator/memory_allocator_utils.h>
5+
6+
namespace executorch::extension {
7+
8+
// Constructs an allocator with an empty cache and nothing allocated yet.
// `max_size` is the byte budget that reset() compares against the total
// malloc'ed size when deciding whether to keep blocks cached or free them.
// The base MemoryAllocator is given no buffer (size 0, nullptr); all memory
// handed out by this class comes from std::malloc.
CPUCachingAllocator::CPUCachingAllocator(uint32_t max_size)
    : MemoryAllocator(0, nullptr), max_size_(max_size), current_size_(0) {}
13+
14+
void* CPUCachingAllocator::allocate(size_t size, size_t alignment) {
15+
EXECUTORCH_TRACK_ALLOCATION(prof_id(), size);
16+
17+
if (!isPowerOf2(alignment)) {
18+
ET_LOG(Error, "Alignment %zu is not a power of 2", alignment);
19+
return nullptr;
20+
}
21+
alignment = std::max(alignment, kCachingAllocatorDefaultAlignment);
22+
auto adjusted_size_value =
23+
executorch::extension::utils::get_aligned_size(size, alignment);
24+
if (!adjusted_size_value.ok()) {
25+
return nullptr;
26+
}
27+
size = adjusted_size_value.get();
28+
29+
std::lock_guard<std::mutex> guard(mutex_);
30+
const auto& it = available_map_.find(size);
31+
// Two choices here.
32+
// 1. Return cached memory
33+
// 2. Allocate new memory
34+
// 2 can lead to current_size > max_size_
35+
if (it == available_map_.end() || it->second.empty()) {
36+
void* ptr = std::malloc(size);
37+
if (ptr == nullptr) {
38+
ET_LOG(Error, "Failed to allocate memory");
39+
return nullptr;
40+
}
41+
current_size_ += size;
42+
allocation_map_[ptr] = size;
43+
return alignPointer(ptr, alignment);
44+
}
45+
void* ptr = it->second.back();
46+
it->second.pop_back();
47+
allocation_map_[ptr] = size;
48+
return alignPointer(ptr, alignment);
49+
}
50+
51+
// Releases every block this allocator has malloc'ed, both those currently
// handed out (allocation_map_) and those sitting in the cache
// (available_map_), and resets the bookkeeping to the empty state.
//
// This private helper deliberately does not lock mutex_: reset() calls it
// while already holding the lock, and locking a std::mutex twice from the
// same thread would deadlock.
void CPUCachingAllocator::free_everything() {
  for (const auto& cache_entry : available_map_) {
    for (void* cached_block : cache_entry.second) {
      std::free(cached_block);
    }
  }
  available_map_.clear();

  for (const auto& live_entry : allocation_map_) {
    std::free(live_entry.first);
  }
  allocation_map_.clear();

  // Both maps are now empty, so no malloc'ed memory remains.
  current_size_ = 0;
}
70+
71+
void CPUCachingAllocator::reset() {
72+
std::lock_guard<std::mutex> guard(mutex_);
73+
// We make the default allocations, via allcate to be either
74+
// a. gotten via cached memory OR
75+
// b. allocated via malloced and not yet cached
76+
// So if current_size_ (allocated) is larger than the max_size_
77+
// for now we simply deallocate everything.
78+
if (current_size_ > max_size_) {
79+
free_everything();
80+
} else {
81+
for (auto& it : allocation_map_) {
82+
void* ptr = it.first;
83+
size_t alloc_size = it.second;
84+
// Cache the memory
85+
available_map_[alloc_size].push_back(ptr);
86+
}
87+
allocation_map_.clear();
88+
}
89+
}
90+
91+
// Frees every block owned by this allocator, whether cached or still handed
// out. The caller must ensure no other thread is using the allocator while it
// is being destroyed.
//
// free_everything() already releases both the outstanding and the cached
// blocks, so there is no need to go through the virtual reset() first (which
// would insert every outstanding block into the cache only for it to be freed
// immediately afterwards).
CPUCachingAllocator::~CPUCachingAllocator() {
  std::lock_guard<std::mutex> guard(mutex_);
  free_everything();
}
96+
97+
} // namespace executorch::extension
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#pragma once
2+
3+
#include <cstddef>
4+
#include <mutex>
5+
6+
#include <executorch/runtime/core/memory_allocator.h>
7+
8+
#ifdef USE_C10_SMALL_VECTOR
9+
#include <c10/util/SmallVector.h>
10+
#else
11+
#include <vector>
12+
#endif
13+
14+
#ifdef USE_C10_FLAT_HASH_MAP
15+
#include <c10/util/flat_hash_map.h>
16+
#else
17+
#include <unordered_map>
18+
#endif
19+
20+
/*
21+
* CPUCachingAllocator:
22+
* This file is copied over from c10/mobile/CPUCachingAllocator.h
23+
* It is a thread safe caching allocator.
24+
*/
25+
26+
namespace executorch::extension {
27+
28+
// Container aliases used by the allocator. Each one resolves to the c10
// implementation when the matching USE_C10_* macro is defined and to the
// standard-library container otherwise.

// Growable array of Elem. InlineCapacity is forwarded to c10::SmallVector and
// ignored by the std::vector fallback.
#ifndef USE_C10_SMALL_VECTOR
template <typename Elem, unsigned InlineCapacity>
using SmallVector = std::vector<Elem>;
#else
template <typename Elem, unsigned InlineCapacity>
using SmallVector = c10::SmallVector<Elem, InlineCapacity>;
#endif

// Hash map from Key to Mapped.
#ifndef USE_C10_FLAT_HASH_MAP
template <typename Key, typename Mapped>
using FlatHashMap = std::unordered_map<Key, Mapped>;
#else
template <typename Key, typename Mapped>
using FlatHashMap = ska::flat_hash_map<Key, Mapped>;
#endif
43+
44+
// Default value of the `alignment` argument of CPUCachingAllocator::allocate(),
// in bytes; allocate() also raises any smaller requested alignment to this
// value. Declared `inline` so that every translation unit including this
// header refers to one and the same object.
inline constexpr size_t kCachingAllocatorDefaultAlignment = 64;
45+
class CPUCachingAllocator : public executorch::runtime::MemoryAllocator {
46+
/*
47+
* What it does:
48+
* Caches all the allocations carried out by this allocator.
49+
* Cache key is the size of the allocation.
50+
* If requested size is found in the cache returns the cached pointer.
51+
* What it does not do:
52+
* No speculative allocation for any future allocations.
53+
*/
54+
private:
55+
void free_everything();
56+
57+
protected:
58+
// Invariants.
59+
// New invariants must be written.
60+
FlatHashMap<size_t, SmallVector<void*, 16>> available_map_;
61+
FlatHashMap<void*, size_t> allocation_map_;
62+
// Since allocation_map_ and other member variables are mutated/read via
63+
// all public APIs, we need a mutex to protect concurrent access to these
64+
// instance members.
65+
std::mutex mutex_;
66+
size_t max_size_;
67+
size_t current_size_;
68+
69+
public:
70+
/*
71+
max_size: Maximum size of memory to cache. Never cache more than that.
72+
*/
73+
explicit CPUCachingAllocator(uint32_t max_size);
74+
// No copies allowed
75+
CPUCachingAllocator(const CPUCachingAllocator&) = delete;
76+
CPUCachingAllocator& operator=(const CPUCachingAllocator&) = delete;
77+
// No moves allowed
78+
CPUCachingAllocator(CPUCachingAllocator&&) = delete;
79+
CPUCachingAllocator& operator=(CPUCachingAllocator&&) = delete;
80+
// Checks the cache to see if allocation of size bytes can be found.
81+
// If so return cached memory, else
82+
// allocates memory, records it for caching and returns.
83+
void* allocate(
84+
size_t size,
85+
size_t alignment = kCachingAllocatorDefaultAlignment) override;
86+
void reset() override;
87+
~CPUCachingAllocator();
88+
};
89+
90+
} // namespace executorch::extension

extension/memory_allocator/malloc_memory_allocator.h

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <cstdlib>
1414
#include <vector>
1515

16+
#include <executorch/extension/memory_allocator/memory_allocator_utils.h>
1617
#include <executorch/runtime/core/memory_allocator.h>
1718

1819
namespace executorch {
@@ -51,20 +52,12 @@ class MallocMemoryAllocator : public executorch::runtime::MemoryAllocator {
5152
return nullptr;
5253
}
5354

54-
// The minimum alignment that malloc() is guaranteed to provide.
55-
static constexpr size_t kMallocAlignment = alignof(std::max_align_t);
56-
if (alignment > kMallocAlignment) {
57-
// To get higher alignments, allocate extra and then align the returned
58-
// pointer. This will waste an extra `alignment - 1` bytes every time, but
59-
// this is the only portable way to get aligned memory from the heap.
60-
const size_t extra = alignment - 1;
61-
if ET_UNLIKELY (extra >= SIZE_MAX - size) {
62-
ET_LOG(
63-
Error, "Malloc size overflow: size=%zu + extra=%zu", size, extra);
64-
return nullptr;
65-
}
66-
size += extra;
55+
auto adjusted_size_value =
56+
executorch::extension::utils::get_aligned_size(size, alignment);
57+
if (!adjusted_size_value.ok()) {
58+
return nullptr;
6759
}
60+
size = adjusted_size_value.get();
6861
void* mem_ptr = std::malloc(size);
6962
if (!mem_ptr) {
7063
ET_LOG(Error, "Malloc failed to allocate %zu bytes", size);
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#pragma once
10+
11+
#include <cstddef>
12+
#include <cstdint>
13+
#include <cstdlib>
14+
15+
#include <executorch/runtime/core/error.h>
16+
#include <executorch/runtime/core/result.h>
17+
#include <executorch/runtime/platform/compiler.h>
18+
19+
using executorch::runtime::Error;
20+
using executorch::runtime::Result;
21+
namespace executorch::extension::utils {
22+
23+
// Util to get alighment adjusted allocation size
24+
inline Result<size_t> get_aligned_size(size_t size, size_t alignment) {
25+
// The minimum alignment that malloc() is guaranteed to provide.
26+
static constexpr size_t kMallocAlignment = alignof(std::max_align_t);
27+
if (alignment > kMallocAlignment) {
28+
// To get higher alignments, allocate extra and then align the returned
29+
// pointer. This will waste an extra `alignment - 1` bytes every time, but
30+
// this is the only portable way to get aligned memory from the heap.
31+
const size_t extra = alignment - 1;
32+
if ET_UNLIKELY (extra >= SIZE_MAX - size) {
33+
ET_LOG(Error, "Malloc size overflow: size=%zu + extra=%zu", size, extra);
34+
return Result<size_t>(Error::InvalidArgument);
35+
}
36+
size += extra;
37+
}
38+
return size;
39+
}
40+
41+
} // namespace executorch::extension::utils

extension/memory_allocator/targets.bzl

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,25 @@ def define_common_targets():
1111
name = "malloc_memory_allocator",
1212
exported_headers = [
1313
"malloc_memory_allocator.h",
14+
"memory_allocator_utils.h",
15+
],
16+
exported_deps = [
17+
"//executorch/runtime/core:memory_allocator",
18+
],
19+
visibility = [
20+
"//executorch/extension/memory_allocator/test/...",
21+
"@EXECUTORCH_CLIENTS",
22+
],
23+
)
24+
25+
runtime.cxx_library(
26+
name = "cpu_caching_allocator",
27+
srcs = [
28+
"cpu_caching_malloc_allocator.cpp",
29+
],
30+
exported_headers = [
31+
"cpu_caching_malloc_allocator.h",
32+
"memory_allocator_utils.h",
1433
],
1534
exported_deps = [
1635
"//executorch/runtime/core:memory_allocator",

0 commit comments

Comments
 (0)