Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
6f77646
[Executorch] make slice_copy parallel
kimishpatel Nov 14, 2025
426ee4e
Update on "[Executorch] make slice_copy parallel"
kimishpatel Nov 20, 2025
e5ad010
Update on "[Executorch] make slice_copy parallel"
kimishpatel Nov 21, 2025
f97cbdd
Update on "[Executorch] make slice_copy parallel"
kimishpatel Nov 22, 2025
1b49951
Update on "[Executorch] make slice_copy parallel"
kimishpatel Nov 23, 2025
9c02718
Update on "[Executorch] make slice_copy parallel"
kimishpatel Nov 23, 2025
f586127
Update on "[Executorch] make slice_copy parallel"
kimishpatel Nov 23, 2025
65cfbad
Update on "[Executorch] make slice_copy parallel"
kimishpatel Nov 24, 2025
6cbca44
Update on "[Executorch] make slice_copy parallel"
kimishpatel Nov 24, 2025
996847f
Update on "[Executorch] make slice_copy parallel"
kimishpatel Nov 24, 2025
301673a
Update on "[Executorch] make slice_copy parallel"
kimishpatel Nov 24, 2025
35edbfb
Update on "[Executorch] make slice_copy parallel"
kimishpatel Nov 25, 2025
91d62d6
Update on "[Executorch] make slice_copy parallel"
kimishpatel Dec 4, 2025
53b7747
Update on "[Executorch] make slice_copy parallel"
kimishpatel Dec 4, 2025
2a58036
Update on "[Executorch] make slice_copy parallel"
kimishpatel Dec 4, 2025
fd05691
Update on "[Executorch] make slice_copy parallel"
kimishpatel Dec 4, 2025
872303d
Update on "[Executorch] make slice_copy parallel"
kimishpatel Dec 4, 2025
44f9fa2
Update on "[Executorch] make slice_copy parallel"
kimishpatel Dec 4, 2025
67cffb5
Update on "[Executorch] make slice_copy parallel"
kimishpatel Dec 4, 2025
796930a
Update on "[Executorch] make slice_copy parallel"
kimishpatel Dec 4, 2025
38844e1
Update on "[Executorch] make slice_copy parallel"
kimishpatel Dec 4, 2025
cbd10d2
Update on "[Executorch] make slice_copy parallel"
kimishpatel Dec 6, 2025
117b6e7
Update on "[Executorch] make slice_copy parallel"
kimishpatel Dec 7, 2025
4c5d92d
Update on "[Executorch] make slice_copy parallel"
kimishpatel Dec 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 39 additions & 6 deletions kernels/portable/cpu/util/slice_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <c10/util/irange.h>
#include <executorch/kernels/portable/cpu/util/slice_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/kernel/thread_parallel_interface.h>
#include <cstring>

namespace torch {
Expand Down Expand Up @@ -202,12 +203,44 @@ void compute_slice(
InvalidArgument,
/* void */,
"out.nbytes() is smaller than the expected slice size.");
for (const auto i : c10::irange(leading_dims)) {
const char* src = input_data + (i * dim_length + start) * length_per_step;
for ([[maybe_unused]] const auto j : c10::irange(length)) {
memcpy(dest, src, length_per_step);
src += step * length_per_step;
dest += length_per_step;
// Thresholds for enabling multithreading (both must be satisfied):
// - Minimum number of leading dimensions: 8
// - Minimum total number of elements to copy: 32768 (GRAIN_SIZE)
constexpr int64_t MIN_LEADING_DIMS_FOR_MT = 8;
constexpr int64_t MIN_ELEMENTS_FOR_MT =
executorch::extension::internal::GRAIN_SIZE;

const int64_t total_elements = leading_dims * length * trailing_dims;
const bool use_multithreading = leading_dims >= MIN_LEADING_DIMS_FOR_MT &&
total_elements >= MIN_ELEMENTS_FOR_MT;

if (use_multithreading) {
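// Use parallel_for to distribute work across leading dimensions.
// The grain size is a fixed count of leading dimensions per task; it is not
// derived from the number of elements per leading dimension. Each iteration
// computes its own destination offset from i rather than advancing dest.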
const int64_t grain_size = MIN_LEADING_DIMS_FOR_MT;

executorch::extension::parallel_for(
0, leading_dims, grain_size, [&](const auto begin, const auto end) {
for (const auto i : c10::irange(begin, end)) {
const char* src =
input_data + (i * dim_length + start) * length_per_step;
char* local_dest = dest + i * length * length_per_step;
for ([[maybe_unused]] const auto j : c10::irange(length)) {
memcpy(local_dest, src, length_per_step);
src += step * length_per_step;
local_dest += length_per_step;
}
}
});
} else {
// Single-threaded path for small workloads
for (const auto i : c10::irange(leading_dims)) {
const char* src = input_data + (i * dim_length + start) * length_per_step;
for ([[maybe_unused]] const auto j : c10::irange(length)) {
memcpy(dest, src, length_per_step);
src += step * length_per_step;
dest += length_per_step;
}
}
}
}
Expand Down
1 change: 1 addition & 0 deletions kernels/portable/cpu/util/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ def define_common_targets():
exported_headers = ["slice_util.h"],
deps = [
"//executorch/runtime/kernel:kernel_includes",
"//executorch/extension/threadpool:threadpool",
],
visibility = ["//executorch/kernels/portable/cpu/..."],
)
Expand Down
Loading