Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ggml/src/ggml-hexagon/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ target_include_directories(htp_iface PUBLIC
${HEXAGON_SDK_ROOT}/incs
${HEXAGON_SDK_ROOT}/incs/stddef
${HEXAGON_SDK_ROOT}/utils/examples
${HEXAGON_SDK_ROOT}/libs/itrace/inc
${CMAKE_CURRENT_SOURCE_DIR}/htp
${CMAKE_CURRENT_BINARY_DIR})

Expand All @@ -32,6 +33,7 @@ ggml_add_backend_library(${TARGET_NAME}
ggml-hexagon.cpp htp-utils.c htp-utils.h ../../include/ggml-hexagon.h)

# Link the backend against the generated HTP interface stubs and the itrace
# tracing library that ships prebuilt inside the Hexagon SDK.
# NOTE(review): the itrace path is hard-coded to the android_aarch64 prebuilt;
# other target platforms would presumably need a different binary - confirm.
target_link_libraries(${TARGET_NAME} PRIVATE
    htp_iface
    ${HEXAGON_SDK_ROOT}/libs/itrace/prebuilt/android_aarch64/libitrace.so)

# Private headers: the HTP sources and the files generated into the build tree.
target_include_directories(${TARGET_NAME} PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/htp
    ${CMAKE_CURRENT_BINARY_DIR})

# Build HTP bits
Expand Down
82 changes: 82 additions & 0 deletions ggml/src/ggml-hexagon/ggml-hexagon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
#include <string>
#include <stdexcept>

#include "itrace.h"
#include "itrace_types.h"
#include "itrace_cpu_events.h"
#ifdef _WIN32
# include <sal.h>
# ifndef _WINDOWS
Expand Down Expand Up @@ -45,6 +48,9 @@ static int opt_arch = 0; // autodetect
// Runtime option flags for the Hexagon backend. The flags noted below are
// set in ggml_hexagon_init() from the presence of an environment variable.

// Non-zero when GGML_HEXAGON_ETM is present in the environment.
static int opt_etm = 0;
static int opt_verbose = 0;
// Non-zero when GGML_HEXAGON_PROFILE is present in the environment.
static int opt_profile = 0;
// Non-zero when GGML_HEXAGON_TRACE is present in the environment; every itrace
// call in this file is guarded by this flag.
static int opt_trace = 0;
// itrace logger handle: opened in ggml_hexagon_init() when opt_trace is set,
// flushed and closed in ggml_backend_hexagon_free(). Stays NULL otherwise.
static itrace_logger_handle_t g_itrace_logger_handle = NULL;
// itrace CPU-domain profiler handle used for all start/end section calls.
// NOTE(review): no matching close call for this handle is visible - confirm
// whether closing the logger is sufficient.
static itrace_profiler_handle_t g_itrace_cpu_profiler_handle = NULL;
static int opt_hostbuf = 1;
// Non-zero when GGML_HEXAGON_EXPERIMENTAL is present in the environment.
static int opt_experimental = 0;

Expand Down Expand Up @@ -282,6 +288,9 @@ void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_

// Flush HTP response queue, i.e. wait for all outstanding requests to complete
void ggml_hexagon_session::flush() {
if (opt_trace) {
itrace_start_section(g_itrace_cpu_profiler_handle, "session-flush", NULL);
}
dspqueue_t q = this->queue;

// Repeatedly read packets from the queue until it's empty. We don't
Expand Down Expand Up @@ -332,6 +341,10 @@ void ggml_hexagon_session::flush() {

this->op_pending--; // atomic dec
}
if (opt_trace) {
itrace_end_section(g_itrace_cpu_profiler_handle, NULL);
itrace_flush_logs(g_itrace_logger_handle);
}
}

// ** backend buffers
Expand Down Expand Up @@ -694,6 +707,9 @@ static void init_row_q4x4x2(block_q4_0 * x, int64_t k) {

// repack q4_0 data into q4x4x2 tensor
static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) {
if (opt_trace) {
itrace_start_section(g_itrace_cpu_profiler_handle, (std::string("repack-q4_0-q4x4x2-") + t->name).c_str(), NULL);
}
int64_t nrows = ggml_nrows(t);

size_t row_size = ggml_row_size(t->type, t->ne[0]);
Expand Down Expand Up @@ -751,6 +767,9 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)

ggml_aligned_free(buf_pd, row_size_pd);
ggml_aligned_free(buf_rp, row_size_rp);
if (opt_trace) {
itrace_end_section(g_itrace_cpu_profiler_handle, NULL);
}
}

// repack q4x4x2 tensor into q4_0 data
Expand Down Expand Up @@ -2323,6 +2342,9 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer
}

static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) {
if (opt_trace) {
itrace_start_section(g_itrace_cpu_profiler_handle, (std::string("mul-mat-") + op->name).c_str(), NULL);
}
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
const struct ggml_tensor * dst = op;
Expand Down Expand Up @@ -2390,9 +2412,16 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
(uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
(uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
(float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);

if (opt_trace) {
itrace_end_section(g_itrace_cpu_profiler_handle, NULL);
}
}

static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flags) {
if (opt_trace) {
itrace_start_section(g_itrace_cpu_profiler_handle, (std::string("mul-mat-id-") + op->name).c_str(), NULL);
}
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
const struct ggml_tensor * src2 = op->src[2];
Expand Down Expand Up @@ -2469,9 +2498,15 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
(uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
(uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
(float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
if (opt_trace) {
itrace_end_section(g_itrace_cpu_profiler_handle, NULL);
}
}

static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
if (opt_trace) {
itrace_start_section(g_itrace_cpu_profiler_handle, "binary", NULL);
}
const struct ggml_tensor * node = op;
const struct ggml_tensor * src0 = node->src[0];
const struct ggml_tensor * src1 = node->src[1];
Expand Down Expand Up @@ -2558,9 +2593,15 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
(uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
(uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
(float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
if (opt_trace) {
itrace_end_section(g_itrace_cpu_profiler_handle, NULL);
}
}

static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
if (opt_trace) {
itrace_start_section(g_itrace_cpu_profiler_handle, (std::string("add-id-") + op->name).c_str(), NULL);
}
const struct ggml_tensor * node = op;
const struct ggml_tensor * src0 = node->src[0];
const struct ggml_tensor * src1 = node->src[1];
Expand Down Expand Up @@ -2633,9 +2674,15 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
(uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
(uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
(float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
if (opt_trace) {
itrace_end_section(g_itrace_cpu_profiler_handle, NULL);
}
}

static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
if (opt_trace) {
itrace_start_section(g_itrace_cpu_profiler_handle, "unary", NULL);
}
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
const struct ggml_tensor * dst = op;
Expand Down Expand Up @@ -2766,9 +2813,15 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
(uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
(float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
}
if (opt_trace) {
itrace_end_section(g_itrace_cpu_profiler_handle, NULL);
}
}

static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
if (opt_trace) {
itrace_start_section(g_itrace_cpu_profiler_handle, "rope", NULL);
}
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
const struct ggml_tensor * src2 = op->src[2];
Expand Down Expand Up @@ -2874,6 +2927,9 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
(uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
(float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
}
if (opt_trace) {
itrace_end_section(g_itrace_cpu_profiler_handle, NULL);
}
}

static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
Expand All @@ -2882,6 +2938,13 @@ static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
}

static void ggml_backend_hexagon_free(ggml_backend_t backend) {
// Flush and close the itrace logger if tracing (GGML_HEXAGON_TRACE) was enabled
if (opt_trace) {
itrace_flush_logs(g_itrace_logger_handle);
itrace_close_logger(g_itrace_logger_handle);
HEX_VERBOSE("ggml-hex: close itrace\n");
}

// we just need to delete the backend here
// the sessions are allocated & freed as part of the registry
delete backend;
Expand Down Expand Up @@ -2911,6 +2974,10 @@ static inline int last_compute_op(ggml_cgraph * graph) {
static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
auto sess = static_cast<ggml_hexagon_session *>(backend->context);

if (opt_trace) {
itrace_start_section(g_itrace_cpu_profiler_handle, (std::string("hexagon-graph-compute-") + sess->name).c_str(), NULL);
}

HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->name.c_str(), graph->n_nodes);

const int last = last_compute_op(graph);
Expand Down Expand Up @@ -2983,6 +3050,10 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
// Wait until all pending ops complete
sess->flush();

if (opt_trace) {
itrace_end_section(g_itrace_cpu_profiler_handle, NULL);
}

return GGML_STATUS_SUCCESS;
}

Expand Down Expand Up @@ -3476,6 +3547,17 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
opt_profile = getenv("GGML_HEXAGON_PROFILE") != nullptr;
opt_etm = getenv("GGML_HEXAGON_ETM") != nullptr;
opt_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL") != nullptr;
opt_trace = getenv("GGML_HEXAGON_TRACE") != nullptr;

// Initialize itrace if tracing (GGML_HEXAGON_TRACE) is enabled
if (opt_trace) {
HEX_VERBOSE("ggml-hex: open itrace\n");
itrace_open_logger(CPU_DOMAIN_ID, &g_itrace_logger_handle);
itrace_open_profiler(g_itrace_logger_handle, CPU_DOMAIN_ID, 0x1000000, &g_itrace_cpu_profiler_handle);

itrace_start_section(g_itrace_cpu_profiler_handle, "open-itrace", NULL);
itrace_end_section(g_itrace_cpu_profiler_handle, NULL);
}

const char * str_opmask = getenv("GGML_HEXAGON_OPMASK");
if (str_opmask != nullptr) {
Expand Down
5 changes: 4 additions & 1 deletion scripts/snapdragon/adb/run-cli.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ sched=
# Optional environment settings passed to the on-device command line.
# Each one stays empty unless the matching host variable is non-empty.

# PROF: turns on backend profiling; op-sync is enabled together with it.
profile=
if [ -n "$PROF" ]; then
    profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
fi

# TRACE: turns on itrace section tracing in the Hexagon backend.
trace=
if [ -n "$TRACE" ]; then
    trace="GGML_HEXAGON_TRACE=$TRACE"
fi

# OPMASK: forwarded as GGML_HEXAGON_OPMASK.
opmask=
if [ -n "$OPMASK" ]; then
    opmask="GGML_HEXAGON_OPMASK=$OPMASK"
fi

Expand All @@ -45,7 +48,7 @@ adb $adbserial shell " \
cd $basedir; ulimit -c unlimited; \
LD_LIBRARY_PATH=$basedir/$branch/lib \
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
$verbose $experimental $sched $opmask $profile $nhvx $ndev \
$verbose $experimental $sched $opmask $profile $trace $nhvx $ndev \
./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
--ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \
Expand Down