From fff5417f543efca6379ff11e273ba61ea511df00 Mon Sep 17 00:00:00 2001 From: kevin Date: Sat, 6 Dec 2025 22:25:36 +0800 Subject: [PATCH 1/4] add itrace --- ggml/src/ggml-hexagon/CMakeLists.txt | 2 ++ ggml/src/ggml-hexagon/ggml-hexagon.cpp | 31 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index ac422027b91..268df7bce60 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -11,6 +11,7 @@ target_include_directories(htp_iface PUBLIC ${HEXAGON_SDK_ROOT}/incs ${HEXAGON_SDK_ROOT}/incs/stddef ${HEXAGON_SDK_ROOT}/utils/examples + ${HEXAGON_SDK_ROOT}/libs/itrace/inc ${CMAKE_CURRENT_SOURCE_DIR}/htp ${CMAKE_CURRENT_BINARY_DIR}) @@ -32,6 +33,7 @@ ggml_add_backend_library(${TARGET_NAME} ggml-hexagon.cpp htp-utils.c htp-utils.h ../../include/ggml-hexagon.h) target_link_libraries(${TARGET_NAME} PRIVATE htp_iface) +target_link_libraries(${TARGET_NAME} PRIVATE ${HEXAGON_SDK_ROOT}/libs/itrace/prebuilt/android_aarch64/libitrace.so) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/htp ${CMAKE_CURRENT_BINARY_DIR}) # Build HTP bits diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 72a82a89116..5df63da6269 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -11,6 +11,9 @@ #include #include +#include "itrace.h" +#include "itrace_types.h" +#include "itrace_cpu_events.h" #ifdef _WIN32 # include # ifndef _WINDOWS @@ -45,6 +48,9 @@ static int opt_arch = 0; // autodetect static int opt_etm = 0; static int opt_verbose = 0; static int opt_profile = 0; +static int opt_trace = 1; +static itrace_logger_handle_t g_itrace_logger_handle = NULL; +static itrace_profiler_handle_t g_itrace_cpu_profiler_handle = NULL; static int opt_hostbuf = 1; static int opt_experimental = 0; @@ -694,6 +700,9 @@ static void init_row_q4x4x2(block_q4_0 * x, int64_t k) { // repack q4_0 data into q4x4x2 tensor static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) { + if (opt_trace) { + itrace_start_section(g_itrace_cpu_profiler_handle, (std::string("ggml-hex-repack-q4_0-q4x4x2-") + t->name).c_str(), NULL); + } int64_t nrows = ggml_nrows(t); size_t row_size = ggml_row_size(t->type, t->ne[0]); @@ -751,6 +760,9 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) ggml_aligned_free(buf_pd, row_size_pd); ggml_aligned_free(buf_rp, row_size_rp); + if (opt_trace) { + itrace_end_section(g_itrace_cpu_profiler_handle, NULL); + } } // repack q4x4x2 tensor into q4_0 data @@ -2323,6 +2335,9 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer } static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) { + if (opt_trace) { + itrace_start_section(g_itrace_cpu_profiler_handle, (std::string("ggml-hex-mul-mat-") + op->name).c_str(), NULL); + } const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * src1 = op->src[1]; const struct ggml_tensor * dst = op; @@ -2390,6 +2405,10 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); + + if (opt_trace) { + itrace_end_section(g_itrace_cpu_profiler_handle, NULL); + } } static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flags) { @@ -3429,6 +3448,12 @@ ggml_hexagon_registry::~ggml_hexagon_registry() { auto sess = static_cast(devices[i].context); delete sess; } + + // Flush and close itrace logger if profiling was enabled + if (opt_trace) { + itrace_flush_logs(g_itrace_logger_handle); + itrace_close_logger(g_itrace_logger_handle); + } } static const char * ggml_backend_hexagon_reg_get_name(ggml_backend_reg_t reg) { @@ -3477,6 +3502,12 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { opt_etm = getenv("GGML_HEXAGON_ETM") != nullptr; opt_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL") != nullptr; + // Initialize itrace if profiling is enabled + if (opt_trace) { + itrace_open_logger(CPU_DOMAIN_ID, &g_itrace_logger_handle); + itrace_open_profiler(g_itrace_logger_handle, CPU_DOMAIN_ID, 0, &g_itrace_cpu_profiler_handle); + } + const char * str_opmask = getenv("GGML_HEXAGON_OPMASK"); if (str_opmask != nullptr) { opt_opmask = strtoul(str_opmask, NULL, 0); From 606f7c749fcc3158add053b1343ba27b5e334696 Mon Sep 17 00:00:00 2001 From: kevin Date: Sun, 7 Dec 2025 07:18:31 +0800 Subject: [PATCH 2/4] support enabling trace from commmand line --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 20 +++++++++++++------- scripts/snapdragon/adb/run-cli.sh | 5 ++++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 5df63da6269..dad3cf2e254 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -48,7 +48,7 @@ static int opt_arch = 0; // autodetect static int opt_etm = 0; static int opt_verbose = 0; static int opt_profile = 0; -static int opt_trace = 1; +static int opt_trace = 0; static itrace_logger_handle_t g_itrace_logger_handle = NULL; static itrace_profiler_handle_t g_itrace_cpu_profiler_handle = NULL; static int opt_hostbuf = 1; @@ -2901,6 +2901,13 @@ static const char * ggml_backend_hexagon_name(ggml_backend_t backend) { } static void ggml_backend_hexagon_free(ggml_backend_t backend) { + // Flush and close itrace logger if profiling was enabled + if (opt_trace) { + itrace_flush_logs(g_itrace_logger_handle); + itrace_close_logger(g_itrace_logger_handle); + HEX_VERBOSE("ggml-hex: close itrace\n"); + } + // we just need to delete the backend here // the sessions are allocated & freed as part of the registry delete backend; @@ -3448,12 +3455,6 @@ ggml_hexagon_registry::~ggml_hexagon_registry() { auto sess = static_cast(devices[i].context); delete sess; } - - // Flush and close itrace logger if profiling was enabled - if (opt_trace) { - itrace_flush_logs(g_itrace_logger_handle); - itrace_close_logger(g_itrace_logger_handle); - } } static const char * ggml_backend_hexagon_reg_get_name(ggml_backend_reg_t reg) { @@ -3501,11 +3502,16 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { opt_profile = getenv("GGML_HEXAGON_PROFILE") != nullptr; opt_etm = getenv("GGML_HEXAGON_ETM") != nullptr; opt_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL") != nullptr; + opt_trace = getenv("GGML_HEXAGON_TRACE") != nullptr; // Initialize itrace if profiling is enabled if (opt_trace) { + HEX_VERBOSE("ggml-hex: open itrace\n"); itrace_open_logger(CPU_DOMAIN_ID, &g_itrace_logger_handle); itrace_open_profiler(g_itrace_logger_handle, CPU_DOMAIN_ID, 0, &g_itrace_cpu_profiler_handle); + + itrace_start_section(g_itrace_cpu_profiler_handle, "open-itrace", NULL); + itrace_end_section(g_itrace_cpu_profiler_handle, NULL); } const char * str_opmask = getenv("GGML_HEXAGON_OPMASK"); diff --git a/scripts/snapdragon/adb/run-cli.sh b/scripts/snapdragon/adb/run-cli.sh index cc5e47c2d67..939c0a2b471 100755 --- a/scripts/snapdragon/adb/run-cli.sh +++ b/scripts/snapdragon/adb/run-cli.sh @@ -30,6 +30,9 @@ sched= profile= [ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" +trace= +[ "$TRACE" != "" ] && trace="GGML_HEXAGON_TRACE=$TRACE" + opmask= [ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" @@ -45,7 +48,7 @@ adb $adbserial shell " \ cd $basedir; ulimit -c unlimited; \ LD_LIBRARY_PATH=$basedir/$branch/lib \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \ - $verbose $experimental $sched $opmask $profile $nhvx $ndev \ + $verbose $experimental $sched $opmask $profile $nhvx $ndev $trace \ ./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \ --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \ From 67f493ef75cdffc9cdf688071682a41187710aab Mon Sep 17 00:00:00 2001 From: kevin Date: Sun, 7 Dec 2025 09:39:14 +0800 Subject: [PATCH 3/4] fine tune the itrace slice name --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 51 ++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index dad3cf2e254..9e29c60ed7b 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -288,6 +288,9 @@ void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_ // Flush HTP response queue i.e wait for all outstanding requests to complete void ggml_hexagon_session::flush() { + if (opt_trace) { + itrace_start_section(g_itrace_cpu_profiler_handle, "session-flush", NULL); + } dspqueue_t q = this->queue; // Repeatedly read packets from the queue until it's empty. We don't @@ -338,6 +341,10 @@ void ggml_hexagon_session::flush() { this->op_pending--; // atomic dec } + if (opt_trace) { + itrace_end_section(g_itrace_cpu_profiler_handle, NULL); + itrace_flush_logs(g_itrace_logger_handle); + } } // ** backend buffers @@ -701,7 +708,7 @@ static void init_row_q4x4x2(block_q4_0 * x, int64_t k) { // repack q4_0 data into q4x4x2 tensor static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) { if (opt_trace) { - itrace_start_section(g_itrace_cpu_profiler_handle, (std::string("ggml-hex-repack-q4_0-q4x4x2-") + t->name).c_str(), NULL); + itrace_start_section(g_itrace_cpu_profiler_handle, (std::string("repack-q4_0-q4x4x2-") + t->name).c_str(), NULL); } int64_t nrows = ggml_nrows(t); @@ -2336,7 +2343,7 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) { if (opt_trace) { - itrace_start_section(g_itrace_cpu_profiler_handle, (std::string("ggml-hex-mul-mat-") + op->name).c_str(), NULL); + itrace_start_section(g_itrace_cpu_profiler_handle, (std::string("mul-mat-") + op->name).c_str(), NULL); } const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * src1 = op->src[1]; @@ -2412,6 +2419,9 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) } static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flags) { + if (opt_trace) { + itrace_start_section(g_itrace_cpu_profiler_handle, (std::string("mul-mat-id-") + op->name).c_str(), NULL); + } const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * src1 = op->src[1]; const struct ggml_tensor * src2 = op->src[2]; @@ -2488,9 +2498,15 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); + if (opt_trace) { + itrace_end_section(g_itrace_cpu_profiler_handle, NULL); + } } static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { + if (opt_trace) { + itrace_start_section(g_itrace_cpu_profiler_handle, "binary", NULL); + } const struct ggml_tensor * node = op; const struct ggml_tensor * src0 = node->src[0]; const struct ggml_tensor * src1 = node->src[1]; @@ -2577,9 +2593,15 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); + if (opt_trace) { + itrace_end_section(g_itrace_cpu_profiler_handle, NULL); + } } static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) { + if (opt_trace) { + itrace_start_section(g_itrace_cpu_profiler_handle, (std::string("add-id-") + op->name).c_str(), NULL); + } const struct ggml_tensor * node = op; const struct ggml_tensor * src0 = node->src[0]; const struct ggml_tensor * src1 = node->src[1]; @@ -2652,9 +2674,15 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) { (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); + if (opt_trace) { + itrace_end_section(g_itrace_cpu_profiler_handle, NULL); + } } static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { + if (opt_trace) { + itrace_start_section(g_itrace_cpu_profiler_handle, "unary", NULL); + } const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * src1 = op->src[1]; const struct ggml_tensor * dst = op; @@ -2785,9 +2813,15 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); } + if (opt_trace) { + itrace_end_section(g_itrace_cpu_profiler_handle, NULL); + } } static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) { + if (opt_trace) { + itrace_start_section(g_itrace_cpu_profiler_handle, "rope", NULL); + } const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * src1 = op->src[1]; const struct ggml_tensor * src2 = op->src[2]; @@ -2893,6 +2927,9 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) { (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); } + if (opt_trace) { + itrace_end_section(g_itrace_cpu_profiler_handle, NULL); + } } static const char * ggml_backend_hexagon_name(ggml_backend_t backend) { @@ -2937,6 +2974,10 @@ static inline int last_compute_op(ggml_cgraph * graph) { static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { auto sess = static_cast(backend->context); + if (opt_trace) { + itrace_start_section(g_itrace_cpu_profiler_handle, (std::string("hexagon-graph-compute-") + sess->name).c_str(), NULL); + } + HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->name.c_str(), graph->n_nodes); const int last = last_compute_op(graph); @@ -3009,6 +3050,10 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg // Wait until all pending ops complete sess->flush(); + if (opt_trace) { + itrace_end_section(g_itrace_cpu_profiler_handle, NULL); + } + return GGML_STATUS_SUCCESS; } @@ -3508,7 +3553,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { if (opt_trace) { HEX_VERBOSE("ggml-hex: open itrace\n"); itrace_open_logger(CPU_DOMAIN_ID, &g_itrace_logger_handle); - itrace_open_profiler(g_itrace_logger_handle, CPU_DOMAIN_ID, 0, &g_itrace_cpu_profiler_handle); + itrace_open_profiler(g_itrace_logger_handle, CPU_DOMAIN_ID, 0x1000000, &g_itrace_cpu_profiler_handle); itrace_start_section(g_itrace_cpu_profiler_handle, "open-itrace", NULL); itrace_end_section(g_itrace_cpu_profiler_handle, NULL); From 0b0003b4912aa4044e8865cecdd11c03e6b2e2ef Mon Sep 17 00:00:00 2001 From: kevin Date: Sun, 7 Dec 2025 07:18:31 +0800 Subject: [PATCH 4/4] support enabling trace from commmand line --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 9e29c60ed7b..7f41e525323 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -3553,7 +3553,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { if (opt_trace) { HEX_VERBOSE("ggml-hex: open itrace\n"); itrace_open_logger(CPU_DOMAIN_ID, &g_itrace_logger_handle); - itrace_open_profiler(g_itrace_logger_handle, CPU_DOMAIN_ID, 0x1000000, &g_itrace_cpu_profiler_handle); + itrace_open_profiler(g_itrace_logger_handle, CPU_DOMAIN_ID, 4*1024*1024, &g_itrace_cpu_profiler_handle); itrace_start_section(g_itrace_cpu_profiler_handle, "open-itrace", NULL); itrace_end_section(g_itrace_cpu_profiler_handle, NULL);