Draft
Changes from 1 commit
Commits (28)
407b408  fix test failure (chraac, Nov 27, 2025)
4ddb8a4  fix: correct scaling calculations in rope_cache_init (chraac, Nov 27, 2025)
cfca78b  wip (chraac, Nov 27, 2025)
e9a02fd  wip (chraac, Nov 28, 2025)
e324bb0  fix: optimize element copying in rope_hex_f32 using memcpy (chraac, Nov 28, 2025)
0121291  fix: optimize loop boundaries in rope_hex_f32 for better performance (chraac, Nov 28, 2025)
010039a  rename (chraac, Nov 28, 2025)
a6ef41f  wip (chraac, Nov 28, 2025)
0376146  Merge branch 'master' into dev-fix-rope (chraac, Nov 28, 2025)
8abecfa  Merge tag 'b7207' into dev-fix-rope (chraac, Nov 30, 2025)
b567413  feat: add profiling macros for performance measurement in operations (chraac, Nov 30, 2025)
7c8f101  refactor: replace manual timing with profiling macros in matmul opera… (chraac, Dec 3, 2025)
3a70465  Merge branch 'master' into dev-fix-rope (chraac, Dec 4, 2025)
3b0cef4  Revert "refactor: replace manual timing with profiling macros in matm… (chraac, Dec 5, 2025)
121e656  Revert "feat: add profiling macros for performance measurement in ope… (chraac, Dec 5, 2025)
401fd3e  refactor: optimize vector operations in vec_dot_q4x4x2_q8x4x2_rx2 fun… (chraac, Dec 5, 2025)
cf491f2  wip (chraac, Dec 5, 2025)
3a01d82  feat: enhance vec_dot_q4x4x2_q8x4x2_rx2 function with optimized data … (chraac, Dec 7, 2025)
87ad8b2  Merge branch 'master' into dev-mulmat-opt (chraac, Dec 8, 2025)
421d031  feat: add hvx_vec_load_d_and_mpy function for optimized data loading … (chraac, Dec 8, 2025)
bd43860  wip (chraac, Dec 8, 2025)
b197464  feat: add hvx_vec_load_d_and_mpy_r2x2 function for optimized vector l… (chraac, Dec 8, 2025)
309d782  feat: optimize vec_dot functions with improved data handling and loading (chraac, Dec 8, 2025)
dbe9309  wip (chraac, Dec 9, 2025)
00d5fb3  feat: add build information and update vector loading functions for o… (chraac, Dec 9, 2025)
b54ff18  revert rope changes (chraac, Dec 10, 2025)
f757245  Merge tag 'b7345' into dev-mulmat-opt (chraac, Dec 10, 2025)
09c4899  fix: revert HVX_Vector back to HVX_UVector (chraac, Dec 11, 2025)
Revert "refactor: replace manual timing with profiling macros in matm…
…ul operations"

This reverts commit 7c8f101.
chraac committed Dec 5, 2025
commit 3b0cef47e20c66f680e75dc321141efe634772a5
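
The diff below drops the PROFILER_START/PROFILER_END macros and restores inline timing around each hot region: read the Hexagon qtimer before and after the region with HAP_perf_get_qtimer_count(), convert the tick delta to microseconds with HAP_perf_qtimer_count_to_us(), and report it through FARF(HIGH, ...). A minimal sketch of that pattern follows, assuming the Hexagon SDK headers HAP_perf.h and HAP_farf.h provide these calls; the function name timed_region_example and the placeholder work inside it are illustrative, not code from this PR.

#include <stdint.h>
#include "HAP_perf.h"   // HAP_perf_get_qtimer_count, HAP_perf_qtimer_count_to_us (assumed SDK header)
#include "HAP_farf.h"   // FARF logging macro (assumed SDK header)

static void timed_region_example(void) {
    uint64_t t1 = HAP_perf_get_qtimer_count();   // qtimer ticks before the measured region

    // ... work to measure (in this file: the matmul/matvec inner loops) ...

    uint64_t t2 = HAP_perf_get_qtimer_count();   // qtimer ticks after the measured region

    // Convert the tick delta to microseconds and log it, as the restored call sites do.
    FARF(HIGH, "timed-region usec %u", (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
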
73 changes: 45 additions & 28 deletions ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -1092,7 +1092,8 @@ static void matmul(struct htp_matmul_type * mt,
uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith;
uint8_t * restrict src1_data = src1_spad->data;

PROFILER_START(matmul);
volatile uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();

const uint8_t * restrict src0_row = (const uint8_t *) src0->data;

@@ -1143,9 +1144,12 @@ static void matmul(struct htp_matmul_type * mt,
}
}

PROFILER_END(matmul, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith,
nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0],
src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
t2 = HAP_perf_get_qtimer_count();

FARF(HIGH, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1],
src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}

// q8x4x2 src1 tensor is already in VTCM spad
@@ -1186,7 +1190,8 @@ static void matvec(struct htp_matmul_type * mt,
uint8_t * spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith;
uint8_t * src1_data = src1_spad->data;

PROFILER_START(matvec);
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();

float * tmp = (float *) spad_dst;

@@ -1231,9 +1236,12 @@ static void matvec(struct htp_matmul_type * mt,

hvx_copy_fp32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row);

PROFILER_END(matvec, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith,
nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0],
src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
t2 = HAP_perf_get_qtimer_count();

FARF(HIGH, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1],
src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}

#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ids->ne[0] * ids->ne[1] + (i1)]
@@ -1259,7 +1267,8 @@ static void matmul_id(struct htp_matmul_type * mt,
dma_queue * dma_queue) {
htp_matmul_preamble;

PROFILER_START(matmul_id);
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();

const uint32_t src0_nrows = ne01; // src0 rows per expert
const uint32_t src1_nrows = ne11;
@@ -1364,11 +1373,12 @@
}
}

PROFILER_END(matmul_id,
"matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n",
mt->type, ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row,
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
t2 = HAP_perf_get_qtimer_count();

FARF(HIGH, "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type,
ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0],
src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], dst->ne[0], dst->ne[1],
dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}

// q8x4 src1 tensor is already in VTCM spad
@@ -1387,7 +1397,8 @@ static void matvec_id(struct htp_matmul_type * mt,
dma_queue * dma_queue) {
htp_matmul_preamble;

PROFILER_START(matvec_id);
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();

const uint32_t src0_nrows = ne01; // src0 rows per expert

@@ -1462,11 +1473,12 @@
}
}

PROFILER_END(matvec_id,
"matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n",
mt->type, ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row,
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
t2 = HAP_perf_get_qtimer_count();

FARF(HIGH, "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type,
ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0],
src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0],
dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}

// *** matmul in fp16
@@ -1483,7 +1495,8 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
dma_queue * dma_queue) {
htp_matmul_preamble;

PROFILER_START(matmul_f16_f32);
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();

const size_t src0_row_size = sizeof(__fp16) * ne00;
const size_t src1_row_size = sizeof(float) * ne10;
@@ -1562,10 +1575,12 @@
}
}

PROFILER_END(matmul_f16_f32,
"matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end,
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
t2 = HAP_perf_get_qtimer_count();

FARF(HIGH, "matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end, src1->ne[0],
src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}

// *** dynamic quant
@@ -1647,7 +1662,7 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src,
uint32_t nth,
uint32_t ith,
uint32_t nrows_per_thread) {
PROFILER_START(quantize_fp32_q8x4);
uint64_t t1 = HAP_perf_get_qtimer_count();

const uint32_t ne0 = src->ne[0];
const uint32_t ne1 = src->ne[1];
@@ -1679,8 +1694,10 @@
src_data += src_row_size;
}

PROFILER_END(quantize_fp32_q8x4, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith,
nth, nrows, ir_first, ir_last, src_row_size, dst_row_size);
uint64_t t2 = HAP_perf_get_qtimer_count();

FARF(HIGH, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first,
ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}

static void htp_quantize_fp32_q8x4x2(unsigned int n, unsigned int i, void * data) {
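
For contrast, the reverted commits had wrapped this same pattern behind profiling macros. A hypothetical sketch of what such a wrapper can look like is shown below; these are not the PROFILER_START/PROFILER_END definitions removed by this PR, only an illustration of the abstraction the revert trades away in favor of explicit t1/t2 locals and an explicit FARF call at each site.

#define PROFILE_BEGIN(tag)  uint64_t prof_##tag##_t1 = HAP_perf_get_qtimer_count()

#define PROFILE_FINISH(tag, fmt, ...)                                                       \
    do {                                                                                    \
        uint64_t prof_##tag##_t2 = HAP_perf_get_qtimer_count();                             \
        /* append the elapsed microseconds as the last argument, matching the "usec %u" */  \
        /* suffix used by the FARF format strings in this file */                           \
        FARF(HIGH, fmt, __VA_ARGS__,                                                        \
             (unsigned) HAP_perf_qtimer_count_to_us(prof_##tag##_t2 - prof_##tag##_t1));    \
    } while (0)
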