Skip to content
Open
Changes from 1 commit
Commits
Show all changes
66 commits
Select commit Hold shift + click to select a range
ca22e28
Rename sgemm_direct_sme1.S to sgemm_direct_sme1_2VLx2VL.S
martin-frbg Aug 18, 2025
22c6607
Use ASMNAME to get symbol name from build system; leave x18 unused as…
martin-frbg Aug 18, 2025
89898fc
Add sgemm_direct_performant for switching between direct and regular …
martin-frbg Aug 18, 2025
08a0032
Build symbol name from build system variables
martin-frbg Aug 18, 2025
53d3bb5
Get symbol name from build system; change b.first to b.mi for AppleCl…
martin-frbg Aug 18, 2025
731f4dd
Add VORTEXM4 settings
martin-frbg Aug 18, 2025
e82bcd2
Update ARM64 sgemm_direct object generation
martin-frbg Aug 18, 2025
0203657
Add sgemm_direct_performant for ARM64
martin-frbg Aug 18, 2025
de91afd
Move SGEMM_DIRECT after the CBLAS parameter check and add sgemm_direc…
martin-frbg Aug 18, 2025
202a7a0
Separate VORTEXM4 from VORTEX and ARMV9SME
martin-frbg Aug 18, 2025
e76c390
Add sgemm_direct_performant for ARM64
martin-frbg Aug 18, 2025
ef0b883
Add sgemm_direct_performant for ARM64
martin-frbg Aug 18, 2025
ccfd017
Enable SME on MacOS and add VORTEXM4 to DYNAMIC_ARCH list
martin-frbg Aug 18, 2025
b0a00fb
Add minimal compiler flags for VORTEXM4
martin-frbg Aug 18, 2025
3097046
Add VORTEXM4 target
martin-frbg Aug 18, 2025
4e2a8c1
Split VORTEXM4 from VORTEX target due to SME support
martin-frbg Aug 18, 2025
18f9582
Add VORTEXM4
martin-frbg Aug 18, 2025
ca542f3
Add VORTEXM4
martin-frbg Aug 18, 2025
a4f5fec
Add compiler options for VORTEXM4
martin-frbg Aug 18, 2025
c794d0a
Add VORTEXM4
martin-frbg Aug 18, 2025
4328c91
relax requirements in compiler SME capability check
martin-frbg Aug 18, 2025
426b5f2
Add compiler options for VORTEXM4
martin-frbg Aug 18, 2025
0bc19a1
Update SME kernel details
martin-frbg Aug 18, 2025
bf98e44
Add VORTEXM4 to DYNAMIC_ARCH list
martin-frbg Aug 18, 2025
4609732
Relax version number requirement for AppleClang
martin-frbg Aug 18, 2025
05dbb54
Delete misplaced file
martin-frbg Aug 19, 2025
107c883
Update SME-related kernels
martin-frbg Aug 19, 2025
501728a
adjust register 20 accesses to 21 after moving x18
martin-frbg Aug 20, 2025
edaa73f
Hide the local 2VLx2VL symbol as static is insufficient for this with…
martin-frbg Aug 20, 2025
1ee8879
Add VORTEXM4
martin-frbg Aug 20, 2025
7f89c6f
smh-based direct sgemm currently requires leading dimensions to be sa…
martin-frbg Aug 23, 2025
8e50b8d
Add d8 to d15 to clobber lists as the code does not expressly save them
martin-frbg Aug 23, 2025
b4fc09e
Add registers d8 to d15 to clobber lists as the code does not express…
martin-frbg Aug 23, 2025
1b88c9c
remove debugging printouts
martin-frbg Aug 24, 2025
2b5d8c7
remove debugging printout
martin-frbg Aug 24, 2025
fc516af
Merge branch 'develop' into issue5414
martin-frbg Oct 1, 2025
ba9d2d2
remove sme from M4 Fortran flags as gfortran couples it with sve
martin-frbg Oct 2, 2025
b3d0bc4
Update Makefile.L3
martin-frbg Oct 2, 2025
4ae3e37
restore 2VLx2VL naming
martin-frbg Oct 2, 2025
c889558
Rework for DYNAMIC_ARCH use and use of SGEMM functions by SSYMM
martin-frbg Oct 2, 2025
20f5ed1
Merge branch 'OpenMathLib:develop' into issue5414
martin-frbg Oct 8, 2025
47a66ae
Update limits based on benchmarking the SME code on Apple M4
martin-frbg Oct 8, 2025
9bfc361
Merge branch 'OpenMathLib:develop' into issue5414
martin-frbg Oct 12, 2025
8211db6
Don't enable SME for VortexM4 when the compiler is gcc (which does no…
martin-frbg Oct 19, 2025
2346d0b
Add HAVE_SME for VortexM4 only with non-gcc compilers
martin-frbg Oct 19, 2025
d7b0fcc
Enable SME-based kernels for VortexM4 with clang-based compilers only
martin-frbg Oct 19, 2025
643a0b5
Allow VortexM4 on the direct_SME fast path only for clang-based compi…
martin-frbg Oct 19, 2025
e01b109
Allow VortexM4 on the same fast path only with non-gcc compilers
martin-frbg Oct 19, 2025
f4ee3ae
Allow VortexM4 on the SME fast path only with non-gcc compilers
martin-frbg Oct 19, 2025
1b591ea
export HAVE_SME setting and exclude VortexM4 from DYNAMIC_ARCH if gcc…
martin-frbg Oct 19, 2025
83d3e0e
fix copy/paste
martin-frbg Oct 19, 2025
682f61e
Add prototype for gotoblas_corename
martin-frbg Oct 19, 2025
ea85b66
Merge branch 'OpenMathLib:develop' into issue5414
martin-frbg Nov 23, 2025
9c0965b
Merge branch 'OpenMathLib:develop' into issue5414
martin-frbg Nov 23, 2025
8c0b13c
Merge branch 'OpenMathLib:develop' into issue5414
martin-frbg Nov 23, 2025
7d35bf6
Add cpuid for Apple M5 (from a PR to the archspec project)
martin-frbg Nov 24, 2025
7e44f62
fix sequence of arm64 sgemm_direct_performance and sgemm_direct_ab
martin-frbg Nov 24, 2025
b0bd49a
Add compiler guard around the M4 HAVE_SME property
martin-frbg Nov 24, 2025
4af1870
Only add dedicated VORTEXM4 if building with LLVM
martin-frbg Nov 24, 2025
b185c9a
small fixes for separating sme and dummy parts
martin-frbg Nov 24, 2025
a683287
rework for dynamic_arch
martin-frbg Nov 24, 2025
705259c
remove redundant HAVE_SME
martin-frbg Nov 24, 2025
7ab8dc1
rework ARM64 SME dependency handling
martin-frbg Nov 24, 2025
c3c857c
fix sequence
martin-frbg Nov 24, 2025
825d3ad
AppleClang does not define feature local_streaming
martin-frbg Nov 28, 2025
e85efb8
remove za from clobber lists
martin-frbg Dec 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Move SGEMM_DIRECT after the CBLAS parameter check and add sgemm_direct_performant for ARM64
  • Loading branch information
martin-frbg authored Aug 18, 2025
commit de91afd2ae52ee0bf5a00680a7158e6c10cad6ae
56 changes: 32 additions & 24 deletions interface/gemm.c
Original file line number Diff line number Diff line change
Expand Up @@ -424,30 +424,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS

PRINT_DEBUG_CNAME;

#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16)
#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (support_avx512() )
#endif
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
#endif
#if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (support_sme1())
#endif
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}else if (order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) {
SGEMM_DIRECT_ALPHA_BETA(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
return;
}
#endif
#endif

#ifndef COMPLEX
args.alpha = (void *)&alpha;
args.beta = (void *)&beta;
Expand Down Expand Up @@ -564,6 +540,35 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
return;
}


if ((args.m == 0) || (args.n == 0)) return;
#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16)
#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (support_avx512() )
#endif
if (order == CblasRowMajor && beta == 0 && alpha == 1.0 && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
#endif
#if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (strcmp(gotoblas_corename(), "armv9sme") == 0 || strcmp(gotoblas_corename(), "vortexm4") == 0)
// if (support_sme1())
#endif
if (order == CblasRowMajor && beta == 0 && alpha == 1.0 && TransA == CblasNoTrans && TransB == CblasNoTrans&& SGEMM_DIRECT_PERFORMANT(m,n,k)) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
else
if (order == CblasRowMajor && beta != 0. && (!(alpha==1.&&beta==1.)) && TransA == CblasNoTrans && TransB == CblasNoTrans&& SGEMM_DIRECT_PERFORMANT(m,n,k)) {
SGEMM_DIRECT_ALPHA_BETA(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
return;
}
#endif
#endif

#endif

#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
Expand All @@ -582,6 +587,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS

if ((args.m == 0) || (args.n == 0)) return;




#if 0
fprintf(stderr, "m = %4d n = %d k = %d lda = %4d ldb = %4d ldc = %4d\n",
args.m, args.n, args.k, args.lda, args.ldb, args.ldc);
Expand Down