Skip to content

Some OpenCL kernels do not compile with Clang/LLVM #379

@vedranmiletic

Description

@vedranmiletic

Hello,

I am trying to get Mesa/Gallium/radeonsi/Clover/LLVM OpenCL stack to run libgpuarray and eventually Theano. Mesa already passes more then a half of the tests (I presume some don't use OpenCL):

$ DEVICE="opencl0:0" make test
Running tests...
Test project /users/miletivn/workspace/libgpuarray/build
    Start 1: test_types
1/9 Test #1: test_types .......................   Passed    0.00 sec
    Start 2: test_util
2/9 Test #2: test_util ........................   Passed    0.01 sec
    Start 3: test_util_integerfactoring
3/9 Test #3: test_util_integerfactoring .......   Passed    0.69 sec
    Start 4: test_reduction
4/9 Test #4: test_reduction ...................   Passed   10.07 sec
    Start 5: test_array
5/9 Test #5: test_array .......................***Failed    1.59 sec
    Start 6: test_blas
6/9 Test #6: test_blas ........................***Failed    1.66 sec
    Start 7: test_elemwise
7/9 Test #7: test_elemwise ....................***Failed   22.34 sec
    Start 8: test_error
8/9 Test #8: test_error .......................   Passed    0.00 sec
    Start 9: test_buffer
9/9 Test #9: test_buffer ......................   Passed    2.91 sec

67% tests passed, 3 tests failed out of 9

Total Test time (real) =  39.27 sec

The following tests FAILED:
	  5 - test_array (Failed)
	  6 - test_blas (Failed)
	  7 - test_elemwise (Failed)
Errors while running CTest
Makefile:116: recipe for target 'test' failed
make: *** [test] Error 8

Regarding failing tests, I would require some help figuring out whether it's Mesa/Clang/LLVM's fault or some non-standard feature used by libgpuarray.

$ DEVICE="opencl0:0" ./tests/check_array 
Running suite(s): array
Program build failure ::
input.cl:50:31: error: casting '__global uint *' (aka '__global unsigned int *') to type 'char *' changes address space of pointer
input.cl:51:34: error: casting 'const __global long *' to type 'char *' changes address space of pointer
0001	#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE)
0002	#define WITHIN_KERNEL /* empty */
0003	#define KERNEL __kernel
0004	#define GLOBAL_MEM __global
0005	#define LOCAL_MEM __local
0006	#define LOCAL_MEM_ARG __local
0007	#define REQD_WG_SIZE(x, y, z) __attribute__((reqd_work_group_size(x, y, z)))
0008	#ifndef NULL
0009	  #define NULL ((void*)0)
0010	#endif
0011	#define LID_0 get_local_id(0)
0012	#define LID_1 get_local_id(1)
0013	#define LID_2 get_local_id(2)
0014	#define LDIM_0 get_local_size(0)
0015	#define LDIM_1 get_local_size(1)
0016	#define LDIM_2 get_local_size(2)
0017	#define GID_0 get_group_id(0)
0018	#define GID_1 get_group_id(1)
0019	#define GID_2 get_group_id(2)
0020	#define GDIM_0 get_num_groups(0)
0021	#define GDIM_1 get_num_groups(1)
0022	#define GDIM_2 get_num_groups(2)
0023	#define ga_bool uchar
0024	#define ga_byte char
0025	#define ga_ubyte uchar
0026	#define ga_short short
0027	#define ga_ushort ushort
0028	#define ga_int int
0029	#define ga_uint uint
0030	#define ga_long long
0031	#define ga_ulong ulong
0032	#define ga_float float
0033	#define ga_double double
0034	#define ga_half half
0035	#define ga_size ulong
0036	#define ga_ssize long
0037	#define load_half(p) vload_half(0, p)
0038	#define store_half(p, v) vstore_half_rtn(v, 0, p)
0039	#define GA_DECL_SHARED_PARAM(type, name) , __local type *name
0040	#define GA_DECL_SHARED_BODY(type, name)
0041	#define GA_WARP_SIZE 64
0042	#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
0043	KERNEL void take1(GLOBAL_MEM ga_uint *r, ga_size r_off, GLOBAL_MEM const ga_uint *v, ga_size v_off, ga_ssize s0, ga_size d0, GLOBAL_MEM const ga_ssize *ind, ga_size i_off, ga_size n0, ga_size n1, GLOBAL_MEM int* err) {
0044	  const ga_uint idx0 = LDIM_0 * GID_0 + LID_0;
0045	  const ga_uint numThreads0 = LDIM_0 * GDIM_0;
0046	  const ga_uint idx1 = LDIM_1 * GID_1 + LID_1;
0047	  const ga_uint numThreads1 = LDIM_1 * GDIM_1;
0048	  ga_uint i0, i1;
0049	  if (idx0 >= n0 || idx1 >= n1) return;
0050	  r = (GLOBAL_MEM ga_uint *)(((char *)r) + r_off);
0051	  ind = (GLOBAL_MEM ga_ssize *)(((char *)ind) + i_off);
0052	  for (i0 = idx0; i0 < n0; i0 += numThreads0) {
0053	    ga_int ii0 = ind[i0];
0054	    ga_uint pos0 = v_off;
0055	    if (ii0 < 0) ii0 += d0;
0056	    if ((ii0 < 0) || (ii0 >= d0)) {
0057	      *err = -1;
0058	      continue;
0059	    }
0060	    pos0 += ii0 * (ga_uint)s0;
0061	    for (i1 = idx1; i1 < n1; i1 += numThreads1) {
0062	      ga_uint p = pos0;
0063	      r[i0*((ga_uint)n1) + i1] = *((GLOBAL_MEM ga_uint *)(((GLOBAL_MEM char *)v) + p));
0064	    }
0065	  }
0066	}

Program build failure ::
input.cl:50:31: error: casting '__global uint *' (aka '__global unsigned int *') to type 'char *' changes address space of pointer
input.cl:51:33: error: casting 'const __global uint *' (aka 'const __global unsigned int *') to type 'char *' changes address space of pointer
0001	#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE)
0002	#define WITHIN_KERNEL /* empty */
0003	#define KERNEL __kernel
0004	#define GLOBAL_MEM __global
0005	#define LOCAL_MEM __local
0006	#define LOCAL_MEM_ARG __local
0007	#define REQD_WG_SIZE(x, y, z) __attribute__((reqd_work_group_size(x, y, z)))
0008	#ifndef NULL
0009	  #define NULL ((void*)0)
0010	#endif
0011	#define LID_0 get_local_id(0)
0012	#define LID_1 get_local_id(1)
0013	#define LID_2 get_local_id(2)
0014	#define LDIM_0 get_local_size(0)
0015	#define LDIM_1 get_local_size(1)
0016	#define LDIM_2 get_local_size(2)
0017	#define GID_0 get_group_id(0)
0018	#define GID_1 get_group_id(1)
0019	#define GID_2 get_group_id(2)
0020	#define GDIM_0 get_num_groups(0)
0021	#define GDIM_1 get_num_groups(1)
0022	#define GDIM_2 get_num_groups(2)
0023	#define ga_bool uchar
0024	#define ga_byte char
0025	#define ga_ubyte uchar
0026	#define ga_short short
0027	#define ga_ushort ushort
0028	#define ga_int int
0029	#define ga_uint uint
0030	#define ga_long long
0031	#define ga_ulong ulong
0032	#define ga_float float
0033	#define ga_double double
0034	#define ga_half half
0035	#define ga_size ulong
0036	#define ga_ssize long
0037	#define load_half(p) vload_half(0, p)
0038	#define store_half(p, v) vstore_half_rtn(v, 0, p)
0039	#define GA_DECL_SHARED_PARAM(type, name) , __local type *name
0040	#define GA_DECL_SHARED_BODY(type, name)
0041	#define GA_WARP_SIZE 64
0042	#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
0043	KERNEL void take1(GLOBAL_MEM ga_uint *r, ga_size r_off, GLOBAL_MEM const ga_uint *v, ga_size v_off, ga_ssize s0, ga_size d0, GLOBAL_MEM const ga_uint *ind, ga_size i_off, ga_size n0, ga_size n1, GLOBAL_MEM int* err) {
0044	  const ga_uint idx0 = LDIM_0 * GID_0 + LID_0;
0045	  const ga_uint numThreads0 = LDIM_0 * GDIM_0;
0046	  const ga_uint idx1 = LDIM_1 * GID_1 + LID_1;
0047	  const ga_uint numThreads1 = LDIM_1 * GDIM_1;
0048	  ga_uint i0, i1;
0049	  if (idx0 >= n0 || idx1 >= n1) return;
0050	  r = (GLOBAL_MEM ga_uint *)(((char *)r) + r_off);
0051	  ind = (GLOBAL_MEM ga_uint *)(((char *)ind) + i_off);
0052	  for (i0 = idx0; i0 < n0; i0 += numThreads0) {
0053	    ga_int ii0 = ind[i0];
0054	    ga_uint pos0 = v_off;
0055	    if (ii0 < 0) ii0 += d0;
0056	    if ((ii0 < 0) || (ii0 >= d0)) {
0057	      *err = -1;
0058	      continue;
0059	    }
0060	    pos0 += ii0 * (ga_uint)s0;
0061	    for (i1 = idx1; i1 < n1; i1 += numThreads1) {
0062	      ga_uint p = pos0;
0063	      r[i0*((ga_uint)n1) + i1] = *((GLOBAL_MEM ga_uint *)(((GLOBAL_MEM char *)v) + p));
0064	    }
0065	  }
0066	}

0%: Checks: 2, Failures: 2, Errors: 0
/users/miletivn/workspace/libgpuarray/tests/check_array.c:57:F:take1:test_take1_ok:0: Assertion 'GpuArray_take1(&vres, &v, &vidx, 0) == GA_NO_ERROR' failed: GpuArray_take1(&vres, &v, &vidx, 0) == 3, GA_NO_ERROR == 0
/users/miletivn/workspace/libgpuarray/tests/check_array.c:267:F:take1:test_take1_offset:0: Assertion 'GpuArray_take1(&r, &v, &i, 1) == GA_NO_ERROR' failed: GpuArray_take1(&r, &v, &i, 1) == 3, GA_NO_ERROR == 0

Are these casts fine for other drivers? Clang seems to dislike them when compiling for AMDGPU.

$ DEVICE="opencl0:0" ./tests/check_blas 
Running suite(s): blas
OpenCL error -11 on line 244 of /users/miletivn/workspace/clBLAS/src/library/blas/xgemm.cc
check_blas: /users/miletivn/workspace/clBLAS/src/library/blas/xgemm.cc:244: void makeGemmKernel(_cl_kernel**, cl_command_queue, const char*, const char*, const unsigned char**, size_t*, const char*): Assertion `false' failed.
0%: Checks: 1, Failures: 0, Errors: 1
/users/miletivn/workspace/libgpuarray/tests/check_blas.c:26:E:all:test_gemmBatch_3d:0: (after this point) Received signal 6 (Aborted)

-11 is CL_BUILD_PROGRAM_FAILURE, but it's in clBLAS, so not your problem.

$ DEVICE="opencl0:0" ./tests/check_elemwise 
Running suite(s): elemwise
Shader Stats: SGPRS: 24 VGPRS: 10 Code Size: 780 LDS: 0 Scratch: 0 Max Waves: 10 Spilled SGPRs: 0 Spilled VGPRs: 0 PrivMem VGPRs: 0
Program build failure ::
input.cl:52:5: warning: implicit declaration of function 'vload_half' is invalid in C99
input.cl:37:22: note: expanded from macro 'load_half'
input.cl:57:1: warning: implicit declaration of function 'vstore_half_rtn' is invalid in C99
input.cl:38:26: note: expanded from macro 'store_half'
<unknown>:0:0: in function elem void (i64, half addrspace(1)*, i64, half addrspace(1)*, i64, half addrspace(1)*, i64): unsupported call to function vload_half
0001	#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE)
0002	#define WITHIN_KERNEL /* empty */
0003	#define KERNEL __kernel
0004	#define GLOBAL_MEM __global
0005	#define LOCAL_MEM __local
0006	#define LOCAL_MEM_ARG __local
0007	#define REQD_WG_SIZE(x, y, z) __attribute__((reqd_work_group_size(x, y, z)))
0008	#ifndef NULL
0009	  #define NULL ((void*)0)
0010	#endif
0011	#define LID_0 get_local_id(0)
0012	#define LID_1 get_local_id(1)
0013	#define LID_2 get_local_id(2)
0014	#define LDIM_0 get_local_size(0)
0015	#define LDIM_1 get_local_size(1)
0016	#define LDIM_2 get_local_size(2)
0017	#define GID_0 get_group_id(0)
0018	#define GID_1 get_group_id(1)
0019	#define GID_2 get_group_id(2)
0020	#define GDIM_0 get_num_groups(0)
0021	#define GDIM_1 get_num_groups(1)
0022	#define GDIM_2 get_num_groups(2)
0023	#define ga_bool uchar
0024	#define ga_byte char
0025	#define ga_ubyte uchar
0026	#define ga_short short
0027	#define ga_ushort ushort
0028	#define ga_int int
0029	#define ga_uint uint
0030	#define ga_long long
0031	#define ga_ulong ulong
0032	#define ga_float float
0033	#define ga_double double
0034	#define ga_half half
0035	#define ga_size ulong
0036	#define ga_ssize long
0037	#define load_half(p) vload_half(0, p)
0038	#define store_half(p, v) vstore_half_rtn(v, 0, p)
0039	#define GA_DECL_SHARED_PARAM(type, name) , __local type *name
0040	#define GA_DECL_SHARED_BODY(type, name)
0041	#define GA_WARP_SIZE 64
0042	#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
0043	
0044	KERNEL void elem(const ga_size n, GLOBAL_MEM ga_half *a_p,  const ga_size a_offset, GLOBAL_MEM ga_half *b_p,  const ga_size b_offset, GLOBAL_MEM ga_half *c_p,  const ga_size c_offset) {
0045	const ga_size idx = LDIM_0 * GID_0 + LID_0;
0046	const ga_size numThreads = LDIM_0 * GDIM_0;
0047	ga_size i;
0048	GLOBAL_MEM char *tmp;
0049	
0050	tmp = (GLOBAL_MEM char *)a_p;tmp += a_offset; a_p = (GLOBAL_MEM ga_half *)tmp;tmp = (GLOBAL_MEM char *)b_p;tmp += b_offset; b_p = (GLOBAL_MEM ga_half *)tmp;tmp = (GLOBAL_MEM char *)c_p;tmp += c_offset; c_p = (GLOBAL_MEM ga_half *)tmp;for (i = idx; i < n; i += numThreads) {
0051	ga_float a;
0052	a = load_half(&a_p[i]);
0053	ga_float b;
0054	b = load_half(&b_p[i]);
0055	ga_float c;
0056	c = a + b;
0057	store_half(&c_p[i], c);
0058	}
0059	}

Shader Stats: SGPRS: 39 VGPRS: 14 Code Size: 1024 LDS: 0 Scratch: 0 Max Waves: 10 Spilled SGPRs: 0 Spilled VGPRs: 0 PrivMem VGPRs: 0
Program build failure ::
input.cl:52:5: warning: implicit declaration of function 'vload_half' is invalid in C99
input.cl:37:22: note: expanded from macro 'load_half'
input.cl:57:1: warning: implicit declaration of function 'vstore_half_rtn' is invalid in C99
input.cl:38:26: note: expanded from macro 'store_half'
<unknown>:0:0: in function elem void (i64, half addrspace(1)*, i64, half addrspace(1)*, i64, half addrspace(1)*, i64): unsupported call to function vload_half
0001	#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE)
0002	#define WITHIN_KERNEL /* empty */
0003	#define KERNEL __kernel
0004	#define GLOBAL_MEM __global
0005	#define LOCAL_MEM __local
0006	#define LOCAL_MEM_ARG __local
0007	#define REQD_WG_SIZE(x, y, z) __attribute__((reqd_work_group_size(x, y, z)))
0008	#ifndef NULL
0009	  #define NULL ((void*)0)
0010	#endif
0011	#define LID_0 get_local_id(0)
0012	#define LID_1 get_local_id(1)
0013	#define LID_2 get_local_id(2)
0014	#define LDIM_0 get_local_size(0)
0015	#define LDIM_1 get_local_size(1)
0016	#define LDIM_2 get_local_size(2)
0017	#define GID_0 get_group_id(0)
0018	#define GID_1 get_group_id(1)
0019	#define GID_2 get_group_id(2)
0020	#define GDIM_0 get_num_groups(0)
0021	#define GDIM_1 get_num_groups(1)
0022	#define GDIM_2 get_num_groups(2)
0023	#define ga_bool uchar
0024	#define ga_byte char
0025	#define ga_ubyte uchar
0026	#define ga_short short
0027	#define ga_ushort ushort
0028	#define ga_int int
0029	#define ga_uint uint
0030	#define ga_long long
0031	#define ga_ulong ulong
0032	#define ga_float float
0033	#define ga_double double
0034	#define ga_half halfMesa bug
0035	#define ga_size ulong
0036	#define ga_ssize long
0037	#define load_half(p) vload_half(0, p)
0038	#define store_half(p, v) vstore_half_rtn(v, 0, p)
0039	#define GA_DECL_SHARED_PARAM(type, name) , __local type *name
0040	#define GA_DECL_SHARED_BODY(type, name)
0041	#define GA_WARP_SIZE 64
0042	#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
0043	
0044	KERNEL void elem(const ga_size n, GLOBAL_MEM ga_half *a_p,  const ga_size a_offset, GLOBAL_MEM ga_half *b_p,  const ga_size b_offset, GLOBAL_MEM ga_half *c_p,  const ga_size c_offset) {
0045	const ga_size idx = LDIM_0 * GID_0 + LID_0;
0046	const ga_size numThreads = LDIM_0 * GDIM_0;
0047	ga_size i;
0048	GLOBAL_MEM char *tmp;
0049	
0050	tmp = (GLOBAL_MEM char *)a_p;tmp += a_offset; a_p = (GLOBAL_MEM ga_half *)tmp;tmp = (GLOBAL_MEM char *)b_p;tmp += b_offset; b_p = (GLOBAL_MEM ga_half *)tmp;tmp = (GLOBAL_MEM char *)c_p;tmp += c_offset; c_p = (GLOBAL_MEM ga_half *)tmp;for (i = idx; i < n; i += numThreads) {
0051	ga_float a;
0052	a = load_half(&a_p[i]);
0053	ga_float b;
0054	b = load_half(&b_p[i]);
0055	ga_float c;
0056	c = a + b;
0057	store_half(&c_p[i], c);
0058	}
0059	}

Shader Stats: SGPRS: 39 VGPRS: 14 Code Size: 1044 LDS: 0 Scratch: 0 Max Waves: 10 Spilled SGPRs: 0 Spilled VGPRs: 0 PrivMem VGPRs: 0
Shader Stats: SGPRS: 39 VGPRS: 14 Code Size: 1024 LDS: 0 Scratch: 0 Max Waves: 10 Spilled SGPRs: 0 Spilled VGPRs: 0 PrivMem VGPRs: 0
Shader Stats: SGPRS: 39 VGPRS: 14 Code Size: 1024 LDS: 0 Scratch: 0 Max Waves: 10 Spilled SGPRs: 0 Spilled VGPRs: 0 PrivMem VGPRs: 0
Shader Stats: SGPRS: 39 VGPRS: 14 Code Size: 1024 LDS: 0 Scratch: 0 Max Waves: 10 Spilled SGPRs: 0 Spilled VGPRs: 0 PrivMem VGPRs: 0
Shader Stats: SGPRS: 24 VGPRS: 10 Code Size: 780 LDS: 0 Scratch: 0 Max Waves: 10 Spilled SGPRs: 0 Spilled VGPRs: 0 PrivMem VGPRs: 0
Shader Stats: SGPRS: 31 VGPRS: 8 Code Size: 768 LDS: 0 Scratch: 0 Max Waves: 10 Spilled SGPRs: 0 Spilled VGPRs: 0 PrivMem VGPRs: 0
75%: Checks: 12, Failures: 3, Errors: 0
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:71:P:contig:test_contig_simple:0: Passed
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:121:F:contig:test_contig_f16:0: Assertion 'ge != ((void *)0)' failed: ge == 0, ((void *)0) == 0
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:177:P:contig:test_contig_0:0: Passed
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:234:P:basic:test_basic_simple:0: Passed
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:285:F:basic:test_basic_f16:0: Assertion 'ge != ((void *)0)' failed: ge == 0, ((void *)0) == 0
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:427:F:basic:test_basic_scalar:0: Assertion 'data3[0] == 9' failed: data3[0] == 4268122713, 9 == 9
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:361:P:basic:test_basic_offset:0: Passed
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:495:P:basic:test_basic_remove1:0: Passed
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:563:P:basic:test_basic_broadcast:0: Passed
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:623:P:basic:test_basic_collapse:0: Passed
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:692:P:basic:test_basic_neg_strides:0: Passed
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:737:P:basic:test_basic_0:0: Passed

(Ignore the shader stats printed as errors, that's a wrongly hooked debug printer.)
So we miss vload_half and vstore_half_rtn in libclc, that's on LLVM/libclc's side and it's fixable (bug on Freedesktop).

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions