-
Notifications
You must be signed in to change notification settings - Fork 93
Closed
Description
Hello,
I am trying to get Mesa/Gallium/radeonsi/Clover/LLVM OpenCL stack to run libgpuarray and eventually Theano. Mesa already passes more then a half of the tests (I presume some don't use OpenCL):
$ DEVICE="opencl0:0" make test
Running tests...
Test project /users/miletivn/workspace/libgpuarray/build
Start 1: test_types
1/9 Test #1: test_types ....................... Passed 0.00 sec
Start 2: test_util
2/9 Test #2: test_util ........................ Passed 0.01 sec
Start 3: test_util_integerfactoring
3/9 Test #3: test_util_integerfactoring ....... Passed 0.69 sec
Start 4: test_reduction
4/9 Test #4: test_reduction ................... Passed 10.07 sec
Start 5: test_array
5/9 Test #5: test_array .......................***Failed 1.59 sec
Start 6: test_blas
6/9 Test #6: test_blas ........................***Failed 1.66 sec
Start 7: test_elemwise
7/9 Test #7: test_elemwise ....................***Failed 22.34 sec
Start 8: test_error
8/9 Test #8: test_error ....................... Passed 0.00 sec
Start 9: test_buffer
9/9 Test #9: test_buffer ...................... Passed 2.91 sec
67% tests passed, 3 tests failed out of 9
Total Test time (real) = 39.27 sec
The following tests FAILED:
5 - test_array (Failed)
6 - test_blas (Failed)
7 - test_elemwise (Failed)
Errors while running CTest
Makefile:116: recipe for target 'test' failed
make: *** [test] Error 8
Regarding failing tests, I would require some help figuring out whether it's Mesa/Clang/LLVM's fault or some non-standard feature used by libgpuarray.
$ DEVICE="opencl0:0" ./tests/check_array
Running suite(s): array
Program build failure ::
input.cl:50:31: error: casting '__global uint *' (aka '__global unsigned int *') to type 'char *' changes address space of pointer
input.cl:51:34: error: casting 'const __global long *' to type 'char *' changes address space of pointer
0001 #define local_barrier() barrier(CLK_LOCAL_MEM_FENCE)
0002 #define WITHIN_KERNEL /* empty */
0003 #define KERNEL __kernel
0004 #define GLOBAL_MEM __global
0005 #define LOCAL_MEM __local
0006 #define LOCAL_MEM_ARG __local
0007 #define REQD_WG_SIZE(x, y, z) __attribute__((reqd_work_group_size(x, y, z)))
0008 #ifndef NULL
0009 #define NULL ((void*)0)
0010 #endif
0011 #define LID_0 get_local_id(0)
0012 #define LID_1 get_local_id(1)
0013 #define LID_2 get_local_id(2)
0014 #define LDIM_0 get_local_size(0)
0015 #define LDIM_1 get_local_size(1)
0016 #define LDIM_2 get_local_size(2)
0017 #define GID_0 get_group_id(0)
0018 #define GID_1 get_group_id(1)
0019 #define GID_2 get_group_id(2)
0020 #define GDIM_0 get_num_groups(0)
0021 #define GDIM_1 get_num_groups(1)
0022 #define GDIM_2 get_num_groups(2)
0023 #define ga_bool uchar
0024 #define ga_byte char
0025 #define ga_ubyte uchar
0026 #define ga_short short
0027 #define ga_ushort ushort
0028 #define ga_int int
0029 #define ga_uint uint
0030 #define ga_long long
0031 #define ga_ulong ulong
0032 #define ga_float float
0033 #define ga_double double
0034 #define ga_half half
0035 #define ga_size ulong
0036 #define ga_ssize long
0037 #define load_half(p) vload_half(0, p)
0038 #define store_half(p, v) vstore_half_rtn(v, 0, p)
0039 #define GA_DECL_SHARED_PARAM(type, name) , __local type *name
0040 #define GA_DECL_SHARED_BODY(type, name)
0041 #define GA_WARP_SIZE 64
0042 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
0043 KERNEL void take1(GLOBAL_MEM ga_uint *r, ga_size r_off, GLOBAL_MEM const ga_uint *v, ga_size v_off, ga_ssize s0, ga_size d0, GLOBAL_MEM const ga_ssize *ind, ga_size i_off, ga_size n0, ga_size n1, GLOBAL_MEM int* err) {
0044 const ga_uint idx0 = LDIM_0 * GID_0 + LID_0;
0045 const ga_uint numThreads0 = LDIM_0 * GDIM_0;
0046 const ga_uint idx1 = LDIM_1 * GID_1 + LID_1;
0047 const ga_uint numThreads1 = LDIM_1 * GDIM_1;
0048 ga_uint i0, i1;
0049 if (idx0 >= n0 || idx1 >= n1) return;
0050 r = (GLOBAL_MEM ga_uint *)(((char *)r) + r_off);
0051 ind = (GLOBAL_MEM ga_ssize *)(((char *)ind) + i_off);
0052 for (i0 = idx0; i0 < n0; i0 += numThreads0) {
0053 ga_int ii0 = ind[i0];
0054 ga_uint pos0 = v_off;
0055 if (ii0 < 0) ii0 += d0;
0056 if ((ii0 < 0) || (ii0 >= d0)) {
0057 *err = -1;
0058 continue;
0059 }
0060 pos0 += ii0 * (ga_uint)s0;
0061 for (i1 = idx1; i1 < n1; i1 += numThreads1) {
0062 ga_uint p = pos0;
0063 r[i0*((ga_uint)n1) + i1] = *((GLOBAL_MEM ga_uint *)(((GLOBAL_MEM char *)v) + p));
0064 }
0065 }
0066 }
Program build failure ::
input.cl:50:31: error: casting '__global uint *' (aka '__global unsigned int *') to type 'char *' changes address space of pointer
input.cl:51:33: error: casting 'const __global uint *' (aka 'const __global unsigned int *') to type 'char *' changes address space of pointer
0001 #define local_barrier() barrier(CLK_LOCAL_MEM_FENCE)
0002 #define WITHIN_KERNEL /* empty */
0003 #define KERNEL __kernel
0004 #define GLOBAL_MEM __global
0005 #define LOCAL_MEM __local
0006 #define LOCAL_MEM_ARG __local
0007 #define REQD_WG_SIZE(x, y, z) __attribute__((reqd_work_group_size(x, y, z)))
0008 #ifndef NULL
0009 #define NULL ((void*)0)
0010 #endif
0011 #define LID_0 get_local_id(0)
0012 #define LID_1 get_local_id(1)
0013 #define LID_2 get_local_id(2)
0014 #define LDIM_0 get_local_size(0)
0015 #define LDIM_1 get_local_size(1)
0016 #define LDIM_2 get_local_size(2)
0017 #define GID_0 get_group_id(0)
0018 #define GID_1 get_group_id(1)
0019 #define GID_2 get_group_id(2)
0020 #define GDIM_0 get_num_groups(0)
0021 #define GDIM_1 get_num_groups(1)
0022 #define GDIM_2 get_num_groups(2)
0023 #define ga_bool uchar
0024 #define ga_byte char
0025 #define ga_ubyte uchar
0026 #define ga_short short
0027 #define ga_ushort ushort
0028 #define ga_int int
0029 #define ga_uint uint
0030 #define ga_long long
0031 #define ga_ulong ulong
0032 #define ga_float float
0033 #define ga_double double
0034 #define ga_half half
0035 #define ga_size ulong
0036 #define ga_ssize long
0037 #define load_half(p) vload_half(0, p)
0038 #define store_half(p, v) vstore_half_rtn(v, 0, p)
0039 #define GA_DECL_SHARED_PARAM(type, name) , __local type *name
0040 #define GA_DECL_SHARED_BODY(type, name)
0041 #define GA_WARP_SIZE 64
0042 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
0043 KERNEL void take1(GLOBAL_MEM ga_uint *r, ga_size r_off, GLOBAL_MEM const ga_uint *v, ga_size v_off, ga_ssize s0, ga_size d0, GLOBAL_MEM const ga_uint *ind, ga_size i_off, ga_size n0, ga_size n1, GLOBAL_MEM int* err) {
0044 const ga_uint idx0 = LDIM_0 * GID_0 + LID_0;
0045 const ga_uint numThreads0 = LDIM_0 * GDIM_0;
0046 const ga_uint idx1 = LDIM_1 * GID_1 + LID_1;
0047 const ga_uint numThreads1 = LDIM_1 * GDIM_1;
0048 ga_uint i0, i1;
0049 if (idx0 >= n0 || idx1 >= n1) return;
0050 r = (GLOBAL_MEM ga_uint *)(((char *)r) + r_off);
0051 ind = (GLOBAL_MEM ga_uint *)(((char *)ind) + i_off);
0052 for (i0 = idx0; i0 < n0; i0 += numThreads0) {
0053 ga_int ii0 = ind[i0];
0054 ga_uint pos0 = v_off;
0055 if (ii0 < 0) ii0 += d0;
0056 if ((ii0 < 0) || (ii0 >= d0)) {
0057 *err = -1;
0058 continue;
0059 }
0060 pos0 += ii0 * (ga_uint)s0;
0061 for (i1 = idx1; i1 < n1; i1 += numThreads1) {
0062 ga_uint p = pos0;
0063 r[i0*((ga_uint)n1) + i1] = *((GLOBAL_MEM ga_uint *)(((GLOBAL_MEM char *)v) + p));
0064 }
0065 }
0066 }
0%: Checks: 2, Failures: 2, Errors: 0
/users/miletivn/workspace/libgpuarray/tests/check_array.c:57:F:take1:test_take1_ok:0: Assertion 'GpuArray_take1(&vres, &v, &vidx, 0) == GA_NO_ERROR' failed: GpuArray_take1(&vres, &v, &vidx, 0) == 3, GA_NO_ERROR == 0
/users/miletivn/workspace/libgpuarray/tests/check_array.c:267:F:take1:test_take1_offset:0: Assertion 'GpuArray_take1(&r, &v, &i, 1) == GA_NO_ERROR' failed: GpuArray_take1(&r, &v, &i, 1) == 3, GA_NO_ERROR == 0
Are these casts fine for other drivers? Clang seems to dislike them when compiling for AMDGPU.
$ DEVICE="opencl0:0" ./tests/check_blas
Running suite(s): blas
OpenCL error -11 on line 244 of /users/miletivn/workspace/clBLAS/src/library/blas/xgemm.cc
check_blas: /users/miletivn/workspace/clBLAS/src/library/blas/xgemm.cc:244: void makeGemmKernel(_cl_kernel**, cl_command_queue, const char*, const char*, const unsigned char**, size_t*, const char*): Assertion `false' failed.
0%: Checks: 1, Failures: 0, Errors: 1
/users/miletivn/workspace/libgpuarray/tests/check_blas.c:26:E:all:test_gemmBatch_3d:0: (after this point) Received signal 6 (Aborted)
-11 is CL_BUILD_PROGRAM_FAILURE, but it's in clBLAS, so not your problem.
$ DEVICE="opencl0:0" ./tests/check_elemwise
Running suite(s): elemwise
Shader Stats: SGPRS: 24 VGPRS: 10 Code Size: 780 LDS: 0 Scratch: 0 Max Waves: 10 Spilled SGPRs: 0 Spilled VGPRs: 0 PrivMem VGPRs: 0
Program build failure ::
input.cl:52:5: warning: implicit declaration of function 'vload_half' is invalid in C99
input.cl:37:22: note: expanded from macro 'load_half'
input.cl:57:1: warning: implicit declaration of function 'vstore_half_rtn' is invalid in C99
input.cl:38:26: note: expanded from macro 'store_half'
<unknown>:0:0: in function elem void (i64, half addrspace(1)*, i64, half addrspace(1)*, i64, half addrspace(1)*, i64): unsupported call to function vload_half
0001 #define local_barrier() barrier(CLK_LOCAL_MEM_FENCE)
0002 #define WITHIN_KERNEL /* empty */
0003 #define KERNEL __kernel
0004 #define GLOBAL_MEM __global
0005 #define LOCAL_MEM __local
0006 #define LOCAL_MEM_ARG __local
0007 #define REQD_WG_SIZE(x, y, z) __attribute__((reqd_work_group_size(x, y, z)))
0008 #ifndef NULL
0009 #define NULL ((void*)0)
0010 #endif
0011 #define LID_0 get_local_id(0)
0012 #define LID_1 get_local_id(1)
0013 #define LID_2 get_local_id(2)
0014 #define LDIM_0 get_local_size(0)
0015 #define LDIM_1 get_local_size(1)
0016 #define LDIM_2 get_local_size(2)
0017 #define GID_0 get_group_id(0)
0018 #define GID_1 get_group_id(1)
0019 #define GID_2 get_group_id(2)
0020 #define GDIM_0 get_num_groups(0)
0021 #define GDIM_1 get_num_groups(1)
0022 #define GDIM_2 get_num_groups(2)
0023 #define ga_bool uchar
0024 #define ga_byte char
0025 #define ga_ubyte uchar
0026 #define ga_short short
0027 #define ga_ushort ushort
0028 #define ga_int int
0029 #define ga_uint uint
0030 #define ga_long long
0031 #define ga_ulong ulong
0032 #define ga_float float
0033 #define ga_double double
0034 #define ga_half half
0035 #define ga_size ulong
0036 #define ga_ssize long
0037 #define load_half(p) vload_half(0, p)
0038 #define store_half(p, v) vstore_half_rtn(v, 0, p)
0039 #define GA_DECL_SHARED_PARAM(type, name) , __local type *name
0040 #define GA_DECL_SHARED_BODY(type, name)
0041 #define GA_WARP_SIZE 64
0042 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
0043
0044 KERNEL void elem(const ga_size n, GLOBAL_MEM ga_half *a_p, const ga_size a_offset, GLOBAL_MEM ga_half *b_p, const ga_size b_offset, GLOBAL_MEM ga_half *c_p, const ga_size c_offset) {
0045 const ga_size idx = LDIM_0 * GID_0 + LID_0;
0046 const ga_size numThreads = LDIM_0 * GDIM_0;
0047 ga_size i;
0048 GLOBAL_MEM char *tmp;
0049
0050 tmp = (GLOBAL_MEM char *)a_p;tmp += a_offset; a_p = (GLOBAL_MEM ga_half *)tmp;tmp = (GLOBAL_MEM char *)b_p;tmp += b_offset; b_p = (GLOBAL_MEM ga_half *)tmp;tmp = (GLOBAL_MEM char *)c_p;tmp += c_offset; c_p = (GLOBAL_MEM ga_half *)tmp;for (i = idx; i < n; i += numThreads) {
0051 ga_float a;
0052 a = load_half(&a_p[i]);
0053 ga_float b;
0054 b = load_half(&b_p[i]);
0055 ga_float c;
0056 c = a + b;
0057 store_half(&c_p[i], c);
0058 }
0059 }
Shader Stats: SGPRS: 39 VGPRS: 14 Code Size: 1024 LDS: 0 Scratch: 0 Max Waves: 10 Spilled SGPRs: 0 Spilled VGPRs: 0 PrivMem VGPRs: 0
Program build failure ::
input.cl:52:5: warning: implicit declaration of function 'vload_half' is invalid in C99
input.cl:37:22: note: expanded from macro 'load_half'
input.cl:57:1: warning: implicit declaration of function 'vstore_half_rtn' is invalid in C99
input.cl:38:26: note: expanded from macro 'store_half'
<unknown>:0:0: in function elem void (i64, half addrspace(1)*, i64, half addrspace(1)*, i64, half addrspace(1)*, i64): unsupported call to function vload_half
0001 #define local_barrier() barrier(CLK_LOCAL_MEM_FENCE)
0002 #define WITHIN_KERNEL /* empty */
0003 #define KERNEL __kernel
0004 #define GLOBAL_MEM __global
0005 #define LOCAL_MEM __local
0006 #define LOCAL_MEM_ARG __local
0007 #define REQD_WG_SIZE(x, y, z) __attribute__((reqd_work_group_size(x, y, z)))
0008 #ifndef NULL
0009 #define NULL ((void*)0)
0010 #endif
0011 #define LID_0 get_local_id(0)
0012 #define LID_1 get_local_id(1)
0013 #define LID_2 get_local_id(2)
0014 #define LDIM_0 get_local_size(0)
0015 #define LDIM_1 get_local_size(1)
0016 #define LDIM_2 get_local_size(2)
0017 #define GID_0 get_group_id(0)
0018 #define GID_1 get_group_id(1)
0019 #define GID_2 get_group_id(2)
0020 #define GDIM_0 get_num_groups(0)
0021 #define GDIM_1 get_num_groups(1)
0022 #define GDIM_2 get_num_groups(2)
0023 #define ga_bool uchar
0024 #define ga_byte char
0025 #define ga_ubyte uchar
0026 #define ga_short short
0027 #define ga_ushort ushort
0028 #define ga_int int
0029 #define ga_uint uint
0030 #define ga_long long
0031 #define ga_ulong ulong
0032 #define ga_float float
0033 #define ga_double double
0034 #define ga_half halfMesa bug
0035 #define ga_size ulong
0036 #define ga_ssize long
0037 #define load_half(p) vload_half(0, p)
0038 #define store_half(p, v) vstore_half_rtn(v, 0, p)
0039 #define GA_DECL_SHARED_PARAM(type, name) , __local type *name
0040 #define GA_DECL_SHARED_BODY(type, name)
0041 #define GA_WARP_SIZE 64
0042 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
0043
0044 KERNEL void elem(const ga_size n, GLOBAL_MEM ga_half *a_p, const ga_size a_offset, GLOBAL_MEM ga_half *b_p, const ga_size b_offset, GLOBAL_MEM ga_half *c_p, const ga_size c_offset) {
0045 const ga_size idx = LDIM_0 * GID_0 + LID_0;
0046 const ga_size numThreads = LDIM_0 * GDIM_0;
0047 ga_size i;
0048 GLOBAL_MEM char *tmp;
0049
0050 tmp = (GLOBAL_MEM char *)a_p;tmp += a_offset; a_p = (GLOBAL_MEM ga_half *)tmp;tmp = (GLOBAL_MEM char *)b_p;tmp += b_offset; b_p = (GLOBAL_MEM ga_half *)tmp;tmp = (GLOBAL_MEM char *)c_p;tmp += c_offset; c_p = (GLOBAL_MEM ga_half *)tmp;for (i = idx; i < n; i += numThreads) {
0051 ga_float a;
0052 a = load_half(&a_p[i]);
0053 ga_float b;
0054 b = load_half(&b_p[i]);
0055 ga_float c;
0056 c = a + b;
0057 store_half(&c_p[i], c);
0058 }
0059 }
Shader Stats: SGPRS: 39 VGPRS: 14 Code Size: 1044 LDS: 0 Scratch: 0 Max Waves: 10 Spilled SGPRs: 0 Spilled VGPRs: 0 PrivMem VGPRs: 0
Shader Stats: SGPRS: 39 VGPRS: 14 Code Size: 1024 LDS: 0 Scratch: 0 Max Waves: 10 Spilled SGPRs: 0 Spilled VGPRs: 0 PrivMem VGPRs: 0
Shader Stats: SGPRS: 39 VGPRS: 14 Code Size: 1024 LDS: 0 Scratch: 0 Max Waves: 10 Spilled SGPRs: 0 Spilled VGPRs: 0 PrivMem VGPRs: 0
Shader Stats: SGPRS: 39 VGPRS: 14 Code Size: 1024 LDS: 0 Scratch: 0 Max Waves: 10 Spilled SGPRs: 0 Spilled VGPRs: 0 PrivMem VGPRs: 0
Shader Stats: SGPRS: 24 VGPRS: 10 Code Size: 780 LDS: 0 Scratch: 0 Max Waves: 10 Spilled SGPRs: 0 Spilled VGPRs: 0 PrivMem VGPRs: 0
Shader Stats: SGPRS: 31 VGPRS: 8 Code Size: 768 LDS: 0 Scratch: 0 Max Waves: 10 Spilled SGPRs: 0 Spilled VGPRs: 0 PrivMem VGPRs: 0
75%: Checks: 12, Failures: 3, Errors: 0
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:71:P:contig:test_contig_simple:0: Passed
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:121:F:contig:test_contig_f16:0: Assertion 'ge != ((void *)0)' failed: ge == 0, ((void *)0) == 0
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:177:P:contig:test_contig_0:0: Passed
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:234:P:basic:test_basic_simple:0: Passed
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:285:F:basic:test_basic_f16:0: Assertion 'ge != ((void *)0)' failed: ge == 0, ((void *)0) == 0
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:427:F:basic:test_basic_scalar:0: Assertion 'data3[0] == 9' failed: data3[0] == 4268122713, 9 == 9
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:361:P:basic:test_basic_offset:0: Passed
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:495:P:basic:test_basic_remove1:0: Passed
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:563:P:basic:test_basic_broadcast:0: Passed
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:623:P:basic:test_basic_collapse:0: Passed
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:692:P:basic:test_basic_neg_strides:0: Passed
/users/miletivn/workspace/libgpuarray/tests/check_elemwise.c:737:P:basic:test_basic_0:0: Passed
(Ignore the shader stats printed as errors, that's a wrongly hooked debug printer.)
So we miss vload_half and vstore_half_rtn in libclc, that's on LLVM/libclc's side and it's fixable (bug on Freedesktop).
Metadata
Metadata
Assignees
Labels
No labels