Theano/libgpuarray

pygpu.gpuarray.GpuArrayException: ('The following error happened while compiling the node', GpuSoftmaxWithBias(GpuDot22.0, dense_3_b)

Opened this issue · 5 comments

I try to run Theano on a GPU. I know OpenCL is not finished yet but is this there a fix for this?

Theano: 0.9.0.dev-c697eeab84e5b8a74908da654b66ec9eca4f1291
OSX: 10.12.4
GPU: AMD Radeon HD - FirePro D700 Compute Engine

pygpu.gpuarray.GpuArrayException: ('The following error happened while compiling the node', GpuSoftmaxWithBias(GpuDot22.0, dense_3_b), '
', "Program build failure ::
<program source>:44:82: error: kernel pointer arguments must have a global, local, or constant address space qualifier
KERNEL void kSoftmaxWithBias (const ga_size M, const ga_size N, const ga_float * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, const ga_float * b, const ga_size offset_b, const ga_ssize sb0, ga_float * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1)
                                                                                 ^
<program source>:44:166: error: kernel pointer arguments must have a global, local, or constant address space qualifier
KERNEL void kSoftmaxWithBias (const ga_size M, const ga_size N, const ga_float * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, const ga_float * b, const ga_size offset_b, const ga_ssize sb0, ga_float * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1)
                                                                                                                                                                     ^
<program source>:44:224: error: kernel pointer arguments must have a global, local, or constant address space qualifier
KERNEL void kSoftmaxWithBias (const ga_size M, const ga_size N, const ga_float * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, const ga_float * b, const ga_size offset_b, const ga_ssize sb0, ga_float * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1)
                                                                                                                                                                                                                               ^
<program source>:46:16: error: unknown type name '__shared__'
        extern __shared__ ga_float buf[];
               ^
<program source>:46:27: error: expected identifier or '('
        extern __shared__ ga_float buf[];
                          ^
<program source>:34:18: note: expanded from macro 'ga_float'
#define ga_float float
                 ^
<program source>:47:19: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
ga_float * buf2 = buf + N;
                  ^~~
                  buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:51:21: error: use of undeclared identifier 'blockIdx'; did you mean 'blockIDX'?
for (int blockIDX = blockIdx.x; blockIDX < M;     blockIDX += gridDim.x){;
                    ^~~~~~~~
                    blockIDX
<program source>:51:10: note: 'blockIDX' declared here
for (int blockIDX = blockIdx.x; blockIDX < M;     blockIDX += gridDim.x){;
         ^
<program source>:51:29: error: member reference base type 'int' is not a structure or union
for (int blockIDX = blockIdx.x; blockIDX < M;     blockIDX += gridDim.x){;
                    ~~~~~~~~^~
<program source>:51:63: error: use of undeclared identifier 'gridDim'
for (int blockIDX = blockIdx.x; blockIDX < M;     blockIDX += gridDim.x){;
                                                              ^
<program source>:52:15: error: use of undeclared identifier 'threadIdx'
for (int tx = threadIdx.x; tx< N; tx += blockDim.x){;
              ^
<program source>:52:41: error: use of undeclared identifier 'blockDim'
for (int tx = threadIdx.x; tx< N; tx += blockDim.x){;
                                        ^
<program source>:53:1: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
buf[tx] = (x[blockIDX * sx0 + tx * sx1]);
^~~
buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:54:1: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
buf[tx] += (b[tx * sb0]);
^~~
buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:55:12: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
buf2[tx] = buf[tx];
           ^~~
           buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:57:1: warning: implicit declaration of function '__syncthreads' is invalid in C99
__syncthreads();
^
<program source>:63:13: error: use of undeclared identifier 'threadIdx'
        if (threadIdx.x < warpSize) {
            ^
<program source>:63:27: error: use of undeclared identifier 'warpSize'
        if (threadIdx.x < warpSize) {
                          ^
<program source>:64:26: error: use of undeclared identifier 'threadIdx'
            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
                         ^
<program source>:64:40: error: use of undeclared identifier 'warpSize'
            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
                                       ^
<program source>:64:62: error: use of undeclared identifier 'warpSize'
            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
                                                             ^
<program source>:66:17: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
                buf[threadIdx.x] = max(buf[threadIdx.x], buf[i]);
                ^~~
                buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:66:21: error: use of undeclared identifier 'threadIdx'
                buf[threadIdx.x] = max(buf[threadIdx.x], buf[i]);
                    ^
<program source>:66:40: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
                buf[threadIdx.x] = max(buf[threadIdx.x], buf[i]);
                                       ^~~
                                       buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:66:44: error: use of undeclared identifier 'threadIdx'
                buf[threadIdx.x] = max(buf[threadIdx.x], buf[i]);
                                           ^
<program source>:71:32: error: use of undeclared identifier 'warpSize'
        for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
                               ^
<program source>:72:15: error: use of undeclared identifier 'threadIdx'
          if (threadIdx.x < _n && threadIdx.x + _n < N)
              ^
<program source>:72:35: error: use of undeclared identifier 'threadIdx'
          if (threadIdx.x < _n && threadIdx.x + _n < N)
                                  ^
<program source>:73:13: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
            buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+_n]);
            ^~~
            buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:73:17: error: use of undeclared identifier 'threadIdx'
            buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+_n]);
                ^
<program source>:73:36: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
            buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+_n]);
                                   ^~~
                                   buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:73:40: error: use of undeclared identifier 'threadIdx'
            buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+_n]);
                                       ^
<program source>:79:20: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
ga_float row_max = buf[0];
                   ^~~
                   buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:81:13: error: use of undeclared identifier 'threadIdx'
for(int __i=threadIdx.x; __i<N; __i+=blockDim.x){;
            ^
<program source>:81:38: error: use of undeclared identifier 'blockDim'
for(int __i=threadIdx.x; __i<N; __i+=blockDim.x){;
                                     ^
<program source>:82:1: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
buf[__i] = exp(buf2[__i] - row_max);
^~~
buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:83:13: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
buf2[__i] = buf[__i];
            ^~~
            buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:91:13: error: use of undeclared identifier 'threadIdx'
        if (threadIdx.x < warpSize) {
            ^
<program source>:91:27: error: use of undeclared identifier 'warpSize'
        if (threadIdx.x < warpSize) {
                          ^
<program source>:92:26: error: use of undeclared identifier 'threadIdx'
            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
                         ^
<program source>:92:40: error: use of undeclared identifier 'warpSize'
            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
                                       ^
<program source>:92:62: error: use of undeclared identifier 'warpSize'
            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
                                                             ^
<program source>:94:17: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
                buf[threadIdx.x] = buf[threadIdx.x] + buf[i];
                ^~~
                buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:94:21: error: use of undeclared identifier 'threadIdx'
                buf[threadIdx.x] = buf[threadIdx.x] + buf[i];
                    ^
<program source>:94:36: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
                buf[threadIdx.x] = buf[threadIdx.x] + buf[i];
                                   ^~~
                                   buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:94:40: error: use of undeclared identifier 'threadIdx'
                buf[threadIdx.x] = buf[threadIdx.x] + buf[i];
                                       ^
<program source>:94:55: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
                buf[threadIdx.x] = buf[threadIdx.x] + buf[i];
                                                      ^~~
                                                      buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:99:32: error: use of undeclared identifier 'warpSize'
        for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
                               ^
<program source>:100:15: error: use of undeclared identifier 'threadIdx'
          if (threadIdx.x < _n && threadIdx.x + _n < N)
              ^
<program source>:100:35: error: use of undeclared identifier 'threadIdx'
          if (threadIdx.x < _n && threadIdx.x + _n < N)
                                  ^
<program source>:101:13: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
            buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+_n];
            ^~~
            buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:101:17: error: use of undeclared identifier 'threadIdx'
            buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+_n];
                ^
<program source>:101:32: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
            buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+_n];
                               ^~~
                               buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:101:36: error: use of undeclared identifier 'threadIdx'
            buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+_n];
                                   ^
<program source>:101:51: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
            buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+_n];
                                                  ^~~
                                                  buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:101:55: error: use of undeclared identifier 'threadIdx'
            buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+_n];
                                                      ^
<program source>:107:20: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
ga_float row_sum = buf[0];
                   ^~~
                   buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:109:13: error: use of undeclared identifier 'threadIdx'
for(int __i=threadIdx.x; __i<N; __i+=blockDim.x){;
            ^
<program source>:109:38: error: use of undeclared identifier 'blockDim'
for(int __i=threadIdx.x; __i<N; __i+=blockDim.x){;
                                     ^
<program source>:110:1: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
buf[__i] = buf2[__i] / row_sum;
^~~
buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:113:15: error: use of undeclared identifier 'threadIdx'
for (int tx = threadIdx.x; tx< N; tx += blockDim.x){;
              ^
<program source>:113:41: error: use of undeclared identifier 'blockDim'
for (int tx = threadIdx.x; tx< N; tx += blockDim.x){;
                                        ^
<program source>:114:38: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
sm[blockIDX * sm_s0 + tx * sm_s1] = (buf[tx]);
                                     ^~~
                                     buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:51:42: warning: comparison of integers of different signs: 'int' and 'const ulong' (aka 'const unsigned long')
for (int blockIDX = blockIdx.x; blockIDX < M;     blockIDX += gridDim.x){;
                                ~~~~~~~~ ^ ~
<program source>:52:30: warning: comparison of integers of different signs: 'int' and 'const ulong' (aka 'const unsigned long')
for (int tx = threadIdx.x; tx< N; tx += blockDim.x){;
                           ~~^ ~
<program source>:64:52: warning: comparison of integers of different signs: 'int' and 'const ulong' (aka 'const unsigned long')
            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
                                                 ~ ^ ~
<program source>:81:29: warning: comparison of integers of different signs: 'int' and 'const ulong' (aka 'const unsigned long')
for(int __i=threadIdx.x; __i<N; __i+=blockDim.x){;
                         ~~~^~
<program source>:92:52: warning: comparison of integers of different signs: 'int' and 'const ulong' (aka 'const unsigned long')
            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
                                                 ~ ^ ~
<program source>:109:29: warning: comparison of integers of different signs: 'int' and 'const ulong' (aka 'const unsigned long')
for(int __i=threadIdx.x; __i<N; __i+=blockDim.x){;
                         ~~~^~
<program source>:113:30: warning: comparison of integers of different signs: 'int' and 'const ulong' (aka 'const unsigned long')
for (int tx = threadIdx.x; tx< N; tx += blockDim.x){;
                           ~~^ ~
0001	#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE)
0002	#define WITHIN_KERNEL /* empty */
0003	#define KERNEL __kernel
0004	#define GLOBAL_MEM __global
0005	#define LOCAL_MEM __local
0006	#define LOCAL_MEM_ARG __local
0007	#define REQD_WG_SIZE(x, y, z) __attribute__((reqd_work_group_size(x, y, z)))
0008	#ifndef NULL
0009	  #define NULL ((void*)0)
0010	#endif
0011	#define LID_0 get_local_id(0)
0012	#define LID_1 get_local_id(1)
0013	#define LID_2 get_local_id(2)
0014	#define LDIM_0 get_local_size(0)
0015	#define LDIM_1 get_local_size(1)
0016	#define LDIM_2 get_local_size(2)
0017	#define GID_0 get_group_id(0)
0018	#define GID_1 get_group_id(1)
0019	#define GID_2 get_group_id(2)
0020	#define GDIM_0 get_num_groups(0)
0021	#define GDIM_1 get_num_groups(1)
0022	#define GDIM_2 get_num_groups(2)
0023	#define ga_bool uchar
0024	#define ga_byte char
0025	#define ga_ubyte uchar
0026	#define ga_short short
0027	#define ga_ushort ushort
0028	#define ga_int int
0029	#define ga_uint uint
0030	#define ga_long long
0031	#define ga_ulong ulong
0032	#define ga_float float
0033	#define ga_double double
0034	#define ga_half half
0035	#define ga_size ulong
0036	#define ga_ssize long
0037	#define load_half(p) vload_half(0, p)
0038	#define store_half(p, v) vstore_half_rtn(v, 0, p)
0039	#define GA_DECL_SHARED_PARAM(type, name) , __local type *name
0040	#define GA_DECL_SHARED_BODY(type, name)
0041	#define GA_WARP_SIZE 64
0042	KERNEL void kSoftmaxWithBias (const ga_size M, const ga_size N, const ga_float * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, const ga_float * b, const ga_size offset_b, const ga_ssize sb0, ga_float * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1)
0043	    {
0044	        extern __shared__ ga_float buf[];
0045	ga_float * buf2 = buf + N;
0046	x = (const ga_float *)(((char *)x)+offset_x);
0047	b = (const ga_float *)(((char *)b)+offset_b);
0048	sm = (ga_float *)(((char *)sm)+offset_sm);
0049	for (int blockIDX = blockIdx.x; blockIDX < M;     blockIDX += gridDim.x){;
0050	for (int tx = threadIdx.x; tx< N; tx += blockDim.x){;
0051	buf[tx] = (x[blockIDX * sx0 + tx * sx1]);
0052	buf[tx] += (b[tx * sb0]);
0053	buf2[tx] = buf[tx];
0054	};
0055	__syncthreads();
0056	
0057	    {
0058	        // This function trashes buf[1..warpSize],
0059	        // leaving the reduction result in buf[0].
0060	
0061	        if (threadIdx.x < warpSize) {
0062	            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
0063	            {
0064	                buf[threadIdx.x] = max(buf[threadIdx.x], buf[i]);
0065	            }
0066	        }
0067	        __syncthreads();
0068	        //reduce so that threadIdx.x 0 has the reduction of everything
0069	        for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
0070	          if (threadIdx.x < _n && threadIdx.x + _n < N)
0071	            buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+_n]);
0072	          __syncthreads();
0073	        }
0074	    }
0075	    ;
0076	__syncthreads();
0077	ga_float row_max = buf[0];
0078	__syncthreads();
0079	for(int __i=threadIdx.x; __i<N; __i+=blockDim.x){;
0080	buf[__i] = exp(buf2[__i] - row_max);
0081	buf2[__i] = buf[__i];
0082	};
0083	__syncthreads();
0084	
0085	    {
0086	        // This function trashes buf[1..warpSize],
0087	        // leaving the reduction result in buf[0].
0088	
0089	        if (threadIdx.x < warpSize) {
0090	            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
0091	            {
0092	                buf[threadIdx.x] = buf[threadIdx.x] + buf[i];
0093	            }
0094	        }
0095	        __syncthreads();
0096	        //reduce so that threadIdx.x 0 has the reduction of everything
0097	        for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
0098	          if (threadIdx.x < _n && threadIdx.x + _n < N)
0099	            buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+_n];
0100	          __syncthreads();
0101	        }
0102	    }
0103	    ;
0104	__syncthreads();
0105	ga_float row_sum = buf[0];
0106	__syncthreads();
0107	for(int __i=threadIdx.x; __i<N; __i+=blockDim.x){;
0108	buf[__i] = buf2[__i] / row_sum;
0109	};
0110	__syncthreads();
0111	for (int tx = threadIdx.x; tx< N; tx += blockDim.x){;
0112	sm[blockIDX * sm_s0 + tx * sm_s1] = (buf[tx]);
0113	};
0114	__syncthreads();
0115	};
0116	    }
0117	   
")
nouiz commented

This error isn't readable. Can you give it again while keeping the new lines?

The softmax kernel is not ready for OpenCL apparently and is missing a couple of GLOBAL_MEM markers around pointers.

If you want to do it, we will accept a PR.

I think I have the same problem with libgpuarray and OpenCL.

nouiz commented

OK, I understand. I am a beginner, so I don't think I can do this.