pygpu.gpuarray.GpuArrayException: ('The following error happened while compiling the node', GpuSoftmaxWithBias(GpuDot22.0, dense_3_b)

Question

pygpu.gpuarray.GpuArrayException: ('The following error happened while compiling the node', GpuSoftmaxWithBias(GpuDot22.0, dense_3_b)

Opened this issue 7 years ago · 5 comments

Jorrit2002 commented 7 years ago

I try to run Theano on a GPU. I know OpenCL is not finished yet but is this there a fix for this?

Theano: 0.9.0.dev-c697eeab84e5b8a74908da654b66ec9eca4f1291
OSX: 10.12.4
GPU: AMD Radeon HD - FirePro D700 Compute Engine

pygpu.gpuarray.GpuArrayException: ('The following error happened while compiling the node', GpuSoftmaxWithBias(GpuDot22.0, dense_3_b), '
', "Program build failure ::
<program source>:44:82: error: kernel pointer arguments must have a global, local, or constant address space qualifier
KERNEL void kSoftmaxWithBias (const ga_size M, const ga_size N, const ga_float * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, const ga_float * b, const ga_size offset_b, const ga_ssize sb0, ga_float * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1)
                                                                                 ^
<program source>:44:166: error: kernel pointer arguments must have a global, local, or constant address space qualifier
KERNEL void kSoftmaxWithBias (const ga_size M, const ga_size N, const ga_float * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, const ga_float * b, const ga_size offset_b, const ga_ssize sb0, ga_float * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1)
                                                                                                                                                                     ^
<program source>:44:224: error: kernel pointer arguments must have a global, local, or constant address space qualifier
KERNEL void kSoftmaxWithBias (const ga_size M, const ga_size N, const ga_float * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, const ga_float * b, const ga_size offset_b, const ga_ssize sb0, ga_float * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1)
                                                                                                                                                                                                                               ^
<program source>:46:16: error: unknown type name '__shared__'
        extern __shared__ ga_float buf[];
               ^
<program source>:46:27: error: expected identifier or '('
        extern __shared__ ga_float buf[];
                          ^
<program source>:34:18: note: expanded from macro 'ga_float'
#define ga_float float
                 ^
<program source>:47:19: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
ga_float * buf2 = buf + N;
                  ^~~
                  buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:51:21: error: use of undeclared identifier 'blockIdx'; did you mean 'blockIDX'?
for (int blockIDX = blockIdx.x; blockIDX < M;     blockIDX += gridDim.x){;
                    ^~~~~~~~
                    blockIDX
<program source>:51:10: note: 'blockIDX' declared here
for (int blockIDX = blockIdx.x; blockIDX < M;     blockIDX += gridDim.x){;
         ^
<program source>:51:29: error: member reference base type 'int' is not a structure or union
for (int blockIDX = blockIdx.x; blockIDX < M;     blockIDX += gridDim.x){;
                    ~~~~~~~~^~
<program source>:51:63: error: use of undeclared identifier 'gridDim'
for (int blockIDX = blockIdx.x; blockIDX < M;     blockIDX += gridDim.x){;
                                                              ^
<program source>:52:15: error: use of undeclared identifier 'threadIdx'
for (int tx = threadIdx.x; tx< N; tx += blockDim.x){;
              ^
<program source>:52:41: error: use of undeclared identifier 'blockDim'
for (int tx = threadIdx.x; tx< N; tx += blockDim.x){;
                                        ^
<program source>:53:1: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
buf[tx] = (x[blockIDX * sx0 + tx * sx1]);
^~~
buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:54:1: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
buf[tx] += (b[tx * sb0]);
^~~
buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:55:12: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
buf2[tx] = buf[tx];
           ^~~
           buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:57:1: warning: implicit declaration of function '__syncthreads' is invalid in C99
__syncthreads();
^
<program source>:63:13: error: use of undeclared identifier 'threadIdx'
        if (threadIdx.x < warpSize) {
            ^
<program source>:63:27: error: use of undeclared identifier 'warpSize'
        if (threadIdx.x < warpSize) {
                          ^
<program source>:64:26: error: use of undeclared identifier 'threadIdx'
            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
                         ^
<program source>:64:40: error: use of undeclared identifier 'warpSize'
            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
                                       ^
<program source>:64:62: error: use of undeclared identifier 'warpSize'
            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
                                                             ^
<program source>:66:17: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
                buf[threadIdx.x] = max(buf[threadIdx.x], buf[i]);
                ^~~
                buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:66:21: error: use of undeclared identifier 'threadIdx'
                buf[threadIdx.x] = max(buf[threadIdx.x], buf[i]);
                    ^
<program source>:66:40: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
                buf[threadIdx.x] = max(buf[threadIdx.x], buf[i]);
                                       ^~~
                                       buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:66:44: error: use of undeclared identifier 'threadIdx'
                buf[threadIdx.x] = max(buf[threadIdx.x], buf[i]);
                                           ^
<program source>:71:32: error: use of undeclared identifier 'warpSize'
        for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
                               ^
<program source>:72:15: error: use of undeclared identifier 'threadIdx'
          if (threadIdx.x < _n && threadIdx.x + _n < N)
              ^
<program source>:72:35: error: use of undeclared identifier 'threadIdx'
          if (threadIdx.x < _n && threadIdx.x + _n < N)
                                  ^
<program source>:73:13: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
            buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+_n]);
            ^~~
            buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:73:17: error: use of undeclared identifier 'threadIdx'
            buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+_n]);
                ^
<program source>:73:36: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
            buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+_n]);
                                   ^~~
                                   buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:73:40: error: use of undeclared identifier 'threadIdx'
            buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+_n]);
                                       ^
<program source>:79:20: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
ga_float row_max = buf[0];
                   ^~~
                   buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:81:13: error: use of undeclared identifier 'threadIdx'
for(int __i=threadIdx.x; __i<N; __i+=blockDim.x){;
            ^
<program source>:81:38: error: use of undeclared identifier 'blockDim'
for(int __i=threadIdx.x; __i<N; __i+=blockDim.x){;
                                     ^
<program source>:82:1: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
buf[__i] = exp(buf2[__i] - row_max);
^~~
buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:83:13: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
buf2[__i] = buf[__i];
            ^~~
            buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:91:13: error: use of undeclared identifier 'threadIdx'
        if (threadIdx.x < warpSize) {
            ^
<program source>:91:27: error: use of undeclared identifier 'warpSize'
        if (threadIdx.x < warpSize) {
                          ^
<program source>:92:26: error: use of undeclared identifier 'threadIdx'
            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
                         ^
<program source>:92:40: error: use of undeclared identifier 'warpSize'
            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
                                       ^
<program source>:92:62: error: use of undeclared identifier 'warpSize'
            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
                                                             ^
<program source>:94:17: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
                buf[threadIdx.x] = buf[threadIdx.x] + buf[i];
                ^~~
                buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:94:21: error: use of undeclared identifier 'threadIdx'
                buf[threadIdx.x] = buf[threadIdx.x] + buf[i];
                    ^
<program source>:94:36: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
                buf[threadIdx.x] = buf[threadIdx.x] + buf[i];
                                   ^~~
                                   buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:94:40: error: use of undeclared identifier 'threadIdx'
                buf[threadIdx.x] = buf[threadIdx.x] + buf[i];
                                       ^
<program source>:94:55: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
                buf[threadIdx.x] = buf[threadIdx.x] + buf[i];
                                                      ^~~
                                                      buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:99:32: error: use of undeclared identifier 'warpSize'
        for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
                               ^
<program source>:100:15: error: use of undeclared identifier 'threadIdx'
          if (threadIdx.x < _n && threadIdx.x + _n < N)
              ^
<program source>:100:35: error: use of undeclared identifier 'threadIdx'
          if (threadIdx.x < _n && threadIdx.x + _n < N)
                                  ^
<program source>:101:13: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
            buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+_n];
            ^~~
            buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:101:17: error: use of undeclared identifier 'threadIdx'
            buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+_n];
                ^
<program source>:101:32: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
            buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+_n];
                               ^~~
                               buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:101:36: error: use of undeclared identifier 'threadIdx'
            buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+_n];
                                   ^
<program source>:101:51: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
            buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+_n];
                                                  ^~~
                                                  buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:101:55: error: use of undeclared identifier 'threadIdx'
            buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+_n];
                                                      ^
<program source>:107:20: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
ga_float row_sum = buf[0];
                   ^~~
                   buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:109:13: error: use of undeclared identifier 'threadIdx'
for(int __i=threadIdx.x; __i<N; __i+=blockDim.x){;
            ^
<program source>:109:38: error: use of undeclared identifier 'blockDim'
for(int __i=threadIdx.x; __i<N; __i+=blockDim.x){;
                                     ^
<program source>:110:1: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
buf[__i] = buf2[__i] / row_sum;
^~~
buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:113:15: error: use of undeclared identifier 'threadIdx'
for (int tx = threadIdx.x; tx< N; tx += blockDim.x){;
              ^
<program source>:113:41: error: use of undeclared identifier 'blockDim'
for (int tx = threadIdx.x; tx< N; tx += blockDim.x){;
                                        ^
<program source>:114:38: error: use of undeclared identifier 'buf'; did you mean 'buf2'?
sm[blockIDX * sm_s0 + tx * sm_s1] = (buf[tx]);
                                     ^~~
                                     buf2
<program source>:47:12: note: 'buf2' declared here
ga_float * buf2 = buf + N;
           ^
<program source>:51:42: warning: comparison of integers of different signs: 'int' and 'const ulong' (aka 'const unsigned long')
for (int blockIDX = blockIdx.x; blockIDX < M;     blockIDX += gridDim.x){;
                                ~~~~~~~~ ^ ~
<program source>:52:30: warning: comparison of integers of different signs: 'int' and 'const ulong' (aka 'const unsigned long')
for (int tx = threadIdx.x; tx< N; tx += blockDim.x){;
                           ~~^ ~
<program source>:64:52: warning: comparison of integers of different signs: 'int' and 'const ulong' (aka 'const unsigned long')
            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
                                                 ~ ^ ~
<program source>:81:29: warning: comparison of integers of different signs: 'int' and 'const ulong' (aka 'const unsigned long')
for(int __i=threadIdx.x; __i<N; __i+=blockDim.x){;
                         ~~~^~
<program source>:92:52: warning: comparison of integers of different signs: 'int' and 'const ulong' (aka 'const unsigned long')
            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
                                                 ~ ^ ~
<program source>:109:29: warning: comparison of integers of different signs: 'int' and 'const ulong' (aka 'const unsigned long')
for(int __i=threadIdx.x; __i<N; __i+=blockDim.x){;
                         ~~~^~
<program source>:113:30: warning: comparison of integers of different signs: 'int' and 'const ulong' (aka 'const unsigned long')
for (int tx = threadIdx.x; tx< N; tx += blockDim.x){;
                           ~~^ ~
0001	#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE)
0002	#define WITHIN_KERNEL /* empty */
0003	#define KERNEL __kernel
0004	#define GLOBAL_MEM __global
0005	#define LOCAL_MEM __local
0006	#define LOCAL_MEM_ARG __local
0007	#define REQD_WG_SIZE(x, y, z) __attribute__((reqd_work_group_size(x, y, z)))
0008	#ifndef NULL
0009	  #define NULL ((void*)0)
0010	#endif
0011	#define LID_0 get_local_id(0)
0012	#define LID_1 get_local_id(1)
0013	#define LID_2 get_local_id(2)
0014	#define LDIM_0 get_local_size(0)
0015	#define LDIM_1 get_local_size(1)
0016	#define LDIM_2 get_local_size(2)
0017	#define GID_0 get_group_id(0)
0018	#define GID_1 get_group_id(1)
0019	#define GID_2 get_group_id(2)
0020	#define GDIM_0 get_num_groups(0)
0021	#define GDIM_1 get_num_groups(1)
0022	#define GDIM_2 get_num_groups(2)
0023	#define ga_bool uchar
0024	#define ga_byte char
0025	#define ga_ubyte uchar
0026	#define ga_short short
0027	#define ga_ushort ushort
0028	#define ga_int int
0029	#define ga_uint uint
0030	#define ga_long long
0031	#define ga_ulong ulong
0032	#define ga_float float
0033	#define ga_double double
0034	#define ga_half half
0035	#define ga_size ulong
0036	#define ga_ssize long
0037	#define load_half(p) vload_half(0, p)
0038	#define store_half(p, v) vstore_half_rtn(v, 0, p)
0039	#define GA_DECL_SHARED_PARAM(type, name) , __local type *name
0040	#define GA_DECL_SHARED_BODY(type, name)
0041	#define GA_WARP_SIZE 64
0042	KERNEL void kSoftmaxWithBias (const ga_size M, const ga_size N, const ga_float * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, const ga_float * b, const ga_size offset_b, const ga_ssize sb0, ga_float * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1)
0043	    {
0044	        extern __shared__ ga_float buf[];
0045	ga_float * buf2 = buf + N;
0046	x = (const ga_float *)(((char *)x)+offset_x);
0047	b = (const ga_float *)(((char *)b)+offset_b);
0048	sm = (ga_float *)(((char *)sm)+offset_sm);
0049	for (int blockIDX = blockIdx.x; blockIDX < M;     blockIDX += gridDim.x){;
0050	for (int tx = threadIdx.x; tx< N; tx += blockDim.x){;
0051	buf[tx] = (x[blockIDX * sx0 + tx * sx1]);
0052	buf[tx] += (b[tx * sb0]);
0053	buf2[tx] = buf[tx];
0054	};
0055	__syncthreads();
0056	
0057	    {
0058	        // This function trashes buf[1..warpSize],
0059	        // leaving the reduction result in buf[0].
0060	
0061	        if (threadIdx.x < warpSize) {
0062	            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
0063	            {
0064	                buf[threadIdx.x] = max(buf[threadIdx.x], buf[i]);
0065	            }
0066	        }
0067	        __syncthreads();
0068	        //reduce so that threadIdx.x 0 has the reduction of everything
0069	        for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
0070	          if (threadIdx.x < _n && threadIdx.x + _n < N)
0071	            buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+_n]);
0072	          __syncthreads();
0073	        }
0074	    }
0075	    ;
0076	__syncthreads();
0077	ga_float row_max = buf[0];
0078	__syncthreads();
0079	for(int __i=threadIdx.x; __i<N; __i+=blockDim.x){;
0080	buf[__i] = exp(buf2[__i] - row_max);
0081	buf2[__i] = buf[__i];
0082	};
0083	__syncthreads();
0084	
0085	    {
0086	        // This function trashes buf[1..warpSize],
0087	        // leaving the reduction result in buf[0].
0088	
0089	        if (threadIdx.x < warpSize) {
0090	            for (int i = threadIdx.x + warpSize; i < N; i += warpSize)
0091	            {
0092	                buf[threadIdx.x] = buf[threadIdx.x] + buf[i];
0093	            }
0094	        }
0095	        __syncthreads();
0096	        //reduce so that threadIdx.x 0 has the reduction of everything
0097	        for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
0098	          if (threadIdx.x < _n && threadIdx.x + _n < N)
0099	            buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+_n];
0100	          __syncthreads();
0101	        }
0102	    }
0103	    ;
0104	__syncthreads();
0105	ga_float row_sum = buf[0];
0106	__syncthreads();
0107	for(int __i=threadIdx.x; __i<N; __i+=blockDim.x){;
0108	buf[__i] = buf2[__i] / row_sum;
0109	};
0110	__syncthreads();
0111	for (int tx = threadIdx.x; tx< N; tx += blockDim.x){;
0112	sm[blockIDX * sm_s0 + tx * sm_s1] = (buf[tx]);
0113	};
0114	__syncthreads();
0115	};
0116	    }
0117	   
")

Answer 1 · 2017-04-13T12:05:04.000Z

This error isn't readable. Can you give it again while keeping the new lines?

Answer 2 · 2017-04-13T15:59:58.000Z

The softmax kernel is not ready for OpenCL apparently and is missing a couple of GLOBAL_MEM markers around pointers.

If you want to do it, we will accept a PR.

Answer 3 · 2017-05-07T06:46:16.000Z

I think I have the same problem with libgpuarray and OpenCL.

Answer 4 · 2017-05-07T15:21:44.000Z

OpenCL isn't supported for all operation. We don't have time to finish this and need help get this working. If you don't have time to help us finish this, then I would tell it isn't in a usable state for you now.

…

On Thu, Apr 13, 2017 at 12:00 PM abergeron ***@***.***> wrote: The softmax kernel is not ready for OpenCL apparently and is missing a couple of GLOBAL_MEM markers around pointers. If you want to do it, we will accept a PR. — You are receiving this because you commented. Reply to this email directly, view it on GitHub <#405 (comment)>, or mute the thread <https://github.com/notifications/unsubscribe-auth/AALC-ykUxrZlQOelECmyuhkRTmjkRLcVks5rvkZ_gaJpZM4M8mn4> .

Answer 5 · 2017-05-07T16:28:00.000Z

OK, I understand. I am a beginner, so I don't think I can do this.