Complex number reduction with numba cuda
I am trying to speed up a Python code using Numba CUDA. The code works with large arrays of complex, float and integer numbers. I have included both the Python version and the Numba CUDA version here. The Numba CUDA version does not compile.
I have tried performing the complex-number calculation as separate real and imaginary numbers, as I thought the complex format might be the issue.
def random_choice_noreplace(m, n, axis=-1):
    # m, n are the number of rows, cols of output
    return np.random.rand(m, n).argsort(axis=axis)

@cuda.jit
def cuda_kernel(d_npart, d_npts, d_data, d_data_index, d_coef, d_datasum, d_tmp):
    row, col = cuda.grid(2)
    if row < d_npart and col < d_npts:
        d_tmp[row, col] = d_data[d_data_index[row, col]]
        d_tmp[row, col] = d_tmp[row, col] * d_coef[row, col]
    # All threads get to this point ===============================
    cuda.syncthreads()
    if row == 0 and col == 0:
        d_datasum = np.sum(d_tmp, axis=0)

def calculate_cuda(data, data_index, coef):
    npart, npts = data_index.shape
    # arrays to copy to GPU memory =====================================
    d_npart = cuda.to_device(npart)
    d_npts = cuda.to_device(npts)
    d_data = cuda.to_device(data)
    d_data_index = cuda.to_device(data_index)
    d_coef = cuda.to_device(coef)
    d_datasum = cuda.device_array(npts, np.complex64)
    d_tmp = cuda.device_array((npart, npts), np.complex64)
    threadsperblock = (16, 16)
    blockspergrid_x = int(math.ceil(npts / threadsperblock[0])) + 1
    blockspergrid_y = int(math.ceil(npart / threadsperblock[1])) + 1
    blockspergrid = (blockspergrid_x, blockspergrid_y)
    cuda_kernel[blockspergrid, threadsperblock](d_npart, d_npts, d_data, d_data_index, d_coef, d_datasum, d_tmp)
    # Copy data from GPU to CPU ========================================
    final_data_sum = d_datasum.copy_to_host()
    return final_data_sum

def calculate_python(data, data_index, coef):
    npart, npts = data_index.shape
    data_sum = np.zeros(npts, dtype=np.complex64)
    tmp = np.zeros(npts, dtype=np.complex64)
    print(" Calling python function...")
    start_time = time.time()
    for i in range(npart):
        tmp[:] = data[data_index[i]]
        data_sum += tmp * coef[i]
    return data_sum

if __name__ == '__main__':
    data_size = 1200
    rows = 31
    cols = 1000
    rand_float1 = np.random.randn(data_size)
    rand_float2 = np.random.randn(data_size)
    data = rand_float1 + 1j * rand_float2
    coef = np.random.randn(rows, cols)
    data_index = random_choice_noreplace(rows, cols)
    start_time = time.time()
    gpu_data_sum_python = calculate_python(data, data_index, coef)
    python_time = time.time() - start_time  # print("gpu c : ", c_gpu)
    print("---- %s seconds for python ----:" % (python_time))
    start_time = time.time()
    gpu_data_sum = calculate_cuda(data, data_index, coef)
    gpu_time = time.time() - start_time
    print("---- %s seconds for gpu ----:" % (gpu_time))
When I run the code I get this error:
Calling python function...
---- 0.000344038009644 seconds for python ----:
Traceback (most recent call last):
File "GPU_Fake_PA_partial.py", line 82, in <module>
gpu_data_sum = calculate_cuda (data, data_index, coef)
File "GPU_Fake_PA_partial.py", line 44, in calculate_cuda
cuda_kernel[blockspergrid, threadsperblock](d_npart, d_npts, d_data, d_data_index, d_coef, d_datasum, d_tmp)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/cuda/compiler.py", line 765, in __call__
kernel = self.specialize(*args)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/cuda/compiler.py", line 776, in specialize
kernel = self.compile(argtypes)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/cuda/compiler.py", line 792, in compile
**self.targetoptions)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler_lock.py", line 32, in _acquire_compile_lock
return func(*args, **kwargs)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/cuda/compiler.py", line 62, in compile_kernel
cres = compile_cuda(pyfunc, types.void, args, debug=debug, inline=inline)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler_lock.py", line 32, in _acquire_compile_lock
return func(*args, **kwargs)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/cuda/compiler.py", line 51, in compile_cuda
locals={})
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler.py", line 926, in compile_extra
return pipeline.compile_extra(func)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler.py", line 374, in compile_extra
return self._compile_bytecode()
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler.py", line 857, in _compile_bytecode
return self._compile_core()
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler.py", line 844, in _compile_core
res = pm.run(self.status)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler_lock.py", line 32, in _acquire_compile_lock
return func(*args, **kwargs)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler.py", line 255, in run
raise patched_exception
numba.errors.TypingError: Failed in nopython mode pipeline (step: nopython frontend)
Invalid use of Function(<built-in function lt>) with argument(s) of type(s): (int32, array(int64, 1d, A))
Known signatures:
* (bool, bool) -> bool
* (int8, int8) -> bool
* (int16, int16) -> bool
* (int32, int32) -> bool
* (int64, int64) -> bool
* (uint8, uint8) -> bool
* (uint16, uint16) -> bool
* (uint32, uint32) -> bool
* (uint64, uint64) -> bool
* (float32, float32) -> bool
* (float64, float64) -> bool
* parameterized
In definition 0:
All templates rejected with literals.
In definition 1:
All templates rejected without literals.
In definition 2:
All templates rejected with literals.
In definition 3:
All templates rejected without literals.
In definition 4:
All templates rejected with literals.
In definition 5:
All templates rejected without literals.
In definition 6:
All templates rejected with literals.
In definition 7:
All templates rejected without literals.
This error is usually caused by passing an argument of a type that is unsupported by the named function.
[1] During: typing of intrinsic-call at GPU_Fake_PA_partial.py (15)
File "GPU_Fake_PA_partial.py", line 15:
def cuda_kernel (d_npart, d_npts, d_data, d_data_index, d_coef, d_datasum, d_tmp):
<source elided>
row, col = cuda.grid(2)
if row < d_npart and col < d_npts :
Any help is highly appreciated.
python-2.7 cuda numba
asked Jan 19 at 0:16 by Ali Jooya
1 Answer
There were a variety of problems with your code. I may not cover all of them, so please compare my code with yours.
1. NumPy array methods (like np.sum()) cannot be used in numba CUDA kernels.
2. Scalar quantities passed to a numba CUDA kernel (like npart, npts) do not need, and should not have, array treatment such as .to_device(). Simply use them as they are. This is the reason for the python error you show:
   Invalid use of Function(<built-in function lt>) with argument(s) of type(s): (int32, array(int64, 1d, A))
3. Your kernel was needlessly complicated. You are basically performing column sums of a matrix that has been permuted according to an index pattern and multiplied by a coefficient array. We can perform this with a single loop per thread (a vectorized NumPy equivalent is sketched right after this list).
4. For that realization, we don't need a 2-dimensional grid of threads.
5. You had indentation problems in the code you posted.
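For reference, the whole computation described in item 3 can be expressed as a single vectorized NumPy line; this is only a sketch to make the "permuted, coefficient-weighted column sum" structure explicit, using the same data, data_index and coef arrays as in the question:

# NumPy equivalent of calculate_python: gather rows of `data` by index,
# scale by `coef`, and sum down the columns.
ref_sum = (data[data_index] * coef).sum(axis=0).astype(np.complex64)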
For demonstration purposes, I have reduced the size of your dataset from 1000 columns to 15 columns. Here is an example that addresses the above items:
$ cat t31.py
import numba as nb
import numpy as np
from numba import cuda
import time
import math

def random_choice_noreplace(m, n, axis=-1):
    # m, n are the number of rows, cols of output
    return np.random.rand(m, n).argsort(axis=axis)

@cuda.jit
def cuda_kernel(npart, npts, d_data, d_data_index, d_coef, d_datasum):
    col = cuda.grid(1)
    if col < npts:
        temp = 0
        for i in range(npart):
            temp += d_data[d_data_index[i, col]] * d_coef[i, col]
        d_datasum[col] = temp

def calculate_cuda(data, data_index, coef):
    npart, npts = data_index.shape
    # arrays to copy to GPU memory =====================================
    d_data = cuda.to_device(data)
    d_data_imag = cuda.to_device(data_imag)  # reads the module-level data_imag; not used by the kernel
    d_data_index = cuda.to_device(data_index)
    d_coef = cuda.to_device(coef)
    d_datasum = cuda.device_array(npts, np.complex64)
    threadsperblock = 64
    blockspergrid = int(math.ceil(npts / threadsperblock)) + 1
    cuda_kernel[blockspergrid, threadsperblock](npart, npts, d_data, d_data_index, d_coef, d_datasum)
    # Copy data from GPU to CPU ========================================
    final_data_sum = d_datasum.copy_to_host()
    return final_data_sum

def calculate_python(data, data_index, coef):
    npart, npts = data_index.shape
    data_sum = np.zeros(npts, dtype=np.complex64)
    tmp = np.zeros(npts, dtype=np.complex64)
    print(" Calling python function...")
    for i in range(npart):
        tmp[:] = data[data_index[i]]
        data_sum += tmp * coef[i]
    return data_sum

if __name__ == '__main__':
    rows = 31
    cols = 15
    data_size = rows * cols
    data_real = np.random.randn(data_size).astype(np.float32)
    data_imag = np.random.randn(data_size).astype(np.float32)
    data = data_real + 1j * data_imag
    coef = np.random.randn(rows, cols)
    data_index = random_choice_noreplace(rows, cols)
    start_time = time.time()
    gpu_data_sum_python = calculate_python(data, data_index, coef)
    python_time = time.time() - start_time  # print("gpu c : ", c_gpu)
    print("---- %s seconds for python ----:" % (python_time))
    print(gpu_data_sum_python)
    start_time = time.time()
    gpu_data_sum = calculate_cuda(data, data_index, coef)
    gpu_time = time.time() - start_time
    print("---- %s seconds for gpu ----:" % (gpu_time))
    print(gpu_data_sum)
$ python t31.py
Calling python function...
---- 0.000281095504761 seconds for python ----:
[-1.10292518+0.90700358j 2.67771578+2.47935939j -5.22553015-2.22675705j
-3.55810285+2.39755774j 4.11441088-3.89396238j -2.70894790-0.75690132j
3.24859619+0.65993834j 1.05531025+2.3959775j -4.27368307+1.6297332j
0.17896785-7.0437355j -6.31506491+6.22674656j -1.85534143-6.08459902j
0.40037563+6.33309507j -1.71916604-0.55055946j 0.49263301+1.08690035j]
---- 0.593510866165 seconds for gpu ----:
[-1.10292506+0.9070037j 2.67771506+2.47935939j -5.22553062-2.22675681j
-3.55810285+2.39755821j 4.11441135-3.89396238j -2.70894790-0.75690138j
3.24859619+0.65993822j 1.05531013+2.39597774j -4.27368307+1.62973344j
0.17896791-7.0437355j -6.31506491+6.22674656j -1.85534155-6.08459902j
0.40037528+6.33309603j -1.71916604-0.55055946j 0.49263287+1.08690035j]
$
Note that there are slight differences numerically between the host and device calculation results, starting in the 6th decimal place in some cases. I attribute this to possible calculation-order differences between the host and device code, combined with the limits of the float32 (or complex64) numpy datatype.
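One quick way to confirm that the two results agree to within float32 precision is a loose tolerance check (a minimal sketch; gpu_data_sum_python and gpu_data_sum are the variables printed above):

# True if host and device results match within a float32-appropriate tolerance
print(np.allclose(gpu_data_sum_python, gpu_data_sum, rtol=1e-4, atol=1e-5))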
Since you have timing built into your code, you may be interested in performance. For numba python, I recommend the typical benchmarking practice of measuring the second run rather than the first. This avoids having one-time overheads enter into the measurement. Furthermore, we'd like to choose a much larger dataset size than 15 columns, to give the GPU a large enough amount of work to amortize various costs. With those modifications, here is a benchmark showing that the GPU version in this code can be faster than the CPU version in this code:
$ cat t31.py
import numba as nb
import numpy as np
from numba import cuda
import time
import math

def random_choice_noreplace(m, n, axis=-1):
    # m, n are the number of rows, cols of output
    return np.random.rand(m, n).argsort(axis=axis)

@cuda.jit
def cuda_kernel(npart, npts, d_data, d_data_index, d_coef, d_datasum):
    col = cuda.grid(1)
    if col < npts:
        temp = 0
        for i in range(npart):
            temp += d_data[d_data_index[i, col]] * d_coef[i, col]
        d_datasum[col] = temp

def calculate_cuda(data, data_index, coef):
    npart, npts = data_index.shape
    # arrays to copy to GPU memory =====================================
    d_data = cuda.to_device(data)
    d_data_imag = cuda.to_device(data_imag)
    d_data_index = cuda.to_device(data_index)
    d_coef = cuda.to_device(coef)
    d_datasum = cuda.device_array(npts, np.complex64)
    threadsperblock = 64
    blockspergrid = int(math.ceil(npts / threadsperblock)) + 1
    cuda_kernel[blockspergrid, threadsperblock](npart, npts, d_data, d_data_index, d_coef, d_datasum)
    # Copy data from GPU to CPU ========================================
    final_data_sum = d_datasum.copy_to_host()
    return final_data_sum

def calculate_python(data, data_index, coef):
    npart, npts = data_index.shape
    data_sum = np.zeros(npts, dtype=np.complex64)
    tmp = np.zeros(npts, dtype=np.complex64)
    print(" Calling python function...")
    for i in range(npart):
        tmp[:] = data[data_index[i]]
        data_sum += tmp * coef[i]
    return data_sum

if __name__ == '__main__':
    rows = 31
    cols = 1000000
    data_size = rows * cols
    data_real = np.random.randn(data_size).astype(np.float32)
    data_imag = np.random.randn(data_size).astype(np.float32)
    data = data_real + 1j * data_imag
    coef = np.random.randn(rows, cols)
    data_index = random_choice_noreplace(rows, cols)
    gpu_data_sum_python = calculate_python(data, data_index, coef)
    start_time = time.time()
    gpu_data_sum_python = calculate_python(data, data_index, coef)
    python_time = time.time() - start_time  # print("gpu c : ", c_gpu)
    print("---- %s seconds for python ----:" % (python_time))
    print(gpu_data_sum_python)
    gpu_data_sum = calculate_cuda(data, data_index, coef)
    start_time = time.time()
    gpu_data_sum = calculate_cuda(data, data_index, coef)
    gpu_time = time.time() - start_time
    print("---- %s seconds for gpu ----:" % (gpu_time))
    print(gpu_data_sum)
$ python t31.py
Calling python function...
Calling python function...
---- 0.806931018829 seconds for python ----:
[ 6.56164026-7.95271683j -7.42586899+3.68758106j 3.48999476+3.10376763j
..., 13.12746525+4.61855698j 0.08796659+0.9710322j
-6.54224586+4.89485168j]
---- 0.417661905289 seconds for gpu ----:
[ 6.56164074-7.95271683j -7.42586851+3.68758035j 3.48999500+3.10376763j
..., 13.12746525+4.61855745j 0.08796643+0.97103256j
-6.54224634+4.89485121j]
$
With these modifications, the GPU code appears to be about 2x faster than the CPU code.
This was measured on CUDA 9.2, Fedora 27, Quadro K2000 (a relatively small, slow GPU). I wouldn't read too much into this comparison, as the CPU code is almost certainly non-optimal as well, and this is still a relatively small amount of work per output data point for GPU acceleration to be interesting.
Thanks Robert. This was a small piece of a larger code. The actual data size is larger and the kernel is more compute intensive. I think I can get a good speed up from GPU. Thanks for your comment. It was a great help. I have one question. In your code, each thread performs the calculation for all the elements in one column through the for loop. Any particular reason, other than keeping the kernel code simple, for not having a thread per element of the result matrix? Wouldn't that improve the performance even further?
– Ali Jooya
Jan 21 at 5:03
Then you have to use a column-wise parallel reduction. It's doable, of course, but the code complexity goes up and it may not yield much if any performance improvement. If you have enough columns to work on, the loop-per-column method is a better approach.
– Robert Crovella
Jan 21 at 6:33
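For illustration, a minimal sketch of the thread-per-element idea discussed here, using atomic adds rather than a tree reduction (an assumption for this sketch, not code from the answer): because cuda.atomic.add does not operate on complex values, it accumulates the real and imaginary parts into separate float64 arrays. Colliding atomic updates are serialized, so this is not automatically faster than the loop-per-column kernel above.

@cuda.jit
def cuda_kernel_atomic(npart, npts, d_data, d_data_index, d_coef, d_sum_re, d_sum_im):
    # one thread per (row, col) element of the permuted, weighted matrix
    row, col = cuda.grid(2)
    if row < npart and col < npts:
        val = d_data[d_data_index[row, col]] * d_coef[row, col]
        # complex atomics are not available, so accumulate the parts separately
        cuda.atomic.add(d_sum_re, col, val.real)
        cuda.atomic.add(d_sum_im, col, val.imag)

It would be launched with a 2-D grid (for example a (16, 16) block shape, as in the question), with d_sum_re and d_sum_im created as zero-initialized float64 device arrays of length npts, and the result assembled on the host as d_sum_re.copy_to_host() + 1j * d_sum_im.copy_to_host().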
Thanks Robert. As you mentioned, the timing of the first run on the GPU is larger than the next run (3x). As I am calling the kernel just once in my application, I have to take into account the timing I get from the first run of the benchmark, right? Is there any way to reduce this overhead? Also, you mentioned that my python code is not optimized; any optimization suggestions?
– Ali Jooya
Jan 21 at 21:16
I'm not aware of anything to make the first call shorter. For optimization of the CPU code, you could use numba again. I don't have any specific suggestions. I merely suggested that the CPU code might not be completely optimal.
– Robert Crovella
Jan 22 at 0:40
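As a minimal sketch of that last suggestion (an untested assumption, mirroring the loop structure already shown above), the CPU reduction could itself be compiled with numba.njit:

from numba import njit
import numpy as np

@njit
def calculate_cpu_numba(data, data_index, coef, data_sum):
    # explicit loops so numba can compile the whole gather/scale/sum reduction
    npart, npts = data_index.shape
    for col in range(npts):
        acc = 0j
        for i in range(npart):
            acc += data[data_index[i, col]] * coef[i, col]
        data_sum[col] = acc
    return data_sum

# usage (names as in the scripts above):
# data_sum = calculate_cpu_numba(data, data_index, coef, np.zeros(cols, dtype=np.complex64))

As with the GPU version, the first call includes JIT compilation time, so it is the second call that reflects steady-state performance.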
add a comment |
Your Answer
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f54262988%2fcomplex-number-reduction-with-numba-cuda%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
There were a variety of problems with your code. I may not cover all of them, so please compare my code with yours.
numpy array methods (like
np.sum()
) cannot be used in numba CUDA kernels.
scalar quantities passed to a numba cuda kernel (like
npart
,npts
) do not need and should not have array treatment like.to_device()
. Simply use them as they are. This is the reason for the python error you show:
Invalid use of Function(<built-in function lt>) with argument(s) of type(s): (int32, array(int64, 1d, A))
Your kernel was needlessly complicated. You are basically performing column sums of a matrix that has been permuted according to an index pattern, multiplied by a coefficient array. We can perform this with a single loop, per thread.
For the above realization, we don't need a 2-dimensional grid of threads.
You had indentation problems in the code you posted.
For demonstration purposes, I have reduced the size of your dataset from 1000 columns to 15 columns. Here is an example that addresses the above items:
$ cat t31.py
import numba as nb
import numpy as np
from numba import cuda
import time
import math
def random_choice_noreplace(m,n, axis=-1):
# m, n are the number of rows, cols of output
return np.random.rand(m,n).argsort(axis=axis)
@cuda.jit
def cuda_kernel (npart, npts, d_data, d_data_index, d_coef, d_datasum):
col = cuda.grid(1)
if col < npts:
temp = 0
for i in range(npart):
temp += d_data[d_data_index[i, col]] * d_coef[i, col]
d_datasum[col] = temp
def calculate_cuda (data, data_index, coef):
npart, npts = data_index.shape
# arrays to copy to GPU memory =====================================
d_data = cuda.to_device(data)
d_data_imag = cuda.to_device(data_imag)
d_data_index = cuda.to_device(data_index)
d_coef = cuda.to_device(coef)
d_datasum = cuda.device_array(npts, np.complex64)
threadsperblock = 64
blockspergrid = int(math.ceil(npts / threadsperblock))+1
cuda_kernel[blockspergrid, threadsperblock](npart, npts, d_data, d_data_index, d_coef, d_datasum)
# Copy data from GPU to CPU ========================================
final_data_sum = d_datasum.copy_to_host()
return final_data_sum
def calculate_python (data, data_index, coef):
npart, npts = data_index.shape
data_sum = np.zeros(npts, dtype=np.complex64)
tmp = np.zeros(npts, dtype=np.complex64)
print(" Calling python function...")
for i in range(npart):
tmp[:] = data[data_index[i]]
data_sum += tmp * coef[i]
return data_sum
if __name__ == '__main__':
rows = 31
cols = 15
data_size = rows * cols
data_real = np.random.randn(data_size).astype(np.float32)
data_imag = np.random.randn(data_size).astype(np.float32)
data = data_real + 1j * data_imag
coef = np.random.randn(rows, cols)
data_index = random_choice_noreplace(rows, cols)
start_time = time.time()
gpu_data_sum_python = calculate_python (data, data_index, coef)
python_time = time.time() - start_time #print("gpu c : ", c_gpu)
print("---- %s seconds for python ----:" % (python_time))
print(gpu_data_sum_python)
start_time = time.time()
gpu_data_sum = calculate_cuda (data, data_index, coef)
gpu_time = time.time() - start_time
print("---- %s seconds for gpu ----:" % (gpu_time))
print(gpu_data_sum)
$ python t31.py
Calling python function...
---- 0.000281095504761 seconds for python ----:
[-1.10292518+0.90700358j 2.67771578+2.47935939j -5.22553015-2.22675705j
-3.55810285+2.39755774j 4.11441088-3.89396238j -2.70894790-0.75690132j
3.24859619+0.65993834j 1.05531025+2.3959775j -4.27368307+1.6297332j
0.17896785-7.0437355j -6.31506491+6.22674656j -1.85534143-6.08459902j
0.40037563+6.33309507j -1.71916604-0.55055946j 0.49263301+1.08690035j]
---- 0.593510866165 seconds for gpu ----:
[-1.10292506+0.9070037j 2.67771506+2.47935939j -5.22553062-2.22675681j
-3.55810285+2.39755821j 4.11441135-3.89396238j -2.70894790-0.75690138j
3.24859619+0.65993822j 1.05531013+2.39597774j -4.27368307+1.62973344j
0.17896791-7.0437355j -6.31506491+6.22674656j -1.85534155-6.08459902j
0.40037528+6.33309603j -1.71916604-0.55055946j 0.49263287+1.08690035j]
$
Note that there are slight differences numerically between the host and device calculation results, starting in the 6th decimal place, in some cases. I attribute this to possible calculation order differences between the host and device code, combined with limits of float32
(or complex64
) numpy datatype.
Since you have timing built-in to your code, you may be interested in performance. For numba python, I recommend a typical benchmarking practice which is not to measure the first run, but measure the second run. This avoids having one-time overheads enter into the measurement. Furthermore, we'd like to choose a much larger dataset size than 15 columns, to give the GPU a large enough amount of work to amortize various costs. With those modifications, here is a benchmark showing that the GPU version in this code can be faster than the CPU version in this code:
$ cat t31.py
import numba as nb
import numpy as np
from numba import cuda
import time
import math
def random_choice_noreplace(m,n, axis=-1):
# m, n are the number of rows, cols of output
return np.random.rand(m,n).argsort(axis=axis)
@cuda.jit
def cuda_kernel (npart, npts, d_data, d_data_index, d_coef, d_datasum):
col = cuda.grid(1)
if col < npts:
temp = 0
for i in range(npart):
temp += d_data[d_data_index[i, col]] * d_coef[i, col]
d_datasum[col] = temp
def calculate_cuda (data, data_index, coef):
npart, npts = data_index.shape
# arrays to copy to GPU memory =====================================
d_data = cuda.to_device(data)
d_data_imag = cuda.to_device(data_imag)
d_data_index = cuda.to_device(data_index)
d_coef = cuda.to_device(coef)
d_datasum = cuda.device_array(npts, np.complex64)
threadsperblock = 64
blockspergrid = int(math.ceil(npts / threadsperblock))+1
cuda_kernel[blockspergrid, threadsperblock](npart, npts, d_data, d_data_index, d_coef, d_datasum)
# Copy data from GPU to CPU ========================================
final_data_sum = d_datasum.copy_to_host()
return final_data_sum
def calculate_python (data, data_index, coef):
npart, npts = data_index.shape
data_sum = np.zeros(npts, dtype=np.complex64)
tmp = np.zeros(npts, dtype=np.complex64)
print(" Calling python function...")
for i in range(npart):
tmp[:] = data[data_index[i]]
data_sum += tmp * coef[i]
return data_sum
if __name__ == '__main__':
rows = 31
cols = 1000000
data_size = rows * cols
data_real = np.random.randn(data_size).astype(np.float32)
data_imag = np.random.randn(data_size).astype(np.float32)
data = data_real + 1j * data_imag
coef = np.random.randn(rows, cols)
data_index = random_choice_noreplace(rows, cols)
gpu_data_sum_python = calculate_python (data, data_index, coef)
start_time = time.time()
gpu_data_sum_python = calculate_python (data, data_index, coef)
python_time = time.time() - start_time #print("gpu c : ", c_gpu)
print("---- %s seconds for python ----:" % (python_time))
print(gpu_data_sum_python)
gpu_data_sum = calculate_cuda (data, data_index, coef)
start_time = time.time()
gpu_data_sum = calculate_cuda (data, data_index, coef)
gpu_time = time.time() - start_time
print("---- %s seconds for gpu ----:" % (gpu_time))
print(gpu_data_sum)
$ python t31.py
Calling python function...
Calling python function...
---- 0.806931018829 seconds for python ----:
[ 6.56164026-7.95271683j -7.42586899+3.68758106j 3.48999476+3.10376763j
..., 13.12746525+4.61855698j 0.08796659+0.9710322j
-6.54224586+4.89485168j]
---- 0.417661905289 seconds for gpu ----:
[ 6.56164074-7.95271683j -7.42586851+3.68758035j 3.48999500+3.10376763j
..., 13.12746525+4.61855745j 0.08796643+0.97103256j
-6.54224634+4.89485121j]
$
With these modifications, the GPU code appears to be about 2x faster than the CPU code.
This was measured on CUDA 9.2, Fedora 27, Quadro K2000 (a relatively small, slow GPU). I wouldn't read too much into this comparison, as the CPU code is almost certainly non-optimal as well, and this is still a relatively small amount of work per output data point, for GPU acceleration to be interesting.
Thanks Robert. This was a small piece of a larger code. The actual data size is larger and the kernel is more compute intensive. I think I can get a good speed up from GPU. Thanks for your comment. It was a great help. I have one question. In your code, each thread performs the calculation for all the elements in one column through the for loop. Any particular reason, other than keeping the kernel code simple, for not having a thread per element of the result matrix? Wouldn't that improve the performance even further?
– Ali Jooya
Jan 21 at 5:03
Then you have to use a column-wise parallel reduction. It's doable, of course, but the code complexity goes up and it may not yield much if any performance improvement. If you have enough columns to work on, the loop-per-column method is a better approach.
– Robert Crovella
Jan 21 at 6:33
Thanks Robert. As you mentioned, the timing of the first run on GPU is larger than the next run (3x). As I am calling the kernel just once in my application, I have to take into account the timing I get from the first run of the benchmark, right? Is there anyway to reduce this overhead? Also, you mentioned that my python code is not optimized, any optimization suggestion?
– Ali Jooya
Jan 21 at 21:16
I'm not aware of anything to make the first call shorter. For optimization of the CPU code, you could use numba again. I don't have any specific suggestions. I merely suggested that the CPU code might not be completely optimal.
– Robert Crovella
Jan 22 at 0:40
add a comment |
There were a variety of problems with your code. I may not cover all of them, so please compare my code with yours.
numpy array methods (like
np.sum()
) cannot be used in numba CUDA kernels.
scalar quantities passed to a numba cuda kernel (like
npart
,npts
) do not need and should not have array treatment like.to_device()
. Simply use them as they are. This is the reason for the python error you show:
Invalid use of Function(<built-in function lt>) with argument(s) of type(s): (int32, array(int64, 1d, A))
Your kernel was needlessly complicated. You are basically performing column sums of a matrix that has been permuted according to an index pattern, multiplied by a coefficient array. We can perform this with a single loop, per thread.
For the above realization, we don't need a 2-dimensional grid of threads.
You had indentation problems in the code you posted.
For demonstration purposes, I have reduced the size of your dataset from 1000 columns to 15 columns. Here is an example that addresses the above items:
$ cat t31.py
import numba as nb
import numpy as np
from numba import cuda
import time
import math
def random_choice_noreplace(m,n, axis=-1):
# m, n are the number of rows, cols of output
return np.random.rand(m,n).argsort(axis=axis)
@cuda.jit
def cuda_kernel (npart, npts, d_data, d_data_index, d_coef, d_datasum):
col = cuda.grid(1)
if col < npts:
temp = 0
for i in range(npart):
temp += d_data[d_data_index[i, col]] * d_coef[i, col]
d_datasum[col] = temp
def calculate_cuda (data, data_index, coef):
npart, npts = data_index.shape
# arrays to copy to GPU memory =====================================
d_data = cuda.to_device(data)
d_data_imag = cuda.to_device(data_imag)
d_data_index = cuda.to_device(data_index)
d_coef = cuda.to_device(coef)
d_datasum = cuda.device_array(npts, np.complex64)
threadsperblock = 64
blockspergrid = int(math.ceil(npts / threadsperblock))+1
cuda_kernel[blockspergrid, threadsperblock](npart, npts, d_data, d_data_index, d_coef, d_datasum)
# Copy data from GPU to CPU ========================================
final_data_sum = d_datasum.copy_to_host()
return final_data_sum
def calculate_python (data, data_index, coef):
npart, npts = data_index.shape
data_sum = np.zeros(npts, dtype=np.complex64)
tmp = np.zeros(npts, dtype=np.complex64)
print(" Calling python function...")
for i in range(npart):
tmp[:] = data[data_index[i]]
data_sum += tmp * coef[i]
return data_sum
if __name__ == '__main__':
rows = 31
cols = 15
data_size = rows * cols
data_real = np.random.randn(data_size).astype(np.float32)
data_imag = np.random.randn(data_size).astype(np.float32)
data = data_real + 1j * data_imag
coef = np.random.randn(rows, cols)
data_index = random_choice_noreplace(rows, cols)
start_time = time.time()
gpu_data_sum_python = calculate_python (data, data_index, coef)
python_time = time.time() - start_time #print("gpu c : ", c_gpu)
print("---- %s seconds for python ----:" % (python_time))
print(gpu_data_sum_python)
start_time = time.time()
gpu_data_sum = calculate_cuda (data, data_index, coef)
gpu_time = time.time() - start_time
print("---- %s seconds for gpu ----:" % (gpu_time))
print(gpu_data_sum)
$ python t31.py
Calling python function...
---- 0.000281095504761 seconds for python ----:
[-1.10292518+0.90700358j 2.67771578+2.47935939j -5.22553015-2.22675705j
-3.55810285+2.39755774j 4.11441088-3.89396238j -2.70894790-0.75690132j
3.24859619+0.65993834j 1.05531025+2.3959775j -4.27368307+1.6297332j
0.17896785-7.0437355j -6.31506491+6.22674656j -1.85534143-6.08459902j
0.40037563+6.33309507j -1.71916604-0.55055946j 0.49263301+1.08690035j]
---- 0.593510866165 seconds for gpu ----:
[-1.10292506+0.9070037j 2.67771506+2.47935939j -5.22553062-2.22675681j
-3.55810285+2.39755821j 4.11441135-3.89396238j -2.70894790-0.75690138j
3.24859619+0.65993822j 1.05531013+2.39597774j -4.27368307+1.62973344j
0.17896791-7.0437355j -6.31506491+6.22674656j -1.85534155-6.08459902j
0.40037528+6.33309603j -1.71916604-0.55055946j 0.49263287+1.08690035j]
$
Note that there are slight differences numerically between the host and device calculation results, starting in the 6th decimal place, in some cases. I attribute this to possible calculation order differences between the host and device code, combined with limits of float32
(or complex64
) numpy datatype.
Since you have timing built-in to your code, you may be interested in performance. For numba python, I recommend a typical benchmarking practice which is not to measure the first run, but measure the second run. This avoids having one-time overheads enter into the measurement. Furthermore, we'd like to choose a much larger dataset size than 15 columns, to give the GPU a large enough amount of work to amortize various costs. With those modifications, here is a benchmark showing that the GPU version in this code can be faster than the CPU version in this code:
$ cat t31.py
import numba as nb
import numpy as np
from numba import cuda
import time
import math
def random_choice_noreplace(m,n, axis=-1):
# m, n are the number of rows, cols of output
return np.random.rand(m,n).argsort(axis=axis)
@cuda.jit
def cuda_kernel (npart, npts, d_data, d_data_index, d_coef, d_datasum):
col = cuda.grid(1)
if col < npts:
temp = 0
for i in range(npart):
temp += d_data[d_data_index[i, col]] * d_coef[i, col]
d_datasum[col] = temp
def calculate_cuda (data, data_index, coef):
npart, npts = data_index.shape
# arrays to copy to GPU memory =====================================
d_data = cuda.to_device(data)
d_data_imag = cuda.to_device(data_imag)
d_data_index = cuda.to_device(data_index)
d_coef = cuda.to_device(coef)
d_datasum = cuda.device_array(npts, np.complex64)
threadsperblock = 64
blockspergrid = int(math.ceil(npts / threadsperblock))+1
cuda_kernel[blockspergrid, threadsperblock](npart, npts, d_data, d_data_index, d_coef, d_datasum)
# Copy data from GPU to CPU ========================================
final_data_sum = d_datasum.copy_to_host()
return final_data_sum
def calculate_python (data, data_index, coef):
npart, npts = data_index.shape
data_sum = np.zeros(npts, dtype=np.complex64)
tmp = np.zeros(npts, dtype=np.complex64)
print(" Calling python function...")
for i in range(npart):
tmp[:] = data[data_index[i]]
data_sum += tmp * coef[i]
return data_sum
if __name__ == '__main__':
rows = 31
cols = 1000000
data_size = rows * cols
data_real = np.random.randn(data_size).astype(np.float32)
data_imag = np.random.randn(data_size).astype(np.float32)
data = data_real + 1j * data_imag
coef = np.random.randn(rows, cols)
data_index = random_choice_noreplace(rows, cols)
gpu_data_sum_python = calculate_python (data, data_index, coef)
start_time = time.time()
gpu_data_sum_python = calculate_python (data, data_index, coef)
python_time = time.time() - start_time #print("gpu c : ", c_gpu)
print("---- %s seconds for python ----:" % (python_time))
print(gpu_data_sum_python)
gpu_data_sum = calculate_cuda (data, data_index, coef)
start_time = time.time()
gpu_data_sum = calculate_cuda (data, data_index, coef)
gpu_time = time.time() - start_time
print("---- %s seconds for gpu ----:" % (gpu_time))
print(gpu_data_sum)
$ python t31.py
Calling python function...
Calling python function...
---- 0.806931018829 seconds for python ----:
[ 6.56164026-7.95271683j -7.42586899+3.68758106j 3.48999476+3.10376763j
..., 13.12746525+4.61855698j 0.08796659+0.9710322j
-6.54224586+4.89485168j]
---- 0.417661905289 seconds for gpu ----:
[ 6.56164074-7.95271683j -7.42586851+3.68758035j 3.48999500+3.10376763j
..., 13.12746525+4.61855745j 0.08796643+0.97103256j
-6.54224634+4.89485121j]
$
With these modifications, the GPU code appears to be about 2x faster than the CPU code.
This was measured on CUDA 9.2, Fedora 27, Quadro K2000 (a relatively small, slow GPU). I wouldn't read too much into this comparison, as the CPU code is almost certainly non-optimal as well, and this is still a relatively small amount of work per output data point, for GPU acceleration to be interesting.
Thanks Robert. This was a small piece of a larger code. The actual data size is larger and the kernel is more compute intensive. I think I can get a good speed up from GPU. Thanks for your comment. It was a great help. I have one question. In your code, each thread performs the calculation for all the elements in one column through the for loop. Any particular reason, other than keeping the kernel code simple, for not having a thread per element of the result matrix? Wouldn't that improve the performance even further?
– Ali Jooya
Jan 21 at 5:03
Then you have to use a column-wise parallel reduction. It's doable, of course, but the code complexity goes up and it may not yield much if any performance improvement. If you have enough columns to work on, the loop-per-column method is a better approach.
– Robert Crovella
Jan 21 at 6:33
Thanks Robert. As you mentioned, the timing of the first run on GPU is larger than the next run (3x). As I am calling the kernel just once in my application, I have to take into account the timing I get from the first run of the benchmark, right? Is there anyway to reduce this overhead? Also, you mentioned that my python code is not optimized, any optimization suggestion?
– Ali Jooya
Jan 21 at 21:16
I'm not aware of anything to make the first call shorter. For optimization of the CPU code, you could use numba again. I don't have any specific suggestions. I merely suggested that the CPU code might not be completely optimal.
– Robert Crovella
Jan 22 at 0:40
add a comment |
There were a variety of problems with your code. I may not cover all of them, so please compare my code with yours.
1. numpy array methods (like np.sum()) cannot be used in numba CUDA kernels.
2. Scalar quantities passed to a numba CUDA kernel (like npart, npts) do not need, and should not get, array treatment such as .to_device(); simply pass them as they are. This is the reason for the python error you show:
Invalid use of Function(<built-in function lt>) with argument(s) of type(s): (int32, array(int64, 1d, A))
3. Your kernel was needlessly complicated. You are basically performing column sums of a matrix that has been permuted according to an index pattern and multiplied by a coefficient array. We can perform this with a single loop per thread.
4. Given that realization, we don't need a 2-dimensional grid of threads.
5. You had indentation problems in the code you posted.
For demonstration purposes, I have reduced the size of your dataset from 1000 columns to 15 columns. Here is an example that addresses the above items:
$ cat t31.py
import numba as nb
import numpy as np
from numba import cuda
import time
import math

def random_choice_noreplace(m, n, axis=-1):
    # m, n are the number of rows, cols of output
    return np.random.rand(m, n).argsort(axis=axis)

@cuda.jit
def cuda_kernel(npart, npts, d_data, d_data_index, d_coef, d_datasum):
    # one thread per output column; each thread loops over the npart rows
    col = cuda.grid(1)
    if col < npts:
        temp = 0
        for i in range(npart):
            temp += d_data[d_data_index[i, col]] * d_coef[i, col]
        d_datasum[col] = temp

def calculate_cuda(data, data_index, coef):
    npart, npts = data_index.shape
    # arrays to copy to GPU memory =====================================
    d_data = cuda.to_device(data)
    d_data_index = cuda.to_device(data_index)
    d_coef = cuda.to_device(coef)
    d_datasum = cuda.device_array(npts, np.complex64)
    threadsperblock = 64
    blockspergrid = int(math.ceil(npts / threadsperblock)) + 1
    cuda_kernel[blockspergrid, threadsperblock](npart, npts, d_data, d_data_index, d_coef, d_datasum)
    # Copy data from GPU to CPU ========================================
    final_data_sum = d_datasum.copy_to_host()
    return final_data_sum

def calculate_python(data, data_index, coef):
    npart, npts = data_index.shape
    data_sum = np.zeros(npts, dtype=np.complex64)
    tmp = np.zeros(npts, dtype=np.complex64)
    print(" Calling python function...")
    for i in range(npart):
        tmp[:] = data[data_index[i]]
        data_sum += tmp * coef[i]
    return data_sum

if __name__ == '__main__':
    rows = 31
    cols = 15
    data_size = rows * cols
    data_real = np.random.randn(data_size).astype(np.float32)
    data_imag = np.random.randn(data_size).astype(np.float32)
    data = data_real + 1j * data_imag
    coef = np.random.randn(rows, cols)
    data_index = random_choice_noreplace(rows, cols)
    start_time = time.time()
    gpu_data_sum_python = calculate_python(data, data_index, coef)
    python_time = time.time() - start_time
    print("---- %s seconds for python ----:" % (python_time))
    print(gpu_data_sum_python)
    start_time = time.time()
    gpu_data_sum = calculate_cuda(data, data_index, coef)
    gpu_time = time.time() - start_time
    print("---- %s seconds for gpu ----:" % (gpu_time))
    print(gpu_data_sum)
$ python t31.py
Calling python function...
---- 0.000281095504761 seconds for python ----:
[-1.10292518+0.90700358j 2.67771578+2.47935939j -5.22553015-2.22675705j
-3.55810285+2.39755774j 4.11441088-3.89396238j -2.70894790-0.75690132j
3.24859619+0.65993834j 1.05531025+2.3959775j -4.27368307+1.6297332j
0.17896785-7.0437355j -6.31506491+6.22674656j -1.85534143-6.08459902j
0.40037563+6.33309507j -1.71916604-0.55055946j 0.49263301+1.08690035j]
---- 0.593510866165 seconds for gpu ----:
[-1.10292506+0.9070037j 2.67771506+2.47935939j -5.22553062-2.22675681j
-3.55810285+2.39755821j 4.11441135-3.89396238j -2.70894790-0.75690138j
3.24859619+0.65993822j 1.05531013+2.39597774j -4.27368307+1.62973344j
0.17896791-7.0437355j -6.31506491+6.22674656j -1.85534155-6.08459902j
0.40037528+6.33309603j -1.71916604-0.55055946j 0.49263287+1.08690035j]
$
Note that there are slight numerical differences between the host and device calculation results, starting in the 6th decimal place in some cases. I attribute this to possible calculation-order differences between the host and device code, combined with the limits of the float32 (or complex64) numpy datatype.
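Rather than eyeballing the printed arrays, the agreement can also be checked programmatically. This is just a quick sketch of my own (not part of the answer above), reusing the gpu_data_sum_python and gpu_data_sum arrays produced by the script, with tolerances loose enough for float32 accumulation:
# quick host/device comparison at float32-appropriate tolerances
match = np.allclose(gpu_data_sum_python, gpu_data_sum, rtol=1e-5, atol=1e-6)
print("results agree within float32 tolerance:", match)
# the largest deviation gives a feel for the accumulated rounding differences
print("max abs difference:", np.max(np.abs(gpu_data_sum_python - gpu_data_sum)))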
Since you have timing built into your code, you may be interested in performance. For numba python, I recommend the typical benchmarking practice of measuring the second run rather than the first, so that one-time overheads (such as JIT compilation) don't enter the measurement. Furthermore, we'd like to choose a much larger dataset size than 15 columns, to give the GPU a large enough amount of work to amortize various costs. With those modifications, here is a benchmark showing that the GPU version in this code can be faster than the CPU version in this code:
$ cat t31.py
import numba as nb
import numpy as np
from numba import cuda
import time
import math

def random_choice_noreplace(m, n, axis=-1):
    # m, n are the number of rows, cols of output
    return np.random.rand(m, n).argsort(axis=axis)

@cuda.jit
def cuda_kernel(npart, npts, d_data, d_data_index, d_coef, d_datasum):
    # one thread per output column; each thread loops over the npart rows
    col = cuda.grid(1)
    if col < npts:
        temp = 0
        for i in range(npart):
            temp += d_data[d_data_index[i, col]] * d_coef[i, col]
        d_datasum[col] = temp

def calculate_cuda(data, data_index, coef):
    npart, npts = data_index.shape
    # arrays to copy to GPU memory =====================================
    d_data = cuda.to_device(data)
    d_data_index = cuda.to_device(data_index)
    d_coef = cuda.to_device(coef)
    d_datasum = cuda.device_array(npts, np.complex64)
    threadsperblock = 64
    blockspergrid = int(math.ceil(npts / threadsperblock)) + 1
    cuda_kernel[blockspergrid, threadsperblock](npart, npts, d_data, d_data_index, d_coef, d_datasum)
    # Copy data from GPU to CPU ========================================
    final_data_sum = d_datasum.copy_to_host()
    return final_data_sum

def calculate_python(data, data_index, coef):
    npart, npts = data_index.shape
    data_sum = np.zeros(npts, dtype=np.complex64)
    tmp = np.zeros(npts, dtype=np.complex64)
    print(" Calling python function...")
    for i in range(npart):
        tmp[:] = data[data_index[i]]
        data_sum += tmp * coef[i]
    return data_sum

if __name__ == '__main__':
    rows = 31
    cols = 1000000
    data_size = rows * cols
    data_real = np.random.randn(data_size).astype(np.float32)
    data_imag = np.random.randn(data_size).astype(np.float32)
    data = data_real + 1j * data_imag
    coef = np.random.randn(rows, cols)
    data_index = random_choice_noreplace(rows, cols)
    # warm-up calls: run each path once so compilation and other one-time
    # overheads stay out of the measured second run
    gpu_data_sum_python = calculate_python(data, data_index, coef)
    start_time = time.time()
    gpu_data_sum_python = calculate_python(data, data_index, coef)
    python_time = time.time() - start_time
    print("---- %s seconds for python ----:" % (python_time))
    print(gpu_data_sum_python)
    gpu_data_sum = calculate_cuda(data, data_index, coef)
    start_time = time.time()
    gpu_data_sum = calculate_cuda(data, data_index, coef)
    gpu_time = time.time() - start_time
    print("---- %s seconds for gpu ----:" % (gpu_time))
    print(gpu_data_sum)
$ python t31.py
Calling python function...
Calling python function...
---- 0.806931018829 seconds for python ----:
[ 6.56164026-7.95271683j -7.42586899+3.68758106j 3.48999476+3.10376763j
..., 13.12746525+4.61855698j 0.08796659+0.9710322j
-6.54224586+4.89485168j]
---- 0.417661905289 seconds for gpu ----:
[ 6.56164074-7.95271683j -7.42586851+3.68758035j 3.48999500+3.10376763j
..., 13.12746525+4.61855745j 0.08796643+0.97103256j
-6.54224634+4.89485121j]
$
With these modifications, the GPU code appears to be about 2x faster than the CPU code.
This was measured on CUDA 9.2, Fedora 27, and a Quadro K2000 (a relatively small, slow GPU). I wouldn't read too much into this comparison: the CPU code is almost certainly not optimal either, and this is still a fairly small amount of work per output data point for GPU acceleration to be very interesting.
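If you want to see how the measured GPU time splits between data transfers and the kernel itself, one way (my addition, not part of the original answer) is to time the stages separately, calling cuda.synchronize() so the asynchronous kernel launch has finished before the clock is read. This sketch assumes it runs right after the script above, with data, data_index and coef still in scope:
# separate timing of host-to-device copies, the kernel, and the device-to-host copy
npart, npts = data_index.shape
threadsperblock = 64
blockspergrid = int(math.ceil(npts / threadsperblock)) + 1
t0 = time.time()
d_data = cuda.to_device(data)
d_data_index = cuda.to_device(data_index)
d_coef = cuda.to_device(coef)
d_datasum = cuda.device_array(npts, np.complex64)
cuda.synchronize()
t1 = time.time()
cuda_kernel[blockspergrid, threadsperblock](npart, npts, d_data, d_data_index, d_coef, d_datasum)
cuda.synchronize()
t2 = time.time()
result = d_datasum.copy_to_host()
t3 = time.time()
print("copy in: %s s, kernel: %s s, copy out: %s s" % (t1 - t0, t2 - t1, t3 - t2))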
edited Jan 20 at 15:05
answered Jan 19 at 2:44
Robert Crovella
Thanks Robert. This was a small piece of a larger program. The actual data size is larger and the kernel is more compute-intensive, so I think I can get a good speedup from the GPU. Thanks for your comment, it was a great help. I have one question: in your code, each thread performs the calculation for all the elements in one column through the for loop. Is there any particular reason, other than keeping the kernel code simple, for not having a thread per element of the result matrix? Wouldn't that improve the performance even further?
– Ali Jooya
Jan 21 at 5:03
Then you have to use a column-wise parallel reduction. It's doable, of course, but the code complexity goes up and it may not yield much if any performance improvement. If you have enough columns to work on, the loop-per-column method is a better approach.
– Robert Crovella
Jan 21 at 6:33
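For illustration, here is a rough sketch of my own of what a thread-per-element version might look like; it is an assumption, and it uses atomics rather than the column-wise parallel reduction mentioned above. Complex atomics aren't available in numba CUDA, so the real and imaginary parts are accumulated in separate float32 arrays, and the inputs are cast to complex64/float32 so the atomically added values match the accumulator dtype:
import math
import numpy as np
from numba import cuda

@cuda.jit
def cuda_kernel_atomic(npart, npts, d_data, d_data_index, d_coef, d_sum_real, d_sum_imag):
    # one thread per (row, col) element of the permuted, scaled matrix
    row, col = cuda.grid(2)
    if row < npart and col < npts:
        val = d_data[d_data_index[row, col]] * d_coef[row, col]
        cuda.atomic.add(d_sum_real, col, val.real)
        cuda.atomic.add(d_sum_imag, col, val.imag)

def calculate_cuda_atomic(data, data_index, coef):
    npart, npts = data_index.shape
    d_data = cuda.to_device(data.astype(np.complex64))
    d_data_index = cuda.to_device(data_index)
    d_coef = cuda.to_device(coef.astype(np.float32))
    d_sum_real = cuda.to_device(np.zeros(npts, dtype=np.float32))
    d_sum_imag = cuda.to_device(np.zeros(npts, dtype=np.float32))
    threadsperblock = (8, 32)
    blockspergrid = (int(math.ceil(npart / float(threadsperblock[0]))),
                     int(math.ceil(npts / float(threadsperblock[1]))))
    cuda_kernel_atomic[blockspergrid, threadsperblock](npart, npts, d_data, d_data_index, d_coef, d_sum_real, d_sum_imag)
    return d_sum_real.copy_to_host() + 1j * d_sum_imag.copy_to_host()
Whether this beats the loop-per-column kernel depends on how much the atomic updates to each column contend (the npart adds per output element are serialized), which is part of why the simpler approach tends to be a good default when there are enough columns.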
Thanks Robert. As you mentioned, the timing of the first run on the GPU is larger than the next run (about 3x). Since I am calling the kernel just once in my application, I have to take into account the timing I get from the first run of the benchmark, right? Is there any way to reduce this overhead? Also, you mentioned that my python code is not optimized; any optimization suggestions?
– Ali Jooya
Jan 21 at 21:16
I'm not aware of anything to make the first call shorter. For optimization of the CPU code, you could use numba again. I don't have any specific suggestions. I merely suggested that the CPU code might not be completely optimal.
– Robert Crovella
Jan 22 at 0:40
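As one hedged illustration of the "use numba again" suggestion (my sketch, not something from the answer or comments), the CPU computation can be compiled with numba's @njit, replacing the per-row temporary arrays with explicit loops:
import numpy as np
from numba import njit

@njit
def calculate_python_njit(data, data_index, coef):
    # same column-sum computation as calculate_python, written as explicit
    # loops that numba can compile to machine code
    npart, npts = data_index.shape
    data_sum = np.zeros(npts, dtype=np.complex64)
    for i in range(npart):
        for j in range(npts):
            data_sum[j] += data[data_index[i, j]] * coef[i, j]
    return data_sum
As with the GPU version, the first call pays the compilation cost, so time a second call.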