I'm getting started with CUDA and have a question. I have an n×n matrix and a window of size 8×8. I want to subdivide the matrix into multiple sub-matrices and find the max value of each. For example, if I have a 64×64 matrix, I get an 8×8 grid of small 8×8 sub-matrices and want to find the max value of each one. When I save the max values to a new array, the order changes. I want a solution that keeps them in the right order.
// Computes the maximum of each windows_size x windows_size window of `emap`
// and writes the per-window maxima to `emax` in row-major window order:
//   emax[window_row * num_col_block + window_col]
// so the output order matches the window layout in the image.
//
// Launch: any 2-D grid whose threads cover at least
// (img_width/windows_size) x (img_height/windows_size) positions
// (e.g. the 16x16-block grid over the full image used by construct_emax).
// Assumes img_height and img_width are multiples of windows_size.
//
// Design note: one thread per WINDOW, scanning its window serially.
// The original one-thread-per-pixel version called __syncthreads() inside
// divergent branches (after an early return), reset a shared counter
// non-atomically, and indexed rows with img_height instead of img_width;
// this form is simple and race-free.
__global__ void calculate_emax_kernel(float emap[], float emax[], int img_height, int img_width, int windows_size) {
    const int x_index = blockIdx.x * blockDim.x + threadIdx.x;  // window column
    const int y_index = blockIdx.y * blockDim.y + threadIdx.y;  // window row
    const int num_row_block = img_height / windows_size;        // windows per column
    const int num_col_block = img_width / windows_size;         // windows per row

    // Guard: only one thread per window does any work.
    if (x_index >= num_col_block || y_index >= num_row_block) return;

    const int row0 = y_index * windows_size;  // top-left pixel of this window
    const int col0 = x_index * windows_size;

    // Seed with the first element so no sentinel constant is needed.
    float win_max = emap[row0 * img_width + col0];
    for (int r = 0; r < windows_size; r++) {
        for (int c = 0; c < windows_size; c++) {
            // Row-major image: the row stride is img_width (NOT img_height).
            win_max = fmaxf(win_max, emap[(row0 + r) * img_width + (col0 + c)]);
        }
    }
    // Row-major window index keeps the maxima in the right order.
    emax[y_index * num_col_block + x_index] = win_max;
}
This is my launch configuration:
// Host wrapper: uploads `input` (img_height x img_width, row-major),
// launches calculate_emax_kernel with a 16x16 block grid covering the image,
// and copies the per-window maxima back into `output`.
// `output` must hold at least (img_height/4) * (img_width/4) floats
// (windows_size is fixed at 4 here).
void construct_emax(float *input, float *output, int img_height, int img_width) {
    const int windows_size = 4;
    const int num_row_block = img_height / windows_size;
    const int num_col_block = img_width / windows_size;
    const size_t in_bytes  = (size_t)img_width * img_height * sizeof(float);
    // Full-image allocation is generous (only num_row_block*num_col_block
    // elements are written) but is safe for any kernel indexing scheme.
    const size_t out_bytes = in_bytes;
    const size_t result_bytes = (size_t)num_row_block * num_col_block * sizeof(float);

    float *d_input = NULL, *d_output = NULL;
    if (cudaMalloc(&d_input, in_bytes) != cudaSuccess ||
        cudaMalloc(&d_output, out_bytes) != cudaSuccess) {
        fprintf(stderr, "construct_emax: cudaMalloc failed\n");
        cudaFree(d_input);   // cudaFree(NULL) is a no-op
        cudaFree(d_output);
        return;
    }
    cudaMemcpy(d_input, input, in_bytes, cudaMemcpyHostToDevice);

    dim3 blocksize(16, 16);
    dim3 gridsize((img_width  + blocksize.x - 1) / blocksize.x,
                  (img_height + blocksize.y - 1) / blocksize.y);
    calculate_emax_kernel<<<gridsize, blocksize>>>(d_input, d_output,
                                                   img_height, img_width,
                                                   windows_size);
    // Catch launch-configuration errors immediately.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "construct_emax: kernel launch failed: %s\n",
                cudaGetErrorString(err));
    } else {
        // Blocking copy also synchronizes with the kernel;
        // the original never copied the result back at all.
        cudaMemcpy(output, d_output, result_bytes, cudaMemcpyDeviceToHost);
    }
    // The original leaked both device allocations.
    cudaFree(d_input);
    cudaFree(d_output);
}
With CUDA, parallel reduction is tricky; segmented parallel reduction is trickier. Now you are doing it in 2-D, and your segment/window is smaller than the thread block.
For a large window size, I don't think it is a problem. You can use one thread block to reduce one window. For example, if you have a 16×16 window, you can use a 16×16 thread block. If you have an even larger window size, for example 64×64, you can still use a 16×16 thread block: first reduce the 64×64 window to 16×16 elements during data loading, then reduce those to one scalar within the thread block.
For a window size smaller than the block size, you have to reduce multiple windows per thread block for higher performance. With your current block/grid configuration, each 256-thread (16×16) block is responsible for 16 4×4 windows. But that is not optimal, because each 32-thread warp is organized in two parts (2×16). That is bad for coalesced global memory access, and it is hard to map a 2×16 warp to one or more 4×4 windows for an efficient parallel reduction.
Alternatively, I would suggest using a 1-D thread block with 256 threads, where every m threads reduce one m×m window, and using a 2-D grid to cover the whole image.
// Launch configuration: 1-D blocks of 256 threads; the grid's x dimension
// tiles the image width, and each grid row handles one band of m image rows
// (i.e. one row of m x m windows).
const int m = window_size; dim3 blocksize(256); dim3 gridsize((img_width+255)/256, (img_height+m-1)/m);
In the kernel function, you could: (1) reduce each m×m window to a 1×m vector during global data loading; (2) use the tree-reduction method to reduce each 1×m vector to a scalar.
The following code is a conceptual demo that works when m is a power of 2 and m ≤ 32. You could further modify it to support arbitrary m and add better boundary checking.
#include <assert.h>
#include <cuda.h>
#include <thrust/device_vector.h>

// Reduces each win_size x win_size window of `input` (height x width,
// row-major) to its maximum, writing results to `output`, one row of
// window-maxima per band of win_size image rows (row stride `out_width`).
//
// Launch: 1-D blocks of 256 threads; gridDim.x tiles the image width,
// gridDim.y has one entry per band of win_size rows.
// Preconditions: win_size is a power of two, win_size <= 32, and
// 256 % win_size == 0 (so windows never straddle a block boundary).
__global__ void calculate_emax_kernel(const float* input, float* output,
                                      int height, int width, int win_size,
                                      int out_width) {
    const int tid = threadIdx.x;
    const int i = blockIdx.y * win_size;   // first image row of this band
    const int j = blockIdx.x * 256 + tid;  // image column for this thread
    const int win_id = j % win_size;       // lane within its window

    __shared__ float smax[256];

    // Step 1: column-wise max over the win_size rows of the band.
    // Adjacent threads read adjacent columns -> coalesced global loads.
    float tmax = -1e20f;  // float literal; -1e20 (double) forces promotion
    if (j < width) {
        for (int tile = 0; tile < win_size; tile++) {
            if (i + tile < height) {
                tmax = max(tmax, input[(i + tile) * width + j]);
            }
        }
    }
    smax[tid] = tmax;
    __syncthreads();  // publish loads before the shared-memory reduction

    // Step 2: tree-reduce each 1 x win_size vector to a scalar.
    // The loop bounds are uniform across the block, so the barrier inside
    // is safe; do not rely on implicit warp synchrony (removed on Volta+).
    for (int shift = win_size / 2; shift > 0; shift /= 2) {
        if (win_id < shift) {
            smax[tid] = max(smax[tid], smax[tid + shift]);
        }
        __syncthreads();
    }
    if (win_id == 0 && j < width) {
        output[blockIdx.y * out_width + (j / win_size)] = smax[tid];
    }
}

int main() {
    const int height = 1024;
    const int width = 1024;
    const int m = 4;  // window size

    thrust::device_vector<float> in(height * width);
    thrust::device_vector<float> out(
        ((height + m - 1) / m) * ((width + m - 1) / m));

    dim3 blocksize(256);
    dim3 gridsize((width + 255) / 256, (height + m - 1) / m);
    assert(m == 2 || m == 4 || m == 8 || m == 16 || m == 32);
    calculate_emax_kernel<<<gridsize, blocksize>>>(
        thrust::raw_pointer_cast(in.data()),
        thrust::raw_pointer_cast(out.data()),
        height, width, m, (width + m - 1) / m);
    // Surface launch-config and execution errors (the original checked nothing).
    assert(cudaGetLastError() == cudaSuccess);
    assert(cudaDeviceSynchronize() == cudaSuccess);
    return 0;
}
Comments
Post a Comment