// v0: baseline reduction, interleaved addressing with a modulo test.
// Grid layout: one partial sum per block, written to g_odata[blockIdx.x];
// a second pass (or host-side sum) finishes the reduction.
// Preconditions: blockDim.x == BLOCK_SIZE (sdata is statically sized),
// input length is an exact multiple of blockDim.x (no tail guard here).
// Fixes: `voidreduce_v0` / `unsignedint` missing spaces (compile errors).
__global__ void reduce_v0(float *g_idata, float *g_odata) {
    __shared__ float sdata[BLOCK_SIZE];

    // Each thread loads one element from global to shared memory.
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();

    // Interleaved reduction in shared memory. The `tid % (2*s)` test
    // leaves most lanes of each warp idle (heavy divergence) -- kept
    // as-is: this is the reference/teaching variant.
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        if (tid % (2 * s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();  // barrier is outside the divergent branch
    }

    // Thread 0 publishes this block's partial result.
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// v1: interleaved addressing, but with a strided index so that active
// threads are contiguous (removes the modulo-test warp divergence of v0;
// still bank-conflict prone). Same contract as v0: one partial sum per
// block into g_odata[blockIdx.x]; requires blockDim.x == BLOCK_SIZE, a
// power-of-two block size, and input length a multiple of blockDim.x.
// Fixes: `voidreduce_v1` / `unsignedint` missing spaces (compile errors).
__global__ void reduce_v1(float *g_idata, float *g_odata) {
    __shared__ float sdata[BLOCK_SIZE];

    // Each thread loads one element from global to shared memory.
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();

    // Reduction in shared memory: thread tid handles element 2*s*tid,
    // so the first blockDim.x/(2s) threads stay active and contiguous.
    // For power-of-two blockDim.x, index + s < blockDim.x whenever
    // index < blockDim.x, so the read below stays in bounds.
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        int index = 2 * s * tid;
        if (index < blockDim.x) {
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }

    // Thread 0 publishes this block's partial result.
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// v2: sequential addressing -- halve the stride each step so active
// threads read contiguous shared-memory words (no bank conflicts, no
// divergence until s < 32). Same contract as v0/v1: one partial sum per
// block; requires blockDim.x == BLOCK_SIZE, power-of-two block size,
// input length a multiple of blockDim.x.
// Fixes: `voidreduce_v2` / `unsignedint` missing spaces (compile errors).
__global__ void reduce_v2(float *g_idata, float *g_odata) {
    __shared__ float sdata[BLOCK_SIZE];

    // Each thread loads one element from global to shared memory.
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();

    // Tree reduction with sequential addressing: the low `s` threads
    // each fold in the element `s` positions above them.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();  // all threads reach this, branch or not
    }

    // Thread 0 publishes this block's partial result.
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// v3: first add during global load -- each block consumes 2*blockDim.x
// input elements, halving the number of blocks vs. v2 and keeping every
// thread busy on the first step. One partial sum per block into
// g_odata[blockIdx.x]. Preconditions: blockDim.x == BLOCK_SIZE,
// power-of-two block size, input length an exact multiple of
// 2*blockDim.x (both loads below are unguarded).
// Fixes: `voidreduce_v3` / `unsignedint` missing spaces (compile errors).
__global__ void reduce_v3(float *g_idata, float *g_odata) {
    __shared__ float sdata[BLOCK_SIZE];

    // Each thread sums a pair of elements one blockDim.x apart while
    // loading from global to shared memory.
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
    __syncthreads();

    // Sequential-addressing tree reduction (same as v2).
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // Thread 0 publishes this block's partial result.
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// v4: v3 plus an unrolled last warp -- once s <= 32 only one warp is
// active, so the tail of the tree is delegated to warpReduce() without
// per-step __syncthreads().
// NOTE(review): the kernel's opening line (`__global__ void reduce_v4(...)`
// with the sdata declaration) was missing from the source; reconstructed
// here to match the v3 pattern -- confirm against the original.
// NOTE(review): warpReduce's definition is not visible here; on pre-Volta
// it must take a `volatile float*` (or use __syncwarp on Volta+) to be
// race-free -- verify its implementation.
// Preconditions: blockDim.x == BLOCK_SIZE, power-of-two block size >= 64,
// input length an exact multiple of 2*blockDim.x.
__global__ void reduce_v4(float *g_idata, float *g_odata) {
    __shared__ float sdata[BLOCK_SIZE];

    // Each thread sums a pair of elements one blockDim.x apart while
    // loading from global to shared memory (same load as v3).
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
    __syncthreads();

    // Tree reduction in shared memory, stopping when a single warp
    // (s == 32 partials) remains.
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // Final 64 -> 1 reduction handled inside one warp.
    if (tid < 32) warpReduce(sdata, tid);

    // Thread 0 publishes this block's partial result.
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// v5: warp-shuffle reduction. Each thread loads two elements (block
// consumes 2*blockDim.x inputs), each warp reduces its partials in
// registers via warp_reduce_sum_f32, warp leaders stage one value in
// shared memory, and warp 0 folds the per-warp sums. One partial sum
// per block into g_odata[blockIdx.x].
// Preconditions: blockDim.x == BLOCK_SIZE.
// Fixes: `voidreduce_v5` / `constexprint` missing spaces (compile
// errors), and `n` was accepted but never used -- it now guards the
// two global loads, so a grid tail where idx or idx + blockDim.x
// reaches past the input no longer reads out of bounds (backward
// compatible: callers already pass n).
__global__ void reduce_v5(float *g_idata, float *g_odata, int n) {
    int tid = threadIdx.x;
    int idx = blockIdx.x * (blockDim.x * 2) + tid;
    constexpr int NUM_WARPS = (BLOCK_SIZE + WARP_SIZE - 1) / WARP_SIZE;
    __shared__ float reduce_smem[NUM_WARPS];

    // Keeping the running sum in a register is enough for warp ops;
    // out-of-range lanes contribute the additive identity.
    float sum = (idx < n ? g_idata[idx] : 0.0f) +
                (idx + blockDim.x < n ? g_idata[idx + blockDim.x] : 0.0f);

    int warp = tid / WARP_SIZE;
    int lane = tid % WARP_SIZE;

    // Intra-warp reduction in registers.
    sum = warp_reduce_sum_f32<WARP_SIZE>(sum);

    // Warp leaders stage their warp's sum in shared memory.
    if (lane == 0) reduce_smem[warp] = sum;
    __syncthreads();  // make sure all per-warp sums are visible

    // Warp 0 reduces the per-warp partials to the block total.
    sum = (lane < NUM_WARPS) ? reduce_smem[lane] : 0.0f;
    if (warp == 0) sum = warp_reduce_sum_f32<NUM_WARPS>(sum);
    if (tid == 0) g_odata[blockIdx.x] = sum;
}