- 6. óra -
Hidrodinamikai Rendszerek Tanszék
// Problem dimensions: VECTOR_N vector pairs, each vector holding ELEMENT_N floats.
#define VECTOR_N 1024
#define ELEMENT_N 256
// Total number of float elements in one input array.
const int DATA_N = VECTOR_N * ELEMENT_N;
// Size in bytes of one input array.
const int DATA_SZ = DATA_N * sizeof(float);
// Size in bytes of the result array: one float per vector pair.
const int RESULT_SZ = VECTOR_N * sizeof(float);
...
// Device pointers: d_A and d_B are the kernel inputs, d_C receives the results.
float *d_A, *d_B, *d_C;
...
// Allocate device buffers: DATA_SZ bytes for each input, RESULT_SZ bytes for the output.
// NOTE(review): the cudaMalloc return codes are not checked in this excerpt.
cudaMalloc((void**)&d_A, DATA_SZ);
cudaMalloc((void**)&d_B, DATA_SZ);
cudaMalloc((void**)&d_C, RESULT_SZ);
...
// Launch VECTOR_N blocks of ELEMENT_N threads each; the kernel writes one
// element of d_C per block.
scalarProd<<<VECTOR_N, ELEMENT_N>>>(d_C, d_A, d_B);
/*
 * Computes one scalar (dot) product per thread block.
 *
 * Block b multiplies the ELEMENT_N-element slices of d_A and d_B that start
 * at offset ELEMENT_N * b element by element, sums the products with a
 * shared-memory tree reduction, and stores the sum in d_C[b].
 *
 * Launch requirements:
 *   - blockDim.x must equal ELEMENT_N (one thread per element of the slice);
 *   - ELEMENT_N must be a power of two, otherwise the halving reduction
 *     skips elements;
 *   - d_A and d_B must hold at least gridDim.x * ELEMENT_N floats and d_C at
 *     least gridDim.x floats.
 *
 * Parameters:
 *   d_C  output, one float per block
 *   d_A  first input array (read only)
 *   d_B  second input array (read only)
 */
__global__ void scalarProd(float *d_C, float *d_A, float *d_B)
{
    // One partial result per thread; reduced in place below.
    __shared__ float accumResult[ELEMENT_N];

    // Start of this block's slice in each input array.
    const float *A = d_A + ELEMENT_N * blockIdx.x;
    const float *B = d_B + ELEMENT_N * blockIdx.x;
    const int tx = threadIdx.x;

    accumResult[tx] = A[tx] * B[tx];

    // Tree reduction: each step folds the upper half of the active range
    // onto the lower half.
    for (int stride = ELEMENT_N / 2; stride > 0; stride >>= 1)
    {
        // Barrier sits outside the branch so every thread reaches it; it makes
        // the previous step's writes visible before they are read here.
        __syncthreads();
        if (tx < stride)
            accumResult[tx] += accumResult[stride + tx];
    }

    // Only thread 0 stores the result: it performed the final add itself, so
    // it sees the finished sum without another barrier. Letting every thread
    // store could overwrite d_C with a stale accumResult[0].
    if (tx == 0)
        d_C[blockIdx.x] = accumResult[0];
}