1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
|
// Side length of the square matrices handled in this file.
// Values above 22 fail. NOTE(review): presumably because the kernel is
// launched as ONE block of MATRIX_WIDTH x MATRIX_WIDTH threads, and
// 23 * 23 = 529 exceeds a 512-threads-per-block device limit - confirm
// against the target GPU's maxThreadsPerBlock.
#define MATRIX_WIDTH 22
// Size in bytes of one MATRIX_WIDTH x MATRIX_WIDTH float matrix.
// Fully parenthesized so the macro expands safely inside larger expressions
// (e.g. `n / MEMSIZE` or `2 * MEMSIZE`).
#define MEMSIZE ( (MATRIX_WIDTH) * (MATRIX_WIDTH) * sizeof( float ) )
// Computes the matrix product R = A * B, where A, B and R are square
// MATRIX_WIDTH x MATRIX_WIDTH float matrices indexed as [row * MATRIX_WIDTH + col].
//
// Launch layout: each thread produces exactly one element of R, selected by
// (threadIdx.y, threadIdx.x) = (row, col). blockIdx is not used, so the kernel
// must be launched with a single block that spans at least
// MATRIX_WIDTH x MATRIX_WIDTH threads.
//
// A, B : device pointers to the input matrices (read only).
// R    : device pointer to the output matrix (every element is overwritten).
__global__ void multMatrixKernel( float* A, float* B, float* R )
{
    const unsigned int row = threadIdx.y;
    const unsigned int col = threadIdx.x;

    // Threads outside the matrix (possible if the block is larger than the
    // matrix) must not touch memory.
    if( row >= MATRIX_WIDTH || col >= MATRIX_WIDTH )
        return;

    // Dot product of row `row` of A with column `col` of B.
    float res = 0.0f;
    for( unsigned int i = 0; i < MATRIX_WIDTH; ++i )
    {
        float e1 = A[row * MATRIX_WIDTH + i];
        // The second operand comes from B; reading it from A here would
        // silently compute A * A and ignore B.
        float e2 = B[i * MATRIX_WIDTH + col];
        res += e1 * e2;
    }
    R[row * MATRIX_WIDTH + col] = res;
}
// Multiplies two MATRIX_WIDTH x MATRIX_WIDTH float matrices on the GPU and
// writes the product to the file "GPU.out" through writeMatrix().
//
// A, B : host pointers, each referring to MEMSIZE bytes of matrix data.
//
// Every allocation, copy and the kernel launch are checked. On the first
// failure the remaining GPU steps are skipped, the error is reported, and
// "GPU.out" is NOT written (the result buffer would contain garbage).
// All host and device buffers are released on every path.
void multMatrixGPU( float* A, float* B )
{
    float* dA = NULL;
    float* dB = NULL;
    float* dR = NULL;

    float* R = (float*) malloc( MEMSIZE );
    if( R == NULL )
    {
        printf( "multMatrixGPU: host allocation failed\n" );
        return;
    }

    // Each step runs only if every previous one succeeded, so `err` always
    // holds the first failure.
    cudaError_t err = cudaMalloc( (void**) &dA, MEMSIZE );
    if( err == cudaSuccess )
        err = cudaMalloc( (void**) &dB, MEMSIZE );
    if( err == cudaSuccess )
        err = cudaMalloc( (void**) &dR, MEMSIZE );
    if( err == cudaSuccess )
        err = cudaMemset( dR, 0x00, MEMSIZE );
    if( err == cudaSuccess )
        err = cudaMemcpy( dA, A, MEMSIZE, cudaMemcpyHostToDevice );
    if( err == cudaSuccess )
        err = cudaMemcpy( dB, B, MEMSIZE, cudaMemcpyHostToDevice );

    if( err == cudaSuccess )
    {
        // Single block, one thread per output element.
        multMatrixKernel<<<1, dim3( MATRIX_WIDTH, MATRIX_WIDTH )>>>( dA, dB, dR );
        // A kernel launch returns nothing; an invalid launch configuration
        // (e.g. too many threads per block) is only visible here.
        err = cudaGetLastError();
    }

    // The blocking copy waits for the kernel and also reports any error
    // raised while the kernel was executing.
    if( err == cudaSuccess )
        err = cudaMemcpy( R, dR, MEMSIZE, cudaMemcpyDeviceToHost );

    // cudaFree() on a null pointer performs no operation, so these calls are
    // safe even when an allocation above failed.
    cudaFree( dA );
    cudaFree( dB );
    cudaFree( dR );

    if( err == cudaSuccess )
        writeMatrix( "GPU.out", R );
    else
        printf( "multMatrixGPU: CUDA error: %s\n", cudaGetErrorString( err ) );

    free( R );
}
Partager