CUDA Programming 행렬곱 예제

haeryong·2023년 5월 27일
0

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <chrono>
#include <cmath>

#include <stdio.h>
#include <stdlib.h>


// Naive integer matrix multiplication: C = A * B, one thread per element of C.
//
// All matrices are dense and row-major:
//   dA : m x k (read-only)
//   dB : k x n (read-only)
//   dC : m x n (written; every in-range element is overwritten)
//
// Launch layout expected by the indexing below: a 2D grid of 2D blocks where
// the x dimension covers the m rows and the y dimension covers the n columns.
// Threads that fall outside the m x n output return without touching memory,
// so the grid may overshoot the matrix. No shared memory is used.
//
// NOTE(review): because the row index comes from threadIdx.x, threads with
// adjacent threadIdx.x address dA and dC with strides of k and n elements.
// The mapping is kept as-is because the launch configuration depends on it.
__global__ void matMul(const int* __restrict__ dA, const int* __restrict__ dB, int* __restrict__ dC, int m, int n, int k)
{
	const int row = blockDim.x * blockIdx.x + threadIdx.x;
	const int col = blockDim.y * blockIdx.y + threadIdx.y;

	// Guard first: the grid rarely divides the matrix evenly.
	if (row >= m || col >= n)
	{
		return;
	}

	// Flat offsets are computed in size_t so that matrices with more than
	// 2^31 elements do not overflow 32-bit int arithmetic.
	const size_t rowOffsetA = static_cast<size_t>(row) * k;

	int sum = 0;
	for (int i = 0; i < k; ++i)
	{
		sum += dA[rowOffsetA + i] * dB[static_cast<size_t>(i) * n + col];
	}
	dC[static_cast<size_t>(row) * n + col] = sum;
}

// Terminates the program with a diagnostic when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* expr, int line)
{
	if (status != cudaSuccess)
	{
		fprintf(stderr, "CUDA error at line %d: %s -> %s\n", line, expr, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

// Wraps a CUDA runtime call so its return code is never silently ignored.
#define CUDA_CHECK(call) checkCuda((call), #call, __LINE__)

// Multiplies two random integer matrices, A (m x k) and B (k x n), on the GPU
// and on the CPU, checks that both results agree, and prints the time spent
// in the host-to-device copies, the kernel, the device-to-host copy and the
// CPU reference.
//
// Usage: prog [m n k]
//   With fewer than three arguments the default sizes 500, 600, 700 are used.
int main(int argc, char** argv)
{
	typedef std::chrono::steady_clock Clock;                     // monotonic; suited to interval timing
	typedef std::chrono::duration<double, std::milli> Millis;

	int m = 500;
	int n = 600;
	int k = 700;
	if (argc >= 4)
	{
		// All three sizes must be present before argv[3] may be read.
		m = atoi(argv[1]);
		n = atoi(argv[2]);
		k = atoi(argv[3]);
	}
	else if (argc > 1)
	{
		fprintf(stderr, "usage: %s [m n k] (expected all three sizes, using defaults)\n", argv[0]);
	}

	if (m <= 0 || n <= 0 || k <= 0)
	{
		fprintf(stderr, "matrix dimensions must be positive integers (got m=%d, n=%d, k=%d)\n", m, n, k);
		return EXIT_FAILURE;
	}

	printf("matrix size A(%d, %d), B(%d, %d), C(%d, %d)\n", m, k, k, n, m, n);

	// Element and byte counts are computed in size_t to avoid int overflow.
	const size_t countA = static_cast<size_t>(m) * k;
	const size_t countB = static_cast<size_t>(k) * n;
	const size_t countC = static_cast<size_t>(m) * n;
	const size_t bytesA = countA * sizeof(int);
	const size_t bytesB = countB * sizeof(int);
	const size_t bytesC = countC * sizeof(int);

	int* A = new int[countA];
	int* B = new int[countB];
	int* C = new int[countC];   // GPU result
	int* C2 = new int[countC];  // CPU reference result
	for (size_t i = 0; i < countA; ++i)
	{
		A[i] = rand() % 100;
	}

	for (size_t i = 0; i < countB; ++i)
	{
		B[i] = rand() % 100;
	}

	int* dA = NULL;
	int* dB = NULL;
	int* dC = NULL;
	CUDA_CHECK(cudaMalloc(&dA, bytesA));
	CUDA_CHECK(cudaMalloc(&dB, bytesB));
	CUDA_CHECK(cudaMalloc(&dC, bytesC));
	// dA and dB are fully overwritten by the copies below; only dC is zeroed.
	CUDA_CHECK(cudaMemset(dC, 0, bytesC));

	Clock::time_point start = Clock::now();
	CUDA_CHECK(cudaMemcpy(dA, A, bytesA, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(dB, B, bytesB, cudaMemcpyHostToDevice));
	const Millis time_memcpyToDevice = Clock::now() - start;

	// The kernel maps the x dimension to rows (m) and the y dimension to
	// columns (n). Integer ceil-division avoids float rounding on large sizes.
	const int blockSize = 32;
	const dim3 block(blockSize, blockSize);
	const dim3 grid((m - 1) / blockSize + 1, (n - 1) / blockSize + 1);
	printf("Grid(%d, %d), Block(%d, %d)\n",
		static_cast<int>(grid.x), static_cast<int>(grid.y),
		static_cast<int>(block.x), static_cast<int>(block.y));

	start = Clock::now();
	matMul <<<grid, block >>> (dA, dB, dC, m, n, k);
	CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
	CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran
	const Millis time_matMulGPU = Clock::now() - start;

	start = Clock::now();
	CUDA_CHECK(cudaMemcpy(C, dC, bytesC, cudaMemcpyDeviceToHost));
	const Millis time_memcpyToHost = Clock::now() - start;

	// CPU reference using the same row-major layout as the kernel.
	start = Clock::now();
	for (int i = 0; i < m; ++i)
	{
		for (int j = 0; j < n; ++j)
		{
			int sum = 0;
			for (int d = 0; d < k; ++d)
			{
				sum += A[static_cast<size_t>(i) * k + d] * B[static_cast<size_t>(d) * n + j];
			}
			C2[static_cast<size_t>(i) * n + j] = sum;
		}
	}
	const Millis time_matMulCPU = Clock::now() - start;

	bool matrixCompare = true;
	for (int i = 0; i < m; ++i)
	{
		for (int j = 0; j < n; ++j)
		{
			const size_t idx = static_cast<size_t>(i) * n + j;
			if (C[idx] != C2[idx])
			{
				printf("wrong value at (%d, %d) C1 = %d C2 = %d\n", i, j, C[idx], C2[idx]);
				matrixCompare = false;
			}
		}
	}
	if (matrixCompare == true)
	{
		printf("matmul results using CPU and GPU are matched\n");
	}

	// count() yields a plain double; a duration object must not be passed
	// through printf's variadic argument list.
	printf("memcpyToDevice : %lf(ms)\nmemcpyToHost : %lf(ms)\nmatMul GPU : %lf(ms)\nmatMul CPU : %lf(ms)", time_memcpyToDevice.count(), time_memcpyToHost.count(), time_matMulGPU.count(), time_matMulCPU.count());

	CUDA_CHECK(cudaFree(dA));
	CUDA_CHECK(cudaFree(dB));
	CUDA_CHECK(cudaFree(dC));

	// Each array needs its own delete[]; "delete[] A, B, C, C2" is a comma
	// expression that releases only A.
	delete[] A;
	delete[] B;
	delete[] C;
	delete[] C2;
	return 0;
}

0개의 댓글