#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
/**
 * Integer matrix product C = A * B, one output element per thread.
 *
 * All three matrices are indexed as dense row-major arrays:
 *   dA holds m x k ints, dB holds k x n ints, dC receives m x n ints.
 *
 * Launch layout: a 2D grid of 2D blocks in which the x dimension spans the
 * rows (m) and the y dimension spans the columns (n). The grid must cover at
 * least m threads in x and n threads in y; surplus threads exit immediately.
 * No shared memory is used.
 *
 * dC must not overlap dA or dB (the pointers are declared __restrict__).
 *
 * NOTE(review): because threadIdx.x selects the row, threads that are adjacent
 * in x write dC elements that are n ints apart. Taking the column from x would
 * make those accesses contiguous, but the launch grid would have to be swapped
 * at every call site at the same time, so the mapping is left as-is here.
 *
 * @param dA  device pointer to A (m * k elements, read-only)
 * @param dB  device pointer to B (k * n elements, read-only)
 * @param dC  device pointer to C (m * n elements, written)
 * @param m   number of rows of A and C
 * @param n   number of columns of B and C
 * @param k   number of columns of A / rows of B
 */
__global__ void matMul(const int* __restrict__ dA, const int* __restrict__ dB, int* __restrict__ dC, int m, int n, int k)
{
    const int row = blockDim.x * blockIdx.x + threadIdx.x;
    const int col = blockDim.y * blockIdx.y + threadIdx.y;

    // The grid rarely divides the matrix evenly; threads past the edge do nothing.
    if (row >= m || col >= n)
    {
        return;
    }

    // Offsets are formed in size_t so that row * k, i * n and row * n cannot
    // overflow 32-bit int when a matrix has more than 2^31 elements.
    const size_t aBase = static_cast<size_t>(row) * k;
    const size_t stepB = static_cast<size_t>(n);

    int sum = 0;
    for (int i = 0; i < k; ++i)
    {
        sum += dA[aBase + i] * dB[stepB * i + col];
    }

    dC[static_cast<size_t>(row) * n + col] = sum;
}
// Terminates the program with a diagnostic when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host reference product: C = A * B for row-major A (m x k), B (k x n), C (m x n).
static void matMulHost(const int* A, const int* B, int* C, int m, int n, int k)
{
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < n; ++j)
        {
            int sum = 0;
            for (int d = 0; d < k; ++d)
            {
                sum += A[static_cast<size_t>(i) * k + d] * B[static_cast<size_t>(d) * n + j];
            }
            C[static_cast<size_t>(i) * n + j] = sum;
        }
    }
}

/**
 * Multiplies two random integer matrices on the GPU and on the CPU, compares
 * the results element by element, and prints the time spent in each phase.
 *
 * Usage: prog [m n k]
 *   A is m x k, B is k x n, C is m x n. With fewer than three arguments the
 *   defaults m=500, n=600, k=700 are used.
 *
 * Returns 0 on completion, 1 if the dimensions are not positive. Any CUDA
 * runtime failure aborts the process with EXIT_FAILURE.
 */
int main(int argc, char** argv)
{
    int m = 500;
    int n = 600;
    int k = 700;

    // All three dimensions must be present before argv[3] may be read.
    if (argc >= 4)
    {
        m = atoi(argv[1]);
        n = atoi(argv[2]);
        k = atoi(argv[3]);
    }
    else if (argc > 1)
    {
        fprintf(stderr, "expected three dimensions (m n k); using defaults\n");
    }

    if (m <= 0 || n <= 0 || k <= 0)
    {
        fprintf(stderr, "matrix dimensions must be positive (got m=%d n=%d k=%d)\n", m, n, k);
        return 1;
    }

    printf("matrix size A(%d, %d), B(%d, %d), C(%d, %d)\n", m, k, k, n, m, n);

    // Element and byte counts are computed in size_t so that m * k etc. cannot
    // overflow int before being widened.
    const size_t countA = static_cast<size_t>(m) * k;
    const size_t countB = static_cast<size_t>(k) * n;
    const size_t countC = static_cast<size_t>(m) * n;
    const size_t bytesA = countA * sizeof(int);
    const size_t bytesB = countB * sizeof(int);
    const size_t bytesC = countC * sizeof(int);

    int* A = new int[countA];
    int* B = new int[countB];
    int* C = new int[countC];   // GPU result
    int* C2 = new int[countC];  // CPU reference result

    for (size_t i = 0; i < countA; ++i)
    {
        A[i] = rand() % 100;
    }
    for (size_t i = 0; i < countB; ++i)
    {
        B[i] = rand() % 100;
    }

    // No cudaMemset is needed: dA and dB are fully overwritten by the copies
    // below, and the kernel writes every element of dC.
    int* dA = nullptr;
    int* dB = nullptr;
    int* dC = nullptr;
    checkCuda(cudaMalloc(&dA, bytesA), "cudaMalloc(dA)");
    checkCuda(cudaMalloc(&dB, bytesB), "cudaMalloc(dB)");
    checkCuda(cudaMalloc(&dC, bytesC), "cudaMalloc(dC)");

    // steady_clock is monotonic, so intervals are immune to wall-clock adjustments.
    using Clock = std::chrono::steady_clock;
    using Millis = std::chrono::duration<double, std::milli>;

    Clock::time_point start = Clock::now();
    checkCuda(cudaMemcpy(dA, A, bytesA, cudaMemcpyHostToDevice), "cudaMemcpy(A -> dA)");
    checkCuda(cudaMemcpy(dB, B, bytesB, cudaMemcpyHostToDevice), "cudaMemcpy(B -> dB)");
    const Millis time_memcpyToDevice = Clock::now() - start;

    // The kernel takes the row from the x dimension and the column from y, so
    // the grid spans m in x and n in y. Integer ceil-division avoids the
    // precision loss of the float-based ceil for large dimensions.
    const int blockSize = 32;
    const int gridX = (m - 1) / blockSize + 1;
    const int gridY = (n - 1) / blockSize + 1;
    const dim3 grid(gridX, gridY);
    const dim3 block(blockSize, blockSize);
    printf("Grid(%d, %d), Block(%d, %d)\n", gridX, gridY, blockSize, blockSize);

    start = Clock::now();
    matMul<<<grid, block>>>(dA, dB, dC, m, n, k);
    checkCuda(cudaGetLastError(), "matMul launch");           // invalid launch configuration
    checkCuda(cudaDeviceSynchronize(), "matMul execution");   // faults raised while running
    const Millis time_matMulGPU = Clock::now() - start;

    start = Clock::now();
    checkCuda(cudaMemcpy(C, dC, bytesC, cudaMemcpyDeviceToHost), "cudaMemcpy(dC -> C)");
    const Millis time_memcpyToHost = Clock::now() - start;

    start = Clock::now();
    matMulHost(A, B, C2, m, n, k);
    const Millis time_matMulCPU = Clock::now() - start;

    bool matrixCompare = true;
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < n; ++j)
        {
            const size_t idx = static_cast<size_t>(i) * n + j;
            if (C[idx] != C2[idx])
            {
                printf("wrong value at (%d, %d) C1 = %d C2 = %d\n", i, j, C[idx], C2[idx]);
                matrixCompare = false;
            }
        }
    }
    if (matrixCompare)
    {
        printf("matmul results using CPU and GPU are matched\n");
    }

    // .count() yields the plain double that %lf expects; passing the duration
    // object itself through varargs is not valid.
    printf("memcpyToDevice : %lf(ms)\nmemcpyToHost : %lf(ms)\nmatMul GPU : %lf(ms)\nmatMul CPU : %lf(ms)", time_memcpyToDevice.count(), time_memcpyToHost.count(), time_matMulGPU.count(), time_matMulCPU.count());

    checkCuda(cudaFree(dA), "cudaFree(dA)");
    checkCuda(cudaFree(dB), "cudaFree(dB)");
    checkCuda(cudaFree(dC), "cudaFree(dC)");

    // Each array needs its own delete[]; `delete[] A, B, C, C2;` is a comma
    // expression that releases only A.
    delete[] A;
    delete[] B;
    delete[] C;
    delete[] C2;
    return 0;
}