
High Clock Frequency
Large Caches

Moderate Clock Frequency
Small Caches
CPUs for sequential parts where latency matters
GPUs for parallel parts where throughput wins
Parallelism Scalability

Algorithm Complexity and Data Scalability

Load Balance

The total amount of time to complete a parallel job is limited by the thread that takes the longest to finish

Array of Parallel Threads

A CUDA kernel is executed by a grid (an array) of threads
Thread Blocks: Scalable Cooperation

Divide thread array into multiple blocks
CUDA Device Memory Management API Functions

cudaMalloc()
cudaFree()
Host-Device Data Transfer API functions

cudaMemcpy()
#include <cuda.h>
// Host wrapper: computes h_C = h_A + h_B on the device.
// h_A, h_B, h_C are host arrays of n floats; h_C receives the result.
// Note: error checking of CUDA API return codes is omitted for brevity;
// production code should wrap each call (e.g. with a CUDA_CHECK macro).
void vecAdd(float* h_A, float* h_B, float* h_C, int n)
{
    int size = n * sizeof(float);          // bytes per vector
    float *d_A, *d_B, *d_C;                // device copies (fixed: comma was missing after *d_A)

    // Allocate device memory for A, B and C, and copy the inputs over.
    cudaMalloc((void **) &d_A, size);
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMalloc((void **) &d_B, size);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    cudaMalloc((void **) &d_C, size);      // output buffer, no host->device copy needed

    // Kernel invocation: one thread per element, 256 threads per block,
    // integer ceiling division so a partial final block covers the tail.
    vecAddKernel<<<(n + 255) / 256, 256>>>(d_A, d_B, d_C, n);

    // Blocking copy; also synchronizes with the kernel before reading d_C.
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Free device memory for A, B, C
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
}
Kernel Function
// Compute vector sum C = A + B
// Each thread performs one pair-wise addition.
// Expected launch: 1-D grid with at least n total threads
// (the caller uses ceil(n/256) blocks of 256 threads).
__global__
void vecAddKernel(float* d_A, float* d_B, float* d_C, int n)  // fixed: n was used below but missing from the signature
{
    // Flat global index for this thread.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Bounds check: the grid may have more threads than elements.
    if (i < n) d_C[i] = d_A[i] + d_B[i];
}

__global__ defines a kernel function
__device__ and __host__ can be used together
