Chapter 4

GPU Programming
with CUDA

A typical C++ program uses 0% of the GPU. Learning to use it requires understanding a completely different execution model — threads, warps, and memory coalescing.

Execution model

The CUDA thread hierarchy

GRID

All thread blocks for one kernel launch

BLOCK

Up to 1024 threads, share shared memory, can __syncthreads()

WARP

32 threads that execute in lockstep — the key scheduling unit

THREAD

Has own registers and local memory

// Launch configuration: grid of blocks, each with threads
dim3 block(256);                      // 256 threads per block
dim3 grid((n + 255) / 256);         // enough blocks for n elements
myKernel<<<grid, block>>>(args);    // launch!

// Inside kernel: identify which element this thread handles
__global__ void myKernel(float* data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] = process(data[i]);
}

GPU V0 → V4 case study

V0 — naive global memory access: Each thread reads from a random location in global memory. No coalescing — 32 threads in a warp access 32 different cache lines → 32 memory transactions instead of 1. Uses ~4% of GPU peak.

__global__ void myppKernel_v0(float* r, const float* d, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = threadIdx.y + blockIdx.y * blockDim.y;
    if (i >= n || j >= n) return;
    float v = HUGE_VALF;
    for (int k = 0; k < n; ++k)
        v = min(v, d[i*n + k] + d[k*n + j]); // d[k*n+j]: stride-n access ✗
    r[i*n + j] = v;
}

V1 — coalesced access: Threads in a warp access consecutive memory addresses → one 128-byte transaction serves 32 threads. Transpose d to t so both reads are coalesced. ~40× speedup over V0.

// Precompute transpose so both reads are row-major
__global__ void transposeKernel(float* t, const float* d, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = threadIdx.y + blockIdx.y * blockDim.y;
    if (i < n && j < n) t[j*n + i] = d[i*n + j];
}

__global__ void myppKernel_v1(float* r, const float* d,
                                const float* t, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = threadIdx.y + blockIdx.y * blockDim.y;
    if (i >= n || j >= n) return;
    float v = HUGE_VALF;
    for (int k = 0; k < n; ++k)
        v = min(v, d[i*n + k] + t[j*n + k]); // both coalesced ✓
    r[i*n + j] = v;
}

V2 — shared memory tiling: Load a BLOCK×BLOCK tile of d and t into shared memory (fast, ~32 cycles vs ~700 for global). Each global load serves BLOCK threads instead of 1. Classic GPU optimisation.

__global__ void myppKernel_v2(float* r, const float* d,
                                const float* t, int n) {
    __shared__ float ds[BLOCK][BLOCK]; // shared memory tile
    __shared__ float ts[BLOCK][BLOCK];

    int i = blockIdx.x*BLOCK + threadIdx.x;
    int j = blockIdx.y*BLOCK + threadIdx.y;
    float v = HUGE_VALF;

    for (int k0 = 0; k0 < n; k0 += BLOCK) {
        // Cooperatively load tile into shared mem
        ds[threadIdx.y][threadIdx.x] = d[i*n + k0 + threadIdx.y];
        ts[threadIdx.y][threadIdx.x] = t[j*n + k0 + threadIdx.x];
        __syncthreads(); // wait for all threads in block

        for (int k = 0; k < BLOCK; ++k)
            v = min(v, ds[k][threadIdx.x] + ts[k][threadIdx.y]);
        __syncthreads(); // before loading next tile
    }
    if (i < n && j < n) r[i*n + j] = v;
}

V3 — register reuse: Each thread computes multiple output elements (like V4 in the CPU case study). Load one row of d into registers once, use it for 4 output columns — 4× data reuse from shared memory.

// Each thread computes a 1×4 strip of output
__global__ void myppKernel_v3(float* r, const float* d,
                                const float* t, int n) {
    float v0=HUGE_VALF, v1=HUGE_VALF,
          v2=HUGE_VALF, v3=HUGE_VALF;
    int i = ...; int j0 = ...; // this thread handles row i, cols j0..j0+3

    for (int k = 0; k < n; ++k) {
        float x = d[i*n + k];  // loaded once, reused 4 times
        v0 = min(v0, x + t[j0*n + k]);
        v1 = min(v1, x + t[(j0+1)*n + k]);
        v2 = min(v2, x + t[(j0+2)*n + k]);
        v3 = min(v3, x + t[(j0+3)*n + k]);
    }
    r[i*n+j0]=v0; r[i*n+j0+1]=v1; r[i*n+j0+2]=v2; r[i*n+j0+3]=v3;
}

V4 — vectorised loads: Use float4 to load 4 consecutive floats in one instruction. Reduces instruction count, keeps memory bus saturated. Combined with V3's register reuse → near peak performance.

// float4 loads 4 floats in 1 instruction (128-bit load)
__global__ void myppKernel_v4(float* r, const float* d,
                                const float* t, int n) {
    float4 v = {HUGE_VALF,HUGE_VALF,HUGE_VALF,HUGE_VALF};

    for (int k = 0; k < n; k += 4) {
        float4 dx = reinterpret_cast<const float4*>(d + i*n + k)[0];
        float4 tx = reinterpret_cast<const float4*>(t + j*n + k)[0];
        v.x = min(v.x, dx.x + tx.x); v.y = min(v.y, dx.y + tx.y);
        v.z = min(v.z, dx.z + tx.z); v.w = min(v.w, dx.w + tx.w);
    }
    r[i*n + j] = min(min(v.x,v.y), min(v.z,v.w));
}

GPU V0→V4 performance (% of peak)

V0 naive

V1 coalesced

28%

V2 shared mem

55%

V3 reg reuse

72%

V4 vectorised

92% of peak

← Chapter 3 Exercises →

GPU Programmingwith CUDA

GPU Programming
with CUDA