A typical C++ program uses 0% of the GPU. Learning to use it requires understanding a completely different execution model — threads, warps, and memory coalescing.
// Launch configuration: grid of blocks, each with threads dim3 block(256); // 256 threads per block dim3 grid((n + 255) / 256); // enough blocks for n elements myKernel<<<grid, block>>>(args); // launch! // Inside kernel: identify which element this thread handles __global__ void myKernel(float* data, int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) data[i] = process(data[i]); }
__global__ void myppKernel_v0(float* r, const float* d, int n) { int i = threadIdx.x + blockIdx.x * blockDim.x; int j = threadIdx.y + blockIdx.y * blockDim.y; if (i >= n || j >= n) return; float v = HUGE_VALF; for (int k = 0; k < n; ++k) v = min(v, d[i*n + k] + d[k*n + j]); // d[k*n+j]: stride-n access ✗ r[i*n + j] = v; }
// Precompute transpose so both reads are row-major __global__ void transposeKernel(float* t, const float* d, int n) { int i = threadIdx.x + blockIdx.x * blockDim.x; int j = threadIdx.y + blockIdx.y * blockDim.y; if (i < n && j < n) t[j*n + i] = d[i*n + j]; } __global__ void myppKernel_v1(float* r, const float* d, const float* t, int n) { int i = threadIdx.x + blockIdx.x * blockDim.x; int j = threadIdx.y + blockIdx.y * blockDim.y; if (i >= n || j >= n) return; float v = HUGE_VALF; for (int k = 0; k < n; ++k) v = min(v, d[i*n + k] + t[j*n + k]); // both coalesced ✓ r[i*n + j] = v; }
__global__ void myppKernel_v2(float* r, const float* d, const float* t, int n) { __shared__ float ds[BLOCK][BLOCK]; // shared memory tile __shared__ float ts[BLOCK][BLOCK]; int i = blockIdx.x*BLOCK + threadIdx.x; int j = blockIdx.y*BLOCK + threadIdx.y; float v = HUGE_VALF; for (int k0 = 0; k0 < n; k0 += BLOCK) { // Cooperatively load tile into shared mem ds[threadIdx.y][threadIdx.x] = d[i*n + k0 + threadIdx.y]; ts[threadIdx.y][threadIdx.x] = t[j*n + k0 + threadIdx.x]; __syncthreads(); // wait for all threads in block for (int k = 0; k < BLOCK; ++k) v = min(v, ds[k][threadIdx.x] + ts[k][threadIdx.y]); __syncthreads(); // before loading next tile } if (i < n && j < n) r[i*n + j] = v; }
// Each thread computes a 1×4 strip of output __global__ void myppKernel_v3(float* r, const float* d, const float* t, int n) { float v0=HUGE_VALF, v1=HUGE_VALF, v2=HUGE_VALF, v3=HUGE_VALF; int i = ...; int j0 = ...; // this thread handles row i, cols j0..j0+3 for (int k = 0; k < n; ++k) { float x = d[i*n + k]; // loaded once, reused 4 times v0 = min(v0, x + t[j0*n + k]); v1 = min(v1, x + t[(j0+1)*n + k]); v2 = min(v2, x + t[(j0+2)*n + k]); v3 = min(v3, x + t[(j0+3)*n + k]); } r[i*n+j0]=v0; r[i*n+j0+1]=v1; r[i*n+j0+2]=v2; r[i*n+j0+3]=v3; }
float4 to load 4 consecutive floats in one instruction. Reduces instruction count, keeps memory bus saturated. Combined with V3's register reuse → near peak performance.// float4 loads 4 floats in 1 instruction (128-bit load) __global__ void myppKernel_v4(float* r, const float* d, const float* t, int n) { float4 v = {HUGE_VALF,HUGE_VALF,HUGE_VALF,HUGE_VALF}; for (int k = 0; k < n; k += 4) { float4 dx = reinterpret_cast<const float4*>(d + i*n + k)[0]; float4 tx = reinterpret_cast<const float4*>(t + j*n + k)[0]; v.x = min(v.x, dx.x + tx.x); v.y = min(v.y, dx.y + tx.y); v.z = min(v.z, dx.z + tx.z); v.w = min(v.w, dx.w + tx.w); } r[i*n + j] = min(min(v.x,v.y), min(v.z,v.w)); }