One graph problem. Eight optimisation versions. From wasting 99.4% of CPU power to reaching 93% of theoretical maximum — using every technique modern hardware offers.
v creates a dependency chain — every iteration must wait for the previous min(v,z) to finish before starting. The hardware's parallel units sit completely idle.
void step(float* r, const float* d, int n) { for (int i = 0; i < n; ++i) for (int j = 0; j < n; ++j) { float v = std::numeric_limits<float>::infinity(); for (int k = 0; k < n; ++k) { float x = d[n*i + k]; // row read ✓ float y = d[n*k + j]; // col read ✗ jumps n floats each step v = std::min(v, x + y); // dependency chain on v ✗ } r[n*i + j] = v; } }
vmovss — scalar single (1 float)vaddss — scalar addvminss — scalar min, reads+writes %xmm1%xmm (128-bit) but only 1 of 4 lanes. 3 lanes idle.
vminss latency = 4 cyclesd[n*k+j] causes L1/L2 cache misses for large n
d[n*k+j] jumps n×4 bytes per step (cache miss). After transposing, t[n*j+k] advances by 4 bytes per step — sequential, hardware prefetcher works perfectly.
// Build transpose once: t[j][i] = d[i][j] std::vector<float> t(n*n); for (int i = 0; i < n; ++i) for (int j = 0; j < n; ++j) t[n*j + i] = d[n*i + j]; // Inner loop: both reads now row-major for (int k = 0; k < n; ++k) { float x = d[n*i + k]; // row read ✓ float y = t[n*j + k]; // row read ✓ (was column) v = std::min(v, x + y); }
v dependency chain remains. Memory is fixed but single-accumulator vminss chain limits to 0.5 ops/cycle. V2 fixes this.
constexpr int nb = 4 → compiler unrolls the kb loop and maps vv[0..3] to 4 separate registers. Each register has its own independent vminss chain. CPU executes all 4 simultaneously.
constexpr int nb = 4; // constexpr = compiler can unroll float vv[nb]; for (int kb = 0; kb < nb; ++kb) vv[kb] = infty; for (int ka = 0; ka < na; ++ka) for (int kb = 0; kb < nb; ++kb) vv[kb] = std::min(vv[kb], d[nab*i+ka*nb+kb] + t[nab*j+ka*nb+kb]); // After unrolling, compiler sees 4 independent chains: // vv0=min(vv0,z) vv1=min(vv1,z) ← no dependency between them float v = infty; for (int kb = 0; kb < nb; ++kb) v = std::min(v, vv[kb]);
ss) to packed (ps) instructions. One vaddps %ymm0,%ymm1,%ymm2 = eight additions simultaneously. The CPU's 256-bit YMM registers finally get used.
// Define once: 8 floats packed in one 256-bit register typedef float float8_t __attribute__((vector_size(8*sizeof(float)))); static inline float8_t min8(float8_t x, float8_t y) { return x < y ? x : y; // → vminps (8 minimums, 1 instruction) } // Inner loop: 16 useful ops in 3 vector instructions float8_t vv = f8infty; for (int ka = 0; ka < na; ++ka) { float8_t x = vd[na*i + ka]; // vmovaps — load 8 floats float8_t y = vt[na*j + ka]; // vmovaps — load 8 floats vv = min8(vv, x + y); // vaddps + vminps }
constexpr int nd = 3; float8_t vv[nd][nd]; // 9 independent accumulators for (int ka = 0; ka < na; ++ka) { float8_t x0=vd[na*(ic*nd+0)+ka], x1=vd[na*(ic*nd+1)+ka], x2=vd[na*(ic*nd+2)+ka]; float8_t y0=vt[na*(jc*nd+0)+ka], y1=vt[na*(jc*nd+1)+ka], y2=vt[na*(jc*nd+2)+ka]; // x0 reused 3×, y0 reused 3× — 6 reads → 9 results vv[0][0]=min8(vv[0][0],x0+y0); vv[0][1]=min8(vv[0][1],x0+y1); vv[0][2]=min8(vv[0][2],x0+y2); vv[1][0]=min8(vv[1][0],x1+y0); vv[1][1]=min8(vv[1][1],x1+y1); vv[1][2]=min8(vv[1][2],x1+y2); vv[2][0]=min8(vv[2][0],x2+y0); vv[2][1]=min8(vv[2][1],x2+y1); vv[2][2]=min8(vv[2][2],x2+y2); }
// Each compiles to a single instruction (vpermilps / vperm2f128) static inline float8_t swap4(float8_t x) { return _mm256_permute2f128_ps(x,x,1); // x[i] → x[i^4] } static inline float8_t swap2(float8_t x) { return _mm256_permute_ps(x,0b01001110); // x[i] → x[i^2] } static inline float8_t swap1(float8_t x) { return _mm256_permute_ps(x,0b10110001); // x[i] → x[i^1] } // Inner loop: 2 reads → 8 vv accumulators → 64 outputs for (int k = 0; k < n; ++k) { float8_t a000=vd[n*ia+k], b000=vt[n*ja+k]; // 2 reads only! float8_t a100=swap4(a000), a010=swap2(a000), a110=swap2(a100); float8_t b001=swap1(b000); vv000=min8(vv000,a000+b000); vv001=min8(vv001,a000+b001); vv010=min8(vv010,a010+b000); vv011=min8(vv011,a010+b001); // ... 4 more combinations ... }
constexpr int PF = 20; // latency≈200 cycles / loop≈8 cycles = 25 iters for (int k = 0; k < n; ++k) { // Tell CPU: fetch data I'll need 20 iterations from now __builtin_prefetch(&vd[n*ia + k + PF]); // prefetcht0 instruction __builtin_prefetch(&vt[n*ja + k + PF]); // port 2/3 — doesn't compete float8_t a000 = vd[n*ia + k]; // actual read (now likely in L1) float8_t b000 = vt[n*ja + k]; // ... rest of V5 kernel unchanged ... }
#include <immintrin.h> // for _pdep_u32 // Interleave bits: ia=abcd, ja=stuv → ija=satbucvd std::vector<std::tuple<int,int,int>> rows(na*na); for (int ia = 0; ia < na; ++ia) for (int ja = 0; ja < na; ++ja) { int ija = _pdep_u32(ia, 0x55555555) | _pdep_u32(ja, 0xAAAAAAAA); rows[ia*na+ja] = {ija, ia, ja}; } std::sort(rows.begin(), rows.end()); // sort by Z-key // Row slicing: process k in chunks so each slice fits in L2 constexpr int SLICE = 500; // 500 × 8f × 4B = 16KB per row for (int ka0 = 0; ka0 < na; ka0 += SLICE) { for (auto& [ija,ia,ja] : rows) { for (int ka = ka0; ka < std::min(ka0+SLICE,na); ++ka) { // V6 kernel — data now in L2, not RAM } } }