OpenMP turns sequential loops into parallel work with a single pragma. But correctness is your responsibility — the compiler won't warn you about race conditions.
#pragma omp parallel for for (int i = 0; i < 10; ++i) { c(i); // runs in parallel across all cores }
// Equal contiguous chunks // Best cache locality #pragma omp parallel for schedule(static) for (int i = 0; i < n; ++i) { ... }
// Grab 1 unit at a time // Best load balancing #pragma omp parallel for schedule(dynamic,1) for (int b = 0; b < blocks; ++b) { ... }
x[i+1] = f(x[i]); // i reads what i-1 wrote
y[0] = f(x[i]); // all write same y[0]
y[i] = f(x[i]); // each i writes different y[i]
Putting sum += arr[i] inside a #pragma omp parallel for loop is a data race. All threads read and write the same sum at the same time, so updates get lost → wrong answer, silently.
// WRONG — race condition on sum float sum = 0; #pragma omp parallel for for (int i = 0; i < n; ++i) sum += arr[i]; // ✗ // CORRECT — reduction clause float sum = 0; #pragma omp parallel for reduction(+:sum) for (int i = 0; i < n; ++i) sum += arr[i]; // ✓
The reduction(+:sum) clause gives each thread its own private copy of sum, then combines them at the end.
// BAD: partial_sum[0] and partial_sum[1] share a cache line float partial_sum[nthreads]; // adjacent floats = same cache line #pragma omp parallel for for (int t = 0; t < nthreads; ++t) partial_sum[t] += work(t); // threads stomp each other's cache // GOOD: pad to 64-byte boundary struct alignas(64) Padded { float val; char pad[60]; }; Padded partial_sum[nthreads]; // each on its own cache line ✓
// Pattern 1: parallel for (most common) #pragma omp parallel for schedule(static) for (int i = 0; i < n; ++i) result[i] = compute(i); // Pattern 2: parallel for + reduction float total = 0; #pragma omp parallel for reduction(+:total) for (int i = 0; i < n; ++i) total += arr[i]; // Pattern 3: parallel region + critical section #pragma omp parallel { Result local = compute_local(); #pragma omp critical { if (local.score > best.score) best = local; } } // Pattern 4: pre-enumerate work items for dynamic balance std::vector<std::pair<int,int>> blocks; // all (i,j) tile pairs #pragma omp parallel for schedule(dynamic,1) for (int b = 0; b < blocks.size(); ++b) process_tile(blocks[b]); // used in correlate exercise