Interactive visualization of performance optimization techniques
Most straightforward implementation using three nested loops.
for (size_t i = 0; i < size; i++) {
    for (size_t j = 0; j < size; j++) {
        float acc = 0.0;
        for (size_t k = 0; k < size; k++) {
            acc += A[i * size + k] * B[k * size + j]; // Poor cache locality for B
        }
        C[i * size + j] = acc;
    }
}
Reduces loop control overhead by processing multiple iterations at once.
for (register int k = 0; k < (size & ~3); ) {
    acc += A[i * size + k] * B[k * size + j]; // Iteration 1
    k += 1;
    acc += A[i * size + k] * B[k * size + j]; // Iteration 2
    k += 1;
    acc += A[i * size + k] * B[k * size + j]; // Iteration 3
    k += 1;
    acc += A[i * size + k] * B[k * size + j]; // Iteration 4
    k += 1;
}
// Handle remainder
for (register int k = (size & ~3); k < size; k++) {
    acc += A[i * size + k] * B[k * size + j];
}
Transposing matrix B converts column-wise access to row-wise access.
// Transpose B matrix once
float* b_t = transpose(B, size);
for (int i = 0; i < size; i++) {
    for (int j = 0; j < size; j++) {
        float *a_row = &A[i * size];               // Row-wise access
        float *b_transposed_row = &b_t[j * size];  // Now also row-wise!
        float acc = 0;
        for (int k = 0; k < size; k++) {
            acc += a_row[k] * b_transposed_row[k]; // Both sequential!
        }
        C[i * size + j] = acc;
    }
}
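The transpose() call above is left undefined in the snippet; a minimal sketch of what such a helper could look like, assuming it allocates a new size x size buffer that the caller later frees:

#include <stdlib.h>

// Sketch of the transpose helper assumed above: returns a newly allocated
// size x size matrix with b_t[j * size + i] = B[i * size + j]
float *transpose(const float *B, size_t size) {
    float *b_t = malloc(size * size * sizeof(float));
    if (!b_t) return NULL;                     // Caller must handle allocation failure
    for (size_t i = 0; i < size; i++) {
        for (size_t j = 0; j < size; j++) {
            b_t[j * size + i] = B[i * size + j];
        }
    }
    return b_t;
}

The O(size^2) cost of the transpose is paid once and amortized over the O(size^3) multiply.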
Distributes the workload across multiple CPU cores by having each thread process different rows of the result matrix.
void *matrix_multiply_thread(void *user) {
    Worker *worker = (Worker *) user;
    int workerIdx = worker->workerIdx;
    int thread_count = worker->thread_count;
    // Each thread processes every nth row
    for (int i = workerIdx; i < size; i += thread_count) {
        for (int j = 0; j < size; j++) {
            // Compute C[i][j]
            float acc = 0;
            for (int k = 0; k < size; k++) {
                acc += A[i * size + k] * B_transposed[j * size + k];
            }
            C[i * size + j] = acc;
        }
    }
    return NULL;
}
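For completeness, a minimal sketch of how these workers might be launched with pthreads. The Worker layout and the launcher's name are assumptions; only the two fields the thread function reads are shown:

#include <pthread.h>

// Assumed Worker layout, inferred from the fields read by matrix_multiply_thread
typedef struct {
    int workerIdx;
    int thread_count;
} Worker;

// Hypothetical launcher: spawn one worker per thread, then wait for all
// of them to finish writing their rows of C
void multiply_threaded(int thread_count) {
    pthread_t threads[thread_count];
    Worker workers[thread_count];
    for (int t = 0; t < thread_count; t++) {
        workers[t].workerIdx = t;
        workers[t].thread_count = thread_count;
        pthread_create(&threads[t], NULL, matrix_multiply_thread, &workers[t]);
    }
    for (int t = 0; t < thread_count; t++) {
        pthread_join(threads[t], NULL);        // Block until every row is done
    }
}

Because each thread writes a disjoint set of rows of C, no locking is needed.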
Uses ARM NEON SIMD instructions to process 4 floating-point numbers simultaneously, dramatically increasing computational throughput.
float32x4_t acc = vdupq_n_f32(0); // Zero vector accumulator
for (int k = 0; k < (size & ~3); k += 4) {
    // Load 4 floats from A and B simultaneously
    float32x4_t a_reg = vld1q_f32(&a_row[k]);
    float32x4_t b_reg = vld1q_f32(&b_transposed_row[k]);
    // Fused multiply-add: acc = acc + (a_reg * b_reg)
    acc = vfmaq_f32(acc, a_reg, b_reg);
}
// Horizontal sum to get final result
float sum = vaddvq_f32(acc);
// Handle remainder elements
for (int k = (size & ~3); k < size; k++) {
    sum += a_row[k] * b_transposed_row[k];
}
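These intrinsics come from <arm_neon.h> and require an AArch64 target. A sketch of how the kernel could be packaged as a reusable dot-product helper (the function name is illustrative, not from the original code):

#include <arm_neon.h>
#include <stddef.h>

// Illustrative wrapper: dot product of two contiguous float rows,
// 4 lanes at a time, with a scalar tail for the leftover elements
static float dot_neon(const float *a_row, const float *b_transposed_row, size_t size) {
    float32x4_t acc = vdupq_n_f32(0);
    size_t k = 0;
    for (; k + 4 <= size; k += 4) {
        acc = vfmaq_f32(acc, vld1q_f32(&a_row[k]), vld1q_f32(&b_transposed_row[k]));
    }
    float sum = vaddvq_f32(acc);               // Horizontal add of the 4 lanes
    for (; k < size; k++) {                    // Scalar remainder
        sum += a_row[k] * b_transposed_row[k];
    }
    return sum;
}

Each output element then becomes C[i * size + j] = dot_neon(&A[i * size], &b_t[j * size], size).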
Hand-written ARM assembly gives full control over instruction scheduling, register allocation, and memory access patterns.
LD1 - Vector load with auto-increment
FMLA - Fused multiply-add
FADDP - Pairwise floating-point add
FMOV - Move from vector to general register
__asm__ volatile(
"DUP v0.4s, wzr\n" // Zero accumulator vector
"mov x3, %[limit]\n" // Load loop limit
"1:\n" // Loop label
"LD1 {v1.4s}, [%[a_ptr]], #16\n" // Load 4 floats from A, increment pointer
"LD1 {v2.4s}, [%[b_ptr]], #16\n" // Load 4 floats from B, increment pointer
"FMLA v0.4s, v1.4s, v2.4s\n" // acc += A * B (vectorized)
"sub x3, x3, #4\n" // Decrement counter
"cbnz x3, 1b\n" // Branch if not zero
// Horizontal reduction
"FADDP v0.4s, v0.4s, v0.4s\n" // Pairwise add
"FADDP s0, v0.2s\n" // Final sum
"FMOV %w[output], s0\n" // Move to output register
: [output] "=r" (sum), [b_ptr] "+r" (b_transposed_row)
: [limit] "r" ((size_t)(size & ~3)), [a_ptr] "r" (a_row)
: "v0", "v1", "v2", "x3"
);
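Because the asm post-increments both pointers, a_row and b_transposed_row exit the block pointing at the first element the vector loop did not process, so the scalar tail can be written directly against them. A sketch mirroring the intrinsics version:

// a_row / b_transposed_row now point at the first unprocessed element
for (size_t r = 0; r < (size & 3); r++) {
    sum += a_row[r] * b_transposed_row[r];
}
C[i * size + j] = sum;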
Each optimization technique builds upon the previous ones, creating a cumulative performance improvement: