It may come as a surprise that mul1() is about 10 times slower than mul2(), even though both compute exactly the same product.

mul2() (i-k-j order): with j as the inner loop, C[i][j] and B[k][j] are read sequentially along a row, so they hit the cache almost every time, while A[i][k] is a constant during the inner loop and can be kept in a register.

mul1() (i-j-k order): in the inner loop, C[i][j] has a constant address and A[i][k] walks along a row with good cache hits; B[k][j], however, walks down a column, jumping N doubles (16,000 bytes for N = 2000) between consecutive accesses, so it is doomed to a cache miss on nearly every iteration.

There are other ways to optimize matrix multiplication (tiling, vectorization, multithreading); a tiled variant is sketched after the sample run below, but fixing the loop order is usually the single biggest win.

Reference slides: https://www.cs.duke.edu/courses/fall06/cps220/lectures/PPT/lect12.pdf
#include <iostream>
#include <cstdlib>
#include <ctime>
using namespace std;

const int N = 2000;                 // each matrix is N*N*8 bytes = 32 MB, far larger than any cache
double A[N][N], B[N][N], C[N][N];   // row-major, as all C/C++ 2-D arrays are
// Fill A and B with random values in [0, 1]; C starts at zero.
void fill()
{
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++) {
            A[i][j] = (double)rand() / RAND_MAX;
            B[i][j] = (double)rand() / RAND_MAX;
            C[i][j] = 0;
        }
}
// i-j-k order: the inner loop strides down a column of B (N doubles apart),
// so B[k][j] misses the cache on nearly every access.
void mul1()
{
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            for (int k = 0; k < N; k++)
                C[i][j] += A[i][k] * B[k][j];
}
// i-k-j order: the inner loop moves along rows of C and B (contiguous memory),
// while A[i][k] stays fixed and can be kept in a register.
void mul2()
{
    for (int i = 0; i < N; i++)
        for (int k = 0; k < N; k++)
            for (int j = 0; j < N; j++)
                C[i][j] += A[i][k] * B[k][j];
}
int main()
{
    fill();

    clock_t startTime = clock();
    mul1();
    cout << (clock() - startTime) / (double)CLOCKS_PER_SEC << endl;   // mul1 time

    startTime = clock();                                              // C is not re-zeroed; irrelevant for timing
    mul2();
    cout << (clock() - startTime) / (double)CLOCKS_PER_SEC << endl;   // mul2 time

    return 0;
}
/*
> g++ mul.cpp -O2
> a.exe
87.64     <- mul1()
7.584     <- mul2()
*/
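
As a rough sketch of one of the other optimizations mentioned above, here is a blocked (tiled) variant. It is my own illustration, not part of the original benchmark: mul3() and the tile size BS are assumptions, and BS = 64 is an untuned guess rather than a value chosen for any particular cache.

// Minimal sketch of blocked (tiled) multiplication over the same global A, B, C.
// Each BS x BS tile of A, B and C is reused while it is still resident in cache;
// inside a tile the cache-friendly i-k-j order from mul2() is kept.
void mul3()
{
    const int BS = 64;  // tile size -- an untuned assumption; pick it to fit the target cache
    for (int ii = 0; ii < N; ii += BS)
        for (int kk = 0; kk < N; kk += BS)
            for (int jj = 0; jj < N; jj += BS)
                for (int i = ii; i < ii + BS && i < N; i++)
                    for (int k = kk; k < kk + BS && k < N; k++)
                        for (int j = jj; j < jj + BS && j < N; j++)
                            C[i][j] += A[i][k] * B[k][j];
}

Tiling typically buys less than the loop-order fix, since mul2() already streams through memory, but for large N it should still reduce main-memory traffic measurably; timing it with the same clock() harness as above is a straightforward way to check on your own machine.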