Listing 7.
void matmul2(int dim, float *a, float *b,
float *c)
{
float dot0, dot1, dot2, dot3, a0, a1, a2,
a3, b0, b1, b2, b3;
float dot4, dot5, dot6, dot7, a4, a5, a6,
a7, b4, b5, b6, b7;
long I, J, i, j, k;
/* ...transpose b into tb like in matmul1...; */
for (I = 0; I < dim*dim; I += dim)
for (j = J = 0; j < dim; ++j, J += dim)
{
dot0 = dot1 = dot2 = dot3 = dot4 = dot5 =
dot6 = dot7 = 0.0;
for (k = 0; k < dim; k += 8)
{
a0 = a[I + k + 0];
b0 = tb[J + k + 0];
a1 = a[I + k + 1];
b1 = tb[J + k + 1];
a2 = a[I + k + 2];
b2 = tb[J + k + 2];
a3 = a[I + k + 3];
b3 = tb[J + k + 3];
a4 = a[I + k + 4];
b4 = tb[J + k + 4];
a5 = a[I + k + 5];
b5 = tb[J + k + 5];
a6 = a[I + k + 6];
b6 = tb[J + k + 6];
a7 = a[I + k + 7];
b7 = tb[J + k + 7];
dot0 += a0 * b0;
dot1 += a1 * b1;
dot2 += a2 * b2;
dot3 += a3 * b3;
dot4 += a4 * b4;
dot5 += a5 * b5;
dot6 += a6 * b6;
dot7 += a7 * b7;
}
c[I + j] = dot0 + dot1 + dot2 + dot3 +
dot4 + dot5 + dot6 + dot7;
}
}