#include <inttypes.h>
#include <stdio.h>

#include "xutil.h"
#include "matrix.h"

// Tile edge length, in elements, used by the blocked multiply below.
// CLS is not defined in this file; presumably it is the cache line size in
// bytes supplied by the build or a project header -- TODO confirm.
#define SM (CLS/sizeof(uint32_t))


// IDEAS:
// - Use additive indexing.
// - Maybe add a special case for partial tiles (e.g. when iend < SM), to
//   maximize use of SIMD.

// Cache-blocked integer multiply-accumulate: C += A * B.
//
// Dimensions are read from the operands as M = A->m, N = B->m and P = B->n.
// All three matrices are indexed row-major: A as M x N, B as N x P and C as
// M x P. A->n is never consulted, so the caller must make sure it matches
// B->m. C is only ever added to; it must be zeroed beforehand if a plain
// product is wanted.
//
// The (i, j, k) index space is walked in tiles of SM elements per side, with
// the tiles at the far edges clipped to the matrix bounds. Inside a tile,
// each row of C accumulates a scalar from A times a row of B.
//
// Every row pointer is derived from an in-range element index, so no pointer
// is ever formed outside the underlying arrays. Loop counters are size_t,
// which makes offsets such as P*i be evaluated at size_t width instead of
// being truncated to 32 bits.
//
// Elements are accessed as int32_t and the arithmetic is not guarded against
// overflow. The restrict qualifiers rely on C's storage not overlapping the
// storage of A or B.
void drepper_mult(matrix_t *A, matrix_t *B, matrix_t *C) {
	uint32_t M = A->m,
			 P = B->n,
			 N = B->m;

	for (size_t i = 0; i < M; i += SM) {
		// Rows in this tile (short at the bottom edge).
		const size_t iend = (M - i < SM) ? M - i : SM;

		for (size_t j = 0; j < P; j += SM) {
			// Columns of B/C in this tile (short at the right edge).
			const size_t jend = (P - j < SM) ? P - j : SM;

			for (size_t k = 0; k < N; k += SM) {
				// Inner-dimension extent of this tile.
				const size_t kend = (N - k < SM) ? N - k : SM;

				for (size_t i2 = 0; i2 < iend; ++i2) {
					const size_t row = i + i2;
					int32_t *restrict rres = &C->data[P*row + j];
					const int32_t *restrict rmul1 = &A->data[N*row + k];

					for (size_t k2 = 0; k2 < kend; ++k2) {
						const int32_t *restrict rmul2 = &B->data[P*(k + k2) + j];
						// Loop-invariant for the j2 sweep.
						const int32_t a = rmul1[k2];

						for (size_t j2 = 0; j2 < jend; ++j2) {
							rres[j2] += a * rmul2[j2];
						}
					}
				}
			}
		}
	}
}

// Adapter around drepper_mult: unpacks the three matrices bundled in *str
// and forwards them unchanged, with str->A and str->B as the operands and
// str->C as the matrix that gets accumulated into.
void drepper(struct matrix_mult *str) {
	drepper_mult(str->A, str->B, str->C);
}
