#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

#include "frigo.h"
#include "xutil.h"
#include "matrix.h"

// Base-case cutoff for mm_frigo_mult(): the recursion stops and a plain
// triple loop is used once every extent (last index - first index) of the
// current sub-problem is <= this value. Can be overridden at run time through
// the FRIGO_STOP environment variable (see frigo()).
static uint32_t lower = 12;

// Bundles the three matrices of one multiplication so the recursive kernel
// only has to pass a single pointer down the call chain.
typedef struct {
	matrix_t *A;	// left operand, only read by mm_frigo_mult()
	matrix_t *B;	// right operand, only read by mm_frigo_mult()
	matrix_t *C;	// result; partial products are added into it (+=)
} store;

//store st;

// Recursively accumulates the product of a block of A and a block of B into
// the matching block of C:
//
//     C[ms..me][ps..pe] += A[ms..me][ns..ne] * B[ns..ne][ps..pe]
//
// All bounds are INCLUSIVE indices. The matrices are addressed as flat
// arrays, using A->n as the row stride of A and B->n as the row stride of
// both B and C.
//
// The sub-problem is halved along its largest dimension until all three
// extents are <= `lower`; then a plain triple loop does the work. The two
// halves are processed one after the other, so splitting the shared (n)
// dimension is fine: both halves simply add into the same entries of C.
//
//   st      matrices to operate on (A and B are read, C is updated in place)
//   ms, me  first / last row of A and C
//   ns, ne  first / last column of A, i.e. first / last row of B
//   ps, pe  first / last column of B and C
void mm_frigo_mult(store *st, const uint32_t ms, const uint32_t me, const uint32_t ns,
		const uint32_t ne, const uint32_t ps, const uint32_t pe) {

	// Extents are "last - first", i.e. one less than the number of elements.
	const uint32_t m = me - ms;
	const uint32_t n = ne - ns;
	const uint32_t p = pe - ps;

	// Base case. Tuning options: the cutoff value, the loop order,
	// maybe SIMD instructions.
	if (unlikely(m <= lower && n <= lower && p <= lower)) {
		const int32_t *a = st->A->data;
		const int32_t *b = st->B->data;
		int32_t *c = st->C->data;

		// size_t strides keep the flat index computation out of 32-bit
		// arithmetic, which would wrap for very large matrices.
		const size_t a_stride = st->A->n;
		const size_t b_stride = st->B->n;

		for (uint32_t i = ms; i <= me; i++) {
			for (uint32_t j = ps; j <= pe; j++) {
				int32_t sum = 0;

				for (uint32_t k = ns; k <= ne; k++) {
					sum += a[a_stride * i + k] * b[b_stride * k + j];
				}

				c[b_stride * i + j] += sum;
			}
		}

		return;
	}

	// Upper bounds of the first half and lower bounds of the second half.
	// Only the dimension being split is changed below; the other two keep
	// their full range in both recursive calls.
	uint32_t first_me = me, first_ne = ne, first_pe = pe;
	uint32_t second_ms = ms, second_ns = ns, second_ps = ps;

	// Split the largest dimension in the middle. At least one extent is
	// > lower >= 0 here, so the chosen extent is >= 1 and both halves are
	// non-empty and strictly smaller than the current range.
	if (m >= n && m >= p) {
		first_me = ms + m/2;	// must be m/2: splitting rows by n/2 unbalances the recursion
		second_ms = first_me + 1;
	} else if (n >= p) {
		first_ne = ns + n/2;
		second_ns = first_ne + 1;
	} else {
		first_pe = ps + p/2;
		second_ps = first_pe + 1;
	}

	mm_frigo_mult(st, ms, first_me, ns, first_ne, ps, first_pe);
	mm_frigo_mult(st, second_ms, me, second_ns, ne, second_ps, pe);
}

// Entry point: computes C += A * B for the matrices referenced by `str`
// using the recursive kernel mm_frigo_mult().
//
// The product is ADDED to the current contents of C; presumably the caller
// provides a zero-initialised C -- TODO confirm against the callers.
//
// If the environment variable FRIGO_STOP holds a non-negative decimal
// integer that fits in 32 bits, it replaces the base-case cutoff `lower`.
// Any other value is reported on stderr and ignored, leaving the cutoff
// unchanged.
void frigo(struct matrix_mult *restrict str) {
	const char *val = getenv("FRIGO_STOP");

	if (val != NULL) {
		char *end = NULL;

		errno = 0;
		long parsed = strtol(val, &end, 10);

		// Reject empty / non-numeric input, trailing garbage, negative
		// values (they would wrap to a huge unsigned cutoff) and
		// anything that does not fit into the uint32_t cutoff.
		if (end == val || *end != '\0' || errno == ERANGE
				|| parsed < 0 || (unsigned long)parsed > UINT32_MAX) {
			fprintf(stderr, "frigo: ignoring invalid FRIGO_STOP value '%s'\n", val);
		} else {
			lower = (uint32_t)parsed;
		}
	}

	matrix_t *A = str->A;
	matrix_t *B = str->B;
	matrix_t *C = str->C;

	// The kernel takes inclusive uint32_t bounds, so a dimension of 0 would
	// turn "dim - 1" into UINT32_MAX and send the loops far out of bounds.
	// An empty product contributes nothing to C, so just return.
	if (A->m == 0 || A->n == 0 || B->n == 0) {
		return;
	}

	store st;
	st.A = A;
	st.B = B;
	st.C = C;

	mm_frigo_mult(&st, 0, A->m-1, 0, A->n-1, 0, B->n-1);
}

