#include <madness/madness_config.h>
/*
        void mTxm<ni>(long dimi, long nj, long nk, double* c, const double* a, const double* b)

        double c[ni,nj]
        double a[nk,dimi]   <-- note dimi here
        double b[nk,nj]
        c(i,j) = sum(k) a(k,i) * b(k,j)  i=1..ni, j=1..nj, k=1..nk

        All arrays stored in C order.

        These routines make rigid assumptions in order to obtain high speed

        1) ALL input arrays are aligned on 16-byte boundaries

        2) ni, nj, nk, dimi are all even

        The macros below provide the framework for the actual routines, which are
        automatically generated by the Python program genmtxm.py.

        The strategy in each routine is simple.

        C[i,j] and C[i+1,j] are stored in a single XMM register.

        for (j=0; j<nj; ++j)
        .   zero accumulators for c(*,j)
        .   for (k=0; k<nk; k+=2)
        .       load bkj = b[k,j] and bkj2=b[k+1,j]
        .       for (i=0; i<ni; ++i) <-- fully unrolled
        .           c[i,j] += a[k,i]*bkj
        .       for (i=0; i<ni; ++i) <-- fully unrolled
        .           c[i,j] += a[k+1,i]*bkj2

	There is an analogous set of routines for small j in which i
	is moved outside and j is completely unrolled on the inside.
	Another way of looking at this is we compute C = (BT*A)T
	thus the only difference is how we increment C.
	The driver code mTxmq() calls whichever it thinks is the fastest.

        The routine mTxmq is used to implement the same operation for larger
        values of ni with the SAME RESTRICTIONS on all pointers being aligned
        and all ni,nj,nk even.

        For large matrices use ATLAS, GOTO, MKL, ACML, etc., since they include
        optimizations for cache, TLB, etc.  The mTxm routines are tweaked for
        small stuff.

        relevant parts of 64-bit LINUX ABI.
        - rsp is the stack pointer and must be 16-byte aligned
        - first 6 integer args passed in registers in this order rdi, rsi, rdx, rcx, r8, r9
        - first 8 floating point args passed in xmm0-7
        - all other args pushed on the stack in reverse order followed by return address
        - xmm*, ST*, r10 and r11 do not need to be saved
        - all other registers must be saved
        - redzone in rsp-1 to rsp-128 can be freely used

        relevant parts of 32-bit LINUX ABI.
        - esp is the stack pointer and must be 16-byte aligned
        - all args pushed on the stack in reverse order followed by return address
        - xmm*, ST*, eax, ecx, edx, need not be saved
        - ebx, esi, edi, ebp must be saved
*/

        /*
        Late pentium4 (prescott), Core and beyond, opteron, and late AMD athlon and beyond
        all have SSE3 in 32-bit.   I think all 64-bit CPUs will have SSE3.  Thus, I have
        enabled SSE3 by default and am not enthused enough to examine the cpuid.  On older
	P4s this code also only hits one flop/cycle instead of two ... I don't plan
	to fix this unless it resurfaces on new CPUs.

        If you are using an older CPU and you get an illegal instruction, add -DDISABLE_SSE3
        to the ASFLAGS.
        */

/* SSE3 (the movddup form of LOADBKJ) is selected unless the build defines
   DISABLE_SSE3. */
#if !defined(DISABLE_SSE3)
#define HAVE_SSE3
#endif

#ifdef X86_64

/* ---- 64-bit register map ------------------------------------------------ */

/* Working values.  DIMI8, NJ, C, A and B are the registers in which
   arguments 1, 2, 4, 5 and 6 arrive (see the ABI notes at the top of this
   file).  The third argument register, %rdx, is reused as NJ8, so MTXM_ENTRY
   first copies nk out of %edx into NK. */
#define DIMI8   %rdi
#define NJ      %rsi
#define NK      %eax
#define C       %rcx
#define A       %r8
#define B       %r9

/* NJ8 holds NJ shifted left by 3 (set up in MTXM_ENTRY). */
#define NJ8     %rdx

/* Copies of A, B and NK that NEXTJ reloads.  r10 and r11 need not be
   preserved for the caller; NKSAVE is a slot just below the stack pointer,
   inside the red zone. */
#define ASAVE   %r10
#define BSAVE   %r11
#define NKSAVE  -8(%rsp)

/* SSE registers: the two values loaded by LOADBKJ, one scratch register for
   the current pair of a[] values, and thirteen accumulators. */
#define BKJ     %xmm0
#define BKJ2    %xmm1
#define AKI     %xmm2
#define C00     %xmm3
#define C01     %xmm4
#define C02     %xmm5
#define C03     %xmm6
#define C04     %xmm7
#define C05     %xmm8
#define C06     %xmm9
#define C07     %xmm10
#define C08     %xmm11
#define C09     %xmm12
#define C10     %xmm13
#define C11     %xmm14
#define C12     %xmm15

/* Register set-up shared by both flavours of the 64-bit entry macro:
     NK      <- %edx (nk; %rdx is about to be overwritten as NJ8)
     DIMI8   <- DIMI8 << 3
     NJ8     <- NJ << 3
     ASAVE, BSAVE, NKSAVE <- copies of A, B and NK, reloaded by NEXTJ */
#define MTXM_ENTRY_SETUP64 \
        mov     %edx,   NK; \
        shl     $3,     DIMI8; \
        mov     NJ,     NJ8; \
        shl     $3,     NJ8; \
        mov     A,      ASAVE; \
        mov     B,      BSAVE; \
        mov     NK,     NKSAVE

/* MTXM_ENTRY(name): export the routine's symbol, place its label and run the
   set-up above.  The two variants differ only in how the symbol is declared. */
#if defined(ON_A_MAC)
/* Mac: underscore-prefixed symbol; no .type or .align directive is emitted */
#define MTXM_ENTRY(name) \
.globl _##name; \
_##name: \
        MTXM_ENTRY_SETUP64

#else
/* Otherwise: plain symbol, typed as a function and aligned with .align 128 */
#define MTXM_ENTRY(name) \
.globl name; \
  .type name,@function ; \
  .align 128; \
name: \
        MTXM_ENTRY_SETUP64
#endif

/* RETURN: the 64-bit MTXM_ENTRY pushes nothing, so a bare ret is the whole
   epilogue. */
#define RETURN ret

#else

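        /*
        stack layout; offsets are relative to %esp at function entry.
        MTXM_ENTRY pushes four registers, so once it has run every slot
        below is found 16 bytes higher relative to %esp (for example,
        DIMI is read from 20(%esp)).

        +24     B
        +20     A
        +16     C
        +12     NK
        +8      NJ
        +4      DIMI
         0      return address
        -4      ebp save
        -8      ebx save
        -12     esi save
        -16     edi save
        */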

/* ---- 32-bit register map ------------------------------------------------ */

/* Working values held in registers; MTXM_ENTRY loads them from the stack. */
#define DIMI8   %edi
#define NJ      %esi
#define NK      %edx
#define A       %ebx
#define B       %ecx

/* NJ8 holds NJ shifted left by 3 (set up in MTXM_ENTRY). */
#define NJ8     %eax

/* Values kept in memory.  Each one is the stack slot in which the matching
   argument (c, a, b, nk) was passed, addressed relative to %esp after the
   four pushes done by MTXM_ENTRY. */
#define C       32(%esp)
#define ASAVE   36(%esp)
#define BSAVE   40(%esp)
#define NKSAVE  28(%esp)

/* SSE registers: the two values loaded by LOADBKJ, one scratch register for
   the current pair of a[] values, and five accumulators. */
#define BKJ     %xmm0
#define BKJ2    %xmm1
#define AKI     %xmm2
#define C00     %xmm3
#define C01     %xmm4
#define C02     %xmm5
#define C03     %xmm6
#define C04     %xmm7
/* NAME(name): the spelling of a symbol as the assembler must see it.
   Mac builds prepend an underscore; everywhere else it is used unchanged. */
#if !defined(ON_A_MAC)
#define NAME(name) name
#else
#define NAME(name) _##name
#endif

/* MTXM_ENTRY(name): 32-bit prologue and argument load.
   - exports NAME(name) and places its label; the directive
     ".type name,@function ;" is left out of this variant (it was
     commented out in the original source)
   - pushes ebp, copies esp into ebp, then pushes ebx, esi and edi;
     RETURN pops the four registers in the reverse order
   - loads dimi, nj, nk, a and b from the stack into DIMI8, NJ, NK, A and B;
     the offsets already include the 16 bytes just pushed
   - shifts DIMI8 left by 3 and sets NJ8 to NJ shifted left by 3
   C, ASAVE, BSAVE and NKSAVE are not written here: they are defined as the
   stack slots that already hold the incoming c, a, b and nk arguments. */
#define MTXM_ENTRY(name) \
.globl NAME(name); \
NAME(name): \
        pushl   %ebp;            \
        mov     %esp,  %ebp;     \
        pushl   %ebx            ;\
        pushl   %esi            ;\
        pushl   %edi            ;\
        mov     20(%esp), DIMI8; \
        mov     24(%esp), NJ; \
        mov     28(%esp), NK; \
        mov     36(%esp), A; \
        mov     40(%esp), B; \
        shl     $3, DIMI8; \
        mov     NJ, NJ8; \
        shl     $3, NJ8;

/* RETURN: 32-bit epilogue.  Pops edi, esi, ebx and ebp (the reverse of the
   pushes in MTXM_ENTRY) and returns.  %esp must be where MTXM_ENTRY left it
   when this macro is reached. */
#define RETURN  \
        popl   %edi             ;\
        popl   %esi             ;\
        popl   %ebx             ;\
        popl   %ebp             ;\
        ret

#endif



/* ZERO(a): clear register a by XORing it with itself. */
#define ZERO(a)  pxor a,a

/* MTXM_BCAST_B(dst): put the double at (B) into both 64-bit halves of dst.
   With SSE3 this is a single movddup; without it, movsd loads the low half
   and unpcklpd dst,dst copies that into the high half. */
#ifdef HAVE_SSE3
#define MTXM_BCAST_B(dst) movddup (B),    dst
#else
#define MTXM_BCAST_B(dst) movsd    (B),    dst; unpcklpd dst,dst
#endif

/* LOADBKJ: broadcast the double at (B) into BKJ, add NJ8 to B, broadcast the
   double now at (B) into BKJ2, and add NJ8 to B again.  On exit B is
   2*NJ8 bytes past where it started. */
#define LOADBKJ \
        MTXM_BCAST_B(BKJ);  \
        add      NJ8,    B; \
        MTXM_BCAST_B(BKJ2); \
        add      NJ8,    B

/* MTXM_MULADD(disp,dest,bval): dest += (16 bytes at disp(A)) * bval.
   The pair of doubles is fetched with an aligned load (movaps) into the
   scratch register AKI, multiplied by bval and added to dest. */
#define MTXM_MULADD(disp,dest,bval) \
        movaps  disp(A),    AKI; \
        mulpd   bval, AKI; \
        addpd   AKI, dest

/* ABC multiplies by BKJ, ABC2 by BKJ2 -- the two values set by LOADBKJ. */
#define ABC(disp,dest)  MTXM_MULADD(disp,dest,BKJ)
#define ABC2(disp,dest) MTXM_MULADD(disp,dest,BKJ2)

/* INCA: advance A by DIMI8 bytes.  DIMI8 is dimi*8 and a is declared as
   a[nk,dimi] doubles, so this steps A to the same column of the next row. */
#define INCA   add DIMI8, A

/* STORE(reg): write the low double of reg to (B), add NJ8 to B, write the
   high double of reg to (B), and add NJ8 to B again.  The two halves thus
   land NJ8 bytes apart and B ends 2*NJ8 bytes further on.
   For 32-bit C will be in memory so use B as a temporary register */
#define STORE(reg) \
	movlpd  reg,  (B); \
        add     NJ8,    B; \
        movhpd  reg,  (B); \
        add     NJ8,    B

/* STORET(reg): store all 16 bytes of reg at (B) with one aligned write
   (movaps) and advance B by 16.
   NOTE(review): presumably the store used by the j-unrolled routines
   described at the top of the file -- confirm in mtxm_gen.h.

   For main application need result in cache so movntps is counter productive
   e.g., 2.3GHz Barcelona 7+GF -> 4GF in transform() */
#define STORET(reg) movaps  reg,  (B); add $16, B

        /* For 32-bit C and BSAVE will be in memory so also use B as
           a temporary register for incrementing */

/* INCC: C += 8.  The value is copied to B, incremented there and written
   back, so B is overwritten and ends up equal to the new C. */
#define INCC  mov C, B; add $8, B;    mov B, C;

/* INCCT: C += DIMI8, done through B in the same way as INCC (B is
   overwritten and ends up equal to the new C). */
#define INCCT mov C, B; add DIMI8, B; mov B, C;

/* NEXTJ: prepare the next pass of the loop that counts NJ down.
   - BSAVE += 8, done through B, so B also holds the new value
   - A and NK are reloaded from ASAVE and NKSAVE
   - NJ is decremented; the sub is the last instruction, so its flags are
     what the instruction following the macro sees */
#define NEXTJ \
        mov     BSAVE, B;   add $8, B;    mov B, BSAVE; \
        mov     ASAVE,  A; \
        mov     NKSAVE, NK; \
        sub     $1,     NJ

#include "mtxm_gen.h"
