
;	optimized mdct() for new GOGO-no-coda (1999/09)
;	Copyright (C) 1999 shigeo
;	special thanks to Keiichi SAKAI

;	99/09/15	changes relative to subband.nas (translated from mis-encoded Japanese; wording approximate)
;	variable names changed:
;	xxx -> sbd_xxx
;	enwindow_sse -> enwindow

%include "nasm.h"

	globaldef	window_filter_subband_FPU
	externdef	enwindow
	externdef	idct_coefficient
	externdef	sbd_xxx

HAN_SIZE	equ	512		;defined in common.h
SBLIMIT		equ	32

	segment_data

	segment_bss

	segment_code
;********   FPU routine   ********
; (title translated from mis-encoded Japanese; wording approximate)
; by K.SAKAI
;   The original note says to use this together with sbd_shiftin().
;   2000/03/29	cycle counts of 122k[clk] and 114k[clk] were recorded; the
;				annotations after '@' are mis-encoded and unrecoverable
;	2000/04/17	change for SMP optimization by K.SAKAI (translation approximate)
;-----------------------------------------------------------------------
; void window_filter_subband_FPU(float *win_buf, float *s, int mode_gr)
;
; Purpose: for each of 18*mode_gr iterations, build a 32-element
;          temporary yprime[] from the samples at c[] weighted by the
;          enwindow table, multiply it by the idct_coefficient table
;          and store 32 floats at s.  After each iteration c moves
;          back by 32 floats and s moves forward by 32 floats.
; ABI:     32-bit, arguments on the stack, plain `ret` (caller pops).
; In:      [esp+4]  = win_buf  (c starts at &win_buf[1152])
;          [esp+8]  = s        (output, 32 floats per iteration)
;          [esp+12] = mode_gr  (documented as 1 or 2)
; Out:     nothing in registers; results are written through s.
;          If mode_gr <= 0 the function returns without touching
;          win_buf or s.
; Regs:    edi = c, ebp = s, esi = remaining iterations,
;          ecx/edx = +i/-i (window stage) then row counter/table ptr,
;          eax = enwindow pointer (window stage) or pair index.
; Clobb:   eax, ecx, edx, flags; x87 stack is used but left balanced.
;          ebx, esi, edi, ebp are saved and restored.
; Stack:   32*4 bytes of scratch (yyy[32]) below the saved registers.
;-----------------------------------------------------------------------
		align	16
window_filter_subband_FPU:
		push	ebx
		push	esi
		push	edi
		push	ebp
; allocate yyy[32]
		sub		esp,32*4
%define	yyy esp
; _P = bytes between esp and the return address: 4 pushed regs + yyy[32]
%assign _P (4+32)*4
		mov		edi,[esp+_P+4]		; = win_buf
		mov		ebp,[esp+_P+8]		; = s
		mov		esi,[esp+_P+12]		; = mode_gr(=1 or 2)
; The outer loop below is a count-down `dec esi / jnz`; entering it with
; a count <= 0 would wrap around instead of running zero times, so bail
; out here.  Nothing is on the x87 stack yet, only yyy[] is allocated.
		test	esi,esi
		jle		near .exit
		add		edi,1152*4			; = c = &win_buf[1152]
		add		esi,esi
		lea		esi,[esi+esi*8]		; = j = 18*mode_gr
		jmp		short .f1			; skip the alignment padding

		align	16
.lp1:
.f1:
;               yprime[16] = (c[64*1]+c[64*7])*enwindow[0][1][0]
;                     + (c[64*2]+c[64*6])*enwindow[0][2][0]
;                     + (c[64*3]+c[64*5])*enwindow[0][3][0]
;                     +  c[64*4]*enwindow[0][4][0]
;                     + (c[64*0+32]-c[64*7+32])*enwindow[8][0][0]
;                     + (c[64*1+32]-c[64*6+32])*enwindow[8][1][0]
;                     + (c[64*2+32]-c[64*5+32])*enwindow[8][2][0]
;                     + (c[64*3+32]-c[64*4+32])*enwindow[8][3][0];
		fld		dword [edi + (64*1 +  0)*4]
		fadd	dword [edi + (64*7 +  0)*4]
		fmul	dword [enwindow + (0*8*4 + 1*4)*4]	; *= enwindow[0][1][0]
		fld		dword [edi + (64*2 +  0)*4]
		fadd	dword [edi + (64*6 +  0)*4]
		fmul	dword [enwindow + (0*8*4 + 2*4)*4]	; *= enwindow[0][2][0]
		faddp	st1,st0
		fld		dword [edi + (64*3 +  0)*4]
		fadd	dword [edi + (64*5 +  0)*4]
		fmul	dword [enwindow + (0*8*4 + 3*4)*4]	; *= enwindow[0][3][0]
		faddp	st1,st0
		fld		dword [edi + (64*4 +  0)*4]
		fmul	dword [enwindow + (0*8*4 + 4*4)*4]	; *= enwindow[0][4][0]
		faddp	st1,st0
		fld		dword [edi + (64*0 + 32)*4]
		fsub	dword [edi + (64*7 + 32)*4]
		fmul	dword [enwindow + (8*8*4 + 0*4)*4]	; *= enwindow[8][0][0]
		faddp	st1,st0
		fld		dword [edi + (64*1 + 32)*4]
		fsub	dword [edi + (64*6 + 32)*4]
		fmul	dword [enwindow + (8*8*4 + 1*4)*4]	; *= enwindow[8][1][0]
		faddp	st1,st0
		fld		dword [edi + (64*2 + 32)*4]
		fsub	dword [edi + (64*5 + 32)*4]
		fmul	dword [enwindow + (8*8*4 + 2*4)*4]	; *= enwindow[8][2][0]
		faddp	st1,st0
		fld		dword [edi + (64*3 + 32)*4]
		fsub	dword [edi + (64*4 + 32)*4]
		fmul	dword [enwindow + (8*8*4 + 3*4)*4]	; *= enwindow[8][3][0]
		faddp	st1,st0
		fstp	dword [yyy + 16*4]

;               yprime[ 0] = c[64*0+16]*enwindow[4][0][0]
;                     + c[64*1+16]*enwindow[4][1][0]
;                     + c[64*2+16]*enwindow[4][2][0]
;                     + c[64*3+16]*enwindow[4][3][0]
;                     + c[64*4+16]*enwindow[4][4][0]
;                     + c[64*5+16]*enwindow[4][5][0]
;                     + c[64*6+16]*enwindow[4][6][0]
;                     + c[64*7+16]*enwindow[4][7][0];
		fld		dword [edi + (64*0 + 16)*4]
		fmul	dword [enwindow + (4*8*4 + 0*4)*4]	; *= enwindow[4][0][0]
		fld		dword [edi + (64*1 + 16)*4]
		fmul	dword [enwindow + (4*8*4 + 1*4)*4]	; *= enwindow[4][1][0]
		faddp	st1,st0
		fld		dword [edi + (64*2 + 16)*4]
		fmul	dword [enwindow + (4*8*4 + 2*4)*4]	; *= enwindow[4][2][0]
		faddp	st1,st0
		fld		dword [edi + (64*3 + 16)*4]
		fmul	dword [enwindow + (4*8*4 + 3*4)*4]	; *= enwindow[4][3][0]
		faddp	st1,st0
		fld		dword [edi + (64*4 + 16)*4]
		fmul	dword [enwindow + (4*8*4 + 4*4)*4]	; *= enwindow[4][4][0]
		faddp	st1,st0
		fld		dword [edi + (64*5 + 16)*4]
		fmul	dword [enwindow + (4*8*4 + 5*4)*4]	; *= enwindow[4][5][0]
		faddp	st1,st0
		fld		dword [edi + (64*6 + 16)*4]
		fmul	dword [enwindow + (4*8*4 + 6*4)*4]	; *= enwindow[4][6][0]
		faddp	st1,st0
		fld		dword [edi + (64*7 + 16)*4]
		fmul	dword [enwindow + (4*8*4 + 7*4)*4]	; *= enwindow[4][7][0]
		faddp	st1,st0
		fstp	dword [yyy + 0*4]

; The C loop below is executed counting down: i = 15 .. 1.
		mov		ecx,15		; = i
		mov		edx,-15		; = -i
		jmp		short .f2	; skip the alignment padding

;               for(i = 1; i < 16; i++){
;                       register float  *cp, *cm;
;                       cp = &c[i]; cm = &c[-i];

		align	16
.f2:
.lp2:
; Build the element index (i/4)*32 + (i%4) in eax: after the shift the
; two btr/rcl pairs move bits 1..0 of i to bits 1..0 and leave bits
; 3..2 of i in bits 6..5 (btr copies the bit to CF and clears it, rcl
; rotates CF in at the bottom).
		mov		eax,ecx
		shl		eax,1
		btr		eax,2
		rcl		eax,1
		btr		eax,2
		rcl		eax,1
		lea		eax,[eax*4 + enwindow]	; = &enwindow[i/4  ][0][i%4]

;                       a  =cp[64*0     ]*enwindow[i/4  ][0][i%4];
;                       b  =cm[64*7 + 64]*enwindow[i/4  ][0][i%4]; // reversed (cm = &c[-i])
;                       a -=cm[64*7 + 32]*enwindow[i/4+8][0][i%4]; // reversed (cm = &c[-i])
;                       b +=cp[64*0 + 32]*enwindow[i/4+8][0][i%4];
		fld		dword [edi + ecx*4 + (64*0 +  0)*4]
		fld		dword [eax + (0*8*4 + 0*4)*4]
		fmul	st1,st0							; st1 = a
		fmul	dword [edi + edx*4 + (64*7 + 64)*4]	; st0 = b

		fld		dword [edi + edx*4 + (64*7 + 32)*4]
		fld		dword [eax + (8*8*4 + 0*4)*4]
		fmul	st1,st0
		fmul	dword [edi + ecx*4 + (64*0 + 32)*4]

		fxch
		fsubp	st3,st0							; a -= ...
		faddp	st1,st0							; b += ...   -> st0 = b, st1 = a

; window_filter k1,k2 : accumulate one more tap pair into a and b.
;   Entry/exit x87 stack: st0 = b, st1 = a.
;   a += cp[64*k1     ]*enwindow[i/4  ][k1][i%4]
;   b += cm[64*k2 + 64]*enwindow[i/4  ][k1][i%4]
;   a -= cm[64*k2 + 32]*enwindow[i/4+8][k1][i%4]
;   b += cp[64*k1 + 32]*enwindow[i/4+8][k1][i%4]
;   Uses edi (= c), ecx (= i), edx (= -i), eax (= &enwindow[i/4][0][i%4]).
%macro	window_filter	2
		fld		dword [edi + ecx*4 + (64*%1 +  0)*4]
		fld		dword [eax + (0*8*4 + %1*4)*4]
		fmul	st1,st0
		fmul	dword [edi + edx*4 + (64*%2 + 64)*4]

		fxch
		faddp	st3,st0
		faddp	st1,st0

		fld		dword [edi + edx*4 + (64*%2 + 32)*4]
		fld		dword [eax + (8*8*4 + %1*4)*4]
		fmul	st1,st0
		fmul	dword [edi + ecx*4 + (64*%1 + 32)*4]

		fxch
		fsubp	st3,st0
		faddp	st1,st0
%endmacro
		window_filter	7,0
		window_filter	1,6
		window_filter	6,1
		window_filter	2,5
		window_filter	5,2
		window_filter	3,4
		window_filter	4,3

		fxch
		fstp	dword [yyy + 16*4 + edx*4]	; yprime[16-i] = a;
		fstp	dword [yyy + 16*4 + ecx*4]	; yprime[i+16] = b;
		inc		edx
		dec		ecx
		jnz		near .lp2
;               }
		sub		edi,32*4			; c -= 32

;       for( i=0; i<16; i++ ){
;               s0 = s1 = 0.0;
;               for( j=0; j<32; j+=2 ){
;                       s0 += (*m)[i+0][j  ]*yprime[j+0];
;                       s1 += (*m)[i+0][j+1]*yprime[j+1];
;               }
;               xout[i+ 0] = s0+s1;
;               xout[31-i] = s0-s1;
;       }
		xor		ecx,ecx
		mov		cl,16				; ecx = 16 rows left
		mov		edx,idct_coefficient	; edx = current row of 32 coefficients
		jmp		short .lp14			; skip the alignment padding

		align	16
.lp14:
		fld		dword [edx+ 0*4]
		fmul	dword [yyy+ 0*4]	; s0 = row[0]*yprime[0]
		xor		eax,eax
		fld		dword [edx+ 1*4]
		fmul	dword [yyy+ 1*4]	; s1 = row[1]*yprime[1]
		mov		al,1				; eax = pair index, pairs 1..15 follow
		jmp		short .lp140		; skip the alignment padding

		align	16
.lp140:
		fld		dword [edx+eax*8+ 0*4]
		fmul	dword [yyy+eax*8+ 0*4]
		fld		dword [edx+eax*8+ 1*4]
		fmul	dword [yyy+eax*8+ 1*4]
		inc		eax
		cmp		al,16				; flags survive the x87 ops below
		fxch
		faddp	st3,st0				; s0 += even product
		faddp	st1,st0				; s1 += odd product  -> st0 = s1, st1 = s0
		jl		short .lp140

		add		edx,32*4			; next coefficient row
		fld		st1
		fxch
		fadd	st2,st0				; st2 = s0+s1
		fsubp	st1,st0				; st0 = s0-s1, st1 = s0+s1

		fxch
		fstp	dword [ebp]			; xout[i] = s0+s1
		add		ebp,4
		dec		ecx					; sets ZF for the jnz; fstp leaves flags alone
		fstp	dword [ebp+ecx*8]	; xout[31-i] = s0-s1
		jnz		near .lp14

		add		ebp,16*4			; ebp advanced 16 floats above; skip the mirrored half
		dec		esi
		jnz		near .lp1

.exit:
; free area for yyy[32]
		add		esp,32*4
%undef yyy
		pop		ebp
		pop		edi
		pop		esi
		pop		ebx
		ret

		end
