Source: audio_blocks/include/fft_3dnow.h
#ifndef FFT_3DNOW_H
#define FFT_3DNOW_H
#include <math.h>
/* Builds the tables used by fft_3dnow(): bits[] holds the base-4
   digit-reversal permutation of the 1<<M input indices, and w[] receives the
   concatenated cos/sin twiddle tables for every stage. */
void fft_initCosSinTables_3dnow(complex<float> *w, int *bits, int M)
{
   int i, j;
   int tmp;
   int size = 1 << M;
   /* Digit reversal: reverse the index two bits (one base-4 digit) at a time. */
   for (i = 0; i < size; i++)
   {
      bits[i] = 0;
      tmp = i;
      for (j = 0; j < M; j += 2)
      {
         bits[i] <<= 2;
         bits[i] += tmp & 3;
         tmp >>= 2;
      }
   }
   /* Twiddles: for each stage size, append W^k for half the points, then
      W^2k and W^3k for a quarter of the points each, with W = exp(-j*2*pi/size). */
   while (size)
   {
      int k;
      float tmp, p = (2.0 * M_PI) / size;
      complex<float> tmp2;
      for (k = 0; k < (size >> 1); k++) {
         tmp = k * p;
         tmp2.re = cos(tmp);
         tmp2.im = -sin(tmp);
         *w++ = tmp2;
      }
      for (k = 0; k < (size >> 2); k++) {
         tmp = 2 * k * p;
         tmp2.re = cos(tmp);
         tmp2.im = -sin(tmp);
         *w++ = tmp2;
      }
      for (k = 0; k < (size >> 2); k++) {
         tmp = 3 * k * p;
         tmp2.re = cos(tmp);
         tmp2.im = -sin(tmp);
         *w++ = tmp2;
      }
      size >>= 1;
   }
}
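/* Layout of the tables built above (a descriptive note, not from the original
   source): for each stage size N = 1<<M, 1<<(M-1), ..., 1 the loop appends
      W_N^k      for k = 0 .. N/2-1,
      W_N^(2k)   for k = 0 .. N/4-1,
      W_N^(3k)   for k = 0 .. N/4-1,
   where W_N = exp(-j*2*pi/N) is stored as (re = cos, im = -sin), and bits[i]
   holds i with its base-4 digits reversed (e.g. for M = 4, bits[6] = 9, since
   the base-4 digits (1,2) of 6 become (2,1)).  fft_3dnow() indexes this table
   from the end back toward the start, one pass at a time. */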
/* Sign mask loaded into mm7: the 0x80000000 pattern sits in the low dword, so
   a pxor with this mask flips the sign of the first float of a packed pair.
   Combined with pswapd this multiplies a complex value by j in the butterflies. */
class _negmask {
   int a;
   int b;
public:
   _negmask() : a(0x80000000), b(0x00000000) {}
};
/* Complex FFT using AMD 3DNow! arithmetic (pfadd/pfsub/pfmul) plus the
   extended 3DNow! instructions pswapd and pfpnacc.  Reads in[], writes the
   transform to _x[]; _w and bits are the tables built by
   fft_initCosSinTables_3dnow().  Each asm block below is a single multi-line
   string literal, which relies on an old GCC extension. */
inline void fft_3dnow(complex<float> *in, complex<float> *_x, int _M, complex<float> *_w, int *bits)
{
   complex<float> *w = _w + (1<<(_M+1)) - 2 - 6;   /* start near the end of the twiddle table */
   _negmask mask;
   int rep = 1 << (_M-2);
   /* First pass: radix-4 butterflies over digit-reversed input,
      two butterflies (eight complex values) per loop iteration. */
__asm__ __volatile__ (
"
push %0
push %1
push %3
push %4
movq %2, %%mm7
.align 16
.loop%=:
mov (%4), %%edx
movq (%3,%%edx,8), %%mm0
mov 8(%4), %%edx
movq (%3,%%edx,8), %%mm1
movq %%mm0, %%mm4
pfadd %%mm1, %%mm0
pfsub %%mm1, %%mm4
mov 4(%4), %%edx
movq (%3,%%edx,8), %%mm2
mov 12(%4), %%edx
movq (%3,%%edx,8), %%mm3
movq %%mm2, %%mm5
pfadd %%mm3, %%mm2
mov 16(%4), %%edx
pfsub %%mm3, %%mm5
movq %%mm0, %%mm1
pfsub %%mm2, %%mm0
pfadd %%mm2, %%mm1
pswapd %%mm5, %%mm5
movq %%mm0, 16(%0)
movq %%mm1, (%0)
movq (%3,%%edx,8), %%mm0
pxor %%mm7, %%mm5
movq %%mm4, %%mm6
mov 24(%4), %%edx
pfsub %%mm5, %%mm4
pfadd %%mm5, %%mm6
movq (%3,%%edx,8), %%mm1
movq %%mm4, 8(%0)
movq %%mm6, 24(%0)
mov 20(%4), %%edx
movq (%3,%%edx,8), %%mm2
mov 28(%4), %%edx
movq (%3,%%edx,8), %%mm3
movq %%mm0, %%mm4
pfadd %%mm1, %%mm0
movq %%mm2, %%mm5
pfadd %%mm3, %%mm2
pfsub %%mm1, %%mm4
pfsub %%mm3, %%mm5
movq %%mm0, %%mm1
pfsub %%mm2, %%mm0
pfadd %%mm2, %%mm1
pswapd %%mm5, %%mm5
movq %%mm0, 48(%0)
movq %%mm1, 32(%0)
pxor %%mm7, %%mm5
movq %%mm4, %%mm6
pfsub %%mm5, %%mm4
pfadd %%mm5, %%mm6
movq %%mm6, 56(%0)
movq %%mm4, 40(%0)
add $64, %0
add $32, %4
dec %1
jne .loop%=
pop %4
pop %3
pop %1
pop %0
"
: : "r" (_x), "q" (rep>>1), "m" (mask), "r" (in), "r" (bits)
: "edx", "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)");
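/* For reference, a scalar sketch of the butterfly the asm pass above performs
   (illustrative only; the helper below is not part of the original source, and
   the asm interleaves two such butterflies per loop iteration):

   static void radix4_butterfly_scalar(const complex<float> *in, const int *bits,
                                       complex<float> *out)
   {
      complex<float> a = in[bits[0]], b = in[bits[1]];
      complex<float> c = in[bits[2]], d = in[bits[3]];
      complex<float> es, ed, os, od, odj;
      es.re = a.re + c.re;  es.im = a.im + c.im;   // even sum
      ed.re = a.re - c.re;  ed.im = a.im - c.im;   // even difference
      os.re = b.re + d.re;  os.im = b.im + d.im;   // odd sum
      od.re = b.re - d.re;  od.im = b.im - d.im;   // odd difference
      odj.re = -od.im;      odj.im = od.re;        // pswapd + pxor == multiply by j
      out[0].re = es.re + os.re;   out[0].im = es.im + os.im;
      out[1].re = ed.re - odj.re;  out[1].im = ed.im - odj.im;
      out[2].re = es.re - os.re;   out[2].im = es.im - os.im;
      out[3].re = ed.re + odj.re;  out[3].im = ed.im + odj.im;
   }
*/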
   /* Remaining passes: each pass combines four blocks of length N/4 into one
      block of length N (a radix-4 step, i.e. two FFT stages at a time). */
   for (int M = 4; M <= _M; M += 2)
   {
      complex<float> *x = _x;
      rep >>= 2;
      int repeat = rep;
      int mul = repeat;   /* unused here */
      int N = 1 << M;
      int N2 = N >> 1;
      int N4 = N >> 2;
      w -= (N + N2);      /* step back to this pass's twiddle tables (W^k, W^2k, W^3k) */
      //cerr << "M = " << M << "\tN = " << N << "\twoff = " << w-_w << "\trepeat = " << repeat << endl;
      while (repeat--)
      {
         {
         /* One length-N block: the k = 0 butterfly (twiddles equal to 1) is
            done before the asm loop, which then performs the remaining N/4-1
            twiddled butterflies. */
__asm__ __volatile__ (
"
push %0
push %1
push %6
movq %5, %%mm7
movq (%0), %%mm0
movq (%3), %%mm1
movq (%0,%2,8), %%mm2
movq (%3,%2,8), %%mm3
movq %%mm0, %%mm4 ;//x0
movq %%mm1, %%mm5 ;//x1
pfadd %%mm2, %%mm0 ;//es
pfadd %%mm3, %%mm1 ;//os
pfsub %%mm2, %%mm4 ;//ed
pfsub %%mm3, %%mm5 ;//od
pswapd %%mm5, %%mm5 ;//od'
pxor %%mm7, %%mm5 ;//od'
movq %%mm0, %%mm2 ;//es
movq %%mm4, %%mm3 ;//ed
pfadd %%mm1, %%mm0 ;//x0
pfadd %%mm5, %%mm4 ;//x3
pfsub %%mm1, %%mm2 ;//x2
pfsub %%mm5, %%mm3 ;//x1
movq %%mm0, (%0)
movq %%mm4, (%3,%2,8)
movq %%mm2, (%0,%2,8)
movq %%mm3, (%3)
.align 16
loop%=:
add $8, %3
movq (%3), %%mm1
add $8, %1
movq (%1), %%mm4
pswapd %%mm4, %%mm0
add $8, %4
add $8, %0
movq (%4), %%mm5
movq (%0,%2,8), %%mm2
movq (%3,%2,8), %%mm3
pfmul %%mm1, %%mm4
pfmul %%mm0, %%mm1
movq (%4,%2,4), %%mm6
pswapd %%mm5, %%mm7
pswapd %%mm6, %%mm0
pfmul %%mm2, %%mm5
pfmul %%mm7, %%mm2
pfmul %%mm3, %%mm6
pfmul %%mm0, %%mm3
pfpnacc %%mm1, %%mm4
pfpnacc %%mm2, %%mm5
pfpnacc %%mm3, %%mm6
movq (%0), %%mm0
movq %5, %%mm7
//1-4 2-5 3-6
movq %%mm0, %%mm1 ;//x0
movq %%mm4, %%mm2 ;//x1
pfadd %%mm5, %%mm0 ;//es
pfadd %%mm6, %%mm4 ;//os
pfsub %%mm5, %%mm1 ;//ed
pfsub %%mm6, %%mm2 ;//od
pswapd %%mm2, %%mm2 ;//od'
pxor %%mm7, %%mm2 ;//od'
pfadd %%mm4, %%mm0 ;//x0
pfsub %%mm4, %%mm5 ;//x2
pfadd %%mm2, %%mm1 ;//x3
pfsub %%mm2, %%mm6 ;//x1
movq %%mm0, (%0)
movq %%mm5, (%0,%2,8)
movq %%mm1, (%3,%2,8)
movq %%mm6, (%3)
dec %6
jne loop%=
pop %6
pop %1
pop %0
"
: : "r" (x), "r" (w), "q" (N2), "q" (x+N4), "q" (w+N2), "m" (mask), "q" (N4-1)
: "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)");
         }
         x += N;
      }
   }
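   /* The commented-out block below (note the 0&& in its condition) appears
      intended to supply the final radix-2 combine that an odd _M would need,
      since the loop above only advances two stages at a time. */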
/*
if (0&&M&1)
{
//recurs_fft(x, M-1, w+N, repeat<<1);
while (repeat--)
{
__asm__ __volatile__ (
"
push %0
push %1
push %2
push %3
.loop%=:
movq (%0), %%mm0
movq 8(%0), %%mm4
pswapd %%mm0, %%mm2
pswapd %%mm4, %%mm6
movq (%1), %%mm1
movq 8(%1), %%mm5
pfmul %%mm1, %%mm0
pfmul %%mm1, %%mm2
pfmul %%mm5, %%mm4
pfmul %%mm5, %%mm6
pfpnacc %%mm2, %%mm0
pfpnacc %%mm6, %%mm4
movq (%2), %%mm3
movq 8(%2), %%mm7
movq %%mm3, %%mm1
movq %%mm7, %%mm5
pfsub %%mm0, %%mm3
pfadd %%mm0, %%mm1
pfsub %%mm4, %%mm7
pfadd %%mm4, %%mm5
movq %%mm1, (%2)
movq %%mm5, 8(%2)
add $16, %1
add $16, %2
movq %%mm3, (%0)
movq %%mm7, 8(%0)
add $16, %0
dec %3
jne .loop%=
pop %3
pop %2
pop %1
pop %0
"
: "=r" (dummy1), "=r" (dummy2), "=r" (dummy3), "=q" (dummy4) : "0" (x+N2), "1" (w), "2" (x), "3" (N2>>1)
: "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)");
}
}*/
   /* femms clears the MMX/3DNow! register state so ordinary FPU code can follow. */
   __asm__ __volatile__ ("femms" : : : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)");
}
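/* A minimal usage sketch (not part of the original source).  It assumes the
   project's complex<float> type with .re/.im members, an even _M of at least 4,
   and a CPU with the extended 3DNow! instruction set (e.g. an AMD Athlon);
   the twiddle buffer is sized generously rather than exactly.

   const int M = 10;                                       // 1024-point transform
   const int size = 1 << M;
   complex<float> *in  = new complex<float>[size];         // input
   complex<float> *out = new complex<float>[size];         // transform output
   complex<float> *w   = new complex<float>[1 << (M+1)];   // twiddle tables
   int *bits = new int[size];                              // digit-reversal table

   fft_initCosSinTables_3dnow(w, bits, M);
   // ... fill in[k].re / in[k].im ...
   fft_3dnow(in, out, M, w, bits);
*/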
#endif