#include <math.h>
#include <complex.h>
#include <stdio.h>
#include <stdlib.h>
#include <xmmintrin.h>
typedef complex float data_t;
#define W(N,k) (cexp(-2.0f * M_PI * I * (float)(k) / (float)(N)))
data_t **LUT1;
data_t **LUT3;
#include "vecmath.h"
void splitfft(data_t *in, data_t *out,
int log2stride, int stride, int N) {
if(N == 1) {
out[0] = in[0];
}else if(N == 2) {
out[0] = in[0] + in[stride];
out[1] = in[0] - in[stride];
}else if(N == 4) {
splitfft(in, out, log2stride+1, stride << 1, N >> 1);
splitfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);
splitfft(in+3*stride, out+3*N/4, log2stride+2, stride << 2, N >> 2);
data_t temp0 = out[0] + (out[2] + out[3]);
data_t temp1 = out[0] - (out[2] + out[3]);
data_t temp2 = out[1] - I*(out[2] - out[3]);
data_t temp3 = out[1] + I*(out[2] - out[3]);
if(log2stride) {
out[0] = creal(temp0) + creal(temp2)*I;
out[1] = creal(temp1) + creal(temp3)*I;
out[2] = cimag(temp0) + cimag(temp2)*I;
out[3] = cimag(temp1) + cimag(temp3)*I;
}else{
out[0] = temp0;
out[2] = temp1;
out[1] = temp2;
out[3] = temp3;
}
}else if(N == 8) {
splitfft(in, out, log2stride+1, stride << 1, N >> 1);
splitfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);
splitfft(in+3*stride, out+3*N/4, log2stride+2, stride << 2, N >> 2);
data_t o[8];
{
data_t Uk = creal(out[0]) + creal(out[2])*I;
data_t Zk = out[4];
data_t Uk2 = creal(out[1]) + creal(out[3])*I;
data_t Zdk = out[6];
o[0] = Uk + (Zk + Zdk);
o[4] = Uk - (Zk + Zdk);
o[2] = Uk2 - I*(Zk - Zdk);
o[6] = Uk2 + I*(Zk - Zdk);
}
{
data_t Uk = cimag(out[0]) + cimag(out[2])*I;
data_t Zk = out[5];
data_t Uk2 = cimag(out[1]) + cimag(out[3])*I;
data_t Zdk = out[7];
data_t w1 = LUT1[log2stride][1];
data_t w3 = LUT3[log2stride][1];
o[1] = Uk + (w1*Zk + w3*Zdk);
o[5] = Uk - (w1*Zk + w3*Zdk);
o[3] = Uk2 - I*(w1*Zk - w3*Zdk);
o[7] = Uk2 + I*(w1*Zk - w3*Zdk);
}
if(log2stride) {
out[0] = creal(o[0]) + creal(o[1])*I;
out[1] = creal(o[2]) + creal(o[3])*I;
out[2] = cimag(o[0]) + cimag(o[1])*I;
out[3] = cimag(o[2]) + cimag(o[3])*I;
out[4] = creal(o[4]) + creal(o[5])*I;
out[5] = creal(o[6]) + creal(o[7])*I;
out[6] = cimag(o[4]) + cimag(o[5])*I;
out[7] = cimag(o[6]) + cimag(o[7])*I;
}else{
int i;
for(i=0;i<8;i++) out[i] = o[i];
}
}else if(!log2stride){
splitfft(in, out, log2stride+1, stride << 1, N >> 1);
splitfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);
splitfft(in+3*stride, out+3*N/4, log2stride+2, stride << 2, N >> 2);
int k;
for(k=0;k<N/4;k+=4) {
reg_t Uk = LOAD((float *)&out[k]);
reg_t Zk = LOAD((float *)&out[k+N/2]);
reg_t Uk2 = LOAD((float *)&out[k+N/4]);
reg_t Zdk = LOAD((float *)&out[k+3*N/4]);
reg_t w1 = LOAD((float *)&LUT1[log2stride][k]);
reg_t w3 = LOAD((float *)&LUT3[log2stride][k]);
reg_t w3Zdk = MUL(w3, Zdk);
reg_t w1Zk = MUL(w1, Zk);
reg_t sum = ADD(w1Zk, w3Zdk);
reg_t dif = SUB(w1Zk, w3Zdk);
STOREIL((float *)&out[k], ADD(Uk, sum));
STOREIL((float *)&out[k+N/2], SUB(Uk, sum));
STOREIL((float *)&out[k+N/4], SUB_I(Uk2, dif));
STOREIL((float *)&out[k+3*N/4], ADD_I(Uk2, dif));
}
}else{
splitfft(in, out, log2stride+1, stride << 1, N >> 1);
splitfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);
splitfft(in+3*stride, out+3*N/4, log2stride+2, stride << 2, N >> 2);
int k;
for(k=0;k<N/4;k+=4) {
reg_t Uk = LOAD((float *)&out[k]);
reg_t Zk = LOAD((float *)&out[k+N/2]);
reg_t Uk2 = LOAD((float *)&out[k+N/4]);
reg_t Zdk = LOAD((float *)&out[k+3*N/4]);
reg_t w1 = LOAD((float *)&LUT1[log2stride][k]);
reg_t w3 = LOAD((float *)&LUT3[log2stride][k]);
reg_t w3Zdk = MUL(w3, Zdk);
reg_t w1Zk = MUL(w1, Zk);
reg_t sum = ADD(w1Zk, w3Zdk);
reg_t dif = SUB(w1Zk, w3Zdk);
STORE((float *)&out[k], ADD(Uk, sum));
STORE((float *)&out[k+N/2], SUB(Uk, sum));
STORE((float *)&out[k+N/4], SUB_I(Uk2, dif));
STORE((float *)&out[k+3*N/4], ADD_I(Uk2, dif));
}
}
}
void fft_init(int N) {
#define log2(x) ((int)(log(x)/log(2)))
int n_luts = log2(N)-1;
LUT1 = malloc(n_luts * sizeof(data_t *));
LUT3 = malloc(n_luts * sizeof(data_t *));
int i;
for(i=0;i<n_luts;i++) {
int n = N / pow(2,i);
LUT1[i] = _mm_malloc(n/4 * sizeof(data_t),16);
LUT3[i] = _mm_malloc(n/4 * sizeof(data_t),16);
if(n == 8) {
int j;
for(j=0;j<n/4;j++) {
LUT1[i][j] = W(n,j);
LUT3[i][j] = W(n,3*j);
}
}else{
int j;
for(j=0;j<n/4;j+=4) {
data_t w1[4], w3[4];
int k;
for(k=0;k<4;k++) w1[k] = W(n,j+k);
for(k=0;k<4;k++) w3[k] = W(n,3*(j+k));
LUT1[i][j] = creal(w1[0]) + creal(w1[1])*I;
LUT1[i][j+1] = creal(w1[2]) + creal(w1[3])*I;
LUT1[i][j+2] = cimag(w1[0]) + cimag(w1[1])*I;
LUT1[i][j+3] = cimag(w1[2]) + cimag(w1[3])*I;
LUT3[i][j] = creal(w3[0]) + creal(w3[1])*I;
LUT3[i][j+1] = creal(w3[2]) + creal(w3[3])*I;
LUT3[i][j+2] = cimag(w3[0]) + cimag(w3[1])*I;
LUT3[i][j+3] = cimag(w3[2]) + cimag(w3[3])*I;
}
}
}
}
|