# OpenStax-CNX

You are here: Home » Content » Computing the fast Fourier transform on SIMD microprocessors » Appendix 3 - FFTs with vectorized loops

### Recently Viewed

This feature requires Javascript to be enabled.

Inside Collection (Ph.D. thesis):

Thesis by: Anthony Blake. E-mail the author

# Appendix 3 - FFTs with vectorized loops

Module by: Anthony Blake. E-mail the author

This Appendix contains source code listings corresponding to the vectorized FFT implementations in Implementation details.

 #include  #include  #include  #include  #include    typedef complex float data_t;   #define W(N,k) (cexp(-2.0f * M_PI * I * (float)(k) / (float)(N)))   data_t **LUT;   void ditfft2(data_t *in, data_t *out, int log2stride, int stride, int N) { if(N == 2) { out[0]   = in[0] + in[stride]; out[N/2] = in[0] - in[stride]; }else if(N == 4){ ditfft2(in, out, log2stride+1, stride << 1, N >> 1); ditfft2(in+stride, out+N/2, log2stride+1, stride << 1, N >> 1);     data_t temp0 = out[0] + out[2];     data_t temp1 = out[0] - out[2];     data_t temp2 = out[1] - I*out[3];     data_t temp3 = out[1] + I*out[3];     if(log2stride) { out[0] = creal(temp0) + creal(temp2)*I; out[1] = creal(temp1) + creal(temp3)*I; out[2] = cimag(temp0) + cimag(temp2)*I; out[3] = cimag(temp1) + cimag(temp3)*I; }else{ out[0] = temp0; out[2] = temp1; out[1] = temp2; out[3] = temp3; } }else if(!log2stride){ ditfft2(in, out, log2stride+1, stride << 1, N >> 1); ditfft2(in+stride, out+N/2, log2stride+1, stride << 1, N >> 1);   int k; for(k=0;k> 1); ditfft2(in+stride, out+N/2, log2stride+1, stride << 1, N >> 1); int k; for(k=0;k
 typedef struct _reg_t { __m128 re, im; } reg_t;   static inline reg_t MUL(reg_t a, reg_t b) { reg_t r; r.re = _mm_sub_ps(_mm_mul_ps(a.re,b.re),_mm_mul_ps(a.im,b.im)); r.im = _mm_add_ps(_mm_mul_ps(a.re,b.im),_mm_mul_ps(a.im,b.re)); return r; } static inline reg_t MULJ(reg_t a, reg_t b) { reg_t r; r.re = _mm_add_ps(_mm_mul_ps(a.re,b.re),_mm_mul_ps(a.im,b.im)); r.im = _mm_sub_ps(_mm_mul_ps(a.im,b.re),_mm_mul_ps(a.re,b.im)); return r; }   static inline reg_t ADD(reg_t a, reg_t b) { reg_t r; r.re = _mm_add_ps(a.re,b.re); r.im = _mm_add_ps(a.im,b.im); return r; } static inline reg_t SUB(reg_t a, reg_t b) { reg_t r; r.re = _mm_sub_ps(a.re,b.re); r.im = _mm_sub_ps(a.im,b.im); return r; } static inline reg_t ADD_I(reg_t a, reg_t b) { reg_t r; r.re = _mm_sub_ps(a.re,b.im); r.im = _mm_add_ps(a.im,b.re); return r; } static inline reg_t SUB_I(reg_t a, reg_t b) { reg_t r; r.re = _mm_add_ps(a.re,b.im); r.im = _mm_sub_ps(a.im,b.re); return r; }   static inline reg_t LOAD(float *a) { reg_t r; r.re = _mm_load_ps(a); r.im = _mm_load_ps(a+4); return r; } static inline void STORE(float *a, reg_t r) { _mm_store_ps(a, r.re); _mm_store_ps(a+4, r.im); } static inline void STOREIL(float *a, reg_t r) { _mm_store_ps(a, _mm_unpacklo_ps(r.re, r.im)); _mm_store_ps(a+4, _mm_unpackhi_ps(r.re, r.im)); } 
 #include  #include  #include  #include  #include    typedef complex float data_t;   #define W(N,k) (cexp(-2.0f * M_PI * I * (float)(k) / (float)(N))) data_t **LUT1; data_t **LUT3;   #include "vecmath.h"   void splitfft(data_t *in, data_t *out, int log2stride, int stride, int N) { if(N == 1) { out[0] = in[0]; }else if(N == 2) { out[0] = in[0] + in[stride]; out[1] = in[0] - in[stride]; }else if(N == 4) { splitfft(in, out, log2stride+1, stride << 1, N >> 1); splitfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);   splitfft(in+3*stride, out+3*N/4, log2stride+2, stride << 2, N >> 2); data_t temp0 = out[0]  + (out[2] + out[3]); data_t temp1 = out[0]  - (out[2] + out[3]); data_t temp2 = out[1] - I*(out[2] - out[3]); data_t temp3 = out[1] + I*(out[2] - out[3]);     if(log2stride) {    out[0] = creal(temp0) + creal(temp2)*I;    out[1] = creal(temp1) + creal(temp3)*I;    out[2] = cimag(temp0) + cimag(temp2)*I;    out[3] = cimag(temp1) + cimag(temp3)*I;    }else{ out[0] = temp0; out[2] = temp1; out[1] = temp2; out[3] = temp3;    } }else if(N == 8) { splitfft(in, out, log2stride+1, stride << 1, N >> 1); splitfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);   splitfft(in+3*stride, out+3*N/4, log2stride+2, stride << 2, N >> 2); data_t o[8]; { data_t Uk  = creal(out[0]) + creal(out[2])*I; data_t Zk  = out[4]; data_t Uk2  = creal(out[1]) + creal(out[3])*I; data_t Zdk = out[6]; o[0] = Uk  + (Zk + Zdk); o[4] = Uk  - (Zk + Zdk); o[2] = Uk2 - I*(Zk - Zdk); o[6] = Uk2 + I*(Zk - Zdk); } { data_t Uk = cimag(out[0]) + cimag(out[2])*I; data_t Zk  = out[5]; data_t Uk2 = cimag(out[1]) + cimag(out[3])*I; data_t Zdk = out[7]; data_t w1 = LUT1[log2stride][1]; data_t w3 = LUT3[log2stride][1]; o[1] = Uk  + (w1*Zk + w3*Zdk); o[5] = Uk  - (w1*Zk + w3*Zdk); o[3] = Uk2 - I*(w1*Zk - w3*Zdk); o[7] = Uk2 + I*(w1*Zk - w3*Zdk); } if(log2stride) { out[0] = creal(o[0]) + creal(o[1])*I; out[1] = creal(o[2]) + creal(o[3])*I; out[2] = cimag(o[0]) + cimag(o[1])*I; out[3] = cimag(o[2]) + cimag(o[3])*I; out[4] = creal(o[4]) + creal(o[5])*I; out[5] = creal(o[6]) + creal(o[7])*I; out[6] = cimag(o[4]) + cimag(o[5])*I; out[7] = cimag(o[6]) + cimag(o[7])*I; }else{ int i; for(i=0;i<8;i++) out[i] = o[i]; } }else if(!log2stride){   splitfft(in, out, log2stride+1, stride << 1, N >> 1); splitfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);   splitfft(in+3*stride, out+3*N/4, log2stride+2, stride << 2, N >> 2); int k; for(k=0;k> 1); splitfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);   splitfft(in+3*stride, out+3*N/4, log2stride+2, stride << 2, N >> 2); int k; for(k=0;k
 #include  #include  #include  #include  #include    typedef complex float data_t;   #define W(N,k) (cexp(-2.0f * M_PI * I * (float)(k) / (float)(N))) data_t **LUT1;   #include "vecmath.h"   data_t *base; int TN;   void conjfft(data_t *in, data_t *out, int log2stride, int stride, int N) { if(N == 1) { if(in < base) in += TN; out[0] = in[0]; }else if(N == 2) { data_t *i0 = in, *i1 = in + stride; if(i0 < base) i0 += TN; if(i1 < base) i1 += TN; out[0]   = *i0 + *i1; out[N/2] = *i0 - *i1; }else if(N == 4) { conjfft(in, out, log2stride+1, stride << 1, N >> 1); conjfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);   conjfft(in-stride, out+3*N/4, log2stride+2, stride << 2, N >> 2); data_t temp0 = out[0]  + (out[2] + out[3]); data_t temp1 = out[0]  - (out[2] + out[3]); data_t temp2 = out[1] - I*(out[2] - out[3]); data_t temp3 = out[1] + I*(out[2] - out[3]);     if(log2stride) {    out[0] = creal(temp0) + creal(temp2)*I;    out[1] = creal(temp1) + creal(temp3)*I;    out[2] = cimag(temp0) + cimag(temp2)*I;    out[3] = cimag(temp1) + cimag(temp3)*I;    }else{ out[0] = temp0; out[2] = temp1; out[1] = temp2; out[3] = temp3;    } }else if(N == 8) { conjfft(in, out, log2stride+1, stride << 1, N >> 1); conjfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);   conjfft(in-stride, out+3*N/4, log2stride+2, stride << 2, N >> 2); data_t o[8]; { data_t Uk  = creal(out[0]) + creal(out[2])*I; data_t Zk  = out[4]; data_t Uk2  = creal(out[1]) + creal(out[3])*I; data_t Zdk = out[6]; o[0] = Uk  + (Zk + Zdk); o[4] = Uk  - (Zk + Zdk); o[2] = Uk2 - I*(Zk - Zdk); o[6] = Uk2 + I*(Zk - Zdk); } { data_t Uk = cimag(out[0]) + cimag(out[2])*I; data_t Zk  = out[5]; data_t Uk2 = cimag(out[1]) + cimag(out[3])*I; data_t Zdk = out[7]; data_t w1 = LUT1[log2stride][1]; o[1] = Uk  + (w1*Zk + conj(w1)*Zdk); o[5] = Uk  - (w1*Zk + conj(w1)*Zdk); o[3] = Uk2 - I*(w1*Zk - conj(w1)*Zdk); o[7] = Uk2 + I*(w1*Zk - conj(w1)*Zdk); } if(log2stride) { out[0] = creal(o[0]) + creal(o[1])*I; out[1] = creal(o[2]) + creal(o[3])*I; out[2] = cimag(o[0]) + cimag(o[1])*I; out[3] = cimag(o[2]) + cimag(o[3])*I; out[4] = creal(o[4]) + creal(o[5])*I; out[5] = creal(o[6]) + creal(o[7])*I; out[6] = cimag(o[4]) + cimag(o[5])*I; out[7] = cimag(o[6]) + cimag(o[7])*I; }else{ int i; for(i=0;i<8;i++) out[i] = o[i]; } }else if(!log2stride){   conjfft(in, out, log2stride+1, stride << 1, N >> 1); conjfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);   conjfft(in-stride, out+3*N/4, log2stride+2, stride << 2, N >> 2); int k; for(k=0;k> 1); conjfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);   conjfft(in-stride, out+3*N/4, log2stride+2, stride << 2, N >> 2); int k; for(k=0;k

## Content actions

PDF | EPUB (?)

### What is an EPUB file?

EPUB is an electronic book format that can be read on a variety of mobile devices.

#### Collection to:

My Favorites (?)

'My Favorites' is a special kind of lens which you can use to bookmark modules and collections. 'My Favorites' can only be seen by you, and collections saved in 'My Favorites' can remember the last module you were on. You need an account to use 'My Favorites'.

| A lens I own (?)

#### Definition of a lens

##### Lenses

A lens is a custom view of the content in the repository. You can think of it as a fancy kind of list that will let you see content through the eyes of organizations and people you trust.

##### What is in a lens?

Lens makers point to materials (modules and collections), creating a guide that includes their own comments and descriptive tags about the content.

##### Who can create a lens?

Any individual member, a community, or a respected organization.

##### What are tags?

Tags are descriptors added by lens makers to help label content, attaching a vocabulary that is meaningful in the context of the lens.

| External bookmarks

#### Module to:

My Favorites (?)

'My Favorites' is a special kind of lens which you can use to bookmark modules and collections. 'My Favorites' can only be seen by you, and collections saved in 'My Favorites' can remember the last module you were on. You need an account to use 'My Favorites'.

| A lens I own (?)

#### Definition of a lens

##### Lenses

A lens is a custom view of the content in the repository. You can think of it as a fancy kind of list that will let you see content through the eyes of organizations and people you trust.

##### What is in a lens?

Lens makers point to materials (modules and collections), creating a guide that includes their own comments and descriptive tags about the content.

##### Who can create a lens?

Any individual member, a community, or a respected organization.

##### What are tags?

Tags are descriptors added by lens makers to help label content, attaching a vocabulary that is meaningful in the context of the lens.

| External bookmarks