<< Chapter < Page Chapter >> Page >

This Appendix contains source code listings corresponding to the vectorized FFT implementations in Implementation details .

#include <math.h>#include <complex.h>#include <stdio.h>#include <stdlib.h>#include <xmmintrin.h>  typedef complex float data_t;  #define W(N,k) (cexp(-2.0f * M_PI * I * (float)(k) / (float)(N)))  data_t **LUT;  void ditfft2(data_t *in, data_t *out, int log2stride, int stride, int N) {if(N == 2) { out[0]   = in[0] + in[stride]; out[N/2] = in[0] - in[stride]; }else if(N == 4){ditfft2(in, out, log2stride+1, stride << 1, N >> 1); ditfft2(in+stride, out+N/2, log2stride+1, stride << 1, N >> 1);    data_t temp0 = out[0] + out[2];     data_t temp1 = out[0] - out[2];    data_t temp2 = out[1] - I*out[3];     data_t temp3 = out[1] + I*out[3];    if(log2stride) { out[0] = creal(temp0) + creal(temp2)*I; out[1] = creal(temp1) + creal(temp3)*I; out[2] = cimag(temp0) + cimag(temp2)*I; out[3] = cimag(temp1) + cimag(temp3)*I; }else{out[0] = temp0;out[2] = temp1;out[1] = temp2;out[3] = temp3;} }else if(!log2stride){ditfft2(in, out, log2stride+1, stride << 1, N >> 1); ditfft2(in+stride, out+N/2, log2stride+1, stride << 1, N >> 1);  int k; for(k=0;k<N/2;k+=4) { __m128 Ok_re = _mm_load_ps((float *)&out[k+N/2]);__m128 Ok_im = _mm_load_ps((float *)&out[k+N/2+2]);__m128 w_re = _mm_load_ps((float *)&LUT[log2stride][k]); __m128 w_im = _mm_load_ps((float *)&LUT[log2stride][k+2]); __m128 Ek_re = _mm_load_ps((float *)&out[k]);__m128 Ek_im = _mm_load_ps((float *)&out[k+2]);__m128 wOk_re = _mm_sub_ps(_mm_mul_ps(Ok_re,w_re),_mm_mul_ps(Ok_im,w_im)); __m128 wOk_im = _mm_add_ps(_mm_mul_ps(Ok_re,w_im),_mm_mul_ps(Ok_im,w_re));__m128 out0_re = _mm_add_ps(Ek_re, wOk_re); __m128 out0_im = _mm_add_ps(Ek_im, wOk_im);__m128 out1_re = _mm_sub_ps(Ek_re, wOk_re); __m128 out1_im = _mm_sub_ps(Ek_im, wOk_im);_mm_store_ps((float *)(out+k), _mm_unpacklo_ps(out0_re, out0_im)); _mm_store_ps((float *)(out+k+2), _mm_unpackhi_ps(out0_re, out0_im));_mm_store_ps((float *)(out+k+N/2), _mm_unpacklo_ps(out1_re, out1_im)); _mm_store_ps((float *)(out+k+N/2+2), _mm_unpackhi_ps(out1_re, out1_im));} }else{ditfft2(in, out, log2stride+1, stride << 1, N >> 1); ditfft2(in+stride, out+N/2, log2stride+1, stride << 1, N >> 1);int k; for(k=0;k<N/2;k+=4) { __m128 Ok_re = _mm_load_ps((float *)&out[k+N/2]);__m128 Ok_im = _mm_load_ps((float *)&out[k+N/2+2]);__m128 w_re = _mm_load_ps((float *)&LUT[log2stride][k]); __m128 w_im = _mm_load_ps((float *)&LUT[log2stride][k+2]); __m128 Ek_re = _mm_load_ps((float *)&out[k]);__m128 Ek_im = _mm_load_ps((float *)&out[k+2]);__m128 wOk_re = _mm_sub_ps(_mm_mul_ps(Ok_re,w_re),_mm_mul_ps(Ok_im,w_im)); __m128 wOk_im = _mm_add_ps(_mm_mul_ps(Ok_re,w_im),_mm_mul_ps(Ok_im,w_re));_mm_store_ps((float *)(out+k), _mm_add_ps(Ek_re, wOk_re)); _mm_store_ps((float *)(out+k+2), _mm_add_ps(Ek_im, wOk_im));_mm_store_ps((float *)(out+k+N/2), _mm_sub_ps(Ek_re, wOk_re)); _mm_store_ps((float *)(out+k+N/2+2), _mm_sub_ps(Ek_im, wOk_im));} }}  void fft_init(int N) { int i;#define log2(x) ((int)(log(x)/log(2)))int n_luts = log2(N)-2;LUT = malloc(n_luts * sizeof(data_t *)); for(i=0;i<n_luts;i++) { int n = N / pow(2,i);LUT[i] = _mm_malloc(n/2 * sizeof(data_t), 16);int j; for(j=0;j<n/2;j+=4) { data_t w[4]; int k;for(k=0;k<4;k++) w[k] = W(n,j+k);  LUT[i][j]   = creal(w[0]) + creal(w[1])*I;LUT[i][j+1] = creal(w[2]) + creal(w[3])*I; LUT[i][j+2] = cimag(w[0]) + cimag(w[1])*I;LUT[i][j+3] = cimag(w[2]) + cimag(w[3])*I; }}}
Radix-2 FFT with vectorized loops

Get Jobilize Job Search Mobile App in your pocket Now!

Get it on Google Play Download on the App Store Now




Source:  OpenStax, Computing the fast fourier transform on simd microprocessors. OpenStax CNX. Jul 15, 2012 Download for free at http://cnx.org/content/col11438/1.2
Google Play and the Google Play logo are trademarks of Google Inc.

Notification Switch

Would you like to follow the 'Computing the fast fourier transform on simd microprocessors' conversation and receive update notifications?

Ask