<< Chapter < Page Chapter >> Page >
  1. elaborate( N l e a f / 2 , 0, 0, 1) ;
  2. elaborate( N l e a f / 2 , - 1 , N l e a f / 2 , 1) ;

The code corresponding to steps 1 and 2 is emitted slightly differently than was the case with the fully hard-coded transforms. Instead of hard coding the input array indices, the indices are themselves loaded from an array that is precomputed when the transform is initialized.

The node list corresponding to the main transform in step 3 is elaborated as in the function in [link] , but with some minor change. First, the recursion terminates with leaf nodes of size N l e a f . Second, because the loops in the body of the sub-transform will be at least 2 × N l e a f iterations, the loop for the body sub-transforms (line 12 of [link] ) is not statically unrolled. Instead only one node is added to the list of nodes, and the loop is computed dynamically.

void sfft_dcf64_hcl16_4_e(offset_t *is,const SFFT_D *in,SFFT_D *out){   SFFT_R r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15;  L_4(in+is[0],in+is[1],in+is[2],in+is[3],&r0,&r1,&r2,&r3);   L_2(in+is[4],in+is[5],in+is[6],in+is[7],&r4,&r5,&r6,&r7);   K_0(&r0,&r2,&r4,&r6);   K_N(VLIT2(0.7071,0.7071),VLIT2(0.7071,-0.7071),&r1,&r3,&r5,&r7);   L_4(in+is[8],in+is[9],in+is[10],in+is[11],&r8,&r9,&r10,&r11);   L_4(in+is[12],in+is[13],in+is[14],in+is[15],&r12,&r13,&r14,&r15);   K_0(&r0,&r4,&r8,&r12);   S_4(r0,r4,r8,r12,out+0,out+8,out+16,out+24);   K_N(VLIT2(0.9239,0.9239),VLIT2(0.3827,-0.3827),&r1,&r5,&r9,&r13);   S_4(r1,r5,r9,r13,out+2,out+10,out+18,out+26);  K_N(VLIT2(0.7071,0.7071),VLIT2(0.7071,-0.7071),&r2,&r6,&r10,&r14);   S_4(r2,r6,r10,r14,out+4,out+12,out+20,out+28);  K_N(VLIT2(0.3827,0.3827),VLIT2(0.9239,-0.9239),&r3,&r7,&r11,&r15);   S_4(r3,r7,r11,r15,out+6,out+14,out+22,out+30);} void sfft_dcf64_hcl16_4_o(offset_t *is,const SFFT_D *in,SFFT_D *out){  SFFT_R r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15;   L_4(in+is[0],in+is[1],in+is[2],in+is[3],&r0,&r1,&r2,&r3);   L_2(in+is[4],in+is[5],in+is[6],in+is[7],&r4,&r5,&r6,&r7);   K_0(&r0,&r2,&r4,&r6);   S_4(r0,r2,r4,r6,out+0,out+4,out+8,out+12);  K_N(VLIT2(0.7071,0.7071),VLIT2(0.7071,-0.7071),&r1,&r3,&r5,&r7);   S_4(r1,r3,r5,r7,out+2,out+6,out+10,out+14);  L_4(in+is[8],in+is[9],in+is[10],in+is[11],&r8,&r9,&r10,&r11);   L_2(in+is[12],in+is[13],in+is[14],in+is[15],&r12,&r13,&r14,&r15);   K_0(&r8,&r10,&r12,&r14);   S_4(r8,r10,r12,r14,out+16,out+20,out+24,out+28);  K_N(VLIT2(0.7071,0.7071),VLIT2(0.7071,-0.7071),&r9,&r11,&r13,&r15);   S_4(r9,r11,r13,r15,out+18,out+22,out+26,out+30);} void sfft_dcf64_hcl16_4_X_4(SFFT_D *data, int N, SFFT_D *LUT){  X_4(data, N, LUT); }void sfft_dcf64_hcl16_4(sfft_plan_t *p, const void *vin, void *vout){   const SFFT_D *in = vin;  SFFT_D *out = vout;   p->is = p->is_base;   sfft_dcf64_hcl16_4_e(p->is,in,out+0);   p->is += 16;  sfft_dcf64_hcl16_4_o(p->is,in,out+32);   sfft_dcf64_hcl16_4_X_4(out+0,32,p->ws[0]);  p->is += 16;  sfft_dcf64_hcl16_4_e(p->is,in,out+64);   p->is += 16;  sfft_dcf64_hcl16_4_e(p->is,in,out+96);   sfft_dcf64_hcl16_4_X_4(out+0,64,p->ws[1]);}
Hard-coded VL-1 size-64 FFT with size-16 leaves

Get Jobilize Job Search Mobile App in your pocket Now!

Get it on Google Play Download on the App Store Now




Source:  OpenStax, Computing the fast fourier transform on simd microprocessors. OpenStax CNX. Jul 15, 2012 Download for free at http://cnx.org/content/col11438/1.2
Google Play and the Google Play logo are trademarks of Google Inc.

Notification Switch

Would you like to follow the 'Computing the fast fourier transform on simd microprocessors' conversation and receive update notifications?

Ask