Vector Optimized Library of Kernels 2.3
Architecture-tuned implementations of math kernels
volk_32fc_s32fc_x2_rotator_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

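/*
 * Overview (editor's sketch; the original doc comment at this point was
 * lost in extraction): this kernel multiplies each input sample by an
 * incrementally rotating complex exponential, i.e. it mixes the signal
 * with a complex sinusoid of fixed frequency. `phase` carries the rotator
 * state across calls, `phase_inc` is the per-sample rotation, and both are
 * expected to lie on the unit circle.
 *
 * A minimal usage sketch, assuming the standard VOLK dispatcher and
 * allocation helpers (the buffer size and the 0.1 rad/sample increment are
 * illustrative only):
 *
 *   #include <math.h>
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 1024;
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment());
 *   lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment());
 *   lv_32fc_t phase = lv_cmake(1.f, 0.f);
 *   const lv_32fc_t phase_inc = lv_cmake(cosf(0.1f), sinf(0.1f));
 *   // ... fill `in` with samples ...
 *   volk_32fc_s32fc_x2_rotator_32fc(out, in, phase_inc, &phase, N);
 *   volk_free(in);
 *   volk_free(out);
 */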
#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H


#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <volk/volk_complex.h>
#define ROTATOR_RELOAD 512
#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)

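/*
 * Why ROTATOR_RELOAD exists (editor's note): the kernels advance the
 * rotator by repeated complex multiplication,
 *
 *     out[n] = in[n] * phase * phase_inc^n,   |phase| = |phase_inc| = 1,
 *
 * and every multiply rounds, so |phase| slowly drifts away from 1. All
 * implementations therefore renormalize, phase /= |phase|, once every
 * ROTATOR_RELOAD samples; the _2 and _4 variants are the equivalent inner
 * trip counts when processing 2 or 4 samples per iteration.
 */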

#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
                                                           const lv_32fc_t* inVector,
                                                           const lv_32fc_t phase_inc,
                                                           lv_32fc_t* phase,
                                                           unsigned int num_points)
{
    unsigned int i = 0;
    int j = 0;
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {
            *outVector++ = *inVector++ * (*phase);
            (*phase) *= phase_inc;
        }

        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
    for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
        *outVector++ = *inVector++ * (*phase);
        (*phase) *= phase_inc;
    }
    if (i) {
        // Make sure we normalize the phase on every call
        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
}

#endif /* LV_HAVE_GENERIC */
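/*
 * How the SIMD variants below parallelize this (editor's note): each keeps
 * W copies of the phase, pre-rotated to { phase, phase*inc, ...,
 * phase*inc^(W-1) }, and advances the whole vector by inc^W per iteration,
 * producing W consecutive outputs per vector complex multiply. A scalar
 * sketch of the same idea for W = 2 (illustrative, not part of the
 * original header; the final odd sample is handled separately):
 *
 *   lv_32fc_t p0 = *phase, p1 = *phase * phase_inc;
 *   const lv_32fc_t inc2 = phase_inc * phase_inc;
 *   for (i = 0; i + 1 < num_points; i += 2) {
 *       outVector[i] = inVector[i] * p0;
 *       outVector[i + 1] = inVector[i + 1] * p1;
 *       p0 *= inc2;
 *       p1 *= inc2;
 *   }
 */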


#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector,
                                                        const lv_32fc_t* inVector,
                                                        const lv_32fc_t phase_inc,
                                                        lv_32fc_t* phase,
                                                        unsigned int num_points)
{
    lv_32fc_t* outputVectorPtr = outVector;
    const lv_32fc_t* inputVectorPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
    float32x4x2_t input_vec;
    float32x4x2_t output_vec;

    unsigned int i = 0, j = 0;

    // Pre-rotate the four phase copies to phase * phase_inc^k, k = 0..3
    for (i = 0; i < 4; ++i) {
        phasePtr[i] *= incr;
        incr *= (phase_inc);
    }

    // Notice that incr has been advanced to phase_inc^4 in the previous loop
    const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
    const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
    float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; j++) {
            input_vec = vld2q_f32((float*)inputVectorPtr);
            // Prefetch the next input, speeds things up
            __VOLK_PREFETCH(inputVectorPtr + 4);
            // Rotate
            output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
            // Increase phase
            phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
            // Store output
            vst2q_f32((float*)outputVectorPtr, output_vec);

            outputVectorPtr += 4;
            inputVectorPtr += 4;
        }
        // Normalize phase so its magnitude doesn't grow because of
        // floating point rounding error
        const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
        const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
        // Multiply complex with real
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
    }

    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) {
        input_vec = vld2q_f32((float*)inputVectorPtr);
        // Prefetch the next input, speeds things up
        __VOLK_PREFETCH(inputVectorPtr + 4);
        // Rotate
        output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
        // Increase phase
        phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
        // Store output
        vst2q_f32((float*)outputVectorPtr, output_vec);

        outputVectorPtr += 4;
        inputVectorPtr += 4;
    }
    // i != 0 means the loop above ran at least once
    if (i) {
        // Normalize phase so its magnitude doesn't grow because of
        // floating point rounding error
        const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
        const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
        // Multiply complex with real
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
    }
    // Store current phase
    vst2q_f32((float*)phasePtr, phase_vec);

    // Deal with the rest
    for (i = 0; i < num_points % 4; i++) {
        *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
        phasePtr[0] *= (phase_inc);
    }

    // Write the phase back so the next call continues with a continuous phase
    (*phase) = phasePtr[0];
}

#endif /* LV_HAVE_NEON */
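/*
 * Editor's note on the NEON variant: vld2q_f32 de-interleaves the samples,
 * so phase_vec.val[0] holds four real parts and phase_vec.val[1] four
 * imaginary parts, which is the layout the _v*_f32 helpers from
 * volk_neon_intrinsics.h operate on. Normalization multiplies by a
 * reciprocal square root (_vinvsqrtq_f32) instead of the sqrt-and-divide
 * used by the x86 kernels below.
 */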


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    // Pre-rotate the two phase copies to phase and phase * phase_inc;
    // afterwards incr holds phase_inc^2
    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_2; ++j) {

            aVal = _mm_load_ps((float*)aPtr);

            // Duplicate the real and imaginary parts of phase and inc
            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            // out = in * phase and phase *= inc, both as complex multiplies
            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_store_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        // Renormalize: divide each phase by its magnitude
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
        aVal = _mm_load_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);

        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_store_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    if (i) {
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    // Handle the odd final sample, if any
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 for aligned */
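/*
 * The SSE complex multiply above in scalar terms (editor's note): with the
 * input x = a + bi stored as (a, b) and the phase y = c + di as (c, d),
 * moveldup duplicates the real part to (c, c), giving tmp1 = (ac, bc); the
 * 0xB1 shuffle swaps adjacent floats, turning x into (b, a); movehdup
 * duplicates the imaginary part to (d, d), giving tmp2 = (bd, ad); and
 * addsub (subtract in even lanes, add in odd) yields
 * (ac - bd, bc + ad) = x * y. The hadd/shuffle/sqrt/div tail computes
 * |phase| for each lane pair and divides it out.
 */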


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    // Same as the aligned kernel above, but with unaligned loads and stores
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_2; ++j) {

            aVal = _mm_loadu_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_storeu_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
        aVal = _mm_loadu_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);

        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_storeu_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    if (i) {
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 */


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0, 0.0);
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    // Pre-rotate the four phase copies to phase * phase_inc^k, k = 0..3;
    // afterwards incr holds phase_inc^4
    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_load_ps((float*)aPtr);

            // Rotate the input and advance the phase vector
            z = _mm256_complexmul_ps(aVal, phase_Val);
            phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
    // Let the generic kernel handle the remaining num_points % 4 samples
    volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
}

#endif /* LV_HAVE_AVX for aligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)
{
    // Same as the aligned AVX kernel above, but with unaligned loads and stores
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0, 0.0);
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_loadu_ps((float*)aPtr);

            z = _mm256_complexmul_ps(aVal, phase_Val);
            phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
    volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
}

#endif /* LV_HAVE_AVX */
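/*
 * Editor's note: the two AVX kernels above lean on helpers from
 * volk_avx_intrinsics.h (_mm256_complexmul_ps for the rotation and the
 * phase update, _mm256_normalize_ps for the periodic renormalization) and
 * hand the final num_points % 4 samples to the generic kernel, which also
 * renormalizes the phase whenever it processes any samples.
 */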

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    // 32-byte aligned so the _mm256_load_ps/_mm256_store_ps on phase_Ptr are valid
    __VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_load_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_load_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = aVal;
            tmp1p = phase_Val;

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            // Fused multiply-addsub: tmp1 * yl minus/plus tmp2 in even/odd lanes
            z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
            phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    _mm256_store_ps((float*)phase_Ptr, phase_Val);
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points)
{
    // Same as the aligned FMA kernel above, but with unaligned loads and stores
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_loadu_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = aVal;
            tmp1p = phase_Val;

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
            phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
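/*
 * Editor's note: the FMA kernels fuse the first multiply into the addsub
 * via _mm256_fmaddsub_ps, saving an instruction per complex multiply and
 * rounding once instead of twice, so their results can differ from the
 * plain AVX kernels in the last bit.
 */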

#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */