Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014, 2019 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
79 #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
80 #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
81 
82 #include<volk/volk_complex.h>
83 
84 
85 static inline void
86 calculate_scaled_distances(float* target, const lv_32fc_t symbol, const lv_32fc_t* points,
87  const float scalar, const unsigned int num_points)
88 {
89  lv_32fc_t diff;
90  for(unsigned int i = 0; i < num_points; ++i) {
91  /*
92  * Calculate: |y - x|^2 * SNR_lin
93  * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++);
94  */
95  diff = symbol - *points++;
96  *target++ = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
97  }
98 }
99 
100 
101 #ifdef LV_HAVE_AVX2
102 #include<immintrin.h>
104 
/*
 * AVX2, aligned variant: target[i] = scalar * |src0[0] - points[i]|^2.
 * Processes 8 complex points per main-loop iteration, then peels tails of
 * 4, 2, and finally 1 point. Inputs must be 32-byte aligned.
 */
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, lv_32fc_t* src0,
                                                     lv_32fc_t* points, float scalar,
                                                     unsigned int num_points)
{
    // Each complex float is 8 bytes; tail handling below tests bits of this count.
    const unsigned int num_bytes = num_points*8;
    __m128 xmm9, xmm10;
    __m256 xmm4, xmm6;
    __m256 xmm_points0, xmm_points1, xmm_result;

    // Number of full 8-point (64-byte) chunks.
    const unsigned int bound = num_bytes >> 6;

    // load complex value into all parts of the register.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
    const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
    const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);

    // Set permutation constant: gathers the 4 distinct hadd results into one lane.
    const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);

    for(unsigned int i = 0; i < bound; ++i) {
        xmm_points0 = _mm256_load_ps((float*)points);
        xmm_points1 = _mm256_load_ps((float*)(points + 4));
        points += 8;
        __VOLK_PREFETCH(points);

        // scalar * |symbol - point|^2 for 8 points at once.
        xmm_result = _mm256_scaled_norm_dist_ps_avx2(xmm_symbol, xmm_symbol,
                                                     xmm_points0, xmm_points1,
                                                     xmm_scalar);

        _mm256_store_ps(target, xmm_result);
        target += 8;
    }

    // 4-point tail (bit 2 of num_points set).
    if (num_bytes >> 5 & 1) {
        xmm_points0 = _mm256_load_ps((float*)points);

        xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        // Horizontal add pairs re^2+im^2; permute packs the 4 norms together.
        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);

        xmm9 = _mm256_extractf128_ps(xmm_result, 1);
        _mm_store_ps(target,xmm9);
        target += 4;
    }

    // 2-point tail (bit 1 of num_points set).
    if (num_bytes >> 4 & 1) {
        xmm9 = _mm_load_ps((float*)points);

        xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);

        // High half of the hadd result holds the two scaled norms.
        _mm_storeh_pi((__m64*)target, xmm10);
        target += 2;
    }

    // Final single point, if num_points is odd.
    calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
}
180 
181 #endif /*LV_HAVE_AVX2*/
182 
183 
184 #ifdef LV_HAVE_AVX
185 #include <immintrin.h>
187 
188 static inline void
190  lv_32fc_t *points, float scalar,
191  unsigned int num_points) {
192  const int eightsPoints = num_points / 8;
193  const int remainder = num_points - 8 * eightsPoints;
194 
195  __m256 xmm_points0, xmm_points1, xmm_result;
196 
197  // load complex value into all parts of the register.
198  const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
199 
200  // Load scalar into all 8 parts of the register
201  const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
202 
203  for(int i = 0; i < eightsPoints; ++i){
204  xmm_points0 = _mm256_load_ps((float*)points);
205  xmm_points1 = _mm256_load_ps((float*)(points + 4));
206  points += 8;
207 
208  xmm_result = _mm256_scaled_norm_dist_ps(xmm_symbol, xmm_symbol, xmm_points0,
209  xmm_points1, xmm_scalar);
210 
211  _mm256_store_ps(target, xmm_result);
212  target += 8;
213  }
214 
215  const lv_32fc_t symbol = *src0;
216  calculate_scaled_distances(target, symbol, points, scalar, remainder);
217 }
218 
219 #endif /* LV_HAVE_AVX */
220 
221 
222 #ifdef LV_HAVE_SSE3
223 #include<pmmintrin.h>
225 
226 static inline void
228  lv_32fc_t* points, float scalar,
229  unsigned int num_points)
230 {
231  __m128 xmm_points0, xmm_points1, xmm_result;
232 
233  /*
234  * First do 4 values in every loop iteration.
235  * There may be up to 3 values left.
236  * leftovers0 indicates if at least 2 more are available for SSE execution.
237  * leftovers1 indicates if there is a single element left.
238  */
239  const int quarterPoints = num_points / 4;
240  const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
241  const int leftovers1 = num_points % 2;
242 
243  // load complex value into both parts of the register.
244  const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
245 
246  // Load scalar into all 4 parts of the register
247  const __m128 xmm_scalar = _mm_load1_ps(&scalar);
248 
249  for(int i = 0; i < quarterPoints; ++i) {
250  xmm_points0 = _mm_load_ps((float*)points);
251  xmm_points1 = _mm_load_ps((float*)(points + 2));
252  points += 4;
253  __VOLK_PREFETCH(points);
254  // calculate distances
255  xmm_result = _mm_scaled_norm_dist_ps_sse3(xmm_symbol, xmm_symbol, xmm_points0,
256  xmm_points1, xmm_scalar);
257 
258  _mm_store_ps(target, xmm_result);
259  target += 4;
260  }
261 
262  for(int i = 0; i < leftovers0; ++i) {
263  xmm_points0 = _mm_load_ps((float*)points);
264  points += 2;
265 
266  xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
267  xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
268  xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
269  xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
270 
271  _mm_storeh_pi((__m64*)target, xmm_result);
272  target += 2;
273  }
274 
275  calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
276 }
277 
278 #endif /*LV_HAVE_SSE3*/
279 
280 #ifdef LV_HAVE_SSE
281 #include <xmmintrin.h>
283 static inline void
285  lv_32fc_t* points, float scalar,
286  unsigned int num_points)
287 {
288  const __m128 xmm_scalar = _mm_set1_ps(scalar);
289  const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
290 
291  for (unsigned i = 0; i < num_points / 4; ++i) {
292  __m128 xmm_points0 = _mm_load_ps((float *) points);
293  __m128 xmm_points1 = _mm_load_ps((float *) (points + 2));
294  points += 4;
295  __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(xmm_symbol, xmm_symbol,
296  xmm_points0, xmm_points1,
297  xmm_scalar);
298  _mm_store_ps((float *) target, xmm_result);
299  target += 4;
300  }
301 
302  calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
303 }
304 #endif // LV_HAVE_SSE
305 
306 #ifdef LV_HAVE_GENERIC
307 static inline void
309  lv_32fc_t* points, float scalar,
310  unsigned int num_points)
311 {
312  const lv_32fc_t symbol = *src0;
313  calculate_scaled_distances(target, symbol, points, scalar, num_points);
314 }
315 
316 #endif /*LV_HAVE_GENERIC*/
317 
318 
319 #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/
320 
321 #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
322 #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
323 
324 #include<volk/volk_complex.h>
325 
326 
327 #ifdef LV_HAVE_AVX2
328 #include<immintrin.h>
330 
/*
 * AVX2, unaligned variant: target[i] = scalar * |src0[0] - points[i]|^2.
 * Processes 8 complex points per main-loop iteration, then peels tails of
 * 4, 2, and finally 1 point. No alignment requirements (loadu/storeu).
 */
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, lv_32fc_t* src0,
                                                     lv_32fc_t* points, float scalar,
                                                     unsigned int num_points)
{
    // Each complex float is 8 bytes; tail handling below tests bits of this count.
    const unsigned int num_bytes = num_points*8;
    __m128 xmm9, xmm10;
    __m256 xmm4, xmm6;
    __m256 xmm_points0, xmm_points1, xmm_result;

    // Number of full 8-point (64-byte) chunks.
    const unsigned int bound = num_bytes >> 6;

    // load complex value into all parts of the register.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
    const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
    const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);

    // Set permutation constant: gathers the 4 distinct hadd results into one lane.
    const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);

    for(unsigned int i = 0; i < bound; ++i) {
        xmm_points0 = _mm256_loadu_ps((float*)points);
        xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
        points += 8;
        __VOLK_PREFETCH(points);

        // scalar * |symbol - point|^2 for 8 points at once.
        xmm_result = _mm256_scaled_norm_dist_ps_avx2(xmm_symbol, xmm_symbol,
                                                     xmm_points0, xmm_points1,
                                                     xmm_scalar);

        _mm256_storeu_ps(target, xmm_result);
        target += 8;
    }

    // 4-point tail (bit 2 of num_points set).
    if (num_bytes >> 5 & 1) {
        xmm_points0 = _mm256_loadu_ps((float*)points);

        xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        // Horizontal add pairs re^2+im^2; permute packs the 4 norms together.
        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);

        xmm9 = _mm256_extractf128_ps(xmm_result, 1);
        _mm_storeu_ps(target,xmm9);
        target += 4;
    }

    // 2-point tail (bit 1 of num_points set).
    if (num_bytes >> 4 & 1) {
        xmm9 = _mm_loadu_ps((float*)points);

        xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);

        // High half of the hadd result holds the two scaled norms.
        _mm_storeh_pi((__m64*)target, xmm10);
        target += 2;
    }

    // Final single point, if num_points is odd.
    calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
}
406 
407 #endif /*LV_HAVE_AVX2*/
408 
409 
410 #ifdef LV_HAVE_AVX
411 #include <immintrin.h>
413 
414 static inline void
416  lv_32fc_t *points, float scalar,
417  unsigned int num_points) {
418  const int eightsPoints = num_points / 8;
419  const int remainder = num_points - 8 * eightsPoints;
420 
421  __m256 xmm_points0, xmm_points1, xmm_result;
422 
423  // load complex value into all parts of the register.
424  const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
425 
426  // Load scalar into all 8 parts of the register
427  const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
428 
429  for(int i = 0; i < eightsPoints; ++i){
430  xmm_points0 = _mm256_loadu_ps((float*)points);
431  xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
432  points += 8;
433 
434  xmm_result = _mm256_scaled_norm_dist_ps(xmm_symbol, xmm_symbol, xmm_points0,
435  xmm_points1, xmm_scalar);
436 
437  _mm256_storeu_ps(target, xmm_result);
438  target += 8;
439  }
440 
441  const lv_32fc_t symbol = *src0;
442  calculate_scaled_distances(target, symbol, points, scalar, remainder);
443 }
444 
445 #endif /* LV_HAVE_AVX */
446 
447 
448 #ifdef LV_HAVE_SSE3
449 #include<pmmintrin.h>
451 
452 static inline void
454  lv_32fc_t* points, float scalar,
455  unsigned int num_points)
456 {
457  __m128 xmm_points0, xmm_points1, xmm_result;
458 
459  /*
460  * First do 4 values in every loop iteration.
461  * There may be up to 3 values left.
462  * leftovers0 indicates if at least 2 more are available for SSE execution.
463  * leftovers1 indicates if there is a single element left.
464  */
465  const int quarterPoints = num_points / 4;
466  const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
467  const int leftovers1 = num_points % 2;
468 
469  // load complex value into both parts of the register.
470  const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
471 
472  // Load scalar into all 4 parts of the register
473  const __m128 xmm_scalar = _mm_load1_ps(&scalar);
474 
475  for(int i = 0; i < quarterPoints; ++i) {
476  xmm_points0 = _mm_loadu_ps((float*)points);
477  xmm_points1 = _mm_loadu_ps((float*)(points + 2));
478  points += 4;
479  __VOLK_PREFETCH(points);
480  // calculate distances
481  xmm_result = _mm_scaled_norm_dist_ps_sse3(xmm_symbol, xmm_symbol, xmm_points0,
482  xmm_points1, xmm_scalar);
483 
484  _mm_storeu_ps(target, xmm_result);
485  target += 4;
486  }
487 
488  for(int i = 0; i < leftovers0; ++i) {
489  xmm_points0 = _mm_loadu_ps((float*)points);
490  points += 2;
491 
492  xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
493  xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
494  xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
495  xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
496 
497  _mm_storeh_pi((__m64*)target, xmm_result);
498  target += 2;
499  }
500 
501  calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
502 }
503 
504 #endif /*LV_HAVE_SSE3*/
505 
506 #ifdef LV_HAVE_SSE
507 #include <xmmintrin.h>
509 static inline void
511  lv_32fc_t* points, float scalar,
512  unsigned int num_points)
513 {
514  const __m128 xmm_scalar = _mm_set1_ps(scalar);
515  const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
516 
517  for (unsigned i = 0; i < num_points / 4; ++i) {
518  __m128 xmm_points0 = _mm_loadu_ps((float *) points);
519  __m128 xmm_points1 = _mm_loadu_ps((float *) (points + 2));
520  points += 4;
521  __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(xmm_symbol, xmm_symbol,
522  xmm_points0, xmm_points1,
523  xmm_scalar);
524  _mm_storeu_ps((float *) target, xmm_result);
525  target += 4;
526  }
527 
528  calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
529 }
530 #endif // LV_HAVE_SSE
531 
532 #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H*/
lv_cimag
#define lv_cimag(x)
Definition: volk_complex.h:85
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:227
calculate_scaled_distances
static void calculate_scaled_distances(float *target, const lv_32fc_t symbol, const lv_32fc_t *points, const float scalar, const unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:86
volk_sse3_intrinsics.h
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:453
_mm256_scaled_norm_dist_ps
static __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx_intrinsics.h:82
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:510
_mm256_scaled_norm_dist_ps_avx2
static __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx2_intrinsics.h:74
_mm_scaled_norm_dist_ps_sse
static __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition: volk_sse_intrinsics.h:50
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:284
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:52
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:189
i
for i
Definition: volk_config_fixed.tmpl.h:25
volk_sse_intrinsics.h
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:415
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:308
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:61
volk_complex.h
volk_avx_intrinsics.h
_mm_scaled_norm_dist_ps_sse3
static __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition: volk_sse3_intrinsics.h:65
volk_avx2_intrinsics.h
lv_creal
#define lv_creal(x)
Definition: volk_complex.h:83