Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_avx_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*
24  * This file is intended to hold composite AVX intrinsics — small helpers
25  * They should be used in VOLK kernels to avoid copy-pasta.
26  */
27 
28 #ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
29 #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
30 #include <immintrin.h>
31 
static inline __m256
_mm256_complexmul_ps(__m256 x, __m256 y)
{
  /* Multiply four interleaved (re, im) complex float pairs: x * y. */
  const __m256 y_re = _mm256_moveldup_ps(y);  // duplicate real parts: cr,cr,dr,dr ...
  const __m256 y_im = _mm256_movehdup_ps(y);  // duplicate imag parts: ci,ci,di,di ...
  const __m256 prod_re = _mm256_mul_ps(x, y_re);         // ar*cr, ai*cr, br*dr, bi*dr ...
  const __m256 x_swap = _mm256_shuffle_ps(x, x, 0xB1);   // swap re/im: ai,ar,bi,br ...
  const __m256 prod_im = _mm256_mul_ps(x_swap, y_im);    // ai*ci, ar*ci, bi*di, br*di ...
  /* addsub: even lanes subtract, odd lanes add ->
   * ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di */
  return _mm256_addsub_ps(prod_re, prod_im);
}
43 
static inline __m256
_mm256_conjugate_ps(__m256 x){
  /* Conjugate four interleaved complex floats by XOR-ing the sign bit
   * of every odd-indexed (imaginary) lane.  -0.f contributes only the
   * sign bit, so real lanes are untouched. */
  const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
  return _mm256_xor_ps(x, conjugator); // conjugate y
}
49 
50 static inline __m256
51 _mm256_complexconjugatemul_ps(__m256 x, __m256 y){
52  y = _mm256_conjugate_ps(y);
53  return _mm256_complexmul_ps(x, y);
54 }
55 
static inline __m256
_mm256_normalize_ps(__m256 val)
{
  /* Scale four interleaved complex floats to unit magnitude: val / |val|. */
  __m256 tmp1 = _mm256_mul_ps(val, val);  // re^2, im^2, ...
  tmp1 = _mm256_hadd_ps(tmp1, tmp1);      // pairwise sums re^2+im^2 (duplicated)
  /* hadd packs the sums out of order; 0xD8 restores per-complex lane order
   * so each magnitude lines up with its (re, im) pair. */
  tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8
  tmp1 = _mm256_sqrt_ps(tmp1);            // |val| per lane
  return _mm256_div_ps(val, tmp1);
}
65 
static inline __m256
_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2){
  /* |z|^2 = re^2 + im^2 for eight interleaved complex floats spread
   * across two registers. */
  const __m256 sq1 = _mm256_mul_ps(cplxValue1, cplxValue1); // square each lane
  const __m256 sq2 = _mm256_mul_ps(cplxValue2, cplxValue2);
  /* Regroup the 128-bit halves so the following hadd emits results
   * in the same order as the inputs. */
  const __m256 lo_halves = _mm256_permute2f128_ps(sq1, sq2, 0x20);
  const __m256 hi_halves = _mm256_permute2f128_ps(sq1, sq2, 0x31);
  return _mm256_hadd_ps(lo_halves, hi_halves); // re^2 + im^2 per complex value
}
75 
76 static inline __m256
77 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2){
78  return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
79 }
80 
81 static inline __m256
82 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar){
83  /*
84  * Calculate: |y - x|^2 * SNR_lin
85  * Consider 'symbolsX' and 'pointsX' to be complex float
86  * 'symbolsX' are 'y' and 'pointsX' are 'x'
87  */
88  const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
89  const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
90  const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
91  return _mm256_mul_ps(norms, scalar);
92 }
93 
static inline __m256
_mm256_polar_sign_mask(__m128i fbits){
  /* Expand 8 bytes of frozen-bit flags into 8 float sign masks:
   * every nonzero byte k yields 0x80000000 in 32-bit lane k, zero otherwise. */
  const __m128i zeros = _mm_set1_epi8(0x00);
  const __m128i sign_extract = _mm_set1_epi8(0x80);
  /* pshufb control: route byte k into the top byte of 32-bit lane k;
   * 0xff entries clear the destination byte. */
  const __m128i shuffle_mask0 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03);
  const __m128i shuffle_mask1 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07);

  fbits = _mm_cmpgt_epi8(fbits, zeros);       // 0xff where byte > 0, else 0x00
  fbits = _mm_and_si128(fbits, sign_extract); // keep only the sign bit
  const __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
  const __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);

  /* _mm256_set_m128(hi, lo) would do this in one call, but it is missing
   * from some GCC versions (see Intel Intrinsics Guide), so assemble the
   * 256-bit result with two insertf128 steps instead. */
  const __m256 low_half = _mm256_insertf128_ps(_mm256_setzero_ps(), _mm_castsi128_ps(sign_bits0), 0x0);
  return _mm256_insertf128_ps(low_half, _mm_castsi128_ps(sign_bits1), 0x1);
}
113 
static inline void
_mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1){
  /* Split two registers of interleaved LLR pairs into an even-index
   * stream (*llr0) and an odd-index stream (*llr1). */
  const __m256 lo_halves = _mm256_permute2f128_ps(src0, src1, 0x20);
  const __m256 hi_halves = _mm256_permute2f128_ps(src0, src1, 0x31);
  *llr0 = _mm256_shuffle_ps(lo_halves, hi_halves, 0x88); // even elements
  *llr1 = _mm256_shuffle_ps(lo_halves, hi_halves, 0xdd); // odd elements
}
122 
static inline __m256
_mm256_polar_minsum_llrs(__m256 src0, __m256 src1){
  /* Min-sum LLR combination: sign(a) * sign(b) * min(|a|, |b|)
   * on deinterleaved pairs taken from src0/src1. */
  const __m256 sign_mask = _mm256_set1_ps(-0.0f);                // only the sign bit set
  const __m256 abs_mask =
      _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff))); // all bits but sign

  __m256 llr0, llr1;
  _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

  const __m256 combined_sign =
      _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
  const __m256 min_magnitude =
      _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
  return _mm256_or_ps(min_magnitude, combined_sign);
}
136 
137 static inline __m256
138 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits){
139  // prepare sign mask for correct +-
140  __m256 sign_mask = _mm256_polar_sign_mask(fbits);
141 
142  __m256 llr0, llr1;
143  _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
144 
145  // calculate result
146  llr0 = _mm256_xor_ps(llr0, sign_mask);
147  __m256 dst = _mm256_add_ps(llr0, llr1);
148  return dst;
149 }
150 
151 #endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */
_mm256_complexconjugatemul_ps
static __m256 _mm256_complexconjugatemul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:51
_mm256_complexmul_ps
static __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:33
_mm256_conjugate_ps
static __m256 _mm256_conjugate_ps(__m256 x)
Definition: volk_avx_intrinsics.h:45
_mm256_scaled_norm_dist_ps
static __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx_intrinsics.h:82
volk_arch_defs.val
val
Definition: volk_arch_defs.py:66
_mm256_normalize_ps
static __m256 _mm256_normalize_ps(__m256 val)
Definition: volk_avx_intrinsics.h:57
_mm256_polar_deinterleave
static void _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:115
_mm256_magnitudesquared_ps
static __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition: volk_avx_intrinsics.h:67
_mm256_magnitude_ps
static __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition: volk_avx_intrinsics.h:77
_mm256_polar_sign_mask
static __m256 _mm256_polar_sign_mask(__m128i fbits)
Definition: volk_avx_intrinsics.h:95
_mm256_polar_fsign_add_llrs
static __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
Definition: volk_avx_intrinsics.h:138
_mm256_polar_minsum_llrs
static __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:124