Vector Optimized Library of Kernels  2.1
Architecture-tuned implementations of math kernels
volk_neon_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*
24  * Copyright (c) 2016-2019 ARM Limited.
25  *
26  * SPDX-License-Identifier: MIT
27  *
28  * Permission is hereby granted, free of charge, to any person obtaining a copy
29  * of this software and associated documentation files (the "Software"), to
30  * deal in the Software without restriction, including without limitation the
31  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
32  * sell copies of the Software, and to permit persons to whom the Software is
33  * furnished to do so, subject to the following conditions:
34  *
35  * The above copyright notice and this permission notice shall be included in all
36  * copies or substantial portions of the Software.
37  *
38  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
39  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
40  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
43  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
44  * SOFTWARE.
45  *
46  * _vtaylor_polyq_f32
47  * _vlogq_f32
48  *
49  */
50 
51 /*
52  * This file is intended to hold helper routines composed of NEON intrinsics.
53  * They should be used in VOLK kernels to avoid copy-pasting the same intrinsic sequences.
54  */
55 
56 #ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
57 #define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
58 #ifdef LV_HAVE_NEON
59 #include <arm_neon.h>
60 
61 
/* Squared magnitude of four complex values packed in a float32x4x2_t.
 *
 * cmplxValue.val[0] is treated as the I lanes and cmplxValue.val[1] as the
 * Q lanes. Returns, per lane, I*I + Q*Q.
 */
static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
{
    const float32x4_t i_squared = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]);
    const float32x4_t q_squared = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]);

    return vaddq_f32(i_squared, q_squared);
}
72 
/* Per-lane reciprocal square root (1/sqrt(x)) of a float32x4_t.
 *
 * Starts from the vrsqrteq_f32 estimate and applies two refinement passes,
 * each scaling the current estimate by the vrsqrtsq_f32 step factor.
 */
static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
{
    float32x4_t estimate = vrsqrteq_f32(x);

    /* Exactly two refinement passes, same as applying the step twice by hand. */
    for (int pass = 0; pass < 2; ++pass) {
        const float32x4_t step = vrsqrtsq_f32(vmulq_f32(x, estimate), estimate);
        estimate = vmulq_f32(step, estimate);
    }

    return estimate;
}
82 
/* Lane-wise complex product of two float32x4x2_t operands.
 *
 * For both inputs and the result, val[0] carries the real lanes and val[1]
 * the imaginary lanes:
 *   result.val[0] = ar*br - ai*bi
 *   result.val[1] = ar*bi + ai*br
 */
static inline float32x4x2_t
_vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
{
    /* Products that feed the real part. */
    const float32x4_t real_times_real = vmulq_f32(a_val.val[0], b_val.val[0]);
    const float32x4_t imag_times_imag = vmulq_f32(a_val.val[1], b_val.val[1]);

    /* Cross products that feed the imaginary part. */
    const float32x4_t real_times_imag = vmulq_f32(a_val.val[0], b_val.val[1]);
    const float32x4_t imag_times_real = vmulq_f32(a_val.val[1], b_val.val[0]);

    float32x4x2_t product;
    product.val[0] = vsubq_f32(real_times_real, imag_times_imag);
    product.val[1] = vaddq_f32(real_times_imag, imag_times_real);
    return product;
}
106 
/* Degree-7 polynomial evaluated per lane.
 * From ARM Compute Library, MIT license.
 *
 * The coefficient table is not in power order; the value computed is
 *   coeffs[0] + coeffs[4]*x   + coeffs[2]*x^2 + coeffs[6]*x^3
 * + coeffs[1]*x^4 + coeffs[5]*x^5 + coeffs[3]*x^6 + coeffs[7]*x^7
 * built from four linear terms that are combined with x^2 and x^4.
 */
static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
{
    const float32x4_t x_pow2 = vmulq_f32(x, x);
    const float32x4_t x_pow4 = vmulq_f32(x_pow2, x_pow2);

    /* Linear pieces: each is c_lo + c_hi * x. */
    const float32x4_t lin_04 = vmlaq_f32(coeffs[0], coeffs[4], x);
    const float32x4_t lin_26 = vmlaq_f32(coeffs[2], coeffs[6], x);
    const float32x4_t lin_15 = vmlaq_f32(coeffs[1], coeffs[5], x);
    const float32x4_t lin_37 = vmlaq_f32(coeffs[3], coeffs[7], x);

    /* Lower half covers powers 0..3, upper half covers powers 4..7. */
    const float32x4_t lower_half = vmlaq_f32(lin_04, lin_26, x_pow2);
    const float32x4_t upper_half = vmlaq_f32(lin_15, lin_37, x_pow2);

    return vmlaq_f32(lower_half, upper_half, x_pow4);
}
119 
/* Natural logarithm, per lane.
 * From ARM Compute Library, MIT license.
 *
 * Splits each lane into an integer exponent m (bits above bit 22, minus 127)
 * and a value whose bit pattern has m << 23 subtracted from it, evaluates
 * _vtaylor_polyq_f32 on that value with log_tab, and returns
 * poly + m * ln(2).
 *
 * NOTE(review): the shift keeps the sign bit together with the exponent bits
 * and there is no visible special handling for x <= 0 -- confirm that callers
 * only pass positive inputs.
 */
static inline float32x4_t _vlogq_f32(float32x4_t x)
{
    /* Polynomial coefficients, in the order _vtaylor_polyq_f32 expects. */
    const float32x4_t log_tab[8] = {
        vdupq_n_f32(-2.29561495781f),
        vdupq_n_f32(-2.47071170807f),
        vdupq_n_f32(-5.68692588806f),
        vdupq_n_f32(-0.165253549814f),
        vdupq_n_f32(5.17591238022f),
        vdupq_n_f32(0.844007015228f),
        vdupq_n_f32(4.58445882797f),
        vdupq_n_f32(0.0141278216615f),
    };

    const int32x4_t exponent_bias = vdupq_n_s32(127);
    const float32x4_t ln_two = vdupq_n_f32(0.6931471805f); /* ln(2) */

    /* Exponent: drop the low 23 bits of the raw pattern and remove the bias. */
    const uint32x4_t raw_bits = vreinterpretq_u32_f32(x);
    const int32x4_t biased_exponent = vreinterpretq_s32_u32(vshrq_n_u32(raw_bits, 23));
    const int32x4_t m = vsubq_s32(biased_exponent, exponent_bias);

    /* Subtract m << 23 from the bit pattern to obtain the polynomial argument. */
    const int32x4_t rescaled_bits =
        vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23));
    const float32x4_t val = vreinterpretq_f32_s32(rescaled_bits);

    /* Polynomial on the rescaled value, then add back m * ln(2). */
    const float32x4_t poly = _vtaylor_polyq_f32(val, log_tab);

    return vmlaq_f32(poly, vcvtq_f32_s32(m), ln_two);
}
150 
151 #endif /*LV_HAVE_NEON*/
152 #endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */
_vmagnitudesquaredq_f32
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:64
volk_arch_defs.val
val
Definition: volk_arch_defs.py:69
_vtaylor_polyq_f32
static float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
Definition: volk_neon_intrinsics.h:108
_vlogq_f32
static float32x4_t _vlogq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:122
_vinvsqrtq_f32
static float32x4_t _vinvsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:74
_vmultiply_complexq_f32
static float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
Definition: volk_neon_intrinsics.h:85