conv_acc_neon_impl.h

/*
 * (C) 2020 by sysmocom - s.f.m.c. GmbH
 * Author: Eric Wild
 *
 * All Rights Reserved
 *
 * SPDX-License-Identifier: GPL-2.0+
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <stdint.h>
#include <arm_neon.h>

/* Some distributions (notably Alpine Linux) for some strange reason
 * don't have this #define */
#ifndef __always_inline
#define __always_inline inline __attribute__((always_inline))
#endif

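/* Add-compare-select butterfly: M0/M1 hold the accumulated path metrics
 * of a state pair, M2 the corresponding branch metrics. The saturating
 * adds/subs form the four candidate metrics, vmaxq selects the survivors
 * (returned in M2 and M4), and the vcgtq comparison masks (returned in
 * M3 and M1) record which candidate won, for use as path decisions
 * during traceback. */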
#define NEON_BUTTERFLY(M0,M1,M2,M3,M4) \
{ \
	M3 = vqaddq_s16(M0, M2); \
	M4 = vqsubq_s16(M1, M2); \
	M0 = vqsubq_s16(M0, M2); \
	M1 = vqaddq_s16(M1, M2); \
	M2 = vmaxq_s16(M3, M4); \
	M3 = vreinterpretq_s16_u16(vcgtq_s16(M3, M4)); \
	M4 = vmaxq_s16(M0, M1); \
	M1 = vreinterpretq_s16_u16(vcgtq_s16(M0, M1)); \
}

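/* Split two registers of interleaved path metrics into even-indexed
 * (M2) and odd-indexed (M3) state metrics using an unzip. */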
#define NEON_DEINTERLEAVE_K5(M0,M1,M2,M3) \
{ \
	int16x8x2_t tmp; \
	tmp = vuzpq_s16(M0, M1); \
	M2 = tmp.val[0]; \
	M3 = tmp.val[1]; \
}

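/* Same unzip as NEON_DEINTERLEAVE_K5, applied to all eight metric
 * registers of a 64-state (K=7) trellis: M0-M7 in, even-indexed metrics
 * out in M8/M10/M12/M14, odd-indexed metrics in M9/M11/M13/M15. */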
#define NEON_DEINTERLEAVE_K7(M0,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11,M12,M13,M14,M15) \
{ \
	int16x8x2_t tmp; \
	tmp = vuzpq_s16(M0, M1); \
	M8 = tmp.val[0]; M9 = tmp.val[1]; \
	tmp = vuzpq_s16(M2, M3); \
	M10 = tmp.val[0]; M11 = tmp.val[1]; \
	tmp = vuzpq_s16(M4, M5); \
	M12 = tmp.val[0]; M13 = tmp.val[1]; \
	tmp = vuzpq_s16(M6, M7); \
	M14 = tmp.val[0]; M15 = tmp.val[1]; \
}

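/* Branch metrics for a code with two output symbols per branch (N=2):
 * multiply the expected outputs in M0-M3 by the received symbols
 * duplicated across M4, then pairwise-add adjacent products so that
 * each lane of M6/M7 holds one two-symbol correlation. */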
#define NEON_BRANCH_METRIC_N2(M0,M1,M2,M3,M4,M6,M7) \
{ \
	M0 = vmulq_s16(M4, M0); \
	M1 = vmulq_s16(M4, M1); \
	M2 = vmulq_s16(M4, M2); \
	M3 = vmulq_s16(M4, M3); \
	M6 = vcombine_s16(vpadd_s16(vget_low_s16(M0), vget_high_s16(M0)), vpadd_s16(vget_low_s16(M1), vget_high_s16(M1))); \
	M7 = vcombine_s16(vpadd_s16(vget_low_s16(M2), vget_high_s16(M2)), vpadd_s16(vget_low_s16(M3), vget_high_s16(M3))); \
}

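/* Branch metrics for N=4: as NEON_BRANCH_METRIC_N2, but with a second
 * pairwise-add level so that each lane sums a group of four products,
 * packing all eight four-symbol correlations into the single result
 * register M5. */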
#define NEON_BRANCH_METRIC_N4(M0,M1,M2,M3,M4,M5) \
{ \
	M0 = vmulq_s16(M4, M0); \
	M1 = vmulq_s16(M4, M1); \
	M2 = vmulq_s16(M4, M2); \
	M3 = vmulq_s16(M4, M3); \
	int16x4_t t1 = vpadd_s16(vpadd_s16(vget_low_s16(M0), vget_high_s16(M0)), vpadd_s16(vget_low_s16(M1), vget_high_s16(M1))); \
	int16x4_t t2 = vpadd_s16(vpadd_s16(vget_low_s16(M2), vget_high_s16(M2)), vpadd_s16(vget_low_s16(M3), vget_high_s16(M3))); \
	M5 = vcombine_s16(t1, t2); \
}

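/* Re-center the 16 path metrics of a K=5 trellis: fold a pairwise
 * minimum down to the global minimum, broadcast it, and subtract it
 * from every metric so the saturating butterfly arithmetic does not
 * clip. M2 is a scratch register; M3 is unused. */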
#define NEON_NORMALIZE_K5(M0,M1,M2,M3) \
{ \
	M2 = vminq_s16(M0, M1); \
	int16x4_t t = vpmin_s16(vget_low_s16(M2), vget_high_s16(M2)); \
	t = vpmin_s16(t, t); \
	t = vpmin_s16(t, t); \
	M2 = vdupq_lane_s16(t, 0); \
	M0 = vqsubq_s16(M0, M2); \
	M1 = vqsubq_s16(M1, M2); \
}

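/* K=7 variant of NEON_NORMALIZE_K5: reduce all eight metric registers
 * (64 states) to their global minimum via a min tree, then subtract it
 * from every metric. M8-M11 are scratch registers. */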
#define NEON_NORMALIZE_K7(M0,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11) \
{ \
	M8 = vminq_s16(M0, M1); \
	M9 = vminq_s16(M2, M3); \
	M10 = vminq_s16(M4, M5); \
	M11 = vminq_s16(M6, M7); \
	M8 = vminq_s16(M8, M9); \
	M10 = vminq_s16(M10, M11); \
	M8 = vminq_s16(M8, M10); \
	int16x4_t t = vpmin_s16(vget_low_s16(M8), vget_high_s16(M8)); \
	t = vpmin_s16(t, t); \
	t = vpmin_s16(t, t); \
	M8 = vdupq_lane_s16(t, 0); \
	M0 = vqsubq_s16(M0, M8); \
	M1 = vqsubq_s16(M1, M8); \
	M2 = vqsubq_s16(M2, M8); \
	M3 = vqsubq_s16(M3, M8); \
	M4 = vqsubq_s16(M4, M8); \
	M5 = vqsubq_s16(M5, M8); \
	M6 = vqsubq_s16(M6, M8); \
	M7 = vqsubq_s16(M7, M8); \
}

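/* Single forward ACS step for a 16-state (K=5), N=2 trellis: compute
 * branch metrics from the received symbols in val and the expected
 * branch outputs in outa, update the 16 accumulated path metrics in
 * sumsa in place, and store the 16 path decisions for this step to
 * paths. If norm is nonzero, the metrics are re-centered afterwards to
 * avoid saturation. */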
__always_inline void _neon_metrics_k5_n2(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
					 int norm)
{
	int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
	int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
	int16x8_t m0, m1, m2, m3, m4, m5, m6;
	int16x4_t input;

	/* (BMU) Load the 16-bit input symbols and duplicate them across
	 * both halves of the register */
	input = vld1_s16(val);
	m2 = vcombine_s16(input, input);

	/* (BMU) Load and compute branch metrics */
	m0 = vld1q_s16(&out[0]);
	m1 = vld1q_s16(&out[8]);

	m0 = vmulq_s16(m2, m0);
	m1 = vmulq_s16(m2, m1);
	m2 = vcombine_s16(vpadd_s16(vget_low_s16(m0), vget_high_s16(m0)),
			  vpadd_s16(vget_low_s16(m1), vget_high_s16(m1)));

	/* (PMU) Load accumulated path metrics */
	m0 = vld1q_s16(&sums[0]);
	m1 = vld1q_s16(&sums[8]);

	NEON_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	NEON_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		NEON_NORMALIZE_K5(m2, m6, m0, m1)

	vst1q_s16(&sums[0], m2);
	vst1q_s16(&sums[8], m6);
	vst1q_s16(&paths[0], m5);
	vst1q_s16(&paths[8], m4);
}

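/* As _neon_metrics_k5_n2(), but for four output symbols per branch
 * (N=4), using NEON_BRANCH_METRIC_N4 on a 32-entry output table. */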
__always_inline void _neon_metrics_k5_n4(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
					 int norm)
{
	int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
	int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
	int16x8_t m0, m1, m2, m3, m4, m5, m6;
	int16x4_t input;

	/* (BMU) Load the 16-bit input symbols and duplicate them across
	 * both halves of the register */
	input = vld1_s16(val);
	m4 = vcombine_s16(input, input);

	/* (BMU) Load and compute branch metrics */
	m0 = vld1q_s16(&out[0]);
	m1 = vld1q_s16(&out[8]);
	m2 = vld1q_s16(&out[16]);
	m3 = vld1q_s16(&out[24]);

	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

	/* (PMU) Load accumulated path metrics */
	m0 = vld1q_s16(&sums[0]);
	m1 = vld1q_s16(&sums[8]);

	NEON_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	NEON_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		NEON_NORMALIZE_K5(m2, m6, m0, m1)

	vst1q_s16(&sums[0], m2);
	vst1q_s16(&sums[8], m6);
	vst1q_s16(&paths[0], m5);
	vst1q_s16(&paths[8], m4);
}

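/* Single forward ACS step for a 64-state (K=7), N=2 trellis: eight
 * registers of path metrics, 32 branch metrics computed in two
 * NEON_BRANCH_METRIC_N2 passes, and four butterfly invocations that
 * emit 64 path decisions per step. */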
__always_inline static void _neon_metrics_k7_n2(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
						int norm)
{
	int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
	int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
	int16x8_t m0, m1, m2, m3, m4, m5, m6, m7;
	int16x8_t m8, m9, m10, m11, m12, m13, m14, m15;
	int16x4_t input;

	/* (PMU) Load accumulated path metrics */
	m0 = vld1q_s16(&sums[0]);
	m1 = vld1q_s16(&sums[8]);
	m2 = vld1q_s16(&sums[16]);
	m3 = vld1q_s16(&sums[24]);
	m4 = vld1q_s16(&sums[32]);
	m5 = vld1q_s16(&sums[40]);
	m6 = vld1q_s16(&sums[48]);
	m7 = vld1q_s16(&sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	NEON_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load the 16-bit input symbols and duplicate them across
	 * both halves of the register */
	input = vld1_s16(val);
	m7 = vcombine_s16(input, input);

	/* (BMU) Load and compute branch metrics */
	m0 = vld1q_s16(&out[0]);
	m1 = vld1q_s16(&out[8]);
	m2 = vld1q_s16(&out[16]);
	m3 = vld1q_s16(&out[24]);

	NEON_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

	m0 = vld1q_s16(&out[32]);
	m1 = vld1q_s16(&out[40]);
	m2 = vld1q_s16(&out[48]);
	m3 = vld1q_s16(&out[56]);

	NEON_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

	/* (PMU) Butterflies: 0-15 */
	NEON_BUTTERFLY(m8, m9, m4, m0, m1)
	NEON_BUTTERFLY(m10, m11, m5, m2, m3)

	vst1q_s16(&paths[0], m0);
	vst1q_s16(&paths[8], m2);
	vst1q_s16(&paths[32], m9);
	vst1q_s16(&paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	NEON_BUTTERFLY(m12, m13, m6, m0, m2)
	NEON_BUTTERFLY(m14, m15, m7, m9, m11)

	vst1q_s16(&paths[16], m0);
	vst1q_s16(&paths[24], m9);
	vst1q_s16(&paths[48], m13);
	vst1q_s16(&paths[56], m15);

	if (norm)
		NEON_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, m7, m11, m0, m8, m9, m10)

	vst1q_s16(&sums[0], m4);
	vst1q_s16(&sums[8], m5);
	vst1q_s16(&sums[16], m6);
	vst1q_s16(&sums[24], m7);
	vst1q_s16(&sums[32], m1);
	vst1q_s16(&sums[40], m3);
	vst1q_s16(&sums[48], m2);
	vst1q_s16(&sums[56], m11);
}

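/* As _neon_metrics_k7_n2(), but for N=4: the 128-entry output table is
 * consumed in four NEON_BRANCH_METRIC_N4 passes before the same
 * butterfly and store sequence. */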
__always_inline static void _neon_metrics_k7_n4(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
						int norm)
{
	int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
	int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
	int16x8_t m0, m1, m2, m3, m4, m5, m6, m7;
	int16x8_t m8, m9, m10, m11, m12, m13, m14, m15;
	int16x4_t input;

	/* (PMU) Load accumulated path metrics */
	m0 = vld1q_s16(&sums[0]);
	m1 = vld1q_s16(&sums[8]);
	m2 = vld1q_s16(&sums[16]);
	m3 = vld1q_s16(&sums[24]);
	m4 = vld1q_s16(&sums[32]);
	m5 = vld1q_s16(&sums[40]);
	m6 = vld1q_s16(&sums[48]);
	m7 = vld1q_s16(&sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	NEON_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load the 16-bit input symbols and duplicate them across
	 * both halves of the register */
	input = vld1_s16(val);
	m7 = vcombine_s16(input, input);

	/* (BMU) Load and compute branch metrics */
	m0 = vld1q_s16(&out[0]);
	m1 = vld1q_s16(&out[8]);
	m2 = vld1q_s16(&out[16]);
	m3 = vld1q_s16(&out[24]);

	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

	m0 = vld1q_s16(&out[32]);
	m1 = vld1q_s16(&out[40]);
	m2 = vld1q_s16(&out[48]);
	m3 = vld1q_s16(&out[56]);

	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

	m0 = vld1q_s16(&out[64]);
	m1 = vld1q_s16(&out[72]);
	m2 = vld1q_s16(&out[80]);
	m3 = vld1q_s16(&out[88]);

	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

	m0 = vld1q_s16(&out[96]);
	m1 = vld1q_s16(&out[104]);
	m2 = vld1q_s16(&out[112]);
	m3 = vld1q_s16(&out[120]);

	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

	/* (PMU) Butterflies: 0-15 */
	NEON_BUTTERFLY(m8, m9, m4, m0, m1)
	NEON_BUTTERFLY(m10, m11, m5, m2, m3)

	vst1q_s16(&paths[0], m0);
	vst1q_s16(&paths[8], m2);
	vst1q_s16(&paths[32], m9);
	vst1q_s16(&paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	NEON_BUTTERFLY(m12, m13, m6, m0, m2)
	NEON_BUTTERFLY(m14, m15, m7, m9, m11)

	vst1q_s16(&paths[16], m0);
	vst1q_s16(&paths[24], m9);
	vst1q_s16(&paths[48], m13);
	vst1q_s16(&paths[56], m15);

	if (norm)
		NEON_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, m7, m11, m0, m8, m9, m10)

	vst1q_s16(&sums[0], m4);
	vst1q_s16(&sums[8], m5);
	vst1q_s16(&sums[16], m6);
	vst1q_s16(&sums[24], m7);
	vst1q_s16(&sums[32], m1);
	vst1q_s16(&sums[40], m3);
	vst1q_s16(&sums[48], m2);
	vst1q_s16(&sums[56], m11);
}
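
/*
 * Illustrative sketch only: one plausible way a decoder's forward pass
 * could drive the K=5, N=2 kernel above. The caller name, buffer layout
 * and normalization policy here are assumptions made for illustration,
 * not part of this header, so the sketch is kept out of compilation.
 */
#if 0
static void example_forward_pass_k5_n2(const int16_t *seq,	/* assumed: 2 soft symbols per step */
				       const int16_t *out,	/* assumed: 16-entry expected-output table */
				       int16_t *sums,		/* 16 path metrics, 8-byte aligned */
				       int16_t (*paths)[16],	/* 16 decisions written per step */
				       int nsteps)
{
	int i;

	for (i = 0; i < nsteps; i++) {
		/* Re-center the path metrics every few steps so the
		 * saturating adds in NEON_BUTTERFLY() cannot clip. */
		int norm = !(i % 8);

		_neon_metrics_k5_n2(&seq[2 * i], out, sums, paths[i], norm);
	}
}
#endif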