Grok  7.6.2
simd.h
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2016-2020 Grok Image Compression Inc.
3  *
4  * This source code is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU Affero General Public License, version 3,
6  * as published by the Free Software Foundation.
7  *
8  * This source code is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11  * GNU Affero General Public License for more details.
12  *
13  * You should have received a copy of the GNU Affero General Public License
14  * along with this program. If not, see <http://www.gnu.org/licenses/>.
15  *
16  *
17  * This source code incorporates work covered by the BSD 2-clause license.
18  * Please see the LICENSE file in the root directory for details.
19  *
20  */
21 
22 #pragma once
23 
24 #define GRK_SKIP_POISON
#ifdef __SSE__
#include <xmmintrin.h>
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#ifdef __SSSE3__
#include <tmmintrin.h>
#endif
#ifdef __SSE4_1__
/* needed for _mm_mullo_epi32 / _mm_max_epi32 / _mm_min_epi32 used below
 * when building with SSE4.1 but without AVX2 */
#include <smmintrin.h>
#endif
#ifdef __AVX2__
#include <immintrin.h>
#endif
37 
38 
#ifdef __AVX2__
/* AVX2: a 256-bit vector register holds 8 x 32-bit integer lanes. */
#define VREG_INT_COUNT 8
#else
/* SSE: a 128-bit vector register holds 4 x 32-bit integer lanes. */
#define VREG_INT_COUNT 4
#endif
46 
47 
48 
#if (defined(__SSE2__) || defined(__AVX2__))

/* Convenience macros to improve readability.
 *
 * Integer macros operate on VREG (packed 32-bit signed integers);
 * float macros operate on VREGF (packed single-precision floats).
 * LOAD/STORE require correctly aligned pointers; LOADU/STOREU do not.
 */
#ifdef __AVX2__

#define VREG __m256i
#define LOAD_CST(x) _mm256_set1_epi32(x)
#define LOAD(x) _mm256_load_si256((const VREG*)(x))
#define LOADU(x) _mm256_loadu_si256((const VREG*)(x))
#define STORE(x,y) _mm256_store_si256((VREG*)(x),(y))
#define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y))
#define ADD(x,y) _mm256_add_epi32((x),(y))
/* no trailing semicolon: AND must remain usable inside expressions,
 * e.g. ADD3(AND(a,b),c,d) */
#define AND(x,y) _mm256_and_si256((x),(y))
#define SUB(x,y) _mm256_sub_epi32((x),(y))
#define VMAX(x,y) _mm256_max_epi32((x),(y))
#define VMIN(x,y) _mm256_min_epi32((x),(y))
#define SAR(x,y) _mm256_srai_epi32((x),(y))
#define MUL(x,y) _mm256_mullo_epi32((x),(y))

#define VREGF __m256
#define LOADF(x) _mm256_load_ps((float const*)(x))
#define LOADUF(x) _mm256_loadu_ps((float const*)(x))
#define LOAD_CST_F(x) _mm256_set1_ps(x)
#define ADDF(x,y) _mm256_add_ps((x),(y))
#define MULF(x,y) _mm256_mul_ps((x),(y))
#define SUBF(x,y) _mm256_sub_ps((x),(y))
#define VMAXF(x,y) _mm256_max_ps((x),(y))
#define VMINF(x,y) _mm256_min_ps((x),(y))
#define STOREF(x,y) _mm256_store_ps((float*)(x),(y))
#define STOREUF(x,y) _mm256_storeu_ps((float*)(x),(y))

#else

#define VREG __m128i
#define LOAD_CST(x) _mm_set1_epi32(x)
#define LOAD(x) _mm_load_si128((const VREG*)(x))
#define LOADU(x) _mm_loadu_si128((const VREG*)(x))
#define STORE(x,y) _mm_store_si128((VREG*)(x),(y))
#define STOREU(x,y) _mm_storeu_si128((VREG*)(x),(y))
#define ADD(x,y) _mm_add_epi32((x),(y))
/* no trailing semicolon: AND must remain usable inside expressions,
 * e.g. ADD3(AND(a,b),c,d) */
#define AND(x,y) _mm_and_si128((x),(y))
#define SUB(x,y) _mm_sub_epi32((x),(y))
/* !!! requires SSE 4.1, not plain SSE 2 */
#define VMAX(x,y) _mm_max_epi32((x),(y))
/* !!! requires SSE 4.1, not plain SSE 2 */
#define VMIN(x,y) _mm_min_epi32((x),(y))

#define VREGF __m128
/* !!! requires SSE 4.1, not plain SSE 2 */
#define MUL(x,y) _mm_mullo_epi32((x),(y))
#define SAR(x,y) _mm_srai_epi32((x),(y))
#define LOADF(x) _mm_load_ps((float const*)(x))
#define LOADUF(x) _mm_loadu_ps((float const*)(x))
#define LOAD_CST_F(x) _mm_set1_ps(x)
#define ADDF(x,y) _mm_add_ps((x),(y))
#define MULF(x,y) _mm_mul_ps((x),(y))
#define SUBF(x,y) _mm_sub_ps((x),(y))
#define VMAXF(x,y) _mm_max_ps((x),(y))
#define VMINF(x,y) _mm_min_ps((x),(y))
#define STOREF(x,y) _mm_store_ps((float*)(x),(y))
#define STOREUF(x,y) _mm_storeu_ps((float*)(x),(y))

#endif

/* three-operand lane-wise add */
#define ADD3(x,y,z) ADD(ADD(x,y),z)
/* !!! integer clamp requires SSE 4.1 or AVX2 (uses VMIN/VMAX) */
#define VCLAMP(x,min,max) VMIN(VMAX(x, min), max)
/* float clamp: available on any SSE/AVX level */
#define VCLAMPF(x,min,max) VMINF(VMAXF(x, min), max)

#endif